-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathengine_convert.py
More file actions
129 lines (94 loc) · 3.92 KB
/
engine_convert.py
File metadata and controls
129 lines (94 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import logging, sys, csv
import dateutil.parser, re
from model_operating_hours import OperatingHours
# Module-level side effect: runs on import and sets up logging so that
# records of every level (DEBUG and up) are written to stderr.
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
def ConvertFrom(filename):
    """
    Load restaurant operating hours from a csv file matching the business
    specification and map them to our internal data model.

    Each row is read by column name: 'Restaurant Name' gives the key and
    'Hours' gives the human-readable hours text handed to parse_op_hours.

    DESIGN: mapping to a data model may be overkill. Can just map to a json
    structure and treat it like a database of sorts.

    Args:
        filename: path of the csv file to read.

    Returns:
        dict mapping each restaurant name to whatever parse_op_hours returns
        for that row's hours text. If a name appears on several rows, the
        last row wins.

    Raises:
        ValueError: if filename is None, empty, or only whitespace.
        KeyError: if the file has no 'Restaurant Name' or 'Hours' column.
    """
    if not filename or not filename.strip():
        message = "Invalid filename for input data"
        logging.error(message)
        # Carry the same text in the exception so callers see why it failed.
        raise ValueError(message)

    internal_data = {}
    with open(filename, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            restaurant_name = row['Restaurant Name']
            # DictReader fills the missing cells of a short row with None,
            # so guard before stripping instead of failing on None.strip().
            in_hours = (row['Hours'] or "").strip()
            internal_data[restaurant_name] = parse_op_hours(in_hours)
    return internal_data
def parse_op_hours(human_readable_op_hours):
    """
    Convert one human-readable hours string into per-day operating hours.

    The text is cut on "/" into sections. For each section the last time
    found is taken as the closing time and the one before it as the opening
    time; what is left over is split on "," and handed to parse_dow_text to
    obtain the days those hours apply to.

    Returns None when the input is None or "". Otherwise returns a dict that
    maps each day produced by parse_dow_text to an
    OperatingHours(open_time, close_time); a day named in more than one
    section keeps the hours of the last section naming it.

    TODO:
    Future: if performance concerns, compile the regex
    beforehand (plus other options)
    Future: the find/replace can be replaced by one operation
    """
    if human_readable_op_hours in (None, ""):
        return None

    hours_by_day = {}
    for chunk in re.split(r'\s*\/\s*', human_readable_op_hours):
        # Peel the times off from the end: closing first, then opening.
        without_close, closing_text = parse_last_time_text(chunk)
        days_text, opening_text = parse_last_time_text(without_close)

        opens_at = parse_time(opening_text)
        closes_at = parse_time(closing_text)

        for day_group in re.split(r'\s*,\s*', days_text):
            for dow in parse_dow_text(day_group):
                hours_by_day[dow] = OperatingHours(opens_at, closes_at)
    return hours_by_day
def parse_last_time_text(section):
    """
    Split the last clock time off a piece of hours text.

    A time is a digit followed by more digits/colons, optional whitespace,
    then "am" or "pm" (lower case), and it must be preceded by whitespace,
    e.g. the "10 pm" in "Mon-Sun 11 am - 10 pm".

    Args:
        section: hours text to search; may be None or "".

    Returns:
        A tuple (remaining, time_text).
        remaining is section with that single time occurrence cut out and
        every " - " separator removed.
        time_text is the time with all whitespace removed (e.g. "10pm").
        If section is None/"" or contains no such time, returns
        (section, None) with section unchanged.
    """
    if section is None or section == "":
        return section, None

    # non-overlapping
    # The greedy prefix consumes as much as possible, so the captured group
    # is the last occurrence of a time in the text.
    # TODO: FUTURE: possible performance improvements with a different search, if needed
    match = re.search(r"(?s:.*)\s([0-9][0-9:]*\s*[ap]m)\s*", section)
    if match is None:
        return section, None

    time_text = match.group(1)
    # Cut out only the matched span. Removing by value would also delete the
    # same characters wherever else they occur, e.g. the "2 pm" inside an
    # earlier "12 pm" or the "1:30 am" inside "11:30 am".
    section = section[:match.start(1)] + section[match.end(1):]

    # TODO: as regex, in case spaces aren't there
    # NOTE: this removes every " - " in the text, not just the one next to
    # the time that was cut out.
    section = section.replace(" - ", "")

    # TODO: error check on the match results
    return section, re.sub(r'\s', "", time_text)
def parse_dow_text(section):
    """
    Expand day-of-week text into a list of three-letter day names.

    Double quotes and all whitespace are removed first. The text is then
    split on "," into tokens; each token is either a single day ("Sun") or a
    "start-end" range ("Mon-Thu"). Only the first three characters of a day
    are compared, so "Monday" is read as "Mon".

    Args:
        section: day text such as "Mon-Thu, Sun".

    Returns:
        List of names drawn from Mon, Tue, Wed, Thu, Fri, Sat, Sun, in the
        order the tokens list them. A range is inclusive and wraps past
        Sunday when its end comes before its start ("Sat-Mon" gives
        Sat, Sun, Mon). Tokens whose start or end is not a known day are
        skipped.
    """
    # TODO; right now, relying on exceptions for the error handling
    # TODO: move to common location
    dow_full = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    dow_list = []

    section = section.replace("\"", "")
    section = re.sub(r'\s+', "", section)

    # disconnected days
    for token in section.split(","):
        bounds = token.split("-")
        start_dow = bounds[0][0:3]
        if start_dow not in dow_full:
            continue

        if len(bounds) == 1:
            # Store the normalised abbreviation so single days and ranges
            # produce the same kind of value.
            dow_list.append(start_dow)
            continue

        end_dow = bounds[1][0:3]
        if end_dow not in dow_full:
            continue

        start_index = dow_full.index(start_dow)
        end_index = dow_full.index(end_dow)
        if start_index <= end_index:
            dow_list.extend(dow_full[start_index:end_index + 1])
        else:
            # The range crosses the end of the week: run to Sunday, then
            # continue from Monday up to the end day.
            dow_list.extend(dow_full[start_index:] + dow_full[:end_index + 1])
    return dow_list
# TODO: FUTURE: can be moved to a utility for common use
# TODO: FUTURE: better way to do this in python
def parse_time(time_text):
    """
    Normalise a clock time string to 24-hour "HH:MM" text.

    time_text is appended to a fixed placeholder date, the combined string is
    parsed with dateutil, and only the hour and minute of the result are
    returned.
    """
    # a bit of a hack, but the python time library doesn't seem to have the
    # power of C# DateTime: the date part is a throwaway value that exists
    # only so a full timestamp can be parsed.
    stamped = "2000-01-01 " + time_text
    moment = dateutil.parser.parse(stamped)
    return "{:%H:%M}".format(moment)
# Area of volatility: if other data patterns break this, it will need to be updated
def split_line(line):
    """
    Split a line at every comma that is immediately followed by a double
    quote; the quote itself stays with the piece that follows. Commas not
    followed by a double quote are left alone.
    """
    comma_before_quote = re.compile(r',(?=")')
    return comma_before_quote.split(line)