Upload app.py
Browse files
app.py
CHANGED
|
@@ -58,18 +58,14 @@ def parens_to_angles(s):
|
|
| 58 |
def split_num(num):
|
| 59 |
num = num.group()
|
| 60 |
if '.' in num:
|
| 61 |
-
|
| 62 |
-
a, b = num.split('.')
|
| 63 |
-
return ' point '.join([a, ' '.join(b)])
|
| 64 |
elif ':' in num:
|
| 65 |
-
# Time
|
| 66 |
h, m = [int(n) for n in num.split(':')]
|
| 67 |
if m == 0:
|
| 68 |
return f"{h} o'clock"
|
| 69 |
elif m < 10:
|
| 70 |
return f'{h} oh {m}'
|
| 71 |
return f'{h} {m}'
|
| 72 |
-
# Year
|
| 73 |
year = int(num[:4])
|
| 74 |
if year < 1100 or year % 1000 < 10:
|
| 75 |
return num
|
|
@@ -82,6 +78,24 @@ def split_num(num):
|
|
| 82 |
return f'{left} oh {right}{s}'
|
| 83 |
return f'{left} {right}{s}'
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
def normalize(text):
|
| 86 |
# TODO: Custom text normalization rules?
|
| 87 |
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
|
|
@@ -97,6 +111,8 @@ def normalize(text):
|
|
| 97 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
|
| 98 |
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
|
| 99 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
|
|
|
|
|
|
|
| 100 |
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text) # TODO: could be minus
|
| 101 |
text = re.sub(r'(?<=\d)S', ' S', text)
|
| 102 |
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
|
|
|
|
| 58 |
def split_num(num):
|
| 59 |
num = num.group()
|
| 60 |
if '.' in num:
|
| 61 |
+
return num
|
|
|
|
|
|
|
| 62 |
elif ':' in num:
|
|
|
|
| 63 |
h, m = [int(n) for n in num.split(':')]
|
| 64 |
if m == 0:
|
| 65 |
return f"{h} o'clock"
|
| 66 |
elif m < 10:
|
| 67 |
return f'{h} oh {m}'
|
| 68 |
return f'{h} {m}'
|
|
|
|
| 69 |
year = int(num[:4])
|
| 70 |
if year < 1100 or year % 1000 < 10:
|
| 71 |
return num
|
|
|
|
| 78 |
return f'{left} oh {right}{s}'
|
| 79 |
return f'{left} {right}{s}'
|
| 80 |
|
| 81 |
+
def flip_money(m):
    """Rewrite a matched currency amount so the unit is spoken after the number.

    Intended as a ``re.sub`` replacement callback.

    Args:
        m: A regex match object whose matched text starts with ``$`` or ``£``
            followed by digits, optionally with a decimal part and/or trailing
            magnitude words (e.g. ``$5``, ``£2.50``, ``$1.5 million``).

    Returns:
        The amount with the currency written out in words, e.g.
        ``'5 dollars'``, ``'2 pounds and 50 pence'`` or
        ``'1.5 million dollars'``.
    """
    text = m.group()
    symbol, amount = text[0], text[1:]
    bill = 'dollar' if symbol == '$' else 'pound'

    if text[-1].isalpha():
        # Ends in a magnitude word ("hundred", "million", ...): always plural.
        return f'{amount} {bill}s'

    if '.' not in amount:
        s = '' if amount == '1' else 's'
        return f'{amount} {bill}{s}'

    b, c = amount.split('.')
    if len(c) > 2:
        # More than two fractional digits cannot be a count of cents/pence
        # ("$1.234" is not "234 cents"), so keep the decimal amount intact
        # and just append the unit.
        return f'{amount} {bill}s'

    s = '' if b == '1' else 's'
    # A single fractional digit means tenths: ".5" is 50 cents, not 5.
    c = int(c.ljust(2, '0'))
    if symbol == '$':
        coins = 'cent' if c == 1 else 'cents'
    else:
        coins = 'penny' if c == 1 else 'pence'
    return f'{b} {bill}{s} and {c} {coins}'
|
| 94 |
+
|
| 95 |
+
def point_num(num):
    """Spell a matched decimal number with "point" and spaced-out digits.

    Intended as a ``re.sub`` replacement callback.

    Args:
        num: A regex match object whose matched text contains exactly one
            ``.`` (e.g. ``3.14`` or ``.5``).

    Returns:
        The part before the dot, the word "point", then each character after
        the dot separated by spaces, e.g. ``'3 point 1 4'``. An empty integer
        part yields a leading space (``' point 5'``).
    """
    whole, fraction = num.group().split('.')
    spaced_digits = ' '.join(fraction)
    return f'{whole} point {spaced_digits}'
|
| 98 |
+
|
| 99 |
def normalize(text):
|
| 100 |
# TODO: Custom text normalization rules?
|
| 101 |
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
|
|
|
|
| 111 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
|
| 112 |
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
|
| 113 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
|
| 114 |
+
text = re.sub(r'[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
|
| 115 |
+
text = re.sub(r'\d*\.\d+', point_num, text)
|
| 116 |
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text) # TODO: could be minus
|
| 117 |
text = re.sub(r'(?<=\d)S', ' S', text)
|
| 118 |
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
|