|
a |
|
b/language_processing.py |
|
|
1 |
import re |
|
|
2 |
import datetime |
|
|
3 |
from datetime import timedelta |
|
|
4 |
|
|
|
5 |
|
|
|
6 |
#---------------------------------------------------------- |
|
|
7 |
# Functions for parsing the header of a note file and modifying |
|
|
8 |
# the JSON to incorporate these structured fields |
|
|
9 |
#---------------------------------------------------------- |
|
|
10 |
|
|
|
11 |
def parse_m_d_y(s):
    """Find the first month/day/year date in ``s``.

    Accepted shapes are ``M/D/YYYY`` and ``M/D/YY`` where the separators may
    be ``/`` or ``-`` and month/day may have one or two digits.  A two-digit
    year of 17 or more is read as 19YY; anything below 17 is read as 20YY.

    input
        s: a string that may contain a date
    output
        a datetime.date, or None when no date-like text is found or when the
        matched numbers do not form a real calendar date (e.g. month 13)
    """
    # Periods are stripped so that a date containing a stray "." (a typo seen
    # in real data) still matches the pattern below.
    s = s.replace(".", '')
    re_m_d_y = r"([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{4})|([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{2})"
    match = re.search(re_m_d_y, s)
    if match is None:
        return None

    groups = match.groups()
    if groups[0] is not None:
        # First alternative matched: the year already has four digits.
        month, day, year = groups[:3]
    else:
        # Second alternative matched: expand the two-digit year.
        month, day, year = groups[3:]
        if int(year) - 17 >= 0:
            year = '19' + year
        else:
            year = '20' + year

    try:
        return datetime.date(int(year), int(month), int(day))
    except ValueError:
        # The text looked like a date but is not one (month 13, day 45, ...).
        # Report it the same way as "no date found" instead of crashing.
        return None
|
|
29 |
|
|
|
30 |
def parse_m_y(s):
    """Find the first month/year date in ``s``.

    Accepted shapes are ``M/YYYY`` and ``M/YY`` with a one- or two-digit
    month.  A two-digit year of 17 or more is read as 19YY; anything below 17
    is read as 20YY.  The day of the result is always the 1st.

    input
        s: a string that may contain a month/year
    output
        a datetime.date on the first day of that month, or None when nothing
        matches or the matched month is not a real month (e.g. 13)
    """
    re_m_y = r"([0-9]{1,2})/([0-9]{4})|([0-9]{1,2})/([0-9]{2})"
    match = re.search(re_m_y, s)
    if match is None:
        return None

    groups = match.groups()
    if groups[0] is not None:
        # First alternative matched: the year already has four digits.
        month, year = groups[:2]
    else:
        # Second alternative matched: expand the two-digit year.
        month, year = groups[2:]
        if int(year) - 17 >= 0:
            year = str(19) + year
        else:
            year = str(20) + year

    try:
        return datetime.date(int(year), int(month), 1)
    except ValueError:
        # Matched text such as "13/2015" is not a valid month; treat it the
        # same as "no date found" rather than raising.
        return None
|
|
47 |
|
|
|
48 |
def parse_date(s):
    """Parse a date out of ``s``, trying the most specific format first.

    The month/day/year parser is tried first; only if it finds nothing is the
    month/year parser tried.

    input
        s: a string that may contain a date
    output
        the first truthy result from the parsers, or None if neither finds a
        date
    """
    for parser in (parse_m_d_y, parse_m_y):
        found = parser(s)
        if found:
            return found
    return None
|
|
55 |
|
|
|
56 |
def format_date(s):
    """Render a date as ``MM/DD/YYYY 00:00``.

    input
        s: either a string containing a date (parsed with parse_date) or an
           object that already provides strftime, such as a datetime.date
    output
        the formatted string; the time part is always the literal "00:00"

    If ``s`` is a string in which parse_date finds no date, parse_date yields
    None and the strftime call below fails with AttributeError.
    """
    # Exact type check against the native and unicode string types.
    string_types = (type(""), type(u''))
    date = parse_date(s) if type(s) in string_types else s
    return date.strftime("%m/%d/%Y 00:00")
|
|
62 |
|
|
|
63 |
#---------------------------------------------------------- |
|
|
64 |
# Functions for extracting sentences from text |
|
|
65 |
#---------------------------------------------------------- |
|
|
66 |
|
|
|
67 |
|
|
|
68 |
def get_sentence(s, i, index=False):
    '''
    description
        given an index i in a string, extracts the sentence [a, b] where b >= i
        this means if s[i] = '.', then this will return the preceding sentence

        Before searching, every "\\r" is removed and every "\\n" is replaced
        by ". ", so i and the returned indices refer to that normalized text
        rather than to the string exactly as passed in.
    input
        s: a string
        i: the index of the string that should be contained in the output sentence
        index: if true, then the indices [a, b) that define the sentence are returned
               otherwise, the string s[a:b] is returned (default False)
    output
        if index is true
            a tuple (a, b)
        else
            the string s[a:b]
    '''
    # A newline must end a sentence, and carriage returns carry no meaning here.
    s = s.replace("\r", "")
    s = s.replace("\n", ". ")  # TODO: more careful way of achieving this effect

    # Find the start: walk left until the character just before `a` ends the
    # previous sentence.  The start is clamped to 0 because for i == 0 an
    # unclamped i - 1 is -1, which Python slicing treats as "last character"
    # and which made the returned slice empty or wrong.
    # NOTE(review): because the scan starts at i - 1, when s[i - 1] is itself
    # a sentence terminator the result also spans the sentence before it.
    a = max(i - 1, 0)
    while a > 0 and not is_sentence_end(s, a - 1):
        a -= 1

    # Find the end: walk right until a sentence terminator (or the end of s).
    b = i
    while b < len(s) and not is_sentence_end(s, b):
        b += 1

    # b points at the terminator itself, so the exclusive end is b + 1.
    if index:
        return (a, b + 1)
    return s[a:b + 1]
|
|
102 |
|
|
|
103 |
def split_sentences(s):
    '''
    description
        uses get_sentence to tokenize a string into sentences; carriage
        returns are dropped and each newline is turned into ". " first
    input
        s: string
    output
        a list of sentences, in order, each with leading and trailing
        spaces removed
    '''
    text = s.replace("\r", "")
    text = text.replace("\n", ". ")

    sentences = []
    start = 0
    total = len(text)
    while start < total:
        # Only the end boundary is used; each piece begins where the
        # previous one stopped.
        _, stop = get_sentence(text, start, True)
        sentences.append(text[start:stop].strip(" "))
        start = stop
    return sentences
|
|
120 |
|
|
|
121 |
|
|
|
122 |
|
|
|
123 |
def is_sentence_end(s, i):
    '''
    description
        helper function that decides whether the character at s[i] ends a sentence
    input
        s: a string
        i: the index of the character to test (typically a '.')
    output
        boolean of whether that character indicates the end of a sentence

        i == 0 is never an end, the last character of s is always an end,
        '!' and '?' are always ends, and a '.' is an end unless it sits
        between two numeric characters (a decimal point such as "5.2")
    '''
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so this keeps
    # the original conversion on Python 2 while avoiding a NameError on
    # Python 3, where the `unicode` builtin does not exist.  The conversion is
    # needed on Python 2 because byte strings there have no isnumeric().
    to_text = type(u"")

    if i == 0:  # if period starts string, not end of sentence
        return False
    if i >= len(s) - 1:  # if period ends string, must be end of sentence
        return True

    current = to_text(s[i])
    if current in [u'!', u'?']:  # these are unambiguous
        return True
    if current != u'.':  # if it's not a period it can't end a sentence
        return False

    # From here on s[i] is a period; look at its neighbours.
    before = to_text(s[i - 1])
    after = to_text(s[i + 1])
    if not before.isnumeric():  # e.g. "...and he stopped. 5 is a nice number"
        return True
    # A digit precedes the period: it is a decimal point only when a digit
    # follows as well ("I have 5.2 liters"); otherwise the sentence ends
    # ("I have work until 5.I need a friend").
    return not after.isnumeric()
    # TODO: include cases for Mr. , Mrs. or any arbitrary list of abbreviations
|
|
151 |
|
|
|
152 |
|