import re
import datetime
from datetime import timedelta


# Text type that works on both Python 2 (``unicode``) and Python 3 (``str``).
# ``is_sentence_end`` needs it because byte strings on Python 2 have no
# ``isnumeric`` method.
_TEXT_TYPE = type(u"")

# Two-digit years greater than or equal to this pivot are read as 19xx,
# smaller ones as 20xx (e.g. "98" -> 1998, "05" -> 2005).
_TWO_DIGIT_YEAR_PIVOT = 17

# month/day/year with "/" or "-" separators; groups 1-3 carry a four-digit
# year, groups 4-6 a two-digit year.
_RE_M_D_Y = re.compile(
    r"([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{4})"
    r"|([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{2})"
)

# month/year with a "/" separator; groups 1-2 carry a four-digit year,
# groups 3-4 a two-digit year.
_RE_M_Y = re.compile(r"([0-9]{1,2})/([0-9]{4})|([0-9]{1,2})/([0-9]{2})")


# ----------------------------------------------------------
# Functions for parsing the header of a note file and modifying
# the JSON to incorporate these structured fields
# ----------------------------------------------------------

def _expand_two_digit_year(year):
    """Return the four-digit year string for the two-digit string *year*."""
    century = "19" if int(year) >= _TWO_DIGIT_YEAR_PIVOT else "20"
    return century + year


def parse_m_d_y(s):
    """Return the first month/day/year date found in *s*, or None.

    Periods are stripped from *s* before matching so that a date containing
    a stray "." (a typo seen in the data) is still recognised.

    Raises:
        ValueError: if the matched numbers do not form a valid calendar
            date (raised by ``datetime.date``).
    """
    s = s.replace(".", "")
    match = _RE_M_D_Y.search(s)
    if match is None:
        return None
    month, day, year = match.group(1, 2, 3)
    if month is None:
        # The two-digit-year alternative matched instead.
        month, day, year = match.group(4, 5, 6)
        year = _expand_two_digit_year(year)
    return datetime.date(int(year), int(month), int(day))


def parse_m_y(s):
    """Return the first month/year date found in *s* (day set to 1), or None.

    Raises:
        ValueError: if the matched numbers do not form a valid calendar
            date (raised by ``datetime.date``).
    """
    match = _RE_M_Y.search(s)
    if match is None:
        return None
    month, year = match.group(1, 2)
    if month is None:
        # The two-digit-year alternative matched instead.
        month, year = match.group(3, 4)
        year = _expand_two_digit_year(year)
    return datetime.date(int(year), int(month), 1)


def parse_date(s):
    """Parse *s* as month/day/year, falling back to month/year.

    Returns a ``datetime.date``, or None when neither format is found.
    """
    return parse_m_d_y(s) or parse_m_y(s)


def format_date(s):
    """Format a date as ``"MM/DD/YYYY 00:00"``.

    *s* may be a string, which is parsed with ``parse_date``, or any object
    with a ``strftime`` method, which is formatted directly.

    Raises:
        AttributeError: if *s* is a string in which ``parse_date`` finds no
            date (``parse_date`` returns None, which has no ``strftime``).
    """
    date = parse_date(s) if isinstance(s, (str, _TEXT_TYPE)) else s
    return date.strftime("%m/%d/%Y 00:00")


# ----------------------------------------------------------
# Functions for extracting sentences from text
# ----------------------------------------------------------

def _normalize_line_breaks(s):
    """Drop carriage returns and turn every newline into ". "."""
    return s.replace("\r", "").replace("\n", ". ")


def _sentence_bounds(text, i):
    """Return ``(start, end)`` of the sentence in *text* that contains *i*.

    *text* must already be normalised with ``_normalize_line_breaks``.
    ``text[start:end]`` is the sentence, including its terminator.
    """
    size = len(text)

    # Scan left from i itself until the character just before ``start``
    # ends the previous sentence. Starting at i - 1 would yield a negative
    # index for i == 0 and would step over the boundary when i is the first
    # character of a sentence. An index at or past the end is treated as
    # the last character.
    start = max(min(i, size - 1), 0)
    while start > 0 and not is_sentence_end(text, start - 1):
        start -= 1

    # Scan right until the character that ends this sentence.
    end = i
    while end < size and not is_sentence_end(text, end):
        end += 1

    # Keep the half-open range inside the string.
    return start, min(end + 1, size)


def get_sentence(s, i, index=False):
    """Extract the sentence of *s* that contains position *i*.

    If the character at *i* is a sentence terminator, the sentence it
    terminates (the preceding one) is returned.

    Line breaks are normalised first: "\\r" is removed and "\\n" becomes
    ". " so that a new line always ends a sentence. Both *i* and the
    returned indices refer to this normalised text, not to the raw input.
    TODO: find a way to do this that keeps the original indices.

    Args:
        s: the text to search.
        i: a non-negative index into the normalised text.
        index: if true, return the half-open index range ``(a, b)`` of the
            sentence; otherwise return the sentence string (default False).

    Returns:
        A tuple ``(a, b)`` when *index* is true, else the string
        ``normalised[a:b]``.
    """
    s = _normalize_line_breaks(s)
    start, end = _sentence_bounds(s, i)
    if index:
        return (start, end)
    return s[start:end]


def split_sentences(s):
    """Split *s* into a list of sentences.

    Line breaks are normalised as in ``get_sentence``. Each sentence keeps
    its terminator and has surrounding spaces stripped.
    """
    s = _normalize_line_breaks(s)
    i = 0
    result = []
    while i < len(s):
        # The text is normalised once above, so use the helper directly
        # rather than re-normalising inside get_sentence on every step.
        _, i_new = _sentence_bounds(s, i)
        result.append(s[i:i_new].strip(" "))
        i = i_new
    return result


def is_sentence_end(s, i):
    """Return True if the character at index *i* of *s* ends a sentence.

    Rules, in order:
      * index 0 never ends a sentence;
      * the last index (or anything past it) always does;
      * "!" and "?" always do;
      * any other character except "." never does;
      * a "." does unless it sits between two numeric characters
        (e.g. the "." in "5.2").

    TODO: include cases for Mr., Mrs. or any arbitrary list of
    abbreviations.
    """
    if i == 0:
        return False
    if i >= len(s) - 1:
        return True

    char = _TEXT_TYPE(s[i])
    if char in (u"!", u"?"):  # these are unambiguous
        return True
    if char != u".":
        return False

    before = _TEXT_TYPE(s[i - 1])
    after = _TEXT_TYPE(s[i + 1])
    # "...and he stopped. 5 is a nice number" -> end of sentence
    # "I have 5.2 liters"                     -> decimal point
    # "I have work until 5.I need a friend"   -> end of sentence
    return not (before.isnumeric() and after.isnumeric())