NLP_CRT / Git / Diff of /language

Models:

philipB/

NLP_CRT

Downloads: 1

Diff of /language_processing.py [000000] .. [8d2107]

Switch to unified view

 b/language_processing.py
+import re
+import datetime
+from datetime import timedelta
+#----------------------------------------------------------
+# Functions for parsing the header of a note file and modifying
+# the JSON to incorporate these structured fields
+#----------------------------------------------------------
def parse_m_d_y(s):
    """Extract the first month/day/year date found in ``s``.

    Recognised forms are ``M/D/YYYY`` and ``M/D/YY``, with either ``/`` or
    ``-`` as the separator. Periods are removed from ``s`` before matching.
    Two-digit years 17-99 are read as 19xx and 00-16 as 20xx.

    Args:
        s: text that may contain a date.

    Returns:
        A ``datetime.date``, or ``None`` when no date pattern is present or
        the matched numbers do not form a valid calendar date
        (e.g. month 13 or day 45).
    """
    # Josh added this to catch a date that had a typo (a stray period) in it.
    s = s.replace(".", "")
    re_m_d_y = r"([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{4})|([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{2})"
    match = re.search(re_m_d_y, s)
    if match is None:
        return None

    groups = match.groups()
    if groups[0] is not None:
        # Four-digit-year alternative matched.
        month, day, year = groups[:3]
    else:
        # Two-digit-year alternative matched: expand to a full year.
        month, day, year = groups[3:]
        century = '19' if int(year) >= 17 else '20'
        year = century + year

    try:
        return datetime.date(int(year), int(month), int(day))
    except ValueError:
        # The pattern matched but the numbers are not a real date.
        return None
def parse_m_y(s):
    """Extract the first month/year date found in ``s``.

    Recognised forms are ``M/YYYY`` and ``M/YY``. Two-digit years 17-99 are
    read as 19xx and 00-16 as 20xx. The day of the result is always 1.

    Args:
        s: text that may contain a month/year date.

    Returns:
        A ``datetime.date`` for the first day of the month, or ``None`` when
        no pattern is present or the matched numbers do not form a valid
        date (e.g. month 0 or 45).
    """
    re_m_y = r"([0-9]{1,2})/([0-9]{4})|([0-9]{1,2})/([0-9]{2})"
    match = re.search(re_m_y, s)
    if match is None:
        return None

    groups = match.groups()
    if groups[0] is not None:
        # Four-digit-year alternative matched.
        month, year = groups[:2]
    else:
        # Two-digit-year alternative matched: expand to a full year.
        month, year = groups[2:]
        century = '19' if int(year) >= 17 else '20'
        year = century + year

    try:
        return datetime.date(int(year), int(month), 1)
    except ValueError:
        # The pattern matched but the month is out of range.
        return None
def parse_date(s):
    """Parse a date out of ``s``, preferring the most specific format.

    A month/day/year date is tried first via ``parse_m_d_y``; if that finds
    nothing, a month/year date is tried via ``parse_m_y``.

    Returns:
        Whatever the first successful parser returned, or ``None`` when
        neither parser produced a date.
    """
    return parse_m_d_y(s) or parse_m_y(s) or None
def format_date(s):
    """Render a date as ``MM/DD/YYYY 00:00``.

    Args:
        s: either a string (byte or unicode), which is run through
            ``parse_date``, or an object with a ``strftime`` method, which
            is formatted directly.

    Returns:
        The formatted string; the time portion is always the literal
        ``00:00``.

    Raises:
        AttributeError: if ``s`` is a string and ``parse_date`` returns
            ``None`` for it, since ``None`` has no ``strftime``.
    """
    text_types = (type(""), type(u''))
    when = parse_date(s) if type(s) in text_types else s
    return when.strftime("%m/%d/%Y 00:00")
+#----------------------------------------------------------
+# Functions for extracting sentences from text
+#----------------------------------------------------------
def get_sentence(s, i, index = False):
    """Extract the sentence of ``s`` that contains position ``i``.

    Before scanning, ``s`` is normalised: every ``"\\r"`` is removed and every
    ``"\\n"`` is replaced by ``". "``, so a line break always ends a sentence.
    All offsets (``i`` and the returned bounds) refer to this normalised text.

    If ``s[i]`` is itself a sentence terminator, the preceding sentence
    (terminator included) is returned.

    NOTE: the backward scan starts at ``i - 1``. When ``i`` is the character
    directly after a terminator, the scan starts on that terminator, so the
    previous sentence is included in the result as well.

    Args:
        s: a string.
        i: index that the returned sentence should contain.
        index: if true, return the half-open bounds ``(a, b)`` of the
            sentence instead of the text (default False).

    Returns:
        ``(a, b)`` when ``index`` is true, otherwise the substring
        ``s[a:b]`` of the normalised text.
    """
    # A line break must end a sentence; carriage returns carry no meaning.
    s = s.replace("\r", "")
    s = s.replace("\n", ". ")  # TODO: more careful way of achieving this effect

    # Find the end of the previous sentence. Clamp the starting point so that
    # i == 0 cannot yield a start of -1, which would slice from the end of
    # the string and return the wrong text.
    a = max(i - 1, 0)
    while a > 0 and not is_sentence_end(s, a - 1):
        a -= 1

    # Find the end of the current sentence.
    b = i
    while b < len(s) and not is_sentence_end(s, b):
        b += 1

    # Return a tuple or a string depending on the index flag.
    if index:
        return (a, b + 1)
    return s[a : b + 1]
def split_sentences(s):
    """Tokenize ``s`` into a list of sentences using ``get_sentence``.

    Carriage returns are removed and newlines are turned into ``". "``
    first. The text is then consumed left to right: each call to
    ``get_sentence`` supplies the end of the next sentence, and the slice up
    to that point (with surrounding spaces stripped) is one list entry.

    Args:
        s: a string.

    Returns:
        A list of sentence strings.
    """
    text = s.replace("\r", "").replace("\n", ". ")
    sentences = []
    start = 0
    while start < len(text):
        # Only the end bound is needed; the start is tracked here.
        _, stop = get_sentence(text, start, True)
        sentences.append(text[start:stop].strip(" "))
        start = stop
    return sentences
def _is_numeric_char(ch):
    """Return True if the single character ``ch`` is numeric."""
    # Text strings provide isnumeric(); Python 2 byte strings only provide
    # isdigit(), so fall back to that rather than coercing with unicode().
    if hasattr(ch, "isnumeric"):
        return ch.isnumeric()
    return ch.isdigit()


def is_sentence_end(s, i):
    """Decide whether the character at ``s[i]`` ends a sentence.

    Rules, checked in order:
      * index 0 never ends a sentence;
      * the last index (or anything beyond it) always does;
      * ``!`` and ``?`` always do;
      * any character other than ``.`` never does;
      * a ``.`` ends a sentence unless it sits between two numeric
        characters, as in "5.2".

    Characters are compared directly instead of being passed through
    ``unicode()``, so non-ASCII input does not raise and the function does
    not depend on a Python-2-only builtin.

    Args:
        s: a string.
        i: the index of the character to test (typically a '.').

    Returns:
        True if that character marks the end of a sentence, else False.
    """
    if i == 0:  # a terminator that starts the string is not a sentence end
        return False
    if i >= len(s) - 1:  # the last character must be the end of a sentence
        return True

    ch = s[i]
    if ch == '!' or ch == '?':  # these are unambiguous
        return True
    if ch != '.':  # if it is not a period it cannot end a sentence
        return False

    # From here on the character is a period.
    if not _is_numeric_char(s[i - 1]):
        # e.g. "...and he stopped. 5 is a nice number"
        return True
    # Digit before the period:
    #   "I have 5.2 liters"                -> decimal point, not an end
    #   "I have work until 5.I need a friend" -> sentence end
    return not _is_numeric_char(s[i + 1])
    # TODO: include cases for Mr., Mrs. or any arbitrary list of abbreviations