Diff of /language_processing.py [000000] .. [8d2107]

Switch to unified view

a b/language_processing.py
1
import re
2
import datetime
3
from datetime import timedelta
4
5
6
#----------------------------------------------------------
7
# Functions for parsing the header of a note file and modifying
8
# the JSON to incorporate these structured fields
9
#----------------------------------------------------------
10
11
def parse_m_d_y(s):
    """Find the first month/day/year date in a string.

    Every '.' is removed from the input first, then the first occurrence of
    M/D/YYYY or M/D/YY is searched for; both '/' and '-' are accepted as
    separators. Two-digit years of 17 or more are read as 19YY, smaller ones
    as 20YY.

    input
        s: a string
    output
        a datetime.date, or None if no date pattern is found or the matched
        numbers do not form a real calendar date (e.g. month 13)
    """
    # I (Josh) added this to catch a date that had a typo in it
    s = s.replace(".", "")
    re_m_d_y = r"([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{4})|([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{2})"
    match = re.search(re_m_d_y, s)
    if match is None:
        return None

    month, day, year = match.group(1, 2, 3)
    if month is None:
        # The four-digit alternative did not match, so the two-digit-year
        # alternative (groups 4-6) did; expand the year to four digits.
        month, day, year = match.group(4, 5, 6)
        two_digit_pivot = 17  # YY >= pivot -> 19YY, otherwise 20YY
        century = "19" if int(year) >= two_digit_pivot else "20"
        year = century + year

    try:
        return datetime.date(int(year), int(month), int(day))
    except ValueError:
        # The digits matched the pattern but are not a valid date
        # (month > 12, day out of range, year 0000, ...).
        return None
29
30
def parse_m_y(s):
    """Find the first month/year date in a string.

    Searches for the first occurrence of M/YYYY or M/YY ('/' separator only).
    Two-digit years of 17 or more are read as 19YY, smaller ones as 20YY.
    The day of the returned date is always 1.

    input
        s: a string
    output
        a datetime.date for the first day of the matched month, or None if no
        pattern is found or the matched numbers are not a real month/year
        (e.g. month 13)
    """
    re_m_y = r"([0-9]{1,2})/([0-9]{4})|([0-9]{1,2})/([0-9]{2})"
    match = re.search(re_m_y, s)
    if match is None:
        return None

    month, year = match.group(1, 2)
    if month is None:
        # The four-digit alternative did not match, so the two-digit-year
        # alternative (groups 3-4) did; expand the year to four digits.
        month, year = match.group(3, 4)
        two_digit_pivot = 17  # YY >= pivot -> 19YY, otherwise 20YY
        century = "19" if int(year) >= two_digit_pivot else "20"
        year = century + year

    try:
        return datetime.date(int(year), int(month), 1)
    except ValueError:
        # The digits matched the pattern but are not a valid month/year.
        return None
47
48
def parse_date(s):
    """Parse a date out of a string, preferring the most specific format.

    Tries parse_m_d_y first and only falls back to parse_m_y when that
    yields nothing.

    input
        s: a string
    output
        whatever the first successful parser returned, or None if neither
        parser produced a result
    """
    return parse_m_d_y(s) or parse_m_y(s) or None
55
56
def format_date(s):
    """Format a date as "MM/DD/YYYY 00:00".

    input
        s: either a string containing a date (parsed with parse_date) or an
           object that already provides strftime, such as a datetime.date
    output
        the formatted string; the time part is always the literal "00:00"
    raises
        ValueError if s is a string in which parse_date finds no date
    """
    # type("") and type(u"") cover str and unicode on Python 2 and collapse
    # to plain str on Python 3.
    if isinstance(s, (type(""), type(u""))):
        date = parse_date(s)
        if date is None:
            # Fail with a clear message instead of "'NoneType' object has no
            # attribute 'strftime'".
            raise ValueError("no date found in %r" % (s,))
    else:
        date = s
    return date.strftime("%m/%d/%Y 00:00")
62
63
#----------------------------------------------------------
64
# Functions for extracting sentences from text
65
#----------------------------------------------------------
66
67
68
def get_sentence(s, i, index=False):
    """Extract the sentence around position i of a string.

    Before scanning, every carriage return is removed and every newline is
    replaced by ". " so that a line break always ends a sentence. Both i and
    the returned indices therefore refer to this normalised text, not to the
    string that was passed in.

    The end of the sentence is the first position b >= i for which
    is_sentence_end is true; this means that if s[i] is '.', the preceding
    sentence is returned.

    input
        s: a string
        i: the index of the string that should be contained in the output
           sentence
        index: if true, the indices [a, b) that delimit the sentence are
           returned, otherwise the text itself is returned (default False)
    output
        if index is true
            a tuple (a, b)
        else
            the string s[a:b]
    """
    # A newline must end a sentence; carriage returns carry no information.
    s = s.replace("\r", "")
    s = s.replace("\n", ". ")  # TODO: more careful way of achieving this effect

    # Find the end of the previous sentence, scanning backwards. The start is
    # clamped to 0: for i == 0 a start of -1 would be treated by slicing as
    # "last character", yielding an empty or wrong result.
    # NOTE(review): the scan starts at i - 1, so when s[i - 1] itself ends a
    # sentence the result also includes that previous sentence - confirm
    # whether this is intended.
    a = max(i - 1, 0)
    while a > 0 and not is_sentence_end(s, a - 1):
        a -= 1

    # Find the end of this sentence, scanning forwards.
    b = i
    while b < len(s) and not is_sentence_end(s, b):
        b += 1

    # Return tuple or string based on the index flag.
    if index:
        return (a, b + 1)
    return s[a:b + 1]
102
103
def split_sentences(s):
    """Tokenize a string into sentences using get_sentence.

    Carriage returns are dropped and newlines are turned into ". " first;
    the text is then walked left to right, cutting at each sentence end
    reported by get_sentence and trimming spaces from both sides of each
    piece.

    input
        s: string
    output
        a list of sentences
    """
    text = s.replace("\r", "").replace("\n", ". ")
    sentences = []
    start = 0
    while start < len(text):
        # Only the end index is needed; the next sentence starts there.
        _, stop = get_sentence(text, start, True)
        sentences.append(text[start:stop].strip(" "))
        start = stop
    return sentences
120
121
122
123
def _is_numeric_char(ch):
    # Numeric test that works for every string type: byte strings on
    # Python 2 have no isnumeric(), so fall back to isdigit() there.
    try:
        return ch.isnumeric()
    except AttributeError:
        return ch.isdigit()


def is_sentence_end(s, i):
    """Decide whether the character at position i ends a sentence.

    Helper used to tell a sentence-ending '.' from one inside a number.

    input
        s: a string
        i: the index of the character to test (normally a '.')
    output
        boolean: True if s[i] ends a sentence
            - the first character never does
            - the last character (or any index past it) always does
            - '!' and '?' always do
            - '.' does unless it sits between two numeric characters,
              e.g. "5.2"
            - any other character does not
    """
    # TODO: include cases for Mr. , Mrs. or any arbitrary list of abbreviations
    if i == 0:  # a period that starts the string is not the end of a sentence
        return False
    if i >= len(s) - 1:  # whatever ends the string must end the sentence
        return True

    # Compare characters directly rather than converting with unicode(): that
    # name does not exist on Python 3, and on Python 2 it fails for byte
    # strings containing non-ASCII characters.
    ch = s[i]
    if ch in ("!", "?"):  # these are unambiguous
        return True
    if ch != ".":  # if it's not a period it can't end a sentence
        return False

    # It is a period: look at its neighbours.
    if not _is_numeric_char(s[i - 1]):
        # e.g. "...and he stopped. 5 is a nice number"
        return True
    # "I have 5.2 liters" -> False; "I have work until 5.I need a friend" -> True
    return not _is_numeric_char(s[i + 1])
151
152