NLP_CRT / Git / [8d2107] /language

Models:
philipB/
NLP_CRT
Downloads: 1
[8d2107]: / language_processing.py
History
Download this file
153 lines (134 with data), 4.5 kB

import re
import datetime
from datetime import timedelta


#----------------------------------------------------------
# Functions for parsing the header of a note file and modifying
# the JSON to incorporate these structured fields
#----------------------------------------------------------

def parse_m_d_y(s, century_pivot=17):
    """
    description
        finds the first month/day/year date in a string; the separators may be
        '/' or '-' and the year may have 4 or 2 digits
    input
        s: a string that may contain a date such as "3/14/2015" or "03-14-98"
        century_pivot: two-digit years >= this value are read as 19xx, smaller
            ones as 20xx (default 17, the cutoff that used to be hard-coded)
    output
        a datetime.date, or None if no date pattern is found
    raises
        ValueError (from datetime.date) if the matched numbers do not form a
        valid calendar date, e.g. month 13
    """
    # I (Josh) added this to catch a date that had a typo in it
    s = s.replace(".", '')
    re_m_d_y = r"([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{4})|([0-9]{1,2})[/-]([0-9]{1,2})[/-]([0-9]{2})"
    match = re.search(re_m_d_y, s)
    if match is None:
        return None

    # Groups 1-3 belong to the 4-digit-year alternative, groups 4-6 to the
    # 2-digit-year alternative; exactly one of the two sets is populated.
    month, day, year, short_month, short_day, short_year = match.groups()
    if month is None:
        month, day = short_month, short_day
        century = '19' if int(short_year) >= century_pivot else '20'
        year = century + short_year
    return datetime.date(int(year), int(month), int(day))

def parse_m_y(s, century_pivot=17):
    """
    description
        finds the first month/year date in a string (separator '/'); the year
        may have 4 or 2 digits and the day is always set to 1
    input
        s: a string that may contain a date such as "5/2015" or "5/98"
        century_pivot: two-digit years >= this value are read as 19xx, smaller
            ones as 20xx (default 17, the cutoff that used to be hard-coded)
    output
        a datetime.date on the first of the month, or None if no date pattern
        is found
    raises
        ValueError (from datetime.date) if the matched month is not a valid
        month number
    """
    re_m_y = r"([0-9]{1,2})/([0-9]{4})|([0-9]{1,2})/([0-9]{2})"
    match = re.search(re_m_y, s)
    if match is None:
        return None

    # Groups 1-2 belong to the 4-digit-year alternative, groups 3-4 to the
    # 2-digit-year alternative; exactly one of the two pairs is populated.
    month, year, short_month, short_year = match.groups()
    if month is None:
        month = short_month
        century = '19' if int(short_year) >= century_pivot else '20'
        year = century + short_year
    return datetime.date(int(year), int(month), 1)

def parse_date(s):
    """
    description
        parses a date out of a string, trying the month/day/year form first
        and falling back to the month/year form
    input
        s: a string
    output
        whatever the first successful parser returns, or None if neither
        parser finds a date
    """
    for parser in (parse_m_d_y, parse_m_y):
        parsed = parser(s)
        if parsed:
            return parsed
    return None

def format_date(s):
    """
    description
        renders a date in the form "MM/DD/YYYY 00:00"
    input
        s: either a string (byte or unicode, including subclasses), which is
           parsed with parse_date, or an object that already provides
           strftime (e.g. a datetime.date), which is used as is
    output
        the formatted string
    raises
        AttributeError if s is a string in which parse_date finds no date,
        because parse_date then returns None
    """
    # type("") and type(u"") cover str/unicode on Python 2 and str on Python 3
    string_types = (type(""), type(u""))
    if isinstance(s, string_types):
        date = parse_date(s)
    else:
        date = s
    return date.strftime("%m/%d/%Y 00:00")

#----------------------------------------------------------
# Functions for extracting sentences from text
#----------------------------------------------------------


def get_sentence(s, i, index = False):
    """
    description
        given an index i in a string, extracts the sentence [a, b] where b >= i
        this means if s[i] = '.', then this will return the preceding sentence
        note: the string is normalized first ('\\r' removed, '\\n' replaced by
        '. '), so i and the returned indices refer to the normalized string
    input
        s: a string
        i: the index of the string that should be contained in the output sentence
        index: if true, then the indices [a, b) that define the sentence are returned
            otherwise, the string s[a:b] is returned (default False)
    output
        if index = True
            a tuple (a, b)
        else
            a string s[a:b]
    """
    # A new line must end a sentence; carriage returns carry no information
    s = s.replace("\r", "")
    s = s.replace("\n", ". ")#TODO: more careful way of achieving this effect

    # Start the backwards scan just before i, but never before the start of
    # the string: for i == 0 an unclamped -1 would act as a negative slice
    # index and select the wrong text.
    a = max(i - 1, 0)
    #find the end of the previous sentence
    while a > 0 and not is_sentence_end(s, a-1):
        a -= 1
    b = i
    #find end of sentence
    while b < len(s) and not is_sentence_end(s, b):
        b += 1

    #return tuple or string based on index variable
    if index:
        return (a, b+1)
    return s[a : b+1]

def split_sentences(s):
    """
    description
        uses get_sentence to tokenize a string into sentences; '\\r' is removed
        and '\\n' is replaced by '. ' before splitting
    input
        s: string
    output
        a list of sentences, each stripped of leading/trailing spaces
    """
    text = s.replace("\r", "").replace("\n", ". ")
    sentences = []
    start = 0
    while start < len(text):
        # only the end boundary is needed; the piece runs from where the
        # previous one stopped
        end = get_sentence(text, start, True)[1]
        sentences.append(text[start:end].strip(" "))
        start = end
    return sentences



def _is_numeric_char(ch):
    # unicode (Python 2) and str (Python 3) provide isnumeric(); Python 2 byte
    # strings only provide isdigit(), which agrees with it for ASCII input.
    check = getattr(ch, "isnumeric", None)
    if check is None:
        check = ch.isdigit
    return check()


def is_sentence_end(s, i):
    """
    description
        helper function to identify whether the character at index i of a
        string ends a sentence
        characters are compared directly instead of being passed through
        unicode(), so byte strings containing non-ASCII bytes do not raise
        and the function also runs where unicode is not defined
    input
        s: a string
        i: the index of the character to test (typically a '.')
    output
        boolean of whether that character indicates the end of a sentence
    """
    if i == 0: #first character of the string never ends a sentence
        return False
    if i >= len(s)-1: #last character of the string always ends a sentence
        return True

    current = s[i]
    if current in ('!', '?'): #these are unambiguous
        return True
    if current != '.': #if it's not a period it can't end a sentence
        return False

    #this is the case that it is a period
    #TODO: include cases for Mr. , Mrs. or any arbitrary list of abbreviations
    if not _is_numeric_char(s[i-1]): #e.g. "...and he stopped. 5 is a nice number"
        return True
    if _is_numeric_char(s[i+1]): #e.g. "I have 5.2 liters"
        return False
    return True #e.g. "I have work until 5.I need a friend"