[1bd6b5]: / helpers / text_processing.py

Download this file

119 lines (96 with data), 4.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from difflib import SequenceMatcher
from re import escape
from typing import Dict, Tuple
from pandas import DataFrame, Series
def matches_n_consecutive_words(text: str, database: set, consecutive_n: int) -> list:
    """Find phrases from *database* that occur in *text* as runs of consecutive words.

    A phrase is one or more words separated by whitespace. Spans of 1 up to
    *consecutive_n* consecutive words are checked against the set, which makes
    each membership test O(1).

    Args:
        text: text to scan; split on whitespace.
        database: set of known phrases (words joined by single spaces).
        consecutive_n: maximum number of consecutive words per candidate phrase.

    Returns:
        List of all matching phrases, ordered by span size then position
        (duplicates kept if a phrase occurs more than once).
    """
    words = text.split()
    matches = []
    for span_size in range(1, consecutive_n + 1):
        # upper bound keeps the span inside the word list — no per-iteration
        # bounds check needed (original tested start + span <= len(words) inside the loop)
        for start in range(len(words) - span_size + 1):
            candidate = ' '.join(words[start:start + span_size])
            if candidate in database:
                matches.append(candidate)
    return matches
def highlight_first(text: str, keyword: str, margin: int = 50):
pos = text.index(keyword)
return text[max(0, pos - margin):min(pos + margin, len(text))]
def check_usage(
    term: str, data: DataFrame, column: str,
    words: bool = True, highlight: str = None, limit: int = None
):
    """Show snippets of rows in *column* that contain *term*.

    Args:
        term: text (or regex when ``words=False``) to search for.
        data: frame holding the text column.
        column: name of the text column to search.
        words: if True, match *term* only at word boundaries (escaped literally).
        highlight: substring to center snippets on; defaults to term.
        limit: optional cap on the number of snippets returned.
    """
    if words:
        # \b = a word break
        pattern = fr'\b{escape(term)}\b'
    else:
        pattern = term
    snippet_anchor = highlight if highlight else term
    matching = data.loc[data[column].str.contains(pattern), column]
    snippets = matching.apply(highlight_first, keyword=snippet_anchor)
    return snippets.head(limit) if limit else snippets
def sequence_similarity_ratio(a: str, b: str):
    """Similarity of two strings in [0, 1] via difflib, with the autojunk heuristic off."""
    matcher = SequenceMatcher(None, a, b, autojunk=False)
    return matcher.ratio()
def find_term_typos(term_counts: Series, threshold: int) -> DataFrame:
    """Pair rare terms with similar popular terms as potential typos.

    Terms with a count below *threshold* are compared against all terms at or
    above it; pairs with SequenceMatcher similarity > 0.9 are kept.

    Args:
        term_counts: Series mapping term → occurrence count (e.g. from value_counts).
        threshold: counts below this mark a term as "rare".

    Returns:
        DataFrame with columns rare_term, popular_term, similarity, sorted by
        descending similarity (ties broken by popular_term then rare_term).
        Always has these columns, even when empty.
    """
    rare_terms = term_counts.index[term_counts < threshold]
    popular_terms = term_counts.index[term_counts >= threshold]
    # explicit columns= so an empty candidate list still yields the expected
    # schema — DataFrame([]) has no columns and `.similarity` below would raise
    candidates = DataFrame(
        [
            {
                'rare_term': rare,
                'popular_term': popular,
                'similarity': sequence_similarity_ratio(rare, popular),
            }
            for rare in rare_terms
            for popular in popular_terms
        ],
        columns=['rare_term', 'popular_term', 'similarity'],
    )
    potential_typos = (
        candidates[candidates.similarity > 0.9]
        .sort_values(['similarity', 'popular_term', 'rare_term'], ascending=False)
        .reset_index(drop=True)
    )
    return potential_typos
def create_typos_map(
    potential_typos: DataFrame,
    is_typo: Dict[Tuple[str, str], bool]
) -> Dict[str, str]:
    """Create a mapping of rare_term → popular term

    Every (rare_term, popular_term) pair from *potential_typos* must appear as a
    key in *is_typo*; pairs marked True end up in the returned mapping.

    Args:
        potential_typos: a frame returned by find_term_typos
        is_typo: manual mapping of tuple (rare_term, popular_term) to boolean values

    Raises:
        ValueError: if the keys of is_typo do not exactly match the pairs in
            potential_typos.
    """
    answered_pairs = set(is_typo)
    expected_pairs = set(
        potential_typos[['rare_term', 'popular_term']].apply(tuple, axis=1)
    )
    if answered_pairs != expected_pairs:
        raise ValueError(
            f'{answered_pairs - expected_pairs} too much, '
            f'{expected_pairs - answered_pairs} missing'
        )
    return {
        rare: popular
        for (rare, popular), confirmed in is_typo.items()
        if confirmed
    }
def prefix_remover(prefix: str, enforce=True):
    """Build a function that strips *prefix* from the front of a string.

    Args:
        prefix: the prefix to remove.
        enforce: when True, the returned function raises ValueError if the
            prefix is absent; when False, the input is returned unchanged.
    """
    def remove_prefix(text: str):
        if not text.startswith(prefix):
            if enforce:
                raise ValueError(f'Prefix {prefix!r} missing in {text!r}')
            return text
        return text[len(prefix):]
    return remove_prefix
def report_hyphenation_trends(terms: Series) -> DataFrame:
    """Pair terms that differ only in hyphenation, popular spelling first.

    Terms are ranked by frequency (``value_counts``); for each pair whose
    hyphen-stripped forms are equal, the more frequent spelling is reported
    as `more_popular` and the less frequent as `less_popular`.

    Args:
        terms: Series of term occurrences (one row per occurrence).

    Returns:
        DataFrame with columns more_popular, less_popular. Columns are always
        present, even when no variants are found, so the result is safe to
        feed into harmonise_hyphenation.
    """
    counts = terms.value_counts()
    ranked = list(counts.index)
    # hoist: hyphen-stripped form of every term computed once, not O(n^2) times
    stripped = [term.replace('-', '') for term in ranked]
    mapping = []
    for i, more_popular_term in enumerate(ranked):
        for j in range(i + 1, len(ranked)):
            if stripped[i] == stripped[j]:
                mapping.append({
                    'more_popular': more_popular_term,
                    'less_popular': ranked[j],
                })
    # explicit columns= keeps the schema stable for an empty result
    return DataFrame(mapping, columns=['more_popular', 'less_popular'])
def harmonise_hyphenation(terms: Series, trends: DataFrame) -> Series:
    """Replace each less-popular hyphenation variant with its popular spelling.

    Args:
        terms: Series of terms to normalise.
        trends: frame with more_popular / less_popular columns, as returned by
            report_hyphenation_trends.

    Returns:
        A new Series with every `less_popular` value replaced by its
        `more_popular` counterpart; the input Series is not modified.
    """
    # Guard: a fully empty trends frame (no rows AND no columns, e.g. built
    # from an empty list) would make set_index raise KeyError below.
    if trends.empty:
        return terms.copy()
    replacements = trends.set_index('less_popular').more_popular.to_dict()
    return terms.replace(replacements)