[f8624c]: ai_genomics/utils/text_modeling.py


import re
from string import digits, punctuation
from typing import Any, Dict, List, Optional, Tuple

import tomotopy as tp
import toolz.curried as t
from gensim.models import Phrases
from gensim.models.phrases import FrozenPhrases
from pandas import DataFrame

# Regex alternations matching any single punctuation character or digit.
PUNCT = "|".join(re.escape(x) for x in punctuation)
DIGITS = "|".join(digits)


def remove_digits(doc: str) -> str:
    """Remove digits from a document."""
    return re.sub(DIGITS, "", doc)


def remove_short_tokens(doc: List[str]) -> List[str]:
    """Remove tokens shorter than three characters."""
    return [d for d in doc if len(d) > 2]


def remove_symbols(doc: str) -> str:
    """Lower-case a document, strip punctuation and replace newlines with spaces."""
    return re.sub("\n", " ", re.sub(PUNCT, "", doc.lower()))


def remove_stop_punct(doc) -> List[str]:
    """Remove stop words and punctuation from a spaCy Doc, returning lower-cased tokens."""
    return [d.lower_ for d in doc if not d.is_punct and not d.is_stop]
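

# A minimal usage sketch (not part of the original module): the string helpers
# clean raw text, while `remove_stop_punct` expects spaCy tokens, so `nlp` here
# is assumed to be a loaded spaCy pipeline, e.g. `spacy.load("en_core_web_sm")`.
#
#   raw = "Gene editing (CRISPR) was described in 2012.\n"
#   cleaned = remove_digits(remove_symbols(raw))
#   tokens = remove_short_tokens(remove_stop_punct(nlp(cleaned)))
#   # -> ["gene", "editing", "crispr", "described"]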


def build_ngrams(
    documents: List[List[str]],
    n: int = 2,
    phrase_kws: Optional[Dict[str, Any]] = None,
) -> Tuple[List[List[str]], Optional[FrozenPhrases]]:
    """Create n-grams using Gensim's Phrases.

    Args:
        documents: List of tokenised documents.
        n: The `n` in n-gram.
        phrase_kws: Passed to `gensim.models.Phrases`.

    Returns:
        The n-grammed documents and the final fitted phraser
        (`None` if `n < 2`).
    """
    if n < 2:
        return documents, None

    def_phrase_kws = {
        "scoring": "npmi",
        "threshold": 0.25,
        "min_count": 2,
        "delimiter": "_",
    }
    phrase_kws = t.merge(def_phrase_kws, phrase_kws or {})

    def step(documents, order):
        print(f"N-gram: {order}")
        phraser = FrozenPhrases(Phrases(documents, **phrase_kws))
        return phraser[documents], phraser

    for order in range(2, n + 1):
        documents, phraser = step(documents, order)
    return documents, phraser
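

# A hedged usage sketch (assumed, not in the original file): with the default
# NPMI scoring and 0.25 threshold, strongly co-occurring pairs are merged.
#
#   docs = [["machine", "learning", "model"], ["machine", "learning", "method"]]
#   trigrammed, phraser = build_ngrams(docs, n=3)
#   # Pairs such as "machine learning" may become the single token
#   # "machine_learning" if their NPMI score clears the threshold.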


def train_lda(
    docs: List[List[str]], k: int = 50, top_remove: int = 500, verbose: bool = False
) -> tp.LDAModel:
    """Train an LDA model on a list of tokenised documents.

    Args:
        docs: List of tokenised documents.
        k: Number of topics.
        top_remove: Number of top words to remove.
        verbose: Print progress (NB we always print the output topics).

    Returns:
        The trained tomotopy LDA model.
    """
    mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=top_remove, k=k)
    for n, doc in enumerate(docs):
        idx = mdl.add_doc(doc)
        if n != idx:  # Flag documents whose index drifted (e.g. a skipped empty doc)
            print(n)
    mdl.burn_in = 100
    mdl.train(0)  # Initialise the model before the training loop
    if verbose:
        print(
            "Num docs:",
            len(mdl.docs),
            ", Vocab size:",
            len(mdl.used_vocabs),
            ", Num words:",
            mdl.num_words,
        )
        print("Removed top words:", mdl.removed_top_words)
        print("Training...", flush=True)
    for i in range(0, 1000, 10):
        mdl.train(10)
        if verbose:
            print("Iteration: {}\tLog-likelihood: {}".format(i, mdl.ll_per_word))
    mdl.summary()
    for topic in range(mdl.k):  # `topic` rather than `k`, to avoid shadowing the argument
        print("Topic #{}".format(topic))
        for word, prob in mdl.get_topic_words(topic):
            print("\t", word, prob, sep="\t")
    return mdl
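

# A minimal usage sketch (assumed): `trigrammed` is the hypothetical output of
# the build_ngrams sketch above.
#
#   mdl = train_lda(trigrammed, k=20, top_remove=100, verbose=True)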


def create_topic_names(mdl: tp.LDAModel, k: int, n_words: int = 10) -> list:
    """Name each topic by joining its `n_words` highest-weighted words."""
    return [
        "_".join(word for word, _ in mdl.get_topic_words(topic, top_n=n_words))
        for topic in range(k)
    ]


def create_doc_topics(mdl: tp.LDAModel, topic_names: list, doc_ids: list) -> DataFrame:
    """Create a dataframe with topic distributions per document."""
    return DataFrame(
        [mdl.docs[n].get_topic_dist() for n in range(len(mdl.docs))],
        columns=topic_names,
        index=doc_ids,
    )
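

# A hedged end-to-end sketch (assumed; `doc_ids` is a hypothetical list of
# identifiers aligned with the documents passed to train_lda):
#
#   names = create_topic_names(mdl, k=mdl.k, n_words=5)
#   doc_topics = create_doc_topics(mdl, topic_names=names, doc_ids=doc_ids)
#   doc_topics.head()  # one row per document, one column per named topic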