[1bd6b5]: / helpers / features.py

Download this file

47 lines (39 with data), 1.3 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from typing import Iterable
from pandas import DataFrame, Series
def eval_features_frame(data: DataFrame, prefix='mentioned_'):
    """Parse stringified lists and sets of features in columns of `data`.

    Only the columns whose name starts with `prefix` are kept; every cell
    of those columns is passed through the builtin `eval`.

    Args:
        data: frame whose `prefix`-named columns hold Python expressions
            stored as strings (e.g. ``"{'a', 'b'}"`` or ``"['a']"``)
        prefix: column-name prefix selecting the columns to evaluate

    Returns:
        A new frame containing only the selected columns, with each cell
        replaced by the result of evaluating it.

    Warning:
        `eval` executes arbitrary code. Only call this on data from a
        trusted source.
    """
    selected = data[data.columns[data.columns.str.startswith(prefix)]]
    # `DataFrame.applymap` was deprecated in pandas 2.1 in favour of
    # `DataFrame.map`; prefer the new name and fall back on older pandas.
    # Looked up on the class so a column called "map" cannot shadow it.
    elementwise = getattr(DataFrame, 'map', None) or DataFrame.applymap
    # SECURITY: `eval` runs whatever expression the cell contains. If the
    # cells are guaranteed to be plain literals, `ast.literal_eval` would be
    # a safer replacement -- left unchanged here to preserve behaviour.
    return elementwise(selected, eval)
def number_of_articles_mentioning_feature(
    data_py: DataFrame, features: Iterable[str],
    exclude: Iterable[str] = None
):
    """Count the articles mentioning each term of the given feature kinds.

    For every name in `features` the column ``mentioned_<name>_set`` of
    `data_py` is read; the terms found in all those cells are pooled and
    counted.

    Args:
        data_py: features data frame evaluated by `eval_features_frame`;
            one row per article, each selected cell an iterable of terms
        features: names of feature types (column infixes) to count
        exclude: terms to leave out of the result; terms that never occur
            are ignored

    Returns:
        A frame indexed by ``rank`` (0 = most frequent) with the columns
        ``term``, ``count``, ``kind`` (the comma-joined `features`),
        ``proportion_of_features`` (count divided by the sum of counts of
        the terms remaining after exclusion) and ``proportion_of_articles``
        (count divided by the number of rows of `data_py`).

    Note:
        Each cell is counted on its own, so a term present in several of
        the selected columns for one article contributes once per column.
    """
    # Materialise once: `features` is iterated twice below, and a generator
    # would otherwise be exhausted before the `kind` label is built.
    features = list(features)
    excluded = list(exclude) if exclude else []
    columns = [f'mentioned_{feature}_set' for feature in features]

    # Pool the terms of all selected cells in row-major order. Extending a
    # single list is linear, unlike summing per-cell lists, and yields an
    # empty list (not the scalar 0) when there is nothing to count.
    terms = []
    for row in data_py[columns].itertuples(index=False, name=None):
        for cell in row:
            # Skip missing cells (None / NaN) instead of failing on them.
            if cell is None or (isinstance(cell, float) and cell != cell):
                continue
            terms.extend(cell)

    # An explicit dtype is only needed for the empty case, where pandas has
    # nothing to infer from; otherwise keep the inferred dtype.
    pooled = Series(terms) if terms else Series([], dtype=object)
    counts = (
        pooled
        .value_counts()
        # errors='ignore': excluding a term that does not occur is a no-op
        .drop(excluded, errors='ignore')
    )
    result = (
        counts
        .to_frame('count')
        .rename_axis(index='term')
        .reset_index()
        .rename_axis(index='rank')
        .assign(kind=','.join(features))
    )
    result['proportion_of_features'] = result['count'] / result['count'].sum()
    result['proportion_of_articles'] = result['count'] / len(data_py)
    return result