[f8624c]: / ai_genomics / utils / openalex.py

Download this file

315 lines (231 with data), 8.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# Functions to fetch and process OpenAlex data
import json
import logging
from collections import Counter
from itertools import chain
from typing import Dict, List, Any, Union
import boto3
import pandas as pd
from toolz import pipe, partial
from ai_genomics import config
# Concept-relevance threshold read from the project config
# (presumably a minimum OpenAlex concept score — confirm against the config file)
CONCEPT_THRES = config["concept_threshold"]

# Field whitelists for institutions, works and venues, read from the project config
INST_META_VARS, WORK_META_VARS, VENUE_META_VARS = [
    config[var_list]
    for var_list in ["inst_meta_vars", "work_meta_vars", "venue_meta_vars"]
]

# Fields kept when extracting MeSH terms from works (see make_work_concepts)
MESH_VARS = ["descriptor_ui", "descriptor_name", "qualifier_name"]

# Concept name -> OpenAlex concept id, used to build the S3 object keys
OA_NAME_ID_LOOKUP = {"artificial_intelligence": "C154945302", "genetics": "C54355233"}
def fetch_openalex(
    concept_name: str,
    year: int,
) -> List[Dict]:
    """Download one year of OpenAlex works for a concept from the S3 bucket.

    Args:
        concept_name: name of the concept (a key of OA_NAME_ID_LOOKUP)
        year: publication year to fetch

    Returns:
        The parsed JSON payload (a list of OpenAlex work dicts).
    """
    bucket = boto3.resource("s3").Bucket("ai-genomics")
    logging.info(f"Fetching {concept_name} for year {year}")
    key = (
        f"inputs/openalex/{concept_name}/openalex-works_production-True_"
        f"concept-{OA_NAME_ID_LOOKUP[concept_name]}_year-{year}.json"
    )
    raw = bucket.Object(key).get()["Body"].read().decode()
    return json.loads(raw)
def deinvert_abstract(inverted_abstract: Dict[str, List]) -> Union[str, None]:
    """Reconstruct a plain-text abstract from an OpenAlex inverted index.

    Args:
        inverted_abstract: dict mapping each word to the list of token
            positions where it occurs in the abstract

    Returns:
        The reconstructed abstract string, or None if the index is empty
        (no words, or no word has any recorded position).
    """
    if not inverted_abstract:
        return None
    positions = list(chain.from_iterable(inverted_abstract.values()))
    if not positions:
        # All position lists empty: max() would raise ValueError, and there
        # is nothing to reconstruct anyway.
        return None
    tokens = [""] * (max(positions) + 1)
    for word, word_positions in inverted_abstract.items():
        for pos in word_positions:
            tokens[pos] = word
    return " ".join(tokens)
def extract_obj_meta(oalex_object: Dict, meta_vars: List) -> Dict:
    """Keep only the whitelisted fields of an OpenAlex object.

    Args:
        oalex_object: an OpenAlex object (e.g. work, institution)
        meta_vars: names of the fields to keep

    Returns:
        A dict restricted to the requested fields.
    """
    selected = {}
    for field, value in oalex_object.items():
        if field in meta_vars:
            selected[field] = value
    return selected
def extract_work_venue(work: Dict, venue_vars: List) -> Dict:
    """Extract selected publication-venue fields nested inside a work.

    Args:
        work: an OpenAlex work (must contain a "host_venue" dict)
        venue_vars: names of the venue fields to keep

    Returns:
        A dict of the requested fields, each key prefixed with "venue_".
    """
    venue = work["host_venue"]
    return {f"venue_{field}": venue[field] for field in venue if field in venue_vars}
def make_inst_metadata(inst_list: List, meta_vars: List) -> pd.DataFrame:
    """Make a df with metadata about OpenAlex institutions.

    Args:
        inst_list: list of OpenAlex institution dicts
        meta_vars: names of the fields to keep

    Returns:
        A df with one row of institution-level metadata per institution.
    """
    # A plain comprehension replaces the previous pipe/lambda chain; the
    # docstring also documented a nonexistent `doc_list` argument.
    return pd.DataFrame(
        [extract_obj_meta(inst, meta_vars=meta_vars) for inst in inst_list]
    )
def make_work_metadata(work: Dict) -> Dict:
    """Combine top-level and venue-level metadata for a single work."""
    meta = dict(extract_obj_meta(work, meta_vars=WORK_META_VARS))
    meta.update(extract_work_venue(work, venue_vars=VENUE_META_VARS))
    return meta
def make_work_corpus_metadata(works_list: List) -> pd.DataFrame:
    """Make a df with work-level metadata.

    Args:
        works_list: list of OpenAlex work dicts

    Returns:
        A df with one row per work; OpenAlex's "id" column is renamed
        to "work_id".
    """
    # Pass the work dicts straight through: the previous pd.Series(d)
    # wrapper allocated a Series per work without changing which fields
    # extract_obj_meta / extract_work_venue pick out.
    return pd.DataFrame(
        [make_work_metadata(work) for work in works_list]
    ).rename(columns={"id": "work_id"})
def get_nested_vars(work: Dict, variable: str, keys_to_keep: List) -> Union[None, List]:
    """Extract a nested list variable (e.g. concepts, mesh) from a work.

    Args:
        work: an OpenAlex work
        variable: name of the nested variable to extract
        keys_to_keep: keys to retain from each nested dict

    Returns:
        A list of dicts, each tagged with the work id under "doc_id",
        or None when the work lacks the variable.
    """
    # Docstring previously documented `doc` / `nested_variable`, which are
    # not the actual parameter names.
    if variable not in work:  # idiomatic membership test (no .keys())
        return None
    return [
        {
            "doc_id": work["id"],
            **{k: v for k, v in item.items() if k in keys_to_keep},
        }
        for item in work[variable]
    ]
def make_work_concepts(
    works_list: List,
    variable: str = "concepts",
    keys_to_keep: List = None,
    make_df: bool = True,
) -> pd.DataFrame:
    """Extract a nested concept-like variable (OpenAlex concepts or MeSH
    terms) from every work in a corpus.

    Args:
        works_list: list of OpenAlex work dicts
        variable: name of the nested variable to extract
        keys_to_keep: keys to retain from each nested dict; defaults to
            ["id", "display_name", "score"]
        make_df: whether to return a DataFrame (True) or a lazy iterator

    Returns:
        A df (or iterator of dicts) with one row per work/concept pair.
    """
    # None sentinel avoids the mutable-default-argument pitfall while
    # keeping the same effective default.
    keys_to_keep = (
        ["id", "display_name", "score"] if keys_to_keep is None else keys_to_keep
    )
    extracted = [
        get_nested_vars(doc, variable=variable, keys_to_keep=keys_to_keep)
        for doc in works_list
    ]
    # get_nested_vars returns None for works missing the variable; skip
    # those entries — chaining a None would raise TypeError.
    flat = chain.from_iterable(e for e in extracted if e is not None)
    return pd.DataFrame(flat) if make_df else flat
def get_authorships(work: Dict) -> List:
    """Parse the authorships of a work into flat dicts.

    Args:
        work: an OpenAlex work

    Returns:
        One dict per (author, institution) pair: authors affiliated with
        several institutions yield several rows (and authors with no
        institution entries yield none).
    """
    rows = []
    for auth in work["authorships"]:
        author_fields = {f"auth_{k}": v for k, v in auth["author"].items()}
        for inst in auth["institutions"]:
            rows.append(
                {
                    "id": work["id"],
                    **author_fields,
                    "affiliation_string": auth["raw_affiliation_string"],
                    **{f"inst_{k}": v for k, v in inst.items() if k == "id"},
                }
            )
    return rows
def make_work_authorships(works_list: List) -> pd.DataFrame:
    """Create a df with one row per (work, author, institution) triple.

    Args:
        works_list: list of OpenAlex work dicts
    """
    per_work = (get_authorships(doc) for doc in works_list)
    return pd.DataFrame(chain.from_iterable(per_work))
def make_citations(work_list: List) -> Dict:
    """Map each work id to the list of works it references."""
    citations = {}
    for doc in work_list:
        citations[doc["id"]] = doc["referenced_works"]
    return citations
def make_deinverted_abstracts(work_list: List) -> Dict:
    """Map each work id to its deinverted abstract (where available).

    Args:
        work_list: list of OpenAlex work dicts

    Returns:
        Dict of work id -> reconstructed abstract string, or None when the
        work has no dict-valued inverted abstract.
    """
    results = {}
    for doc in work_list:
        # .get() tolerates works missing the field entirely (the previous
        # direct indexing raised KeyError); isinstance replaces the
        # non-idiomatic `type(x) == dict` check.
        inverted = doc.get("abstract_inverted_index")
        results[doc["id"]] = (
            deinvert_abstract(inverted) if isinstance(inverted, dict) else None
        )
    return results
if __name__ == "__main__":
    # Smoke-test the processing pipeline on one year of AI works.
    # (The duplicate `import logging` was removed — it is already imported
    # at the top of the module.)
    from ai_genomics.getters.openalex import get_openalex_instits

    logging.info("getting data")
    instits = get_openalex_instits()
    works = fetch_openalex("artificial_intelligence", 2007)

    logging.info("Checking institutions")
    inst = make_inst_metadata(instits, INST_META_VARS)
    logging.info(inst.head())

    logging.info("Checking works")
    works_df = make_work_corpus_metadata(works)
    logging.info(works_df.head())

    logging.info("Checking authorships")
    authorships = make_work_authorships(works)
    logging.info(authorships.head())

    logging.info("Checking concepts")
    oa_concepts = make_work_concepts(works, variable="concepts")
    logging.info(oa_concepts.head())
    mesh_subjects = make_work_concepts(works, variable="mesh", keys_to_keep=MESH_VARS)
    logging.info(mesh_subjects.head())

    logging.info("Checking citations")
    logging.info(list(make_citations(works).values())[0])

    logging.info("checking deinverted abstracts")
    logging.info(list(make_deinverted_abstracts(works).values())[0])