[f8624c]: / ai_genomics / analysis / crunchbase_definitions.py

Download this file

136 lines (100 with data), 4.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Script to test definitions in the crunchbase data
import logging
from typing import Set, Union
import pandas as pd
import numpy as np
from toolz import pipe
from ai_genomics.utils.crunchbase import fetch_crunchbase, parse_s3_table
from ai_genomics import PROJECT_DIR, config
# Directory (under the project root) for Crunchbase-derived input data; the
# script section of this module creates it if needed and writes
# "ai_genomics_org_ids.csv" into it.
CB_INPUTS_DATA_DIR = PROJECT_DIR / "inputs/data/crunchbase/"
def search_terms(abstract: str, terms: Set) -> Union[bool, float]:
    """Checks if a string contains any of the terms in a set.

    Matching is a plain substring test against the lower-cased abstract.
    The terms themselves are used as given (they are NOT lower-cased here),
    so a term containing upper-case characters can never match.

    Args:
        abstract: string to check
        terms: set of terms to check for

    Returns:
        True if any of the terms are found, False otherwise (and
        nan if we don't have an abstract)
    """
    # Guard clause on truthiness rather than `is False`: an identity check
    # against the bool singleton silently treats any non-`bool` falsy result
    # as "missing".
    if pd.isnull(abstract):
        return np.nan

    # Lower-case once instead of once per term.
    abstract_lower = abstract.lower()
    return any(term in abstract_lower for term in terms)
if __name__ == "__main__":
    # Script flow:
    #   1. find organisations tagged with the genetics / AI categories,
    #   2. find organisations whose descriptions mention genomics / AI terms,
    #   3. combine both definitions and save the ids in the AI-genomics overlap,
    #   4. write a few random example organisations per group to markdown.

    # ---- 1. Category-based definition ----
    logging.info("Check organisations in relevant categories")

    cb_cat_comps = pipe(fetch_crunchbase("org_cats"), parse_s3_table)

    # Get set of organisations in each of the categories
    gen_cats, ai_cats = [
        set(cb_cat_comps.query(f"category_name=='{cat}'")["organization_id"])
        for cat in ["genetics", "artificial intelligence"]
    ]

    logging.info(f"Genetics category organisations: {len(gen_cats)}")
    logging.info(f"Artificial intelligence category organisations: {len(ai_cats)}")
    logging.info(f"organisations in both categories:{len(gen_cats & ai_cats)}")

    # ---- 2. Description-based definition ----
    logging.info("Check organisations with relevant abstracts")

    cb_comps = pipe(fetch_crunchbase("orgs"), parse_s3_table)

    logging.info(f"total organisations: {len(cb_comps)/1e6} M")

    ai_terms, genom_terms = [
        config[category] for category in ["ai_cb_terms", "genom_cb_terms"]
    ]

    # NOTE: str() turns a missing description into literal text (e.g. "nan"),
    # so the combined description is never null and search_terms() never
    # takes its NaN branch here; the has_* flags are therefore plain booleans.
    cb_comps["description_combined"] = [
        f"{str(descr_short)} {str(descr_long)}"
        for descr_short, descr_long in zip(
            cb_comps["short_description"], cb_comps["long_description"]
        )
    ]

    cb_comps["has_ai"], cb_comps["has_genom"] = [
        [search_terms(descr, terms) for descr in cb_comps["description_combined"]]
        for terms in [ai_terms, genom_terms]
    ]

    logging.info(f"Genomics terms organisations: {sum(cb_comps['has_genom'])}")
    logging.info(
        f"Artificial intelligence terms organisations: {sum(cb_comps['has_ai'])}"
    )
    logging.info(
        f"organisations with terms in both categories:{sum(cb_comps['has_ai'] & cb_comps['has_genom'])}"
    )

    # Extract ids for ai / genom description organisations.
    # `== True` (rather than plain boolean indexing) keeps the selection valid
    # even if a flag column were to contain NaN.
    ai_descr, genom_descr = [
        set(cb_comps[cb_comps[f"has_{var}"] == True]["id"]) for var in ["ai", "genom"]
    ]

    # ---- 3. Combine both definitions ----
    ai_combined = ai_cats.union(ai_descr)
    gen_combined = gen_cats.union(genom_descr)
    ai_gen_combined = ai_combined & gen_combined

    # Save AI genomics crunchbase org ids
    CB_INPUTS_DATA_DIR.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(ai_gen_combined, columns=["cb_org_id"]).to_csv(
        CB_INPUTS_DATA_DIR / "ai_genomics_org_ids.csv", index=False
    )

    logging.info(f"Genomics terms combined: {len(gen_combined)}")
    logging.info(f"Artificial intelligence terms organisations: {len(ai_combined)}")
    logging.info(f"organisations with terms in both categories:{len(ai_gen_combined)}")

    # ---- 4. Random examples per group ----
    logging.info("Get 5 random examples")

    n_examples = 5
    example_fields = ["name", "description_combined"]
    sample_orgs = []

    for cat, name in zip(
        [ai_combined, gen_combined, ai_gen_combined],
        ["ai_combined", "gen_combined", "ai_gen_combined"],
    ):
        relevant = cb_comps.loc[cb_comps["id"].isin(cat)]

        # DataFrame.sample raises ValueError when asked for more rows than
        # exist (without replacement), so cap the request at the group size.
        sampled = relevant.sample(min(n_examples, len(relevant)))

        # Distinct loop variable: the original rebound `sampled` to each row
        # while iterating over it.
        for _, org_row in sampled.iterrows():
            record = {k: v for k, v in dict(org_row).items() if k in example_fields}
            record["category"] = name
            sample_orgs.append(record)

    # Build the examples table once and reuse it for logging and export.
    sample_df = pd.DataFrame(sample_orgs)
    logging.info(sample_df.head())

    # Make sure the target directory exists before writing, mirroring the
    # handling of the inputs directory above.
    (PROJECT_DIR / "outputs").mkdir(parents=True, exist_ok=True)

    (
        sample_df.assign(
            # Truncate long descriptions so the markdown table stays readable.
            description_combined=lambda df: df["description_combined"].str[:400] + "..."
        ).to_markdown(
            f"{PROJECT_DIR}/outputs/crunchbase_examples_reproduce.md", index=False
        )
    )