[f8624c]: / ai_genomics / getters / crunchbase.py

Download this file

72 lines (52 with data), 2.3 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from ai_genomics import PROJECT_DIR, logger, bucket_name
import pandas as pd
from typing import Mapping, Union
from ai_genomics.pipeline.crunchbase_data.make_crunchbase_data import (
CB_COMP_PATH,
CB_COMP_NAME,
)
from ai_genomics.getters.data_getters import load_s3_data
def get_ai_genomics_crunchbase_org_ids() -> pd.DataFrame:
    """Return the dataframe of AI and Genomics Crunchbase organisation ids.

    Reads ``inputs/data/crunchbase/ai_genomics_org_ids.csv`` located under
    ``PROJECT_DIR``.

    Returns:
        pd.DataFrame: The contents of the CSV as parsed by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If the CSV does not exist. An error message naming
            the script that creates the file is logged before re-raising.
    """
    # Build the path outside the ``try`` so only the read itself is guarded.
    org_ids_path = PROJECT_DIR / "inputs/data/crunchbase/ai_genomics_org_ids.csv"
    try:
        return pd.read_csv(org_ids_path)
    except FileNotFoundError:
        logger.error(
            "FileNotFoundError: To create the missing file, run ai_genomics/analysis/crunchbase_definitions.py"
        )
        # Bare ``raise`` re-raises the active exception with its traceback intact.
        raise
def get_ai_genomics_crunchbase_orgs(local: bool = True) -> pd.DataFrame:
    """Load the table describing AI, Genomics, or AI-and-genomics companies.

    Args:
        local: When True, the table is read from the local file at
            ``CB_COMP_PATH``. When False, it is fetched from the S3 bucket.

    Returns:
        The local CSV parsed by ``pd.read_csv``, or whatever
        ``load_s3_data`` returns for the S3 object.
    """
    if not local:
        # NOTE(review): the bucket is hard-coded here while the other getters
        # in this module pass `bucket_name` — confirm the two are the same.
        s3_key = f"outputs/crunchbase/{CB_COMP_NAME}"
        return load_s3_data("ai-genomics", s3_key)

    return pd.read_csv(CB_COMP_PATH)
def get_crunchbase_entities() -> Mapping[str, Mapping[str, Union[str, str]]]:
    """Load the AI genomics Crunchbase entities from S3.

    Returns:
        The result of ``load_s3_data`` for the object
        ``outputs/entity_extraction/cb_lookup_clean.json`` in the bucket
        named by ``bucket_name``.
    """
    # NOTE(review): `Union[str, str]` in the return annotation collapses to
    # plain `str`; confirm whether a second value type was intended.
    entities_key = "outputs/entity_extraction/cb_lookup_clean.json"
    return load_s3_data(bucket_name, entities_key)
def get_crunchbase_ai_genomics_entity_groups(k: int = 500) -> pd.DataFrame:
    """Fetch the vectors marking which DBpedia entity clusters occur in each document.

    The file is loaded from S3; its key is derived from ``k``.

    Args:
        k (int, optional): The number of clusters. Defaults to 500.

    Returns:
        pd.DataFrame: A sparse dataframe whose index is company IDs and whose
            columns are vector dimensions (entity cluster IDs).
    """
    return load_s3_data(
        bucket_name,
        f"inputs/entities/crunchbase_entity_group_vectors_k_{k}.csv",
    )
def get_crunchbase_ai_genomics_description_embeddings() -> pd.DataFrame:
    """Fetch the description embeddings together with their company IDs.

    The data is loaded from S3 via ``load_s3_data``.

    Returns:
        pd.DataFrame: Description embeddings and the associated company IDs.
    """
    return load_s3_data(
        bucket_name,
        "inputs/embedding/cb_ai_genomics_embeddings.csv",
    )