[f8624c]: / ai_genomics / getters / data_getters.py

Download this file

91 lines (75 with data), 2.8 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pickle
from fnmatch import fnmatch
import boto3
import json
import pandas as pd
from ai_genomics import logger
from typing import Union, List
from decimal import Decimal
# Module-level boto3 S3 resource, created once when this module is imported
# and shared by the S3 helper functions defined below.
S3 = boto3.resource("s3")
def get_s3_dir_files(bucket_name: str, dir_name: str) -> List[str]:
    """
    List the keys of the objects stored under a bucket directory.

    Args:
        bucket_name: S3 bucket name
        dir_name: S3 bucket directory name, passed as the ``Prefix`` filter

    Returns:
        Keys of every object returned by the prefix-filtered bucket listing
    """
    bucket = S3.Bucket(bucket_name)
    keys = []
    for summary in bucket.objects.filter(Prefix=dir_name):
        keys.append(summary.key)
    return keys
def load_s3_data(
    bucket_name: str, file_name: str
) -> Union[pd.DataFrame, str, dict, List[List[str]], None]:
    """
    Load data from S3 location.

    The loader is picked from the extension of ``file_name``:

    - ``*.csv``: read with ``pd.read_csv``
    - ``*.tsv.zip``: read with ``pd.read_csv`` (zip compression, tab separator)
    - ``*.pickle`` / ``*.pkl``: unpickled with ``pickle.loads``
    - ``*.txt``: decoded, split into lines, each line split on tabs
    - ``*.json``: decoded and parsed with ``json.loads``

    Args:
        bucket_name: The S3 bucket name
        file_name: S3 key to load

    Returns:
        Loaded data from S3 location, or None (after logging an error) when
        the file extension is not one of the supported types.
    """
    obj = S3.Object(bucket_name, file_name)
    s3_uri = f"s3://{bucket_name}/{file_name}"

    if fnmatch(file_name, "*.csv"):
        return pd.read_csv(s3_uri)
    elif fnmatch(file_name, "*.tsv.zip"):
        return pd.read_csv(s3_uri, compression="zip", sep="\t")
    elif fnmatch(file_name, "*.pickle") or fnmatch(file_name, "*.pkl"):
        # NOTE(security): unpickling executes arbitrary code embedded in the
        # payload; only load pickles from buckets whose contents are trusted.
        file = obj.get()["Body"].read()
        return pickle.loads(file)
    elif fnmatch(file_name, "*.txt"):
        file = obj.get()["Body"].read().decode()
        # One list of tab-separated fields per line; a trailing newline in
        # the file yields a final [""] row.
        return [f.split("\t") for f in file.split("\n")]
    elif fnmatch(file_name, "*.json"):
        file = obj.get()["Body"].read().decode()
        return json.loads(file)
    else:
        # logger.error rather than logger.exception: there is no active
        # exception here, so .exception would append a bogus
        # "NoneType: None" traceback to the log record.
        logger.error(
            'Function not supported for file type other than "*.json", "*.txt", "*.pickle", "*.pkl", "*.tsv.zip" and "*.csv"'
        )
        return None
def save_to_s3(bucket_name: str, output_var, output_file_dir: str):
    """
    Save data to S3 location.

    The serialisation is picked from the extension of ``output_file_dir``:

    - ``*.pkl`` / ``*.pickle``: ``output_var`` is pickled
    - ``*.txt``: ``output_var`` is uploaded as-is as the object body
    - ``*.csv``: ``output_var.to_csv`` is called (without the index)
    - ``*.json``: ``output_var`` is serialised with ``json.dumps``

    For any other extension an error is logged and nothing is written.

    Args:
        bucket_name: The S3 bucket name
        output_var: Object to be saved
        output_file_dir: file path to save object to
    """
    obj = S3.Object(bucket_name, output_file_dir)

    if fnmatch(output_file_dir, "*.pkl") or fnmatch(output_file_dir, "*.pickle"):
        obj.put(Body=pickle.dumps(output_var))
    elif fnmatch(output_file_dir, "*.txt"):
        obj.put(Body=output_var)
    elif fnmatch(output_file_dir, "*.csv"):
        output_var.to_csv(f"s3://{bucket_name}/{output_file_dir}", index=False)
    elif fnmatch(output_file_dir, "*.json"):
        obj.put(Body=json.dumps(output_var))
    else:
        # logger.error rather than logger.exception: no exception is being
        # handled here, so .exception would log a bogus "NoneType: None".
        logger.error(
            'Function not supported for file type other than "*.json", "*.txt", "*.pickle", "*.pkl" and "*.csv"'
        )
        # Nothing was written, so skip the success message below.
        return

    logger.info(f"Saved to s3://{bucket_name}/{output_file_dir} ...")