[98867e]: / util / inventory.py

Download this file

93 lines (68 with data), 4.0 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import sys
sys.path.append("..")
from faigen.data.sequence import Dna2VecList,regex_filter
import pandas as pd
import numpy as np
import os
from functools import partial
import configargparse
from pathlib import Path
from Bio.SeqRecord import SeqRecord
import yaml
def filter_by_count(df:pd.DataFrame, min=1)->pd.DataFrame:
res=df.copy()
drop = res.index[res.index.values[np.asarray(res.seq_count.values) < min]]
return res.drop(drop, axis=0)
def filter_by_label(df:pd.DataFrame, word:str)->pd.DataFrame:
res,mask=df.copy(),[]
for x in df.label.values: mask.append(False if word in x else True)
drop = res.index[mask]
return res.drop(drop, axis=0)
def main():
argp = configargparse.get_argument_parser()
argp.add_argument('-i', help='input folder with Fasta files', type=str, default='/data/genomes/GenSeq_fastas')
argp.add_argument('-o', help='output file name', type=str)
argp.add_argument('-g', choices=['folder', 'file'], help='granularity', type=str, default="file")
argp.add_argument('-l', choices=['description', 'id', 'file'], help='source of labels', type=str, default="description")
argp.add_argument('-lsi', help='label selector (comma delimited numbers)', type=str)
argp.add_argument('-lsr', help='regular expression for labeling', type=str)
argp.add_argument('-rxkeep', help='keep if regular expression found', type=str)
argp.add_argument('-rxdrop', help='drop if regular expression found', type=str)
argp.add_argument('-d', help='label delimiter', type=str, default=" ")
argp.add_argument('-split', help='split by folders, coma delimited string', type=str, default="train,valid,test")
argp.add_argument('-portions', help='split by folders, coma delimited string', type=str, default="0.7,0.2,0.1")
args = {k:v for k,v in vars(argp.parse_args()).items()}
input= Path(args["i"]) if args["i"] is not None else Path(".")
filters=[]
if args["rxkeep"] is not None: filters.append(partial(regex_filter, rx=args["rxkeep"]))
if args["rxdrop"] is not None: filters.append(partial(regex_filter, rx=args["rxdrop"], keep=False))
all_fastas = Dna2VecList.from_folder(input, filters=filters if len(filters) > 0 else None ).items
output = input / "inventory"
print("Creating Inventory in", str(output))
if not os.path.exists(output):
os.makedirs(output)
fn = "sequences" if args["o"] is None else args["o"]
with open(output / f'{fn}_inventory.yml', 'w') as outfile:
yaml.dump(args, outfile, default_flow_style=False)
inventory = pd.DataFrame(data=all_fastas)
if args['lsi'] is not None:
lsi = [int(x) for x in args["lsi"].split(",")]
label_source = inventory.loc[:,args['l']].values
tokens = [np.asarray(x.split(args["d"])) for x in list(label_source)]
inventory["label"] = [" ".join(t[lsi]) for t in tokens]
inventory.to_csv(Path(output / f"{fn}.csv"))
if "label" not in inventory.columns.values: return
if args["g"] == "file":
files_df = inventory.groupby(["file"]).agg({"id": ['count', list],"label":set,
"len": [list, min, max, np.mean, np.std], "description": list})
files_df.columns=["seq_count", "id", "label", 'len', 'min' , 'max', 'mean', 'std', "description"]
files_df["label"] = [list(x)[0] for x in list(files_df.label.values)]
files_df.to_csv( output / f"{fn}_by_file.csv")
label_df = inventory.groupby("label").agg({ "id": ["count", list], "len": [list, min, max, np.median],"file": list})
label_df.columns = [ "seq_count", 'id', "lengths", "min", "max", "median","files"]
label_df.files = [list(set(x)) for x in list(label_df.files.values)]
label_df["file_count"] = [len(x) for x in list(label_df.files.values)]
label_df.to_csv(output / f"{fn}_by_label.csv")
label_df.to_pickle( output / f"{fn}_by_label.pkl")
if __name__ == '__main__':
main()