|
a |
|
b/util/inventory.py |
|
|
1 |
import sys |
|
|
2 |
sys.path.append("..") |
|
|
3 |
from faigen.data.sequence import Dna2VecList,regex_filter |
|
|
4 |
import pandas as pd |
|
|
5 |
import numpy as np |
|
|
6 |
import os |
|
|
7 |
from functools import partial |
|
|
8 |
import configargparse |
|
|
9 |
from pathlib import Path |
|
|
10 |
from Bio.SeqRecord import SeqRecord |
|
|
11 |
import yaml |
|
|
12 |
|
|
|
13 |
def filter_by_count(df:pd.DataFrame, min=1)->pd.DataFrame:
    """Return a copy of ``df`` without the rows whose ``seq_count`` is below ``min``.

    Args:
        df: frame with a ``seq_count`` column; it is left unmodified.
        min: smallest ``seq_count`` a row may have and still be kept. The name
            shadows the builtin but is part of the public signature.

    Returns:
        A new DataFrame with the surviving rows; their index labels are kept.
    """
    # Select rows with a positional boolean mask. Feeding index *labels* back
    # into ``df.index[...]`` would treat them as *positions*, which only works
    # for the default 0..n-1 index and fails for string or shifted indexes.
    too_few = np.asarray(df["seq_count"].values) < min
    return df.loc[~too_few].copy()
|
|
17 |
|
|
|
18 |
|
|
|
19 |
def filter_by_label(df:pd.DataFrame, word:str)->pd.DataFrame:
    """Return a copy of ``df`` restricted to rows whose ``label`` contains ``word``.

    Args:
        df: frame with a ``label`` column of strings; it is left unmodified.
        word: substring that a label must contain for its row to be kept.

    Returns:
        A new DataFrame with the matching rows; their index labels are kept.
    """
    # Keep rows by position rather than dropping by index label: dropping by
    # label would also discard matching rows that happen to share an index
    # label with a non-matching row.
    keep = np.array([word in label for label in df["label"].values], dtype=bool)
    return df.loc[keep].copy()
|
|
24 |
|
|
|
25 |
|
|
|
26 |
def main():
    """Build an inventory of the Fasta sequences found in a folder.

    Options are read through ``configargparse``. Everything is written into an
    ``inventory`` sub-folder of the input folder, using the ``-o`` value (or
    ``sequences``) as the file-name stem:

    * ``<stem>_inventory.yml`` - the parsed options,
    * ``<stem>.csv`` - the full inventory, one row per item returned by
      ``Dna2VecList.from_folder``,
    * ``<stem>_by_file.csv`` - per-file summary, only when ``-g file``,
    * ``<stem>_by_label.csv`` / ``<stem>_by_label.pkl`` - per-label summary.

    The summaries are skipped when the inventory has no ``label`` column. With
    ``-lsi`` the column is built by splitting the ``-l`` source column on the
    ``-d`` delimiter and joining the selected tokens with a space.
    """
    argp = configargparse.get_argument_parser()
    argp.add_argument('-i', help='input folder with Fasta files', type=str, default='/data/genomes/GenSeq_fastas')
    argp.add_argument('-o', help='output file name', type=str)
    argp.add_argument('-g', choices=['folder', 'file'], help='granularity', type=str, default="file")
    argp.add_argument('-l', choices=['description', 'id', 'file'], help='source of labels', type=str, default="description")
    argp.add_argument('-lsi', help='label selector (comma delimited numbers)', type=str)
    argp.add_argument('-lsr', help='regular expression for labeling', type=str)
    argp.add_argument('-rxkeep', help='keep if regular expression found', type=str)
    argp.add_argument('-rxdrop', help='drop if regular expression found', type=str)
    argp.add_argument('-d', help='label delimiter', type=str, default=" ")
    argp.add_argument('-split', help='split by folders, coma delimited string', type=str, default="train,valid,test")
    argp.add_argument('-portions', help='split by folders, coma delimited string', type=str, default="0.7,0.2,0.1")

    # Plain-dict copy of the parsed namespace; it is dumped to YAML below.
    args = dict(vars(argp.parse_args()))
    source_dir = Path(args["i"]) if args["i"] is not None else Path(".")

    # Optional keep/drop regular-expression filters handed to the loader.
    filters = []
    if args["rxkeep"] is not None:
        filters.append(partial(regex_filter, rx=args["rxkeep"]))
    if args["rxdrop"] is not None:
        filters.append(partial(regex_filter, rx=args["rxdrop"], keep=False))

    all_fastas = Dna2VecList.from_folder(source_dir, filters=filters or None).items

    output = source_dir / "inventory"
    print("Creating Inventory in", str(output))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    output.mkdir(parents=True, exist_ok=True)

    fn = "sequences" if args["o"] is None else args["o"]

    # Record the options used for this run next to the results.
    with open(output / f'{fn}_inventory.yml', 'w') as outfile:
        yaml.dump(args, outfile, default_flow_style=False)

    inventory = pd.DataFrame(data=all_fastas)
    if args['lsi'] is not None:
        lsi = [int(x) for x in args["lsi"].split(",")]
        label_source = inventory.loc[:, args['l']].values
        # NumPy arrays allow picking several token positions at once (t[lsi]).
        tokens = [np.asarray(x.split(args["d"])) for x in label_source]
        inventory["label"] = [" ".join(t[lsi]) for t in tokens]

    inventory.to_csv(output / f"{fn}.csv")

    # Both summaries below group on or aggregate the label column.
    if "label" not in inventory.columns:
        return

    if args["g"] == "file":
        # Reductions are named by string so pandas uses its own group-wise
        # implementations (sample standard deviation for "std").
        files_df = inventory.groupby(["file"]).agg({
            "id": ["count", list],
            "label": set,
            "len": [list, "min", "max", "mean", "std"],
            "description": list,
        })
        files_df.columns = ["seq_count", "id", "label", 'len', 'min', 'max', 'mean', 'std', "description"]
        # One label per file; if a file carries several, an arbitrary member
        # of the set is taken.
        files_df["label"] = [next(iter(labels)) for labels in files_df.label.values]

        files_df.to_csv(output / f"{fn}_by_file.csv")

    label_df = inventory.groupby("label").agg({
        "id": ["count", list],
        "len": [list, "min", "max", "median"],
        "file": list,
    })
    label_df.columns = ["seq_count", 'id', "lengths", "min", "max", "median", "files"]
    # De-duplicate the file list of every label before counting the files.
    label_df["files"] = [list(set(files)) for files in label_df.files.values]
    label_df["file_count"] = [len(files) for files in label_df.files.values]

    label_df.to_csv(output / f"{fn}_by_label.csv")
    label_df.to_pickle(output / f"{fn}_by_label.pkl")
|
|
88 |
|
|
|
89 |
|
|
|
90 |
|
|
|
91 |
# Run the inventory builder only when this file is executed as a script.
if __name__ == "__main__":
    main()