a b/util/inventory.py
1
import sys
2
sys.path.append("..")
3
from faigen.data.sequence import Dna2VecList,regex_filter
4
import pandas as pd
5
import numpy as np
6
import os
7
from functools import partial
8
import configargparse
9
from pathlib import Path
10
from Bio.SeqRecord import SeqRecord
11
import yaml
12
13
def filter_by_count(df:pd.DataFrame, min=1)->pd.DataFrame:
    """Return a copy of ``df`` without the rows whose ``seq_count`` is below ``min``.

    Args:
        df: Frame with a ``seq_count`` column. It is not modified.
        min: Smallest ``seq_count`` value that is kept. The name shadows the
            builtin, but it is part of the public signature and therefore
            left unchanged.

    Returns:
        A new DataFrame containing every row of ``df`` whose ``seq_count``
        does not compare as smaller than ``min``.
    """
    # Select rows with a positional boolean mask. Looking the offending
    # labels up again as positions in the index only works for a default
    # 0..n-1 index and fails (or drops the wrong rows) for string or
    # non-contiguous integer indexes.
    too_few = np.asarray(df.seq_count.values) < min
    return df.loc[~too_few].copy()
17
18
19
def filter_by_label(df: pd.DataFrame, word: str) -> pd.DataFrame:
    """Return a copy of ``df`` keeping only the rows whose ``label`` contains ``word``.

    Args:
        df: Frame with a ``label`` column. It is not modified.
        word: Value tested against each label with the ``in`` operator.

    Returns:
        A new DataFrame from which every row whose label does not contain
        ``word`` has been dropped (by index label).
    """
    # True marks a row to discard: its label does not contain the word.
    lacks_word = [word not in label for label in df.label.values]
    kept = df.copy()
    unwanted = kept.index[lacks_word]
    return kept.drop(unwanted, axis=0)
24
25
26
def main():
    """Build an inventory of the sequences found under the input folder.

    Options are parsed with configargparse. The items returned by
    ``Dna2VecList.from_folder`` (optionally restricted by the ``-rxkeep`` /
    ``-rxdrop`` regular-expression filters) are loaded into a DataFrame and
    written to ``<input>/inventory``:

    * ``<name>_inventory.yml`` -- the parsed arguments;
    * ``<name>.csv`` -- the full inventory;
    * ``<name>_by_file.csv`` -- per-file aggregation (only with ``-g file``);
    * ``<name>_by_label.csv`` / ``<name>_by_label.pkl`` -- per-label aggregation.

    ``<name>`` is the ``-o`` value, or ``sequences`` when ``-o`` is omitted.
    The aggregated files are only produced when the inventory has a
    ``label`` column; this function adds one when ``-lsi`` is given.
    """
    argp = configargparse.get_argument_parser()
    argp.add_argument('-i', help='input folder with Fasta files', type=str, default='/data/genomes/GenSeq_fastas')
    argp.add_argument('-o', help='output file name', type=str)
    argp.add_argument('-g', choices=['folder', 'file'], help='granularity', type=str, default="file")
    argp.add_argument('-l', choices=['description', 'id', 'file'], help='source of labels', type=str, default="description")
    argp.add_argument('-lsi', help='label selector (comma delimited numbers)', type=str)
    argp.add_argument('-lsr', help='regular expression for labeling', type=str)
    argp.add_argument('-rxkeep', help='keep if regular expression found', type=str)
    argp.add_argument('-rxdrop', help='drop if regular expression found', type=str)
    argp.add_argument('-d', help='label delimiter', type=str, default=" ")
    argp.add_argument('-split', help='split by folders, coma delimited string', type=str, default="train,valid,test")
    argp.add_argument('-portions', help='split by folders, coma delimited string', type=str, default="0.7,0.2,0.1")

    # Plain dict copy of the parsed namespace; it is also what gets dumped to YAML.
    args = dict(vars(argp.parse_args()))
    in_dir = Path(args["i"]) if args["i"] is not None else Path(".")

    filters = []
    if args["rxkeep"] is not None:
        filters.append(partial(regex_filter, rx=args["rxkeep"]))
    if args["rxdrop"] is not None:
        filters.append(partial(regex_filter, rx=args["rxdrop"], keep=False))

    # from_folder receives None rather than an empty list when no filter was requested.
    all_fastas = Dna2VecList.from_folder(in_dir, filters=filters if filters else None).items

    output = in_dir / "inventory"
    print("Creating Inventory in", str(output))

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output, exist_ok=True)

    fn = "sequences" if args["o"] is None else args["o"]

    # Record the arguments used for this run next to the results.
    with open(output / f'{fn}_inventory.yml', 'w') as outfile:
        yaml.dump(args, outfile, default_flow_style=False)

    inventory = pd.DataFrame(data=all_fastas)
    if args['lsi'] is not None:
        # Build the label from the selected tokens of the chosen source column:
        # split on the delimiter, pick the positions listed in -lsi, re-join with spaces.
        lsi = [int(x) for x in args["lsi"].split(",")]
        label_source = inventory.loc[:, args['l']].values
        tokens = [np.asarray(x.split(args["d"])) for x in label_source]
        inventory["label"] = [" ".join(t[lsi]) for t in tokens]

    inventory.to_csv(output / f"{fn}.csv")

    # Without labels there is nothing to aggregate.
    if "label" not in inventory.columns.values:
        return

    if args["g"] == "file":
        # String reducer names select pandas' own implementations, which is
        # what pandas has historically substituted for the builtin / NumPy
        # callables; naming them explicitly keeps the results stable and
        # avoids the deprecation warning for passing the callables.
        files_df = inventory.groupby(["file"]).agg({
            "id": ["count", list],
            "label": set,
            "len": [list, "min", "max", "mean", "std"],
            "description": list,
        })
        files_df.columns = ["seq_count", "id", "label", 'len', 'min', 'max', 'mean', 'std', "description"]
        # Collapse each file's label set to a single representative label.
        files_df["label"] = [list(x)[0] for x in files_df.label.values]
        # Written inside the branch: files_df does not exist for other
        # granularities, and writing it unconditionally crashed with -g folder.
        files_df.to_csv(output / f"{fn}_by_file.csv")

    label_df = inventory.groupby("label").agg({
        "id": ["count", list],
        "len": [list, "min", "max", "median"],
        "file": list,
    })
    label_df.columns = ["seq_count", 'id', "lengths", "min", "max", "median", "files"]
    # De-duplicate the files contributing to each label, then count them.
    label_df.files = [list(set(x)) for x in label_df.files.values]
    label_df["file_count"] = [len(x) for x in label_df.files.values]

    label_df.to_csv(output / f"{fn}_by_label.csv")
    label_df.to_pickle(output / f"{fn}_by_label.pkl")
88
89
90
91
# Run the inventory builder only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()