Diff of /util/inventory.py [000000] .. [98867e]

Switch to side-by-side view

--- a
+++ b/util/inventory.py
@@ -0,0 +1,92 @@
+import sys
+sys.path.append("..")
+from faigen.data.sequence import Dna2VecList,regex_filter
+import pandas as pd
+import numpy as np
+import os
+from functools import partial
+import configargparse
+from pathlib import Path
+from Bio.SeqRecord import SeqRecord
+import yaml
+
+def filter_by_count(df:pd.DataFrame, min=1)->pd.DataFrame:
+    res=df.copy()
+    drop = res.index[res.index.values[np.asarray(res.seq_count.values) < min]]
+    return res.drop(drop, axis=0)
+
+
+def filter_by_label(df:pd.DataFrame, word:str)->pd.DataFrame:
+    res,mask=df.copy(),[]
+    for x in df.label.values: mask.append(False if word in x else True)
+    drop = res.index[mask]
+    return res.drop(drop, axis=0)
+
+
+def main():
+    argp = configargparse.get_argument_parser()
+    argp.add_argument('-i', help='input folder with Fasta files', type=str, default='/data/genomes/GenSeq_fastas')
+    argp.add_argument('-o', help='output file name', type=str)
+    argp.add_argument('-g', choices=['folder', 'file'], help='granularity', type=str, default="file")
+    argp.add_argument('-l', choices=['description', 'id', 'file'], help='source of labels', type=str, default="description")
+    argp.add_argument('-lsi', help='label selector (comma delimited numbers)', type=str)
+    argp.add_argument('-lsr', help='regular expression for labeling', type=str)
+    argp.add_argument('-rxkeep', help='keep if regular expression found', type=str)
+    argp.add_argument('-rxdrop', help='drop if regular expression found', type=str)
+    argp.add_argument('-d', help='label delimiter', type=str, default=" ")
+    argp.add_argument('-split', help='split by folders, coma delimited string', type=str, default="train,valid,test")
+    argp.add_argument('-portions', help='split by folders, coma delimited string', type=str, default="0.7,0.2,0.1")
+
+
+    args = {k:v for k,v in vars(argp.parse_args()).items()}
+    input= Path(args["i"]) if args["i"] is not None else Path(".")
+    filters=[]
+    if args["rxkeep"] is not None: filters.append(partial(regex_filter, rx=args["rxkeep"]))
+    if args["rxdrop"] is not None: filters.append(partial(regex_filter, rx=args["rxdrop"], keep=False))
+
+    all_fastas = Dna2VecList.from_folder(input, filters=filters if len(filters) > 0 else None ).items
+
+    output = input / "inventory"
+    print("Creating Inventory in", str(output))
+
+    if not os.path.exists(output):
+        os.makedirs(output)
+
+    fn = "sequences" if args["o"] is None else args["o"]
+
+    with open(output / f'{fn}_inventory.yml', 'w') as outfile:
+        yaml.dump(args, outfile, default_flow_style=False)
+
+
+    inventory = pd.DataFrame(data=all_fastas)
+    if args['lsi'] is not None:
+        lsi = [int(x) for x in args["lsi"].split(",")]
+        label_source = inventory.loc[:,args['l']].values
+        tokens = [np.asarray(x.split(args["d"])) for x in list(label_source)]
+        inventory["label"] = [" ".join(t[lsi]) for t in tokens]
+
+    inventory.to_csv(Path(output / f"{fn}.csv"))
+
+    if "label" not in inventory.columns.values: return
+
+    if args["g"] == "file":
+        files_df = inventory.groupby(["file"]).agg({"id": ['count', list],"label":set,
+                                                    "len": [list, min, max, np.mean, np.std], "description": list})
+        files_df.columns=["seq_count", "id", "label", 'len', 'min' , 'max', 'mean', 'std', "description"]
+        files_df["label"] = [list(x)[0] for x in list(files_df.label.values)]
+
+    files_df.to_csv( output / f"{fn}_by_file.csv")
+
+
+    label_df = inventory.groupby("label").agg({ "id": ["count", list], "len": [list, min, max, np.median],"file": list})
+    label_df.columns = [ "seq_count", 'id', "lengths", "min", "max", "median","files"]
+    label_df.files = [list(set(x)) for x in list(label_df.files.values)]
+    label_df["file_count"] = [len(x) for x in list(label_df.files.values)]
+
+    label_df.to_csv(output / f"{fn}_by_label.csv")
+    label_df.to_pickle( output / f"{fn}_by_label.pkl")
+
+
+
+if __name__ == '__main__':
+    main()