fastai-genomic / Git / Diff of /util/split_into

Models:
MarcoTheBlack/
fastai-genomic
Downloads: 1
Diff of /util/split_into_folders.py [000000] .. [98867e]
Switch to side-by-side view

--- a
+++ b/util/split_into_folders.py
@@ -0,0 +1,64 @@
+import sys
+sys.path.append("..")
+from faigen.data.sequence import Dna2VecList,regex_filter
+import pandas as pd
+import numpy as np
+import os
+from functools import partial
+import configargparse
+from pathlib import Path
+from Bio.SeqRecord import SeqRecord
+import yaml
+from pathlib import Path
+import os
+from shutil import copy
+from tqdm import tqdm
+
+def filter_by_count(df:pd.DataFrame, min=1)->pd.DataFrame:
+    """Return a re-indexed copy of `df` without the rows whose `seq_count` is below `min`."""
+    # Boolean row mask: correct for any index (index labels were previously reused as positions).
+    too_few = np.asarray(df.seq_count.values) < min
+    return df[~too_few].reset_index(drop=True)
+
+
+def filter_by_label(df:pd.DataFrame, word:str)->pd.DataFrame:
+    """Return a re-indexed copy of `df` keeping only the rows where `word in label` holds."""
+    # Select rows with a boolean mask; dropping by index label would also remove
+    # matching rows that happen to share an index label with a non-matching row.
+    has_word = np.array([word in x for x in df.label.values], dtype=bool)
+    return df[has_word].reset_index(drop=True)
+
+def main():
+    """Parse the command-line options, create the split folders and copy files into them.
+
+    NOTE(review): `short_list` is not defined anywhere in this file, so the copy loop
+    raises NameError as written. Presumably it is the label inventory (`-i`) after
+    filtering, with a `seq_count` and a `files` entry per row; confirm and build it here.
+    The parsed options, `-portions` and the `test` folder are not used by the copy loop yet.
+    """
+    argp = configargparse.get_argument_parser()
+    argp.add_argument('-i', help='input label inventory csv', type=str)
+    argp.add_argument('-o', help='output folder', type=str)
+    argp.add_argument('-lsi', help='label selector (comma delimited numbers)', type=str)
+    argp.add_argument('-lsr', help='regular expression for labeling', type=str)
+    argp.add_argument('-rxkeep', help='keep if regular expression found', type=str)
+    argp.add_argument('-rxdrop', help='drop if regular expression found', type=str)
+    argp.add_argument('-d', help='label delimiter', type=str, default=" ")
+    argp.add_argument('-split', help='split by folders, comma delimited string', type=str, default="train,valid,test")
+    argp.add_argument('-portions', help='portion of files per split folder, comma delimited string', type=str, default="0.7,0.2,0.1")
+
+    args = dict(vars(argp.parse_args()))  # parsed, but not consumed below yet
+
+    out = Path('/home/serge/database/data/genomes/ncbi-genomes-2019-04-07')
+    folders = {name: out / "Bacillus" / name for name in ("train", "valid", "test")}
+    for folder in folders.values():
+        folder.mkdir(parents=True, exist_ok=True)  # no check-then-create race
+
+    for row in tqdm(range(short_list.shape[0])):
+        cnt = short_list.loc[row, "seq_count"]
+        files = short_list.loc[row, "files"]
+        train = int(0.75 * cnt)  # the first 75% of a row's files go to train, the rest to valid
+        # Distinct inner index: the original reused `i` for both loops.
+        for pos in range(cnt):
+            dest = folders["train"] if pos < train else folders["valid"]
+            copy(files[pos], dest)
\ No newline at end of file