--- a +++ b/util/split_into_folders.py @@ -0,0 +1,64 @@ +import sys +sys.path.append("..") +from faigen.data.sequence import Dna2VecList,regex_filter +import pandas as pd +import numpy as np +import os +from functools import partial +import configargparse +from pathlib import Path +from Bio.SeqRecord import SeqRecord +import yaml +from pathlib import Path +import os +from shutil import copy +from tqdm import tqdm + +def filter_by_count(df:pd.DataFrame, min=1)->pd.DataFrame: + res=df.copy() + drop = res.index[res.index.values[np.asarray(res.seq_count.values) < min]] + res.drop(drop, axis=0,inplace=True) + return res.reset_index(drop=True) + + +def filter_by_label(df:pd.DataFrame, word:str)->pd.DataFrame: + res,mask=df.copy(),[] + for x in df.label.values: mask.append(False if word in x else True) + drop = res.index[mask] + res.drop(drop, axis=0,inplace=True) + return res.reset_index(drop=True) + +def main(): + argp = configargparse.get_argument_parser() + argp.add_argument('-i', help='input label inventory csv', type=str) + argp.add_argument('-o', help='output folder', type=str) + argp.add_argument('-lsi', help='label selector (comma delimited numbers)', type=str) + argp.add_argument('-lsr', help='regular expression for labeling', type=str) + argp.add_argument('-rxkeep', help='keep if regular expression found', type=str) + argp.add_argument('-rxdrop', help='drop if regular expression found', type=str) + argp.add_argument('-d', help='label delimiter', type=str, default=" ") + argp.add_argument('-split', help='split by folders, coma delimited string', type=str, default="train,valid,test") + argp.add_argument('-portions', help='split by folders, coma delimited string', type=str, default="0.7,0.2,0.1") + + + args = {k:v for k,v in vars(argp.parse_args()).items()} + + + + out = Path('/home/serge/database/data/genomes/ncbi-genomes-2019-04-07') + folders = { + 'train': out / "Bacillus" / "train", + 'valid': out / "Bacillus" / "valid", + 'test': out / "Bacillus" / "test" + } + for k in folders: + if not os.path.exists(folders[k]): + os.makedirs(folders[k]) + + for i in tqdm(range(short_list.shape[0])): + cnt = short_list.loc[i, "seq_count"] + train = int(0.75 * cnt) + valid = cnt - train + files = short_list.loc[i, "files"] + for i in range(cnt): + copy(files[i], folders["train"]) if i < train else copy(files[i], folders["valid"]) \ No newline at end of file