a b/util/split_into_folders.py
1
import sys
2
sys.path.append("..")
3
from faigen.data.sequence import Dna2VecList,regex_filter
4
import pandas as pd
5
import numpy as np
6
import os
7
from functools import partial
8
import configargparse
9
from pathlib import Path
10
from Bio.SeqRecord import SeqRecord
11
import yaml
12
from pathlib import Path
13
import os
14
from shutil import copy
15
from tqdm import tqdm
16
17
def filter_by_count(df: pd.DataFrame, min=1) -> pd.DataFrame:
    """Return a copy of *df* without the rows whose ``seq_count`` is below *min*.

    Args:
        df: frame with a ``seq_count`` column; it is not modified.
        min: smallest ``seq_count`` a row may have and still be kept.

    Returns:
        A new frame holding the surviving rows in their original order,
        re-indexed from 0.
    """
    # Select rows positionally with a boolean mask instead of translating the
    # mask into index labels: label-based dropping breaks on a non-default
    # index and removes too many rows when labels repeat.
    too_few = np.asarray(df["seq_count"].values) < min
    # Negating "< min" (rather than testing ">= min") keeps rows whose count
    # does not compare as smaller, e.g. NaN.
    return df.loc[~too_few].reset_index(drop=True)
22
23
24
def filter_by_label(df: pd.DataFrame, word: str) -> pd.DataFrame:
    """Return a copy of *df* keeping only the rows whose ``label`` contains *word*.

    Args:
        df: frame with a ``label`` column of strings; it is not modified.
        word: substring that a label must contain for its row to be kept.

    Returns:
        A new frame holding the matching rows in their original order,
        re-indexed from 0.
    """
    # Build a positional keep-mask. Dropping by index label would also remove
    # matching rows that happen to share a label with a non-matching one.
    keep = np.array([word in text for text in df["label"].values], dtype=bool)
    return df.loc[keep].reset_index(drop=True)
30
31
def main():
    """Create the train/valid/test folders and copy sequence files into them.

    Command-line options are parsed, the three output folders are created
    under a fixed base directory, and the first 75% of each row's files are
    copied to ``train`` and the remainder to ``valid``.
    """
    argp = configargparse.get_argument_parser()
    argp.add_argument('-i', help='input label inventory csv', type=str)
    argp.add_argument('-o', help='output folder', type=str)
    argp.add_argument('-lsi', help='label selector (comma delimited numbers)', type=str)
    argp.add_argument('-lsr', help='regular expression for labeling', type=str)
    argp.add_argument('-rxkeep', help='keep if regular expression found', type=str)
    argp.add_argument('-rxdrop', help='drop if regular expression found', type=str)
    argp.add_argument('-d', help='label delimiter', type=str, default=" ")
    argp.add_argument('-split', help='split by folders, coma delimited string', type=str, default="train,valid,test")
    argp.add_argument('-portions', help='split by folders, coma delimited string', type=str, default="0.7,0.2,0.1")

    # NOTE(review): the options are parsed (so bad flags and -h still behave as
    # usual) but none of the values is consumed below; paths and the split
    # ratio are hard-coded.
    args = dict(vars(argp.parse_args()))

    out = Path('/home/serge/database/data/genomes/ncbi-genomes-2019-04-07')
    folders = {
        'train': out / "Bacillus" / "train",
        'valid': out / "Bacillus" / "valid",
        'test': out / "Bacillus" / "test",
    }
    for folder in folders.values():
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(folder, exist_ok=True)

    # NOTE(review): `short_list` is not defined anywhere in the visible source;
    # it must be provided at module level before main() is called, otherwise
    # this line raises NameError. The loop reads its "seq_count" and "files"
    # columns.
    for row in tqdm(range(short_list.shape[0])):
        cnt = short_list.loc[row, "seq_count"]
        train = int(0.75 * cnt)
        files = short_list.loc[row, "files"]
        # The first `train` files go to train/, the rest to valid/; nothing is
        # copied into test/ here.
        for pos in range(cnt):
            target = folders["train"] if pos < train else folders["valid"]
            copy(files[pos], target)