[286bfb]: / src / utils / pandas_helper_funcs.py

Download this file

121 lines (96 with data), 4.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
from os.path import join as j_
from os import listdir as ldir_
from os import scandir as sdir_
import shutil
from tqdm import tqdm
import numpy as np
import pandas
import pandas as pd
from pandas import Series
def series_int(s1: pandas.Series, s2: pandas.Series, dtype='O'):
    """
    Returns set intersection of two pd.Series (resets index).

    Duplicates are dropped. Values come back in the order of their first
    appearance in s1, so the result is reproducible across runs (iterating a
    plain ``set`` of strings depends on the interpreter's hash seed).

    Args:
        s1 (pandas.Series): Series (of strings). Determines the output order.
        s2 (pandas.Series): Series (of strings).
        dtype: dtype of the returned Series. Defaults to 'O' (object).

    Returns:
        (pandas.Series): set intersection of s1 and s2
    """
    other = set(s2)
    # dict.fromkeys de-duplicates s1 while keeping first-seen order.
    common = [value for value in dict.fromkeys(s1) if value in other]
    return pd.Series(common, dtype=dtype)
def df_loc_col(df1: pandas.DataFrame, s1: pandas.Series, col_name: str, apply_sint=True, drop_orig_index=False):
    """
    Selects rows of df1 by label, using column <col_name> as a temporary index.

    Args:
        df1 (pandas.DataFrame): Dataframe to subset.
        s1 (pandas.Series): Series (of strings) holding the labels to select.
        col_name (str): Column of df1 that is turned into the index for pandas.loc.
        apply_sint (bool): If True, select only the labels shared by df1[col_name]
            and s1 (via series_int); if False, s1 is passed to pandas.loc as given.
        drop_orig_index (bool): If True, the original index of df1 is discarded;
            if False, it is kept as a regular column.

    Returns:
        (pandas.DataFrame): Rows of df1 selected by s1, with <col_name> restored
            as a regular column and a fresh index.
    """
    # Re-key the frame on <col_name> so that .loc looks labels up in that column.
    keyed = df1.reset_index(drop=drop_orig_index).set_index(col_name)
    if apply_sint:
        labels = series_int(df1[col_name], s1)
    else:
        labels = s1
    selected = keyed.loc[labels]
    # Move <col_name> back out of the index into an ordinary column.
    return selected.reset_index(drop=False)
def series_ldir_int(path1, path2, exts=('.', '.'), add_ext=False):
    """
    Gets intersection of fnames (accounting for differing exts) in path1 and path2.

    Each directory entry is split once, from the right, on its separator; the
    part before the separator is the fname that is compared across directories.

    Args:
        path1 (str): Path to directory of fnames.
        path2 (str): Path to directory of fnames.
        exts (sequence of str): Separators used to split fnames in path1 and path2
            respectively. Defaults to ('.', '.').
        add_ext (bool, optional): Whether to add back in the extension.
            If True, defaults to using extension of path1 (order matters). Defaults to False.

    Returns:
        (pd.Series): Intersection of fnames (accounting for differing exts) in path1 and path2
    """
    def _split_listing(path, sep):
        # Lists <path> and splits every name into ('fname', 'ext') columns.
        # dtype='O' keeps the .str accessor usable when the directory is empty.
        names = pd.Series(ldir_(path), dtype='O')
        parts = names.str.rsplit(pat=sep, n=1, expand=True)
        # rsplit yields fewer than two columns when the listing is empty or no
        # name contains the separator; reindex pads the missing ones with NaN.
        return parts.reindex(columns=[0, 1]).set_axis(['fname', 'ext'], axis=1)

    df1 = _split_listing(path1, exts[0])
    df2 = _split_listing(path2, exts[1])
    df = df_loc_col(df1, df2['fname'], col_name='fname', apply_sint=True, drop_orig_index=True)
    if not add_ext:
        return df['fname']
    rebuilt = df['fname'] + exts[0] + df['ext']
    # Names that never contained the separator have no ext; return them
    # unchanged rather than as NaN.
    return rebuilt.where(df['ext'].notna(), df['fname'])
def df_sdir(dataroot: str, cols=('fpath', 'fname', 'slide_id')):
    """
    Returns pd.DataFrame of the file paths and fnames of contents in dataroot.

    Args:
        dataroot (str): path to files.
        cols (sequence of str): Names for the three output columns, in order:
            full path, entry name, entry name without its last extension.
            Defaults to ('fpath', 'fname', 'slide_id').
            (TODO: make default cols ['fpath', 'fname_ext', 'fname_noext']?)

    Returns:
        (pandas.Dataframe): One row per entry of dataroot, in the order
            os.scandir yields them.
    """
    # The with-block closes the scandir iterator even if building a row fails.
    with sdir_(dataroot) as entries:
        rows = [(e.path, e.name, os.path.splitext(e.name)[0]) for e in entries]
    return pd.DataFrame(rows, columns=list(cols))
# TODO: also make function for ldir_diff
def series_diff(s1, s2, dtype='O'):
    r"""
    Returns set difference of two pd.Series (values of s1 that are not in s2).

    Duplicates are dropped and the index is reset. Values come back in the
    order of their first appearance in s1, so the result is reproducible
    across runs (iterating a plain ``set`` of strings depends on the
    interpreter's hash seed).

    Args:
        s1 (pandas.Series): Series to keep values from.
        s2 (pandas.Series): Series of values to exclude.
        dtype: dtype of the returned Series. Defaults to 'O' (object).

    Returns:
        (pandas.Series): unique values of s1 that do not occur in s2.
    """
    excluded = set(s2)
    # dict.fromkeys de-duplicates s1 while keeping first-seen order.
    remaining = [value for value in dict.fromkeys(s1) if value not in excluded]
    return pd.Series(remaining, dtype=dtype)
def transfer_dir2dir_shutil(dataroot: str, saveroot: str, subset_fnames=None, lim: int=None):
    r"""
    Transfer (copy) files from dir2dir, skipping files that already exist in the destination.

    Args:
        - dataroot (str): Source folder from which you want to transfer files from
        - saveroot (str): Destination folder from which you want to transfer files to.
            Created if it does not exist yet.
        - subset_fnames (list): List of filenames (relative to dataroot) to transfer.
            A single string is treated as one filename. Defaults to None, in which
            case every entry listed in dataroot is considered.
        - lim (int): If given, only the first ``lim`` candidate filenames are processed.

    Return:
        - None. The number and names of candidates that are not regular files in
            dataroot are printed at the end.
    """
    if subset_fnames is None:
        fnames = os.listdir(dataroot)
    elif isinstance(subset_fnames, str):
        # A bare string would otherwise be iterated character by character.
        fnames = [subset_fnames]
    else:
        fnames = list(subset_fnames)
    if lim is not None:
        fnames = fnames[:lim]

    # shutil.copyfile does not create missing parent directories.
    os.makedirs(saveroot, exist_ok=True)

    pbar = tqdm(fnames)
    missing = []
    for fname in pbar:
        pbar.set_description(f'Copying: {fname}')
        src = j_(dataroot, fname)
        dst = j_(saveroot, fname)
        if not os.path.isfile(src):
            # Requested name is absent from dataroot (or is not a regular file).
            missing.append(fname)
        elif not os.path.isfile(dst):
            # Only copy when the destination does not already hold the file.
            shutil.copyfile(src=src, dst=dst)
    print('Num Missing:', len(missing))
    print("Missing Files:", missing)
# Longer, more descriptive aliases bound to the same function objects as the
# short names; calling either name is equivalent.
series_intersection = series_int
series_difference = series_diff