# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/A_timeseries_utils.ipynb (unless otherwise specified).

__all__ = ['butter_filter', 'butter_filter_frequency_response', 'apply_butter_filter', 'save_dataset', 'load_dataset',
           'dataset_add_chunk_col', 'dataset_add_length_col', 'dataset_add_labels_col', 'dataset_add_mean_col',
           'dataset_add_median_col', 'dataset_add_std_col', 'dataset_add_iqr_col', 'dataset_get_stats',
           'npys_to_memmap_batched', 'npys_to_memmap', 'reformat_as_memmap', 'TimeseriesDatasetCrops', 'RandomCrop',
           'CenterCrop', 'GaussianNoise', 'Rescale', 'ToTensor', 'Normalize', 'NormalizeBatch', 'ButterFilter',
           'ChannelFilter', 'Transform', 'TupleTransform', 'aggregate_predictions']

# Cell
import numpy as np
import pandas as pd
import torch
import torch.utils.data
from torch import nn
from pathlib import Path
from scipy.stats import iqr

try:
    import pickle5 as pickle
except ImportError:
    import pickle

#Note: due to issues with the numpy rng for multiprocessing (https://github.com/pytorch/pytorch/issues/5059) that could be fixed by a custom worker_init_fn, we use random throughout for convenience
import random
#Note: multiprocessing issues with python lists and dicts (https://github.com/pytorch/pytorch/issues/13246) and pandas dfs (https://github.com/pytorch/pytorch/issues/5902)
import multiprocessing as mp
from skimage import transform
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from scipy.signal import butter, sosfilt, sosfiltfilt, sosfreqz
from tqdm.auto import tqdm
# Cell
#https://stackoverflow.com/questions/12093594/how-to-implement-band-pass-butterworth-filter-with-scipy-signal-butter
def butter_filter(lowcut=10, highcut=20, fs=50, order=5, btype='band'):
    '''returns butterworth filter with given specifications'''
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    sos = butter(order, [low, high] if btype=="band" else (low if btype=="low" else high), analog=False, btype=btype, output='sos')
    return sos

def butter_filter_frequency_response(filter):
    '''returns frequency response of a given filter (result of call of butter_filter)'''
    w, h = sosfreqz(filter)
    #gain vs. freq(Hz)
    #plt.plot((fs * 0.5 / np.pi) * w, abs(h))
    return w, h

def apply_butter_filter(data, filter, forwardbackward=True):
    '''pass filter from call of butter_filter to data (assuming time axis at dimension 0)'''
    if(forwardbackward):
        return sosfiltfilt(filter, data, axis=0)
    else:
        return sosfilt(filter, data, axis=0)
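
# Usage sketch (illustrative, not part of the original module): band-pass
# filtering a toy two-channel signal; the 0.5-40 Hz cutoffs and the 100 Hz
# sampling rate are assumptions chosen for demonstration only.
def _example_butter_filter():
    sig = np.random.randn(1000, 2)  # [timesteps, channels]
    sos = butter_filter(lowcut=0.5, highcut=40, fs=100, order=5, btype='band')
    return apply_butter_filter(sig, sos)  # zero-phase forward-backward filtering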
# Cell
def save_dataset(df, lbl_itos, mean, std, target_root, filename_postfix="", protocol=4):
    target_root = Path(target_root)
    df.to_pickle(target_root/("df"+filename_postfix+".pkl"), protocol=protocol)

    if(isinstance(lbl_itos, dict)):#dict as pickle
        outfile = open(target_root/("lbl_itos"+filename_postfix+".pkl"), "wb")
        pickle.dump(lbl_itos, outfile, protocol=protocol)
        outfile.close()
    else:#array
        np.save(target_root/("lbl_itos"+filename_postfix+".npy"), lbl_itos)

    np.save(target_root/("mean"+filename_postfix+".npy"), mean)
    np.save(target_root/("std"+filename_postfix+".npy"), std)

def load_dataset(target_root, filename_postfix="", df_mapped=True):
    target_root = Path(target_root)

    # if(df_mapped):
    #     df = pd.read_pickle(target_root/("df_memmap"+filename_postfix+".pkl"))
    # else:
    #     df = pd.read_pickle(target_root/("df"+filename_postfix+".pkl"))
    ### due to pickle 5 protocol error
    if(df_mapped):
        df = pickle.load(open(target_root/("df_memmap"+filename_postfix+".pkl"), "rb"))
    else:
        df = pickle.load(open(target_root/("df"+filename_postfix+".pkl"), "rb"))

    if((target_root/("lbl_itos"+filename_postfix+".pkl")).exists()):#dict as pickle
        infile = open(target_root/("lbl_itos"+filename_postfix+".pkl"), "rb")
        lbl_itos = pickle.load(infile)
        infile.close()
    else:#array
        lbl_itos = np.load(target_root/("lbl_itos"+filename_postfix+".npy"))

    mean = np.load(target_root/("mean"+filename_postfix+".npy"))
    std = np.load(target_root/("std"+filename_postfix+".npy"))
    return df, lbl_itos, mean, std
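
# Usage sketch (illustrative, not part of the original module): a save/load
# roundtrip; the dataframe content and the "./data" target folder are placeholders.
def _example_save_load_roundtrip(target_root=Path("./data")):
    target_root.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame({"data": ["rec0.npy", "rec1.npy"], "label": [0, 1]})
    lbl_itos = ["normal", "abnormal"]  # index-to-string mapping for labels
    mean, std = np.zeros(2, dtype=np.float32), np.ones(2, dtype=np.float32)
    save_dataset(df, lbl_itos, mean, std, target_root)
    return load_dataset(target_root, df_mapped=False)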
# Cell
def dataset_add_chunk_col(df, col="data"):
    '''add a chunk column to the dataset df'''
    df["chunk"] = df.groupby(col).cumcount()

def dataset_add_length_col(df, col="data", data_folder=None):
    '''add a length column to the dataset df'''
    df[col+"_length"] = df[col].apply(lambda x: len(np.load(x if data_folder is None else data_folder/x, allow_pickle=True)))

def dataset_add_labels_col(df, col="label", data_folder=None):
    '''add a column with unique labels in column col'''
    df[col+"_labels"] = df[col].apply(lambda x: list(np.unique(np.load(x if data_folder is None else data_folder/x, allow_pickle=True))))

def dataset_add_mean_col(df, col="data", axis=(0), data_folder=None):
    '''adds a column with mean'''
    df[col+"_mean"] = df[col].apply(lambda x: np.mean(np.load(x if data_folder is None else data_folder/x, allow_pickle=True), axis=axis))

def dataset_add_median_col(df, col="data", axis=(0), data_folder=None):
    '''adds a column with median'''
    df[col+"_median"] = df[col].apply(lambda x: np.median(np.load(x if data_folder is None else data_folder/x, allow_pickle=True), axis=axis))

def dataset_add_std_col(df, col="data", axis=(0), data_folder=None):
    '''adds a column with standard deviation'''
    df[col+"_std"] = df[col].apply(lambda x: np.std(np.load(x if data_folder is None else data_folder/x, allow_pickle=True), axis=axis))

def dataset_add_iqr_col(df, col="data", axis=(0), data_folder=None):
    '''adds a column with interquartile range'''
    df[col+"_iqr"] = df[col].apply(lambda x: iqr(np.load(x if data_folder is None else data_folder/x, allow_pickle=True), axis=axis))

def dataset_get_stats(df, col="data", simple=True):
    '''creates (weighted) means and stds from mean, std and length cols of the df'''
    if(simple):
        return df[col+"_mean"].mean(), df[col+"_std"].mean()
    else:
        #https://notmatthancock.github.io/2017/03/23/simple-batch-stat-updates.html
        #or https://gist.github.com/thomasbrandon/ad5b1218fc573c10ea4e1f0c63658469
        def combine_two_means_vars(x1, x2):
            (mean1, var1, n1) = x1
            (mean2, var2, n2) = x2
            mean = mean1*n1/(n1+n2) + mean2*n2/(n1+n2)
            var = var1*n1/(n1+n2) + var2*n2/(n1+n2) + n1*n2/(n1+n2)/(n1+n2)*np.power(mean1-mean2, 2)
            return (mean, var, (n1+n2))

        def combine_all_means_vars(means, vars, lengths):
            inputs = list(zip(means, vars, lengths))
            result = inputs[0]
            for inputs2 in inputs[1:]:
                result = combine_two_means_vars(result, inputs2)
            return result

        means = list(df[col+"_mean"])
        vars = np.power(list(df[col+"_std"]), 2)
        lengths = list(df[col+"_length"])
        mean, var, length = combine_all_means_vars(means, vars, lengths)
        return mean, np.sqrt(var)
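
# Worked check (sketch, not part of the original module): the pooled mean/variance
# combination used above should agree with statistics computed directly on the
# concatenated data; the two toy samples are arbitrary.
def _example_pooled_stats():
    a, b = np.random.randn(100), np.random.randn(50) + 1.0
    n1, n2 = len(a), len(b)
    mean = a.mean()*n1/(n1+n2) + b.mean()*n2/(n1+n2)
    var = a.var()*n1/(n1+n2) + b.var()*n2/(n1+n2) + n1*n2/(n1+n2)**2*(a.mean()-b.mean())**2
    ab = np.concatenate([a, b])
    assert np.allclose([mean, var], [ab.mean(), ab.var()])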
# Cell
def npys_to_memmap_batched(npys, target_filename, max_len=0, delete_npys=True, batch_length=900000):
    memmap = None
    start = np.array([0])#start_idx in current memmap file (always already the next start- delete last token in the end)
    length = []#length of segment
    filenames = []#memmap files
    file_idx = []#corresponding memmap file for sample
    shape = []#shapes of all memmap files

    data = []
    data_lengths = []
    dtype = None

    for idx, npy in tqdm(list(enumerate(npys))):
        data.append(np.load(npy, allow_pickle=True))
        data_lengths.append(len(data[-1]))

        if(idx == len(npys)-1 or np.sum(data_lengths) > batch_length):#flush
            data = np.concatenate(data)
            if(memmap is None or (max_len > 0 and start[-1] > max_len)):#new memmap file has to be created
                if(max_len > 0):
                    filenames.append(target_filename.parent/(target_filename.stem+"_"+str(len(filenames))+".npy"))
                else:
                    filenames.append(target_filename)

                shape.append([np.sum(data_lengths)]+[l for l in data.shape[1:]])#insert present shape

                if(memmap is not None):#an existing memmap exceeded max_len
                    del memmap

                #create new memmap
                start[-1] = 0
                start = np.concatenate([start, np.cumsum(data_lengths)])
                length = np.concatenate([length, data_lengths])
                memmap = np.memmap(filenames[-1], dtype=data.dtype, mode='w+', shape=data.shape)
            else:
                #append to existing memmap
                start = np.concatenate([start, start[-1]+np.cumsum(data_lengths)])
                length = np.concatenate([length, data_lengths])
                shape[-1] = [start[-1]]+[l for l in data.shape[1:]]
                memmap = np.memmap(filenames[-1], dtype=data.dtype, mode='r+', shape=tuple(shape[-1]))

            #store mapping memmap_id to memmap_file_id
            file_idx = np.concatenate([file_idx, [(len(filenames)-1)]*len(data_lengths)])
            #insert the actual data
            memmap[start[-len(data_lengths)-1]:start[-len(data_lengths)-1]+len(data)] = data[:]
            memmap.flush()
            dtype = data.dtype
            data = []#reset data storage
            data_lengths = []

    start = start[:-1]#remove the last element (always holds the next start idx)
    #cleanup
    for npy in npys:
        if(delete_npys is True):
            npy.unlink()
    del memmap

    #convert everything to relative paths
    filenames = [f.name for f in filenames]
    #save metadata
    np.savez(target_filename.parent/(target_filename.stem+"_meta.npz"), start=start, length=length, shape=shape, file_idx=file_idx, dtype=dtype, filenames=filenames)
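
# Sketch (not in the original module): the metadata file written above can be
# loaded back to locate an individual sample inside the concatenated memmap.
# "example.npy" and sample index 0 are placeholder assumptions; the dtype/shape
# handling mirrors what TimeseriesDatasetCrops does further below.
def _example_read_memmap_meta(target_filename=Path("example.npy"), sample_idx=0):
    meta = np.load(target_filename.parent/(target_filename.stem+"_meta.npz"), allow_pickle=True)
    start, length = int(meta["start"][sample_idx]), int(meta["length"][sample_idx])
    fileid = int(meta["file_idx"][sample_idx])
    mem = np.memmap(target_filename.parent/str(meta["filenames"][fileid]),
                    dtype=np.dtype(str(meta["dtype"])), mode='r',
                    shape=tuple(meta["shape"][fileid]))
    return np.copy(mem[start:start+length])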
def npys_to_memmap(npys, target_filename, max_len=0, delete_npys=True):
    memmap = None
    start = []#start_idx in current memmap file
    length = []#length of segment
    filenames = []#memmap files
    file_idx = []#corresponding memmap file for sample
    shape = []

    for idx, npy in tqdm(list(enumerate(npys))):
        data = np.load(npy, allow_pickle=True)
        if(memmap is None or (max_len > 0 and start[-1]+length[-1] > max_len)):
            if(max_len > 0):
                filenames.append(target_filename.parent/(target_filename.stem+"_"+str(len(filenames))+".npy"))
            else:
                filenames.append(target_filename)

            if(memmap is not None):#an existing memmap exceeded max_len
                shape.append([start[-1]+length[-1]]+[l for l in data.shape[1:]])
                del memmap

            #create new memmap
            start.append(0)
            length.append(data.shape[0])
            memmap = np.memmap(filenames[-1], dtype=data.dtype, mode='w+', shape=data.shape)
        else:
            #append to existing memmap
            start.append(start[-1]+length[-1])
            length.append(data.shape[0])
            memmap = np.memmap(filenames[-1], dtype=data.dtype, mode='r+', shape=tuple([start[-1]+length[-1]]+[l for l in data.shape[1:]]))

        #store mapping memmap_id to memmap_file_id
        file_idx.append(len(filenames)-1)
        #insert the actual data
        memmap[start[-1]:start[-1]+length[-1]] = data[:]
        memmap.flush()
        if(delete_npys is True):
            npy.unlink()
    del memmap

    #append final shape if necessary
    if(len(shape) < len(filenames)):
        shape.append([start[-1]+length[-1]]+[l for l in data.shape[1:]])

    #convert everything to relative paths
    filenames = [f.name for f in filenames]
    #save metadata
    np.savez(target_filename.parent/(target_filename.stem+"_meta.npz"), start=start, length=length, shape=shape, file_idx=file_idx, dtype=data.dtype, filenames=filenames)
def reformat_as_memmap(df, target_filename, data_folder=None, annotation=False, max_len=0, delete_npys=True, col_data="data", col_label="label", batch_length=0):
    npys_data = []
    npys_label = []

    for id, row in df.iterrows():
        npys_data.append(data_folder/row[col_data] if data_folder is not None else row[col_data])
        if(annotation):
            npys_label.append(data_folder/row[col_label] if data_folder is not None else row[col_label])

    if(batch_length == 0):
        npys_to_memmap(npys_data, target_filename, max_len=max_len, delete_npys=delete_npys)
    else:
        npys_to_memmap_batched(npys_data, target_filename, max_len=max_len, delete_npys=delete_npys, batch_length=batch_length)
    if(annotation):
        if(batch_length == 0):
            npys_to_memmap(npys_label, target_filename.parent/(target_filename.stem+"_label.npy"), max_len=max_len, delete_npys=delete_npys)
        else:
            npys_to_memmap_batched(npys_label, target_filename.parent/(target_filename.stem+"_label.npy"), max_len=max_len, delete_npys=delete_npys, batch_length=batch_length)

    #replace data(filename) by integer
    df_mapped = df.copy()
    df_mapped["data_original"] = df_mapped.data
    df_mapped["data"] = np.arange(len(df_mapped))

    df_mapped.to_pickle(target_filename.parent/("df_"+target_filename.stem+".pkl"))
    return df_mapped
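
# Usage sketch (illustrative, not part of the original module): df is assumed to
# carry a "data" column with .npy filenames relative to data_folder; the paths
# below are placeholders.
def _example_reformat_as_memmap(df, data_folder=Path("./raw"), target=Path("./processed/memmap.npy")):
    # consolidate per-record npy files into one memmap plus a _meta.npz sidecar;
    # delete_npys=False keeps the original files around
    df_mapped = reformat_as_memmap(df, target, data_folder=data_folder, delete_npys=False)
    # the "data" column now holds integer indices into the memmap instead of filenames
    return df_mapped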
# Cell
class TimeseriesDatasetCrops(torch.utils.data.Dataset):
    """timeseries dataset with partial crops."""

    def __init__(self, df, output_size, chunk_length, min_chunk_length, memmap_filename=None, npy_data=None, random_crop=True, data_folder=None, num_classes=2, copies=0, col_lbl="label", stride=None, start_idx=0, annotation=False, transforms=None, sample_items_per_record=1):
        """
        accepts three kinds of input:
        1) filenames pointing to aligned numpy arrays [timesteps,channels,...] for data and either integer labels or filenames pointing to numpy arrays [timesteps,...] e.g. for annotations
        2) memmap_filename to memmap file (same argument that was passed to reformat_as_memmap) for data [concatenated,...] and labels- label column in df corresponds to index in this memmap
        3) npy_data [samples,ts,...] (either path or np.array directly- also supporting variable length input) - label column in df corresponds to sampleid

        transforms: list of callables (transformations) or (preferred) single instance e.g. from torchvision.transforms.Compose (applied in the specified order i.e. leftmost element first)

        col_lbl = None: return dummy label 0 (e.g. for unsupervised pretraining)
        """
        assert not((memmap_filename is not None) and (npy_data is not None))
        # require integer entries if using memmap or npy
        assert (memmap_filename is None and npy_data is None) or df.data.dtype == np.int64

        self.timeseries_df_data = np.array(df["data"])
        if(self.timeseries_df_data.dtype not in [np.int16, np.int32, np.int64]):
            assert(memmap_filename is None and npy_data is None)#only for filenames in mode files
            self.timeseries_df_data = np.array(df["data"].astype(str)).astype(np.string_)

        if(col_lbl is None):# use dummy labels
            self.timeseries_df_label = np.zeros(len(df))
        else: # use actual labels
            if(isinstance(df[col_lbl].iloc[0], list) or isinstance(df[col_lbl].iloc[0], np.ndarray)):#stack arrays/lists for proper batching
                self.timeseries_df_label = np.stack(df[col_lbl])
            else: # single integers/floats
                self.timeseries_df_label = np.array(df[col_lbl])

            if(self.timeseries_df_label.dtype not in [np.int16, np.int32, np.int64, np.float32, np.float64]): #everything else cannot be batched anyway mp.Manager().list(self.timeseries_df_label)
                assert(annotation and memmap_filename is None and npy_data is None)#only for filenames in mode files
                self.timeseries_df_label = np.array(df[col_lbl].apply(lambda x: str(x))).astype(np.string_)

        self.output_size = output_size
        self.data_folder = data_folder
        self.transforms = transforms
        if(isinstance(self.transforms, list) or isinstance(self.transforms, np.ndarray)):
            print("Warning: the use of lists as arguments for transforms is discouraged")
        self.annotation = annotation
        self.col_lbl = col_lbl

        self.c = num_classes

        self.mode = "files"

        if(memmap_filename is not None):
            self.memmap_meta_filename = memmap_filename.parent/(memmap_filename.stem+"_meta.npz")
            self.mode = "memmap"
            memmap_meta = np.load(self.memmap_meta_filename, allow_pickle=True)
            self.memmap_start = memmap_meta["start"]
            self.memmap_shape = memmap_meta["shape"]
            self.memmap_length = memmap_meta["length"]
            self.memmap_file_idx = memmap_meta["file_idx"]
            self.memmap_dtype = np.dtype(str(memmap_meta["dtype"]))
            self.memmap_filenames = np.array(memmap_meta["filenames"]).astype(np.string_)#save as byte to avoid issue with mp
            if(annotation):
                memmap_meta_label = np.load(self.memmap_meta_filename.parent/("_".join(self.memmap_meta_filename.stem.split("_")[:-1])+"_label_meta.npz"), allow_pickle=True)
                self.memmap_shape_label = memmap_meta_label["shape"]
                self.memmap_filenames_label = np.array(memmap_meta_label["filenames"]).astype(np.string_)
                self.memmap_dtype_label = np.dtype(str(memmap_meta_label["dtype"]))
        elif(npy_data is not None):
            self.mode = "npy"
            if(isinstance(npy_data, np.ndarray) or isinstance(npy_data, list)):
                self.npy_data = np.array(npy_data)
                assert(annotation is False)
            else:
                self.npy_data = np.load(npy_data, allow_pickle=True)
            if(annotation):
                self.npy_data_label = np.load(npy_data.parent/(npy_data.stem+"_label.npy"), allow_pickle=True)

        self.random_crop = random_crop
        self.sample_items_per_record = sample_items_per_record

        self.df_idx_mapping = []
        self.start_idx_mapping = []
        self.end_idx_mapping = []

        for df_idx, (id, row) in enumerate(df.iterrows()):
            if(self.mode == "files"):
                data_length = row["data_length"]
            elif(self.mode == "memmap"):
                data_length = self.memmap_length[row["data"]]
            else: #npy
                data_length = len(self.npy_data[row["data"]])

            if(chunk_length == 0):#do not split
                idx_start = [start_idx]
                idx_end = [data_length]
            else:
                idx_start = list(range(start_idx, data_length, chunk_length if stride is None else stride))
                idx_end = [min(l+chunk_length, data_length) for l in idx_start]

            #remove final chunk(s) if too short
            for i in range(len(idx_start)):
                if(idx_end[i]-idx_start[i] < min_chunk_length):
                    del idx_start[i:]
                    del idx_end[i:]
                    break

            #append to lists
            for _ in range(copies+1):
                for i_s, i_e in zip(idx_start, idx_end):
                    self.df_idx_mapping.append(df_idx)
                    self.start_idx_mapping.append(i_s)
                    self.end_idx_mapping.append(i_e)

        #convert to np.array to avoid mp issues with python lists
        self.df_idx_mapping = np.array(self.df_idx_mapping)
        self.start_idx_mapping = np.array(self.start_idx_mapping)
        self.end_idx_mapping = np.array(self.end_idx_mapping)

    def __len__(self):
        return len(self.df_idx_mapping)

    @property
    def is_empty(self):
        return len(self.df_idx_mapping) == 0

    def __getitem__(self, idx):
        lst = []
        for _ in range(self.sample_items_per_record):
            #determine crop idxs
            timesteps = self.get_sample_length(idx)

            if(self.random_crop):#random crop
                if(timesteps == self.output_size):
                    start_idx_rel = 0
                else:
                    start_idx_rel = random.randint(0, timesteps - self.output_size - 1)#np.random.randint(0, timesteps - self.output_size)
            else:
                start_idx_rel = (timesteps - self.output_size)//2
            if(self.sample_items_per_record == 1):
                return self._getitem(idx, start_idx_rel)
            else:
                lst.append(self._getitem(idx, start_idx_rel))
        return tuple(lst)

    def _getitem(self, idx, start_idx_rel):
        #low-level function that actually fetches the data
        df_idx = self.df_idx_mapping[idx]
        start_idx = self.start_idx_mapping[idx]
        end_idx = self.end_idx_mapping[idx]
        #determine crop idxs
        timesteps = end_idx - start_idx
        assert(timesteps >= self.output_size)
        start_idx_crop = start_idx + start_idx_rel
        end_idx_crop = start_idx_crop + self.output_size

        #print(idx, start_idx, end_idx, start_idx_crop, end_idx_crop)
        #load the actual data
        if(self.mode == "files"):#from separate files
            data_filename = str(self.timeseries_df_data[df_idx], encoding='utf-8') #todo: fix potential issues here
            if self.data_folder is not None:
                data_filename = self.data_folder/data_filename
            data = np.load(data_filename, allow_pickle=True)[start_idx_crop:end_idx_crop] #data type has to be adjusted when saving to npy

            ID = data_filename.stem

            if(self.annotation is True):
                label_filename = str(self.timeseries_df_label[df_idx], encoding='utf-8')
                if self.data_folder is not None:
                    label_filename = self.data_folder/label_filename
                label = np.load(label_filename, allow_pickle=True)[start_idx_crop:end_idx_crop] #data type has to be adjusted when saving to npy
            else:
                label = self.timeseries_df_label[df_idx] #input type has to be adjusted in the dataframe

        elif(self.mode == "memmap"): #from one memmap file
            memmap_idx = self.timeseries_df_data[df_idx] #grab the actual index (Note: the df used to create the ds might be a subset of the original df used to create the memmap)
            memmap_file_idx = self.memmap_file_idx[memmap_idx]
            idx_offset = self.memmap_start[memmap_idx]

            #wi = torch.utils.data.get_worker_info()
            #pid = 0 if wi is None else wi.id#os.getpid()
            #print("idx", idx, "idx_offset", idx_offset, "start_idx_crop", start_idx_crop, "df_idx", self.df_idx_mapping[idx], "pid", pid)
            mem_filename = str(self.memmap_filenames[memmap_file_idx], encoding='utf-8')
            mem_file = np.memmap(self.memmap_meta_filename.parent/mem_filename, self.memmap_dtype, mode='r', shape=tuple(self.memmap_shape[memmap_file_idx]))
            data = np.copy(mem_file[idx_offset + start_idx_crop: idx_offset + end_idx_crop])
            del mem_file
            #print(mem_file[idx_offset + start_idx_crop: idx_offset + end_idx_crop])
            if(self.annotation):
                mem_filename_label = str(self.memmap_filenames_label[memmap_file_idx], encoding='utf-8')
                mem_file_label = np.memmap(self.memmap_meta_filename.parent/mem_filename_label, self.memmap_dtype_label, mode='r', shape=tuple(self.memmap_shape_label[memmap_file_idx]))
                label = np.copy(mem_file_label[idx_offset + start_idx_crop: idx_offset + end_idx_crop])
                del mem_file_label
            else:
                label = self.timeseries_df_label[df_idx]
        else:#single npy array
            ID = self.timeseries_df_data[df_idx]

            data = self.npy_data[ID][start_idx_crop:end_idx_crop]

            if(self.annotation):
                label = self.npy_data_label[ID][start_idx_crop:end_idx_crop]
            else:
                label = self.timeseries_df_label[df_idx]

        sample = (data, label)

        if(isinstance(self.transforms, list)):#transforms passed as list
            for t in self.transforms:
                sample = t(sample)
        elif(self.transforms is not None):#single transform e.g. from torchvision.transforms.Compose
            sample = self.transforms(sample)

        return sample

    def get_sampling_weights(self, class_weight_dict, length_weighting=False, timeseries_df_group_by_col=None):
        '''
        class_weight_dict: dictionary of class weights
        length_weighting: weigh samples by length
        timeseries_df_group_by_col: column of the pandas df used to create the object'''
        assert(self.annotation is False)
        assert(length_weighting is False or timeseries_df_group_by_col is None)
        weights = np.zeros(len(self.df_idx_mapping), dtype=np.float32)
        length_per_class = {}
        length_per_group = {}
        for iw, (i, s, e) in enumerate(zip(self.df_idx_mapping, self.start_idx_mapping, self.end_idx_mapping)):
            label = self.timeseries_df_label[i]
            weight = class_weight_dict[label]
            if(length_weighting):
                if label in length_per_class.keys():
                    length_per_class[label] += e-s
                else:
                    length_per_class[label] = e-s
            if(timeseries_df_group_by_col is not None):
                group = timeseries_df_group_by_col[i]
                if group in length_per_group.keys():
                    length_per_group[group] += e-s
                else:
                    length_per_group[group] = e-s
            weights[iw] = weight

        if(length_weighting):#need a second pass to properly take into account the total length per class
            for iw, (i, s, e) in enumerate(zip(self.df_idx_mapping, self.start_idx_mapping, self.end_idx_mapping)):
                label = self.timeseries_df_label[i]
                weights[iw] = (e-s)/length_per_class[label]*weights[iw]
        if(timeseries_df_group_by_col is not None):
            for iw, (i, s, e) in enumerate(zip(self.df_idx_mapping, self.start_idx_mapping, self.end_idx_mapping)):
                group = timeseries_df_group_by_col[i]
                weights[iw] = (e-s)/length_per_group[group]*weights[iw]

        weights = weights/np.min(weights)#normalize smallest weight to 1
        return weights

    def get_id_mapping(self):
        return self.df_idx_mapping

    def get_sample_id(self, idx):
        return self.df_idx_mapping[idx]

    def get_sample_length(self, idx):
        return self.end_idx_mapping[idx] - self.start_idx_mapping[idx]

    def get_sample_start(self, idx):
        return self.start_idx_mapping[idx]
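
# Usage sketch (illustrative, not part of the original module): building a crop
# dataset on top of the memmap produced by reformat_as_memmap; all sizes and
# paths are placeholder assumptions.
def _example_crops_dataset(df_mapped, memmap_filename=Path("./processed/memmap.npy")):
    ds = TimeseriesDatasetCrops(df_mapped, output_size=250, chunk_length=500,
                                min_chunk_length=250, memmap_filename=memmap_filename,
                                random_crop=True, transforms=ToTensor())
    dl = torch.utils.data.DataLoader(ds, batch_size=32, shuffle=True)
    return next(iter(dl))  # (data, label) batch; data shaped [batch, channels, timesteps]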
# Cell
class RandomCrop(object):
    """Crop the timeseries in a sample at a random position.
    """
    def __init__(self, output_size, annotation=False):
        self.output_size = output_size
        self.annotation = annotation

    def __call__(self, sample):
        data, label = sample

        timesteps = len(data)
        assert(timesteps >= self.output_size)
        if(timesteps == self.output_size):
            start = 0
        else:
            start = random.randint(0, timesteps - self.output_size - 1) #np.random.randint(0, timesteps - self.output_size)

        data = data[start: start + self.output_size]
        if(self.annotation):
            label = label[start: start + self.output_size]

        return data, label
# Cell
class CenterCrop(object):
    """Center crop the timeseries in a sample.
    """
    def __init__(self, output_size, annotation=False):
        self.output_size = output_size
        self.annotation = annotation

    def __call__(self, sample):
        data, label = sample

        timesteps = len(data)
        start = (timesteps - self.output_size)//2

        data = data[start: start + self.output_size]
        if(self.annotation):
            label = label[start: start + self.output_size]

        return data, label
# Cell
class GaussianNoise(object):
    """Add gaussian noise to sample.
    """
    def __init__(self, scale=0.1):
        self.scale = scale

    def __call__(self, sample):
        if self.scale == 0:
            return sample
        else:
            data, label = sample
            data = data + np.reshape(np.array([random.gauss(0, self.scale) for _ in range(np.prod(data.shape))]), data.shape)#np.random.normal(scale=self.scale,size=data.shape).astype(np.float32)
            return data, label
# Cell
class Rescale(object):
    """Rescale by factor.
    """
    def __init__(self, scale=0.5, interpolation_order=3):
        self.scale = scale
        self.interpolation_order = interpolation_order

    def __call__(self, sample):
        if self.scale == 1:
            return sample
        else:
            data, label = sample
            timesteps_new = int(self.scale * len(data))
            data = transform.resize(data, (timesteps_new, data.shape[1]), order=self.interpolation_order).astype(np.float32)
            return data, label
# Cell
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""
    def __init__(self, transpose_data=True, transpose_label=False):
        #swap channel and time axis for direct application of pytorch's convs
        self.transpose_data = transpose_data
        self.transpose_label = transpose_label

    def __call__(self, sample):

        def _to_tensor(data, transpose=False):
            if(isinstance(data, np.ndarray)):
                if(transpose):#seq,[x,y,]ch
                    return torch.from_numpy(np.moveaxis(data, -1, 0))
                else:
                    return torch.from_numpy(data)
            else:#default_collate will take care of it
                return data

        data, label = sample

        if not isinstance(data, tuple):
            data = _to_tensor(data, self.transpose_data)
        else:
            data = tuple(_to_tensor(x, self.transpose_data) for x in data)

        if not isinstance(label, tuple):
            label = _to_tensor(label, self.transpose_label)
        else:
            label = tuple(_to_tensor(x, self.transpose_label) for x in label)

        return data, label #returning as a tuple (potentially of lists)
# Cell
class Normalize(object):
    """Normalize using given stats.
    """
    def __init__(self, stats_mean, stats_std, input=True, channels=[]):
        self.stats_mean = stats_mean.astype(np.float32) if stats_mean is not None else None
        self.stats_std = stats_std.astype(np.float32)+1e-8 if stats_std is not None else None
        self.input = input
        if(len(channels) > 0):
            for i in range(len(stats_mean)):
                if(not(i in channels)):
                    self.stats_mean[:, i] = 0
                    self.stats_std[:, i] = 1

    def __call__(self, sample):
        datax, labelx = sample
        data = datax if self.input else labelx
        #assuming channel last
        if(self.stats_mean is not None):
            data = data - self.stats_mean
        if(self.stats_std is not None):
            data = data/self.stats_std

        if(self.input):
            return (data, labelx)
        else:
            return (datax, data)
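
# Usage sketch (illustrative, not part of the original module): composing the
# transforms defined in this module via torchvision; the crop size, noise scale,
# and the stats arrays passed in are placeholder assumptions.
def _example_transform_pipeline(stats_mean, stats_std):
    import torchvision.transforms
    return torchvision.transforms.Compose([
        RandomCrop(250),                   # crop 250 timesteps at a random offset
        GaussianNoise(scale=0.01),         # light additive noise augmentation
        Normalize(stats_mean, stats_std),  # channel-wise normalization (channel last)
        ToTensor()])                       # to torch tensor, channels first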
# Cell
class NormalizeBatch(object):
    """Normalize using batch statistics.
    axis: tuple of integers of axis numbers to be normalized over (by default everything but the last)
    """
    def __init__(self, input=True, channels=[], axis=None):
        self.channels = channels
        self.channels_keep = None
        self.input = input
        self.axis = axis

    def __call__(self, sample):
        datax, labelx = sample
        data = datax if self.input else labelx
        #assuming channel last
        #batch_mean = np.mean(data,axis=tuple(range(0,len(data)-1)))
        #batch_std = np.std(data,axis=tuple(range(0,len(data)-1)))+1e-8
        batch_mean = np.mean(data, axis=self.axis if self.axis is not None else tuple(range(0, len(data.shape)-1)))
        batch_std = np.std(data, axis=self.axis if self.axis is not None else tuple(range(0, len(data.shape)-1)))+1e-8

        if(len(self.channels) > 0):
            if(self.channels_keep is None):
                self.channels_keep = np.setdiff1d(np.arange(data.shape[-1]), self.channels)
            batch_mean[self.channels_keep] = 0
            batch_std[self.channels_keep] = 1

        data = (data - batch_mean)/batch_std

        if(self.input):
            return (data, labelx)
        else:
            return (datax, data)
# Cell
class ButterFilter(object):
    """Apply butterworth filter.
    """
    def __init__(self, lowcut=50, highcut=50, fs=100, order=5, btype='band', forwardbackward=True, input=True):
        self.filter = butter_filter(lowcut, highcut, fs, order, btype)
        self.input = input
        self.forwardbackward = forwardbackward

    def __call__(self, sample):
        datax, labelx = sample
        data = datax if self.input else labelx

        if(self.forwardbackward):
            data = sosfiltfilt(self.filter, data, axis=0)
        else:
            data = sosfilt(self.filter, data, axis=0)

        if(self.input):
            return (data, labelx)
        else:
            return (datax, data)
# Cell
class ChannelFilter(object):
    """Select certain channels.
    """
    def __init__(self, channels=[0], input=True):
        self.channels = channels
        self.input = input

    def __call__(self, sample):
        data, label = sample
        if(self.input):
            return (data[..., self.channels], label)
        else:
            return (data, label[..., self.channels])
# Cell
class Transform(object):
    """Transforms data using a given function, i.e. data_new = func(data) if input is True, else label_new = func(label).
    """
    def __init__(self, func, input=False):
        self.func = func
        self.input = input

    def __call__(self, sample):
        data, label = sample
        if(self.input):
            return (self.func(data), label)
        else:
            return (data, self.func(label))
# Cell
class TupleTransform(object):
    """Transforms data using a given function operating on both data and label and returning a tuple, i.e. data_new, label_new = func(data_old, label_old).
    """
    def __init__(self, func, input=False):
        self.func = func

    def __call__(self, sample):
        data, label = sample
        return self.func(data, label)
# Cell
def aggregate_predictions(preds, targs=None, idmap=None, aggregate_fn=np.mean, verbose=False):
    '''
    aggregates potentially multiple predictions per sample (can also pass targs for convenience)
    idmap: idmap as returned by TimeseriesDatasetCrops's get_id_mapping
    preds: ordered predictions as returned by learn.get_preds()
    aggregate_fn: function that is used to aggregate multiple predictions per sample (most commonly np.amax or np.mean)
    '''
    if(idmap is not None and len(idmap) != len(np.unique(idmap))):
        if(verbose):
            print("aggregating predictions...")
        preds_aggregated = []
        targs_aggregated = []
        for i in np.unique(idmap):
            preds_local = preds[np.where(idmap == i)[0]]
            preds_aggregated.append(aggregate_fn(preds_local, axis=0))
            if targs is not None:
                targs_local = targs[np.where(idmap == i)[0]]
                assert(np.all(targs_local == targs_local[0])) #all labels have to agree
                targs_aggregated.append(targs_local[0])
        if(targs is None):
            return np.array(preds_aggregated)
        else:
            return np.array(preds_aggregated), np.array(targs_aggregated)
    else:
        if(targs is None):
            return preds
        else:
            return preds, targs
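
# Usage sketch (illustrative, not part of the original module): averaging
# multiple crop-level predictions back to sample level via the dataset's idmap;
# the prediction values below are arbitrary.
def _example_aggregate_predictions():
    idmap = np.array([0, 0, 1, 1, 1])          # e.g. from ds.get_id_mapping()
    preds = np.array([[0.2, 0.8], [0.4, 0.6],  # two crops of sample 0
                      [0.9, 0.1], [0.7, 0.3], [0.8, 0.2]])  # three crops of sample 1
    targs = np.array([1, 1, 0, 0, 0])
    return aggregate_predictions(preds, targs=targs, idmap=idmap, aggregate_fn=np.mean)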