# transformers.py
from sklearn.base import BaseEstimator, TransformerMixin
import utils
import abc
import numpy as np
import pandas as pd
from constants import variable_type,column_names,NO_UNITS,ALL
import logger

class safe_unstacker(BaseEstimator,TransformerMixin):

    def __init__(self, *levels):
        self.levels = levels

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return safe_unstack(df,self.levels)

def safe_unstack(df,levels):
    subindex = 'subindex'
    # add a temporary subindex so rows with duplicated index entries can be unstacked
    df = utils.add_subindex(df,subindex)

    # unstack the requested index levels into columns
    df_unstacked = df.unstack(levels)

    # drop the 'value' column level, an artifact of unstacking the value column
    df_unstacked.columns = df_unstacked.columns.droplevel(0)

    # drop the temporary subindex
    df_unstacked.index = df_unstacked.index.droplevel(subindex)

    df_unstacked.dropna(axis=1,inplace=True,how='all')
    return df_unstacked

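# A minimal usage sketch (hypothetical frames; assumes a long-format DataFrame whose
# MultiIndex includes a 'component' level to pivot into columns):
#
#     unstacker = safe_unstacker('component')
#     df_wide = unstacker.fit_transform(df_long)
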
class add_level(BaseEstimator,TransformerMixin):

    def __init__(self,level_val,level_name,axis=0):
        self.level_val = level_val
        self.level_name = level_name
        self.axis = axis

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return utils.add_same_val_index_level(df,self.level_val,self.level_name,self.axis)

class column_standardizer(BaseEstimator,TransformerMixin):

    def __init__(self,data_dict,ureg,convert_units=True):
        self.data_dict = data_dict
        self.ureg = ureg
        self.convert_units = convert_units

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        df = df.copy()
        col_cnt = df.columns.size
        if col_cnt == 0: return df
        names = ['component','status','variable_type','units','description']
        tuples = []
        for col_ix in range(col_cnt):
            col = df.iloc[:,col_ix]
            new_col,new_name = self.standardize(col)
            df.iloc[:,col_ix] = new_col
            tuples.append(tuple(map(str,new_name)))
        df.columns = pd.MultiIndex.from_tuples(tuples,names=names)
        df.sort_index(axis=1, inplace=True)
        return df

    def standardize(self,col):
        old_col_name = col.name
        guess_component = old_col_name[0]
        units = old_col_name[-2]
        desc = old_col_name[-1]
        dtype = col.dtype
        defs = self.data_dict.tables.definitions
        defs = defs[defs.component == guess_component]
        best_def = None
        for ix,row in defs.iterrows():
            def_units = row['units']
            if can_convert(def_units,units,self.ureg):
                best_def = row
                break

        if (best_def is None) and (dtype != object):
            status = 'unknown'
            var_type = variable_type.QUANTITATIVE
        elif (best_def is None) or ((best_def['variable_type'] == variable_type.QUANTITATIVE) and (dtype == object)):
            status = 'unknown'
            var_type = variable_type.NOMINAL
            if units != NO_UNITS:
                desc = utils.append_to_description(desc,units)
                units = NO_UNITS
        else:
            status = 'known'
            var_type = best_def['variable_type']
            new_units = best_def['units']
            if new_units != units:
                if not self.ureg.same_units(units,new_units) and self.convert_units:
                    col = self.ureg.convert_units(units,new_units,col)
                desc = utils.append_to_description(str(desc),units)
                units = new_units

        return (col,(guess_component,status,var_type,units,desc))

def can_convert(unit1,unit2,med_ureg):
    if (unit1 == unit2): return True
    if (NO_UNITS in [unit1,unit2]): return False
    return med_ureg.same_dimensionality(unit1,unit2)

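# For example (units here are illustrative): can_convert('mg','mg',med_ureg) is True by
# equality, can_convert('mg',NO_UNITS,med_ureg) is False, and can_convert('mg','g',med_ureg)
# falls through to med_ureg.same_dimensionality('mg','g').
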
class oob_value_remover(BaseEstimator,TransformerMixin):
    def __init__(self,data_dict):
        self.data_dict = data_dict

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        logger.log('Drop OOB data | {}'.format(df.shape),new_level=True)
        df = df.copy()
        idx = pd.IndexSlice
        df = df.sort_index(axis=1).sort_index()
        for component in df.columns.get_level_values('component').unique().tolist():
            component_defs = self.data_dict.defs_for_component(component)
            for units in df[component].columns.get_level_values(column_names.UNITS).unique().tolist():
                df_slice = df.loc[:,idx[component,:,:,units,:]]
                logger.log('{}, {}, {}'.format(component,units,df_slice.count().sum()))
                matching_defs = component_defs[(component_defs.units == units)]
                if matching_defs.empty: continue
                def_row = matching_defs.iloc[0]
                lower = def_row['lower']
                upper = def_row['upper']
                df.loc[:,idx[component,:,:,units,:]] = remove_oob_values(df_slice,lower,upper)
        df.dropna(how='all',inplace=True,axis=1)
        logger.end_log_level()
        return df

def remove_oob_values(data,lower,upper):
    oob_mask = (data < lower) | (data > upper)
    return data[~oob_mask]

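# E.g. with a one-column DataFrame holding [1, 5, 200] and bounds lower=0, upper=100,
# remove_oob_values returns [1, 5, NaN]: indexing a DataFrame with the boolean frame
# ~oob_mask keeps in-bounds values and leaves NaN where the mask is False.
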
class split_dtype(BaseEstimator,TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        if df.empty: return df
        df_numeric = df.apply(pd.to_numeric,errors='coerce')
        is_string = pd.isnull(df_numeric) & ~pd.isnull(df)

        df_string = df[is_string].dropna(how='all')
        tuples = [(col_name[0],NO_UNITS,utils.append_to_description(*map(str,col_name[3:0:-1]))) for col_name in df_string.columns]
        df_string.columns = pd.MultiIndex.from_tuples(tuples,names=df_string.columns.names)
        df_string = utils.add_same_val_index_level(df_string,level_val='string',level_name='dtype',axis=1)

        df_numeric = df_numeric.dropna(how='all')
        df_numeric = utils.add_same_val_index_level(df_numeric,level_val='number',level_name='dtype',axis=1)

        df_joined = df_numeric.join(df_string,how='outer')
        del df_string,df_numeric

        df_joined.columns = df_joined.columns.droplevel('dtype')
        df_joined.dropna(how='all',inplace=True,axis=1)
        return df_joined

class combine_like_cols(BaseEstimator,TransformerMixin):
    def fit(self, df, y=None, **fit_params):
        logger.log('FIT Combine like columns {}'.format(df.shape),new_level=True)

        self.columns_to_combine = {}
        groupby_cols = list(df.columns.names)
        groupby_cols.remove(column_names.DESCRIPTION)
        grouped = df.groupby(level=groupby_cols,axis=1)

        for index,group in grouped:
            logger.log(index)
            if index[2] == variable_type.NOMINAL: continue

            # order the group's columns by descending non-null count
            ordered_cols = group[group.count().sort_values(ascending=False).index.tolist()].columns.tolist()
            self.columns_to_combine[index] = ordered_cols

        logger.end_log_level()
        return self

    def transform(self, df):
        logger.log('TRANSFORM Combine like columns {}'.format(df.shape),new_level=True)

        for index,columns in self.columns_to_combine.items():
            logger.log(index)
            df_list = []
            for col_name in columns:
                if col_name not in df.columns:
                    df[col_name] = np.nan
                col = df[col_name].dropna()
                col.name = index + (ALL,)
                df_list.append(col)

            df_combined = pd.concat(df_list).to_frame()

            # Drop duplicate index entries, keeping the first occurrence. Because the
            # columns were sorted by descending count before combining, values from the
            # fullest column take priority. Although this may be a change in style from
            # previous behavior, it is simple and will be right most of the time.
            duplicates_to_drop = df_combined.index.duplicated(keep='first')
            df_combined = df_combined.loc[~duplicates_to_drop]

            # drop the columns that were combined
            df.drop(columns,axis=1,inplace=True)

            # join the combined column back to the DF
            df = df.join(df_combined,how='outer')

        df.sort_index(inplace=True)
        df.sort_index(inplace=True,axis=1)

        logger.end_log_level()
        return df

class flatten_index(BaseEstimator,TransformerMixin):

    def __init__(self,axis=0,suffix=None):
        self.axis = axis
        self.suffix = suffix

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return utils.flatten_index(df,axis=self.axis,suffix=self.suffix)

"""
253
Deal with categorical data
254
"""
255
class standardize_categories(BaseEstimator,TransformerMixin):

    def __init__(self,data_dict,category_map,use_numeric=True):
        self.data_dict = data_dict
        self.category_map = category_map
        self.use_numeric = use_numeric

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        for component in utils.get_components(df):
            cat_map = self.category_map.get(component,None)
            if cat_map is None: continue
            df_slice = df.loc[:,[component]]
            categorical_mask = df_slice.columns.get_level_values('variable_type').isin([variable_type.NOMINAL,variable_type.ORDINAL])
            df_categories = self.data_dict.tables.categories
            to_replace = list(cat_map.keys())
            col = 'val_numeric' if self.use_numeric else 'val_text'
            values = [df_categories.loc[cat_ix,col] for cat_ix in cat_map.values()]

            df_slice.loc[:,categorical_mask] = df_slice.loc[:,categorical_mask].replace(to_replace=to_replace,value=values)
            if not self.use_numeric:
                # also map numeric category codes to their text values
                to_replace = [df_categories.loc[cat_ix,'val_numeric'] for cat_ix in cat_map.values()]
                df_slice.loc[:,categorical_mask] = df_slice.loc[:,categorical_mask].replace(to_replace=to_replace,value=values)
            df.loc[:,[component]] = df_slice
        return df

class split_bad_categories(BaseEstimator,TransformerMixin):

    def __init__(self,data_dict,use_numeric=True):
        self.data_dict = data_dict
        self.use_numeric = use_numeric

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        for component in utils.get_components(df):
            df_categories = self.data_dict.get_categories(component)
            if df_categories is None: continue
            df_slice = df.loc[:,[component]]
            col = 'val_numeric' if self.use_numeric else 'val_text'
            valid_values = df_categories.loc[:,col]

            categorical_mask = df_slice.columns.get_level_values('variable_type').isin([variable_type.NOMINAL,variable_type.ORDINAL])
            categorical_slice = df_slice.loc[:,categorical_mask]

            df_valid_mask = categorical_slice.apply(lambda x: x.isin(valid_values))

            df_slice.loc[:,categorical_mask] = categorical_slice[df_valid_mask]
            df.loc[:,[component]] = df_slice

            df_invalid = categorical_slice[~df_valid_mask]
            df_invalid.columns = utils.set_level_to_same_val(df_invalid.columns,'status','unknown')
            df_invalid.columns = utils.set_level_to_same_val(df_invalid.columns,'variable_type',variable_type.NOMINAL)
            df = df.join(df_invalid,how='outer')
            del df_invalid
        df.dropna(how='all',inplace=True,axis=1)
        return df

class nominal_to_onehot(BaseEstimator,TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        if df.empty: return df

        logger.log('Nominal to OneHot',new_level=True)
        nominal_cols = df.columns.get_level_values('variable_type') == variable_type.NOMINAL

        for col_name in df.loc[:,nominal_cols]:
            column = df[col_name]
            df.drop(col_name,axis=1,inplace=True)
            df_dummies = pd.get_dummies(column)
            if df_dummies.empty: continue
            dummy_col_names = [col_name[:-1] + ('{}_{}'.format(col_name[-1],text),) for text in df_dummies.columns]
            df_dummies.columns = pd.MultiIndex.from_tuples(dummy_col_names,names=df.columns.names)
            df = df.join(df_dummies,how='outer')
        logger.end_log_level()
        return df

"""
340
Duplicate index aggregators
341
"""
342
343
class same_index_aggregator(BaseEstimator,TransformerMixin):

    def __init__(self,agg_func):
        self.agg_func = agg_func

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        duplicated = df.index.duplicated(keep=False)

        df_safe = df[~duplicated]
        df_duplicated = df[duplicated]

        df_fixed = df_duplicated.groupby(level=df_duplicated.index.names).agg(lambda x:self.agg_func(x))

        df_no_dups = pd.concat([df_safe,df_fixed])
        df_no_dups.sort_index(inplace=True)
        return df_no_dups

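# A minimal usage sketch (hypothetical aggregation choice): collapse duplicated index
# entries by taking the mean of each duplicated group.
#
#     dedup = same_index_aggregator(lambda x: x.mean())
#     df_unique = dedup.fit_transform(df)
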
"""
365
Fill NA
366
"""
367
368
class NaNFiller(BaseEstimator,TransformerMixin):

    def fit(self, X, y, **fit_params):
        self.fill_vals = self.get_fill_vals(X, y, **fit_params)
        return self

    def transform(self,df):
        return df.apply(lambda col: col.fillna(self.fill_vals[col.name]))

    def get_fill_vals(self, X, y, **fit_params):
        # base class fills with NaN, i.e. a no-op; subclasses override this
        return pd.Series(np.nan,index=X.columns)

class FillerZero(NaNFiller):

    def get_fill_vals(self, X, y, **fit_params):
        return pd.Series(0,index=X.columns)

class FillerMean(NaNFiller):

    def get_fill_vals(self, X, y, **fit_params):
        return X.mean()

class FillerMode(NaNFiller):

    def get_fill_vals(self, X, y, **fit_params):
        return X.mode().iloc[0]

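# A minimal usage sketch (hypothetical frames): the fillers learn per-column fill values
# at fit time and reuse them at transform time, so test data is filled with training
# statistics rather than its own.
#
#     filler = FillerMean().fit(X_train, y_train)
#     X_test_filled = filler.transform(X_test)
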
class do_nothing(BaseEstimator,TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return df

class GroupbyAndFFill(BaseEstimator,TransformerMixin):

    def __init__(self,level=None,by=None):
        self.level = level
        self.by = by

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return df.groupby(level=self.level,by=self.by).ffill()

class GroupbyAndBFill(BaseEstimator,TransformerMixin):

    def __init__(self,level=None,by=None):
        self.level = level
        self.by = by

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return df.groupby(level=self.level,by=self.by).bfill()

"""
428
filtering
429
"""
430
431
432
class column_filter(BaseEstimator,TransformerMixin):

    def fit(self, df, y=None, **fit_params):
        logger.log('*fit* Filter columns ({}) {}'.format(self.__class__.__name__, df.shape),new_level=True)
        if df.empty:
            self.cols_to_keep = []
        else:
            self.cols_to_keep = self.get_columns_to_keep(df, y, **fit_params)
        logger.end_log_level()
        return self

    def transform(self, df):
        logger.log('*transform* Filter columns ({}) {}'.format(self.__class__.__name__, df.shape))
        if df.empty or len(self.cols_to_keep) == 0: df_out = df.drop(df.columns,axis=1)
        else: df_out = df.loc[:,self.cols_to_keep]
        logger.log(end_prev=True)
        return df_out

    def get_columns_to_keep(self,df, y=None, **fit_params):
        return df.columns

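# Subclasses typically override only get_columns_to_keep; fit/transform handle logging,
# empty frames, and the column selection itself. A minimal hypothetical subclass:
#
#     class keep_numeric_only(column_filter):
#         def get_columns_to_keep(self, df, y=None, **fit_params):
#             return df.select_dtypes(include='number').columns
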
class DataSpecFilter(column_filter):

    def __init__(self,data_specs):
        self.data_specs = data_specs

    def get_columns_to_keep(self, df, y=None, **fit_params):
        df_cols = pd.DataFrame([list(c) for c in df.columns.tolist()],columns=df.columns.names)
        mask = utils.complex_row_mask(df_cols,self.data_specs)
        return [tuple(x) for x in df_cols[mask].to_records(index=False)]

class max_col_only(column_filter):
    def get_columns_to_keep(self, df, y=None, **fit_params):
        self.max_col = df.apply(utils.smart_count).sort_values().index.tolist()[-1]
        return [self.max_col]

class remove_small_columns(column_filter):

    def __init__(self,threshold):
        self.threshold = threshold

    def get_columns_to_keep(self, df, y=None, **fit_params):
        return df.loc[:,df.apply(utils.smart_count) > self.threshold].columns

class multislice_filter(column_filter):

    def __init__(self,slice_dict_list):
        self.slice_dict_list = slice_dict_list

    def get_columns_to_keep(self,df, y=None, **fit_params):
        cols = []
        for slice_dict in self.slice_dict_list:
            levels = list(slice_dict.keys())
            vals = tuple(slice_dict.values())
            cols += df.xs(vals,level=levels,axis=1,drop_level=False).columns.tolist()

        return cols

class DataNeedsFilter(multislice_filter):

    def __init__(self,data_needs):
        comp_dict = {}
        for dn in data_needs:
            component = dn[0]
            units = dn[1]
            units_list = comp_dict.get(component,[])
            units_list.append(units)
            comp_dict[component] = units_list

        slice_dict_list = []
        for component,units_list in comp_dict.items():
            if ALL in units_list:
                slice_dict_list.append({column_names.COMPONENT: component})
                continue
            for unit in units_list:
                slice_dict_list.append({
                            column_names.COMPONENT: component,
                            column_names.UNITS: unit
                        })
        super(DataNeedsFilter,self).__init__(slice_dict_list)

class func_filter(column_filter):

    def __init__(self,filter_func):
        self.filter_func = filter_func

    def get_columns_to_keep(self,df, y=None, **fit_params):
        return df.loc[:,df.apply(self.filter_func)].columns

class record_threshold(func_filter):

    def __init__(self,threshold):
        self.threshold = threshold
        filter_func = lambda col: col.dropna().index.get_level_values(column_names.ID).unique().size > self.threshold
        super(record_threshold,self).__init__(filter_func)

class drop_all_nan_cols(func_filter):

    def __init__(self):
        filter_func = lambda col: ~pd.isnull(col).all()
        super(drop_all_nan_cols,self).__init__(filter_func)

class known_col_only(func_filter):

    def __init__(self):
        filter_func = lambda col: col.name[1] == 'known'
        super(known_col_only,self).__init__(filter_func)

class filter_to_component(func_filter):
    def __init__(self,components):
        self.components = components
        filter_func = lambda col: col.name[0] in self.components
        super(filter_to_component,self).__init__(filter_func)

class filter_var_type(func_filter):

    def __init__(self,var_types):
        self.var_types = var_types
        filter_func = lambda col: col.name[2] in self.var_types
        super(filter_var_type,self).__init__(filter_func)

class summable_only(func_filter):

    def __init__(self,ureg,ignore_component_list):
        self.ureg = ureg
        self.ignore_component_list = ignore_component_list
        filter_func = lambda col: summable_only_filter(col,self.ureg,self.ignore_component_list)
        super(summable_only,self).__init__(filter_func)

def summable_only_filter(col,ureg,ignore_component_list):
    # a column is summable if it carries a volume or mass unit and its component is
    # not explicitly ignored
    is_summable_unit = (col.name[-2] != NO_UNITS) and (ureg.is_volume(str(col.name[-2])) or ureg.is_mass(str(col.name[-2])))
    should_ignore_component = col.name[0] in ignore_component_list
    return is_summable_unit and not should_ignore_component

class DropNaN(BaseEstimator,TransformerMixin):

    def __init__(self,axis=0,how='any',thresh=None):
        self.axis = axis
        self.how = how
        self.thresh = thresh

    def fit(self, df, y=None):
        return self

    def transform(self, df):
        return df.dropna(axis=self.axis,how=self.how,thresh=self.thresh)

class filter_ids(BaseEstimator,TransformerMixin):

    def __init__(self,print_loss=False,ids=None):
        self.print_loss = print_loss
        self.ids = ids

    def fit(self, x, y=None, **fit_params):
        if self.ids is None:
            ids = fit_params.get('ids',None)
            if (ids is None) and (y is not None):
                ids = y.index.get_level_values(column_names.ID).unique().tolist()
            self.ids = ids
        return self

    def transform(self, df):
        if self.ids is not None:
            out_df = df.loc[df.index.get_level_values(column_names.ID).isin(self.ids)]
        else: out_df = df
        if self.print_loss:
            print('Data Loss:',utils.data_loss(df,out_df))
        return out_df

class more_than_n_component(BaseEstimator,TransformerMixin):

    def __init__(self,n,component):
        self.n = n
        self.component = component

    def fit(self, df, y=None):
        return self

    def transform(self, df):
        if df.empty: return df.drop(df.index)
        good_ids = df.loc[:,[self.component]].dropna(how='all').groupby(level=column_names.ID).count() > self.n
        good_ids = good_ids.loc[good_ids.iloc[:,0]].index.unique().tolist()
        return df.loc[df.index.get_level_values(column_names.ID).isin(good_ids)]

"""
629
Simple Data Manipulation
630
"""
631
632
class TimeShifter(TransformerMixin,BaseEstimator):

    def __init__(self,datetime_level,shift='infer',n=1):
        self.shift = shift
        self.datetime_level = datetime_level
        self.n = n

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, df):
        shift = self.shift
        if shift == 'infer':
            # use the most common inferred frequency across IDs as the shift interval
            infer_freq = lambda grp: grp.index.get_level_values(self.datetime_level).inferred_freq
            inferred_freqs = df.groupby(level=column_names.ID).apply(infer_freq)
            shift = inferred_freqs.value_counts().sort_values().index[-1]
        df = df.reset_index(level=self.datetime_level)
        df.loc[:,self.datetime_level] = df.loc[:,self.datetime_level] + self.n*pd.Timedelta(shift)
        df.set_index(self.datetime_level,append=True,inplace=True)
        return df

class RowShifter(TransformerMixin,BaseEstimator):

    def __init__(self,n):
        self.n = n

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, df):
        return df.shift(self.n)

class Replacer(TransformerMixin,BaseEstimator):

    def __init__(self,to_replace=None, value=None, regex=False, method='pad'):
        self.to_replace = to_replace
        self.value = value
        self.regex = regex
        self.method = method

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, df):
        return df.replace(
                    to_replace=self.to_replace,
                    value=self.value,
                    regex=self.regex,
                    method=self.method
                )

class Delta(TransformerMixin,BaseEstimator):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, df):
        # difference between each row's next observed value and its last known value
        df_last = df.ffill().dropna(how='any')
        df_last = utils.add_same_val_index_level(df_last,'last','temp',axis=1)

        df_next = df.shift(-1).dropna(how='any')
        df_next = utils.add_same_val_index_level(df_next,'next','temp',axis=1)

        df_all = df_last.join(df_next,how='inner')
        return df_all.loc[:,'next'] - df_all.loc[:,'last']

class ToGroupby(TransformerMixin,BaseEstimator):

    def __init__(self, by=None, axis=0, level=None, as_index=True):
        self.by = by
        self.axis = axis
        self.level = level
        self.as_index = as_index

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, df):
        return df.groupby(by=self.by, axis=self.axis, level=self.level, as_index=self.as_index)
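
# Note: unlike the other transformers here, ToGroupby's transform returns a pandas
# GroupBy object rather than a DataFrame, so it should be followed by a step that
# aggregates it. A hypothetical sketch:
#
#     grouped = ToGroupby(level=column_names.ID).fit_transform(df)
#     df_means = grouped.mean()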