|
a |
|
b/singlecellmultiomics/countTableProcessing/downsampleDataFrame.py |
|
|
1 |
import pandas as pd |
|
|
2 |
import numpy as np |
|
|
3 |
import glob |
|
|
4 |
import math |
|
|
5 |
import re |
|
|
6 |
import sys |
|
|
7 |
import multiprocessing |
|
|
8 |
|
|
|
9 |
|
|
|
10 |
def downsampleRow(args):
    """Randomly remove counts from a row until its sum reaches a target.

    Counts are removed one at a time from entries drawn with probability
    proportional to their current (positive) value. The input row is not
    modified; a downsampled copy is returned.

    Args:
        args: A ``(row, targetSum)`` tuple, packed into a single argument so
            the function can be used directly with ``Pool.map``. ``row`` is
            a pandas Series of counts and ``targetSum`` is the total to
            downsample to.

    Returns:
        A copy of ``row``. If the row sum was already at or below
        ``targetSum`` the copy is unchanged.

    Note:
        Entries are decremented in steps of 1, so counts are presumably
        integers (TODO confirm against callers). When the remaining excess
        is smaller than one whole count (possible with float data) the row
        is returned as is, with a sum up to 1 above ``targetSum``.
    """
    row, targetSum = args
    downsampledRow = row.copy()
    currentCount = downsampledRow.sum()
    while currentCount > targetSum and currentCount != 0:
        desiredTossCount = int(currentCount - targetSum)
        if desiredTossCount < 1:
            # Less than one whole count left to remove: nothing can be
            # tossed, so looping again would never terminate.
            break
        possible = downsampledRow[downsampledRow > 0]
        if possible.empty:
            # No positive entries left to draw from.
            break
        # Normalise over the positive entries only so the probabilities
        # always sum to 1, as np.random.choice requires.
        probabilities = (possible / possible.sum()).to_numpy()
        for indexToLower in np.random.choice(
                possible.index, desiredTossCount,
                replace=True, p=probabilities):
            # Drawing with replacement can select an entry more often than
            # it has counts; such draws are skipped and made up for in the
            # next pass of the while loop.
            if downsampledRow.loc[indexToLower] > 0:
                downsampledRow.loc[indexToLower] -= 1
        currentCount = downsampledRow.sum()
    return downsampledRow
|
|
26 |
|
|
|
27 |
# downsample_to = number of counts to downsample each column to; columns whose total does not exceed this value are dropped
|
|
28 |
# min_feature_abundance = remove all rows whose total count does not exceed this value
|
|
29 |
|
|
|
30 |
|
|
|
31 |
def downsampleDataFrame(df, downsample_to, min_feature_abundance=50,
                        n_processes=8):
    """Downsample every column of a count table to the same total.

    Columns whose total does not exceed ``downsample_to`` and rows whose
    total does not exceed ``min_feature_abundance`` are dropped first. Each
    remaining column is then downsampled independently by ``downsampleRow``
    in a pool of worker processes.

    Args:
        df: pandas DataFrame of counts.
        downsample_to: Target total per column.
        min_feature_abundance: Rows must have a total strictly greater than
            this value to be kept.
        n_processes: Number of worker processes in the pool.

    Returns:
        A DataFrame with the retained rows and columns of ``df``, holding
        the downsampled counts.

    Raises:
        Exception: Any error raised while filtering or in a worker is
            propagated to the caller, so a failed run can never be mistaken
            for a downsampled table.
    """
    # Drop columns that cannot be downsampled and rows that are too sparse.
    df = df.loc[:, df.sum() > downsample_to]
    df = df.loc[df.sum(1) > min_feature_abundance, :]

    # Work on the transpose so each original column becomes one row/task.
    subset = df.transpose()
    dfDownsampled = subset.copy()
    tasks = [(row, downsample_to) for _, row in subset.iterrows()]

    # The context manager shuts the pool down even when a worker fails.
    with multiprocessing.Pool(n_processes) as pool:
        downsampledRows = pool.map(downsampleRow, tasks)

    # pool.map preserves task order, so results line up positionally.
    for idx, drow in enumerate(downsampledRows):
        dfDownsampled.iloc[idx, :] = drow
    return dfDownsampled.transpose()