Switch to unified view

a b/singlecellmultiomics/countTableProcessing/downsampleDataFrame.py
1
import pandas as pd
2
import numpy as np
3
import glob
4
import math
5
import re
6
import sys
7
import multiprocessing
8
9
10
def downsampleRow(args):
    """Randomly remove counts from a row until its sum reaches the target.

    Counts are tossed one at a time. Each round draws as many entries as
    there are surplus counts, with replacement and with probability
    proportional to the entry's current value, and lowers every drawn entry
    by one as long as it is still positive. Draws that land on an entry
    that already hit zero are wasted, so rounds repeat until the target is
    met. Randomness comes from the global ``np.random`` state.

    Args:
        args: A ``(row, targetSum)`` tuple (a single argument so the function
            can be used with ``Pool.map``). ``row`` is a pandas Series of
            counts; ``targetSum`` is the desired total.

    Returns:
        A downsampled copy of ``row``; the input Series is not modified.
        Rows whose sum is already at or below ``targetSum`` come back
        unchanged.
    """
    row, targetSum = args
    downsampledRow = row.copy()
    currentCount = downsampledRow.sum()
    while currentCount > targetSum and currentCount != 0:
        desiredTossCount = int(currentCount - targetSum)
        if desiredTossCount < 1:
            # Less than one whole count of surplus (non-integer data): no
            # draw would be made, so stop instead of looping forever.
            break
        possible = downsampledRow[downsampledRow > 0]
        if possible.empty:
            # Nothing positive is left to lower.
            break
        # Normalise by the sum of the candidates themselves so the weights
        # always add up to one, even if the row holds negative values.
        probabilities = (possible / possible.sum()).to_numpy(dtype=float)
        for indexToLower in np.random.choice(
                possible.index, desiredTossCount,
                replace=True, p=probabilities):
            # The same entry can be drawn more often than it has counts.
            if downsampledRow[indexToLower] > 0:
                downsampledRow[indexToLower] -= 1
        currentCount = downsampledRow.sum()
    return downsampledRow
26
27
# downsample_to = downsample every column to this number of counts
# min_feature_abundance = remove all rows whose total count does not exceed this value
29
30
31
def downsampleDataFrame(df, downsample_to, min_feature_abundance=50,
                        n_processes=8):
    """Downsample every column of a count table to the same total count.

    Columns whose total is not strictly greater than ``downsample_to`` are
    dropped first, then rows whose total (over the remaining columns) is not
    strictly greater than ``min_feature_abundance``. Each remaining column
    is then downsampled independently by ``downsampleRow`` in a worker pool.

    Args:
        df: pandas DataFrame of counts.
        downsample_to: Target number of counts per column.
        min_feature_abundance: Rows need a total above this value to be kept.
        n_processes: Number of worker processes used for the downsampling.

    Returns:
        A new DataFrame with the surviving rows and columns of ``df`` and
        downsampled counts.

    Raises:
        Exception: Any error raised while filtering or downsampling is
            propagated instead of being printed, because continuing would
            hand back data that was never downsampled.
    """
    df = df.loc[:, df.sum() > downsample_to]
    df = df.loc[df.sum(axis=1) > min_feature_abundance, :]

    # Transpose so that every column of the input becomes one row that can be
    # shipped to a worker as a single Series.
    subset = df.transpose()
    dfDownsampled = subset.copy()

    tasks = [(row, downsample_to) for _, row in subset.iterrows()]
    # The context manager shuts the pool down even when a worker fails.
    with multiprocessing.Pool(n_processes) as pool:
        downsampledRows = pool.map(downsampleRow, tasks)

    # pool.map preserves task order, so positions line up with subset's rows.
    for idx, drow in enumerate(downsampledRows):
        dfDownsampled.iloc[idx, :] = drow

    return dfDownsampled.transpose()