Switch to unified view

a b/singlecellmultiomics/utils/bdbsstats.py
1
# Buys's statsistics doodles
2
import scipy.stats
3
import numpy as np
4
import collections
5
import sklearn.metrics
6
import matplotlib.pyplot as plt
7
8
def sampleFromCounter(counter, n, replace=True, outputCounter=False):
9
    counterSum = sum(counter.values())
10
    if replace:
11
        samples = np.random.choice(list(counter.keys()), n, p=[ float(counter[className])/float(counterSum) for className in counter ])
12
        if outputCounter:
13
            #@todo optimize
14
            collections.Counter( samples )
15
        else:
16
            return(  samples )
17
    if counterSum<n:
18
        raise ValueError('Cannot pick %s samples from a list of %s samples' % (n, counterSum))
19
20
    samples = []
21
22
    if outputCounter:
23
        samples=collections.Counter()
24
    for i in range(0,n):
25
        counterSum = sum(counter.values())
26
        classProbs = [ float(counter[className])/float(counterSum) for className in counter ]
27
        s = list( np.random.choice( list(counter.keys()), 1, p=classProbs) )[0]
28
        if outputCounter:
29
            samples[s]+=1
30
        else:
31
            samples.append( s )
32
        counter[s]-=1
33
    return(samples)
34
35
36
class IterativeHistogram(object):
37
38
    def __init__(self, minV, maxV, bins):
39
        self.binCount = bins
40
        self.min = minV
41
        self.max = maxV
42
        self.counts = np.zeros( self.binCount )
43
44
        self.binStarts = np.linspace(self.min, self.max, self.binCount,endpoint=False)
45
        self.binSize = self.binStarts[1] - self.binStarts[0]
46
47
    def add(self, vector):
48
        try:
49
            locs = np.searchsorted(self.binStarts, vector,'right')-1
50
            #print('input: %s' %vector)
51
            #print('point: %s'%locs)
52
            #print('starts:%s' % self.binStarts)
53
54
            np.add.at( self.counts, locs, 1)
55
            print(self.counts)
56
        except:
57
58
            exit('Failed adding vector to histogram')
59
        return(self)
60
61
    def plot(self,path=None):
62
        fig, ax = plt.subplots() #figsize=(120, 10))
63
        for i in range(0,self.binCount):
64
            plt.bar(self.binStarts[i],self.counts[i], float(self.max-self.min)/float(self.binCount))
65
66
        plt.ylabel("Density")
67
        plt.xlabel("PIC score")
68
        plt.title('PIC score distribution')
69
        ax.legend(loc='upper right')
70
        plt.yscale('log', nonposy='clip')
71
        if path==None:
72
            plt.show()
73
        else:
74
            plt.savefig(path)
75
76
    def __add__(self, histogramToAdd):
77
        n = IterativeHistogram(self.min, self.max, self.binCount)
78
        n.counts += self.counts
79
        n.counts += histogramToAdd.counts
80
        return(n)
81
82
    def p(self, value):
83
        return( sum( self.counts[ np.searchsorted(self.binStarts, value,'right'): ] )/sum(self.counts) )
84
85
86
    def getScipyRandomVariable(self):
87
88
        values = self.binStarts
89
        probabilities = self.counts/np.sum(self.counts)
90
        X = scipy.stats.rv_discrete(values=(values, probabilities))
91
        return(X)
92
93
94
    def __iter__(self):
95
        self.iterIndex = 0
96
        return(self)
97
98
    def __next__(self):
99
        value = self.min
100
        index = 0
101
        for i in range(0,self.binCount):
102
            index+=self.counts[i]
103
104
            if index> self.iterIndex:
105
                self.iterIndex+=1
106
                return( self.binStarts[i]  ) #+ 0.5*self.binSize
107
        raise StopIteration
108
109
def calc_MI(x, y, bins):
110
    c_xy = np.histogram2d(x, y, bins)[0]
111
    mi = sklearn.metrics.mutual_info_score(None, None, contingency=c_xy)
112
    return mi
113
114
## Example of Chi-squared test between two distributions:
115
#
116
#
117
# observedValues = [16, 18, 16, 14, 12, 12]
118
# expectedValues = [16, 16, 16, 16, 16, 8]
119
# binCount = 20
120
#
121
#
122
# expectedDistribution = list( IterativeHistogram(0,20, binCount).add(expectedValues) )
123
# print(expectedDistribution)
124
#
125
# observedDistribution = list( IterativeHistogram(0,20, binCount).add(observedValues) )
126
#
127
# print( scipy.stats.chisquare( observedDistribution, f_exp=expectedDistribution).pvalue )
128
#
129
# print( scipy.stats.ks_2samp( observedDistribution, expectedDistribution ))
130
# expectedDistribution = IterativeHistogram(0,20, binCount).add(expectedValues).getScipyRandomVariable()
131
# observedDistribution = IterativeHistogram(0,20, binCount).add(observedValues).getScipyRandomVariable()
132
#
133
# print( scipy.stats.kstest( observedDistribution, expectedDistribution ))