b/singlecellmultiomics/utils/bdbsstats.py

# Buys's statistics doodles
import scipy.stats
import numpy as np
import collections
import sklearn.metrics
import matplotlib.pyplot as plt


def sampleFromCounter(counter, n, replace=True, outputCounter=False):
    # Draw n samples from a Counter mapping class -> count, with or without
    # replacement. Returns a list of samples, or a Counter when outputCounter is set.
    counterSum = sum(counter.values())
    if replace:
        samples = np.random.choice(
            list(counter.keys()), n,
            p=[float(counter[className]) / float(counterSum) for className in counter])
        if outputCounter:
            # @todo optimize
            return collections.Counter(samples)
        else:
            return samples
    if counterSum < n:
        raise ValueError('Cannot pick %s samples from a list of %s samples' % (n, counterSum))

    samples = []
    if outputCounter:
        samples = collections.Counter()
    for i in range(0, n):
        # Without replacement: recompute the class probabilities after every draw,
        # since each drawn observation is removed from the counter.
        counterSum = sum(counter.values())
        classProbs = [float(counter[className]) / float(counterSum) for className in counter]
        s = list(np.random.choice(list(counter.keys()), 1, p=classProbs))[0]
        if outputCounter:
            samples[s] += 1
        else:
            samples.append(s)
        counter[s] -= 1
    return samples

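
# Example usage of sampleFromCounter (illustrative sketch, not part of the original
# module; the class labels and counts below are made up):
#
# classCounts = collections.Counter({'A': 10, 'B': 5, 'C': 1})
# print( sampleFromCounter(classCounts.copy(), 5, replace=True) )
# print( sampleFromCounter(classCounts.copy(), 5, replace=False, outputCounter=True) )
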

class IterativeHistogram(object):
    # Fixed-range histogram that can be filled incrementally and merged with '+'.

    def __init__(self, minV, maxV, bins):
        self.binCount = bins
        self.min = minV
        self.max = maxV
        self.counts = np.zeros(self.binCount)

        self.binStarts = np.linspace(self.min, self.max, self.binCount, endpoint=False)
        self.binSize = self.binStarts[1] - self.binStarts[0]

    def add(self, vector):
        try:
            # Assign every value to the bin whose start lies directly below it;
            # np.add.at increments a bin once per value mapped to it.
            locs = np.searchsorted(self.binStarts, vector, 'right') - 1
            np.add.at(self.counts, locs, 1)
        except Exception as e:
            raise ValueError('Failed adding vector to histogram: %s' % e)
        return self

    def plot(self, path=None):
        fig, ax = plt.subplots()
        for i in range(0, self.binCount):
            plt.bar(self.binStarts[i], self.counts[i],
                    float(self.max - self.min) / float(self.binCount))

        plt.ylabel("Density")
        plt.xlabel("PIC score")
        plt.title('PIC score distribution')
        ax.legend(loc='upper right')
        # note: newer matplotlib versions use nonpositive='clip' instead of nonposy
        plt.yscale('log', nonposy='clip')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)

    def __add__(self, histogramToAdd):
        n = IterativeHistogram(self.min, self.max, self.binCount)
        n.counts += self.counts
        n.counts += histogramToAdd.counts
        return n

    def p(self, value):
        # Fraction of all counts that fall in bins starting above `value`
        # (an empirical upper-tail probability).
        return sum(self.counts[np.searchsorted(self.binStarts, value, 'right'):]) / sum(self.counts)

    def getScipyRandomVariable(self):
        # Discrete random variable over the bin starts, weighted by the observed counts.
        values = self.binStarts
        probabilities = self.counts / np.sum(self.counts)
        X = scipy.stats.rv_discrete(values=(values, probabilities))
        return X

    def __iter__(self):
        self.iterIndex = 0
        return self

    def __next__(self):
        # Iterate over the stored observations: each bin start is yielded once
        # per count recorded in that bin, in bin order.
        index = 0
        for i in range(0, self.binCount):
            index += self.counts[i]
            if index > self.iterIndex:
                self.iterIndex += 1
                return self.binStarts[i]  # + 0.5*self.binSize
        raise StopIteration

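
# Example usage of IterativeHistogram (illustrative sketch, not part of the original
# module; the value range, bin count and data below are made up):
#
# hist = IterativeHistogram(0, 1, 10)
# hist.add(np.random.random(100))
# print( hist.p(0.5) )  # fraction of observations in bins above 0.5
# merged = hist + IterativeHistogram(0, 1, 10).add([0.25, 0.9])
# merged.plot()
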

def calc_MI(x, y, bins):
    # Mutual information (in nats) between x and y, estimated from a 2D histogram
    # with `bins` bins per axis.
    c_xy = np.histogram2d(x, y, bins)[0]
    mi = sklearn.metrics.mutual_info_score(None, None, contingency=c_xy)
    return mi

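
# Example usage of calc_MI (illustrative sketch, not part of the original module;
# the vectors below are made up):
#
# x = np.random.random(1000)
# y = x + 0.1 * np.random.random(1000)
# print( calc_MI(x, y, bins=20) )
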

## Example of a Chi-squared test between two distributions:
#
# observedValues = [16, 18, 16, 14, 12, 12]
# expectedValues = [16, 16, 16, 16, 16, 8]
# binCount = 20
#
# expectedDistribution = list( IterativeHistogram(0, 20, binCount).add(expectedValues) )
# print(expectedDistribution)
#
# observedDistribution = list( IterativeHistogram(0, 20, binCount).add(observedValues) )
#
# print( scipy.stats.chisquare( observedDistribution, f_exp=expectedDistribution).pvalue )
#
# print( scipy.stats.ks_2samp( observedDistribution, expectedDistribution ))
#
# expectedDistribution = IterativeHistogram(0, 20, binCount).add(expectedValues).getScipyRandomVariable()
# observedDistribution = IterativeHistogram(0, 20, binCount).add(observedValues).getScipyRandomVariable()
#
# print( scipy.stats.kstest( observedDistribution, expectedDistribution ))