[45ad7e]: / singlecellmultiomics / utils / bdbsstats.py

Download this file

134 lines (110 with data), 3.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Buys's statsistics doodles
import scipy.stats
import numpy as np
import collections
import sklearn.metrics
import matplotlib.pyplot as plt
def sampleFromCounter(counter, n, replace=True, outputCounter=False):
counterSum = sum(counter.values())
if replace:
samples = np.random.choice(list(counter.keys()), n, p=[ float(counter[className])/float(counterSum) for className in counter ])
if outputCounter:
#@todo optimize
collections.Counter( samples )
else:
return( samples )
if counterSum<n:
raise ValueError('Cannot pick %s samples from a list of %s samples' % (n, counterSum))
samples = []
if outputCounter:
samples=collections.Counter()
for i in range(0,n):
counterSum = sum(counter.values())
classProbs = [ float(counter[className])/float(counterSum) for className in counter ]
s = list( np.random.choice( list(counter.keys()), 1, p=classProbs) )[0]
if outputCounter:
samples[s]+=1
else:
samples.append( s )
counter[s]-=1
return(samples)
class IterativeHistogram(object):
def __init__(self, minV, maxV, bins):
self.binCount = bins
self.min = minV
self.max = maxV
self.counts = np.zeros( self.binCount )
self.binStarts = np.linspace(self.min, self.max, self.binCount,endpoint=False)
self.binSize = self.binStarts[1] - self.binStarts[0]
def add(self, vector):
try:
locs = np.searchsorted(self.binStarts, vector,'right')-1
#print('input: %s' %vector)
#print('point: %s'%locs)
#print('starts:%s' % self.binStarts)
np.add.at( self.counts, locs, 1)
print(self.counts)
except:
exit('Failed adding vector to histogram')
return(self)
def plot(self,path=None):
fig, ax = plt.subplots() #figsize=(120, 10))
for i in range(0,self.binCount):
plt.bar(self.binStarts[i],self.counts[i], float(self.max-self.min)/float(self.binCount))
plt.ylabel("Density")
plt.xlabel("PIC score")
plt.title('PIC score distribution')
ax.legend(loc='upper right')
plt.yscale('log', nonposy='clip')
if path==None:
plt.show()
else:
plt.savefig(path)
def __add__(self, histogramToAdd):
n = IterativeHistogram(self.min, self.max, self.binCount)
n.counts += self.counts
n.counts += histogramToAdd.counts
return(n)
def p(self, value):
return( sum( self.counts[ np.searchsorted(self.binStarts, value,'right'): ] )/sum(self.counts) )
def getScipyRandomVariable(self):
values = self.binStarts
probabilities = self.counts/np.sum(self.counts)
X = scipy.stats.rv_discrete(values=(values, probabilities))
return(X)
def __iter__(self):
self.iterIndex = 0
return(self)
def __next__(self):
value = self.min
index = 0
for i in range(0,self.binCount):
index+=self.counts[i]
if index> self.iterIndex:
self.iterIndex+=1
return( self.binStarts[i] ) #+ 0.5*self.binSize
raise StopIteration
def calc_MI(x, y, bins):
c_xy = np.histogram2d(x, y, bins)[0]
mi = sklearn.metrics.mutual_info_score(None, None, contingency=c_xy)
return mi
## Example of Chi-squared test between two distributions:
#
#
# observedValues = [16, 18, 16, 14, 12, 12]
# expectedValues = [16, 16, 16, 16, 16, 8]
# binCount = 20
#
#
# expectedDistribution = list( IterativeHistogram(0,20, binCount).add(expectedValues) )
# print(expectedDistribution)
#
# observedDistribution = list( IterativeHistogram(0,20, binCount).add(observedValues) )
#
# print( scipy.stats.chisquare( observedDistribution, f_exp=expectedDistribution).pvalue )
#
# print( scipy.stats.ks_2samp( observedDistribution, expectedDistribution ))
# expectedDistribution = IterativeHistogram(0,20, binCount).add(expectedValues).getScipyRandomVariable()
# observedDistribution = IterativeHistogram(0,20, binCount).add(observedValues).getScipyRandomVariable()
#
# print( scipy.stats.kstest( observedDistribution, expectedDistribution ))