--- a +++ b/functions.py @@ -0,0 +1,100 @@ +import numpy as np +import random + +def quan_detector(dataset,label): + """ + the percent of zeros(n) promoter + """ + n = len(dataset[1]) + N = len(dataset) + p = [0]*n + count = 0. + pos_count = 0. + neg_count = 0. + for i in range(N): + d = dataset[i] + if all(d == p): + count += 1. + if label[i][0] == 1: + pos_count += 1. + else: + neg_count += 1. + return count/N, pos_count,neg_count +#quan_detector([[0,0,0],[0,1,0],[0,2,0],[1,0,1]]) + +def most_repeared_promoter(dataset,label): + N = len(dataset) + n = len(dataset[1]) + zeros = ','.join(map(str,['0']*n)) + # print len(zeros)-n + dict_count = {} + for i in range(N): + str_prom = ','.join(map(str,dataset[i])) + if str_prom not in dict_count.keys(): + dict_count[str_prom] = [1,0,0] + if label[i][0] == 1: + dict_count[str_prom][1] = 1 + else: + dict_count[str_prom][2] = 1 + else: + dict_count[str_prom][0] += 1 + if label[i][0] == 1: + dict_count[str_prom][1] += 1 + else: + dict_count[str_prom][2] += 1 + if zeros in dict_count.keys(): + dict_count.pop(zeros) # remove without + count = np.array(dict_count.values())[:,0] + max_count = max(count) + for k,v in dict_count.items(): + if v[0] == max_count: + idx_temp = k + idx = idx_temp.split(',') + return idx, max_count, dict_count[idx_temp] + +############################################################### +######### Dataset generation ############## +############################################################### +def indx(lab): + # lab = np.argmax(lab,axis=1) + p = [] # positive samples index-- ALS + n = [] # negative samples index-- Non-ALS + for i in range(len(lab)): + if lab[i] == 0: + p.append(i) + else: + n.append(i) + return p, n + + +def dataset(X, Y, test_ratio): + lab = np.argmax(Y, axis=1) + pos_s, neg_s = indx(lab) + + N = len(lab) + idx = range(N) + + N_te = int(N * test_ratio) / 5 * 5 # number of test samples + N_tr = N - N_te # number of training samples + + pos_s_te = int(N_te * 0.5) + neg_s_te = int(N_te * 0.5) + + random.shuffle(pos_s) + random.shuffle(neg_s) + + pos_idx_te = pos_s[:pos_s_te] + neg_idx_te = neg_s[:neg_s_te] + + te_idx = pos_idx_te + neg_idx_te + tr_idx = list(set(idx) - set(te_idx)) + + random.shuffle(te_idx) + random.shuffle(tr_idx) + + tr_X = X[tr_idx] + tr_Y = Y[tr_idx] + + te_X = X[te_idx] + te_Y = Y[te_idx] + return tr_X, tr_Y, te_X, te_Y \ No newline at end of file