Diff of /benchmark/utils.py [000000] .. [bc9e98]

Switch to side-by-side view

--- a
+++ b/benchmark/utils.py
@@ -0,0 +1,141 @@
+###### import ######
+
+
+import numpy as np 
+try:
+	from rdkit import Chem 
+	from rdkit.Chem import AllChem
+except:
+	pass 
+###### import ######
+
+
+
+def plot_hist(prefix_name, prediction, label):
+	import seaborn as sns
+	import matplotlib.pyplot as plt
+	figure_name = prefix_name + "_histogram.png"
+	positive_prediction = [prediction[i] for i in range(len(label)) if label[i]==1]
+	negative_prediction = [prediction[i] for i in range(len(label)) if label[i]==0]
+	sns.distplot(positive_prediction, hist=True,  kde=False, bins=50, color = 'blue', label = 'positive')
+	sns.distplot(negative_prediction, hist=True,  kde=False, bins=50, color = 'red', label = 'negative')
+	plt.legend()
+	plt.savefig(figure_name)
+	return 
+
+def replace_strange_symbol(text):
+	for i in "[]'\n/":
+		text = text.replace(i,'_')
+	return text
+
+#  xml read blog:  https://blog.csdn.net/yiluochenwu/article/details/23515923 
+def walkData(root_node, prefix, result_list):
+	temp_list =[prefix + '/' + root_node.tag, root_node.text]
+	result_list.append(temp_list)
+	children_node = root_node.getchildren()
+	if len(children_node) == 0:
+		return
+	for child in children_node:
+		walkData(child, prefix = prefix + '/' + root_node.tag, result_list = result_list)
+
+
+def dynamic_programming(s1, s2):
+	arr2d = [[0 for i in s2] for j in s1]
+	if s1[0] == s2[0]:
+		arr2d[0][0] = 1
+	for i in range(1, len(s1)):
+		if s1[i]==s2[0]:
+			arr2d[i][0] = 1
+		else:
+			arr2d[i][0] = arr2d[i-1][0] 
+	for i in range(1,len(s2)):
+		if s2[i]==s1[0]:
+			arr2d[0][i] = 1 
+		else:
+			arr2d[0][i] = arr2d[0][i-1]
+	for i in range(1,len(s1)):
+		for j in range(1,len(s2)):
+			if s1[i] == s2[j]:
+				arr2d[i][j] = arr2d[i-1][j-1] + 1 
+			else:
+				arr2d[i][j] = max(arr2d[i-1][j], arr2d[i][j-1])
+	return arr2d[len(s1)-1][len(s2)-1]
+
+
+def get_path_of_all_xml_file():
+	input_file = "./data/all_xml"
+	with open(input_file, 'r') as fin:
+		lines = fin.readlines()
+	input_file_lst = [i.strip() for i in lines]
+	return input_file_lst 
+
+
+def remove_multiple_space(text):
+	text = ' '.join(text.split())
+	return text 
+
+def nctid_2_xml_file_path(nctid):
+	assert len(nctid)==11
+	prefix = nctid[:7] + "xxxx"
+	datafolder = os.path.join("./ClinicalTrialGov/", prefix, nctid+".xml")
+	return datafolder 
+
+
+def fingerprints_from_mol(mol):
+    fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
+    size = 2048
+    nfp = np.zeros((1, size), np.int32)
+    for idx,v in fp.GetNonzeroElements().items():
+        nidx = idx%size
+        nfp[0, nidx] += int(v)
+    return nfp
+
+def smiles2fp(smiles):
+	try:
+		mol = Chem.MolFromSmiles(smile)
+		fp = fingerprints_from_mol(mol)
+		return fp 
+	except:
+		return np.zeros((1, 2048), np.int32)
+
+def smiles_lst2fp(smiles_lst):
+	fp_lst = [smiles2fp(smiles) for smiles in smiles_lst]
+	fp_mat = np.concatenate(fp_lst, 0)
+	fp = np.mean(fp_mat,0)
+	return fp	
+
+
+
+
+
+if __name__ == "__main__":
+	text = "interpret_result/NCT00329602__completed____1__1.7650960683822632__phase 4__['restless legs syndrome']__['placebo', 'ropinirole'].png"
+	print(replace_strange_symbol(text))
+
+
+
+
+
+
+# if __name__ == "__main__":
+# 	input_file_lst = get_path_of_all_xml_file() 
+# 	print(input_file_lst[:5])
+# '''
+# input_file_lst = [ 
+# 	'ClinicalTrialGov/NCT0000xxxx/NCT00000102.xml', 
+#  	'ClinicalTrialGov/NCT0000xxxx/NCT00000104.xml', 
+#  	'ClinicalTrialGov/NCT0000xxxx/NCT00000105.xml', 
+# 	  ... ]
+# '''
+
+
+
+# if __name__ == "__main__":
+# 	s1 = "328943"
+# 	s2 = "13785"
+# 	assert dynamic_programming(s1, s2)==2 
+
+
+
+
+