Diff of /benchmark/drug2smiles.py [000000] .. [bc9e98]

Switch to unified view

a b/benchmark/drug2smiles.py
1
'''
Map drug names to SMILES strings.

input:  "data/drugbank_drugs_info.csv"
output: "data/drug2smiles.pkl"
'''
7
8
9
10
import csv, pickle 
11
from collections import defaultdict 
12
def drug2smiles_func(csv_path="data/drugbank_drugs_info.csv"):
    """Build a mapping from lower-cased drug names to one SMILES string.

    Reads the CSV at ``csv_path``, skips its header row, and for every row
    whose SMILES field (column 27) is not blank registers that SMILES under
    two keys: the lower-cased value of column 3 and the lower-cased value of
    column 11 ('title' and 'name' in the column listing kept at the end of
    this file).

    When the same key occurs with several different SMILES strings, the one
    from the earliest row in the file is kept, so the result is reproducible
    from run to run.

    Args:
        csv_path: Path of the comma-delimited input file. Defaults to
            "data/drugbank_drugs_info.csv".

    Returns:
        A plain ``dict`` mapping lower-cased name -> SMILES string.

    Raises:
        OSError: If ``csv_path`` cannot be opened.
    """
    title_col, name_col, smiles_col = 3, 11, 27

    drug2smiles = {}
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(csv_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            # Blank lines come back as [] and truncated rows are too short
            # to index; neither can contribute a SMILES, so skip them.
            if len(row) <= smiles_col:
                continue
            smiles = row[smiles_col]
            if smiles.strip() == '':
                continue
            # setdefault keeps the first SMILES seen for each key instead of
            # an arbitrary member of an unordered set.
            for key in (row[title_col].lower(), row[name_col].lower()):
                drug2smiles.setdefault(key, smiles)

    #### to improve: a key can have more than one candidate SMILES.
    # Counts recorded by the original author -- presumably
    # {number of distinct SMILES per key: number of keys}; TODO confirm:
    #     7: 53, 3: 1452, 1: 26851, 5: 178, 10: 16, 14: 6, 2: 6129,
    #     4: 504, 17: 8, 12: 8, 8: 38, 6: 83, 11: 12, 9: 17, 161: 1,
    #     21: 2, 15: 4, 32: 2, 13: 2, 31: 1, 22: 2, 23: 3, 16: 1, 18: 2,
    #     104: 1, 19: 2
    return drug2smiles
36
37
### drug -> SMILES: build the mapping and save it as a pickle
38
if __name__ == "__main__":
    # Build the drug-name -> SMILES mapping and persist it as a pickle file.
    drug2smiles = drug2smiles_func()
    drug2smiles_file = "data/drug2smiles.pkl"
    # A context manager guarantees the file is flushed and closed even if
    # pickling fails part-way, instead of leaving the handle to the GC.
    with open(drug2smiles_file, 'wb') as pkl_file:
        pickle.dump(drug2smiles, pkl_file)
42
43
44
'''
Column layout (index: header) -- presumably of "data/drugbank_drugs_info.csv";
index 27 is the SMILES column.
[
 0: 'id',
 1: 'trial_id',
 2: 'kind',
 3: 'title',
 4: 'description',
 5: 'id',
 6: 'intervention_id',
 7: 'drug_id',
 8: 'id',
 9: 'type',
 10: 'drugbank_id',
 11: 'name',
 12: 'state',   ----- solid liquid
 13: 'description',
 14: 'cas_number',
 15: 'protein_formula',
 16: 'protein_weight',
 17: 'investigational',
 18: 'approved',
 19: 'vet_approved',
 20: 'experimental',
 21: 'nutraceutical',
 22: 'illicit',
 23: 'withdrawn',
 24: 'moldb_mono_mass',
 25: 'moldb_inchi',
 26: 'moldb_inchikey',
 27: 'moldb_smiles',
 28: 'moldb_average_mass',
 29: 'moldb_formula',
 30: 'synthesis_patent_id',
 31: 'protein_weight_details',
 32: 'biotech_kind']

'''