[8956d4]: / py_scripts / lmdb_utils.py

Download this file

74 lines (54 with data), 1.7 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import pickle
import lmdb
import selfies as sf
from tqdm import tqdm, trange
def read_lmdb(lmdb_path):
    """Load every record stored in a single-file LMDB database.

    Each value is unpickled and collected into a list, in the order the
    LMDB cursor yields the keys.
    NOTE(review): LMDB iterates keys in sorted byte order by default, so
    ``b"10"`` is expected to come before ``b"2"`` -- confirm if the
    original insertion order matters to the caller.

    Exceptions raised while opening, reading or unpickling propagate
    unchanged; the transaction and the environment are released either way.

    Args:
        lmdb_path: Path of the LMDB data file (opened with ``subdir=False``,
            read-only and without locking).

    Returns:
        A list with one unpickled object per key; empty when the database
        holds no entries.
    """
    # Context managers end the read transaction and close the environment
    # even when a lookup or an unpickle fails part-way through.
    with lmdb.open(
        lmdb_path,
        subdir=False,
        readonly=True,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=256,
    ) as env, env.begin() as txn:
        # Materialise the keys first so tqdm knows the total for its bar.
        keys = list(txn.cursor().iternext(values=False))
        # NOTE(security): pickle.loads can execute arbitrary code while
        # deserialising; only read LMDB files that come from a trusted source.
        return [pickle.loads(txn.get(key)) for key in tqdm(keys)]
def write_lmdb(out_list, save_path, map_size=1 << 40):
    """Pickle every item of ``out_list`` into a single-file LMDB database.

    Item number ``i`` is stored under the ASCII-encoded decimal key
    ``str(i)``. All items are written inside one write transaction, and the
    environment is closed before returning (also when an error occurs).

    Args:
        out_list: Iterable of picklable objects to store, in order.
        save_path: Path of the LMDB data file to write (``subdir=False``).
        map_size: Value passed to ``lmdb.open`` as ``map_size``, i.e. the
            upper bound in bytes for the database size. Defaults to
            1 TiB (``2**40`` = 1099511627776).
    """
    # Exiting the block leaves the write transaction first and then closes
    # the environment, so the file handle and memory map are never leaked.
    with lmdb.open(
        save_path,
        subdir=False,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=64,
        map_size=map_size,
    ) as env, env.begin(write=True) as lmdb_txn:
        # Wrap the items (not the enumerate) so tqdm can still read len().
        for i, item in enumerate(tqdm(out_list)):
            lmdb_txn.put(str(i).encode("ascii"), pickle.dumps(item))
if __name__ == "__main__":
    # Demo: load the training split and show which fields a record carries.
    train_lmdb = "./data/train_no_test_af/train.lmdb"
    records = read_lmdb(train_lmdb)
    first_record = records[0]
    print(first_record.keys())
    # For a closer look at one record, try e.g. len(records[1]["coordinates"]).

    # Reference only (not used by the code): presumably a description of the
    # keys found in each record -- verify against the actual dataset.
    field_descriptions = dict(
        atoms="atom types for each atom in the ligand",
        coordinates="3D coordinates for each atom in the ligand generated by RDKit. Max number of conformations is 10",
        pocket_atoms="atom types for each atom in the pocket",
        pocket_coordinates="3D coordinates for each atom in the pocket",
        mol="RDKit molecule object for the ligand",
        smi="SMILES string for the ligand",
        pocket="pdbid of the pocket",
    )