# Standard imports for this post: RDKit for the chemistry, psycopg2 for
# talking to PostgreSQL, gzip/json for handling the compressed SDF data.
from rdkit import Chem
import gzip
import time
import psycopg2
import json
import rdkit

# Record the RDKit version the timings below were generated with.
print(rdkit.__version__)
2023.03.1
May 7, 2023
Updated 07.05.2023. This is an updated version of a post from 2020 with a few revisions.
I had a couple of online conversations this week about working with the PostgreSQL cartridge from Python and sending molecules back and forth between Python and the database. Here’s a quick blogpost on that topic.
# NOTE(review): this cell is a verbatim duplicate of the import cell at the
# top of the post (an artifact of the notebook export); kept as-is.
from rdkit import Chem
import gzip
import time
import psycopg2
import json
import rdkit

# Record the RDKit version the timings below were generated with.
print(rdkit.__version__)
2023.03.1
The dataset we’ll use here is a set of molecules+activity data from ChEMBL26. We take all the measured Ki values that are less than 1nM.
Here’s how I constructed the dataset
# Build the benchmarking dataset: every ChEMBL26 Ki measurement below 1 nM,
# together with its mol block, written out as a gzipped SDF with one SD
# property per result column.
conn2 = psycopg2.connect("dbname=chembl_26 host=localhost")
curs2 = conn2.cursor()
curs2.execute('''select cid1.chembl_id as compound_chembl_id,cid2.chembl_id as assay_chembl_id,
target_dictionary.chembl_id as target_chembl_id,target_dictionary.pref_name as pref_name,
standard_relation,standard_value,standard_units,standard_type,molfile
from activities acts
join assays using (assay_id)
join compound_structures using (molregno)
join chembl_id_lookup cid1 on (molregno=entity_id and entity_type='COMPOUND')
join chembl_id_lookup cid2 on (assay_id=cid2.entity_id and cid2.entity_type='ASSAY')
join target_dictionary using (tid)
where standard_type='Ki' and standard_units='nM' and standard_value is not null and
standard_relation='=' and standard_value<1''')
data = curs2.fetchall()
# (removed a redundant `import gzip` here; gzip is imported at the top of the post)
cnames = [x.name for x in curs2.description]
w = Chem.SDWriter(gzip.open('/home/glandrum/RDKit_blog/data/chembl26_very_active.sdf.gz','wt+'))
for row in data:
    # the last column (molfile) is the structure itself, not an SD property
    m = Chem.MolFromMolBlock(row[-1])
    if m is None:
        # fix: MolFromMolBlock returns None for unparseable mol blocks;
        # previously this would have raised AttributeError on SetProp
        continue
    for i in range(len(cnames)-1):
        m.SetProp(cnames[i],str(row[i]))
    w.write(m)
# fix: close the writer explicitly (was `w=None`, which relied on garbage
# collection to flush the SDWriter and the underlying gzip stream)
w.close()
The timing data in this post were generated on a standard desktop PC from 2022 (Dell XPS, 16 core i9 3.2 GHz CPU, 64GB RAM) running Ubuntu 22.04. I used the Ubuntu install of PostgreSQL 14 and didn’t do any tuning of the database configuration.
Let’s start by assembling a benchmarking set of 30K molblocks:
How long does it take to parse all the molblocks on the python side?
# Time client-side parsing: build an RDKit molecule from every mol block.
start = time.time()
ms = [Chem.MolFromMolBlock(mb) for mb in molblocks]
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
that took 3.48 seconds
What about to do the same work in the database?
# Same parsing work, but server-side: load the raw mol blocks into a table
# and let the cartridge's mol_from_ctab() build the molecules.
conn = psycopg2.connect("dbname=demodb host=localhost")
curs = conn.cursor()
for tbl in ('molbs', 'mols'):
    curs.execute(f'drop table if exists {tbl}')
curs.execute('create table molbs (chembl_id text,molb text)')
curs.executemany('insert into molbs values (%s,%s)', list(zip(nms, molblocks)))
start = time.time()
curs.execute('select chembl_id,mol_from_ctab(molb::cstring) m into mols from molbs')
conn.commit()
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
that took 4.66 seconds
Notice that we also had to transfer the mol blocks to the database. I didn’t include that in the timing results because we’re just looking at processing time.
It seems silly to do the work of processing the mol blocks in the database a second time. Fortunately, we can add the molecules to the database in RDKit’s binary form:
# Skip the reparsing entirely: send RDKit's binary (pickle) form to the
# database and reconstitute molecules there with mol_from_pkl().
conn = psycopg2.connect("dbname=demodb host=localhost")
curs = conn.cursor()
curs.execute('drop table if exists mols')
curs.execute('create table mols (chembl_id text,m mol)')
start = time.time()
curs.executemany('insert into mols values (%s,mol_from_pkl(%s))',
                 [(nm, mol.ToBinary()) for nm, mol in zip(nms, ms)])
conn.commit()
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
that took 2.65 seconds
What about going the other way: we have binary molecules in the database and want to pull them back into Python to work with them?
# The reverse direction: pull binary molecules out of the database and
# rebuild RDKit Mol objects on the Python side.
conn = psycopg2.connect("dbname=demodb host=localhost")
curs = conn.cursor()
start = time.time()
curs.execute('select chembl_id,mol_to_pkl(m) from mols')
tms = [Chem.Mol(pkl.tobytes()) for _, pkl in curs.fetchall()]
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
that took 1.72 seconds
We can, of course, do searches in the database and then pull just the molecules from the search results into Python:
# Substructure search in the database, then pull only the hits back as
# binary molecules.
conn = psycopg2.connect("dbname=demodb host=localhost")
curs = conn.cursor()
start = time.time()
curs.execute('select chembl_id,mol_to_pkl(m) from mols where m@>qmol_from_smarts(%s)',
             ('c1ncn[o,n]1',))
tms = [Chem.Mol(hit[1].tobytes()) for hit in curs.fetchall()]
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds and returned {len(tms)} results")
that took 0.42 seconds and returned 993 results
The cartridge can calculate a number of molecular descriptors directly, but there are more available in Python. Let’s calculate some of those and add them to the database.
We’ll pull the binary molecules from the database, calculate BCUT2D descriptors for them, and then add the descriptors to a new database table.
# Compute BCUT2D descriptors in Python for every molecule in the database
# and store them in a separate `bcuts` table keyed by chembl_id.
# NOTE(review): `descrdefn` (the column-definition fragment) and
# `descrholder` (the "%s,..." placeholder fragment) are defined in a cell
# not shown here — confirm their contents match the BCUT2D output length.
conn = psycopg2.connect("dbname=demodb host=localhost")
curs = conn.cursor()
curs.execute('drop table if exists bcuts')
curs.execute(f'create table bcuts (chembl_id text,{descrdefn})')
curs.execute('select chembl_id,mol_to_pkl(m) from mols')
bcut_data = []
for row in curs.fetchall():
    trow = [row[0]]
    # rebuild the RDKit molecule from its binary (pickle) form
    mol = Chem.Mol(row[1].tobytes())
    try:
        descrs = rdMolDescriptors.BCUT2D(mol)
    except ValueError:
        # BCUT2D needs partial charges; molecules with atom types that
        # have no charge parameters raise ValueError and are skipped
        # (this is discussed in the text below).
        continue
    trow.extend(descrs)
    bcut_data.append(trow)
cmd = f'insert into bcuts values (%s,{descrholder})'
curs.executemany(cmd, bcut_data)
conn.commit()
Since the bcuts descriptors use partial charges and we don’t have parameters for all atom types, some molecules don’t have values:
We loaded the molecules from an SDF, but ignored the data fields in that SDF. Now let’s load those into the database too.
We’ll take advantage of PostgreSQL’s jsonb type to store the properties on each molecule in a dictionary-like object.
Let’s start by loading the data:
# Reload the SDF, this time keeping each molecule's SD properties as a
# jsonb dictionary alongside the binary molecule. Inserts are batched in
# chunks of 10000 rows.
conn = psycopg2.connect("dbname=demodb host=localhost")
curs = conn.cursor()
curs.execute('drop table if exists mols')
curs.execute('create table mols (chembl_id text,m mol,sdf_data jsonb)')

insert_sql = 'insert into mols values (%s,mol_from_pkl(%s),%s)'
rows = []
with gzip.open('../data/chembl26_very_active.sdf.gz','r') as inf:
    for m in Chem.ForwardSDMolSupplier(inf):
        if not m:
            continue
        rows.append((m.GetProp('compound_chembl_id'),
                     m.ToBinary(),
                     json.dumps(m.GetPropsAsDict())))
        if len(rows) >= 10000:
            curs.executemany(insert_sql, rows)
            conn.commit()
            rows = []
# flush whatever is left over from the last partial batch
curs.executemany(insert_sql, rows)
conn.commit()
Demonstrate how to do a string query:
# jsonb string query: count rows for one target name.
start = time.time()
curs.execute("select count(*) from mols where sdf_data->>'pref_name' = 'Human immunodeficiency virus type 1 protease'")
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
curs.fetchone()
that took 0.01 seconds
(892,)
And a query on a floating point value, here we’re counting the number of rows where the Ki value is less than 10 picomolar.
# jsonb numeric query: cast the text value to float and compare.
start = time.time()
curs.execute("select count(*) from mols where (sdf_data->>'standard_value')::float < 0.01")
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
curs.fetchone()
that took 0.01 seconds
(916,)
Get all the rows with measurements against HIV protease where Ki < 10 picomolar
# Combine a jsonb string filter with a jsonb numeric filter and return the
# molecules themselves.
start = time.time()
curs.execute("select chembl_id,m from mols "
             "where sdf_data->>'pref_name' = 'Human immunodeficiency virus type 1 protease' "
             "and (sdf_data->>'standard_value')::float < 0.01")
d = curs.fetchall()
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
d[:2]
that took 0.02 seconds
[('CHEMBL443030',
'COc1cc(CN2C(=O)N(Cc3ccc(O)c(OC)c3)N(Cc3ccccc3)C[C@@H](O)[C@H]2Cc2ccccc2)ccc1O |(3.8692,2.234,;3.0465,2.2956,;2.5817,1.614,;1.759,1.6756,;1.2943,0.994,;0.4716,1.0556,;0.0069,0.374,;-0.8089,0.4969,;-1.0521,1.2853,;-1.4137,-0.0642,;-2.1817,0.2372,;-2.3046,1.053,;-3.0726,1.3544,;-3.1955,2.1702,;-2.5505,2.6846,;-2.6735,3.5004,;-1.7826,2.3832,;-1.1376,2.8975,;-0.3696,2.5961,;-1.6596,1.5674,;-1.352,-0.8869,;-2.0665,-1.2994,;-2.0665,-2.1244,;-2.781,-2.5369,;-2.781,-3.3619,;-2.0665,-3.7744,;-1.352,-3.3619,;-1.352,-2.5369,;-0.6704,-1.3516,;0.118,-1.1085,;0.6791,-1.7132,;0.4194,-0.3405,;1.2421,-0.2788,;1.7068,-0.9605,;2.5295,-0.8988,;2.9942,-1.5805,;2.6363,-2.3238,;1.8136,-2.3854,;1.3488,-1.7038,;1.6523,0.2507,;2.4749,0.189,;2.9397,0.8707,;3.7624,0.809,)|'),
('CHEMBL177470',
'CC(C)(CCCN)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)O[C@H]1CO[C@H]2OCC[C@H]21)S(=O)(=O)c1ccc2c(c1)OCO2 |(8.05,-8.8625,;8.0542,-8.0375,;7.4667,-8.6167,;8.775,-7.6292,;9.4875,-8.0417,;10.2,-7.6292,;10.9125,-8.0417,;7.35,-7.6167,;7.3542,-6.7917,;6.6375,-6.3875,;5.9292,-6.8042,;5.9292,-7.6292,;5.2125,-6.3917,;5.2042,-5.5667,;5.9167,-5.1542,;5.9167,-4.3292,;6.625,-3.9167,;7.3417,-4.3167,;7.35,-5.15,;6.6375,-5.5667,;4.5,-6.8125,;3.7792,-6.4042,;3.775,-5.5792,;3.0667,-6.8167,;2.3542,-6.4042,;2.3625,-5.6875,;1.1167,-5.6792,;1.1167,-6.4,;0.4792,-7.475,;1.1042,-7.8417,;1.7375,-7.4875,;1.7417,-6.7667,;8.0667,-6.3792,;8.7792,-6.7792,;8.0542,-5.5542,;8.8625,-6.1625,;9.4417,-6.7417,;10.2417,-6.5292,;10.4542,-5.7292,;9.8625,-5.1417,;9.0667,-5.3667,;10.2292,-4.4042,;11.0542,-4.5292,;11.1917,-5.35,)|')]
Notice that we get CXSMILES with coordinates back from the database. We loaded the compounds from the SDF with coordinates and the CXSMILES coming back from the database includes those coordinates.
Let’s do one last query there to look at the composition of the dataset:
# Group by target name to see the composition of the dataset (top 20).
start = time.time()
curs.execute("select sdf_data->>'pref_name',count(distinct(chembl_id)) cnt "
             "from mols group by (sdf_data->>'pref_name') order by cnt desc limit 20")
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
curs.fetchall()
that took 0.04 seconds
[('Unchecked', 1475),
('Mu opioid receptor', 1202),
('Serine/threonine-protein kinase PIM1', 900),
('Kappa opioid receptor', 823),
('Apoptosis regulator Bcl-2', 695),
('Delta opioid receptor', 631),
('Human immunodeficiency virus type 1 protease', 588),
('Coagulation factor X', 558),
('Serotonin 1a (5-HT1a) receptor', 553),
('Histamine H3 receptor', 542),
('Neuronal acetylcholine receptor; alpha4/beta2', 483),
('Dopamine D3 receptor', 444),
('Thrombin', 386),
('Dopamine D2 receptor', 382),
('Serotonin transporter', 373),
('Alpha-1a adrenergic receptor', 364),
('Serotonin 2a (5-HT2a) receptor', 354),
('PI3-kinase p110-alpha/p85-alpha', 352),
('Tyrosine-protein kinase JAK1', 349),
('Phosphodiesterase 10A', 331)]
The table only has ~35K rows, but notice that the performance of these jsonb queries is still quite good.
I think using JSONB this way, which basically lets us combine a relational database with a document store, opens up a bunch of interesting possibilities. I’ll try and do another blog post on that in the near(ish) future.
I was curious about performance, so let’s try loading a larger dataset:
from rdkit import RDLogger
import glob

# PubChem SD files are noisy; silence RDKit's warning/error log while loading.
RDLogger.DisableLog('rdApp.*')

# Same jsonb loading pattern as before, but for the first two PubChem
# Compound files (~700K molecules), batched in chunks of 10000 rows.
conn = psycopg2.connect("dbname=demodb host=localhost")
curs = conn.cursor()
curs.execute('drop table if exists mols')
curs.execute('create table mols (pubchem_compound_id text,m mol,sdf_data jsonb)')

insert_sql = 'insert into mols values (%s,mol_from_pkl(%s),%s)'
rows = []
for fn in glob.glob('/scratch/Data/PubChem/Compound_*.sdf.gz')[:2]:
    print(fn)
    with gzip.open(fn,'r') as inf:
        for m in Chem.ForwardSDMolSupplier(inf):
            if not m:
                continue
            rows.append((m.GetProp('PUBCHEM_COMPOUND_CID'),
                         m.ToBinary(),
                         json.dumps(m.GetPropsAsDict())))
            if len(rows) >= 10000:
                curs.executemany(insert_sql, rows)
                conn.commit()
                rows = []
# flush the last partial batch
curs.executemany(insert_sql, rows)
conn.commit()
/scratch/Data/PubChem/Compound_000000001_000500000.sdf.gz
/scratch/Data/PubChem/Compound_006000001_006500000.sdf.gz
Now we’ve got about 700K compounds, how long do the queries take?
# jsonb numeric query against the ~700K-row PubChem table.
start = time.time()
curs.execute("select count(*) from mols where (sdf_data->>'PUBCHEM_ISOTOPIC_ATOM_COUNT')::int > 10")
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
curs.fetchone()
that took 0.62 seconds
(25,)
# Another single-field jsonb numeric query.
start = time.time()
curs.execute("select count(*) from mols where (sdf_data->>'PUBCHEM_HEAVY_ATOM_COUNT')::int > 200")
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
curs.fetchone()
that took 0.60 seconds
(20,)
# Two jsonb numeric filters combined.
start = time.time()
curs.execute("select count(*) from mols where (sdf_data->>'PUBCHEM_HEAVY_ATOM_COUNT')::int < 20 and (sdf_data->>'PUBCHEM_ISOTOPIC_ATOM_COUNT')::int > 10")
elapsed = time.time() - start
print(f" that took {elapsed:.2f} seconds")
curs.fetchone()
that took 0.72 seconds
(21,)
Those are still quite quick. Nice!