Thresholds for “random” in fingerprints the RDKit supports
fingerprints
similarity
reference
When is it just noise?
Published
May 18, 2021
Thresholds for “random” in fingerprints the RDKit supports
This post has been updated multiple times. This version is from 16.03.2025. Older versions of the notebook can be found on GitHub.
Note that older versions of this post had a mistake in the way the bit-based Morgan and FeatMorgan fingerprints were calculated: I was using the old fingerprint-generation functions and was generating folded count-based fingerprints instead of folded bit-based fingerprints. This led to similarity values and thresholds that were too high.
A frequent question that comes up when considering fingerprint similarity is: “What threshold should I use to determine what a neighbor is?” The answer is poorly defined. Of course it depends heavily on the details of the fingerprint, but there’s also a very subjective component: you want to pick a low enough threshold that you’re sure you won’t miss anything, but you don’t want to pick up too much noise.
The goal here is to systematically come up with some guidelines that can be used for fingerprints supported within the RDKit. We will do that by looking at similarities between random pairs of molecules picked from ChEMBL. The selection of the molecules used is described in another post.
For the analysis, the 50K similarity values are sorted and the values at particular thresholds are examined.
There’s a fair amount of code and results below, so here’s the summary table. To help interpret this: 45000 of the 50000 pairs (90%) have a MACCS keys similarity value less than 0.549.
As a quick reminder: these are pairs of molecules taken from ChEMBL with a single fragment, fewer than 50 heavy atoms, and a count-based MFP0 similarity of at least 0.65 to each other.
# Read the tab-separated pair file. Each row is split into byte fields; the
# code below uses fields 1 and 3 as SMILES and keeps fields 0 and 2 alongside
# the parsed molecules (presumably the molecule identifiers -- confirm against
# the file that produced this data).
# The context manager makes sure the gzip handle is closed once the rows have
# been read.
with gzip.open('../data/chembl35_50K.mfp0.pairs.txt.gz') as inf:
    ind = [line.split(b'\t') for line in inf]

# ms1[i] and ms2[i] hold the two members of pair i as (field, molecule) tuples.
ms1 = []
ms2 = []
for row in ind:
    ms1.append((row[0], Chem.MolFromSmiles(row[1])))
    ms2.append((row[2], Chem.MolFromSmiles(row[3])))
Those pairs are related to each other, but we want random pairs, so shuffle the second list:
# Fixed seed so the shuffle (and therefore the random pairing) is reproducible.
random.seed(23)
# Shuffle in place: ms2[i] is no longer the designated partner of ms1[i].
random.shuffle(ms2)
# Optional parallel backend: if ipyparallel can be imported and a client can
# be created, `dview` is a view over all engines with the required RDKit
# modules imported on them. Otherwise `dview` is None and compareFPs() falls
# back to computing fingerprints serially in this process.
try:
    import ipyparallel as ipp
    rc = ipp.Client()
    dview = rc[:]
    dview.execute('from rdkit import Chem')
    # Descriptors lives in rdkit.Chem, not at the top level of the package.
    dview.execute('from rdkit.Chem import Descriptors')
    dview.execute('from rdkit.Chem import rdFingerprintGenerator')
    dview.execute('from rdkit.Chem import rdMolDescriptors')
    dview.execute('from rdkit import DataStructs')
    dview.execute('from rdkit.Avalon import pyAvalonTools')
    dview.execute('from rdkit.Chem.Pharm2D import Gobbi_Pharm2D,Generate')
except Exception:
    # Covers a missing ipyparallel install as well as failures to connect,
    # but (unlike a bare `except:`) lets KeyboardInterrupt/SystemExit through.
    print("could not use ipyparallel")
    dview = None

# fingerprint name -> {quantile fraction: similarity value at that quantile}
results_accum = dict()


def accumResults(fps, fp2s, fpName):
    """Summarise pairwise Tanimoto similarities for one fingerprint type.

    Computes the Tanimoto similarity of each (fps[i], fp2s[i]) pair, then:

    * appends one HTML table row to ``fp_results.txt`` holding the similarity
      values found at the 70/80/90/95/99% positions of the sorted list,
    * prints each (fraction, value) pair,
    * stores the same values in ``results_accum[fpName]``,
    * draws a 20-bin histogram of all similarities labelled with ``fpName``.

    Args:
        fps: fingerprints of the first molecule of each pair.
        fp2s: fingerprints of the second molecule of each pair.
        fpName: label used in the table row, results dict and plot.

    Raises:
        ValueError: if there are no fingerprint pairs to compare. Raised
            before anything is written so no partial row ends up in the file.
    """
    sims = [DataStructs.TanimotoSimilarity(x, y) for x, y in zip(fps, fp2s)]
    if not sims:
        raise ValueError(f'no fingerprint pairs to compare for {fpName!r}')
    sl = sorted(sims)
    n_sims = len(sl)
    accum = {}
    with open('fp_results.txt', 'a+') as outf:
        outf.write(f'<tr><td>{fpName}</td><td>Tanimoto</td>\n')
        for frac in (.7, .8, .9, .95, .99):
            # value below which roughly `frac` of the similarities fall
            simv = sl[int(frac * n_sims)]
            print(frac, simv)
            outf.write(f' <td>{simv:.3f}</td>\n')
            accum[frac] = simv
        outf.write('</tr>')
    results_accum[fpName] = accum
    plt.figure(figsize=(5, 3))
    plt.hist(sims, bins=20)
    plt.xlabel(fpName)


def compareFPs(ms1, ms2, fpfn, fpName):
    """Fingerprint both molecule lists with ``fpfn`` and accumulate results.

    Args:
        ms1: sequence of tuples whose element 1 is the molecule.
        ms2: sequence of tuples whose element 1 is the molecule.
        fpfn: callable taking a molecule and returning its fingerprint.
        fpName: label passed through to accumResults().

    The fingerprints are computed on the ipyparallel engines when ``dview``
    is available and serially otherwise.
    """
    if dview is not None:
        fps = dview.map_sync(lambda x: fpfn(x[1]), ms1)
        fp2s = dview.map_sync(lambda x: fpfn(x[1]), ms2)
    else:
        fps = [fpfn(x[1]) for x in ms1]
        fp2s = [fpfn(x[1]) for x in ms2]
    accumResults(fps, fp2s, fpName)
# Morgan fingerprints, radius 0: count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=0)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Morgan0 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Morgan0 (bits)")
# Morgan fingerprints, radius 1: count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=1)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Morgan1 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Morgan1 (bits)")
# Morgan fingerprints, radius 2: count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=2)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Morgan2 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Morgan2 (bits)")
# Morgan fingerprints, radius 3: count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=3)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Morgan3 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Morgan3 (bits)")
# Feature-Morgan fingerprints, radius 0: count-based and bit-based variants.
# FeatMorgan requires the *feature* atom-invariant generator; the plain
# GetMorganAtomInvGen() yields connectivity invariants, which would make these
# results ordinary Morgan fingerprints under a FeatMorgan label.
fpg = rdFingerprintGenerator.GetMorganGenerator(
    radius=0,
    atomInvariantsGenerator=rdFingerprintGenerator.GetMorganFeatureAtomInvGen(),
)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "FeatMorgan0 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "FeatMorgan0 (bits)")
# Feature-Morgan fingerprints, radius 1: count-based and bit-based variants.
# FeatMorgan requires the *feature* atom-invariant generator; the plain
# GetMorganAtomInvGen() yields connectivity invariants, which would make these
# results ordinary Morgan fingerprints under a FeatMorgan label.
fpg = rdFingerprintGenerator.GetMorganGenerator(
    radius=1,
    atomInvariantsGenerator=rdFingerprintGenerator.GetMorganFeatureAtomInvGen(),
)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "FeatMorgan1 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "FeatMorgan1 (bits)")
# Feature-Morgan fingerprints, radius 2: count-based and bit-based variants.
# FeatMorgan requires the *feature* atom-invariant generator; the plain
# GetMorganAtomInvGen() yields connectivity invariants, which would make these
# results ordinary Morgan fingerprints under a FeatMorgan label.
fpg = rdFingerprintGenerator.GetMorganGenerator(
    radius=2,
    atomInvariantsGenerator=rdFingerprintGenerator.GetMorganFeatureAtomInvGen(),
)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "FeatMorgan2 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "FeatMorgan2 (bits)")
# Feature-Morgan fingerprints, radius 3: count-based and bit-based variants.
# FeatMorgan requires the *feature* atom-invariant generator; the plain
# GetMorganAtomInvGen() yields connectivity invariants, which would make these
# results ordinary Morgan fingerprints under a FeatMorgan label.
fpg = rdFingerprintGenerator.GetMorganGenerator(
    radius=3,
    atomInvariantsGenerator=rdFingerprintGenerator.GetMorganFeatureAtomInvGen(),
)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "FeatMorgan3 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "FeatMorgan3 (bits)")
# RDKit fingerprints with maxPath=5: count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "RDKit 5 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "RDKit 5 (bits)")
# RDKit fingerprints with maxPath=7: count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=7)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "RDKit 7 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "RDKit 7 (bits)")
# RDKit fingerprints with maxPath=5 and branched paths disabled ("linear"):
# count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5, branchedPaths=False)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "linear RDKit 5 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "linear RDKit 5 (bits)")
# RDKit fingerprints with maxPath=7 and branched paths disabled ("linear"):
# count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=7, branchedPaths=False)
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "linear RDKit 7 (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "linear RDKit 7 (bits)")
# Atom-pair fingerprints: count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetAtomPairGenerator()
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Atom Pairs (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Atom Pairs (bits)")
# Topological-torsion fingerprints: count-based and bit-based variants.
fpg = rdFingerprintGenerator.GetTopologicalTorsionGenerator()
mols1 = [entry[1] for entry in ms1]
mols2 = [entry[1] for entry in ms2]

# Sparse count fingerprints.
fps = fpg.GetSparseCountFingerprints(mols1, numThreads=8)
fp2s = fpg.GetSparseCountFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Topological Torsions (counts)")

# Bit-vector fingerprints.
fps = fpg.GetFingerprints(mols1, numThreads=8)
fp2s = fpg.GetFingerprints(mols2, numThreads=8)
accumResults(fps, fp2s, "Topological Torsions (bits)")
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D,Generate


def Gobbi2D_bits(mol, fpLen=2048):
    """Return a fixed-length bit vector built from the Gobbi 2D pharmacophore fingerprint.

    Args:
        mol: molecule passed to ``Generate.Gen2DFingerprint``.
        fpLen: length of the returned ``ExplicitBitVect`` (default 2048).

    Returns:
        A ``DataStructs.ExplicitBitVect`` of length ``fpLen`` in which each
        on-bit of the pharmacophore fingerprint has been hashed and then
        folded (modulo ``fpLen``) into the vector.
    """
    folded = DataStructs.ExplicitBitVect(fpLen)
    pharm_fp = Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory)
    for on_bit in pharm_fp.GetOnBits():
        # the bits are not hashed, so we need to do so before we fold them:
        folded.SetBit(hash((on_bit,)) % fpLen)
    return folded