Looking at the number of bits set by different fingerprints

fingerprints
reference
How many features do we find?
Published

July 6, 2021

This is an updated version of a post. The original version of the notebook can be found in github.

I’ve done a number of posts looking at Morgan fingerprint statistics before, including:

I have done similar analysis for other fingerprint types, but it looks like I didn’t post that (at least I can’t find it if I did). It’s useful to do this because, as we’ll see, the different fingerprint types have very different numbers of bits set for typical molecules.

Here’s the summary of the mean and standard deviation of the number of bits set, from an analysis of 5 million molecules with less than 50 heavy atoms extracted from ZINC:

Fingerprint Type Mean num_bits SD num_bits
Morgan1 sparse 29.4 5.6
Morgan2 sparse 48.7 9.6
Morgan3 sparse 66.8 13.8
FeatMorgan1 sparse 20.1 3.9
FeatMorgan2 sparse 38.1 7.7
FeatMorgan3 sparse 56.0 11.8
RDKit5 bitvect 363 122
RDKit6 bitvect 621 233
RDKit7 bitvect 993 406
pattern bitvect 446 122
avalon bitvect 280 130
atom pairs sparse 167 56
TT sparse 33.4 9.8
atom pairs bitvect 267 90
TT bitvect 47.2 12.0

The bit vector fingerprints were all 4096 bits long.

from rdkit import Chem,DataStructs
import time,random,gzip,pickle,copy
import numpy as np
from collections import Counter,defaultdict
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
from rdkit import rdBase
%pylab inline

print(rdBase.rdkitVersion)
import time
print(time.asctime())
Populating the interactive namespace from numpy and matplotlib
2021.09.1pre
Tue Jul  6 04:58:28 2021
/home/glandrum/miniconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['copy', 'random']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
try:
    import ipyparallel as ipp
    rc = ipp.Client()
    dview = rc[:]
    dview.execute('from rdkit import Chem')
    dview.execute('from rdkit import Descriptors')
    dview.execute('from rdkit.Chem import rdMolDescriptors')
    dview.execute('from rdkit.Avalon import pyAvalonTools')
except:
    print("could not use ipyparallel")
    dview = None

For test data I’ll use the same 16 million ZINC compounds I used in the bit statistics post.

filen='/scratch/RDKit_git/LocalData/Zinc/zinc_all_clean.pkl.gz'

Loop over the molecules, skip anything with more than 50 atoms, and build fingerprints for all the others.

The fingerprints I generate for this analysis are: - Sparse Morgan with radii 1, 2, and 3 - Sparse FeatureMorgan with radii 1, 2, and 3 - RDKit BitVect with maxPath 5, 6, and 7 - Pattern BitVect - Avalon BitVect - Sparse Atom Pairs - Sparse Topological Torsions - Atom Pair BitVect - Topological Torsion BitVect

All of the BitVect fingerprints are 4096 bits long

import copy

historyf = gzip.open('../data/fp_bit_counts.history.pkl.gz','wb+')

counts=defaultdict(Counter)
t1 = time.time()
with gzip.open(filen,'rb') as inf:
    i = 0
    ms = []
    while 1:
        try:
            m,nm = pickle.load(inf)
        except EOFError:
            break
        if not m or m.GetNumHeavyAtoms()>50: continue
        ms.append(m)
        i+=1
        if len(ms)>=10000:
            for v in 1,2,3:
                cnts = dview.map_sync(lambda x,v=v:len(rdMolDescriptors.GetMorganFingerprint(x,v).GetNonzeroElements()),
                                     ms)
                for obc in cnts:
                    counts[('Morgan',v)][obc]+=1
            for v in 1,2,3:
                cnts = dview.map_sync(lambda x,v=v:len(rdMolDescriptors.GetMorganFingerprint(x,v,useFeatures=True).GetNonzeroElements()),
                                     ms)
                for obc in cnts:
                    counts[('FeatMorgan',v)][obc]+=1
            for v in 5,6,7:
                cnts = dview.map_sync(lambda x,v=v:Chem.RDKFingerprint(x,maxPath=v,fpSize=4096).GetNumOnBits(),
                                      ms)
                for obc in cnts:
                    counts[('RDKit',v)][obc]+=1
            cnts = dview.map_sync(lambda x:Chem.PatternFingerprint(x,fpSize=4096).GetNumOnBits(),
                                  ms)
            for obc in cnts:
                counts[('pattern',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:pyAvalonTools.GetAvalonFP(x,nBits=4096).GetNumOnBits(),
                                  ms)
            for obc in cnts:
                counts[('avalon',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:len(rdMolDescriptors.GetAtomPairFingerprint(x).GetNonzeroElements()),
                                  ms)
            for obc in cnts:
                counts[('ap-counts',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:len(rdMolDescriptors.GetTopologicalTorsionFingerprint(x).GetNonzeroElements()),
                                  ms)
            for obc in cnts:
                counts[('tt-counts',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x,nBits=4096).GetNumOnBits(),
                                  ms)
            for obc in cnts:
                counts[('ap-bv',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(x,nBits=4096).GetNumOnBits(),
                                  ms)
            for obc in cnts:
                counts[('tt-bv',-1)][obc]+=1
            ms = []
            
        if not i%50000:
            t2 = time.time()
            print("Done %d in %.2f sec"%(i,t2-t1))
        if not i%500000:
            pickle.dump(dict(counts),historyf)
        if i>=5000000:
            break
Done 50000 in 38.63 sec
Done 100000 in 77.02 sec
Done 150000 in 115.17 sec
Done 200000 in 163.61 sec
Done 250000 in 215.39 sec
Done 300000 in 267.96 sec
Done 350000 in 319.74 sec
Done 400000 in 373.11 sec
Done 450000 in 415.37 sec
Done 500000 in 468.50 sec
Done 550000 in 526.23 sec
Done 600000 in 570.65 sec
Done 650000 in 622.83 sec
Done 700000 in 674.11 sec
Done 750000 in 724.71 sec
Done 800000 in 775.76 sec
Done 850000 in 823.44 sec
Done 900000 in 873.37 sec
Done 950000 in 922.91 sec
Done 1000000 in 971.03 sec
Done 1050000 in 1019.84 sec
Done 1100000 in 1068.24 sec
Done 1150000 in 1116.11 sec
Done 1200000 in 1164.39 sec
Done 1250000 in 1211.31 sec
Done 1300000 in 1255.67 sec
Done 1350000 in 1306.25 sec
Done 1400000 in 1356.04 sec
Done 1450000 in 1402.95 sec
Done 1500000 in 1453.38 sec
Done 1550000 in 1500.31 sec
Done 1600000 in 1546.90 sec
Done 1650000 in 1593.48 sec
Done 1700000 in 1640.38 sec
Done 1750000 in 1696.32 sec
Done 1800000 in 1750.83 sec
Done 1850000 in 1810.42 sec
Done 1900000 in 1868.12 sec
Done 1950000 in 1926.07 sec
Done 2000000 in 1983.37 sec
Done 2050000 in 2043.56 sec
Done 2100000 in 2102.81 sec
Done 2150000 in 2160.67 sec
Done 2200000 in 2218.30 sec
Done 2250000 in 2272.73 sec
Done 2300000 in 2323.77 sec
Done 2350000 in 2375.39 sec
Done 2400000 in 2427.04 sec
Done 2450000 in 2481.36 sec
Done 2500000 in 2536.57 sec
Done 2550000 in 2591.71 sec
Done 2600000 in 2644.06 sec
Done 2650000 in 2698.32 sec
Done 2700000 in 2752.86 sec
Done 2750000 in 2805.41 sec
Done 2800000 in 2856.95 sec
Done 2850000 in 2909.60 sec
Done 2900000 in 2965.05 sec
Done 2950000 in 3021.72 sec
Done 3000000 in 3073.35 sec
Done 3050000 in 3127.90 sec
Done 3100000 in 3177.67 sec
Done 3150000 in 3234.92 sec
Done 3200000 in 3288.20 sec
Done 3250000 in 3341.28 sec
Done 3300000 in 3393.97 sec
Done 3350000 in 3446.92 sec
Done 3400000 in 3499.45 sec
Done 3450000 in 3549.88 sec
Done 3500000 in 3601.67 sec
Done 3550000 in 3653.41 sec
Done 3600000 in 3705.95 sec
Done 3650000 in 3759.37 sec
Done 3700000 in 3810.11 sec
Done 3750000 in 3861.68 sec
Done 3800000 in 3912.28 sec
Done 3850000 in 3965.28 sec
Done 3900000 in 4022.67 sec
Done 3950000 in 4077.32 sec
Done 4000000 in 4129.91 sec
Done 4050000 in 4185.33 sec
Done 4100000 in 4240.67 sec
Done 4150000 in 4287.86 sec
Done 4200000 in 4340.04 sec
Done 4250000 in 4391.57 sec
Done 4300000 in 4443.67 sec
Done 4350000 in 4493.96 sec
Done 4400000 in 4545.53 sec
Done 4450000 in 4592.16 sec
Done 4500000 in 4640.05 sec
Done 4550000 in 4687.30 sec
Done 4600000 in 4733.79 sec
Done 4650000 in 4780.85 sec
Done 4700000 in 4828.29 sec
Done 4750000 in 4878.40 sec
Done 4800000 in 4927.55 sec
Done 4850000 in 4984.36 sec
Done 4900000 in 5042.20 sec
Done 4950000 in 5101.82 sec
Done 5000000 in 5154.32 sec
pickle.dump(dict(counts),gzip.open('../data/fp_bit_counts.pkl.gz','wb+'))

Now plot the distributions of the number of bits set

morgan_ks = [x for x in counts.keys() if x[0] =='Morgan']
featmorgan_ks = [x for x in counts.keys() if x[0] =='FeatMorgan']
rdkit_ks = [x for x in counts.keys() if x[0] == 'RDKit']

figure(figsize=(15,15))

pidx=1
subplot(3,3,pidx)
for n,r in morgan_ks:
    cnts = sorted(counts[(n,r)].items())
    
    plot([x for x,y in cnts],[y for x,y in cnts],label=
        f"r={r}")
_=title("Morgan")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()

pidx=2
subplot(3,3,pidx)
for n,r in featmorgan_ks:
    cnts = sorted(counts[(n,r)].items())
    
    plot([x for x,y in cnts],[y for x,y in cnts],label=
        f"r={r}")
_=title("FeatMorgan")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()

pidx=3
subplot(3,3,pidx)
for n,r in rdkit_ks:
    cnts = sorted(counts[(n,r)].items())
    
    plot([x for x,y in cnts],[y for x,y in cnts],label=
        f"r={r}")
_=title("RDKit")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()

for k in counts.keys():
    if k[0] in ('Morgan','FeatMorgan','RDKit'):
        continue
    pidx+=1
    subplot(3,3,pidx)
    cnts = sorted(counts[k].items())
    plot([x for x,y in cnts],[y for x,y in cnts])
    _=title(k[0])
    _=xlabel("num bits set")
    _=ylabel("count")
   

The avalon FP curve has an interesting shape

for k,cnts in counts.items():
    accum = 0
    denom = 0
    for cnt,num in cnts.items():
        accum += cnt*num
        denom += num
    mean = accum/denom
    dev = 0
    for cnt,num in cnts.items():
        dev += num*(cnt-mean)**2
    dev /= (denom-1)
    dev = dev**0.5
    label = k[0]
    if k[1]!=-1:
        label += str(k[1])
        
    print(label,'\t%.1f'%mean,'%.1f'%dev)
Morgan1     29.4 5.6
Morgan2     48.7 9.6
Morgan3     66.8 13.8
FeatMorgan1     20.1 3.9
FeatMorgan2     38.1 7.7
FeatMorgan3     56.0 11.8
RDKit5  363.3 122.5
RDKit6  621.7 233.2
RDKit7  993.6 406.3
pattern     445.5 122.5
avalon  279.8 129.9
ap-counts   166.6 56.3
tt-counts   33.4 9.8
ap-bv   267.3 90.0
tt-bv   47.2 12.0

Convergence

I did 5 million examples, which took a while (about 1.5 hours with 6 worker processes on my PC). Could I have analyzed less and gotten to the same results? Did the means converge? If so, how quickly?

historyf = gzip.open('../data/fp_bit_counts.history.pkl.gz','rb')
means = defaultdict(list)
devs = defaultdict(list)
nmols = []
while 1:
    try:
        lcounts = pickle.load(historyf)
    except EOFError:
        break
    for k,cnts in lcounts.items():
        accum = 0
        denom = 0
        for cnt,num in cnts.items():
            accum += cnt*num
            denom += num
        mean = accum/denom
        dev = 0
        for cnt,num in cnts.items():
            dev += num*(cnt-mean)**2
        dev /= (denom-1)
        dev = dev**0.5
        
        if denom not in nmols:
            nmols.append(denom)
        means[k].append(mean)
        devs[k].append(dev)
        label = k[0]
        if k[1]!=-1:
            label += str(k[1])

        print(denom,label,'\t%.1f'%mean,'%.1f'%dev)    
500000 Morgan1  26.0 6.2
500000 Morgan2  42.8 10.7
500000 Morgan3  58.7 15.5
500000 FeatMorgan1  18.2 4.3
500000 FeatMorgan2  33.8 8.5
500000 FeatMorgan3  49.5 13.2
500000 RDKit5   324.6 133.9
500000 RDKit6   560.8 256.2
500000 RDKit7   902.9 445.7
500000 pattern  408.8 133.9
500000 avalon   241.8 133.8
500000 ap-counts    133.3 57.6
500000 tt-counts    28.6 10.2
500000 ap-bv    219.5 93.6
500000 tt-bv    41.9 12.9
1000000 Morgan1     27.1 6.1
1000000 Morgan2     44.6 10.5
1000000 Morgan3     61.2 15.2
1000000 FeatMorgan1     18.9 4.2
1000000 FeatMorgan2     35.2 8.4
1000000 FeatMorgan3     51.6 13.0
1000000 RDKit5  340.7 133.9
1000000 RDKit6  588.9 257.4
1000000 RDKit7  948.5 449.9
1000000 pattern     425.2 136.0
1000000 avalon  257.7 136.7
1000000 ap-counts   143.7 57.7
1000000 tt-counts   30.1 10.1
1000000 ap-bv   234.4 92.8
1000000 tt-bv   43.6 12.9
1500000 Morgan1     27.3 5.8
1500000 Morgan2     45.0 9.9
1500000 Morgan3     61.7 14.3
1500000 FeatMorgan1     19.0 4.1
1500000 FeatMorgan2     35.5 8.0
1500000 FeatMorgan3     52.0 12.3
1500000 RDKit5  340.3 127.8
1500000 RDKit6  587.1 246.2
1500000 RDKit7  944.8 432.0
1500000 pattern     424.0 129.4
1500000 avalon  260.5 133.7
1500000 ap-counts   145.1 54.8
1500000 tt-counts   30.5 9.8
1500000 ap-bv   234.9 87.3
1500000 tt-bv   43.7 12.3
2000000 Morgan1     28.0 5.7
2000000 Morgan2     46.2 9.8
2000000 Morgan3     63.4 14.1
2000000 FeatMorgan1     19.4 4.0
2000000 FeatMorgan2     36.3 7.9
2000000 FeatMorgan3     53.3 12.1
2000000 RDKit5  350.7 126.6
2000000 RDKit6  603.5 243.1
2000000 RDKit7  969.0 425.8
2000000 pattern     433.3 128.0
2000000 avalon  269.5 133.1
2000000 ap-counts   152.4 55.5
2000000 tt-counts   31.5 9.8
2000000 ap-bv   245.8 88.2
2000000 tt-bv   45.0 12.2
2500000 Morgan1     28.7 5.8
2500000 Morgan2     47.5 9.8
2500000 Morgan3     65.3 14.2
2500000 FeatMorgan1     19.7 4.0
2500000 FeatMorgan2     37.2 7.9
2500000 FeatMorgan3     54.7 12.1
2500000 RDKit5  361.5 126.3
2500000 RDKit6  621.2 241.1
2500000 RDKit7  996.0 420.5
2500000 pattern     443.2 126.4
2500000 avalon  278.4 132.6
2500000 ap-counts   160.1 56.9
2500000 tt-counts   32.6 9.9
2500000 ap-bv   257.9 90.2
2500000 tt-bv   46.3 12.2
3000000 Morgan1     29.1 5.7
3000000 Morgan2     48.1 9.8
3000000 Morgan3     66.1 14.1
3000000 FeatMorgan1     19.9 3.9
3000000 FeatMorgan2     37.6 7.8
3000000 FeatMorgan3     55.3 12.0
3000000 RDKit5  364.5 124.5
3000000 RDKit6  625.3 237.2
3000000 RDKit7  1001.4 413.2
3000000 pattern     446.5 124.1
3000000 avalon  280.5 131.5
3000000 ap-counts   163.7 57.0
3000000 tt-counts   33.1 9.8
3000000 ap-bv   263.5 90.5
3000000 tt-bv   46.9 12.1
3500000 Morgan1     29.2 5.7
3500000 Morgan2     48.3 9.7
3500000 Morgan3     66.4 14.0
3500000 FeatMorgan1     19.9 3.9
3500000 FeatMorgan2     37.7 7.8
3500000 FeatMorgan3     55.6 11.9
3500000 RDKit5  365.3 123.8
3500000 RDKit6  626.7 236.0
3500000 RDKit7  1003.7 411.3
3500000 pattern     448.4 123.3
3500000 avalon  280.3 131.1
3500000 ap-counts   165.1 56.7
3500000 tt-counts   33.3 9.8
3500000 ap-bv   265.9 90.1
3500000 tt-bv   47.2 12.1
4000000 Morgan1     29.4 5.7
4000000 Morgan2     48.6 9.8
4000000 Morgan3     66.7 14.1
4000000 FeatMorgan1     20.0 3.9
4000000 FeatMorgan2     38.0 7.8
4000000 FeatMorgan3     55.9 12.0
4000000 RDKit5  365.2 124.1
4000000 RDKit6  627.1 236.6
4000000 RDKit7  1005.0 412.4
4000000 pattern     448.6 124.0
4000000 avalon  281.4 131.3
4000000 ap-counts   165.7 56.9
4000000 tt-counts   33.4 9.9
4000000 ap-bv   266.8 90.6
4000000 tt-bv   47.3 12.2
4500000 Morgan1     29.4 5.6
4500000 Morgan2     48.7 9.6
4500000 Morgan3     66.8 13.9
4500000 FeatMorgan1     20.1 3.9
4500000 FeatMorgan2     38.0 7.7
4500000 FeatMorgan3     55.9 11.8
4500000 RDKit5  364.3 123.1
4500000 RDKit6  624.4 234.6
4500000 RDKit7  999.1 408.8
4500000 pattern     447.0 122.7
4500000 avalon  280.7 130.6
4500000 ap-counts   166.3 56.4
4500000 tt-counts   33.4 9.8
4500000 ap-bv   267.3 89.9
4500000 tt-bv   47.3 12.1
5000000 Morgan1     29.4 5.6
5000000 Morgan2     48.7 9.6
5000000 Morgan3     66.8 13.8
5000000 FeatMorgan1     20.1 3.9
5000000 FeatMorgan2     38.1 7.7
5000000 FeatMorgan3     56.0 11.8
5000000 RDKit5  363.3 122.5
5000000 RDKit6  621.7 233.2
5000000 RDKit7  993.6 406.3
5000000 pattern     445.5 122.5
5000000 avalon  279.8 129.9
5000000 ap-counts   166.6 56.3
5000000 tt-counts   33.4 9.8
5000000 ap-bv   267.3 90.0
5000000 tt-bv   47.2 12.0

Let’s look at those graphically:

morgan_ks = [x for x in counts.keys() if x[0] =='Morgan']
featmorgan_ks = [x for x in counts.keys() if x[0] =='FeatMorgan']
rdkit_ks = [x for x in counts.keys() if x[0] == 'RDKit']

figure(figsize=(15,15))

nmols2 = [x/1000000 for x in nmols]

pidx=1
subplot(3,3,pidx)
for n,r in morgan_ks:
    lmeans = means[(n,r)]
    ldevs = devs[(n,r)]
    errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
    
_=title("Morgan")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()

pidx=2
subplot(3,3,pidx)
for n,r in featmorgan_ks:
    lmeans = means[(n,r)]
    ldevs = devs[(n,r)]
    errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("FeatMorgan")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()

pidx=3
subplot(3,3,pidx)
for n,r in rdkit_ks:
    lmeans = means[(n,r)]
    ldevs = devs[(n,r)]
    errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("RDKit")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()

for k in counts.keys():
    if k[0] in ('Morgan','FeatMorgan','RDKit'):
        continue
    pidx+=1
    subplot(3,3,pidx)
    lmeans = means[k]
    ldevs = devs[k]
    errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
    _=title(k[0])
    _=xlabel("num mols (millions)")
    _=ylabel("count")
   

Looks like we would have been fine with 3 million molecules.