This is an updated version of a post. The original version of the notebook can be found in github.

I've done a number of posts looking at Morgan fingerprint statistics before, including:

I have done similar analysis for other fingerprint types, but it looks like I didn't post that (at least I can't find it if I did). It's useful to do this because, as we'll see, the different fingerprint types have very different numbers of bits set for typical molecules.

Here's the summary of the mean and standard deviation of the number of bits set, from an analysis of 5 million molecules with less than 50 heavy atoms extracted from ZINC:

Fingerprint Type Mean num_bits SD num_bits
Morgan1 sparse 29.4 5.6
Morgan2 sparse 48.7 9.6
Morgan3 sparse 66.8 13.8
FeatMorgan1 sparse 20.1 3.9
FeatMorgan2 sparse 38.1 7.7
FeatMorgan3 sparse 56.0 11.8
RDKit5 bitvect 363 122
RDKit6 bitvect 621 233
RDKit7 bitvect 993 406
pattern bitvect 446 122
avalon bitvect 280 130
atom pairs sparse 167 56
TT sparse 33.4 9.8
atom pairs bitvect 267 90
TT bitvect 47.2 12.0

The bit vector fingerprints were all 4096 bits long.

from rdkit import Chem,DataStructs
import time,random,gzip,pickle,copy
import numpy as np
from collections import Counter,defaultdict
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
from rdkit import rdBase
%pylab inline

print(rdBase.rdkitVersion)
import time
print(time.asctime())
Populating the interactive namespace from numpy and matplotlib
2021.09.1pre
Tue Jul  6 04:58:28 2021
/home/glandrum/miniconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['copy', 'random']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
try:
    import ipyparallel as ipp
    rc = ipp.Client()
    dview = rc[:]
    dview.execute('from rdkit import Chem')
    dview.execute('from rdkit import Descriptors')
    dview.execute('from rdkit.Chem import rdMolDescriptors')
    dview.execute('from rdkit.Avalon import pyAvalonTools')
except:
    print("could not use ipyparallel")
    dview = None

For test data I'll use the same 16 million ZINC compounds I used in the bit statistics post.

filen='/scratch/RDKit_git/LocalData/Zinc/zinc_all_clean.pkl.gz'

Loop over the molecules, skip anything with more than 50 atoms, and build fingerprints for all the others.

The fingerprints I generate for this analysis are:

  • Sparse Morgan with radii 1, 2, and 3
  • Sparse FeatureMorgan with radii 1, 2, and 3
  • RDKit BitVect with maxPath 5, 6, and 7
  • Pattern BitVect
  • Avalon BitVect
  • Sparse Atom Pairs
  • Sparse Topological Torsions
  • Atom Pair BitVect
  • Topological Torsion BitVect

All of the BitVect fingerprints are 4096 bits long

import copy

historyf = gzip.open('../data/fp_bit_counts.history.pkl.gz','wb+')

counts=defaultdict(Counter)
t1 = time.time()
with gzip.open(filen,'rb') as inf:
    i = 0
    ms = []
    while 1:
        try:
            m,nm = pickle.load(inf)
        except EOFError:
            break
        if not m or m.GetNumHeavyAtoms()>50: continue
        ms.append(m)
        i+=1
        if len(ms)>=10000:
            for v in 1,2,3:
                cnts = dview.map_sync(lambda x,v=v:len(rdMolDescriptors.GetMorganFingerprint(x,v).GetNonzeroElements()),
                                     ms)
                for obc in cnts:
                    counts[('Morgan',v)][obc]+=1
            for v in 1,2,3:
                cnts = dview.map_sync(lambda x,v=v:len(rdMolDescriptors.GetMorganFingerprint(x,v,useFeatures=True).GetNonzeroElements()),
                                     ms)
                for obc in cnts:
                    counts[('FeatMorgan',v)][obc]+=1
            for v in 5,6,7:
                cnts = dview.map_sync(lambda x,v=v:Chem.RDKFingerprint(x,maxPath=v,fpSize=4096).GetNumOnBits(),
                                      ms)
                for obc in cnts:
                    counts[('RDKit',v)][obc]+=1
            cnts = dview.map_sync(lambda x:Chem.PatternFingerprint(x,fpSize=4096).GetNumOnBits(),
                                  ms)
            for obc in cnts:
                counts[('pattern',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:pyAvalonTools.GetAvalonFP(x,nBits=4096).GetNumOnBits(),
                                  ms)
            for obc in cnts:
                counts[('avalon',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:len(rdMolDescriptors.GetAtomPairFingerprint(x).GetNonzeroElements()),
                                  ms)
            for obc in cnts:
                counts[('ap-counts',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:len(rdMolDescriptors.GetTopologicalTorsionFingerprint(x).GetNonzeroElements()),
                                  ms)
            for obc in cnts:
                counts[('tt-counts',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x,nBits=4096).GetNumOnBits(),
                                  ms)
            for obc in cnts:
                counts[('ap-bv',-1)][obc]+=1
            cnts = dview.map_sync(lambda x:rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(x,nBits=4096).GetNumOnBits(),
                                  ms)
            for obc in cnts:
                counts[('tt-bv',-1)][obc]+=1
            ms = []
            
        if not i%50000:
            t2 = time.time()
            print("Done %d in %.2f sec"%(i,t2-t1))
        if not i%500000:
            pickle.dump(dict(counts),historyf)
        if i>=5000000:
            break
Done 50000 in 38.63 sec
Done 100000 in 77.02 sec
Done 150000 in 115.17 sec
Done 200000 in 163.61 sec
Done 250000 in 215.39 sec
Done 300000 in 267.96 sec
Done 350000 in 319.74 sec
Done 400000 in 373.11 sec
Done 450000 in 415.37 sec
Done 500000 in 468.50 sec
Done 550000 in 526.23 sec
Done 600000 in 570.65 sec
Done 650000 in 622.83 sec
Done 700000 in 674.11 sec
Done 750000 in 724.71 sec
Done 800000 in 775.76 sec
Done 850000 in 823.44 sec
Done 900000 in 873.37 sec
Done 950000 in 922.91 sec
Done 1000000 in 971.03 sec
Done 1050000 in 1019.84 sec
Done 1100000 in 1068.24 sec
Done 1150000 in 1116.11 sec
Done 1200000 in 1164.39 sec
Done 1250000 in 1211.31 sec
Done 1300000 in 1255.67 sec
Done 1350000 in 1306.25 sec
Done 1400000 in 1356.04 sec
Done 1450000 in 1402.95 sec
Done 1500000 in 1453.38 sec
Done 1550000 in 1500.31 sec
Done 1600000 in 1546.90 sec
Done 1650000 in 1593.48 sec
Done 1700000 in 1640.38 sec
Done 1750000 in 1696.32 sec
Done 1800000 in 1750.83 sec
Done 1850000 in 1810.42 sec
Done 1900000 in 1868.12 sec
Done 1950000 in 1926.07 sec
Done 2000000 in 1983.37 sec
Done 2050000 in 2043.56 sec
Done 2100000 in 2102.81 sec
Done 2150000 in 2160.67 sec
Done 2200000 in 2218.30 sec
Done 2250000 in 2272.73 sec
Done 2300000 in 2323.77 sec
Done 2350000 in 2375.39 sec
Done 2400000 in 2427.04 sec
Done 2450000 in 2481.36 sec
Done 2500000 in 2536.57 sec
Done 2550000 in 2591.71 sec
Done 2600000 in 2644.06 sec
Done 2650000 in 2698.32 sec
Done 2700000 in 2752.86 sec
Done 2750000 in 2805.41 sec
Done 2800000 in 2856.95 sec
Done 2850000 in 2909.60 sec
Done 2900000 in 2965.05 sec
Done 2950000 in 3021.72 sec
Done 3000000 in 3073.35 sec
Done 3050000 in 3127.90 sec
Done 3100000 in 3177.67 sec
Done 3150000 in 3234.92 sec
Done 3200000 in 3288.20 sec
Done 3250000 in 3341.28 sec
Done 3300000 in 3393.97 sec
Done 3350000 in 3446.92 sec
Done 3400000 in 3499.45 sec
Done 3450000 in 3549.88 sec
Done 3500000 in 3601.67 sec
Done 3550000 in 3653.41 sec
Done 3600000 in 3705.95 sec
Done 3650000 in 3759.37 sec
Done 3700000 in 3810.11 sec
Done 3750000 in 3861.68 sec
Done 3800000 in 3912.28 sec
Done 3850000 in 3965.28 sec
Done 3900000 in 4022.67 sec
Done 3950000 in 4077.32 sec
Done 4000000 in 4129.91 sec
Done 4050000 in 4185.33 sec
Done 4100000 in 4240.67 sec
Done 4150000 in 4287.86 sec
Done 4200000 in 4340.04 sec
Done 4250000 in 4391.57 sec
Done 4300000 in 4443.67 sec
Done 4350000 in 4493.96 sec
Done 4400000 in 4545.53 sec
Done 4450000 in 4592.16 sec
Done 4500000 in 4640.05 sec
Done 4550000 in 4687.30 sec
Done 4600000 in 4733.79 sec
Done 4650000 in 4780.85 sec
Done 4700000 in 4828.29 sec
Done 4750000 in 4878.40 sec
Done 4800000 in 4927.55 sec
Done 4850000 in 4984.36 sec
Done 4900000 in 5042.20 sec
Done 4950000 in 5101.82 sec
Done 5000000 in 5154.32 sec
pickle.dump(dict(counts),gzip.open('../data/fp_bit_counts.pkl.gz','wb+'))

Now plot the distributions of the number of bits set

morgan_ks = [x for x in counts.keys() if x[0] =='Morgan']
featmorgan_ks = [x for x in counts.keys() if x[0] =='FeatMorgan']
rdkit_ks = [x for x in counts.keys() if x[0] == 'RDKit']

figure(figsize=(15,15))

pidx=1
subplot(3,3,pidx)
for n,r in morgan_ks:
    cnts = sorted(counts[(n,r)].items())
    
    plot([x for x,y in cnts],[y for x,y in cnts],label=
        f"r={r}")
_=title("Morgan")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()

pidx=2
subplot(3,3,pidx)
for n,r in featmorgan_ks:
    cnts = sorted(counts[(n,r)].items())
    
    plot([x for x,y in cnts],[y for x,y in cnts],label=
        f"r={r}")
_=title("FeatMorgan")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()

pidx=3
subplot(3,3,pidx)
for n,r in rdkit_ks:
    cnts = sorted(counts[(n,r)].items())
    
    plot([x for x,y in cnts],[y for x,y in cnts],label=
        f"r={r}")
_=title("RDKit")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()

for k in counts.keys():
    if k[0] in ('Morgan','FeatMorgan','RDKit'):
        continue
    pidx+=1
    subplot(3,3,pidx)
    cnts = sorted(counts[k].items())
    plot([x for x,y in cnts],[y for x,y in cnts])
    _=title(k[0])
    _=xlabel("num bits set")
    _=ylabel("count")
   

The avalon FP curve has an interesting shape

for k,cnts in counts.items():
    accum = 0
    denom = 0
    for cnt,num in cnts.items():
        accum += cnt*num
        denom += num
    mean = accum/denom
    dev = 0
    for cnt,num in cnts.items():
        dev += num*(cnt-mean)**2
    dev /= (denom-1)
    dev = dev**0.5
    label = k[0]
    if k[1]!=-1:
        label += str(k[1])
        
    print(label,'\t%.1f'%mean,'%.1f'%dev)
Morgan1 	29.4 5.6
Morgan2 	48.7 9.6
Morgan3 	66.8 13.8
FeatMorgan1 	20.1 3.9
FeatMorgan2 	38.1 7.7
FeatMorgan3 	56.0 11.8
RDKit5 	363.3 122.5
RDKit6 	621.7 233.2
RDKit7 	993.6 406.3
pattern 	445.5 122.5
avalon 	279.8 129.9
ap-counts 	166.6 56.3
tt-counts 	33.4 9.8
ap-bv 	267.3 90.0
tt-bv 	47.2 12.0

Convergence

I did 5 million examples, which took a while (about 1.5 hours with 6 worker processes on my PC). Could I have analyzed less and gotten to the same results? Did the means converge? If so, how quickly?

historyf = gzip.open('../data/fp_bit_counts.history.pkl.gz','rb')
means = defaultdict(list)
devs = defaultdict(list)
nmols = []
while 1:
    try:
        lcounts = pickle.load(historyf)
    except EOFError:
        break
    for k,cnts in lcounts.items():
        accum = 0
        denom = 0
        for cnt,num in cnts.items():
            accum += cnt*num
            denom += num
        mean = accum/denom
        dev = 0
        for cnt,num in cnts.items():
            dev += num*(cnt-mean)**2
        dev /= (denom-1)
        dev = dev**0.5
        
        if denom not in nmols:
            nmols.append(denom)
        means[k].append(mean)
        devs[k].append(dev)
        label = k[0]
        if k[1]!=-1:
            label += str(k[1])

        print(denom,label,'\t%.1f'%mean,'%.1f'%dev)    
500000 Morgan1 	26.0 6.2
500000 Morgan2 	42.8 10.7
500000 Morgan3 	58.7 15.5
500000 FeatMorgan1 	18.2 4.3
500000 FeatMorgan2 	33.8 8.5
500000 FeatMorgan3 	49.5 13.2
500000 RDKit5 	324.6 133.9
500000 RDKit6 	560.8 256.2
500000 RDKit7 	902.9 445.7
500000 pattern 	408.8 133.9
500000 avalon 	241.8 133.8
500000 ap-counts 	133.3 57.6
500000 tt-counts 	28.6 10.2
500000 ap-bv 	219.5 93.6
500000 tt-bv 	41.9 12.9
1000000 Morgan1 	27.1 6.1
1000000 Morgan2 	44.6 10.5
1000000 Morgan3 	61.2 15.2
1000000 FeatMorgan1 	18.9 4.2
1000000 FeatMorgan2 	35.2 8.4
1000000 FeatMorgan3 	51.6 13.0
1000000 RDKit5 	340.7 133.9
1000000 RDKit6 	588.9 257.4
1000000 RDKit7 	948.5 449.9
1000000 pattern 	425.2 136.0
1000000 avalon 	257.7 136.7
1000000 ap-counts 	143.7 57.7
1000000 tt-counts 	30.1 10.1
1000000 ap-bv 	234.4 92.8
1000000 tt-bv 	43.6 12.9
1500000 Morgan1 	27.3 5.8
1500000 Morgan2 	45.0 9.9
1500000 Morgan3 	61.7 14.3
1500000 FeatMorgan1 	19.0 4.1
1500000 FeatMorgan2 	35.5 8.0
1500000 FeatMorgan3 	52.0 12.3
1500000 RDKit5 	340.3 127.8
1500000 RDKit6 	587.1 246.2
1500000 RDKit7 	944.8 432.0
1500000 pattern 	424.0 129.4
1500000 avalon 	260.5 133.7
1500000 ap-counts 	145.1 54.8
1500000 tt-counts 	30.5 9.8
1500000 ap-bv 	234.9 87.3
1500000 tt-bv 	43.7 12.3
2000000 Morgan1 	28.0 5.7
2000000 Morgan2 	46.2 9.8
2000000 Morgan3 	63.4 14.1
2000000 FeatMorgan1 	19.4 4.0
2000000 FeatMorgan2 	36.3 7.9
2000000 FeatMorgan3 	53.3 12.1
2000000 RDKit5 	350.7 126.6
2000000 RDKit6 	603.5 243.1
2000000 RDKit7 	969.0 425.8
2000000 pattern 	433.3 128.0
2000000 avalon 	269.5 133.1
2000000 ap-counts 	152.4 55.5
2000000 tt-counts 	31.5 9.8
2000000 ap-bv 	245.8 88.2
2000000 tt-bv 	45.0 12.2
2500000 Morgan1 	28.7 5.8
2500000 Morgan2 	47.5 9.8
2500000 Morgan3 	65.3 14.2
2500000 FeatMorgan1 	19.7 4.0
2500000 FeatMorgan2 	37.2 7.9
2500000 FeatMorgan3 	54.7 12.1
2500000 RDKit5 	361.5 126.3
2500000 RDKit6 	621.2 241.1
2500000 RDKit7 	996.0 420.5
2500000 pattern 	443.2 126.4
2500000 avalon 	278.4 132.6
2500000 ap-counts 	160.1 56.9
2500000 tt-counts 	32.6 9.9
2500000 ap-bv 	257.9 90.2
2500000 tt-bv 	46.3 12.2
3000000 Morgan1 	29.1 5.7
3000000 Morgan2 	48.1 9.8
3000000 Morgan3 	66.1 14.1
3000000 FeatMorgan1 	19.9 3.9
3000000 FeatMorgan2 	37.6 7.8
3000000 FeatMorgan3 	55.3 12.0
3000000 RDKit5 	364.5 124.5
3000000 RDKit6 	625.3 237.2
3000000 RDKit7 	1001.4 413.2
3000000 pattern 	446.5 124.1
3000000 avalon 	280.5 131.5
3000000 ap-counts 	163.7 57.0
3000000 tt-counts 	33.1 9.8
3000000 ap-bv 	263.5 90.5
3000000 tt-bv 	46.9 12.1
3500000 Morgan1 	29.2 5.7
3500000 Morgan2 	48.3 9.7
3500000 Morgan3 	66.4 14.0
3500000 FeatMorgan1 	19.9 3.9
3500000 FeatMorgan2 	37.7 7.8
3500000 FeatMorgan3 	55.6 11.9
3500000 RDKit5 	365.3 123.8
3500000 RDKit6 	626.7 236.0
3500000 RDKit7 	1003.7 411.3
3500000 pattern 	448.4 123.3
3500000 avalon 	280.3 131.1
3500000 ap-counts 	165.1 56.7
3500000 tt-counts 	33.3 9.8
3500000 ap-bv 	265.9 90.1
3500000 tt-bv 	47.2 12.1
4000000 Morgan1 	29.4 5.7
4000000 Morgan2 	48.6 9.8
4000000 Morgan3 	66.7 14.1
4000000 FeatMorgan1 	20.0 3.9
4000000 FeatMorgan2 	38.0 7.8
4000000 FeatMorgan3 	55.9 12.0
4000000 RDKit5 	365.2 124.1
4000000 RDKit6 	627.1 236.6
4000000 RDKit7 	1005.0 412.4
4000000 pattern 	448.6 124.0
4000000 avalon 	281.4 131.3
4000000 ap-counts 	165.7 56.9
4000000 tt-counts 	33.4 9.9
4000000 ap-bv 	266.8 90.6
4000000 tt-bv 	47.3 12.2
4500000 Morgan1 	29.4 5.6
4500000 Morgan2 	48.7 9.6
4500000 Morgan3 	66.8 13.9
4500000 FeatMorgan1 	20.1 3.9
4500000 FeatMorgan2 	38.0 7.7
4500000 FeatMorgan3 	55.9 11.8
4500000 RDKit5 	364.3 123.1
4500000 RDKit6 	624.4 234.6
4500000 RDKit7 	999.1 408.8
4500000 pattern 	447.0 122.7
4500000 avalon 	280.7 130.6
4500000 ap-counts 	166.3 56.4
4500000 tt-counts 	33.4 9.8
4500000 ap-bv 	267.3 89.9
4500000 tt-bv 	47.3 12.1
5000000 Morgan1 	29.4 5.6
5000000 Morgan2 	48.7 9.6
5000000 Morgan3 	66.8 13.8
5000000 FeatMorgan1 	20.1 3.9
5000000 FeatMorgan2 	38.1 7.7
5000000 FeatMorgan3 	56.0 11.8
5000000 RDKit5 	363.3 122.5
5000000 RDKit6 	621.7 233.2
5000000 RDKit7 	993.6 406.3
5000000 pattern 	445.5 122.5
5000000 avalon 	279.8 129.9
5000000 ap-counts 	166.6 56.3
5000000 tt-counts 	33.4 9.8
5000000 ap-bv 	267.3 90.0
5000000 tt-bv 	47.2 12.0

Let's look at those graphically:

morgan_ks = [x for x in counts.keys() if x[0] =='Morgan']
featmorgan_ks = [x for x in counts.keys() if x[0] =='FeatMorgan']
rdkit_ks = [x for x in counts.keys() if x[0] == 'RDKit']

figure(figsize=(15,15))

nmols2 = [x/1000000 for x in nmols]

pidx=1
subplot(3,3,pidx)
for n,r in morgan_ks:
    lmeans = means[(n,r)]
    ldevs = devs[(n,r)]
    errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
    
_=title("Morgan")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()

pidx=2
subplot(3,3,pidx)
for n,r in featmorgan_ks:
    lmeans = means[(n,r)]
    ldevs = devs[(n,r)]
    errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("FeatMorgan")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()

pidx=3
subplot(3,3,pidx)
for n,r in rdkit_ks:
    lmeans = means[(n,r)]
    ldevs = devs[(n,r)]
    errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("RDKit")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()

for k in counts.keys():
    if k[0] in ('Morgan','FeatMorgan','RDKit'):
        continue
    pidx+=1
    subplot(3,3,pidx)
    lmeans = means[k]
    ldevs = devs[k]
    errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
    _=title(k[0])
    _=xlabel("num mols (millions)")
    _=ylabel("count")
   

Looks like we would have been fine with 3 million molecules.