In [1]:
import os,sys
import math
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.optimize import curve_fit
from scipy.stats import linregress
from scipy.stats import multivariate_normal
from sklearn.model_selection import train_test_split
In [2]:
fdaDF=pd.read_csv("fda.csv")
chemDF=pd.read_csv("chembridge.csv")
mayDF=pd.read_csv("maybridge.csv")
In [ ]:
 
In [3]:
fdaDF.describe()
Out[3]:
LogP Weight Acceptors Donors
count 3050.000000 3050.000000 3050.000000 3050.000000
mean 0.979433 355.843251 5.012459 2.120656
std 2.927299 174.233500 3.565419 2.236259
min -13.200000 33.021000 0.000000 0.000000
25% -0.650000 243.151750 3.000000 1.000000
50% 1.340000 329.200000 4.000000 2.000000
75% 2.820000 434.811750 6.000000 3.000000
max 12.610000 1663.492000 31.000000 19.000000
In [4]:
chemDF.describe()
Out[4]:
LogP Weight Acceptors Donors
count 2.149606e+06 2.149606e+06 2.149606e+06 2.149606e+06
mean 1.712938e+00 3.482317e+02 3.680796e+00 1.772779e+00
std 1.886267e+00 5.519033e+01 1.521987e+00 1.112067e+00
min -9.760000e+00 7.406000e+01 0.000000e+00 0.000000e+00
25% 5.800000e-01 3.131640e+02 3.000000e+00 1.000000e+00
50% 1.840000e+00 3.500340e+02 4.000000e+00 2.000000e+00
75% 3.000000e+00 3.782960e+02 5.000000e+00 2.000000e+00
max 1.072000e+01 7.285260e+02 1.600000e+01 1.100000e+01
In [5]:
mayDF.describe()
Out[5]:
LogP Weight Acceptors Donors
count 128689.000000 128689.000000 128689.000000 128689.000000
mean 2.674715 353.753234 3.928727 1.457809
std 1.917827 78.462085 1.703206 1.168193
min -7.150000 86.084000 0.000000 0.000000
25% 1.580000 302.196000 3.000000 1.000000
50% 2.800000 357.172000 4.000000 1.000000
75% 3.950000 410.224000 5.000000 2.000000
max 15.880000 1296.610000 25.000000 11.000000
In [6]:
def oned():
    fig, axs = plt.subplots(2, 2,figsize=(12,12))
    #fig=plt.figure(figsize=(10,10))
    axs[0, 0].set_title('MW')
    axs[0, 1].set_title('LogP')
    axs[1, 0].set_title('H-Acceptors')
    axs[1, 1].set_title('H-Donors')
    axs[0, 0].set(xlabel='Molecular Weight (Dalton)', ylabel='Probability')
    axs[0, 1].set(xlabel='LogP (o/w)', ylabel='Probability')
    axs[1, 0].set(xlabel='H-Bond Acceptors', ylabel='Probability')
    axs[1, 1].set(xlabel='H-Bond Donors', ylabel='Probability')

    fdensMW = stats.gaussian_kde(fdaDF["Weight"])
    fdensLogP = stats.gaussian_kde(fdaDF["LogP"])
    fdensHACC = stats.gaussian_kde(fdaDF["Acceptors"])
    fdensHDON = stats.gaussian_kde(fdaDF["Donors"])

    _,xfMW  = np.histogram(fdaDF["Weight"],bins=50)
    _,xfLogP= np.histogram(fdaDF["LogP"],bins=50)
    _,xfHACC= np.histogram(fdaDF["Acceptors"], bins=100) 
    _,xfHDON= np.histogram(fdaDF["Donors"], bins=100) 

    axs[0,0].plot(xfMW, fdensMW(xfMW),label="FDA")
    axs[0,1].plot(xfLogP, fdensLogP(xfLogP),label="FDA")
    axs[1,0].plot(xfHACC, fdensHACC(xfHACC),label="FDA")
    axs[1,1].plot(xfHDON, fdensHDON(xfHDON),label="FDA")

    cdensMW = stats.gaussian_kde(chemDF["Weight"])
    cdensLogP = stats.gaussian_kde(chemDF["LogP"])
    cdensHACC = stats.gaussian_kde(chemDF["Acceptors"])
    cdensHDON = stats.gaussian_kde(chemDF["Donors"])

    _,xMW  = np.histogram(chemDF["Weight"],bins=50)
    _,xLogP= np.histogram(chemDF["LogP"],bins=50)
    _,xHACC= np.histogram(chemDF["Acceptors"], bins=100) 
    _,xHDON= np.histogram(chemDF["Donors"], bins=100) 

    axs[0,0].plot(xMW, cdensMW(xMW),label="CHEMBRIDGE")
    axs[0,1].plot(xLogP, cdensLogP(xLogP),label="CHEMBRIDGE")
    axs[1,0].plot(xHACC, cdensHACC(xHACC),label="CHEMBRIDGE")
    axs[1,1].plot(xHDON, cdensHDON(xHDON),label="CHEMBRIDGE")

    mdensMW = stats.gaussian_kde(mayDF["Weight"])
    mdensLogP = stats.gaussian_kde(mayDF["LogP"])
    mdensHACC = stats.gaussian_kde(mayDF["Acceptors"])
    mdensHDON = stats.gaussian_kde(mayDF["Donors"])

    _,xMW  = np.histogram(mayDF["Weight"],bins=50)
    _,xLogP= np.histogram(mayDF["LogP"],bins=50)
    _,xHACC= np.histogram(mayDF["Acceptors"], bins=100) 
    _,xHDON= np.histogram(mayDF["Donors"], bins=100) 

    axs[0,0].plot(xMW, mdensMW(xMW),label="MAYBRIDGE")
    axs[0,1].plot(xLogP, mdensLogP(xLogP),label="MAYBRIDGE")
    axs[1,0].plot(xHACC, mdensHACC(xHACC),label="MAYBRIDGE")
    axs[1,1].plot(xHDON, mdensHDON(xHDON),label="MAYBRIDGE")
    axs[0,0].legend()
    axs[0,1].legend()
    axs[1,0].legend()
    axs[1,1].legend()
    fig.savefig("plot1.png")
In [7]:
oned()
In [8]:
def twod():
    fig, axs = plt.subplots(3, 2,figsize=(14,14))
    axs[0, 0].set(xlabel='Molecular Weight (Dalton)', ylabel='LogP(o/w)')
    axs[0, 1].set(xlabel='Molecular Weight (Dalton)', ylabel='Donors')
    axs[1, 0].set(xlabel='Molecular Weight (Dalton)', ylabel='Acceptors')
    axs[1, 1].set(xlabel='LogP(o/w)', ylabel='Donors')
    axs[2, 0].set(xlabel='LogP(o/w)', ylabel='Acceptors')
    axs[2, 1].set(xlabel='Donors', ylabel='Acceptors')

    axs[0,0].hist2d(fdaDF["Weight"],fdaDF["LogP"], bins=50) 
    axs[0,1].hist2d(fdaDF["Weight"],fdaDF["Donors"], bins=50) 
    axs[1,0].hist2d(fdaDF["Weight"],fdaDF["Acceptors"], bins=50) 
    axs[1,1].hist2d(fdaDF["LogP"],fdaDF["Donors"], bins=50) 
    axs[2,0].hist2d(fdaDF["LogP"],fdaDF["Acceptors"], bins=50) 
    axs[2,1].hist2d(fdaDF["Donors"],fdaDF["Acceptors"], bins=50) 
    fig.savefig("plot2.png")
In [9]:
twod()
In [10]:
fmean=fdaDF.mean().to_numpy()
fcov=fdaDF.cov().to_numpy()
fx = fdaDF[['LogP', 'Weight', 'Acceptors','Donors']].copy()
fxnp = fx.to_numpy()
trainf, testf = train_test_split(fx, test_size=0.2)
trainfmean=trainf.mean()
trainfcov=trainf.cov()
fdba = multivariate_normal(mean=fmean, cov=fcov)
tfdba = multivariate_normal(mean=trainfmean, cov=trainfcov)

#cmean=chemDF.mean().to_numpy()
#ccov=chemDF.cov().to_numpy()
cx = chemDF[['LogP', 'Weight', 'Acceptors','Donors']].copy()
cxnp = cx.to_numpy()
trainc, testc = train_test_split(cx, test_size=0.5)
#traincmean=trainc.mean()
#trainccov=trainc.cov()
#cdba = multivariate_normal(mean=cmean, cov=fcov)
#tcdba = multivariate_normal(mean=traincmean, cov=traincov)

#mmean=mayDF.mean().to_numpy()
#mcov=mayDF.cov().to_numpy()
mx = mayDF[['LogP', 'Weight', 'Acceptors','Donors']].copy()
mxnp = mx.to_numpy()
trainm, testm = train_test_split(mx, test_size=0.5)
#trainmmean=trainm.mean()
#trainmcov=trainm.cov()
#mdba = multivariate_normal(mean=mmean, cov=mcov)
#tmdba = multivariate_normal(mean=trainmmean, cov=trainmcov)
In [11]:
stdLogP = float(fdaDF[['LogP']].std())
stdMW = float(fdaDF[['Weight']].std())
stdHACC = float(fdaDF[['Acceptors']].std())
stdHDON = float(fdaDF[['Donors']].std())
meanLogP = float(fdaDF[['LogP']].mean())
meanMW = float(fdaDF[['Weight']].mean())
meanHACC = float(fdaDF[['Acceptors']].mean())
meanHDON = float(fdaDF[['Donors']].mean())
print (fmean)
print (fcov)
print ([meanLogP,meanMW,meanHACC,meanHDON])
print ([stdLogP**2,stdMW**2,stdHACC**2,stdHDON**2])
    
#m1=3.2; m2=292.0; m3=1.8; m4=0.5
#s11=4.4; s12=128.2; s13=-0.9; s14=-0.9; s22=12650.7; s23=103.2; s24=35.9; s33=3.5; s34=1.7; s44=2.0
#x1=2.0; x2=330.0; x3=2; x4=2
[  0.97943279 355.84325082   5.01245902   2.12065574]
[[ 8.56908197e+00  3.59738370e+01 -4.08947801e+00 -3.43380166e+00]
 [ 3.59738370e+01  3.03573126e+04  4.41390084e+02  2.11249279e+02]
 [-4.08947801e+00  4.41390084e+02  1.27122094e+01  5.30679406e+00]
 [-3.43380166e+00  2.11249279e+02  5.30679406e+00  5.00085231e+00]]
[0.979432786885246, 355.84325081967216, 5.012459016393443, 2.120655737704918]
[8.569081967438933, 30357.312564498887, 12.712209431740586, 5.000852308469856]
In [12]:
def dbacalc(x,idba):
    cdba = math.log10(50*idba.pdf([x[0],x[1],x[2],x[3]]))
    return cdba

def abs_dba(x,m1,m2,m3,m4,s11,s12,s13,s14,s22,s23,s24,s33,s34,s44):
    dba=(50/(math.sqrt(s11*s22*s33*s44-s11*s22*s34*s34-s11*s23*s23*s44+s11*s23*s24*s34+s11*s24*s23*s34-s11*s24*s24*s33
                      -s12*s12*s33*s44+s12*s12*s34*s34+s12*s23*s13*s44-s12*s23*s14*s34-s12*s24*s13*s34+s12*s24*s14*s33
                      +s13*s12*s23*s44-s13*s12*s24*s34-s13*s22*s13*s44+s13*s22*s14*s34+s13*s24*s13*s24-s13*s24*s14*s23
                      -s14*s12*s23*s34+s14*s12*s24*s33+s14*s22*s13*s34-s14*s22*s14*s33-s14*s23*s13*s24+s14*s23*s14*s23)
                    *(2*3.1416)**2)) * math.exp( -0.5*((s22*s33*s44+s23*s34*s24+s24*s23*s34-s22*s34*s34-s23*s23*s44-s24*s33*s24)
                    *(x[0]-m1)*(x[0]-m1)+(s12*s34*s34+s13*s23*s44+s14*s33*s24-s12*s33*s44-s13*s34*s24-s14*s23*s34)
                    *((x[0]-m1)*(x[1]-m2)+(x[1]-m2)*(x[0]-m1))+ (s12*s23*s44+s13*s24*s24+s14*s22*s34-s12*s24*s34-s13*s22*s44-s14*s23*s24)
                    *((x[0]-m1)*(x[2]-m3)+(x[2]-m3)*(x[0]-m1))+ (s12*s24*s33+s13*s22*s34+s14*s23*s23-s12*s23*s34-s13*s24*s23-s14*s22*s33)
                    *((x[0]-m1)*(x[3]-m4)+(x[3]-m4)*(x[0]-m1))+ (s11*s33*s44+s13*s34*s14+s14*s13*s34-s11*s34*s34-s13*s13*s44-s14*s14*s33)
                    *(x[1]-m2)*(x[1]-m2) + (s11*s24*s34+s13*s12*s44+s14*s23*s14-s11*s23*s44-s13*s24*s14-s14*s12*s34)
                    *((x[1]-m2)*(x[2]-m3)+(x[2]-m3)*(x[1]-m2)) + (s11*s23*s34+s13*s24*s13+s14*s12*s33-s11*s24*s33-s13*s12*s34-s14*s23*s13)
                    *((x[1]-m2)*(x[3]-m4)+(x[3]-m4)*(x[1]-m2)) + (s11*s22*s44+s12*s24*s14+s14*s12*s24-s11*s24*s24-s12*s12*s44-s14*s14*s22)
                    *(x[2]-m3)*(x[2]-m3) + (s11*s24*s23+s12*s12*s34+s14*s22*s13-s11*s22*s34-s12*s24*s13-s14*s12*s23)
                    *((x[2]-m3)*(x[3]-m4)+(x[3]-m4)*(x[2]-m3)) + (s11*s22*s33+s12*s23*s13+s13*s12*s23-s11*s23*s23-s12*s12*s33-s13*s22*s13)
                    *(x[3]-m4)*(x[3]-m4))/(s11*s22*s33*s44-s11*s22*s34*s34-s11*s23*s23*s44+s11*s23*s24*s34+s11*s24*s23*s34-s11*s24*s24*s33
                                       -s12*s12*s33*s44+s12*s12*s34*s34+s12*s23*s13*s44-s12*s23*s14*s34-s12*s24*s13*s34+s12*s24*s14*s33
                                       +s13*s12*s23*s44-s13*s12*s24*s34-s13*s22*s13*s44+s13*s22*s14*s34+s13*s24*s13*s24-s13*s24*s14*s23
                                       -s14*s12*s23*s34+s14*s12*s24*s33+s14*s22*s13*s34-s14*s22*s14*s33-s14*s23*s13*s24+s14*s23*s14*s23))
    math.log10(dba)
    return dba

def obs_dba(x,m1,m2,m3,m4,s1,s2,s3,s4):
    odba = (50/(4*s1*s2*s3*s4*math.pi**2))*math.exp(-(((x[0]-m1)**2/(2*s1**2))
                                                     -((x[1]-m2)**2/(2*s2**2))
                                                     -((x[2]-m3)**2/(2*s3**2))
                                                     -((x[3]-m4)**2/(2*s4**2))))
    odba = math.log10(odba)
    return odba
In [22]:
pl =[]
testfnp = testf.to_numpy()
testcnp = testc.to_numpy()
testmnp = testm.to_numpy()

def plot_dba_hist(ex,alldba):
    cl =[]
    for i in range(len(ex)):
        c = dbacalc([ex[i][0],ex[i][1],ex[i][2],ex[i][3]],alldba) 
        cl.append(c) 
        
    return cl
fxdba = plot_dba_hist(fxnp,fdba)
fdaDF['log4DBA']=fxdba
cxdba = plot_dba_hist(cxnp,fdba)
chemDF['log4DBA']=cxdba
mxdba = plot_dba_hist(mxnp,fdba)
mayDF['log4DBA']=mxdba
In [30]:
def onedba():
    fig, axs = plt.subplots(1, 1,figsize=(12,12))
    axs.set_title('log4DBA')
    axs.set(xlabel='log 4DBA', ylabel='Probability')
    axs.set_xlim(-12,-2)
    fdens4dba = stats.gaussian_kde(fdaDF["log4DBA"])
    _,xf4dba  = np.histogram(fdaDF["log4DBA"],bins=50)
    axs.plot(xf4dba, fdens4dba(xf4dba),label="FDA")
    cdens4dba = stats.gaussian_kde(chemDF["log4DBA"])
    _,xc4dba  = np.histogram(chemDF["log4DBA"],bins=50)
    axs.plot(xc4dba, cdens4dba(xc4dba),label="CHEMBRIDGE")
    mdens4dba = stats.gaussian_kde(mayDF["log4DBA"])
    _,xm4dba  = np.histogram(mayDF["log4DBA"],bins=50)
    axs.plot(xm4dba, mdens4dba(xm4dba),label="MAYBRIDGE")
    axs.legend()
    fig.savefig("plot3.png")
onedba()    
In [31]:
m1=3.2; m2=292.0; m3=1.8; m4=0.5
s11=4.4; s12=128.2; s13=-0.9; s14=-0.9; s22=12650.7; s23=103.2; s24=35.9; s33=3.5; s34=1.7; s44=2.0
x1=2.0; x2=330.0; x3=2; x4=2
dba=(50/(math.sqrt(s11*s22*s33*s44-s11*s22*s34*s34-s11*s23*s23*s44+s11*s23*s24*s34+s11*s24*s23*s34-s11*s24*s24*s33-s12*s12*s33*s44+s12*s12*s34*s34+s12*s23*s13*s44-s12*s23*s14*s34-s12*s24*s13*s34+s12*s24*s14*s33+s13*s12*s23*s44-s13*s12*s24*s34-s13*s22*s13*s44+s13*s22*s14*s34+s13*s24*s13*s24-s13*s24*s14*s23-s14*s12*s23*s34+s14*s12*s24*s33+s14*s22*s13*s34-s14*s22*s14*s33-s14*s23*s13*s24+s14*s23*s14*s23)*(2*3.1416)**2)) * math.exp( -0.5*((s22*s33*s44+s23*s34*s24+s24*s23*s34-s22*s34*s34-s23*s23*s44-s24*s33*s24)*(x1-m1)*(x1-m1) + (s12*s34*s34+s13*s23*s44+s14*s33*s24-s12*s33*s44-s13*s34*s24-s14*s23*s34)*((x1-m1)*(x2-m2)+(x2-m2)*(x1-m1)) + (s12*s23*s44+s13*s24*s24+s14*s22*s34-s12*s24*s34-s13*s22*s44-s14*s23*s24)*((x1-m1)*(x3-m3)+(x3-m3)*(x1-m1)) + (s12*s24*s33+s13*s22*s34+s14*s23*s23-s12*s23*s34-s13*s24*s23-s14*s22*s33)*((x1-m1)*(x4-m4)+(x4-m4)*(x1-m1)) + (s11*s33*s44+s13*s34*s14+s14*s13*s34-s11*s34*s34-s13*s13*s44-s14*s14*s33)*(x2-m2)*(x2-m2) + (s11*s24*s34+s13*s12*s44+s14*s23*s14-s11*s23*s44-s13*s24*s14-s14*s12*s34)*((x2-m2)*(x3-m3)+(x3-m3)*(x2-m2)) + (s11*s23*s34+s13*s24*s13+s14*s12*s33-s11*s24*s33-s13*s12*s34-s14*s23*s13)*((x2-m2)*(x4-m4)+(x4-m4)*(x2-m2)) + (s11*s22*s44+s12*s24*s14+s14*s12*s24-s11*s24*s24-s12*s12*s44-s14*s14*s22)*(x3-m3)*(x3-m3) + (s11*s24*s23+s12*s12*s34+s14*s22*s13-s11*s22*s34-s12*s24*s13-s14*s12*s23)*((x3-m3)*(x4-m4)+(x4-m4)*(x3-m3)) + (s11*s22*s33+s12*s23*s13+s13*s12*s23-s11*s23*s23-s12*s12*s33-s13*s22*s13)*(x4-m4)*(x4-m4))/(s11*s22*s33*s44-s11*s22*s34*s34-s11*s23*s23*s44+s11*s23*s24*s34+s11*s24*s23*s34-s11*s24*s24*s33-s12*s12*s33*s44+s12*s12*s34*s34+s12*s23*s13*s44-s12*s23*s14*s34-s12*s24*s13*s34+s12*s24*s14*s33+s13*s12*s23*s44-s13*s12*s24*s34-s13*s22*s13*s44+s13*s22*s14*s34+s13*s24*s13*s24-s13*s24*s14*s23-s14*s12*s23*s34+s14*s12*s24*s33+s14*s22*s13*s34-s14*s22*s14*s33-s14*s23*s13*s24+s14*s23*s14*s23))
math.log10(dba)
Out[31]:
-2.970317659098872
In [33]:
fdamoeDF=pd.read_csv("fda_moe.txt")
In [54]:
plt.xlabel('LogP from MOE')
plt.ylabel('4DBA from MOE')
plt.scatter(fdamoeDF['logP(o/w)'],fdamoeDF['4DBA'])
Out[54]:
<matplotlib.collections.PathCollection at 0x7f8378b38310>
In [55]:
plt.xlabel('LogP from RDKIT')
plt.ylabel('4DBA from RDKIT')
plt.scatter(fdaDF['LogP'],fdaDF['log4DBA'])
Out[55]:
<matplotlib.collections.PathCollection at 0x7f833cea46d0>
In [56]:
compDF = fdaDF.merge(fdamoeDF, left_on='Name', right_on='Vendor-ID') 
plt.xlabel('LogP from RDKIT')
plt.ylabel('LogP from MOE')
plt.scatter(compDF['LogP'],compDF['logP(o/w)'])
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
fig.savefig('plot_logP.png')
In [57]:
plt.xlabel('4DBA from RDKIT')
plt.ylabel('4DBA from MOE')
plt.scatter(compDF['log4DBA'],compDF['4DBA'])
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
fig.savefig('plot_log4DBA.png')
In [52]:
# To be continued 
## Ignore
###############
from sklearn.metrics import r2_score

cl =[]
pl =[]
testfnp = testf.to_numpy()
testcnp = testc.to_numpy()
testmnp = testm.to_numpy()

def plot_dba(etest,alldba,trdba):
    for i in range(len(etest)):
        c = dbacalc([etest[i][0],etest[i][1],etest[i][2],etest[i][3]],alldba) 
        p = dbacalc([etest[i][0],etest[i][1],etest[i][2],etest[i][3]],trdba)
        cl.append(c) 
        pl.append(p) 
    
    fig = plt.figure()
    ax = plt.subplot(111)
    ax.scatter(pl, cl)
    print(r2_score(cl, pl))
    print(r2_score(cl, pl,multioutput='variance_weighted'))
    
plot_dba(testcnp,fdba,tfdba)
0.9996193016314454
0.9996193016314454
In [ ]:
#ydata=[]
##xdata=[]
#for i in range(3050):
#    ydata.append(obs_dba(fxnp[i],meanLogP,meanMW,meanHACC,meanHDON,stdLogP,stdMW,stdHACC,stdHDON))
#xdata=np.array(xdata)                 
#ydata=np.array(ydata)

#plt.hist(ydata,bins=50)                 
#popt, pcov = curve_fit(obs_dba,fxnp,ydata,p0=[meanLogP,meanMW,meanHACC,meanHDON,stdLogP,stdMW,stdHACC,stdHDON])

#curve_fit(obs_data,xdata,ydata)
#def func(x, a, b, c):
#    return a * np.exp(-b * x) + c
#xdata = np.linspace(0, 4, 50)

#y = func(xdata, 2.5, 1.3, 0.5)
In [ ]: