import os,sys
import math
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.optimize import curve_fit
from scipy.stats import linregress
from scipy.stats import multivariate_normal
from sklearn.model_selection import train_test_split
fdaDF=pd.read_csv("fda.csv")
chemDF=pd.read_csv("chembridge.csv")
mayDF=pd.read_csv("maybridge.csv")
fdaDF.describe()
chemDF.describe()
mayDF.describe()
def oned():
fig, axs = plt.subplots(2, 2,figsize=(12,12))
#fig=plt.figure(figsize=(10,10))
axs[0, 0].set_title('MW')
axs[0, 1].set_title('LogP')
axs[1, 0].set_title('H-Acceptors')
axs[1, 1].set_title('H-Donors')
axs[0, 0].set(xlabel='Molecular Weight (Dalton)', ylabel='Probability')
axs[0, 1].set(xlabel='LogP (o/w)', ylabel='Probability')
axs[1, 0].set(xlabel='H-Bond Acceptors', ylabel='Probability')
axs[1, 1].set(xlabel='H-Bond Donors', ylabel='Probability')
fdensMW = stats.gaussian_kde(fdaDF["Weight"])
fdensLogP = stats.gaussian_kde(fdaDF["LogP"])
fdensHACC = stats.gaussian_kde(fdaDF["Acceptors"])
fdensHDON = stats.gaussian_kde(fdaDF["Donors"])
_,xfMW = np.histogram(fdaDF["Weight"],bins=50)
_,xfLogP= np.histogram(fdaDF["LogP"],bins=50)
_,xfHACC= np.histogram(fdaDF["Acceptors"], bins=100)
_,xfHDON= np.histogram(fdaDF["Donors"], bins=100)
axs[0,0].plot(xfMW, fdensMW(xfMW),label="FDA")
axs[0,1].plot(xfLogP, fdensLogP(xfLogP),label="FDA")
axs[1,0].plot(xfHACC, fdensHACC(xfHACC),label="FDA")
axs[1,1].plot(xfHDON, fdensHDON(xfHDON),label="FDA")
cdensMW = stats.gaussian_kde(chemDF["Weight"])
cdensLogP = stats.gaussian_kde(chemDF["LogP"])
cdensHACC = stats.gaussian_kde(chemDF["Acceptors"])
cdensHDON = stats.gaussian_kde(chemDF["Donors"])
_,xMW = np.histogram(chemDF["Weight"],bins=50)
_,xLogP= np.histogram(chemDF["LogP"],bins=50)
_,xHACC= np.histogram(chemDF["Acceptors"], bins=100)
_,xHDON= np.histogram(chemDF["Donors"], bins=100)
axs[0,0].plot(xMW, cdensMW(xMW),label="CHEMBRIDGE")
axs[0,1].plot(xLogP, cdensLogP(xLogP),label="CHEMBRIDGE")
axs[1,0].plot(xHACC, cdensHACC(xHACC),label="CHEMBRIDGE")
axs[1,1].plot(xHDON, cdensHDON(xHDON),label="CHEMBRIDGE")
mdensMW = stats.gaussian_kde(mayDF["Weight"])
mdensLogP = stats.gaussian_kde(mayDF["LogP"])
mdensHACC = stats.gaussian_kde(mayDF["Acceptors"])
mdensHDON = stats.gaussian_kde(mayDF["Donors"])
_,xMW = np.histogram(mayDF["Weight"],bins=50)
_,xLogP= np.histogram(mayDF["LogP"],bins=50)
_,xHACC= np.histogram(mayDF["Acceptors"], bins=100)
_,xHDON= np.histogram(mayDF["Donors"], bins=100)
axs[0,0].plot(xMW, mdensMW(xMW),label="MAYBRIDGE")
axs[0,1].plot(xLogP, mdensLogP(xLogP),label="MAYBRIDGE")
axs[1,0].plot(xHACC, mdensHACC(xHACC),label="MAYBRIDGE")
axs[1,1].plot(xHDON, mdensHDON(xHDON),label="MAYBRIDGE")
axs[0,0].legend()
axs[0,1].legend()
axs[1,0].legend()
axs[1,1].legend()
fig.savefig("plot1.png")
oned()
def twod():
fig, axs = plt.subplots(3, 2,figsize=(14,14))
axs[0, 0].set(xlabel='Molecular Weight (Dalton)', ylabel='LogP(o/w)')
axs[0, 1].set(xlabel='Molecular Weight (Dalton)', ylabel='Donors')
axs[1, 0].set(xlabel='Molecular Weight (Dalton)', ylabel='Acceptors')
axs[1, 1].set(xlabel='LogP(o/w)', ylabel='Donors')
axs[2, 0].set(xlabel='LogP(o/w)', ylabel='Acceptors')
axs[2, 1].set(xlabel='Donors', ylabel='Acceptors')
axs[0,0].hist2d(fdaDF["Weight"],fdaDF["LogP"], bins=50)
axs[0,1].hist2d(fdaDF["Weight"],fdaDF["Donors"], bins=50)
axs[1,0].hist2d(fdaDF["Weight"],fdaDF["Acceptors"], bins=50)
axs[1,1].hist2d(fdaDF["LogP"],fdaDF["Donors"], bins=50)
axs[2,0].hist2d(fdaDF["LogP"],fdaDF["Acceptors"], bins=50)
axs[2,1].hist2d(fdaDF["Donors"],fdaDF["Acceptors"], bins=50)
fig.savefig("plot2.png")
twod()
fmean=fdaDF.mean().to_numpy()
fcov=fdaDF.cov().to_numpy()
fx = fdaDF[['LogP', 'Weight', 'Acceptors','Donors']].copy()
fxnp = fx.to_numpy()
trainf, testf = train_test_split(fx, test_size=0.2)
trainfmean=trainf.mean()
trainfcov=trainf.cov()
fdba = multivariate_normal(mean=fmean, cov=fcov)
tfdba = multivariate_normal(mean=trainfmean, cov=trainfcov)
#cmean=chemDF.mean().to_numpy()
#ccov=chemDF.cov().to_numpy()
cx = chemDF[['LogP', 'Weight', 'Acceptors','Donors']].copy()
cxnp = cx.to_numpy()
trainc, testc = train_test_split(cx, test_size=0.5)
#traincmean=trainc.mean()
#trainccov=trainc.cov()
#cdba = multivariate_normal(mean=cmean, cov=fcov)
#tcdba = multivariate_normal(mean=traincmean, cov=traincov)
#mmean=mayDF.mean().to_numpy()
#mcov=mayDF.cov().to_numpy()
mx = mayDF[['LogP', 'Weight', 'Acceptors','Donors']].copy()
mxnp = mx.to_numpy()
trainm, testm = train_test_split(mx, test_size=0.5)
#trainmmean=trainm.mean()
#trainmcov=trainm.cov()
#mdba = multivariate_normal(mean=mmean, cov=mcov)
#tmdba = multivariate_normal(mean=trainmmean, cov=trainmcov)
stdLogP = float(fdaDF[['LogP']].std())
stdMW = float(fdaDF[['Weight']].std())
stdHACC = float(fdaDF[['Acceptors']].std())
stdHDON = float(fdaDF[['Donors']].std())
meanLogP = float(fdaDF[['LogP']].mean())
meanMW = float(fdaDF[['Weight']].mean())
meanHACC = float(fdaDF[['Acceptors']].mean())
meanHDON = float(fdaDF[['Donors']].mean())
print (fmean)
print (fcov)
print ([meanLogP,meanMW,meanHACC,meanHDON])
print ([stdLogP**2,stdMW**2,stdHACC**2,stdHDON**2])
#m1=3.2; m2=292.0; m3=1.8; m4=0.5
#s11=4.4; s12=128.2; s13=-0.9; s14=-0.9; s22=12650.7; s23=103.2; s24=35.9; s33=3.5; s34=1.7; s44=2.0
#x1=2.0; x2=330.0; x3=2; x4=2
def dbacalc(x,idba):
cdba = math.log10(50*idba.pdf([x[0],x[1],x[2],x[3]]))
return cdba
def abs_dba(x,m1,m2,m3,m4,s11,s12,s13,s14,s22,s23,s24,s33,s34,s44):
dba=(50/(math.sqrt(s11*s22*s33*s44-s11*s22*s34*s34-s11*s23*s23*s44+s11*s23*s24*s34+s11*s24*s23*s34-s11*s24*s24*s33
-s12*s12*s33*s44+s12*s12*s34*s34+s12*s23*s13*s44-s12*s23*s14*s34-s12*s24*s13*s34+s12*s24*s14*s33
+s13*s12*s23*s44-s13*s12*s24*s34-s13*s22*s13*s44+s13*s22*s14*s34+s13*s24*s13*s24-s13*s24*s14*s23
-s14*s12*s23*s34+s14*s12*s24*s33+s14*s22*s13*s34-s14*s22*s14*s33-s14*s23*s13*s24+s14*s23*s14*s23)
*(2*3.1416)**2)) * math.exp( -0.5*((s22*s33*s44+s23*s34*s24+s24*s23*s34-s22*s34*s34-s23*s23*s44-s24*s33*s24)
*(x[0]-m1)*(x[0]-m1)+(s12*s34*s34+s13*s23*s44+s14*s33*s24-s12*s33*s44-s13*s34*s24-s14*s23*s34)
*((x[0]-m1)*(x[1]-m2)+(x[1]-m2)*(x[0]-m1))+ (s12*s23*s44+s13*s24*s24+s14*s22*s34-s12*s24*s34-s13*s22*s44-s14*s23*s24)
*((x[0]-m1)*(x[2]-m3)+(x[2]-m3)*(x[0]-m1))+ (s12*s24*s33+s13*s22*s34+s14*s23*s23-s12*s23*s34-s13*s24*s23-s14*s22*s33)
*((x[0]-m1)*(x[3]-m4)+(x[3]-m4)*(x[0]-m1))+ (s11*s33*s44+s13*s34*s14+s14*s13*s34-s11*s34*s34-s13*s13*s44-s14*s14*s33)
*(x[1]-m2)*(x[1]-m2) + (s11*s24*s34+s13*s12*s44+s14*s23*s14-s11*s23*s44-s13*s24*s14-s14*s12*s34)
*((x[1]-m2)*(x[2]-m3)+(x[2]-m3)*(x[1]-m2)) + (s11*s23*s34+s13*s24*s13+s14*s12*s33-s11*s24*s33-s13*s12*s34-s14*s23*s13)
*((x[1]-m2)*(x[3]-m4)+(x[3]-m4)*(x[1]-m2)) + (s11*s22*s44+s12*s24*s14+s14*s12*s24-s11*s24*s24-s12*s12*s44-s14*s14*s22)
*(x[2]-m3)*(x[2]-m3) + (s11*s24*s23+s12*s12*s34+s14*s22*s13-s11*s22*s34-s12*s24*s13-s14*s12*s23)
*((x[2]-m3)*(x[3]-m4)+(x[3]-m4)*(x[2]-m3)) + (s11*s22*s33+s12*s23*s13+s13*s12*s23-s11*s23*s23-s12*s12*s33-s13*s22*s13)
*(x[3]-m4)*(x[3]-m4))/(s11*s22*s33*s44-s11*s22*s34*s34-s11*s23*s23*s44+s11*s23*s24*s34+s11*s24*s23*s34-s11*s24*s24*s33
-s12*s12*s33*s44+s12*s12*s34*s34+s12*s23*s13*s44-s12*s23*s14*s34-s12*s24*s13*s34+s12*s24*s14*s33
+s13*s12*s23*s44-s13*s12*s24*s34-s13*s22*s13*s44+s13*s22*s14*s34+s13*s24*s13*s24-s13*s24*s14*s23
-s14*s12*s23*s34+s14*s12*s24*s33+s14*s22*s13*s34-s14*s22*s14*s33-s14*s23*s13*s24+s14*s23*s14*s23))
math.log10(dba)
return dba
def obs_dba(x,m1,m2,m3,m4,s1,s2,s3,s4):
odba = (50/(4*s1*s2*s3*s4*math.pi**2))*math.exp(-(((x[0]-m1)**2/(2*s1**2))
-((x[1]-m2)**2/(2*s2**2))
-((x[2]-m3)**2/(2*s3**2))
-((x[3]-m4)**2/(2*s4**2))))
odba = math.log10(odba)
return odba
pl =[]
testfnp = testf.to_numpy()
testcnp = testc.to_numpy()
testmnp = testm.to_numpy()
def plot_dba_hist(ex,alldba):
cl =[]
for i in range(len(ex)):
c = dbacalc([ex[i][0],ex[i][1],ex[i][2],ex[i][3]],alldba)
cl.append(c)
return cl
fxdba = plot_dba_hist(fxnp,fdba)
fdaDF['log4DBA']=fxdba
cxdba = plot_dba_hist(cxnp,fdba)
chemDF['log4DBA']=cxdba
mxdba = plot_dba_hist(mxnp,fdba)
mayDF['log4DBA']=mxdba
def onedba():
fig, axs = plt.subplots(1, 1,figsize=(12,12))
axs.set_title('log4DBA')
axs.set(xlabel='log 4DBA', ylabel='Probability')
axs.set_xlim(-12,-2)
fdens4dba = stats.gaussian_kde(fdaDF["log4DBA"])
_,xf4dba = np.histogram(fdaDF["log4DBA"],bins=50)
axs.plot(xf4dba, fdens4dba(xf4dba),label="FDA")
cdens4dba = stats.gaussian_kde(chemDF["log4DBA"])
_,xc4dba = np.histogram(chemDF["log4DBA"],bins=50)
axs.plot(xc4dba, cdens4dba(xc4dba),label="CHEMBRIDGE")
mdens4dba = stats.gaussian_kde(mayDF["log4DBA"])
_,xm4dba = np.histogram(mayDF["log4DBA"],bins=50)
axs.plot(xm4dba, mdens4dba(xm4dba),label="MAYBRIDGE")
axs.legend()
fig.savefig("plot3.png")
onedba()
m1=3.2; m2=292.0; m3=1.8; m4=0.5
s11=4.4; s12=128.2; s13=-0.9; s14=-0.9; s22=12650.7; s23=103.2; s24=35.9; s33=3.5; s34=1.7; s44=2.0
x1=2.0; x2=330.0; x3=2; x4=2
dba=(50/(math.sqrt(s11*s22*s33*s44-s11*s22*s34*s34-s11*s23*s23*s44+s11*s23*s24*s34+s11*s24*s23*s34-s11*s24*s24*s33-s12*s12*s33*s44+s12*s12*s34*s34+s12*s23*s13*s44-s12*s23*s14*s34-s12*s24*s13*s34+s12*s24*s14*s33+s13*s12*s23*s44-s13*s12*s24*s34-s13*s22*s13*s44+s13*s22*s14*s34+s13*s24*s13*s24-s13*s24*s14*s23-s14*s12*s23*s34+s14*s12*s24*s33+s14*s22*s13*s34-s14*s22*s14*s33-s14*s23*s13*s24+s14*s23*s14*s23)*(2*3.1416)**2)) * math.exp( -0.5*((s22*s33*s44+s23*s34*s24+s24*s23*s34-s22*s34*s34-s23*s23*s44-s24*s33*s24)*(x1-m1)*(x1-m1) + (s12*s34*s34+s13*s23*s44+s14*s33*s24-s12*s33*s44-s13*s34*s24-s14*s23*s34)*((x1-m1)*(x2-m2)+(x2-m2)*(x1-m1)) + (s12*s23*s44+s13*s24*s24+s14*s22*s34-s12*s24*s34-s13*s22*s44-s14*s23*s24)*((x1-m1)*(x3-m3)+(x3-m3)*(x1-m1)) + (s12*s24*s33+s13*s22*s34+s14*s23*s23-s12*s23*s34-s13*s24*s23-s14*s22*s33)*((x1-m1)*(x4-m4)+(x4-m4)*(x1-m1)) + (s11*s33*s44+s13*s34*s14+s14*s13*s34-s11*s34*s34-s13*s13*s44-s14*s14*s33)*(x2-m2)*(x2-m2) + (s11*s24*s34+s13*s12*s44+s14*s23*s14-s11*s23*s44-s13*s24*s14-s14*s12*s34)*((x2-m2)*(x3-m3)+(x3-m3)*(x2-m2)) + (s11*s23*s34+s13*s24*s13+s14*s12*s33-s11*s24*s33-s13*s12*s34-s14*s23*s13)*((x2-m2)*(x4-m4)+(x4-m4)*(x2-m2)) + (s11*s22*s44+s12*s24*s14+s14*s12*s24-s11*s24*s24-s12*s12*s44-s14*s14*s22)*(x3-m3)*(x3-m3) + (s11*s24*s23+s12*s12*s34+s14*s22*s13-s11*s22*s34-s12*s24*s13-s14*s12*s23)*((x3-m3)*(x4-m4)+(x4-m4)*(x3-m3)) + (s11*s22*s33+s12*s23*s13+s13*s12*s23-s11*s23*s23-s12*s12*s33-s13*s22*s13)*(x4-m4)*(x4-m4))/(s11*s22*s33*s44-s11*s22*s34*s34-s11*s23*s23*s44+s11*s23*s24*s34+s11*s24*s23*s34-s11*s24*s24*s33-s12*s12*s33*s44+s12*s12*s34*s34+s12*s23*s13*s44-s12*s23*s14*s34-s12*s24*s13*s34+s12*s24*s14*s33+s13*s12*s23*s44-s13*s12*s24*s34-s13*s22*s13*s44+s13*s22*s14*s34+s13*s24*s13*s24-s13*s24*s14*s23-s14*s12*s23*s34+s14*s12*s24*s33+s14*s22*s13*s34-s14*s22*s14*s33-s14*s23*s13*s24+s14*s23*s14*s23))
math.log10(dba)
fdamoeDF=pd.read_csv("fda_moe.txt")
plt.xlabel('LogP from MOE')
plt.ylabel('4DBA from MOE')
plt.scatter(fdamoeDF['logP(o/w)'],fdamoeDF['4DBA'])
plt.xlabel('LogP from RDKIT')
plt.ylabel('4DBA from RDKIT')
plt.scatter(fdaDF['LogP'],fdaDF['log4DBA'])
compDF = fdaDF.merge(fdamoeDF, left_on='Name', right_on='Vendor-ID')
plt.xlabel('LogP from RDKIT')
plt.ylabel('LogP from MOE')
plt.scatter(compDF['LogP'],compDF['logP(o/w)'])
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
fig.savefig('plot_logP.png')
plt.xlabel('4DBA from RDKIT')
plt.ylabel('4DBA from MOE')
plt.scatter(compDF['log4DBA'],compDF['4DBA'])
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
fig.savefig('plot_log4DBA.png')
# To be continued
## Ignore
###############
from sklearn.metrics import r2_score
cl =[]
pl =[]
testfnp = testf.to_numpy()
testcnp = testc.to_numpy()
testmnp = testm.to_numpy()
def plot_dba(etest,alldba,trdba):
for i in range(len(etest)):
c = dbacalc([etest[i][0],etest[i][1],etest[i][2],etest[i][3]],alldba)
p = dbacalc([etest[i][0],etest[i][1],etest[i][2],etest[i][3]],trdba)
cl.append(c)
pl.append(p)
fig = plt.figure()
ax = plt.subplot(111)
ax.scatter(pl, cl)
print(r2_score(cl, pl))
print(r2_score(cl, pl,multioutput='variance_weighted'))
plot_dba(testcnp,fdba,tfdba)
#ydata=[]
##xdata=[]
#for i in range(3050):
# ydata.append(obs_dba(fxnp[i],meanLogP,meanMW,meanHACC,meanHDON,stdLogP,stdMW,stdHACC,stdHDON))
#xdata=np.array(xdata)
#ydata=np.array(ydata)
#plt.hist(ydata,bins=50)
#popt, pcov = curve_fit(obs_dba,fxnp,ydata,p0=[meanLogP,meanMW,meanHACC,meanHDON,stdLogP,stdMW,stdHACC,stdHDON])
#curve_fit(obs_data,xdata,ydata)
#def func(x, a, b, c):
# return a * np.exp(-b * x) + c
#xdata = np.linspace(0, 4, 50)
#y = func(xdata, 2.5, 1.3, 0.5)