
Building a QSAR Model with PyTorch (Prediction)

!python --version
import deepchem as dc
dc.__version__
'2.3.0'
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np

Building a QSAR Model with PyTorch

import torch
torch.__version__
'1.6.0'

Load the data

!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
data = pd.read_csv("delaney-processed.csv")
data.head(1)
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
data[["smiles","Molecule"]].head(1)
from math import sqrt
print(sqrt(4096))
# https://www.rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html
def mol2fp(mol):
    #The usual ECFP hash size is 2048, but it is increased to 4096 here so the fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar
     
fp = mol2fp(Chem.MolFromSmiles(data.loc[1,"smiles"]))
plt.matshow(fp.reshape((64,-1)) > 0)
<matplotlib.image.AxesImage at 0x7ff79dc9fa50>
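A side note on this featurization (my addition, not part of the original notebook): GetHashedMorganFingerprint returns a count fingerprint, so after ConvertToNumpyArray the entries hold substructure counts rather than bits, which is why the plot above masks with > 0. A minimal sketch contrasting it with the bit-vector variant, GetMorganFingerprintAsBitVect, using the imports already loaded above; the phenol SMILES is an arbitrary example:

#Sketch only: count fingerprint vs. bit-vector fingerprint (phenol is an arbitrary molecule)
example = Chem.MolFromSmiles("c1ccccc1O")
count_fp = AllChem.GetHashedMorganFingerprint(example, 2, nBits=4096)
count_ar = np.zeros((1,), dtype=np.int8)
DataStructs.ConvertToNumpyArray(count_fp, count_ar)   #entries are counts and can exceed 1
bit_fp = AllChem.GetMorganFingerprintAsBitVect(example, 2, nBits=4096)
bit_ar = np.zeros((1,), dtype=np.int8)
DataStructs.ConvertToNumpyArray(bit_fp, bit_ar)       #entries are strictly 0 or 1
print(count_ar.max(), bit_ar.max())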
data["FPs"] = data.Molecule.apply(mol2fp)
data.head(1)

Data Transform

#np.stack merges the per-row FPs arrays stored in the dataframe into one np.ndarray
X = np.stack(data.FPs.values)
print(X.shape)
print(X)
print(type(X))
print(type(data["measured log solubility in mols per litre"]))
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
print(y)
print(type(y))
#Split the data randomly with sklearn's train_test_split
#random_state=42 is fixed; changing it changes which rows land in each split, so keep it in mind
#test_size=0.1 means 10% of the whole dataset is held out for testing and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.10, random_state=42)
#Split once more to carve a validation set out of the training data (10% of the training rows)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train,  test_size=0.1, random_state=42)
#Standardize the targets to Z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
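As a quick sanity check (added here, not in the original notebook), the chained splits leave about 81% of the rows for training (90% of 90%), 9% for validation, and 10% for testing, and the scaler statistics come from the training targets only:

#Sketch only: verify the chained split fractions and the fitted scaler
n = len(X)
print(len(X_train)/n, len(X_validation)/n, len(X_test)/n)  #~0.81, ~0.09, ~0.10
print(y_train.mean(), y_train.std())  #~0 and ~1 after fit_transform on the training targets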
# Train on the CPU
X_train = torch.tensor(X_train).float()
X_test = torch.tensor(X_test).float()
X_validation = torch.tensor(X_validation).float()
y_train = torch.tensor(y_train).float()
y_test = torch.tensor(y_test).float()
y_validation = torch.tensor(y_validation).float()
X_train
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
X_train.shape, X_validation.shape, X_test.shape
(torch.Size([913, 4096]), torch.Size([102, 4096]), torch.Size([113, 4096]))
y_train.shape
torch.Size([913, 1])
#TensorDataset pairs up each input with its target
#so that torch's DataLoader can serve matching input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
#DataLoader splits the full train and validation sets into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=256,
                                          shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
                                          batch_size=256,
                                          shuffle=True)
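To see what the loaders hand to the training loop (an illustrative check, not in the original), pull one mini-batch and inspect its shapes:

#Sketch only: inspect a single mini-batch from the training loader
fps_batch, target_batch = next(iter(train_loader))
print(fps_batch.shape)    #torch.Size([256, 4096]), a batch of fingerprints
print(target_batch.shape) #torch.Size([256, 1]), the matching solubility targets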

DNN Model

#Build a custom deep learning model
#Layers are added as nn.XXX modules
#The model repeats a (Linear, ReLU, Dropout) block three times; LayerNorm layers are included but commented out
#Plain Linear layers are stacked, with Dropout (and optionally LayerNorm) added for generalization
class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(MLPModel, self).__init__()
        #Three fully connected Linear layers
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size) # Output layer
        #LayerNorm layers to speed up training (disabled here)
        #self.ln1 = nn.LayerNorm(hidden_size)
        #self.ln2 = nn.LayerNorm(hidden_size)
        #self.ln3 = nn.LayerNorm(hidden_size)
        #ReLU activation function
        self.activation = nn.ReLU()
        #Dropout for generalization
        self.dropout = nn.Dropout(dropout_rate)
    #The forward pass wires the layers together
    def forward(self, x):
        out = self.linear1(x)
        #out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)
        #Everything up to here forms one (Linear, ReLU, Dropout) block
        out = self.linear2(out)
        #out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.linear3(out)
        #out = self.ln3(out)
        out = self.activation(out)
        out = self.dropout(out)
        #Final output layer
        out = self.fc_out(out)
        return out
#Hyperparameter settings
input_size = X_train.size()[-1]     # The input size should fit our fingerprint size
hidden_size = 1024   # The size of the hidden layer
dropout_rate = 0.8   # The dropout rate
output_size = 1        # This is just a single task, so this will be one
learning_rate = 0.001  # The learning rate for the optimizer
model = MLPModel(input_size, hidden_size, dropout_rate, output_size)
model
from IPython.display import Image
Image(url='https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fk5Ch3%2FbtqDL4jjXl1%2F93WikBjXpxJ0e7kYy8c8SK%2Fimg.gif')
#Set the loss function and optimizer
#Loss function: Mean Squared Error
#Optimizer: Adam
#pytorch Loss function: https://pytorch.org/docs/stable/_modules/torch/nn/modules/loss.html
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Training

#Comparing the speed of training in CPU mode vs. GPU mode
from sklearn.metrics import r2_score
import timeit
start_time = timeit.default_timer()
list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []
epochs = 200
for e in range(epochs):
    running_loss = 0
    for X, y in train_loader:
        model.train() #switch to training mode
        optimizer.zero_grad() #reset the gradients to zero

        output = model(X) #forward pass on the mini-batch
        loss = criterion(output, y) #compute the loss with the loss function
        loss.backward() #backward pass: gradients are computed for the parameters the optimizer manages
        optimizer.step() #update the parameters from the gradients
        running_loss += loss.item()
    else: #the for-else body runs once the inner loop completes (there is no break)
        if e%50 == 0:
            #mse
            validation_loss = torch.mean((y_validation - model(X_validation))**2)
            list_r2.append(r2_score(y_validation.detach(), model(X_validation).detach()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
terminate_time = timeit.default_timer()
print("Took %f seconds." % (terminate_time - start_time))
63.1s
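One caveat worth flagging (my note, not the original author's): the validation loss above is computed while the model is still in train mode and without torch.no_grad(), so dropout stays active during validation. A cleaner validation step could look like this sketch:

#Sketch only: validate with dropout off and gradient tracking disabled
model.eval()
with torch.no_grad():
    val_pred = model(X_validation)
    validation_loss = torch.mean((y_validation - val_pred)**2).item()
model.train() #switch back before the next training epoch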

Evaluating the DNN model

fig = plt.figure(figsize=(15,5))
# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')
# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.set_ylim(0.6, 1.0)
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')
plt.show()
#Switch to evaluation mode
model.eval()
y_pred_train = model(X_train)
y_pred_validation = model(X_validation)
y_pred_test = model(X_test)
#Compute the MSE and R2 score on the test set
print(torch.mean(( y_test - y_pred_test )**2).item())
print(r2_score(y_test.detach().cpu().clone() , y_pred_test.detach().cpu().clone()))
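Since the targets were standardized, the MSE above is in Z-score units. To report the error in the original log-solubility units, the fitted scaler can undo the transform first (a sketch I'm adding, not in the original):

#Sketch only: express the test error in the original log(mol/L) units
y_test_orig = scaler.inverse_transform(y_test.detach().numpy())
y_pred_orig = scaler.inverse_transform(y_pred_test.detach().numpy())
print(np.sqrt(np.mean((y_test_orig - y_pred_orig)**2)))  #RMSE in log-solubility units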

Using the GPU with PyTorch

!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Load the data

data = pd.read_csv("delaney-processed.csv")

Data Transform

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
def mol2fp(mol):
    #The usual ECFP hash size is 2048, but it is increased to 4096 here so the fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar
fp = mol2fp(Chem.MolFromSmiles(data.loc[1,"smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)
data.head(1)
X = np.stack(data.FPs.values)
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
#Split the data randomly with sklearn's train_test_split
#random_state=42 is fixed; changing it changes which rows land in each split, so keep it in mind
#test_size=0.1 means 10% of the whole dataset is held out for testing and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.10, random_state=42)
#Split once more to carve a validation set out of the training data (10% of the training rows)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train,  test_size=0.1, random_state=42)
#Standardize the targets to Z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

GPU DNN Model

#Build a custom deep learning model
#Layers are added as nn.XXX modules
#The model repeats a (Linear, ReLU, Dropout) block three times; LayerNorm layers are included but commented out
#Plain Linear layers are stacked, with Dropout (and optionally LayerNorm) added for generalization
class cudaMLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(cudaMLPModel, self).__init__()
        #Three fully connected Linear layers
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size) # Output layer
        #LayerNorm layers to speed up training (disabled here)
        #self.ln1 = nn.LayerNorm(hidden_size)
        #self.ln2 = nn.LayerNorm(hidden_size)
        #self.ln3 = nn.LayerNorm(hidden_size)
        #ReLU activation function
        self.activation = nn.ReLU()
        #Dropout for generalization
        self.dropout = nn.Dropout(dropout_rate)
    #The forward pass wires the layers together
    def forward(self, x):
        out = self.linear1(x)
        #out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)
        #Everything up to here forms one (Linear, ReLU, Dropout) block
        out = self.linear2(out)
        #out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.linear3(out)
        #out = self.ln3(out)
        out = self.activation(out)
        out = self.dropout(out)
        #Final output layer
        out = self.fc_out(out)
        return out
#Hyperparameter settings
input_size = X_train.size()[-1]     # The input size should fit our fingerprint size
hidden_size = 1024   # The size of the hidden layer
dropout_rate = 0.8   # The dropout rate
output_size = 1        # This is just a single task, so this will be one
learning_rate = 0.0001  # The learning rate for the optimizer
cudamodel = cudaMLPModel(input_size, hidden_size, dropout_rate, output_size)
#Move the model's parameters to the GPU for training
cudamodel.cuda()
cudaMLPModel(
  (linear1): Linear(in_features=4096, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=1024, bias=True)
  (linear3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc_out): Linear(in_features=1024, out_features=1, bias=True)
  (activation): ReLU()
  (dropout): Dropout(p=0.8, inplace=False)
)
#TensorDataset pairs up each input with its target
#so that torch's DataLoader can serve matching input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
#DataLoader splits the full train and validation sets into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=256,
                                          shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
                                          batch_size=256,
                                          shuffle=True)
#Set the loss function and optimizer
criterion2 = nn.MSELoss()
#weight_decay=0
optimizer2 = torch.optim.Adam(cudamodel.parameters(), lr=learning_rate)

Training

#GPU mode
from sklearn.metrics import r2_score
import timeit
start_time = timeit.default_timer()
list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []
cudamodel.train()
epochs = 200
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer2.zero_grad() # Initialize the gradients, which will be recorded during the forward pass
         
        output = cudamodel(fps) #Forward pass of the mini-batch
        loss = criterion2(output, labels) #Computing the loss
        loss.backward() # calculate the backward pass
        optimizer2.step() # Optimize the weights
        running_loss += loss.item()
    else:
        if e%50 == 0:
            validation_loss = torch.mean(( y_validation - cudamodel(X_validation) )**2).item()
            list_r2.append(r2_score(y_validation.detach().cpu(), cudamodel(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
terminate_time = timeit.default_timer()
print("%f초 걸렸습니다."% (terminate_time-start_time))
4.4s
Python

Evaluating the GPU DNN model

fig = plt.figure(figsize=(15,5))
# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')
# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.set_ylim(0, 1.0)
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')
plt.show()
#Switch to evaluation mode
cudamodel.eval()
y_pred_train = cudamodel(X_train)
y_pred_validation = cudamodel(X_validation)
y_pred_test = cudamodel(X_test)
from sklearn.metrics import r2_score
#Compute the MSE and R2 score on the training set
#The tensors live on the GPU, so move them to the CPU before converting to numpy
torch.mean(( y_train - y_pred_train )**2).item()
r2_score(y_train.detach().cpu().clone() , y_pred_train.detach().cpu().clone())
0.957260735468451
#Compute the MSE and R2 score on the validation set
torch.mean(( y_validation - y_pred_validation )**2).item()
r2_score(y_validation.detach().cpu().clone() , y_pred_validation.detach().cpu().clone())
0.8121053048843863
#Compute the MSE and R2 score on the test set
torch.mean(( y_test - y_pred_test )**2).item()
r2_score(y_test.detach().cpu().clone() , y_pred_test.detach().cpu().clone())
0.864958526237417
plt.scatter(y_pred_test.detach().cpu().clone(), y_test.detach().cpu().clone())
plt.xlabel('Predicted log-solubility in mols/liter')
plt.ylabel('True log-solubility in mols/liter')
plt.title(r'DNN LinearModel predicted vs. true log-solubilities')
plt.show()
def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()
    
plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5,1.5], c="b")
plt.show()
#Use inverse_transform to map predictions back to the original scale
def predict_smiles(smiles):
    #Convert the query SMILES into a fingerprint
    fp = mol2fp(Chem.MolFromSmiles(smiles)).reshape(1,-1)
    #Convert the fingerprint into a torch.tensor (on the GPU, where the model lives)
    fp_tensor = torch.tensor(fp, device=device).float()
    #Run the model on the transformed input
    prediction = cudamodel(fp_tensor)
    #return prediction.cpu().detach().numpy()
    #The targets were standardized during training, so apply inverse_transform to return the prediction in the original range
    logP = scaler.inverse_transform(prediction.cpu().detach().numpy())
    return logP[0][0]
predict_smiles('Cc1ccc2c(N3CCNCC3)cc(F)cc2n1')
-3.508878
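The notebook never saves the trained network; if you wanted to reuse it without retraining, a minimal sketch with torch.save and load_state_dict (the file name qsar_dnn.pt is my own placeholder) might look like:

#Sketch only: persist and restore the trained model; "qsar_dnn.pt" is a placeholder path
torch.save(cudamodel.state_dict(), "qsar_dnn.pt")
restored = cudaMLPModel(input_size, hidden_size, dropout_rate, output_size)
restored.load_state_dict(torch.load("qsar_dnn.pt", map_location=device))
restored.to(device)
restored.eval()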

Hyperparameter Grid Search

Load the data

data = pd.read_csv("delaney-processed.csv")

Data Transform

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
def mol2fp(mol):
    #The usual ECFP hash size is 2048, but it is increased to 4096 here so the fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar
fp = mol2fp(Chem.MolFromSmiles(data.loc[1,"smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)
X = np.stack(data.FPs.values)
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
#Split the data randomly with sklearn's train_test_split
#random_state=42 is fixed; changing it changes which rows land in each split, so keep it in mind
#test_size=0.1 means 10% of the whole dataset is held out for testing and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.10, random_state=42)
#Split once more to carve a validation set out of the training data (10% of the training rows)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train,  test_size=0.1, random_state=42)
#Standardize the targets to Z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
#TensorDataset pairs up each input with its target
#so that torch's DataLoader can serve matching input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
#DataLoader splits the full train and validation sets into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=256,
                                          shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
                                          batch_size=256,
                                          shuffle=True)

Training & Tuning Hyper-parameter

from sklearn.metrics import r2_score
from itertools import product
#Hyperparameter settings
input_size = X_train.size()[-1]     #read the fingerprint size from the data
hidden_size = 1024   #default hidden layer size (overridden by the grid below)
output_size = 1        #a single regression target, so the output size is 1
parameters = dict(
    learning_rates = [0.001,0.0001]
    ,dropout_rates= [0.8,0.5]
    ,epochs=[200,300]
    ,hidden_size=[512,1024]
)
param_values = [v for v in parameters.values()]
param_values
prev_score = 0.0
best_param = {}
for lr, dropout_rate, epochs, hidden_size in product(*param_values):
    grid_model = MLPModel(input_size, hidden_size, dropout_rate, output_size)
    grid_model.cuda()
    criterion3 = nn.MSELoss()
    optimizer3 = torch.optim.Adam(grid_model.parameters(), lr=lr)
    grid_model.train() #put the model in train mode
    for e in range(epochs+1):
        running_loss = 0
        for fps, labels in train_loader:
            # Training pass
            optimizer3.zero_grad() # Initialize the gradients, which will be recorded during the forward pass
            output = grid_model(fps) #Forward pass of the mini-batch
            loss = criterion3(output, labels) #Computing the loss
            loss.backward() # calculate the backward pass
            optimizer3.step() # Optimize the weights
            running_loss += loss.item()
        else:
            if e%100 == 0:
                validation_loss = torch.mean(( y_validation - grid_model(X_validation) )**2).item()
                print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
                #list_val_loss.append(validation_loss)
            if e == epochs:
                score = r2_score(y_validation.detach().cpu(), grid_model(X_validation).detach().cpu())
                print("lr:", lr, "dropout_rate:", dropout_rate, "epochs:", epochs, "validation_loss:", validation_loss, "r2_score:", score)
                if prev_score < score:
                    prev_score = score
                    best_param = {'lr':lr, 'dropout_rate':dropout_rate, 'epochs':epochs, 'hidden_size':hidden_size, 'r2_score':score}
84.4s
print(best_param)
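After the search, the winning configuration would normally be used to retrain a final model; a brief sketch under that assumption, reading the values recorded in best_param (including the hidden_size key stored above):

#Sketch only: retrain a final model with the best hyperparameters found above
final_model = MLPModel(input_size, best_param['hidden_size'], best_param['dropout_rate'], output_size)
final_model.cuda()
final_optimizer = torch.optim.Adam(final_model.parameters(), lr=best_param['lr'])
final_model.train()
for e in range(best_param['epochs'] + 1):
    for fps, labels in train_loader:
        final_optimizer.zero_grad()
        loss = nn.MSELoss()(final_model(fps), labels)
        loss.backward()
        final_optimizer.step()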

Convolutional Neural Network

Load the data

data = pd.read_csv("delaney-processed.csv")
data.head(1)

Data Transform

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
def mol2fp(mol):
    #The usual ECFP hash size is 2048, but it is increased to 4096 here so the fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar
fp = mol2fp(Chem.MolFromSmiles(data.loc[1,"smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)
#np.stack merges the per-row FPs arrays stored in the dataframe into one np.ndarray
#The result is reshaped to (1128, 1, 64, 64), the 4-D (batch, channel, height, width) input a CNN expects
X = np.stack(data.FPs.values)
X = X.reshape(len(X),1,64,-1)
print(X.shape)
print(X)
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
print(y)
print(type(y))
#Split the data randomly with sklearn's train_test_split
#random_state=42 is fixed; changing it changes which rows land in each split, so keep it in mind
#test_size=0.1 means 10% of the whole dataset is held out for testing and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.10, random_state=42)
#Split once more to carve a validation set out of the training data (10% of the training rows)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train,  test_size=0.1, random_state=42)
#Standardize the targets to Z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
print(X_train.shape, X_validation.shape, X_test.shape)
#Fix the random seed
torch.manual_seed(42)
#Set up the device for GPU training
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
#Also fix the GPU random seed when CUDA is available
if device.type == 'cuda':
    torch.cuda.manual_seed_all(42)
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
#TensorDataset pairs up each input with its target
#so that torch's DataLoader can serve matching input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
#DataLoader splits the full train and validation sets into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=256,
                                          shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
                                          batch_size=256,
                                          shuffle=True)
#Build a custom deep learning model
#Layers are added as nn.XXX modules
#This CNN repeats a (Conv2d, ReLU, MaxPool2d) block twice and finishes with fully connected layers
class CNNModel(nn.Module):
    def __init__(self,):
        super(CNNModel, self).__init__()
        #convolution layers
        self.conv1 = nn.Conv2d(1,6,kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(6,16,kernel_size=3, padding=1)
        #activation function
        self.relu = nn.ReLU()
        #pooling layer
        self.maxpool = nn.MaxPool2d(2)
        #dropout layer
        self.dropout1 = nn.Dropout2d(0.2)
        #fully connected layers
        self.fc1 = nn.Linear(16 * 16 * 16, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x):# Forward pass: stacking each layer together
        #input shape (batch_size,1,64,64)
        #after conv1 (batch_size,6,64,64)
        #after pooling (batch_size,6,32,32)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        #after conv2 (batch_size,16,32,32)
        #after pooling (batch_size,16,16,16)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.dropout1(x)
        #flatten the feature maps into one vector per sample
        x = x.view(x.size(0),-1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x
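To confirm the shape bookkeeping in the comments above (a check added here, not in the original), a dummy one-sample batch can be pushed through the two convolution/pooling stages:

#Sketch only: trace tensor shapes through the conv/pool stages on the CPU
probe = CNNModel()
h = torch.zeros(1, 1, 64, 64)                 #one dummy 64x64 "image"
h = probe.maxpool(probe.relu(probe.conv1(h)))
print(h.shape)                                #torch.Size([1, 6, 32, 32])
h = probe.maxpool(probe.relu(probe.conv2(h)))
print(h.shape)                                #torch.Size([1, 16, 16, 16]), so 16*16*16 = 4096 inputs for fc1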
#Hyperparameter settings
#input_size = X_train.size()[-1]     # The input size should fit our fingerprint size
#hidden_size = 1024   # The size of the hidden layer
#dropout_rate = 0.2   # The dropout rate
#output_size = 1        # This is just a single task, so this will be one
#learning_rate = 0.0001  # The learning rate for the optimizer
cnnmodel = CNNModel()
print(cnnmodel)
cnnmodel.cuda()
CNNModel(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu): ReLU()
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout1): Dropout2d(p=0.2, inplace=False)
  (fc1): Linear(in_features=4096, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=1, bias=True)
)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cnnmodel.parameters(), lr=0.0001)

Training

from sklearn.metrics import r2_score
list_epoch = []
list_train_loss = []
list_val_loss = []
list_train_r2 = []
list_val_r2 = []
cnnmodel.train() #Ensure the network is in "train" mode with dropouts active
epochs = 500
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer.zero_grad() # Initialize the gradients, which will be recorded during the forward pass
         
        output = cnnmodel(fps) #Forward pass of the mini-batch
        loss = criterion(output, labels) #Computing the loss
        loss.backward() # calculate the backward pass
        optimizer.step() # Optimize the weights
        running_loss += loss.item()
    else:
        if e%50 == 0:
            validation_loss = torch.mean(( y_validation - cnnmodel(X_validation) )**2).item()
            list_train_r2.append(r2_score(y_train.detach().cpu(), cnnmodel(X_train).detach().cpu()))
            list_val_r2.append(r2_score(y_validation.detach().cpu(), cnnmodel(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
24.2s

Evaluating the CNN model

fig = plt.figure(figsize=(15,5))
# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')
# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_val_r2, marker='x', label='validation_r2 metric')
ax2.plot(list_epoch, list_train_r2, marker='x', label='train_r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')
plt.show()
cnnmodel.eval() #Switch to evaluation mode, where dropout is switched off
y_pred_train = cnnmodel(X_train)
y_pred_validation = cnnmodel(X_validation)
y_pred_test = cnnmodel(X_test)
torch.mean(( y_test - y_pred_test )**2).item()
r2_score(y_test.detach().cpu().clone() , y_pred_test.detach().cpu().clone())
0.8950224503208235
def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()
    
plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5,1.5], c="b")
[<matplotlib.lines.Line2D at 0x7ff7fc6ab850>]

Building a Model with Mol2Vec

!pip install git+https://github.com/samoturk/mol2vec

Load the data

!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv
import pandas as pd
data = pd.read_csv("delaney-processed.csv")
data.head(1)

Data Transform

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
data[["smiles","Molecule"]].head(1)
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
aas = [Chem.MolFromSmiles(x) for x in data["smiles"]]
sentence = mol2alt_sentence(aas[0], 1)
sentence
['864662311', '1535166686', '2245384272', '3153477100', '2976033787', '1916236386', '3189457552', '2667063169', '2976033787', '1286704427', '864674487', '1759589175', '2245384272', '3129492592', '2976033787', '1916236386', '3189457552', '2667063169', '2976033787', '1286704427', '864674487', '199163361', '2245273601', '3147100053', '2245900962', '869152089', '847433064', '2551483158', '3217380708', '3579962709', '3218693969', '951226070', '3218693969', '98513984', '3218693969', '98513984', '3218693969', '98513984', '3218693969', '951226070', '2976033787', '675765711', '864662311', '266675433', '2976033787', '675765711', '864662311', '266675433', '2976033787', '675765711', '864662311', '266675433', '2976033787', '675765711', '864662311', '266675433', '2976033787', '675765711', '864662311', '266675433', '2976033787', '675765711', '864662311', '266675433']
depict_identifier(aas[0], 864662311, 1)
it = IdentifierTable(sentence, [aas[0]]*len(sentence), [sentence]*len(sentence), 5, 1)
it
from gensim.models import word2vec
!wget https://raw.githubusercontent.com/samoturk/mol2vec/master/examples/models/model_300dim.pkl
w2vmodel = word2vec.Word2Vec.load('model_300dim.pkl')
#Number of unique identifiers represented as vectors
len(w2vmodel.wv.vocab.keys())
21003
#Feature vector representing above depicted identifier 2246728737
#w2vmodel.wv.word_vec('2246728737')
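The commented-out lookup above uses an identifier taken from the original mol2vec example, which may not occur in this molecule. A safer illustrative lookup (my sketch) takes the first identifier of the sentence computed earlier:

#Sketch only: fetch the 300-dimensional embedding of one identifier from `sentence`
vec = w2vmodel.wv.word_vec(sentence[0])
print(vec.shape)  #(300,)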
data.head(1)
data['Molecule']
0 <img data-content="rdkit/molecule" src="data:i... 1 <img data-content="rdkit/molecule" src="data:i... 2 <img data-content="rdkit/molecule" src="data:i... 3 <img data-content="rdkit/molecule" src="data:i... 4 <img data-content="rdkit/molecule" src="data:i... ... 1123 <img data-content="rdkit/molecule" src="data:i... 1124 <img data-content="rdkit/molecule" src="data:i... 1125 <img data-content="rdkit/molecule" src="data:i... 1126 <img data-content="rdkit/molecule" src="data:i... 1127 <img data-content="rdkit/molecule" src="data:i... Name: Molecule, Length: 1128, dtype: object
data['sentence'] = data.apply(lambda x: MolSentence(mol2alt_sentence(x['Molecule'], 1)), axis=1)
data['mol2vec'] = [DfVec(x) for x in sentences2vec(data['sentence'], w2vmodel, unseen='UNK')]
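Under the hood, sentences2vec represents each molecule as the sum of its identifiers' Word2Vec embeddings, substituting the 'UNK' vector for unseen identifiers. A small sketch checking that, assuming every identifier of the first molecule is in the vocabulary:

#Sketch only: a molecule's mol2vec vector should equal the sum of its identifier embeddings
manual_sum = np.sum([w2vmodel.wv.word_vec(ident) for ident in data['sentence'][0]], axis=0)
print(np.allclose(manual_sum, data['mol2vec'][0].vec))  #True when no identifier was unseen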
data.head(1)
X = np.array([x.vec for x in data['mol2vec']])
y = data['measured log solubility in mols per litre'].values.reshape((-1,1))
#Split the data randomly with sklearn's train_test_split
#random_state=42 is fixed; changing it changes which rows land in each split, so keep it in mind
#test_size=0.1 means 10% of the whole dataset is held out for testing and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.10, random_state=42)
#Split once more to carve a validation set out of the training data (10% of the training rows)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train,  test_size=0.1, random_state=42)
#Standardize the targets to Z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
tensor([[  0.6398,   1.5559,  -1.5355,  ...,  -2.2593,  -3.9086,   0.4851],
        [ -1.5316,  -7.9163,  -6.0273,  ...,  -2.4850, -12.4975,   3.9382],
        [  0.9494,  -0.9584,  -0.5601,  ...,  -4.1352,  -6.4839,  -1.8985],
        ...,
        [ -1.5671,  -0.0286,  -0.9324,  ...,   1.1181,  -1.1595,  -1.1114],
        [ -1.1879,  -6.7705,  -6.3758,  ...,  -3.3778, -14.5161,  -0.0831],
        [  1.5110,  -3.0176,  -2.8297,  ...,  -4.9359,  -8.8553,  -2.0444]],
       device='cuda:0')
print(X_train.shape, y_train.shape)
#TensorDataset pairs up each input with its target
#so that torch's DataLoader can serve matching input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
#DataLoader splits the full train and validation sets into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=256,
                                          shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
                                          batch_size=256,
                                          shuffle=False)
#Hyperparameter settings
input_size = X_train.size()[-1]     # The input size should fit our fingerprint size
hidden_size = 1024   # The size of the hidden layer
dropout_rate = 0.2   # The dropout rate
output_size = 1        # This is just a single task, so this will be one
learning_rate = 0.0001  # The learning rate for the optimizer
vec_dnn_model = MLPModel(input_size, hidden_size, dropout_rate, output_size)
vec_dnn_model.cuda()
MLPModel(
  (linear1): Linear(in_features=300, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=1024, bias=True)
  (linear3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc_out): Linear(in_features=1024, out_features=1, bias=True)
  (ln1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (ln2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (ln3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (activation): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(vec_dnn_model.parameters(), lr=learning_rate)
from sklearn.metrics import r2_score
list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []
vec_dnn_model.train() #Ensure the network is in "train" mode with dropouts active
epochs = 1000
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer.zero_grad() # Initialize the gradients, which will be recorded during the forward pass
         
        output = vec_dnn_model(fps) #Forward pass of the mini-batch
        loss = criterion(output, labels) #Computing the loss
        loss.backward() # calculate the backward pass
        optimizer.step() # Optimize the weights
        running_loss += loss.item()
    else:
        if e%100 == 0:
            validation_loss = torch.mean(( y_validation - vec_dnn_model(X_validation) )**2).item()
            list_r2.append(r2_score(y_validation.detach().cpu(), vec_dnn_model(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
20.9s
fig = plt.figure(figsize=(15,5))
# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')
# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')
plt.show()
#Compute the MSE and R2 score on the test set
#(the predictions must be recomputed with the mol2vec model first; the y_pred_* variables still held the CNN outputs)
vec_dnn_model.eval()
y_pred_train = vec_dnn_model(X_train)
y_pred_validation = vec_dnn_model(X_validation)
y_pred_test = vec_dnn_model(X_test)
torch.mean(( y_test - y_pred_test )**2).item()
r2_score(y_test.detach().cpu().clone() , y_pred_test.detach().cpu().clone())
def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()
    
plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5,1.5], c="b")
[<matplotlib.lines.Line2D at 0x7ff7c54b2850>]