Building a QSAR Model with PyTorch (Prediction)
!python --version
1.0s
Python
import deepchem as dc
dc.__version__
0.0s
Python
'2.3.0'
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np
0.0s
Python
Building a QSAR model with PyTorch
import torch
torch.__version__
0.0s
Python
'1.6.0'
Load the data
!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv
1.2s
Python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
data = pd.read_csv("delaney-processed.csv")
data.head(1)
0.1s
Python
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
data[["smiles","Molecule"]].head(1)
0.2s
Python
from math import sqrt
print(sqrt(4096))
# https://www.rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html
def mol2fp(mol):
    # The default ECFP hash size is 2048 bits; we widen it to 4096 so the
    # fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar
fp = mol2fp(Chem.MolFromSmiles(data.loc[1, "smiles"]))
plt.matshow(fp.reshape((64, -1)) > 0)
0.3s
Python
<matplotlib.image.AxesImage at 0x7f1b8d204850>
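Side note: GetHashedMorganFingerprint returns substructure counts, which are then squeezed into an int8 array. If only on/off bit information is needed, RDKit's GetMorganFingerprintAsBitVect yields a binary fingerprint directly. The sketch below is an illustrative alternative, not part of the original notebook; mol2fp_binary is a hypothetical helper name.
def mol2fp_binary(mol, n_bits=4096):
    # GetMorganFingerprintAsBitVect returns a 0/1 ExplicitBitVect directly
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    ar = np.zeros((n_bits,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar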
data["FPs"] = data.Molecule.apply(mol2fp)
0.2s
Python
data.head(1)
0.1s
Python
Data Transform
# The FPs column stores one array per row; np.stack combines them into a single np.ndarray
X = np.stack(data.FPs.values)
print(X.shape)
print(X)
print(type(X))
0.4s
Python
print(type(data["measured log solubility in mols per litre"]))
0.2s
Python
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
print(y)
print(type(y))
0.3s
Python
# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% to training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
# Split once more to carve a validation set out of the training data: 10% of train becomes validation
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
# Standardize y to z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
0.0s
Python
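Only y is standardized here; the fingerprint features are already binary. A quick sanity check of the fitted scaler (an illustrative snippet, not from the original run):
print("fitted mean/std of y:", scaler.mean_[0], scaler.scale_[0])
print("scaled y_train mean/std:", y_train.mean(), y_train.std())  # should be ~0 and ~1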
# Train on the CPU
X_train = torch.tensor(X_train).float()
X_test = torch.tensor(X_test).float()
X_validation = torch.tensor(X_validation).float()
y_train = torch.tensor(y_train).float()
y_test = torch.tensor(y_test).float()
y_validation = torch.tensor(y_validation).float()
X_train
0.0s
Python
tensor([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
X_train.shape, X_validation.shape, X_test.shape
0.0s
Python
(torch.Size([913, 4096]), torch.Size([102, 4096]), torch.Size([113, 4096]))
y_train.shape
0.0s
Python
torch.Size([913, 1])
# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/output pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
0.0s
Python
# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=256,
shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
batch_size=256,
shuffle=True)
0.0s
Python
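To confirm the loader serves (input, target) pairs of the expected shape, one can pull a single batch (an illustrative check, not in the original notebook):
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)  # expected: torch.Size([256, 4096]) torch.Size([256, 1])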
DNN Model
# Build your own deep-learning model
# Layers are added as nn.XXX modules
# This example stacks (Linear, ReLU, Dropout) as one block, repeated three times
# Only plain Linear layers are stacked, with Dropout for regularization
class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(MLPModel, self).__init__()
        # Three fully connected Linear layers
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)  # Output layer
        # ReLU activation function
        self.activation = nn.ReLU()
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_rate)

    # How the layers are applied during the forward pass
    def forward(self, x):
        out = self.linear1(x)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.linear2(out)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.linear3(out)
        out = self.activation(out)
        out = self.dropout(out)
        # Final output layer (size 1)
        out = self.fc_out(out)
        return out
0.0s
Python
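The same (Linear, ReLU, Dropout) x3 architecture could also be expressed more compactly with nn.Sequential; a minimal equivalent sketch (make_mlp is a hypothetical helper, not used elsewhere in the notebook):
def make_mlp(input_size, hidden_size, dropout_rate, out_size):
    block = lambda n_in, n_out: [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(dropout_rate)]
    return nn.Sequential(
        *block(input_size, hidden_size),
        *block(hidden_size, hidden_size),
        *block(hidden_size, hidden_size),
        nn.Linear(hidden_size, out_size),  # final output layer
    )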
X_train.size()
0.0s
Python
torch.Size([913, 4096])
# Hyperparameter settings
input_size = X_train.size()[-1]  # input feature size (the fingerprint length)
hidden_size = 1024  # The size of the hidden layer
dropout_rate = 0.8  # The dropout rate
output_size = 1  # This is just a single task, so this will be one
learning_rate = 0.001  # The learning rate for the optimizer
model = MLPModel(input_size, hidden_size, dropout_rate, output_size)
model
0.1s
Python
MLPModel(
(linear1): Linear(in_features=4096, out_features=1024, bias=True)
(linear2): Linear(in_features=1024, out_features=1024, bias=True)
(linear3): Linear(in_features=1024, out_features=1024, bias=True)
(fc_out): Linear(in_features=1024, out_features=1, bias=True)
(activation): ReLU()
(dropout): Dropout(p=0.8, inplace=False)
)
from IPython.display import Image
Image(url='https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fk5Ch3%2FbtqDL4jjXl1%2F93WikBjXpxJ0e7kYy8c8SK%2Fimg.gif')
0.0s
Python
# Set up the loss function and optimizer
# Loss function: Mean Squared Error
# Optimizer: Adam
# PyTorch loss functions: https://pytorch.org/docs/stable/_modules/torch/nn/modules/loss.html
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
0.0s
Python
Training
# CPU mode
from sklearn.metrics import r2_score
import timeit
start_time = timeit.default_timer()
list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []
epochs = 200
for e in range(epochs):
    running_loss = 0
    for X, y in train_loader:
        model.train()  # switch to training mode (dropout active)
        optimizer.zero_grad()  # reset the gradients to zero
        output = model(X)  # forward pass on the mini-batch
        loss = criterion(output, y)  # compute the loss with the loss function
        loss.backward()  # backward pass: compute gradients for the parameters the optimizer manages
        optimizer.step()  # update the parameters using the gradients
        running_loss += loss.item()
    else:
        if e % 50 == 0:
            # MSE on the validation set
            validation_loss = torch.mean((y_validation - model(X_validation))**2)
            list_r2.append(r2_score(y_validation.detach(), model(X_validation).detach()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
terminate_time = timeit.default_timer()
print("Took %f seconds." % (terminate_time - start_time))
62.9s
Python
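Note that the validation loss above is computed while the model is still in train mode, so dropout remains active and the reported numbers are noisier than a true evaluation. A common alternative (a sketch, not the code used above) switches modes and disables gradient tracking:
model.eval()  # deactivate dropout for evaluation
with torch.no_grad():  # no gradients needed for validation
    val_loss = criterion(model(X_validation), y_validation).item()
model.train()  # switch back before the next training epoch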
Evaluating the DNN model
fig = plt.figure(figsize=(15,5))
# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')
# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.set_ylim(0.6, 1.0)
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')
plt.show()
0.8s
Python
# Switch to evaluation mode
model.eval()
y_pred_train = model(X_train)
y_pred_validation = model(X_validation)
y_pred_test = model(X_test)
0.1s
Python
# RMSE and R2 score on the test set
print("RMSE: {0:.3f}".format(torch.sqrt(torch.mean(( y_test - y_pred_test )**2)).detach()))
print("r2_score: {0:.3f}".format(r2_score(y_test.detach() , y_pred_test.detach())))
0.3s
Python
plt.scatter(y_pred_test.detach(), y_test.detach())
plt.xlabel('Predicted log-solubility in mols/liter')
plt.ylabel('True log-solubility in mols/liter')
plt.title(r'DNN LinearModel predicted vs. true log-solubilities')
plt.show()
0.5s
Python
Using the GPU with PyTorch
!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv
1.0s
Python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
0.0s
Python
Load the data
data = pd.read_csv("delaney-processed.csv")
0.0s
Python
Data Transform
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
def mol2fp(mol):
    # The default ECFP hash size is 2048 bits; widened to 4096 for the 64x64 view
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar
fp = mol2fp(Chem.MolFromSmiles(data.loc[1, "smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)
0.3s
Python
data.head(1)
0.1s
Python
X = np.stack(data.FPs.values)
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
0.0s
Python
# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% to training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
# Split once more to carve a validation set out of the training data: 10% of train becomes validation
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
# Standardize y to z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
0.0s
Python
# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
0.3s
Python
tensor([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]], device='cuda:0')
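Moving every tensor to the GPU up front works for a dataset of this size (~1,100 molecules). For larger datasets, a common pattern (sketched below under that assumption) keeps the tensors on the CPU and transfers one mini-batch at a time inside the training loop:
for fps, labels in train_loader:
    # transfer only the current mini-batch to the GPU
    fps, labels = fps.to(device), labels.to(device)
    output = net(fps)  # forward pass as usual (net stands for whichever model is being trained)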
GPU DNN Model
# Build your own deep-learning model
# Layers are added as nn.XXX modules
# Same (Linear, ReLU, Dropout) block repeated three times; the LayerNorm calls are left commented out
# Plain Linear layers with Dropout for regularization
class cudaMLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(cudaMLPModel, self).__init__()
        # Three fully connected Linear layers
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)  # Output layer
        # ReLU activation function
        self.activation = nn.ReLU()
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_rate)

    # How the layers are applied during the forward pass
    def forward(self, x):
        out = self.linear1(x)
        #out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)
        # everything up to here can be regarded as one block
        out = self.linear2(out)
        #out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.linear3(out)
        #out = self.ln3(out)
        out = self.activation(out)
        out = self.dropout(out)
        # Final output layer
        out = self.fc_out(out)
        return out
0.0s
Python
# Hyperparameter settings
input_size = X_train.size()[-1] # The input size should fit our fingerprint size
hidden_size = 1024 # The size of the hidden layer
dropout_rate = 0.8 # The dropout rate
output_size = 1 # This is just a single task, so this will be one
learning_rate = 0.0001 # The learning rate for the optimizer
cudamodel = cudaMLPModel(input_size, hidden_size, dropout_rate, output_size)
0.1s
Python
# Move the model to the GPU
cudamodel.cuda()
0.0s
Python
cudaMLPModel(
(linear1): Linear(in_features=4096, out_features=1024, bias=True)
(linear2): Linear(in_features=1024, out_features=1024, bias=True)
(linear3): Linear(in_features=1024, out_features=1024, bias=True)
(fc_out): Linear(in_features=1024, out_features=1, bias=True)
(activation): ReLU()
(dropout): Dropout(p=0.8, inplace=False)
)
# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/output pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
0.0s
Python
# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=256,
shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
batch_size=256,
shuffle=True)
0.0s
Python
# Set up the loss function and optimizer
criterion2 = nn.MSELoss()
#weight_decay=0
optimizer2 = torch.optim.Adam(cudamodel.parameters(), lr=learning_rate)
0.0s
Python
Training
# GPU mode
from sklearn.metrics import r2_score
import timeit
start_time = timeit.default_timer()
list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []
cudamodel.train()
epochs = 200
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer2.zero_grad()  # reset the gradients to zero
        output = cudamodel(fps)  # forward pass of the mini-batch
        loss = criterion2(output, labels)  # compute the loss
        loss.backward()  # backward pass
        optimizer2.step()  # optimize the weights
        running_loss += loss.item()
    else:
        if e % 50 == 0:
            validation_loss = torch.mean(( y_validation - cudamodel(X_validation) )**2).item()
            list_r2.append(r2_score(y_validation.detach().cpu(), cudamodel(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
terminate_time = timeit.default_timer()
print("Took %f seconds." % (terminate_time - start_time))
4.7s
Python
Evaluating the GPU DNN model
fig = plt.figure(figsize=(15,5))
# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')
# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.set_ylim(0, 1.0)
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')
plt.show()
0.9s
Python
# Switch to evaluation mode
cudamodel.eval()
y_pred_train = cudamodel(X_train)
y_pred_validation = cudamodel(X_validation)
y_pred_test = cudamodel(X_test)
0.0s
Python
# RMSE and R2 score on the test set
print("RMSE: {0:.3f}".format(torch.sqrt(torch.mean(( y_test - y_pred_test )**2)).item()))
print("r2_score: {0:.3f}".format(r2_score(y_test.detach().cpu().clone() , y_pred_test.detach().cpu().clone())))
0.2s
Python
plt.scatter(y_pred_test.detach().cpu().clone(), y_test.detach().cpu().clone())
plt.xlabel('Predicted log-solubility in mols/liter')
plt.ylabel('True log-solubility in mols/liter')
plt.title(r'DNN LinearModel predicted vs. true log-solubilities')
plt.show()
0.5s
Python
def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()
plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5,1.5], c="b")
plt.show()
0.7s
Python
# Use inverse_transform to recover predictions on the original value scale
def predict_smiles(smiles):
    # Convert the query SMILES into a fingerprint
    fp = mol2fp(Chem.MolFromSmiles(smiles)).reshape(1, -1)
    # Turn the fingerprint into a torch.tensor (placed on the GPU when one is in use)
    fp_tensor = torch.tensor(fp, device=device).float()
    # Feed the transformed input to the model
    prediction = cudamodel(fp_tensor)
    #return prediction.cpu().detach().numpy()
    # y was transformed during training, so inverse_transform rescales the prediction back to the original range
    logP = scaler.inverse_transform(prediction.cpu().detach().numpy())
    return logP[0][0]
predict_smiles('Cc1ccc2c(N3CCNCC3)cc(F)cc2n1')
0.0s
Python
-3.4839573
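predict_smiles can be called on any valid SMILES string; for example (illustrative inputs, not from the original run):
for smi in ['c1ccccc1', 'CCO']:  # benzene, ethanol
    print(smi, predict_smiles(smi))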
Hyperparameter Grid Search
Load the data
data = pd.read_csv("delaney-processed.csv")
0.0s
Python
Data Transform
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
def mol2fp(mol):
    # The default ECFP hash size is 2048 bits; widened to 4096 for the 64x64 view
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar
fp = mol2fp(Chem.MolFromSmiles(data.loc[1, "smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)
0.3s
Python
X = np.stack(data.FPs.values)
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
0.0s
Python
# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% to training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
# Split once more to carve a validation set out of the training data: 10% of train becomes validation
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
# Standardize y to z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
0.0s
Python
# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
0.4s
Python
tensor([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]], device='cuda:0')
# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/output pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=256,
shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
batch_size=256,
shuffle=True)
0.0s
Python
Training & Hyperparameter Tuning
from sklearn.metrics import r2_score
from itertools import product
# Hyperparameter settings
input_size = X_train.size()[-1]  # the fingerprint length
hidden_size = 1024  # hidden layer size
output_size = 1  # a single value is predicted, so the output size is 1
parameters = dict(
    learning_rates=[0.001, 0.0001],
    dropout_rates=[0.8, 0.5],
    epochs=[200, 300],
    hidden_size=[512, 1024],
)
param_values = [v for v in parameters.values()]
param_values
prev_score = 0.0
best_param = []
for lr, dropout_rate, epochs, hidden_size in product(*param_values):
    grid_model = MLPModel(input_size, hidden_size, dropout_rate, output_size)
    grid_model.cuda()
    criterion3 = nn.MSELoss()
    optimizer3 = torch.optim.Adam(grid_model.parameters(), lr=lr)
    grid_model.train()  # set the model to train mode
    for e in range(epochs + 1):
        running_loss = 0
        for fps, labels in train_loader:
            # Training pass
            optimizer3.zero_grad()  # reset the gradients to zero
            output = grid_model(fps)  # forward pass of the mini-batch
            loss = criterion3(output, labels)  # compute the loss
            loss.backward()  # backward pass
            optimizer3.step()  # optimize the weights
            running_loss += loss.item()
        else:
            if e % 100 == 0:
                validation_loss = torch.mean(( y_validation - grid_model(X_validation) )**2).item()
                print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
                #list_val_loss.append(validation_loss)
            if e == epochs:
                score = r2_score(y_validation.detach().cpu(), grid_model(X_validation).detach().cpu())
                print("lr:", lr, "dropout_rate:", dropout_rate, "epochs:", epochs, "validation_loss:", validation_loss, "r2_score:", score)
                if prev_score < score:
                    prev_score = score
                    # also record hidden_size, which the original dict omitted
                    best_param = {'lr': lr, 'dropout_rate': dropout_rate, 'epochs': epochs, 'hidden_size': hidden_size, 'r2_score': score}
91.6s
Python
print(best_param)
0.4s
Python
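best_param records only the winning settings, not the trained weights. One way to persist the winner (a sketch; the file name is hypothetical) is to rebuild the model with those settings, retrain it as above, and save its state_dict with PyTorch's standard save/load calls:
best_model = MLPModel(input_size, best_param['hidden_size'],
                      best_param['dropout_rate'], output_size)
# ...retrain best_model with best_param['lr'] and best_param['epochs'] as in the loop above...
torch.save(best_model.state_dict(), "qsar_mlp_best.pt")
# later, to reload:
best_model.load_state_dict(torch.load("qsar_mlp_best.pt"))
best_model.eval()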
Convolutional Neural Network
Load the data
data = pd.read_csv("delaney-processed.csv")
data.head(1)
0.1s
Python
Data Transform
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
def mol2fp(mol):
    # The default ECFP hash size is 2048 bits; widened to 4096 for the 64x64 view
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar
fp = mol2fp(Chem.MolFromSmiles(data.loc[1, "smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)
0.3s
Python
# np.stack combines the per-row FPs arrays into one np.ndarray
# then reshape to (1128, 1, 64, 64), the 4-D layout a CNN expects as input
X = np.stack(data.FPs.values)
X = X.reshape(len(X),1,64,-1)
print(X.shape)
print(X)
0.3s
Python
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
print(y)
print(type(y))
0.4s
Python
# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% to training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
# Split once more to carve a validation set out of the training data: 10% of train becomes validation
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
# Standardize y to z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
0.0s
Python
print(X_train.shape, X_validation.shape, X_test.shape)
0.3s
Python
# Fix the random seed
torch.manual_seed(42)
# Train on the GPU; define the device first, then seed CUDA as well when it is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
if device.type == 'cuda':
    torch.cuda.manual_seed_all(42)
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
0.7s
Python
# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/output pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
0.0s
Python
# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=256,
shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
batch_size=256,
shuffle=True)
0.0s
Python
# Build your own deep-learning model
# Layers are added as nn.XXX modules
# This CNN stacks (Conv2d, ReLU, MaxPool2d) as one block, repeated twice, followed by fully connected layers
# Dropout is used for regularization
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        # convolutional layers
        self.conv1 = nn.Conv2d(1, 6, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=3, padding=1)
        # activation function
        self.relu = nn.ReLU()
        # pooling layer
        self.maxpool = nn.MaxPool2d(2)
        # dropout layer
        self.dropout1 = nn.Dropout2d(0.2)
        # fully connected layers
        self.fc1 = nn.Linear(16 * 16 * 16, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x):  # Forward pass: stacking each layer together
        # input shape (batch_size, 1, 64, 64)
        # conv1 -> (batch_size, 6, 64, 64)
        # pooling -> (batch_size, 6, 32, 32)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        # conv2 -> (batch_size, 16, 32, 32)
        # pooling -> (batch_size, 16, 16, 16)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.dropout1(x)
        # flatten to one vector per sample
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x
0.0s
Python
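The fc1 input size of 16 * 16 * 16 = 4096 follows from the two pooling steps (64 -> 32 -> 16, with 16 channels after conv2). A dummy forward pass verifies the shapes (an illustrative check on the CPU, not from the original run):
dummy = torch.zeros(2, 1, 64, 64)  # a fake batch of two 1x64x64 fingerprint images
print(CNNModel()(dummy).shape)  # expected: torch.Size([2, 1])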
# Hyperparameter settings
#input_size = X_train.size()[-1] # The input size should fit our fingerprint size
#hidden_size = 1024 # The size of the hidden layer
#dropout_rate = 0.2 # The dropout rate
#output_size = 1 # This is just a single task, so this will be one
learning_rate = 0.0001 # The learning rate for the optimizer
cnnmodel = CNNModel()
print(cnnmodel)
0.4s
Python
cnnmodel.cuda()
0.0s
Python
CNNModel(
(conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(relu): ReLU()
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(dropout1): Dropout2d(p=0.2, inplace=False)
(fc1): Linear(in_features=4096, out_features=1024, bias=True)
(fc2): Linear(in_features=1024, out_features=256, bias=True)
(fc3): Linear(in_features=256, out_features=1, bias=True)
)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cnnmodel.parameters(), lr=learning_rate)
0.0s
Python
Training
from sklearn.metrics import r2_score
list_epoch = []
list_train_loss = []
list_val_loss = []
list_train_r2 = []
list_val_r2 = []
cnnmodel.train() #Ensure the network is in "train" mode with dropouts active
epochs = 200
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer.zero_grad()  # reset the gradients to zero
        output = cnnmodel(fps)  # forward pass of the mini-batch
        loss = criterion(output, labels)  # compute the loss
        loss.backward()  # backward pass
        optimizer.step()  # optimize the weights
        running_loss += loss.item()
    else:
        if e % 50 == 0:
            validation_loss = torch.mean(( y_validation - cnnmodel(X_validation) )**2).item()
            list_train_r2.append(r2_score(y_train.detach().cpu(), cnnmodel(X_train).detach().cpu()))
            list_val_r2.append(r2_score(y_validation.detach().cpu(), cnnmodel(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
10.2s
Python
Evaluating the CNN model
fig = plt.figure(figsize=(15,5))
# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')
# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_val_r2, marker='x', label='validation_r2 metric')
ax2.plot(list_epoch, list_train_r2, marker='x', label='train_r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')
plt.show()
0.8s
Python
cnnmodel.eval() # Switch to evaluation mode, where dropout is switched off
y_pred_train = cnnmodel(X_train)
y_pred_validation = cnnmodel(X_validation)
y_pred_test = cnnmodel(X_test)
0.0s
Python
# RMSE and R2 score on the test set
print("RMSE: {0:.3f}".format(torch.sqrt(torch.mean(( y_test - y_pred_test )**2)).item()))
print("r2_score: {0:.3f}".format(r2_score(y_test.detach().cpu().clone() , y_pred_test.detach().cpu().clone())))
0.3s
Python
def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()
plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5,1.5], c="b")
0.5s
Python
[<matplotlib.lines.Line2D at 0x7f1ba12fbd50>]
Building a Model with Mol2Vec
!pip install git+https://github.com/samoturk/mol2vec
9.8s
Python
Load the data
!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv
1.1s
Python
import pandas as pd
data = pd.read_csv("delaney-processed.csv")
data.head(1)
0.0s
Python
Data Transform
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
PandasTools.AddMoleculeColumnToFrame(data,'smiles','Molecule')
data[["smiles","Molecule"]].head(1)
0.2s
Python
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
0.0s
Python
aas = [Chem.MolFromSmiles(x) for x in data["smiles"]]
0.1s
Python
sentence = mol2alt_sentence(aas[0], 1)
sentence
0.0s
Python
['864662311',
'1535166686',
'2245384272',
'3153477100',
'2976033787',
'1916236386',
'3189457552',
'2667063169',
'2976033787',
'1286704427',
'864674487',
'1759589175',
'2245384272',
'3129492592',
'2976033787',
'1916236386',
'3189457552',
'2667063169',
'2976033787',
'1286704427',
'864674487',
'199163361',
'2245273601',
'3147100053',
'2245900962',
'869152089',
'847433064',
'2551483158',
'3217380708',
'3579962709',
'3218693969',
'951226070',
'3218693969',
'98513984',
'3218693969',
'98513984',
'3218693969',
'98513984',
'3218693969',
'951226070',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433']
depict_identifier(aas[0], 864662311, 1)
0.1s
Python
it = IdentifierTable(sentence, [aas[0]]*len(sentence), [sentence]*len(sentence), 5, 1)
it
0.6s
Python
from gensim.models import word2vec
0.0s
Python
!wget https://raw.githubusercontent.com/samoturk/mol2vec/master/examples/models/model_300dim.pkl
2.8s
Python
w2vmodel = word2vec.Word2Vec.load('model_300dim.pkl')
0.9s
Python
#Number of unique identifiers represented as vectors
len(w2vmodel.wv.vocab.keys())
0.0s
Python
21003
#Feature vector representing a single identifier, e.g. the one depicted above
#w2vmodel.wv.word_vec('864662311')
0.0s
Python
data.head(1)
0.1s
Python
data['Molecule']
data['sentence'] = data.apply(lambda x: MolSentence(mol2alt_sentence(x['Molecule'], 1)), axis=1)
0.8s
Python
data['mol2vec'] = [DfVec(x) for x in sentences2vec(data['sentence'], w2vmodel, unseen='UNK')]
0.2s
Python
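sentences2vec sums the 300-dimensional word vectors of each molecule's substructure identifiers, so every molecule ends up as a single 300-dimensional vector. A quick check (illustrative, not from the original run):
print(data['mol2vec'][0].vec.shape)  # expected: (300,)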
data.head(1)
0.1s
Python
X = np.array([x.vec for x in data['mol2vec']])
y = data['measured log solubility in mols per litre'].values.reshape((-1,1))
0.0s
Python
# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% to training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
# Split once more to carve a validation set out of the training data: 10% of train becomes validation
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
# Standardize y to z-scores (mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
0.0s
Python
# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
0.4s
Python
tensor([[ 0.6398, 1.5559, -1.5355, ..., -2.2593, -3.9086, 0.4851],
[ -1.5316, -7.9163, -6.0273, ..., -2.4850, -12.4975, 3.9382],
[ 0.9494, -0.9584, -0.5601, ..., -4.1352, -6.4839, -1.8985],
...,
[ -1.5671, -0.0286, -0.9324, ..., 1.1181, -1.1595, -1.1114],
[ -1.1879, -6.7705, -6.3758, ..., -3.3778, -14.5161, -0.0831],
[ 1.5110, -3.0176, -2.8297, ..., -4.9359, -8.8553, -2.0444]],
device='cuda:0')
print(X_train.shape, y_train.shape)
0.3s
Python
# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/output pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
0.0s
Python
# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=256,
shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
batch_size=256,
shuffle=False)
0.0s
Python
# Hyperparameter settings
input_size = X_train.size()[-1] # The input size should fit our fingerprint size
hidden_size = 1024 # The size of the hidden layer
dropout_rate = 0.8 # The dropout rate
output_size = 1 # This is just a single task, so this will be one
learning_rate = 0.0001 # The learning rate for the optimizer
vec_dnn_model = MLPModel(input_size, hidden_size, dropout_rate, output_size)
0.0s
Python
vec_dnn_model.cuda()
0.0s
Python
MLPModel(
(linear1): Linear(in_features=300, out_features=1024, bias=True)
(linear2): Linear(in_features=1024, out_features=1024, bias=True)
(linear3): Linear(in_features=1024, out_features=1024, bias=True)
(fc_out): Linear(in_features=1024, out_features=1, bias=True)
(activation): ReLU()
(dropout): Dropout(p=0.8, inplace=False)
)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(vec_dnn_model.parameters(), lr=learning_rate)
0.0s
Python
Training
from sklearn.metrics import r2_score
list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []
vec_dnn_model.train() #Ensure the network is in "train" mode with dropouts active
epochs = 400
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer.zero_grad()  # reset the gradients to zero
        output = vec_dnn_model(fps)  # forward pass of the mini-batch
        loss = criterion(output, labels)  # compute the loss
        loss.backward()  # backward pass
        optimizer.step()  # optimize the weights
        running_loss += loss.item()
    else:
        if e % 100 == 0:
            validation_loss = torch.mean(( y_validation - vec_dnn_model(X_validation) )**2).item()
            list_r2.append(r2_score(y_validation.detach().cpu(), vec_dnn_model(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"%(e,(running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
9.0s
Python
fig = plt.figure(figsize=(15,5))
# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')
# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')
plt.show()
0.4s
Python
# Switch to evaluation mode and recompute predictions with the Mol2Vec model
# (without this, y_pred_test would still hold the CNN section's predictions)
vec_dnn_model.eval()
y_pred_train = vec_dnn_model(X_train)
y_pred_validation = vec_dnn_model(X_validation)
y_pred_test = vec_dnn_model(X_test)
# RMSE and R2 score on the test set
print("RMSE: {0:.3f}".format(torch.sqrt(torch.mean(( y_test - y_pred_test )**2)).item()))
print("r2_score: {0:.3f}".format(r2_score(y_test.detach().cpu().clone() , y_pred_test.detach().cpu().clone())))
0.3s
Python
def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()
plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5,1.5], c="b")
0.2s
Python
[<matplotlib.lines.Line2D at 0x7f1ba1374b50>]