Building a QSAR Model with PyTorch (Prediction)
!python --version

import deepchem as dc
dc.__version__
'2.3.0'
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np

Building a QSAR Model with PyTorch

import torch
torch.__version__
'1.6.0'
Load the data

!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

data = pd.read_csv("delaney-processed.csv")
data.head(1)

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem

PandasTools.AddMoleculeColumnToFrame(data, 'smiles', 'Molecule')
data[["smiles", "Molecule"]].head(1)

from math import sqrt
print(sqrt(4096))

# https://www.rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html
def mol2fp(mol):
    # The usual ECFP hash size is 2048, but it is enlarged here so the fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

fp = mol2fp(Chem.MolFromSmiles(data.loc[1, "smiles"]))
plt.matshow(fp.reshape((64, -1)) > 0)
<matplotlib.image.AxesImage at 0x7ff79dc9fa50>
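For reference, QSAR work more often uses a plain 2048-bit binary Morgan fingerprint than the 4096-bit hashed count version above; a minimal sketch of that variant (mol2fp_binary is not part of the original notebook):

def mol2fp_binary(mol, n_bits=2048):
    # binary (presence/absence) Morgan fingerprint of radius 2, the usual ECFP4-style setup
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)  # resizes ar to n_bits entries of 0/1
    return ar

mol2fp_binary(Chem.MolFromSmiles(data.loc[1, "smiles"])).shape  # expected: (2048,)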
data["FPs"] = data.Molecule.apply(mol2fp)

data.head(1)

Data Transform

# The FPs values stored row by row in the dataframe are stacked into a single np.ndarray with np.stack
X = np.stack(data.FPs.values)
print(X.shape)
print(X)
print(type(X))

print(type(data["measured log solubility in mols per litre"]))

y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
print(y)
print(type(y))

# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Split once more to create a validation set: 10% of the training data becomes validation data
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Standardize the targets (Z-score: mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)

# Train on the CPU
X_train = torch.tensor(X_train).float()
X_test = torch.tensor(X_test).float()
X_validation = torch.tensor(X_validation).float()
y_train = torch.tensor(y_train).float()
y_test = torch.tensor(y_test).float()
y_validation = torch.tensor(y_validation).float()
X_train
tensor([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
X_train.shape, X_validation.shape, X_test.shape
(torch.Size([913, 4096]), torch.Size([102, 4096]), torch.Size([113, 4096]))
y_train.shape
torch.Size([913, 1])
# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)

# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset, batch_size=256, shuffle=True)
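To double-check what the loader yields, one batch can be pulled and inspected (a quick sanity check that is not in the original notebook; xb and yb are arbitrary names):

xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)  # expected: torch.Size([256, 4096]) torch.Size([256, 1])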
DNN Model
# Building a custom deep learning model
# Layers can be added with nn.XXX
# This example repeats a (Linear, LayerNorm, ReLU, Dropout) block three times
# Plain Linear layers are stacked, with LayerNorm and Dropout available for better generalization
class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(MLPModel, self).__init__()
        # Three fully connected Linear layers
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)  # Output layer
        # LayerNorm layers to speed up training
        #self.ln1 = nn.LayerNorm(hidden_size)
        #self.ln2 = nn.LayerNorm(hidden_size)
        #self.ln3 = nn.LayerNorm(hidden_size)
        # ReLU activation function
        self.activation = nn.ReLU()
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_rate)

    # How the layers are applied at run time
    def forward(self, x):
        out = self.linear1(x)
        #out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)
        # Everything up to here can be regarded as one block
        out = self.linear2(out)
        #out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.linear3(out)
        #out = self.ln3(out)
        out = self.activation(out)
        out = self.dropout(out)
        # Final output layer
        out = self.fc_out(out)
        return out

# Hyperparameter settings
input_size = X_train.size()[-1]  # The input size should fit our fingerprint size
hidden_size = 1024               # The size of the hidden layer
dropout_rate = 0.8               # The dropout rate
output_size = 1                  # This is just a single task, so this will be one
learning_rate = 0.001            # The learning rate for the optimizer

model = MLPModel(input_size, hidden_size, dropout_rate, output_size)
model
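As a rough check of the model's capacity (not part of the original notebook), the trainable parameters can be counted; with hidden_size=1024 and a 4096-bit input this should come to roughly 6.3 million weights:

sum(p.numel() for p in model.parameters() if p.requires_grad)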

from IPython.display import Image
Image(url='https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fk5Ch3%2FbtqDL4jjXl1%2F93WikBjXpxJ0e7kYy8c8SK%2Fimg.gif')

# Set up the loss function and optimizer
# Loss function: Mean Squared Error
# Optimizer: Adam
# PyTorch loss functions: https://pytorch.org/docs/stable/_modules/torch/nn/modules/loss.html
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Training

# Speed difference between running in CPU mode and GPU mode
from sklearn.metrics import r2_score
import timeit

start_time = timeit.default_timer()

list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []

epochs = 200
for e in range(epochs):
    running_loss = 0
    for X, y in train_loader:
        model.train()                # switch to training mode
        optimizer.zero_grad()        # reset the gradients to zero
        output = model(X)            # forward pass on the mini-batch
        loss = criterion(output, y)  # compute the loss with the loss function
        loss.backward()              # backward pass: compute gradients for the parameters the optimizer manages
        optimizer.step()             # update the parameters from the gradients
        running_loss += loss.item()
    else:
        if e % 50 == 0:
            # MSE on the validation set
            validation_loss = torch.mean((y_validation - model(X_validation))**2)
            list_r2.append(r2_score(y_validation.detach(), model(X_validation).detach()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F" % (e, (running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)

terminate_time = timeit.default_timer()
print("Took %f seconds." % (terminate_time - start_time))
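After training, the learned weights can be kept for later reuse. A minimal sketch, assuming the file name qsar_mlp.pt (not part of the original notebook):

torch.save(model.state_dict(), "qsar_mlp.pt")        # persist the trained weights
restored = MLPModel(input_size, hidden_size, dropout_rate, output_size)
restored.load_state_dict(torch.load("qsar_mlp.pt"))  # load them into a fresh instance
restored.eval()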
Evaluating the DNN model

fig = plt.figure(figsize=(15,5))

# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')

# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.set_ylim(0.6, 1.0)
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')

plt.show()

# Switch to evaluation mode
model.eval()
y_pred_train = model(X_train)
y_pred_validation = model(X_validation)
y_pred_test = model(X_test)

# MSE and R2 score on the test set
print(torch.mean((y_test - y_pred_test)**2).item())
print(r2_score(y_test.detach().cpu().clone(), y_pred_test.detach().cpu().clone()))
Using the GPU with PyTorch

!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Load the data

data = pd.read_csv("delaney-processed.csv")

Data Transform

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt

PandasTools.AddMoleculeColumnToFrame(data, 'smiles', 'Molecule')

def mol2fp(mol):
    # The usual ECFP hash size is 2048, but it is enlarged here so the fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

fp = mol2fp(Chem.MolFromSmiles(data.loc[1, "smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)

data.head(1)

X = np.stack(data.FPs.values)
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))

# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Split once more to create a validation set: 10% of the training data becomes validation data
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Standardize the targets (Z-score: mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)

# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
tensor([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]], device='cuda:0')
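Passing device= when the tensors are created, as above, places the whole dataset in GPU memory up front. The same move can also be done explicitly with .to(device), which is useful when only part of the data should live on the GPU; a small illustration with hypothetical names x_cpu and x_gpu:

x_cpu = torch.zeros(4, 4096)       # created in host (CPU) memory
x_gpu = x_cpu.to(device)           # explicit copy to the selected device
print(x_cpu.device, x_gpu.device)  # cpu cuda:0 when a GPU is available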
GPU DNN Model
# Building a custom deep learning model
# Layers can be added with nn.XXX
# This example repeats a (Linear, LayerNorm, ReLU, Dropout) block three times
# Plain Linear layers are stacked, with LayerNorm and Dropout available for better generalization
class cudaMLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(cudaMLPModel, self).__init__()
        # Three fully connected Linear layers
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)  # Output layer
        # LayerNorm layers to speed up training
        #self.ln1 = nn.LayerNorm(hidden_size)
        #self.ln2 = nn.LayerNorm(hidden_size)
        #self.ln3 = nn.LayerNorm(hidden_size)
        # ReLU activation function
        self.activation = nn.ReLU()
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_rate)

    # How the layers are applied at run time
    def forward(self, x):
        out = self.linear1(x)
        #out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)
        # Everything up to here can be regarded as one block
        out = self.linear2(out)
        #out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.linear3(out)
        #out = self.ln3(out)
        out = self.activation(out)
        out = self.dropout(out)
        # Final output layer
        out = self.fc_out(out)
        return out

# Hyperparameter settings
input_size = X_train.size()[-1]  # The input size should fit our fingerprint size
hidden_size = 1024               # The size of the hidden layer
dropout_rate = 0.8               # The dropout rate
output_size = 1                  # This is just a single task, so this will be one
learning_rate = 0.0001           # The learning rate for the optimizer

cudamodel = cudaMLPModel(input_size, hidden_size, dropout_rate, output_size)

# Put the model on the GPU
cudamodel.cuda()
cudaMLPModel(
(linear1): Linear(in_features=4096, out_features=1024, bias=True)
(linear2): Linear(in_features=1024, out_features=1024, bias=True)
(linear3): Linear(in_features=1024, out_features=1024, bias=True)
(fc_out): Linear(in_features=1024, out_features=1, bias=True)
(activation): ReLU()
(dropout): Dropout(p=0.8, inplace=False)
)
# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)

# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset, batch_size=256, shuffle=True)

# Set up the loss function and optimizer
criterion2 = nn.MSELoss()
#weight_decay=0
optimizer2 = torch.optim.Adam(cudamodel.parameters(), lr=learning_rate)

Training

# GPU mode
from sklearn.metrics import r2_score
import timeit

start_time = timeit.default_timer()

list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []

cudamodel.train()
epochs = 200
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer2.zero_grad()             # Initialize the gradients, which will be recorded during the forward pass
        output = cudamodel(fps)            # Forward pass of the mini-batch
        loss = criterion2(output, labels)  # Computing the loss
        loss.backward()                    # Calculate the backward pass
        optimizer2.step()                  # Optimize the weights
        running_loss += loss.item()
    else:
        if e % 50 == 0:
            validation_loss = torch.mean((y_validation - cudamodel(X_validation))**2).item()
            list_r2.append(r2_score(y_validation.detach().cpu(), cudamodel(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F" % (e, (running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)

terminate_time = timeit.default_timer()
print("Took %f seconds." % (terminate_time - start_time))
Evaluating the GPU DNN model

fig = plt.figure(figsize=(15,5))

# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')

# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.set_ylim(0, 1.0)
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')

plt.show()

# Switch to evaluation mode
cudamodel.eval()
y_pred_train = cudamodel(X_train)
y_pred_validation = cudamodel(X_validation)
y_pred_test = cudamodel(X_test)

from sklearn.metrics import r2_score
# MSE and R2 score on the training set
# The GPU tensors are brought back to the CPU as numpy arrays for sklearn
torch.mean((y_train - y_pred_train)**2).item()
r2_score(y_train.detach().cpu().clone(), y_pred_train.detach().cpu().clone())
0.957260735468451
# MSE and R2 score on the validation set
torch.mean((y_validation - y_pred_validation)**2).item()
r2_score(y_validation.detach().cpu().clone(), y_pred_validation.detach().cpu().clone())
0.8121053048843863
# MSE and R2 score on the test set
torch.mean((y_test - y_pred_test)**2).item()
r2_score(y_test.detach().cpu().clone(), y_pred_test.detach().cpu().clone())
0.864958526237417
plt.scatter(y_pred_test.detach().cpu().clone(), y_test.detach().cpu().clone())
plt.xlabel('Predicted log-solubility in mols/liter')
plt.ylabel('True log-solubility in mols/liter')
plt.title(r'DNN LinearModel predicted vs. true log-solubilities')
plt.show()

def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()

plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5, 1.5], c="b")
plt.show()

# Use inverse_transform as below to get predictions in the original units
def predict_smiles(smiles):
    # Convert the query SMILES into a fingerprint for the model
    fp = mol2fp(Chem.MolFromSmiles(smiles)).reshape(1, -1)
    # Turn the fingerprint into a torch tensor (on the GPU in this case)
    fp_tensor = torch.tensor(fp, device=device).float()
    # Feed the converted input to the model
    prediction = cudamodel(fp_tensor)
    #return prediction.cpu().detach().numpy()
    # The targets were transformed before training, so inverse_transform rescales the prediction back to the original range
    logP = scaler.inverse_transform(prediction.cpu().detach().numpy())
    return logP[0][0]

predict_smiles('Cc1ccc2c(N3CCNCC3)cc(F)cc2n1')
-3.508878
Hyperparameter Grid Search

Load the data

data = pd.read_csv("delaney-processed.csv")

Data Transform

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt

PandasTools.AddMoleculeColumnToFrame(data, 'smiles', 'Molecule')

def mol2fp(mol):
    # The usual ECFP hash size is 2048, but it is enlarged here so the fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

fp = mol2fp(Chem.MolFromSmiles(data.loc[1, "smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)

X = np.stack(data.FPs.values)
y = data["measured log solubility in mols per litre"].values.reshape((-1,1))

# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Split once more to create a validation set: 10% of the training data becomes validation data
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Standardize the targets (Z-score: mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)

# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
tensor([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]], device='cuda:0')
# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)

# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset, batch_size=256, shuffle=True)

Training & Tuning Hyper-parameters

from sklearn.metrics import r2_score
from itertools import product

# Hyperparameter settings
input_size = X_train.size()[-1]  # read off the fingerprint size
hidden_size = 1024               # hidden layer size
output_size = 1                  # a single number is predicted, so the output size is 1

parameters = dict(
    learning_rates=[0.001, 0.0001],
    dropout_rates=[0.8, 0.5],
    epochs=[200, 300],
    hidden_size=[512, 1024],
)
param_values = [v for v in parameters.values()]
param_values

prev_score = 0.0
best_param = []
for lr, dropout_rate, epochs, hidden_size in product(*param_values):
    grid_model = MLPModel(input_size, hidden_size, dropout_rate, output_size)
    grid_model.cuda()
    criterion3 = nn.MSELoss()
    optimizer3 = torch.optim.Adam(grid_model.parameters(), lr=lr)
    grid_model.train()  # set the model to train mode
    epochs = epochs
    for e in range(epochs+1):
        running_loss = 0
        for fps, labels in train_loader:
            # Training pass
            optimizer3.zero_grad()             # Initialize the gradients, which will be recorded during the forward pass
            output = grid_model(fps)           # Forward pass of the mini-batch
            loss = criterion3(output, labels)  # Computing the loss
            loss.backward()                    # Calculate the backward pass
            optimizer3.step()                  # Optimize the weights
            running_loss += loss.item()
        else:
            if e % 100 == 0:
                validation_loss = torch.mean((y_validation - grid_model(X_validation))**2).item()
                print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F" % (e, (running_loss/len(train_loader)), validation_loss))
                #list_val_loss.append(validation_loss)
            if e == epochs:
                score = r2_score(y_validation.detach().cpu(), grid_model(X_validation).detach().cpu())
                print("lr:", lr, "dropout_rate:", dropout_rate, "epochs:", epochs,
                      "validation_loss:", validation_loss, "r2_score:", score)
                if prev_score < score:
                    prev_score = score
                    # also record hidden_size, since it is part of the search grid
                    best_param = {'lr': lr, 'dropout_rate': dropout_rate, 'epochs': epochs,
                                  'hidden_size': hidden_size, 'r2_score': score}

print(best_param)
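Once the search has finished, the winning configuration can be used to fit a final model; a minimal sketch, assuming best_param holds the keys recorded in the loop above (final_model, final_opt and final_criterion are hypothetical names):

final_model = MLPModel(input_size, best_param['hidden_size'], best_param['dropout_rate'], output_size)
final_model.cuda()
final_opt = torch.optim.Adam(final_model.parameters(), lr=best_param['lr'])
final_criterion = nn.MSELoss()

final_model.train()
for e in range(best_param['epochs'] + 1):
    for fps, labels in train_loader:
        final_opt.zero_grad()
        loss = final_criterion(final_model(fps), labels)
        loss.backward()
        final_opt.step()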
Convolutional Neural Network
Load the data

data = pd.read_csv("delaney-processed.csv")
data.head(1)

Data Transform

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from math import sqrt

PandasTools.AddMoleculeColumnToFrame(data, 'smiles', 'Molecule')

def mol2fp(mol):
    # The usual ECFP hash size is 2048, but it is enlarged here so the fingerprint can be viewed as a 64x64 image
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

fp = mol2fp(Chem.MolFromSmiles(data.loc[1, "smiles"]))
data["FPs"] = data.Molecule.apply(mol2fp)

# The FPs values stored row by row in the dataframe are stacked into a single np.ndarray with np.stack
# The result is reshaped to (1128, 1, 64, 64) to give the CNN the 4-dimensional input it expects
X = np.stack(data.FPs.values)
X = X.reshape(len(X), 1, 64, -1)
print(X.shape)
print(X)

y = data["measured log solubility in mols per litre"].values.reshape((-1,1))
print(y)
print(type(y))

# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Split once more to create a validation set: 10% of the training data becomes validation data
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Standardize the targets (Z-score: mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)

print(X_train.shape, X_validation.shape, X_test.shape)
# Fix the random seed
torch.manual_seed(42)
# Also fix the seed on the GPU when one is available
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train

# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)

# DataLoader splits the train and validation data into batches and shuffles them
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset, batch_size=256, shuffle=True)

# Building a custom deep learning model
# Layers can be added with nn.XXX
# This CNN repeats a (Conv2d, ReLU, max-pooling) block twice and finishes with fully connected layers
# Dropout is used for generalization
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        # convolutional layers
        self.conv1 = nn.Conv2d(1, 6, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=3, padding=1)
        # activation function
        self.relu = nn.ReLU()
        # pooling layer
        self.maxpool = nn.MaxPool2d(2)
        # dropout layer
        self.dropout1 = nn.Dropout2d(0.2)
        # fully connected layers
        self.fc1 = nn.Linear(16 * 16 * 16, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x):
        # Forward pass: stacking each layer together
        # input shape (batch_size, 1, 64, 64)
        # after conv1 (batch_size, 6, 64, 64)
        # after pooling (batch_size, 6, 32, 32)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        # after conv2 (batch_size, 16, 32, 32)
        # after pooling (batch_size, 16, 16, 16)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.dropout1(x)
        # flatten the feature maps into a single vector
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Hyperparameter settings
#input_size = X_train.size()[-1]  # The input size should fit our fingerprint size
#hidden_size = 1024               # The size of the hidden layer
#dropout_rate = 0.2               # The dropout rate
#output_size = 1                  # This is just a single task, so this will be one
#learning_rate = 0.0001           # The learning rate for the optimizer
cnnmodel = CNNModel()
print(cnnmodel)

cnnmodel.cuda()
CNNModel(
(conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(relu): ReLU()
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(dropout1): Dropout2d(p=0.2, inplace=False)
(fc1): Linear(in_features=4096, out_features=1024, bias=True)
(fc2): Linear(in_features=1024, out_features=256, bias=True)
(fc3): Linear(in_features=256, out_features=1, bias=True)
)
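As a quick sanity check on the layer sizes (not in the original notebook), a dummy batch can be pushed through a separate CPU instance to confirm that the flattened feature map really has 16*16*16 = 4096 elements and that a (batch, 1, 64, 64) input maps to a (batch, 1) output:

check_model = CNNModel()            # separate CPU instance, used only for this check
dummy = torch.zeros(2, 1, 64, 64)   # a fake batch of two 64x64 fingerprint "images"
with torch.no_grad():
    out = check_model(dummy)
print(out.shape)                    # expected: torch.Size([2, 1])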
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cnnmodel.parameters(), lr=0.0001)

Training

from sklearn.metrics import r2_score

list_epoch = []
list_train_loss = []
list_val_loss = []
list_train_r2 = []
list_val_r2 = []

cnnmodel.train()  # Ensure the network is in "train" mode with dropouts active
epochs = 500
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer.zero_grad()             # Initialize the gradients, which will be recorded during the forward pass
        output = cnnmodel(fps)            # Forward pass of the mini-batch
        loss = criterion(output, labels)  # Computing the loss
        loss.backward()                   # Calculate the backward pass
        optimizer.step()                  # Optimize the weights
        running_loss += loss.item()
    else:
        if e % 50 == 0:
            validation_loss = torch.mean((y_validation - cnnmodel(X_validation))**2).item()
            list_train_r2.append(r2_score(y_train.detach().cpu(), cnnmodel(X_train).detach().cpu()))
            list_val_r2.append(r2_score(y_validation.detach().cpu(), cnnmodel(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F" % (e, (running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)
Evaluating the CNN model

fig = plt.figure(figsize=(15,5))

# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')

# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_val_r2, marker='x', label='validation_r2 metric')
ax2.plot(list_epoch, list_train_r2, marker='x', label='train_r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')

plt.show()

cnnmodel.eval()  # Switch to evaluation mode, where dropout is switched off
y_pred_train = cnnmodel(X_train)
y_pred_validation = cnnmodel(X_validation)
y_pred_test = cnnmodel(X_test)

torch.mean((y_test - y_pred_test)**2).item()
r2_score(y_test.detach().cpu().clone(), y_pred_test.detach().cpu().clone())
0.8950224503208235
def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()

plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5, 1.5], c="b")
[<matplotlib.lines.Line2D at 0x7ff7fc6ab850>]
Building a Model with Mol2Vec

!pip install git+https://github.com/samoturk/mol2vec

Load the data

!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv

import pandas as pd
data = pd.read_csv("delaney-processed.csv")
data.head(1)

Data Transform

from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem

PandasTools.AddMoleculeColumnToFrame(data, 'smiles', 'Molecule')
data[["smiles", "Molecule"]].head(1)

from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg

aas = [Chem.MolFromSmiles(x) for x in data["smiles"]]

sentence = mol2alt_sentence(aas[0], 1)
sentence
['864662311',
'1535166686',
'2245384272',
'3153477100',
'2976033787',
'1916236386',
'3189457552',
'2667063169',
'2976033787',
'1286704427',
'864674487',
'1759589175',
'2245384272',
'3129492592',
'2976033787',
'1916236386',
'3189457552',
'2667063169',
'2976033787',
'1286704427',
'864674487',
'199163361',
'2245273601',
'3147100053',
'2245900962',
'869152089',
'847433064',
'2551483158',
'3217380708',
'3579962709',
'3218693969',
'951226070',
'3218693969',
'98513984',
'3218693969',
'98513984',
'3218693969',
'98513984',
'3218693969',
'951226070',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433',
'2976033787',
'675765711',
'864662311',
'266675433']
depict_identifier(aas[0], 864662311, 1)

it = IdentifierTable(sentence, [aas[0]]*len(sentence), [sentence]*len(sentence), 5, 1)
it

from gensim.models import word2vec

!wget https://raw.githubusercontent.com/samoturk/mol2vec/master/examples/models/model_300dim.pkl

w2vmodel = word2vec.Word2Vec.load('model_300dim.pkl')

# Number of unique identifiers represented as vectors
len(w2vmodel.wv.vocab.keys())
21003
# Feature vector representing the depicted identifier 2246728737
#w2vmodel.wv.word_vec('2246728737')

data.head(1)

data['Molecule']
0 <img data-content="rdkit/molecule" src="data:i...
1 <img data-content="rdkit/molecule" src="data:i...
2 <img data-content="rdkit/molecule" src="data:i...
3 <img data-content="rdkit/molecule" src="data:i...
4 <img data-content="rdkit/molecule" src="data:i...
...
1123 <img data-content="rdkit/molecule" src="data:i...
1124 <img data-content="rdkit/molecule" src="data:i...
1125 <img data-content="rdkit/molecule" src="data:i...
1126 <img data-content="rdkit/molecule" src="data:i...
1127 <img data-content="rdkit/molecule" src="data:i...
Name: Molecule, Length: 1128, dtype: object
data['sentence'] = data.apply(lambda x: MolSentence(mol2alt_sentence(x['Molecule'], 1)), axis=1)

data['mol2vec'] = [DfVec(x) for x in sentences2vec(data['sentence'], w2vmodel, unseen='UNK')]
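sentences2vec builds each molecule vector by summing the 300-dimensional word vectors of its substructure identifiers, falling back to the 'UNK' vector for identifiers missing from the vocabulary. A rough check of that behaviour (not in the original notebook), assuming every identifier of the first molecule is known to the model:

first = mol2alt_sentence(aas[0], 1)  # substructure identifiers of the first molecule
manual_vec = np.sum([w2vmodel.wv.word_vec(ident) for ident in first], axis=0)
print(manual_vec.shape)                                 # (300,)
print(np.allclose(manual_vec, data['mol2vec'][0].vec))  # True when no identifier was unseen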
data.head(1)

X = np.array([x.vec for x in data['mol2vec']])
y = data['measured log solubility in mols per litre'].values.reshape((-1,1))

# Split the data randomly with sklearn's train_test_split
# random_state=42 is fixed; changing it changes the composition of the splits, so keep it in mind
# test_size=0.1 means 10% of the data goes to the test set and the remaining 90% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Split once more to create a validation set: 10% of the training data becomes validation data
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Standardize the targets (Z-score: mean 0, standard deviation 1)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)

# Train on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
X_train
tensor([[ 0.6398, 1.5559, -1.5355, ..., -2.2593, -3.9086, 0.4851],
[ -1.5316, -7.9163, -6.0273, ..., -2.4850, -12.4975, 3.9382],
[ 0.9494, -0.9584, -0.5601, ..., -4.1352, -6.4839, -1.8985],
...,
[ -1.5671, -0.0286, -0.9324, ..., 1.1181, -1.1595, -1.1114],
[ -1.1879, -6.7705, -6.3758, ..., -3.3778, -14.5161, -0.0831],
[ 1.5110, -3.0176, -2.8297, ..., -4.9359, -8.8553, -2.0444]],
device='cuda:0')
print(X_train.shape, y_train.shape)

# TensorDataset pairs each input with its target
# so that torch's DataLoader can serve input/target pairs one batch at a time
from torch.utils.data import TensorDataset
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)

# DataLoader splits the train and validation data into batches; only the training data is shuffled here
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset, batch_size=256, shuffle=False)

# Hyperparameter settings
input_size = X_train.size()[-1]  # The input size should fit the Mol2Vec vector size
hidden_size = 1024               # The size of the hidden layer
dropout_rate = 0.2               # The dropout rate
output_size = 1                  # This is just a single task, so this will be one
learning_rate = 0.0001           # The learning rate for the optimizer

vec_dnn_model = MLPModel(input_size, hidden_size, dropout_rate, output_size)

vec_dnn_model.cuda()
MLPModel(
(linear1): Linear(in_features=300, out_features=1024, bias=True)
(linear2): Linear(in_features=1024, out_features=1024, bias=True)
(linear3): Linear(in_features=1024, out_features=1024, bias=True)
(fc_out): Linear(in_features=1024, out_features=1, bias=True)
(ln1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(ln3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(activation): ReLU()
(dropout): Dropout(p=0.2, inplace=False)
)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(vec_dnn_model.parameters(), lr=learning_rate)

from sklearn.metrics import r2_score

list_epoch = []
list_train_loss = []
list_val_loss = []
list_r2 = []

vec_dnn_model.train()  # Ensure the network is in "train" mode with dropouts active
epochs = 1000
for e in range(epochs):
    running_loss = 0
    for fps, labels in train_loader:
        # Training pass
        optimizer.zero_grad()             # Initialize the gradients, which will be recorded during the forward pass
        output = vec_dnn_model(fps)       # Forward pass of the mini-batch
        loss = criterion(output, labels)  # Computing the loss
        loss.backward()                   # Calculate the backward pass
        optimizer.step()                  # Optimize the weights
        running_loss += loss.item()
    else:
        if e % 100 == 0:
            validation_loss = torch.mean((y_validation - vec_dnn_model(X_validation))**2).item()
            list_r2.append(r2_score(y_validation.detach().cpu(), vec_dnn_model(X_validation).detach().cpu()))
            list_epoch.append(e)
            list_train_loss.append(running_loss/len(train_loader))
            print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F" % (e, (running_loss/len(train_loader)), validation_loss))
            list_val_loss.append(validation_loss)

fig = plt.figure(figsize=(15,5))

# ====== Loss Fluctuation ====== #
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(list_epoch, list_train_loss, label='train_loss')
ax1.plot(list_epoch, list_val_loss, '--', label='val_loss')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.set_ylim(0, 0.4)
ax1.grid()
ax1.legend()
ax1.set_title('epoch vs loss')

# ====== Metric Fluctuation ====== #
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(list_epoch, list_r2, marker='x', label='r2 metric')
ax2.set_xlabel('epoch')
ax2.set_ylabel('r2')
ax2.grid()
ax2.legend()
ax2.set_title('epoch vs r2')

plt.show()

# MSE and R2 score on the test set
# Note: y_pred_test here still holds the CNN section's predictions; to score the Mol2Vec model itself,
# recompute them first, e.g. vec_dnn_model.eval(); y_pred_test = vec_dnn_model(X_test)
torch.mean((y_test - y_pred_test)**2).item()
r2_score(y_test.detach().cpu().clone(), y_pred_test.detach().cpu().clone())
0.8950224503208235
def flatten(tensor):
    return tensor.cpu().detach().numpy().flatten()

plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
plt.plot([-1.5, 1.5], [-1.5, 1.5], c="b")
[<matplotlib.lines.Line2D at 0x7ff7c54b2850>]
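Mirroring predict_smiles from the fingerprint section, a Mol2Vec-based helper for predicting the solubility of a new SMILES could look like the sketch below (predict_smiles_mol2vec is not part of the original notebook):

def predict_smiles_mol2vec(smiles):
    # embed the query molecule: substructure identifiers -> summed word vectors, shape (1, 300)
    sent = MolSentence(mol2alt_sentence(Chem.MolFromSmiles(smiles), 1))
    vec = sentences2vec([sent], w2vmodel, unseen='UNK')
    vec_tensor = torch.tensor(vec, device=device).float()
    vec_dnn_model.eval()
    prediction = vec_dnn_model(vec_tensor)
    # undo the target standardization to report log-solubility in the original units
    return scaler.inverse_transform(prediction.cpu().detach().numpy())[0][0]

predict_smiles_mol2vec('Cc1ccc2c(N3CCNCC3)cc(F)cc2n1')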