# Import all the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint
from rdkit import DataStructs


# Processing the data to be predicted

The code below demonstrates how to process the SMILES strings stored in an xlsx file. You can download the file above.

In [2]:
## Read the query compounds from the first sheet of the example workbook
df = pd.read_excel(
    "Aropha_AB_classification_predict_example.xlsx",
    sheet_name="Sheet1",
)

Out[2]:
SMILES
0 CC1(C)OC[C@@H](COC(=O)CCc2ccc(OC[C@@H](O)CNCCN...
1 CC1(C)OC[C@@H](COC(=O)CCc2ccc(OC[C@@H](O)CNCCN...
2 CC1(C)OC[C@@H](COS(C)(=O)=O)O1
3 CC1(C)OC[C@@H](O)[C@H](O)CO1
4 CC1(C)OC[C@@H]2O[C@@]3(C(=O)O)OC(C)(C)O[C@H]3[...
In [3]:
## Convert SMILES to MACCS molecular fingerprints (the model we will be using was built on MACCS fingerprints)
mols = [AllChem.MolFromSmiles(smiles) for smiles in df['SMILES']]

## MolFromSmiles returns None for a SMILES string it cannot parse. Fail early and name the
## offending rows instead of letting GetMACCSKeysFingerprint raise an opaque argument-type error.
invalid_rows = [row for row, mol in zip(df.index, mols) if mol is None]
if invalid_rows:
    raise ValueError("Could not parse the SMILES in rows: {}".format(invalid_rows))

df_fp = [GetMACCSKeysFingerprint(mol) for mol in mols]

In [4]:
## Convert the list of fingerprints to a numpy array so the model receives its input
## in the expected shape (one row per query compound, as shown in the output below)
X = np.array(df_fp)
X

Out[4]:
array([[0, 0, 0, ..., 1, 1, 0],
[0, 0, 0, ..., 1, 1, 1],
[0, 0, 0, ..., 1, 1, 0],
...,
[0, 0, 0, ..., 0, 1, 0],
[0, 0, 0, ..., 0, 1, 0],
[0, 0, 0, ..., 0, 1, 0]])

# Load the model and perform the prediction

In [5]:
## Load the model (you can download this model using the link above).
## The file is opened in a `with` block so the handle is closed as soon as loading finishes.
## NOTE: unpickling executes code stored in the file -- only load a model file from a source you trust.
with open('Aropha_AB_XGBClassifier_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [6]:
## Optional: uncomment the two lines below to silence the FutureWarning that sklearn may raise because of version differences
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)

## Run the model on the fingerprint array and save the results to a column named "Prediction" in the original dataframe
df['Prediction'] = model.predict(X)

Out[6]:
SMILES Prediction
0 CC1(C)OC[C@@H](COC(=O)CCc2ccc(OC[C@@H](O)CNCCN... 0
1 CC1(C)OC[C@@H](COC(=O)CCc2ccc(OC[C@@H](O)CNCCN... 0
2 CC1(C)OC[C@@H](COS(C)(=O)=O)O1 0
3 CC1(C)OC[C@@H](O)[C@H](O)CO1 1
4 CC1(C)OC[C@@H]2O[C@@]3(C(=O)O)OC(C)(C)O[C@H]3[... 0

# Calculate the prediction accuracy

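The prediction accuracy is estimated from the similarity between the query compound and the dataset used to build the model: each query fingerprint is compared with every fingerprint in the model data using the Tanimoto similarity, and the largest score is used.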

In [7]:
## Read the dataset the model was built from (available in the "Dataset" tab),
## then parse each SMILES string and compute its MACCS fingerprint
model_data = pd.read_excel(
    "Aropha_AB_classification_model_data.xlsx",
    sheet_name="Sheet1",
)
model_mols = list(map(AllChem.MolFromSmiles, model_data["Smiles"]))
model_fp = list(map(GetMACCSKeysFingerprint, model_mols))

In [8]:
def prediction_acc(similarity):
    '''Look up the expected prediction accuracy for a similarity score.

    The accuracy values come from the model development. For example, chemicals
    with a similarity score of >= 0.9 with each other demonstrated a model
    prediction accuracy of 0.876.

    Parameters
    ----------
    similarity : float
        Similarity score between the query compound and the model data.

    Returns
    -------
    float or str
        0.876, 0.856, 0.852 or 0.832 for scores of at least 0.9, 0.8, 0.7 and
        0.6 respectively; the string '-' for any lower score.
    '''
    # (minimum similarity, accuracy) bands, checked from the highest band down so
    # that a score sitting exactly on a boundary falls into the higher band.
    accuracy_by_min_similarity = (
        (0.9, 0.876),
        (0.8, 0.856),
        (0.7, 0.852),
        (0.6, 0.832),
    )
    for min_similarity, accuracy in accuracy_by_min_similarity:
        if similarity >= min_similarity:
            return accuracy
    # Below the lowest band no accuracy figure is reported.
    return '-'

In [9]:
## For every query compound, find its closest match in the model data and
## look up the expected accuracy for that similarity score
similarity_list = []
accuracy_list = []
for fp in df_fp:
    similarities = DataStructs.BulkTanimotoSimilarity(fp, model_fp)  ## Compare the query compound with all the model data
    ## Only the largest score is needed, so take the maximum directly instead of sorting the whole list
    similarity = round(max(similarities), 2)  ## Round to two decimal places
    accuracy = prediction_acc(similarity)
    similarity_list.append(similarity)
    accuracy_list.append(accuracy)

In [10]:
## Add the similarity and accuracy scores to the dataframe as new columns
## (both lists were built in the same order as the dataframe rows)
df['Similarity'] = similarity_list
df['Accuracy'] = accuracy_list

Out[10]:
SMILES Prediction Similarity Accuracy
0 CC1(C)OC[C@@H](COC(=O)CCc2ccc(OC[C@@H](O)CNCCN... 0 0.70 0.852
1 CC1(C)OC[C@@H](COC(=O)CCc2ccc(OC[C@@H](O)CNCCN... 0 0.67 0.832
2 CC1(C)OC[C@@H](COS(C)(=O)=O)O1 0 0.65 0.832
3 CC1(C)OC[C@@H](O)[C@H](O)CO1 1 0.62 0.832
4 CC1(C)OC[C@@H]2O[C@@]3(C(=O)O)OC(C)(C)O[C@H]3[... 0 0.66 0.832

# Save the results to a csv file

In [11]:
## Write the dataframe with the prediction, similarity and accuracy columns to a csv file in the working directory
df.to_csv("prediction_result.csv")