Import libraries
In [ ]:
import pandas as pd # data processing
import numpy as np # linear algebra
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import datasets, linear_model, metrics
%matplotlib inline
In [ ]:
Import dataset
In [ ]:
from google.colab import drive
drive.mount('/content/gdrive')
Mounted at /content/gdrive
In [ ]:
df = pd.read_csv('/content/gdrive/MyDrive/Polar Region Project/pr_plant/pr_plant_microbialSymbionts.csv')
Exploratory data analysis
In [ ]:
df.shape
Out[ ]:
(366, 12)
In [ ]:
df.head(5)
Out[ ]:
paper | plant.sp | response.var | response.units | mean | n | dispersion.value | dispersion.measure | cold.condition | temperature | infection.status | microorganism | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2006.Ait-Barka et al. | Vitis vinifera | shoot.w | g | 0.118 | 24 | 0.026 | sd | cold | 4 | M+ | Bacteria |
1 | 2006.Ait-Barka et al. | Vitis vinifera | shoot.w | g | 0.083 | 24 | 0.008 | sd | cold | 4 | M- | none |
2 | 2006.Ait-Barka et al. | Vitis vinifera | shoot.w | g | 0.554 | 24 | 0.107 | sd | control | 24 | M+ | Bacteria |
3 | 2006.Ait-Barka et al. | Vitis vinifera | shoot.w | g | 0.194 | 24 | 0.014 | sd | control | 24 | M- | none |
4 | 2010a.Zhu et al. | Zea mays | shoot.DW | g | 2.500 | 4 | 0.107 | se | cold | 5 | M+ | Arbuscular mycorrhizal fungus |
In [ ]:
df.columns
Out[ ]:
Index(['paper', 'plant.sp', 'response.var', 'response.units', 'mean', 'n', 'dispersion.value', 'dispersion.measure', 'cold.condition', 'temperature', 'infection.status', 'microorganism'], dtype='object')
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 366 entries, 0 to 365 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 paper 366 non-null object 1 plant.sp 366 non-null object 2 response.var 366 non-null object 3 response.units 366 non-null object 4 mean 366 non-null float64 5 n 366 non-null int64 6 dispersion.value 366 non-null float64 7 dispersion.measure 366 non-null object 8 cold.condition 366 non-null object 9 temperature 366 non-null int64 10 infection.status 366 non-null object 11 microorganism 366 non-null object dtypes: float64(2), int64(2), object(8) memory usage: 34.4+ KB
In [ ]:
df.isnull().sum()
Out[ ]:
paper 0 plant.sp 0 response.var 0 response.units 0 mean 0 n 0 dispersion.value 0 dispersion.measure 0 cold.condition 0 temperature 0 infection.status 0 microorganism 0 dtype: int64
In [ ]:
#drop_columns = ['paper', 'response.var', 'response.units', 'dispersion.measure', 'cold.condition', 'microorganism' ]
#data = df.drop(columns=drop_columns)
In [ ]:
#data.dtypes
In [ ]:
#infection_status = df.astype({"infection.status":'category'})
In [ ]:
# Assuming 'infection.status' is a column in your DataFrame
#infection_status = df['infection.status']
In [ ]:
# Create a LabelEncoder instance
#le = LabelEncoder()
In [ ]:
# Identify categorical variables
categorical_columns = ['plant.sp', 'response.var', 'response.units', 'dispersion.measure', 'cold.condition', 'microorganism']
In [ ]:
# Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, columns=categorical_columns)
In [ ]:
# Fit and transform the data
#binary_infection_status = label_encoder.fit_transform(infection_status)
In [ ]:
# Replace the original column with the binary values
#data['infection.status'] = binary_infection_status
In [ ]:
# Display the result
print(df)
paper mean n dispersion.value temperature \ 0 2006.Ait-Barka et al. 0.118 24 0.026 4 1 2006.Ait-Barka et al. 0.083 24 0.008 4 2 2006.Ait-Barka et al. 0.554 24 0.107 24 3 2006.Ait-Barka et al. 0.194 24 0.014 24 4 2010a.Zhu et al. 2.500 4 0.107 5 .. ... ... .. ... ... 361 2018.Gorbanpour et al. 0.792 3 0.002 24 362 2019.Ma et al. 0.743 3 0.023 25 363 2019.Ma et al. 0.733 3 0.028 25 364 2019.Ma et al. 0.398 3 0.069 15 365 2019.Ma et al. 0.338 3 0.051 15 infection.status plant.sp_Arabidopsis thaliana plant.sp_Cucumis sativus \ 0 M+ 0 0 1 M- 0 0 2 M+ 0 0 3 M- 0 0 4 M+ 0 0 .. ... ... ... 361 M- 0 0 362 M- 0 1 363 M+ 0 1 364 M- 0 1 365 M+ 0 1 plant.sp_Digitaria eriantha plant.sp_Elymus nutans ... \ 0 0 0 ... 1 0 0 ... 2 0 0 ... 3 0 0 ... 4 0 0 ... .. ... ... ... 361 0 0 ... 362 0 0 ... 363 0 0 ... 364 0 0 ... 365 0 0 ... microorganism_AM microorganism_Arbuscular mycorrhizal fungus \ 0 0 0 1 0 0 2 0 0 3 0 0 4 0 1 .. ... ... 361 0 0 362 0 0 363 0 0 364 0 0 365 0 0 microorganism_Bacteria microorganism_Glomus intraradices \ 0 1 0 1 0 0 2 1 0 3 0 0 4 0 0 .. ... ... 361 0 0 362 0 0 363 0 0 364 0 0 365 0 0 microorganism_Glomus mosseae microorganism_Rhizophagus irregularis \ 0 0 0 1 0 0 2 0 0 3 0 0 4 0 0 .. ... ... 361 0 0 362 0 0 363 0 1 364 0 0 365 0 1 microorganism_bacteria microorganism_fungi microorganism_no \ 0 0 0 0 1 0 0 0 2 0 0 0 3 0 0 0 4 0 0 0 .. ... ... ... 361 0 0 0 362 0 0 1 363 0 0 0 364 0 0 1 365 0 0 0 microorganism_none 0 0 1 1 2 0 3 1 4 0 .. ... 361 1 362 0 363 0 364 0 365 0 [366 rows x 97 columns]
In [ ]:
# check distribution of target_class column
df['infection.status'].value_counts()
Out[ ]:
M+ 190 M- 176 Name: infection.status, dtype: int64
In [ ]:
df['infection.status'].value_counts()/np.float(len(df))
<ipython-input-20-73bc8c15fe93>:1: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations df['infection.status'].value_counts()/np.float(len(df))
Out[ ]:
M+ 0.519126 M- 0.480874 Name: infection.status, dtype: float64
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 366 entries, 0 to 365 Data columns (total 97 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 paper 366 non-null object 1 mean 366 non-null float64 2 n 366 non-null int64 3 dispersion.value 366 non-null float64 4 temperature 366 non-null int64 5 infection.status 366 non-null object 6 plant.sp_Arabidopsis thaliana 366 non-null uint8 7 plant.sp_Cucumis sativus 366 non-null uint8 8 plant.sp_Digitaria eriantha 366 non-null uint8 9 plant.sp_Elymus nutans 366 non-null uint8 10 plant.sp_Oryza sativa 366 non-null uint8 11 plant.sp_Phaseolus vulgaris 366 non-null uint8 12 plant.sp_Solanum lycopersicum 366 non-null uint8 13 plant.sp_Vaccinium ashei 366 non-null uint8 14 plant.sp_Vaccinium corymbosum 366 non-null uint8 15 plant.sp_Vitis vinifera 366 non-null uint8 16 plant.sp_Zea mays 366 non-null uint8 17 response.var_Fv/Fm 366 non-null uint8 18 response.var_leaf.CAT.act 366 non-null uint8 19 response.var_leaf.SOD.act 366 non-null uint8 20 response.var_leaf.rel.wat.cont 366 non-null uint8 21 response.var_leaf[H2O2] 366 non-null uint8 22 response.var_leaf[MDA] 366 non-null uint8 23 response.var_leaf[proline] 366 non-null uint8 24 response.var_leaf[sol.sug] 366 non-null uint8 25 response.var_plant[H2O2] 366 non-null uint8 26 response.var_plantlet[H2O2] 366 non-null uint8 27 response.var_plantlet[MDA] 366 non-null uint8 28 response.var_plantlet[proline] 366 non-null uint8 29 response.var_rel.expr(CBF1) 366 non-null uint8 30 response.var_rel.expr(CBF2) 366 non-null uint8 31 response.var_rel.expr(CBF3) 366 non-null uint8 32 response.var_rel.expr(CBF4) 366 non-null uint8 33 response.var_rel.expr(COR15a) 366 non-null uint8 34 response.var_rel.expr(COR78a) 366 non-null uint8 35 response.var_rel.expr(Chi4c) 366 non-null uint8 36 response.var_rel.expr(Chit1b) 366 non-null uint8 37 response.var_rel.expr(Gluc) 366 non-null uint8 38 response.var_rel.expr(ICE1) 366 non-null uint8 39 response.var_rel.expr(LeCBF1) 366 non-null uint8 40 response.var_rel.expr(LeCBF3) 366 non-null uint8 41 response.var_rel.wat.cont 366 non-null uint8 42 response.var_root.CAT.act 366 non-null uint8 43 response.var_root.DW 366 non-null uint8 44 response.var_root.FW 366 non-null uint8 45 response.var_root.H2O2 366 non-null uint8 46 response.var_root.SOD.act 366 non-null uint8 47 response.var_root.rel.wat.cont 366 non-null uint8 48 response.var_root.w 366 non-null uint8 49 response.var_root[MDA] 366 non-null uint8 50 response.var_root[proline] 366 non-null uint8 51 response.var_root[sol.sug] 366 non-null uint8 52 response.var_shoot.CAT.act 366 non-null uint8 53 response.var_shoot.DW 366 non-null uint8 54 response.var_shoot.FW 366 non-null uint8 55 response.var_shoot.SOD.act 366 non-null uint8 56 response.var_shoot.w 366 non-null uint8 57 response.var_shoot[H2O2] 366 non-null uint8 58 response.var_shoot[MDA] 366 non-null uint8 59 response.var_shoot[sol.sug] 366 non-null uint8 60 response.var_wat.cont 366 non-null uint8 61 response.units_% 366 non-null uint8 62 response.units_U g-1 FW 366 non-null uint8 63 response.units_U g-1 FW min-1 366 non-null uint8 64 response.units_U mgProtein-1 366 non-null uint8 65 response.units_U mgProtein-1 min-1 366 non-null uint8 66 response.units_dimensionless 366 non-null uint8 67 response.units_fold increase (au) 366 non-null uint8 68 response.units_g 366 non-null uint8 69 response.units_mg 366 non-null uint8 70 response.units_mg g-1 FW 366 non-null uint8 71 response.units_mmol g-1 FW 366 non-null uint8 72 response.units_nmol H2O2 g-1 FW 366 non-null uint8 73 response.units_nmol MDA g-1 FW 366 non-null uint8 74 response.units_nmol g-1 DW 366 non-null uint8 75 response.units_nmol g-1 FW 366 non-null uint8 76 response.units_nmol mgProtein-1 min-1 366 non-null uint8 77 response.units_uM g-1 FW 366 non-null uint8 78 response.units_ug g-1 FW 366 non-null uint8 79 response.units_umol H2O2 gFW-1 min-1 366 non-null uint8 80 response.units_umol g-1 DW 366 non-null uint8 81 response.units_umol g-1 FW 366 non-null uint8 82 response.units_umol g-1 FW 366 non-null uint8 83 dispersion.measure_sd 366 non-null uint8 84 dispersion.measure_se 366 non-null uint8 85 cold.condition_cold 366 non-null uint8 86 cold.condition_control 366 non-null uint8 87 microorganism_AM 366 non-null uint8 88 microorganism_Arbuscular mycorrhizal fungus 366 non-null uint8 89 microorganism_Bacteria 366 non-null uint8 90 microorganism_Glomus intraradices 366 non-null uint8 91 microorganism_Glomus mosseae 366 non-null uint8 92 microorganism_Rhizophagus irregularis 366 non-null uint8 93 microorganism_bacteria 366 non-null uint8 94 microorganism_fungi 366 non-null uint8 95 microorganism_no 366 non-null uint8 96 microorganism_none 366 non-null uint8 dtypes: float64(2), int64(2), object(2), uint8(91) memory usage: 49.8+ KB
Feature selection
In [ ]:
# Feature selection
features = ['mean', 'n', 'dispersion.value', 'temperature'] + list(df.columns[df.columns.str.startswith('plant.sp_')]) + list(df.columns[df.columns.str.startswith('response.var_')]) + list(df.columns[df.columns.str.startswith('response.units_')]) + list(df.columns[df.columns.str.startswith('dispersion.measure_')]) + list(df.columns[df.columns.str.startswith('cold.condition_')]) + list(df.columns[df.columns.str.startswith('microorganism_')])
x = df[features]
y = df['infection.status']
Split data into training and test set
In [ ]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
In [ ]:
# check the shape of X_train and X_test
x_train.shape, x_test.shape
Out[ ]:
((292, 95), (74, 95))
Feature Scaling
In [ ]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
In [ ]:
# Linear Kernel
svm_linear_model = SVC(kernel='linear', C=1.0)
svm_linear_model.fit(x_train, y_train)
Out[ ]:
SVC(kernel='linear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(kernel='linear')
In [ ]:
# Polynomial Kernel
degree = 2 # You can adjust the degree of the polynomial
svm_poly_model = SVC(kernel='poly', degree=degree, C=1.0)
svm_poly_model.fit(x_train, y_train)
Out[ ]:
SVC(degree=2, kernel='poly')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(degree=2, kernel='poly')
In [ ]:
# Radial Basis Function (RBF) Kernel
svm_rbf_model = SVC(kernel='rbf', C=1.0)
svm_rbf_model.fit(x_train, y_train)
Out[ ]:
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [ ]:
# Sigmoid Kernel
svm_sigmoid_model = SVC(kernel='sigmoid', C=1.0)
svm_sigmoid_model.fit(x_train, y_train)
Out[ ]:
SVC(kernel='sigmoid')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(kernel='sigmoid')
In [ ]:
# Make predictions
y_pred_linear = svm_linear_model.predict(x_test)
y_pred_poly = svm_poly_model.predict(x_test)
y_pred_rbf = svm_rbf_model.predict(x_test)
y_pred_sigmoid = svm_sigmoid_model.predict(x_test)
# Evaluate the models
accuracy_linear = accuracy_score(y_test, y_pred_linear)
accuracy_poly = accuracy_score(y_test, y_pred_poly)
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
accuracy_sigmoid = accuracy_score(y_test, y_pred_sigmoid)
print(f"Linear Kernel Accuracy: {accuracy_linear:.2f}")
print(f"Polynomial Kernel Accuracy: {accuracy_poly:.2f}")
print(f"RBF Kernel Accuracy: {accuracy_rbf:.2f}")
print(f"Sigmoid Kernel Accuracy: {accuracy_sigmoid:.2f}")
Linear Kernel Accuracy: 1.00 Polynomial Kernel Accuracy: 0.65 RBF Kernel Accuracy: 0.93 Sigmoid Kernel Accuracy: 1.00
In [ ]:
y_pred_train = svm_rbf_model.predict(x_train)
In [ ]:
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
Training-set accuracy score: 0.9966
In [ ]:
print('Training set score: {:.4f}'.format(svm_rbf_model.score(x_train, y_train)))
print('Test set score: {:.4f}'.format(svm_rbf_model.score(x_test, y_test)))
Training set score: 0.9966 Test set score: 0.9324
Confusion Matrix
In [ ]:
cm = confusion_matrix(y_test, y_pred_rbf)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])
Confusion matrix [[32 3] [ 2 37]] True Positives(TP) = 32 True Negatives(TN) = 37 False Positives(FP) = 3 False Negatives(FN) = 2
In [ ]:
# visualize confusion matrix with seaborn heatmap
cm_matrix = pd.DataFrame(data=cm, columns=['Actual Positive:1', 'Actual Negative:0'],
index=['Predict Positive:1', 'Predict Negative:0'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
Out[ ]:
<Axes: >
Classification Metrices
In [ ]:
print(classification_report(y_test, y_pred_rbf))
precision recall f1-score support M+ 0.94 0.91 0.93 35 M- 0.93 0.95 0.94 39 accuracy 0.93 74 macro avg 0.93 0.93 0.93 74 weighted avg 0.93 0.93 0.93 74
Classification accuracy
In [ ]:
TP = cm[0,0]
TN = cm[1,1]
FP = cm[0,1]
FN = cm[1,0]
In [ ]:
# print classification accuracy
classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)
print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))
Classification accuracy : 0.9324
Classification error
In [ ]:
# print classification error
classification_error = (FP + FN) / float(TP + TN + FP + FN)
print('Classification error : {0:0.4f}'.format(classification_error))
Classification error : 0.0676
Precision
In [ ]:
# print precision score
precision = TP / float(TP + FP)
print('Precision : {0:0.4f}'.format(precision))
Precision : 0.9143
Recall
In [ ]:
recall = TP / float(TP + FN)
print('Recall or Sensitivity : {0:0.4f}'.format(recall))
Recall or Sensitivity : 0.9412
In [ ]:
# True positive rate
true_positive_rate = TP / float(TP + FN)
print('True Positive Rate : {0:0.4f}'.format(true_positive_rate))
# False positive rate
false_positive_rate = FP / float(FP + TN)
print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))
True Positive Rate : 0.9412 False Positive Rate : 0.0750
Specificity
In [ ]:
specificity = TN / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))
Specificity : 0.9250
ROC - AUC
In [ ]:
# import GridSearchCV
from sklearn.model_selection import GridSearchCV
# import SVC classifier
from sklearn.svm import SVC
# instantiate SVC classifier with default hyperparameters
svc = SVC(kernel='rbf')
# declare parameters for hyperparameter tuning
parameters = [{'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
# create GridSearchCV object
grid_search = GridSearchCV(estimator=svc,
param_grid=parameters,
scoring='accuracy',
cv=5,
verbose=0)
# fit the grid search to your data
grid_search.fit(x_train, y_train)
Out[ ]:
GridSearchCV(cv=5, estimator=SVC(), param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}], scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=SVC(), param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}], scoring='accuracy')
SVC()
SVC()
In [ ]:
# get the best parameters and the best estimator
best_params_rbf = grid_search.best_params_
best_estimator_rbf = grid_search.best_estimator_
In [ ]:
# best score achieved during the GridSearchCV
print('GridSearch CV best score: {:.4f}\n'.format(grid_search.best_score_))
# print parameters that give the best results
print('Parameters that give the best results:\n', grid_search.best_params_)
# print estimator that was chosen by the GridSearch
print('\nEstimator that was chosen by the search:\n', grid_search.best_estimator_)
GridSearch CV best score: 0.7295 Parameters that give the best results: {'C': 10, 'gamma': 0.1} Estimator that was chosen by the search: SVC(C=10, gamma=0.1)
In [ ]:
# calculate GridSearch CV score on test set
print('GridSearch CV score on test set: {0:0.4f}'.format(grid_search.score(x_test, y_test)))
GridSearch CV score on test set: 0.7973