import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from google.colab import drive
drive.mount('/content/gdrive')
Mounted at /content/gdrive
df_edaphic = pd.read_csv('/content/gdrive/MyDrive/Polar Region Project/pr_dna_rna/Edaphic_factors.csv')
df_carbon = pd.read_csv('/content/gdrive/MyDrive/Polar Region Project/pr_dna_rna/14C_and_13C_data.csv')
print(df_edaphic.shape)
(18, 7)
print(df_carbon.shape)
(30, 16)
df_edaphic.head(5)
island | pit | depth_cm | pH_water | moisture_percentage | total_C_concentration_percentage_dwt | total_N_concentration_percentage_dwt | |
---|---|---|---|---|---|---|---|
0 | Signy Island | 1 | 2 | 4.64 | 70.021882 | 37.922169 | 3.447703 |
1 | Signy Island | 1 | 4 | 3.97 | 58.401639 | 29.672395 | 2.791865 |
2 | Signy Island | 1 | 8 | 4.29 | 63.746224 | 28.557001 | 2.633001 |
3 | Signy Island | 2 | 2 | 5.45 | 65.979381 | 36.608452 | 3.321111 |
4 | Signy Island | 2 | 4 | 4.52 | 64.781022 | 40.313343 | 3.600611 |
df_carbon.head(5)
island | sample_code | pit | depth_cm | sample_identifier | DNA_relative_abundance_percentage | RNA_relative_abundance_percentage | 14C_enrichment_percentage_modern | modern_1_sigma_error_percentage | modelled_mean_residence_time_year | mean_residence_time_year | delta_13C_content_per_mille | conventional_radiocarbon _age_yr | age_1_sigma_error_yr_BP | SUERC_publication_code | combustion_method | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Signy Island | FC_16 | 2 | 2 | 160 | 27.278632 | 0.134968 | 109.98 | 0.48 | 10 | 10 | -27.1 | 0 | 0 | SUERC-53240 | elemental_combustion_system |
1 | Signy Island | FC_17 | 2 | 2 | 161 | 29.085577 | 0.590287 | 107.26 | 0.47 | 6 | 6 | -26.3 | 0 | 0 | SUERC-52673 | bomb_combustion |
2 | Signy Island | FC_18 | 2 | 2 | 162 | 20.464135 | 0.527568 | 108.23 | 0.49 | 8 | 8 | -27.5 | 0 | 0 | SUERC-52674 | bomb_combustion |
3 | Signy Island | FC_19 | 2 | 2 | 163 | 22.714932 | 0.000000 | 109.48 | 0.48 | 9 | 9 | -25.7 | 0 | 0 | SUERC-53218 | elemental_combustion_system |
4 | Signy Island | FC_20 | 2 | 2 | 164 | 24.095207 | 0.765516 | 108.01 | 0.50 | 7 | 7 | -25.4 | 0 | 0 | SUERC-53219 | elemental_combustion_system |
df_edaphic.isnull().sum()
0 | |
---|---|
island | 0 |
pit | 0 |
depth_cm | 0 |
pH_water | 0 |
moisture_percentage | 0 |
total_C_concentration_percentage_dwt | 0 |
total_N_concentration_percentage_dwt | 0 |
df_carbon.isnull().sum()
0 | |
---|---|
island | 0 |
sample_code | 0 |
pit | 0 |
depth_cm | 0 |
sample_identifier | 0 |
DNA_relative_abundance_percentage | 0 |
RNA_relative_abundance_percentage | 0 |
14C_enrichment_percentage_modern | 0 |
modern_1_sigma_error_percentage | 0 |
modelled_mean_residence_time_year | 0 |
mean_residence_time_year | 0 |
delta_13C_content_per_mille | 0 |
conventional_radiocarbon _age_yr | 0 |
age_1_sigma_error_yr_BP | 0 |
SUERC_publication_code | 0 |
combustion_method | 0 |
# Merge the two datasets on a common column
merged_data = pd.merge(df_carbon, df_edaphic, how='inner')
print(merged_data.head())
island sample_code pit depth_cm sample_identifier \ 0 Signy Island FC_16 2 2 160 1 Signy Island FC_17 2 2 161 2 Signy Island FC_18 2 2 162 3 Signy Island FC_19 2 2 163 4 Signy Island FC_20 2 2 164 DNA_relative_abundance_percentage RNA_relative_abundance_percentage \ 0 27.278632 0.134968 1 29.085577 0.590287 2 20.464135 0.527568 3 22.714932 0.000000 4 24.095207 0.765516 14C_enrichment_percentage_modern modern_1_sigma_error_percentage \ 0 109.98 0.48 1 107.26 0.47 2 108.23 0.49 3 109.48 0.48 4 108.01 0.50 modelled_mean_residence_time_year mean_residence_time_year \ 0 10 10 1 6 6 2 8 8 3 9 9 4 7 7 delta_13C_content_per_mille conventional_radiocarbon _age_yr \ 0 -27.1 0 1 -26.3 0 2 -27.5 0 3 -25.7 0 4 -25.4 0 age_1_sigma_error_yr_BP SUERC_publication_code \ 0 0 SUERC-53240 1 0 SUERC-52673 2 0 SUERC-52674 3 0 SUERC-53218 4 0 SUERC-53219 combustion_method pH_water moisture_percentage \ 0 elemental_combustion_system 5.45 65.979381 1 bomb_combustion 5.45 65.979381 2 bomb_combustion 5.45 65.979381 3 elemental_combustion_system 5.45 65.979381 4 elemental_combustion_system 5.45 65.979381 total_C_concentration_percentage_dwt total_N_concentration_percentage_dwt 0 36.608452 3.321111 1 36.608452 3.321111 2 36.608452 3.321111 3 36.608452 3.321111 4 36.608452 3.321111
merged_data.shape
(30, 20)
# Create a pairplot to visualize relationships between features and the target variable
sns.pairplot(merged_data[['pit', 'depth_cm', 'pH_water', 'moisture_percentage',
'total_C_concentration_percentage_dwt', 'total_N_concentration_percentage_dwt', 'DNA_relative_abundance_percentage']])
plt.suptitle("Pairplot of Features and DNA Relative Abundance", y=1.02)
plt.show()
# Compute the correlation matrix
correlation_matrix = merged_data[['pit', 'depth_cm', 'pH_water', 'moisture_percentage',
'total_C_concentration_percentage_dwt', 'total_N_concentration_percentage_dwt', 'DNA_relative_abundance_percentage']].corr()
# Create a heatmap to visualize correlations
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title("Correlation Heatmap of Features and DNA Relative Abundance")
plt.show()
# Plot the distribution of the target variable (DNA Relative Abundance)
plt.figure(figsize=(8, 5))
sns.histplot(merged_data['DNA_relative_abundance_percentage'], kde=True, bins=20)
plt.title('Distribution of DNA Relative Abundance')
plt.xlabel('DNA Relative Abundance (%)')
plt.ylabel('Frequency')
plt.show()
# Define features
x = merged_data[['pit', 'depth_cm', 'pH_water', 'moisture_percentage',
'total_C_concentration_percentage_dwt', 'total_N_concentration_percentage_dwt']]
y = merged_data['DNA_relative_abundance_percentage']
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Initialize and train the Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
RandomForestRegressor(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(random_state=42)
# Make predictions on the test set
y_pred = model.predict(x_test)
# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
Mean Squared Error: 17.47168507617586
baseline_pred = y_train.mean()
baseline_mse = mean_squared_error(y_test, [baseline_pred] * len(y_test))
print(f"Baseline MSE: {baseline_mse}")
Baseline MSE: 226.09365696275543
r2 = r2_score(y_test, y_pred)
print(f"R²: {r2}")
R²: 0.9131922126244305
# Display the feature importances
feature_importance = model.feature_importances_
print("Feature Importance:")
for feature, importance in zip(x.columns, feature_importance):
print(f'{feature}: {importance}')
Feature Importance: pit: 0.0 depth_cm: 0.027705181466660737 pH_water: 0.157681423230421 moisture_percentage: 0.028254092076552762 total_C_concentration_percentage_dwt: 0.10086180639655218 total_N_concentration_percentage_dwt: 0.6854974968298134
Justification¶
The MSE of 17.47 is lower than the aseline MSE of 226.09. It shows that the random forest model performs well in predicting fungal abundance in soil samples using edaphic factors by reflecting the relationship between fungal abundance and soil qualities. The model successfully explained 91.3% of the variance (R² = 0.913) in fungal abundance.
Multiple edaphic factors influence fungal abundance in soil ecosystems, including soil water pH, moisture content, nitrogen concentration, pit size, and carbon concentration. This ranking of feature importance reveals that nitrogen and carbon were the most important factors explaining fungal abundance.
The study found that the Random Forest algorithm successfully predicts fungal abundance using edaphic factors (MSE = 17.47, R² = 0.913), with the total nitrogen concentration as the most important predictor.