import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive

df_edaphic = pd.read_csv('/content/gdrive/MyDrive/Polar Region Project/pr_dna_rna/Edaphic_factors.csv')

df_carbon = pd.read_csv('/content/gdrive/MyDrive/Polar Region Project/pr_dna_rna/14C_and_13C_data.csv')

print(df_edaphic.shape)

(18, 7)

print(df_carbon.shape)

(30, 16)

df_edaphic.head(5)

df_carbon.head(5)

df_edaphic.isnull().sum()

df_carbon.isnull().sum()

# Merge the two datasets on a common column
merged_data = pd.merge(df_carbon, df_edaphic, how='inner')

print(merged_data.head())

         island sample_code  pit  depth_cm  sample_identifier  \
0  Signy Island       FC_16    2         2                160   
1  Signy Island       FC_17    2         2                161   
2  Signy Island       FC_18    2         2                162   
3  Signy Island       FC_19    2         2                163   
4  Signy Island       FC_20    2         2                164   

   DNA_relative_abundance_percentage  RNA_relative_abundance_percentage  \
0                          27.278632                           0.134968   
1                          29.085577                           0.590287   
2                          20.464135                           0.527568   
3                          22.714932                           0.000000   
4                          24.095207                           0.765516   

   14C_enrichment_percentage_modern  modern_1_sigma_error_percentage  \
0                            109.98                             0.48   
1                            107.26                             0.47   
2                            108.23                             0.49   
3                            109.48                             0.48   
4                            108.01                             0.50   

   modelled_mean_residence_time_year  mean_residence_time_year  \
0                                 10                        10   
1                                  6                         6   
2                                  8                         8   
3                                  9                         9   
4                                  7                         7   

   delta_13C_content_per_mille  conventional_radiocarbon _age_yr  \
0                        -27.1                                 0   
1                        -26.3                                 0   
2                        -27.5                                 0   
3                        -25.7                                 0   
4                        -25.4                                 0   

   age_1_sigma_error_yr_BP SUERC_publication_code  \
0                        0            SUERC-53240   
1                        0            SUERC-52673   
2                        0            SUERC-52674   
3                        0            SUERC-53218   
4                        0            SUERC-53219   

             combustion_method  pH_water  moisture_percentage  \
0  elemental_combustion_system      5.45            65.979381   
1              bomb_combustion      5.45            65.979381   
2              bomb_combustion      5.45            65.979381   
3  elemental_combustion_system      5.45            65.979381   
4  elemental_combustion_system      5.45            65.979381   

   total_C_concentration_percentage_dwt  total_N_concentration_percentage_dwt  
0                             36.608452                              3.321111  
1                             36.608452                              3.321111  
2                             36.608452                              3.321111  
3                             36.608452                              3.321111  
4                             36.608452                              3.321111

merged_data.shape

(30, 20)

# Create a pairplot to visualize relationships between features and the target variable
sns.pairplot(merged_data[['pit', 'depth_cm', 'pH_water', 'moisture_percentage',
                          'total_C_concentration_percentage_dwt', 'total_N_concentration_percentage_dwt', 'DNA_relative_abundance_percentage']])
plt.suptitle("Pairplot of Features and DNA Relative Abundance", y=1.02)
plt.show()

# Compute the correlation matrix
correlation_matrix = merged_data[['pit', 'depth_cm', 'pH_water', 'moisture_percentage',
                                  'total_C_concentration_percentage_dwt', 'total_N_concentration_percentage_dwt', 'DNA_relative_abundance_percentage']].corr()

# Create a heatmap to visualize correlations
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title("Correlation Heatmap of Features and DNA Relative Abundance")
plt.show()

# Plot the distribution of the target variable (DNA Relative Abundance)
plt.figure(figsize=(8, 5))
sns.histplot(merged_data['DNA_relative_abundance_percentage'], kde=True, bins=20)
plt.title('Distribution of DNA Relative Abundance')
plt.xlabel('DNA Relative Abundance (%)')
plt.ylabel('Frequency')
plt.show()

# Define features
x = merged_data[['pit', 'depth_cm', 'pH_water', 'moisture_percentage',
                 'total_C_concentration_percentage_dwt', 'total_N_concentration_percentage_dwt']]
y = merged_data['DNA_relative_abundance_percentage']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

RandomForestRegressor(random_state=42)

RandomForestRegressor(random_state=42)

# Make predictions on the test set
y_pred = model.predict(x_test)

# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 17.47168507617586

baseline_pred = y_train.mean()
baseline_mse = mean_squared_error(y_test, [baseline_pred] * len(y_test))
print(f"Baseline MSE: {baseline_mse}")

Baseline MSE: 226.09365696275543

r2 = r2_score(y_test, y_pred)
print(f"R²: {r2}")

R²: 0.9131922126244305

# Display the feature importances
feature_importance = model.feature_importances_
print("Feature Importance:")
for feature, importance in zip(x.columns, feature_importance):
    print(f'{feature}: {importance}')

Feature Importance:
pit: 0.0
depth_cm: 0.027705181466660737
pH_water: 0.157681423230421
moisture_percentage: 0.028254092076552762
total_C_concentration_percentage_dwt: 0.10086180639655218
total_N_concentration_percentage_dwt: 0.6854974968298134

Predicting Fungal Abundance in Soil Samples Based on Edaphic Factors Using Random Forest

Justification¶

	island	pit	depth_cm	pH_water	moisture_percentage	total_C_concentration_percentage_dwt	total_N_concentration_percentage_dwt
0	Signy Island	1	2	4.64	70.021882	37.922169	3.447703
1	Signy Island	1	4	3.97	58.401639	29.672395	2.791865
2	Signy Island	1	8	4.29	63.746224	28.557001	2.633001
3	Signy Island	2	2	5.45	65.979381	36.608452	3.321111
4	Signy Island	2	4	4.52	64.781022	40.313343	3.600611

	island	sample_code	pit	depth_cm	sample_identifier	DNA_relative_abundance_percentage	RNA_relative_abundance_percentage	14C_enrichment_percentage_modern	modern_1_sigma_error_percentage	modelled_mean_residence_time_year	mean_residence_time_year	delta_13C_content_per_mille	SUERC_publication_code	combustion_method
0	Signy Island	FC_16	2	2	160	27.278632	0.134968	109.98	0.48	10	10	-27.1	SUERC-53240	elemental_combustion_system
1	Signy Island	FC_17	2	2	161	29.085577	0.590287	107.26	0.47	6	6	-26.3	SUERC-52673	bomb_combustion
2	Signy Island	FC_18	2	2	162	20.464135	0.527568	108.23	0.49	8	8	-27.5	SUERC-52674	bomb_combustion
3	Signy Island	FC_19	2	2	163	22.714932	0.000000	109.48	0.48	9	9	-25.7	SUERC-53218	elemental_combustion_system
4	Signy Island	FC_20	2	2	164	24.095207	0.765516	108.01	0.50	7	7	-25.4	SUERC-53219	elemental_combustion_system

	0
island	0
pit	0
depth_cm	0
pH_water	0
moisture_percentage	0
total_C_concentration_percentage_dwt	0
total_N_concentration_percentage_dwt	0

	0
island	0
sample_code	0
pit	0
depth_cm	0
sample_identifier	0
DNA_relative_abundance_percentage	0
RNA_relative_abundance_percentage	0
14C_enrichment_percentage_modern	0
modern_1_sigma_error_percentage	0
modelled_mean_residence_time_year	0
mean_residence_time_year	0
delta_13C_content_per_mille	0
conventional_radiocarbon _age_yr	0
age_1_sigma_error_yr_BP	0
SUERC_publication_code	0
combustion_method	0