# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
Water Pump Functionality Prediction
library(reticulate)
use_python("C:/Users/jgpet/AppData/Local/Programs/Python/Python312/python.exe", required = TRUE)
Project Overview
This project explores the use of machine learning to predict the functionality status of water pumps in rural areas. The dataset includes features such as water source, pump type, distance to town, population served, and funding organization.
Goal: Build a predictive model to determine whether a water pump is functional or non-functional.
Set and Verify Working Directory
Before loading the dataset, we confirm the current working directory and change it to the location where the cleaned data file is stored.
# Set and Verify Working Directory
# Python standard library
import os
print("Current working directory:", os.getcwd())
os.chdir(r"C:\Users\jgpet\OneDrive\Desktop\Gradiate school\2025\DSCI 5240\Final Project")
print("New working directory:", os.getcwd())
Load Dataset
With the working directory set, load the cleaned dataset into memory and configure pandas to display all columns.
# Load Dataset
Final_project = pd.read_csv("Final_project_clean.csv", header=0)
pd.set_option("display.max_columns", None)
Handle Outliers
Use the IQR method to detect and cap outliers in the numerical features:
- Distance to Nearest Town
- Population Served
- Water Pump Age
# Outlier Detection & Capping
num_cols = ['Distance to Nearest Town', 'Population Served', 'Water Pump Age']

for col in num_cols:
    Q1 = Final_project[col].quantile(0.25)
    Q3 = Final_project[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    Final_project[col] = Final_project[col].clip(lower=lower_bound, upper=upper_bound)

# Clip distance values below 0 to 0
Final_project['Distance to Nearest Town'] = Final_project['Distance to Nearest Town'].clip(lower=0)
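As a quick verification sketch, we can recompute the IQR fences and count how many values still fall outside them; after capping, each count should be zero (capping only affects the tails, so the quartiles and fences are unchanged):

# Verify capping: no values should remain outside the IQR fences
for col in num_cols:
    Q1, Q3 = Final_project[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    outside = ((Final_project[col] < Q1 - 1.5 * IQR) |
               (Final_project[col] > Q3 + 1.5 * IQR)).sum()
    print(f"{col}: {outside} values outside the fences")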
🔤 Encode Categorical Variables
- Binary encoding for Water Quality, Payment Type, and Functioning Status
- One-hot encoding for Water Source Type, Funder, Pump Type
# Encode Categorical Variables
binary_map = {
    'Water Quality': {'Clean': 0, 'Contaminated': 1},
    'Payment Type': {'Free': 0, 'Pay per use': 1},
    'Functioning Status': {'Not Functioning': 0, 'Functioning': 1}
}
df_encoded = Final_project.replace(binary_map)

categorical_cols = ['Water Source Type', 'Funder', 'Pump Type']
df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols, drop_first=True)
df_encoded = df_encoded.astype(int)
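One caveat: replace leaves any category not listed in binary_map untouched, so a stray string would make astype(int) fail. A defensive variant of the final cast, as a sketch:

# Defensive cast: surface leftover non-numeric values instead of failing on astype(int)
leftover = df_encoded.select_dtypes(include='object')
if not leftover.empty:
    raise ValueError(f"Unmapped categories remain in: {leftover.columns.tolist()}")
df_encoded = df_encoded.astype(int)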
🧮 Variance Inflation Factor (VIF)
Detect multicollinearity and remove features with VIF > 10 to improve model interpretability.
# Define X and y
y = df_encoded['Functioning Status']
X = df_encoded.drop(columns='Functioning Status')

# VIF for Feature Selection
X_vif = X.astype(float)
vif_data = pd.DataFrame()
vif_data['Feature'] = X_vif.columns
vif_data['VIF'] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]

# Drop high VIF features
high_vif_features = ['Installation Year', 'Population Served']
X_reduced = X.drop(columns=high_vif_features)
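Before hard-coding which features to drop, it helps to inspect the table itself; the two features removed above are presumably the ones that exceeded the threshold. A quick look, sorted from highest to lowest:

# Inspect VIF values; features above 10 are candidates for removal
print(vif_data.sort_values('VIF', ascending=False))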
📏 Feature Scaling
Standardize the remaining numerical features so that no feature dominates training simply because of its larger scale.
# Scaling
scaler = StandardScaler()
numeric_cols = ['Distance to Nearest Town', 'Water Pump Age']
X_scaled = X_reduced.copy()
X_scaled[numeric_cols] = scaler.fit_transform(X_scaled[numeric_cols])
🤖 Neural Network Model (Keras)
Train a neural network to classify water pumps as functioning or not.
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=4)
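One caution: the scaler above was fit on the full feature matrix before splitting, so test-set statistics leak into the scaling parameters. A leak-free variant, sketched with the names already defined above, splits first and fits the scaler on the training rows only:

# Leak-free alternative: split first, then fit the scaler on training rows only
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=4)
X_train, X_test = X_train.copy(), X_test.copy()
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])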
# Build Neural Network
model = Sequential([
    Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(12, activation='relu'),
    Dropout(0.3),
    Dense(6, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
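Before fitting, model.summary() is a cheap way to confirm that the layer sizes and parameter counts match the intended architecture:

# Inspect layer shapes and parameter counts
model.summary()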
# Train Model
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)
📈 Model Evaluation
Display classification report and confusion matrix to evaluate performance.
# Evaluate Model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
📊 Training History
Visualize accuracy and loss across epochs for both training and validation sets.
# Plot Accuracy and Loss
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(epochs, acc, 'bo-', label='Training Accuracy')
plt.plot(epochs, val_acc, 'ro-', label='Validation Accuracy')
plt.title('Training vs. Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, loss, 'bo-', label='Training Loss')
plt.plot(epochs, val_loss, 'ro-', label='Validation Loss')
plt.title('Training vs. Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()
# Final Metrics
final_epoch = len(acc) - 1
print("\nFinal Epoch Metrics:")
print(f"Training Accuracy : {acc[final_epoch]:.4f}")
print(f"Validation Accuracy : {val_acc[final_epoch]:.4f}")
print(f"Training Loss : {loss[final_epoch]:.4f}")
print(f"Validation Loss : {val_loss[final_epoch]:.4f}")