Water Pump Functionality Prediction

# Load reticulate, the R package used here to run the Python code that follows.
library(reticulate)
# Point the session at one specific Python interpreter. required = TRUE asks
# reticulate to raise an error if this interpreter cannot be used, rather than
# picking a different Python installation.
use_python("C:/Users/jgpet/AppData/Local/Programs/Python/Python312/python.exe", required = TRUE)

Project Overview

This project explores the use of machine learning to predict the functionality status of water pumps in rural areas. The dataset includes features such as water source, pump type, distance to town, population served, and funding organization.

Goal: Build a predictive model to determine whether a water pump is functional or non-functional.

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

Set and Verify Working Directory

Before loading the dataset, we confirm the current working directory and change it to the location where the cleaned data file is stored.

# Set and Verify Working Directory
import os  # standard library; imported again so this chunk can run on its own

# Folder against which the relative CSV path in the loading step is resolved.
project_dir = r"C:\Users\jgpet\OneDrive\Desktop\Gradiate school\2025\DSCI 5240\Final Project"

# Report the starting directory, switch to the project folder, then report
# again so the change is visible in the output.
starting_dir = os.getcwd()
print("Current working directory:", starting_dir)
os.chdir(project_dir)
print("New working directory:", os.getcwd())

Load Dataset

With the working directory set, load the cleaned dataset into memory.

# Load Dataset
# Lift the column-display limit so printed frames show every column.
pd.set_option("display.max_columns", None)

# The path is relative, so it is resolved against the current working
# directory; header=0 takes the column names from the first row of the file.
dataset_file = "Final_project_clean.csv"
Final_project = pd.read_csv(dataset_file, header=0)

Handle Outliers

Use the IQR method to detect and cap outliers in three numerical features: Distance to Nearest Town, Population Served, and Water Pump Age.

# Outlier Detection & Capping
num_cols = ['Distance to Nearest Town', 'Population Served', 'Water Pump Age']

# For each column, values further than 1.5 * IQR beyond the quartiles are
# pulled back to that fence (capped, not dropped), so the row count is unchanged.
for column in num_cols:
    q_low, q_high = (Final_project[column].quantile(q) for q in (0.25, 0.75))
    whisker = 1.5 * (q_high - q_low)
    Final_project[column] = Final_project[column].clip(
        lower=q_low - whisker,
        upper=q_high + whisker,
    )

# Clip distance values below 0 to 0
# (the lower fence computed above can itself be negative).
distance_col = 'Distance to Nearest Town'
Final_project[distance_col] = Final_project[distance_col].clip(lower=0)

🔤 Encode Categorical Variables

  • Binary encoding for Water Quality, Payment Type, and Functioning Status
  • One-hot encoding for Water Source Type, Funder, Pump Type
# Encode Categorical Variables
# Two-level text columns are translated to 0/1 flags.
binary_map = {
    'Water Quality': {'Clean': 0, 'Contaminated': 1},
    'Payment Type': {'Free': 0, 'Pay per use': 1},
    'Functioning Status': {'Not Functioning': 0, 'Functioning': 1}
}
# Multi-level text columns are one-hot encoded; drop_first=True omits one
# level per column so the indicators are not perfectly collinear.
categorical_cols = ['Water Source Type', 'Funder', 'Pump Type']

# Work on a copy so Final_project keeps its original labels.
df_encoded = Final_project.copy()
for column, mapping in binary_map.items():
    # Translate known labels and pass anything else through unchanged, so an
    # unexpected label still fails loudly in the integer cast below instead
    # of silently turning into a missing value.
    df_encoded[column] = df_encoded[column].map(lambda value: mapping.get(value, value))

df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols, drop_first=True, dtype=int)

# Cast only the indicator columns to int. Casting the whole frame (as this
# step used to) truncated the fractional part of continuous columns such as
# the IQR-capped distance before they were scaled.
flag_cols = list(binary_map) + list(df_encoded.select_dtypes(include='bool').columns)
df_encoded = df_encoded.astype({name: int for name in flag_cols})

🧮 Variance Inflation Factor (VIF)

Detect multicollinearity and remove features with VIF > 10 to improve model interpretability.

# Define X and y
target_col = 'Functioning Status'
y = df_encoded[target_col]
X = df_encoded.drop(columns=target_col)

# VIF for Feature Selection
# One VIF per predictor column, computed on a float copy of X.
X_vif = X.astype(float)
design = X_vif.values
vif_data = pd.DataFrame({
    'Feature': X_vif.columns,
    'VIF': [
        variance_inflation_factor(design, position)
        for position in range(design.shape[1])
    ],
})

# Drop high VIF features
# NOTE(review): this list is fixed by hand rather than derived from vif_data,
# which is not read again in this chunk.
high_vif_features = ['Installation Year', 'Population Served']
X_reduced = X.drop(columns=high_vif_features)

📏 Feature Scaling

Standardize numerical features to ensure they contribute equally to the model.

# Scaling

# Only these continuous columns are standardised; every other column in
# X_reduced is carried over unchanged.
numeric_cols = ['Distance to Nearest Town', 'Water Pump Age']
scaler = StandardScaler()

# Scale a copy so X_reduced itself keeps the unscaled values.
X_scaled = X_reduced.copy()
standardized = scaler.fit_transform(X_scaled[numeric_cols])
X_scaled[numeric_cols] = standardized

🤖 Neural Network Model (Keras)

Train a neural network to classify water pumps as functioning or not.

# Train-Test Split
# 70 % of the rows are used for training and 30 % held out; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=4)

# X_scaled was standardised with a mean and standard deviation taken from the
# whole dataset, so the held-out rows influenced the training features.
# Re-fit a scaler on the training rows only and apply it to both partitions.
# Standardisation is an affine map, so re-standardising already-standardised
# columns gives exactly the z-scores a train-only fit on the raw data would.
# New raw inputs must therefore pass through `scaler` and then `split_scaler`.
X_train, X_test = X_train.copy(), X_test.copy()
split_scaler = StandardScaler().fit(X_train[numeric_cols])
X_train[numeric_cols] = split_scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = split_scaler.transform(X_test[numeric_cols])
# Build Neural Network
# Layer stack: Dense(16) -> Dense(12) -> Dense(6), each followed by
# Dropout(0.3), ending in a single sigmoid unit for the binary target.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(n_features,)))
for hidden_units in (12, 6):
    model.add(Dropout(0.3))
    model.add(Dense(hidden_units, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Callback configured to watch val_loss with a patience of 5 epochs and to
# restore the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train Model
# Settings for the fit call: a validation fraction of 0.2, at most 50 epochs,
# mini-batches of 32, the early-stopping callback, and per-epoch progress output.
fit_settings = {
    'validation_split': 0.2,
    'epochs': 50,
    'batch_size': 32,
    'callbacks': [early_stop],
    'verbose': 1,
}
history = model.fit(X_train, y_train, **fit_settings)

📈 Model Evaluation

Display classification report and confusion matrix to evaluate performance.

# Evaluate Model
# Threshold the model's outputs at 0.5 to get hard 0/1 labels.
predicted_prob = model.predict(X_test)
y_pred = (predicted_prob > 0.5).astype("int32")

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", matrix)

📊 Training History

Visualize accuracy and loss across epochs for both training and validation sets.

# Plot Accuracy and Loss
# Per-epoch curves recorded by fit; epochs are numbered from 1 for the x-axis.
metrics_log = history.history
acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12, 5))

# One panel per metric, side by side: training in blue, validation in red.
panels = (
    ('Accuracy', acc, val_acc),
    ('Loss', loss, val_loss),
)
for slot, (metric, train_curve, val_curve) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(epochs, train_curve, 'bo-', label=f'Training {metric}')
    plt.plot(epochs, val_curve, 'ro-', label=f'Validation {metric}')
    plt.title(f'Training vs. Validation {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend()

plt.tight_layout()
plt.show()
# Final Metrics
# Values from the last recorded epoch of each history curve.
final_epoch = len(acc) - 1
print("\nFinal Epoch Metrics:")
final_rows = (
    ("Training Accuracy", acc),
    ("Validation Accuracy", val_acc),
    ("Training Loss", loss),
    ("Validation Loss", val_loss),
)
for label, series in final_rows:
    # Labels are left-padded to 22 characters so the colons line up.
    print(f"{label:<22}: {series[final_epoch]:.4f}")