# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
Water Pump Functionality Prediction - Portfolio Project
Project Overview
This project explores the use of machine learning to predict the functionality status of water pumps in rural areas. The dataset includes features such as water source, pump type, distance to town, population served, and funding organization.
Goal: Build a predictive model to determine whether a water pump is functional or non-functional.
Set and Verify Working Directory
Before loading the dataset, we confirm the current working directory and change it to the location where the cleaned data file is stored.
# Set and Verify Working Directory
# Python standard library
import os

print("Current working directory:", os.getcwd())
os.chdir(r"C:\Users\jgpet\OneDrive\Desktop\Gradiate school\2025\DSCI 5240\Final Project")
print("New working directory:", os.getcwd())
Current working directory: C:\Users\jgpet\DSCI5240\Final Project
New working directory: C:\Users\jgpet\OneDrive\Desktop\Gradiate school\2025\DSCI 5240\Final Project
Load Dataset
Load the cleaned dataset into memory and widen pandas' display options so every column is visible.
# Load Dataset
Final_project = pd.read_csv("Final_project_clean.csv", header=0)
pd.set_option("display.max_columns", None)
Handle Outliers
Use the IQR method to detect and cap outliers in the numerical features:
- Distance to Nearest Town
- Population Served
- Water Pump Age
# Outlier Detection & Capping
num_cols = ['Distance to Nearest Town', 'Population Served', 'Water Pump Age']

for col in num_cols:
    Q1 = Final_project[col].quantile(0.25)
    Q3 = Final_project[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    Final_project[col] = Final_project[col].clip(lower=lower_bound, upper=upper_bound)

# Clip distance values below 0 to 0
Final_project['Distance to Nearest Town'] = Final_project['Distance to Nearest Town'].clip(lower=0)
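As an illustrative sanity check (an addition, not part of the original notebook), the capped columns can be re-tested against their IQR fences; every count should be zero after clipping:

# Illustrative check: no values should remain outside the 1.5*IQR fences
for col in num_cols:
    q1, q3 = Final_project[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    outside = ((Final_project[col] < q1 - 1.5 * iqr) |
               (Final_project[col] > q3 + 1.5 * iqr)).sum()
    print(f"{col}: {outside} values outside the fences")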
🔤 Encode Categorical Variables
- Binary encoding for Water Quality, Payment Type, and Functioning Status
- One-hot encoding for Water Source Type, Funder, Pump Type
# Encode Categorical Variables
binary_map = {
    'Water Quality': {'Clean': 0, 'Contaminated': 1},
    'Payment Type': {'Free': 0, 'Pay per use': 1},
    'Functioning Status': {'Not Functioning': 0, 'Functioning': 1}
}
df_encoded = Final_project.replace(binary_map)

categorical_cols = ['Water Source Type', 'Funder', 'Pump Type']
df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols, drop_first=True)
df_encoded = df_encoded.astype(int)
C:\Users\jgpet\AppData\Local\Temp\ipykernel_27700\130284717.py:7: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
df_encoded = Final_project.replace(binary_map)
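The FutureWarning comes from `replace` silently downcasting object columns to integers. Following the warning's own suggestion, the first assignment in the cell could instead opt in to the explicit behavior:

# Variation suggested by the warning text: make the downcast explicit
df_encoded = Final_project.replace(binary_map).infer_objects(copy=False)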
🧮 Variance Inflation Factor (VIF)
Detect multicollinearity and remove features with VIF > 10 to improve model interpretability.
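For reference, the VIF of feature $j$ is

$$\mathrm{VIF}_j = \frac{1}{1 - R_j^2}$$

where $R_j^2$ comes from regressing feature $j$ on all remaining features, so a VIF above 10 means the other features explain more than 90% of that feature's variance.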
# Define X and y
y = df_encoded['Functioning Status']
X = df_encoded.drop(columns='Functioning Status')

# VIF for Feature Selection
X_vif = X.astype(float)
vif_data = pd.DataFrame()
vif_data['Feature'] = X_vif.columns
vif_data['VIF'] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]

# Drop high VIF features
high_vif_features = ['Installation Year', 'Population Served']
X_reduced = X.drop(columns=high_vif_features)
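An illustrative addition (not in the original cell): sorting the VIF table makes it easy to confirm which features actually cross the threshold of 10 before deciding what to drop.

# Illustrative: inspect features sorted by VIF before dropping
print(vif_data.sort_values('VIF', ascending=False))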
📏 Feature Scaling
Standardize numerical features to ensure they contribute equally to the model.
# Scaling
scaler = StandardScaler()
numeric_cols = ['Distance to Nearest Town', 'Water Pump Age']
X_scaled = X_reduced.copy()
X_scaled[numeric_cols] = scaler.fit_transform(X_scaled[numeric_cols])
🤖 Neural Network Model (Keras)
Train a neural network to classify water pumps as functioning or not.
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=4)
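Because the two classes are not balanced, one variation worth considering (not what the results below were produced with) is a stratified split, which preserves the class ratio in both partitions:

# Optional variation, not used in this run: stratify on the target
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=4, stratify=y
)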
# Build Neural Network
model = Sequential([
    Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(12, activation='relu'),
    Dropout(0.3),
    Dense(6, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
C:\Users\jgpet\anaconda31\Lib\site-packages\keras\src\layers\core\dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
super().__init__(activity_regularizer=activity_regularizer, **kwargs)
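Per the warning above, recent Keras versions prefer a standalone `Input` layer over passing `input_shape` to the first `Dense`. An equivalent, warning-free definition of the same model would be:

from tensorflow.keras.layers import Input

# Equivalent model without the deprecated input_shape argument
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(12, activation='relu'),
    Dropout(0.3),
    Dense(6, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])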
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train Model
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)
Epoch 1/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 5s 12ms/step - accuracy: 0.5993 - loss: 0.6793 - val_accuracy: 0.6529 - val_loss: 0.6523
Epoch 2/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6135 - loss: 0.6658 - val_accuracy: 0.6529 - val_loss: 0.6388
Epoch 3/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6295 - loss: 0.6504 - val_accuracy: 0.6657 - val_loss: 0.6242
Epoch 4/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.6377 - loss: 0.6417 - val_accuracy: 0.6657 - val_loss: 0.6134
Epoch 5/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6579 - loss: 0.6326 - val_accuracy: 0.6886 - val_loss: 0.6032
Epoch 6/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6732 - loss: 0.6182 - val_accuracy: 0.6957 - val_loss: 0.5947
Epoch 7/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6771 - loss: 0.6103 - val_accuracy: 0.6986 - val_loss: 0.5933
Epoch 8/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6683 - loss: 0.6196 - val_accuracy: 0.6871 - val_loss: 0.5915
Epoch 9/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6586 - loss: 0.6161 - val_accuracy: 0.6929 - val_loss: 0.5880
Epoch 10/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.6893 - loss: 0.5879 - val_accuracy: 0.6971 - val_loss: 0.5845
Epoch 11/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.6834 - loss: 0.6039 - val_accuracy: 0.6971 - val_loss: 0.5832
Epoch 12/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.6925 - loss: 0.5975 - val_accuracy: 0.6971 - val_loss: 0.5854
Epoch 13/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.6901 - loss: 0.5994 - val_accuracy: 0.7000 - val_loss: 0.5823
Epoch 14/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.7082 - loss: 0.5852 - val_accuracy: 0.7014 - val_loss: 0.5810
Epoch 15/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.6927 - loss: 0.6031 - val_accuracy: 0.7014 - val_loss: 0.5791
Epoch 16/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.6861 - loss: 0.6062 - val_accuracy: 0.7029 - val_loss: 0.5805
Epoch 17/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.7111 - loss: 0.5736 - val_accuracy: 0.6971 - val_loss: 0.5785
Epoch 18/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.6947 - loss: 0.5939 - val_accuracy: 0.7014 - val_loss: 0.5812
Epoch 19/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.7112 - loss: 0.5755 - val_accuracy: 0.7057 - val_loss: 0.5781
Epoch 20/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.7095 - loss: 0.5892 - val_accuracy: 0.7043 - val_loss: 0.5781
Epoch 21/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.7064 - loss: 0.5863 - val_accuracy: 0.7000 - val_loss: 0.5772
Epoch 22/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.7113 - loss: 0.5780 - val_accuracy: 0.7029 - val_loss: 0.5793
Epoch 23/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.7144 - loss: 0.5758 - val_accuracy: 0.7057 - val_loss: 0.5759
Epoch 24/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.7124 - loss: 0.5747 - val_accuracy: 0.7029 - val_loss: 0.5732
Epoch 25/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.7094 - loss: 0.5882 - val_accuracy: 0.7043 - val_loss: 0.5739
Epoch 26/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.7233 - loss: 0.5753 - val_accuracy: 0.7043 - val_loss: 0.5737
Epoch 27/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.7068 - loss: 0.5758 - val_accuracy: 0.7043 - val_loss: 0.5743
Epoch 28/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.7174 - loss: 0.5662 - val_accuracy: 0.7086 - val_loss: 0.5745
Epoch 29/50
88/88 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.7290 - loss: 0.5575 - val_accuracy: 0.7071 - val_loss: 0.5745
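Training stopped at epoch 29 rather than running all 50 epochs: validation loss bottomed out at 0.5732 in epoch 24, and `EarlyStopping` with `patience=5` halted after five further epochs without improvement, with `restore_best_weights=True` rolling the model back to the epoch-24 weights.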
📈 Model Evaluation
Display classification report and confusion matrix to evaluate performance.
# Evaluate Model
y_pred = (model.predict(X_test) > 0.5).astype("int32")

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
47/47 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
Classification Report:
precision recall f1-score support
0 0.76 0.83 0.80 936
1 0.67 0.58 0.62 564
accuracy 0.73 1500
macro avg 0.72 0.70 0.71 1500
weighted avg 0.73 0.73 0.73 1500
Confusion Matrix:
[[776 160]
[239 325]]
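As a sanity check, the overall accuracy can be recovered from the confusion matrix: (776 + 325) / 1500 ≈ 0.73, matching the report. Recall on the functioning class is the weak spot (325 / 564 ≈ 0.58), so the model misses a sizeable share of working pumps.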
📊 Training History
Visualize accuracy and loss across epochs for both training and validation sets.
# Plot Accuracy and Loss
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(epochs, acc, 'bo-', label='Training Accuracy')
plt.plot(epochs, val_acc, 'ro-', label='Validation Accuracy')
plt.title('Training vs. Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, loss, 'bo-', label='Training Loss')
plt.plot(epochs, val_loss, 'ro-', label='Validation Loss')
plt.title('Training vs. Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()
# Final Metrics
final_epoch = len(acc) - 1
print("\nFinal Epoch Metrics:")
print(f"Training Accuracy   : {acc[final_epoch]:.4f}")
print(f"Validation Accuracy : {val_acc[final_epoch]:.4f}")
print(f"Training Loss       : {loss[final_epoch]:.4f}")
print(f"Validation Loss     : {val_loss[final_epoch]:.4f}")
Final Epoch Metrics:
Training Accuracy : 0.7250
Validation Accuracy : 0.7071
Training Loss : 0.5669
Validation Loss : 0.5745
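Since `restore_best_weights=True` rolled the model back to the best epoch, the last entry in `history` does not describe the weights actually kept. A small illustrative addition to report the restored epoch instead:

# Illustrative: metrics at the epoch the model was restored to
best_epoch = int(np.argmin(val_loss))
print(f"Best epoch: {best_epoch + 1} "
      f"(val_loss={val_loss[best_epoch]:.4f}, val_acc={val_acc[best_epoch]:.4f})")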