Diabetes is often diagnosed late because early symptoms are subtle. This project builds a classification model that flags at-risk patients from routine clinical data, using the well-known Pima Indians Diabetes Dataset (or any hospital dataset with similar features). Unlike a black-box model, this project emphasizes:
This is a strong project to pair with an embedded-systems background — the same pipeline could later take readings from a DIY IoT health-monitoring device (glucose strip reader, BP sensor, etc.) and feed them into a live risk dashboard.
Real-world extensions:
# diabetes_risk_prediction.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
classification_report, confusion_matrix, roc_auc_score, roc_curve
)
# Two brand colours reused by the charts further down.
BRAND_ORANGE = "#ff8a3d"
BRAND_DARK = "#163a6e"
# Apply seaborn's "whitegrid" style to every figure drawn after this point.
sns.set_style("whitegrid")
# ---------------------------------------------------------
# 1. Load dataset
# Download: https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database
# ---------------------------------------------------------
# The CSV is read from a relative path, i.e. the current working directory.
DATA_FILE = "diabetes.csv"
df = pd.read_csv(DATA_FILE)
# Expected columns:
#   Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin,
#   BMI, DiabetesPedigreeFunction, Age, Outcome
# ---------------------------------------------------------
# 2. Clean data — replace implausible zeros with NaN, then impute
# ---------------------------------------------------------
# Measurements for which a value of 0 is implausible and is treated as missing.
zero_invalid_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[zero_invalid_cols] = df[zero_invalid_cols].replace(0, np.nan)
# Fill the gaps with the column-wide median. The median is deliberately NOT
# computed per Outcome class: doing so writes label information into the
# features (target leakage), inflates the test metrics, and cannot be
# reproduced for a new patient whose Outcome is unknown.
# NOTE: these medians are still computed on the full dataset, before the
# train/test split; for a strict evaluation, fit the imputer on the training
# rows only.
column_medians = df[zero_invalid_cols].median()
df[zero_invalid_cols] = df[zero_invalid_cols].fillna(column_medians)
# ---------------------------------------------------------
# 3. EDA
# ---------------------------------------------------------
# Annotated heatmap of the pairwise correlations between all columns of df.
corr_matrix = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Feature correlation")
heatmap_fig.tight_layout()
heatmap_fig.savefig("correlation_heatmap.png", dpi=150)
plt.close(heatmap_fig)
# Glucose distribution per outcome class.
# seaborn 0.13 deprecates passing `palette` without `hue` (scheduled for
# removal in 0.14), so the x variable is also mapped to hue. dodge=False keeps
# one full-width box per class, and the redundant legend is removed by hand so
# the code behaves the same on seaborn versions older than 0.13 as well.
plt.figure(figsize=(8, 5))
glucose_ax = sns.boxplot(
    data=df,
    x="Outcome",
    y="Glucose",
    hue="Outcome",
    palette=[BRAND_DARK, BRAND_ORANGE],
    dodge=False,
)
glucose_legend = glucose_ax.get_legend()
if glucose_legend is not None:
    glucose_legend.remove()
plt.title("Glucose level: diabetic vs non-diabetic")
plt.xlabel("Outcome (0 = No diabetes, 1 = Diabetes)")
plt.tight_layout()
plt.savefig("glucose_by_outcome.png", dpi=150)
plt.close()
# ---------------------------------------------------------
# 4. Train/test split
# ---------------------------------------------------------
# Separate the target column from the predictors.
y = df["Outcome"]
X = df.drop(columns="Outcome")
# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# ---------------------------------------------------------
# 5. Train models
# ---------------------------------------------------------
# Logistic regression is trained on the standardized features.
log_model = LogisticRegression(class_weight="balanced", max_iter=1000).fit(
    X_train_scaled, y_train
)
# The random forest is trained on the unscaled features
# (tree models don't need scaling).
rf_settings = {
    "n_estimators": 300,
    "max_depth": 6,
    "class_weight": "balanced",
    "random_state": 42,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)
# ---------------------------------------------------------
# 6. Evaluate both models
# ---------------------------------------------------------
def evaluate(name, model, X_te, y_te):
    """Print a classification report and the ROC-AUC of a fitted classifier.

    Args:
        name: Label shown in the printed section header.
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_te: Test features to score.
        y_te: True labels for ``X_te``.

    Returns:
        Column 1 of ``model.predict_proba(X_te)``, one value per test row.
    """
    predicted_labels = model.predict(X_te)
    positive_scores = model.predict_proba(X_te)[:, 1]
    class_names = ["No Diabetes", "Diabetes"]
    print(f"\n--- {name} ---")
    report = classification_report(y_te, predicted_labels, target_names=class_names)
    print(report)
    auc_value = roc_auc_score(y_te, positive_scores)
    print(f"ROC-AUC: {auc_value:.3f}")
    return positive_scores
# Score both models on the held-out split; the returned probabilities are
# reused for the ROC curves.
log_prob = evaluate("Logistic Regression", log_model, X_test_scaled, y_test)
rf_prob = evaluate("Random Forest", rf_model, X_test, y_test)
# Confusion matrix (Random Forest)
class_labels = ["No Diabetes", "Diabetes"]
rf_test_pred = rf_model.predict(X_test)
cm = confusion_matrix(y_test, rf_test_pred)
cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_title("Confusion matrix — Random Forest")
cm_ax.set_ylabel("Actual")
cm_ax.set_xlabel("Predicted")
cm_fig.tight_layout()
cm_fig.savefig("confusion_matrix.png", dpi=150)
plt.close(cm_fig)
# ROC curve comparison
# One ROC curve per model, drawn on shared axes.
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_inputs = {
    "Logistic Regression": (log_prob, BRAND_DARK),
    "Random Forest": (rf_prob, BRAND_ORANGE),
}
for name, (prob, color) in roc_inputs.items():
    fpr, tpr, _ = roc_curve(y_test, prob)
    auc_value = roc_auc_score(y_test, prob)
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={auc_value:.2f})", color=color)
# Dashed diagonal as a visual reference line.
roc_ax.plot([0, 1], [0, 1], "k--", alpha=0.5)
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC curve comparison")
roc_ax.legend()
roc_fig.tight_layout()
roc_fig.savefig("roc_curve.png", dpi=150)
plt.close(roc_fig)
# ---------------------------------------------------------
# 7. Feature importance (explainability)
# ---------------------------------------------------------
# Importances reported by the fitted forest, labelled with the feature names
# and sorted ascending before plotting.
importance = pd.Series(rf_model.feature_importances_, index=X.columns)
importance = importance.sort_values()
fi_fig, fi_ax = plt.subplots(figsize=(7, 5))
importance.plot(kind="barh", color=BRAND_ORANGE, ax=fi_ax)
fi_ax.set_title("Feature importance — Random Forest")
fi_fig.tight_layout()
fi_fig.savefig("feature_importance.png", dpi=150)
plt.close(fi_fig)
# Print the five largest importances, highest first.
top_factors = importance.sort_values(ascending=False).head(5)
print("\nTop risk factors:\n", top_factors)
# ---------------------------------------------------------
# 8. Risk scoring for a new patient
# ---------------------------------------------------------
def risk_band(prob, medium_threshold=0.33, high_threshold=0.66):
    """Map a predicted probability to a coarse risk band.

    Args:
        prob: Predicted probability, expected to lie in [0, 1].
        medium_threshold: Lower bound (inclusive) of the "Medium" band.
        high_threshold: Lower bound (inclusive) of the "High" band.

    Returns:
        "Low" if ``prob < medium_threshold``, "Medium" if
        ``medium_threshold <= prob < high_threshold``, otherwise "High".

    Raises:
        ValueError: If ``prob`` is NaN or outside [0, 1], or if the thresholds
            do not satisfy ``0 <= medium_threshold < high_threshold <= 1``.
    """
    if not 0.0 <= medium_threshold < high_threshold <= 1.0:
        raise ValueError(
            "thresholds must satisfy 0 <= medium_threshold < high_threshold <= 1, "
            f"got {medium_threshold!r} and {high_threshold!r}"
        )
    # NaN fails every comparison, so this range check rejects it too; without
    # the check a NaN would fall through both tests below and come back "High".
    if not 0.0 <= prob <= 1.0:
        raise ValueError(f"prob must be a probability in [0, 1], got {prob!r}")
    if prob < medium_threshold:
        return "Low"
    if prob < high_threshold:
        return "Medium"
    return "High"
# Example record using the same eight feature columns the models were fitted on.
patient_features = {
    "Pregnancies": 2,
    "Glucose": 148,
    "BloodPressure": 72,
    "SkinThickness": 35,
    "Insulin": 120,
    "BMI": 33.6,
    "DiabetesPedigreeFunction": 0.627,
    "Age": 45,
}
new_patient = pd.DataFrame([patient_features])
# Row 0, column 1 of predict_proba is the value passed on to risk_band.
prob = rf_model.predict_proba(new_patient)[0, 1]
print(f"\nNew patient diabetes probability: {prob:.2%} -> Risk band: {risk_band(prob)}")
print(
    "\nCharts saved: correlation_heatmap.png, glucose_by_outcome.png,",
    "confusion_matrix.png, roc_curve.png, feature_importance.png",
)
Get an official Project Completion Certificate with a unique ID & QR verification — perfect for internships, resumes, and college submissions.
Get Certificate →