Research entry
Machine Learning Coursework Notebooks
2021 · Academic Archive
A collection of Jupyter notebooks covering K-Means clustering, Random Forest classification on the Iris dataset, image classification with CNNs, and CA assignment work.
Python Machine Learning scikit-learn Jupyter Clustering CNN
Overview
A series of Jupyter notebooks produced during a Machine Learning module at DKIT. Topics covered include unsupervised clustering (K-Means), supervised classification (Random Forest, KNN), and image classification using convolutional neural networks.
Notebooks
| Notebook | Topic |
|---|---|
| Clustering.ipynb | K-Means and hierarchical clustering |
| Random_Forest_iris.ipynb | Random Forest on the Iris dataset |
| image classification.ipynb | CNN image classification |
| connor_CA2part1.ipynb | CA2 Part 1 — supervised learning |
| Connor_CA2Part2.ipynb | CA2 Part 2 — model evaluation |
| Connor_CA3.ipynb | CA3 — advanced ML techniques |
Sample: K-Means Clustering
# Sample: K-Means clustering on the Iris dataset with elbow-method k selection.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

# Load the Iris measurements into a labelled DataFrame.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)

# Standardise features so each contributes equally to the distance metric.
X_scaled = StandardScaler().fit_transform(X)

# Elbow method: record the inertia for every candidate k in 1..10.
k_values = list(range(1, 11))
inertias = [
    KMeans(n_clusters=k, random_state=42).fit(X_scaled).inertia_
    for k in k_values
]
plt.plot(k_values, inertias, '-o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

# Final fit with k = 3 (the known number of Iris species).
final_model = KMeans(n_clusters=3, random_state=42)
labels = final_model.fit_predict(X_scaled)
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title('K-Means Clusters (k=3)')
plt.show()
Sample: Random Forest on Iris
# Sample: Random Forest classification on the Iris dataset.
# Fix: the original sample used `pd` and `plt` without importing them,
# raising NameError when run as a standalone cell/script.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris

iris = load_iris()
# Stratify so train and test keep the 50/50/50 class balance.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42, stratify=iris.target
)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Feature importances
importances = pd.Series(rf.feature_importances_, index=iris.feature_names)
importances.sort_values().plot(kind='barh')
plt.title('Feature Importances')
plt.show()
Issues in the Original Notebooks
- No markdown explanations between code cells — results without context
- Hardcoded file paths in several cells
- No reproducibility seeds in some notebooks
- Models not persisted between sessions
Modern Rewrite
The rewrite covers both core notebooks — K-Means clustering and Random Forest classification — as a single structured script.
Key improvements:
- Silhouette score alongside inertia — the original only used the elbow/inertia method to pick k. Silhouette score gives a second, more rigorous criterion for optimal cluster count
- scikit-learn `Pipeline` — scaler and classifier wrapped together so preprocessing is never accidentally fit on test data
- Stratified 5-fold CV — the original Random Forest notebook used a single random 60/40 split (`np.random.uniform`) with no `random_state` seed, giving non-reproducible results
- `joblib` model persistence — trained pipeline saved to `models/rf_iris.joblib`; original models were lost between sessions
- Radar chart for cluster profiles — shows mean log-spend per category per cluster, making business interpretation immediate (Cluster 0 = retailer, Cluster 1 = restaurant/hotel)
- Plotly HTML outputs — all plots saved as self-contained HTML files
- `pathlib` for portable paths; no Google Colab `drive.mount()` dependency
"""
Machine Learning Coursework — Modern Rewrite (2024)
Originals: Clustering.ipynb, Random_Forest_iris.ipynb (2021)
Dependencies (uv):
uv add pandas scikit-learn plotly joblib
Usage:
uv run ml_coursework_modern.py
"""
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, silhouette_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# All paths are anchored to this script's own directory for portability
# (replaces the hardcoded paths called out in the original notebooks).
DATA_DIR = Path(__file__).parent
OUT_DIR = DATA_DIR / "output"  # self-contained Plotly HTML figures are written here
MODELS_DIR = DATA_DIR / "models"  # persisted joblib pipelines are written here
def preprocess_customers(
    df: pd.DataFrame, z_threshold: float = 2.8
) -> tuple[np.ndarray, list[str]]:
    """Log-transform spending data and drop outlier rows.

    The log transform reduces right skew; a row is kept only if the
    z-score of every one of its (log-scale) features is strictly below
    ``z_threshold`` in absolute value.

    Args:
        df: Frame of strictly positive numeric values (log of 0 or a
            negative value would produce -inf/NaN — TODO confirm upstream
            data guarantees positivity).
        z_threshold: Outlier cutoff; previously hard-coded to 2.8, now a
            backward-compatible parameter.

    Returns:
        Tuple of (cleaned log-scale values as an ndarray, column names).
    """
    log_df = np.log(df)
    # Column-wise z-scores on the log scale (pandas std uses ddof=1).
    z_scores = (log_df - log_df.mean()) / log_df.std()
    clean = log_df[(z_scores.abs() < z_threshold).all(axis=1)]
    return clean.values, clean.columns.tolist()
def find_optimal_k(X: np.ndarray, k_range: range) -> tuple[list, list]:
    """Compute the elbow (inertia) and silhouette curves over *k_range*.

    Returns two lists parallel to ``k_range``. The silhouette entry for
    k == 1 is ``None``, since the score is undefined for a single cluster.
    """
    inertias: list = []
    silhouettes: list = []
    for n_clusters in k_range:
        model = KMeans(
            n_clusters=n_clusters, init="k-means++", n_init=10, random_state=42
        )
        model.fit(X)
        inertias.append(model.inertia_)
        if n_clusters > 1:
            silhouettes.append(silhouette_score(X, model.labels_))
        else:
            silhouettes.append(None)
    return inertias, silhouettes
def run_clustering(out_dir: Path) -> None:
    """Cluster wholesale-customer spending and write a radar-chart profile.

    Reads ``Customer_data.csv`` from the script directory, log-transforms
    and de-outliers it, selects k by maximum silhouette score, and saves a
    Plotly radar chart of per-cluster mean log-spend to
    ``out_dir/cluster_profiles.html``.
    """
    # NOTE(review): assumes the CSV has categorical "Channel"/"Area" columns;
    # the UCI Wholesale customers dataset names the second one "Region" —
    # confirm against the actual file.
    df = pd.read_csv(DATA_DIR / "Customer_data.csv").drop(columns=["Channel", "Area"])
    X, feature_names = preprocess_customers(df)

    k_range = range(1, 11)
    inertias, silhouettes = find_optimal_k(X, k_range)
    # Pick k with the highest silhouette score. The original filtered with
    # `if s`, which silently drops a legitimate 0.0 score and misaligns the
    # argmax index against k_range[1:]; only the k=1 placeholder (None)
    # should be excluded, so slice it off positionally instead.
    valid_scores = silhouettes[1:]
    best_k = list(k_range)[1:][int(np.argmax(valid_scores))]

    km = KMeans(n_clusters=best_k, init="k-means++", n_init=10, random_state=42)
    labels = km.fit_predict(X)

    # Radar chart: mean log-spend per feature, one trace per cluster.
    cluster_df = pd.DataFrame(X, columns=feature_names)
    cluster_df["Cluster"] = [f"Cluster {l}" for l in labels]
    centroids = cluster_df.groupby("Cluster")[feature_names].mean().reset_index()
    fig = go.Figure()
    for _, row in centroids.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=row[feature_names].values, theta=feature_names,
            fill="toself", name=row["Cluster"],
        ))
    fig.update_layout(title="Cluster Profiles — Wholesale Customer Spending",
                      template="plotly_dark")
    fig.write_html(out_dir / "cluster_profiles.html")
def run_random_forest(out_dir: Path, models_dir: Path) -> None:
    """Train, evaluate, and persist a scaled Random Forest Iris classifier.

    Prints 5-fold CV and hold-out metrics, then saves the fitted pipeline
    to ``models_dir/rf_iris.joblib``.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    # Stratified hold-out split keeps class proportions in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # Pipeline ensures the scaler is only ever fit on training data.
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
    ])

    # 5-fold stratified CV on the training portion only.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring="accuracy")
    print(f"5-fold CV: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    pipe.fit(X_train, y_train)
    # Predict once and reuse — the original called pipe.predict(X_test)
    # twice, doubling the inference work for identical output.
    y_pred = pipe.predict(X_test)
    print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred,
                                target_names=iris.target_names))

    # Persist model
    joblib.dump(pipe, models_dir / "rf_iris.joblib")
def main() -> None:
    """Ensure the output directories exist, then run both analyses."""
    for directory in (OUT_DIR, MODELS_DIR):
        directory.mkdir(exist_ok=True)
    run_clustering(OUT_DIR)
    run_random_forest(OUT_DIR, MODELS_DIR)
if __name__ == "__main__":
main()