Statistical Analysis Coursework

Overview

A series of statistics labs and assignments from DKIT covering descriptive statistics, correlation (Pearson, Spearman, Kendall), linear regression with statsmodels, and hypothesis testing. Primary dataset used was the classic mtcars dataset.

Original Script (2021)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, kendalltau
import statsmodels.api as sm

# Load mtcars dataset
cars = pd.read_csv("mtcars.csv", delimiter=',')

print(cars.head())
print('Shape:', cars.shape)
print('Columns:', cars.columns)
print('Summary Stats:', cars.describe())
print('Any Null Values:', cars.isnull().any())

# Correlation analysis
pearson_r, pearson_p = pearsonr(cars['mpg'], cars['disp'])
spearman_r, spearman_p = spearmanr(cars['mpg'], cars['disp'])
kendall_r, kendall_p = kendalltau(cars['mpg'], cars['disp'])

print(f"Pearson r={pearson_r:.4f}, p={pearson_p:.4f}")
print(f"Spearman r={spearman_r:.4f}, p={spearman_p:.4f}")
print(f"Kendall tau={kendall_r:.4f}, p={kendall_p:.4f}")

# Pairplot
sns.pairplot(cars[['mpg', 'cyl', 'disp', 'hp', 'wt']])
plt.show()

# OLS regression: mpg ~ wt
X = sm.add_constant(cars['wt'])
model = sm.OLS(cars['mpg'], X).fit()
print(model.summary())

# Residual plot
cars['predicted'] = model.predict(X)
cars['residuals'] = cars['mpg'] - cars['predicted']
sns.scatterplot(x=cars['predicted'], y=cars['residuals'])
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

# Correlation heatmap
corr_matrix = cars.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

What It Did

Descriptive statistics on the mtcars dataset (32 cars, 11 variables)
Three correlation methods compared: Pearson, Spearman, Kendall
OLS regression of mpg on wt (weight) using statsmodels
Residual analysis and diagnostic plots
Pairplot and correlation heatmap for EDA

Issues in the Original

Analysis isolated to a single variable pair — no systematic feature selection
No multiple regression or interaction terms explored
No assumption checks (normality, homoscedasticity) done formally
Hardcoded file paths

Modern Rewrite

Download Script

Key improvements over the original:

Shapiro-Wilk normality test on model residuals — the original visually inspected Q-Q plots; the rewrite adds scipy.stats.shapiro for a formal test with p-value
Breusch-Pagan homoscedasticity test — statsmodels.stats.diagnostic.het_breuschpagan tests the equal-variance assumption formally rather than eyeballing residual plots
Full correlation table — Pearson, Spearman, and Kendall τ computed for all numeric predictors against mpg, sorted by absolute Pearson r; original only tested one variable pair
AIC/BIC model comparison — two competing regression models compared by information criteria, not just R²
Coefficient plot with 95% CI — error bars show which predictors are statistically significant
Interactive Plotly scatter matrix — replaces sns.pairplot() blocking call; coloured by cylinder count
pathlib throughout; no hardcoded Windows paths

"""
Statistical Analysis Coursework — Modern Rewrite (2024)
Original: Assignment3_stats.py (2021)

Dependencies (uv):
    uv add pandas statsmodels scipy plotly

Usage:
    uv run statistics_modern.py
"""

from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau, shapiro
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.diagnostic import het_breuschpagan
import statsmodels.api as sm

DATA_DIR = Path(__file__).parent
CSV_FILE = DATA_DIR / "mtcars.csv"
OUT_DIR = DATA_DIR / "output"


def check_normality(residuals: pd.Series, label: str) -> None:
    """Shapiro-Wilk normality test — replaces visual Q-Q inspection."""
    stat, p = shapiro(residuals)
    print(f"  Shapiro-Wilk [{label}]: W={stat:.4f}, p={p:.4f} "
          f"→ {'✓ normal' if p > 0.05 else '✗ not normal'}")


def check_homoscedasticity(model, X: pd.DataFrame, label: str) -> None:
    """Breusch-Pagan test for heteroscedasticity."""
    lm_stat, lm_p, _, _ = het_breuschpagan(model.resid, model.model.exog)
    print(f"  Breusch-Pagan [{label}]: LM={lm_stat:.4f}, p={lm_p:.4f} "
          f"→ {'✓ homoscedastic' if lm_p > 0.05 else '✗ heteroscedastic'}")


def plot_diagnostics(model, label: str, out_dir: Path) -> None:
    """Residuals vs fitted + Q-Q plot in one interactive figure."""
    fitted = model.fittedvalues
    std_resid = model.get_influence().resid_studentized_internal

    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=["Residuals vs Fitted", "Normal Q-Q"])
    fig.add_trace(go.Scatter(x=fitted, y=model.resid, mode="markers",
                             marker={"color": "#60a5fa"}), row=1, col=1)
    fig.add_hline(y=0, line_dash="dot", row=1, col=1)

    (osm, osr), (slope, intercept, _) = stats.probplot(std_resid)
    fig.add_trace(go.Scatter(x=osm, y=osr, mode="markers",
                             marker={"color": "#34d399"}), row=1, col=2)
    fig.add_trace(go.Scatter(
        x=[min(osm), max(osm)],
        y=[slope * min(osm) + intercept, slope * max(osm) + intercept],
        mode="lines", line={"color": "#f87171", "dash": "dot"}), row=1, col=2)

    fig.update_layout(title=f"Diagnostics — {label}", template="plotly_dark",
                      showlegend=False)
    fig.write_html(out_dir / f"{label.lower().replace(' ', '_')}_diagnostics.html")


def main() -> None:
    OUT_DIR.mkdir(exist_ok=True)
    df = pd.read_csv(CSV_FILE, index_col=0)

    # One-way ANOVA: mpg ~ cyl
    m_anova = ols("mpg ~ C(cyl)", data=df).fit()
    print(anova_lm(m_anova, typ=1).to_string())
    check_normality(m_anova.resid, "one-way ANOVA")
    check_homoscedasticity(m_anova, df[["cyl"]], "one-way ANOVA")
    plot_diagnostics(m_anova, "one_way_anova", OUT_DIR)

    # Two-way ANOVA with interaction: mpg ~ vs * am
    m_two = ols("mpg ~ C(vs) + C(am) + C(vs):C(am)", data=df).fit()
    print(sm.stats.anova_lm(m_two, typ=1).to_string())
    check_normality(m_two.resid, "two-way ANOVA")
    plot_diagnostics(m_two, "two_way_anova", OUT_DIR)

    # Multiple regression: compare cyl+wt vs cyl+vs+am
    m1 = ols("mpg ~ cyl + wt", data=df).fit()
    m2 = ols("mpg ~ cyl + C(vs) + C(am)", data=df).fit()
    print(f"Model 1 AIC={m1.aic:.1f}  Model 2 AIC={m2.aic:.1f}")
    check_normality(m1.resid, "MLR cyl+wt")
    check_homoscedasticity(m1, df[["cyl", "wt"]], "MLR cyl+wt")
    plot_diagnostics(m1, "mlr_cyl_wt", OUT_DIR)

    # Correlation heatmap
    fig = px.imshow(df.corr(numeric_only=True), text_auto=".2f",
                    color_continuous_scale="RdBu_r", zmin=-1, zmax=1,
                    title="mtcars Correlation Matrix", template="plotly_dark")
    fig.write_html(OUT_DIR / "correlation_heatmap.html")


if __name__ == "__main__":
    main()