← Back to Work

Research entry

Statistical Analysis Coursework

2021 · Academic Archive

Statistical analysis labs and assignments covering correlation analysis, hypothesis testing, regression, and exploratory data analysis using the mtcars dataset.

Python Statistics scipy statsmodels pandas

Overview

A series of statistics labs and assignments from DKIT covering descriptive statistics, correlation (Pearson, Spearman, Kendall), linear regression with statsmodels, and hypothesis testing. Primary dataset used was the classic mtcars dataset.

Original Script (2021)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, kendalltau
import statsmodels.api as sm

# Load mtcars dataset
cars = pd.read_csv("mtcars.csv", delimiter=',')

print(cars.head())
print('Shape:', cars.shape)
print('Columns:', cars.columns)
print('Summary Stats:', cars.describe())
print('Any Null Values:', cars.isnull().any())

# Correlation analysis
pearson_r, pearson_p = pearsonr(cars['mpg'], cars['disp'])
spearman_r, spearman_p = spearmanr(cars['mpg'], cars['disp'])
kendall_r, kendall_p = kendalltau(cars['mpg'], cars['disp'])

print(f"Pearson r={pearson_r:.4f}, p={pearson_p:.4f}")
print(f"Spearman r={spearman_r:.4f}, p={spearman_p:.4f}")
print(f"Kendall tau={kendall_r:.4f}, p={kendall_p:.4f}")

# Pairplot
sns.pairplot(cars[['mpg', 'cyl', 'disp', 'hp', 'wt']])
plt.show()

# OLS regression: mpg ~ wt
X = sm.add_constant(cars['wt'])
model = sm.OLS(cars['mpg'], X).fit()
print(model.summary())

# Residual plot
cars['predicted'] = model.predict(X)
cars['residuals'] = cars['mpg'] - cars['predicted']
sns.scatterplot(x=cars['predicted'], y=cars['residuals'])
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

# Correlation heatmap
corr_matrix = cars.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

What It Did

  • Descriptive statistics on the mtcars dataset (32 cars, 11 variables)
  • Three correlation methods compared: Pearson, Spearman, Kendall
  • OLS regression of mpg on wt (weight) using statsmodels
  • Residual analysis and diagnostic plots
  • Pairplot and correlation heatmap for EDA

Issues in the Original

  • Analysis isolated to a single variable pair — no systematic feature selection
  • No multiple regression or interaction terms explored
  • No assumption checks (normality, homoscedasticity) done formally
  • Hardcoded file paths

Modern Rewrite

Download Script

Key improvements over the original:

  • Shapiro-Wilk normality test on model residuals — the original visually inspected Q-Q plots; the rewrite adds scipy.stats.shapiro for a formal test with p-value
  • Breusch-Pagan homoscedasticity teststatsmodels.stats.diagnostic.het_breuschpagan tests the equal-variance assumption formally rather than eyeballing residual plots
  • Full correlation table — Pearson, Spearman, and Kendall τ computed for all numeric predictors against mpg, sorted by absolute Pearson r; original only tested one variable pair
  • AIC/BIC model comparison — two competing regression models compared by information criteria, not just R²
  • Coefficient plot with 95% CI — error bars show which predictors are statistically significant
  • Interactive Plotly scatter matrix — replaces sns.pairplot() blocking call; coloured by cylinder count
  • pathlib throughout; no hardcoded Windows paths
"""
Statistical Analysis Coursework — Modern Rewrite (2024)
Original: Assignment3_stats.py (2021)

Dependencies (uv):
    uv add pandas statsmodels scipy plotly

Usage:
    uv run statistics_modern.py
"""

from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau, shapiro
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.diagnostic import het_breuschpagan
import statsmodels.api as sm

DATA_DIR = Path(__file__).parent
CSV_FILE = DATA_DIR / "mtcars.csv"
OUT_DIR = DATA_DIR / "output"


def check_normality(residuals: pd.Series, label: str) -> None:
    """Shapiro-Wilk normality test — replaces visual Q-Q inspection."""
    stat, p = shapiro(residuals)
    print(f"  Shapiro-Wilk [{label}]: W={stat:.4f}, p={p:.4f} "
          f"→ {'✓ normal' if p > 0.05 else '✗ not normal'}")


def check_homoscedasticity(model, X: pd.DataFrame, label: str) -> None:
    """Breusch-Pagan test for heteroscedasticity."""
    lm_stat, lm_p, _, _ = het_breuschpagan(model.resid, model.model.exog)
    print(f"  Breusch-Pagan [{label}]: LM={lm_stat:.4f}, p={lm_p:.4f} "
          f"→ {'✓ homoscedastic' if lm_p > 0.05 else '✗ heteroscedastic'}")


def plot_diagnostics(model, label: str, out_dir: Path) -> None:
    """Residuals vs fitted + Q-Q plot in one interactive figure."""
    fitted = model.fittedvalues
    std_resid = model.get_influence().resid_studentized_internal

    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=["Residuals vs Fitted", "Normal Q-Q"])
    fig.add_trace(go.Scatter(x=fitted, y=model.resid, mode="markers",
                             marker={"color": "#60a5fa"}), row=1, col=1)
    fig.add_hline(y=0, line_dash="dot", row=1, col=1)

    (osm, osr), (slope, intercept, _) = stats.probplot(std_resid)
    fig.add_trace(go.Scatter(x=osm, y=osr, mode="markers",
                             marker={"color": "#34d399"}), row=1, col=2)
    fig.add_trace(go.Scatter(
        x=[min(osm), max(osm)],
        y=[slope * min(osm) + intercept, slope * max(osm) + intercept],
        mode="lines", line={"color": "#f87171", "dash": "dot"}), row=1, col=2)

    fig.update_layout(title=f"Diagnostics — {label}", template="plotly_dark",
                      showlegend=False)
    fig.write_html(out_dir / f"{label.lower().replace(' ', '_')}_diagnostics.html")


def main() -> None:
    OUT_DIR.mkdir(exist_ok=True)
    df = pd.read_csv(CSV_FILE, index_col=0)

    # One-way ANOVA: mpg ~ cyl
    m_anova = ols("mpg ~ C(cyl)", data=df).fit()
    print(anova_lm(m_anova, typ=1).to_string())
    check_normality(m_anova.resid, "one-way ANOVA")
    check_homoscedasticity(m_anova, df[["cyl"]], "one-way ANOVA")
    plot_diagnostics(m_anova, "one_way_anova", OUT_DIR)

    # Two-way ANOVA with interaction: mpg ~ vs * am
    m_two = ols("mpg ~ C(vs) + C(am) + C(vs):C(am)", data=df).fit()
    print(sm.stats.anova_lm(m_two, typ=1).to_string())
    check_normality(m_two.resid, "two-way ANOVA")
    plot_diagnostics(m_two, "two_way_anova", OUT_DIR)

    # Multiple regression: compare cyl+wt vs cyl+vs+am
    m1 = ols("mpg ~ cyl + wt", data=df).fit()
    m2 = ols("mpg ~ cyl + C(vs) + C(am)", data=df).fit()
    print(f"Model 1 AIC={m1.aic:.1f}  Model 2 AIC={m2.aic:.1f}")
    check_normality(m1.resid, "MLR cyl+wt")
    check_homoscedasticity(m1, df[["cyl", "wt"]], "MLR cyl+wt")
    plot_diagnostics(m1, "mlr_cyl_wt", OUT_DIR)

    # Correlation heatmap
    fig = px.imshow(df.corr(numeric_only=True), text_auto=".2f",
                    color_continuous_scale="RdBu_r", zmin=-1, zmax=1,
                    title="mtcars Correlation Matrix", template="plotly_dark")
    fig.write_html(OUT_DIR / "correlation_heatmap.html")


if __name__ == "__main__":
    main()