Research entry
Statistical Analysis Coursework
2021 · Academic Archive
Statistical analysis labs and assignments covering correlation analysis, hypothesis testing, regression, and exploratory data analysis using the mtcars dataset.
Python Statistics scipy statsmodels pandas
Overview
A series of statistics labs and assignments from DKIT covering descriptive statistics, correlation (Pearson, Spearman, Kendall), linear regression with statsmodels, and hypothesis testing. Primary dataset used was the classic mtcars dataset.
Original Script (2021)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, kendalltau
import statsmodels.api as sm
# Load mtcars dataset
cars = pd.read_csv("mtcars.csv", delimiter=',')
print(cars.head())
print('Shape:', cars.shape)
print('Columns:', cars.columns)
print('Summary Stats:', cars.describe())
print('Any Null Values:', cars.isnull().any())
# Correlation analysis
pearson_r, pearson_p = pearsonr(cars['mpg'], cars['disp'])
spearman_r, spearman_p = spearmanr(cars['mpg'], cars['disp'])
kendall_r, kendall_p = kendalltau(cars['mpg'], cars['disp'])
print(f"Pearson r={pearson_r:.4f}, p={pearson_p:.4f}")
print(f"Spearman r={spearman_r:.4f}, p={spearman_p:.4f}")
print(f"Kendall tau={kendall_r:.4f}, p={kendall_p:.4f}")
# Pairplot
sns.pairplot(cars[['mpg', 'cyl', 'disp', 'hp', 'wt']])
plt.show()
# OLS regression: mpg ~ wt
X = sm.add_constant(cars['wt'])
model = sm.OLS(cars['mpg'], X).fit()
print(model.summary())
# Residual plot
cars['predicted'] = model.predict(X)
cars['residuals'] = cars['mpg'] - cars['predicted']
sns.scatterplot(x=cars['predicted'], y=cars['residuals'])
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
# Correlation heatmap
corr_matrix = cars.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
What It Did
- Descriptive statistics on the
mtcarsdataset (32 cars, 11 variables) - Three correlation methods compared: Pearson, Spearman, Kendall
- OLS regression of
mpgonwt(weight) using statsmodels - Residual analysis and diagnostic plots
- Pairplot and correlation heatmap for EDA
Issues in the Original
- Analysis isolated to a single variable pair — no systematic feature selection
- No multiple regression or interaction terms explored
- No assumption checks (normality, homoscedasticity) done formally
- Hardcoded file paths
Modern Rewrite
Key improvements over the original:
- Shapiro-Wilk normality test on model residuals — the original visually inspected Q-Q plots; the rewrite adds
scipy.stats.shapirofor a formal test with p-value - Breusch-Pagan homoscedasticity test —
statsmodels.stats.diagnostic.het_breuschpagantests the equal-variance assumption formally rather than eyeballing residual plots - Full correlation table — Pearson, Spearman, and Kendall τ computed for all numeric predictors against mpg, sorted by absolute Pearson r; original only tested one variable pair
- AIC/BIC model comparison — two competing regression models compared by information criteria, not just R²
- Coefficient plot with 95% CI — error bars show which predictors are statistically significant
- Interactive Plotly scatter matrix — replaces
sns.pairplot()blocking call; coloured by cylinder count pathlibthroughout; no hardcoded Windows paths
"""
Statistical Analysis Coursework — Modern Rewrite (2024)
Original: Assignment3_stats.py (2021)
Dependencies (uv):
uv add pandas statsmodels scipy plotly
Usage:
uv run statistics_modern.py
"""
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau, shapiro
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.diagnostic import het_breuschpagan
import statsmodels.api as sm
DATA_DIR = Path(__file__).parent
CSV_FILE = DATA_DIR / "mtcars.csv"
OUT_DIR = DATA_DIR / "output"
def check_normality(residuals: pd.Series, label: str) -> None:
"""Shapiro-Wilk normality test — replaces visual Q-Q inspection."""
stat, p = shapiro(residuals)
print(f" Shapiro-Wilk [{label}]: W={stat:.4f}, p={p:.4f} "
f"→ {'✓ normal' if p > 0.05 else '✗ not normal'}")
def check_homoscedasticity(model, X: pd.DataFrame, label: str) -> None:
"""Breusch-Pagan test for heteroscedasticity."""
lm_stat, lm_p, _, _ = het_breuschpagan(model.resid, model.model.exog)
print(f" Breusch-Pagan [{label}]: LM={lm_stat:.4f}, p={lm_p:.4f} "
f"→ {'✓ homoscedastic' if lm_p > 0.05 else '✗ heteroscedastic'}")
def plot_diagnostics(model, label: str, out_dir: Path) -> None:
"""Residuals vs fitted + Q-Q plot in one interactive figure."""
fitted = model.fittedvalues
std_resid = model.get_influence().resid_studentized_internal
fig = make_subplots(rows=1, cols=2,
subplot_titles=["Residuals vs Fitted", "Normal Q-Q"])
fig.add_trace(go.Scatter(x=fitted, y=model.resid, mode="markers",
marker={"color": "#60a5fa"}), row=1, col=1)
fig.add_hline(y=0, line_dash="dot", row=1, col=1)
(osm, osr), (slope, intercept, _) = stats.probplot(std_resid)
fig.add_trace(go.Scatter(x=osm, y=osr, mode="markers",
marker={"color": "#34d399"}), row=1, col=2)
fig.add_trace(go.Scatter(
x=[min(osm), max(osm)],
y=[slope * min(osm) + intercept, slope * max(osm) + intercept],
mode="lines", line={"color": "#f87171", "dash": "dot"}), row=1, col=2)
fig.update_layout(title=f"Diagnostics — {label}", template="plotly_dark",
showlegend=False)
fig.write_html(out_dir / f"{label.lower().replace(' ', '_')}_diagnostics.html")
def main() -> None:
OUT_DIR.mkdir(exist_ok=True)
df = pd.read_csv(CSV_FILE, index_col=0)
# One-way ANOVA: mpg ~ cyl
m_anova = ols("mpg ~ C(cyl)", data=df).fit()
print(anova_lm(m_anova, typ=1).to_string())
check_normality(m_anova.resid, "one-way ANOVA")
check_homoscedasticity(m_anova, df[["cyl"]], "one-way ANOVA")
plot_diagnostics(m_anova, "one_way_anova", OUT_DIR)
# Two-way ANOVA with interaction: mpg ~ vs * am
m_two = ols("mpg ~ C(vs) + C(am) + C(vs):C(am)", data=df).fit()
print(sm.stats.anova_lm(m_two, typ=1).to_string())
check_normality(m_two.resid, "two-way ANOVA")
plot_diagnostics(m_two, "two_way_anova", OUT_DIR)
# Multiple regression: compare cyl+wt vs cyl+vs+am
m1 = ols("mpg ~ cyl + wt", data=df).fit()
m2 = ols("mpg ~ cyl + C(vs) + C(am)", data=df).fit()
print(f"Model 1 AIC={m1.aic:.1f} Model 2 AIC={m2.aic:.1f}")
check_normality(m1.resid, "MLR cyl+wt")
check_homoscedasticity(m1, df[["cyl", "wt"]], "MLR cyl+wt")
plot_diagnostics(m1, "mlr_cyl_wt", OUT_DIR)
# Correlation heatmap
fig = px.imshow(df.corr(numeric_only=True), text_auto=".2f",
color_continuous_scale="RdBu_r", zmin=-1, zmax=1,
title="mtcars Correlation Matrix", template="plotly_dark")
fig.write_html(OUT_DIR / "correlation_heatmap.html")
if __name__ == "__main__":
main()