Football Match Result Prediction

Overview

A machine learning classification project to predict Premier League football match outcomes. Eight seasons of match data were combined and cleaned, then six classifiers were compared — Random Forest, K-Nearest Neighbours, SVM, AdaBoost, MLP (neural network), and Gaussian Naive Bayes. The target variable was the full-time result: Home Win (H), Draw (D), or Away Win (A).

Original Script (2021)

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load 8 seasons of Premier League data
dfs = [pd.read_csv(f"raw_data_{i}.csv") for i in [1,2,4,5,6,7,8,9]]

# Parse dates
for df in dfs:
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Concatenate all seasons
data = pd.concat(dfs, ignore_index=True)

# Feature selection
features = ['FTHG','FTAG','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR']
target = 'FTR'  # Full Time Result: H/D/A

X = data[features].dropna()
y = data.loc[X.index, target]

# Encode target
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Evaluate all classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': svm.SVC(kernel='rbf', random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    'Naive Bayes': GaussianNB(),
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix for best model (Random Forest)
rf = classifiers['Random Forest']
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Random Forest Confusion Matrix')
plt.show()

What It Did

Combined 8 seasons of Premier League match data
Used in-game statistics (shots, fouls, cards, corners) as features
Compared 6 classifiers for 3-class (H/D/A) prediction
Evaluated with accuracy, classification report, and confusion matrix

Issues in the Original

Features included in-game stats (shots, fouls) — these are only known after kick-off, making pre-match prediction impossible
No cross-validation — single train/test split
No hyperparameter tuning
LabelEncoder used without consistent class ordering
pd.read_csv called in a loop with hardcoded filenames

Modern Rewrite

Download Script

Key improvements over the original:

Pre-match features only — the original used in-game stats (shots on target, fouls, corners) that are only known after kick-off, making it impossible to actually predict anything. The rewrite builds features from cumulative season state at kick-off time: points, goal difference, goals scored/conceded, and last-5-match form points
Proper temporal train/test split — training data is seasons up to 2018-19; test set is 2019-20 and 2020-21. Splitting by date prevents data leakage from future seasons into training
Stratified 5-fold cross-validation — all 6 classifiers evaluated with StratifiedKFold to account for class imbalance (home wins are more frequent than draws)
scikit-learn Pipeline — scaler + classifier wrapped in a pipeline so scaling is always fit on training data only; original fit the scaler on all data before splitting
Full H/D/A classification — original collapsed to H vs NH (home win or not); rewrite predicts all three outcomes
Feature importance plot — Random Forest importances saved as interactive Plotly bar chart
pathlib and portable paths throughout

"""
Football Match Result Prediction — Modern Rewrite (2024)
Original: FINAL_ATTEMPT.py (2021)

Dependencies (uv):
    uv add pandas scikit-learn plotly

Usage:
    uv run football_prediction_modern.py
"""

from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

DATA_DIR = Path(__file__).parent
OUT_DIR = DATA_DIR / "output"
SEASON_FILES = [DATA_DIR / f"raw_data_{i}.csv" for i in [1, 2, 4, 5, 6, 7, 8, 9]]
SEASON_LABELS = ["2012-13", "2013-14", "2015-16", "2016-17",
                 "2017-18", "2018-19", "2019-20", "2020-21"]
MIN_MATCHWEEK = 5

FEATURE_COLS = [
    "HTP", "ATP", "HTGD", "ATGD", "HTGS", "ATGS",
    "HTGC", "ATGC", "HTFormPts", "ATFormPts", "DiffPts", "DiffFormPts",
]

CLASSIFIERS = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "SVM (RBF)": SVC(kernel="rbf", random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=200, random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42),
    "Naive Bayes": GaussianNB(),
}


def compute_form_points(results: list[str]) -> int:
    pts = {"W": 3, "D": 1, "L": 0}
    return sum(pts.get(r, 0) for r in results)


def engineer_features(data: pd.DataFrame) -> pd.DataFrame:
    """Build pre-match features per season using only information available before kick-off."""
    all_rows = []
    for season, season_df in data.groupby("Season"):
        season_df = season_df.sort_values("Date").reset_index(drop=True)
        teams = season_df["HomeTeam"].unique()
        points = {t: 0 for t in teams}
        goals_scored = {t: 0 for t in teams}
        goals_conceded = {t: 0 for t in teams}
        results_history: dict[str, list[str]] = {t: [] for t in teams}

        for _, row in season_df.iterrows():
            ht, at, ftr = row["HomeTeam"], row["AwayTeam"], row["FTR"]
            ht_form = compute_form_points(results_history[ht][-5:])
            at_form = compute_form_points(results_history[at][-5:])

            all_rows.append({
                "Season": season, "Date": row["Date"],
                "HomeTeam": ht, "AwayTeam": at, "FTR": ftr,
                "HTP": points[ht], "ATP": points[at],
                "HTGD": goals_scored[ht] - goals_conceded[ht],
                "ATGD": goals_scored[at] - goals_conceded[at],
                "HTGS": goals_scored[ht], "ATGS": goals_scored[at],
                "HTGC": goals_conceded[ht], "ATGC": goals_conceded[at],
                "HTFormPts": ht_form, "ATFormPts": at_form,
                "DiffPts": points[ht] - points[at],
                "DiffFormPts": ht_form - at_form,
                "MatchesPlayed": len(results_history[ht]),
            })

            hg, ag = int(row["FTHG"]), int(row["FTAG"])
            goals_scored[ht] += hg; goals_scored[at] += ag
            goals_conceded[ht] += ag; goals_conceded[at] += hg
            if ftr == "H":
                points[ht] += 3; results_history[ht].append("W"); results_history[at].append("L")
            elif ftr == "A":
                points[at] += 3; results_history[ht].append("L"); results_history[at].append("W")
            else:
                points[ht] += 1; points[at] += 1
                results_history[ht].append("D"); results_history[at].append("D")

    return pd.DataFrame(all_rows)


def main() -> None:
    OUT_DIR.mkdir(exist_ok=True)
    dfs = []
    for path, label in zip(SEASON_FILES, SEASON_LABELS):
        df = pd.read_csv(path)
        df["Season"] = label
        df["Date"] = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")
        dfs.append(df)
    data = pd.concat(dfs)[["Date", "Season", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR"]].dropna()

    features_df = engineer_features(data)
    features_df = features_df[features_df["MatchesPlayed"] >= MIN_MATCHWEEK]

    le = LabelEncoder()
    y = le.fit_transform(features_df["FTR"])
    X = features_df[FEATURE_COLS].values

    split_date = pd.Timestamp("2019-08-01")
    train_mask = features_df["Date"] < split_date
    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[~train_mask], y[~train_mask]

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    records = []
    for name, clf in CLASSIFIERS.items():
        pipe = Pipeline([("scaler", StandardScaler()), ("clf", clf)])
        scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring="accuracy")
        records.append({"Classifier": name, "CV Mean Acc": scores.mean(), "CV Std": scores.std()})

    cv_df = pd.DataFrame(records).sort_values("CV Mean Acc", ascending=False)
    fig = px.bar(cv_df, x="Classifier", y="CV Mean Acc", error_y="CV Std",
                 title="5-Fold CV Accuracy — Premier League Result Prediction",
                 template="plotly_dark")
    fig.write_html(OUT_DIR / "cv_results.html")

    pipe = Pipeline([("scaler", StandardScaler()),
                     ("clf", RandomForestClassifier(n_estimators=200, random_state=42))])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred, target_names=le.classes_))


if __name__ == "__main__":
    main()