Research entry
Football Match Result Prediction
2021 · Academic Archive
Multi-class classification of Premier League match results (Home Win / Draw / Away Win) using Random Forest, KNN, SVM, AdaBoost, MLP, and Naive Bayes classifiers on 8 seasons of match data.
Python Machine Learning scikit-learn Random Forest Classification
Overview
A machine learning classification project to predict Premier League football match outcomes. Eight seasons of match data were combined and cleaned, then six classifiers were compared — Random Forest, K-Nearest Neighbours, SVM, AdaBoost, MLP (neural network), and Gaussian Naive Bayes. The target variable was the full-time result: Home Win (H), Draw (D), or Away Win (A).
Original Script (2021)
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
# Load 8 seasons of Premier League data
# NOTE(review): filenames are hardcoded and index 3 is skipped — presumably a
# missing season file; confirm against the data directory.
dfs = [pd.read_csv(f"raw_data_{i}.csv") for i in [1,2,4,5,6,7,8,9]]
# Parse dates
for df in dfs:
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
# Concatenate all seasons
data = pd.concat(dfs, ignore_index=True)
# Feature selection
# NOTE(review): FTHG/FTAG are the full-time goal counts, which determine the
# full-time result (FTR) exactly — including them leaks the target into the
# features. The remaining columns are in-game statistics, also unknown before
# kick-off, so this setup cannot make genuine pre-match predictions.
features = ['FTHG','FTAG','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR']
target = 'FTR' # Full Time Result: H/D/A
X = data[features].dropna()
y = data.loc[X.index, target]
# Encode target
# NOTE(review): mid-script import. LabelEncoder sorts classes alphabetically
# (A/D/H here); that ordering is relied on below via le.classes_.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
# Train/test split
# NOTE(review): one random 80/20 split — no cross-validation, and no temporal
# ordering, so later matches can land in the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale features
# Scaler is fit on the training split only, then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Evaluate all classifiers
# All six models share the same scaled feature matrix and fixed random_state.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': svm.SVC(kernel='rbf', random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    'Naive Bayes': GaussianNB(),
}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
# Confusion matrix for best model (Random Forest)
# NOTE(review): "best" is asserted, not selected programmatically.
rf = classifiers['Random Forest']
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Random Forest Confusion Matrix')
plt.show()
What It Did
- Combined 8 seasons of Premier League match data
- Used in-game statistics (shots, fouls, cards, corners) as features
- Compared 6 classifiers for 3-class (H/D/A) prediction
- Evaluated with accuracy, classification report, and confusion matrix
Issues in the Original
- Features included in-game stats (shots, fouls) — these are only known after kick-off, making pre-match prediction impossible
- No cross-validation — single train/test split
- No hyperparameter tuning
- LabelEncoder used without consistent class ordering
- `pd.read_csv` called in a loop with hardcoded filenames
Modern Rewrite
Key improvements over the original:
- Pre-match features only — the original used in-game stats (shots on target, fouls, corners) that are only known after kick-off, making it impossible to actually predict anything. The rewrite builds features from cumulative season state at kick-off time: points, goal difference, goals scored/conceded, and last-5-match form points
- Proper temporal train/test split — training data is seasons up to 2018-19; test set is 2019-20 and 2020-21. Splitting by date prevents data leakage from future seasons into training
- Stratified 5-fold cross-validation — all 6 classifiers evaluated with `StratifiedKFold` to account for class imbalance (home wins are more frequent than draws)
- scikit-learn `Pipeline` — scaler + classifier wrapped in a pipeline so scaling is always fit on training data only; original fit the scaler on all data before splitting
- Full H/D/A classification — original collapsed to H vs NH (home win or not); rewrite predicts all three outcomes
- Feature importance plot — Random Forest importances saved as interactive Plotly bar chart
- `pathlib` and portable paths throughout
"""
Football Match Result Prediction — Modern Rewrite (2024)
Original: FINAL_ATTEMPT.py (2021)
Dependencies (uv):
uv add pandas scikit-learn plotly
Usage:
uv run football_prediction_modern.py
"""
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
# Data files live alongside this script; generated artefacts go to ./output.
DATA_DIR = Path(__file__).parent
OUT_DIR = DATA_DIR / "output"
# Season CSVs (index 3 absent) and their labels, aligned one-to-one.
# 2014-15 is missing from the labels — presumably matching the missing
# raw_data_3.csv; TODO confirm.
SEASON_FILES = [DATA_DIR / f"raw_data_{i}.csv" for i in [1, 2, 4, 5, 6, 7, 8, 9]]
SEASON_LABELS = ["2012-13", "2013-14", "2015-16", "2016-17",
                 "2017-18", "2018-19", "2019-20", "2020-21"]
# Matches are only kept once the home side has this many completed games,
# so the cumulative/form features below are meaningful (see main()).
MIN_MATCHWEEK = 5
# Pre-match features produced by engineer_features():
#   HTP / ATP              home/away cumulative season points
#   HTGD / ATGD            goal difference (scored - conceded)
#   HTGS / ATGS            goals scored so far
#   HTGC / ATGC            goals conceded so far
#   HTFormPts / ATFormPts  points earned over the last 5 results
#   DiffPts / DiffFormPts  home-minus-away differentials
FEATURE_COLS = [
    "HTP", "ATP", "HTGD", "ATGD", "HTGS", "ATGS",
    "HTGC", "ATGC", "HTFormPts", "ATFormPts", "DiffPts", "DiffFormPts",
]
# Fixed random_state throughout so model comparisons are reproducible.
CLASSIFIERS = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "SVM (RBF)": SVC(kernel="rbf", random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=200, random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42),
    "Naive Bayes": GaussianNB(),
}
def compute_form_points(results: list[str]) -> int:
    """Translate a sequence of result codes (W/D/L) into total points earned.

    Wins are worth 3, draws 1; losses and any unrecognised code contribute 0.
    """
    return sum({"W": 3, "D": 1}.get(outcome, 0) for outcome in results)
def engineer_features(data: pd.DataFrame) -> pd.DataFrame:
    """Build pre-match features per season using only information available before kick-off.

    Parameters
    ----------
    data:
        Match rows with columns Season, Date, HomeTeam, AwayTeam, FTHG, FTAG, FTR.

    Returns
    -------
    One row per match holding both sides' season state at kick-off time:
    cumulative points (HTP/ATP), goal difference (HTGD/ATGD), goals
    scored/conceded (HTGS/ATGS/HTGC/ATGC), last-5-result form points,
    home-minus-away differentials, and MatchesPlayed for the home side.
    """
    form_value = {"W": 3, "D": 1, "L": 0}
    all_rows = []
    for season, season_df in data.groupby("Season"):
        season_df = season_df.sort_values("Date").reset_index(drop=True)
        # Fix: take the union of both columns. A team whose first (or only)
        # appearances are as the away side never shows up in HomeTeam, and the
        # original home-only list raised KeyError on such teams.
        teams = set(season_df["HomeTeam"]) | set(season_df["AwayTeam"])
        points = {t: 0 for t in teams}
        goals_scored = {t: 0 for t in teams}
        goals_conceded = {t: 0 for t in teams}
        results_history: dict[str, list[str]] = {t: [] for t in teams}
        for _, row in season_df.iterrows():
            ht, at, ftr = row["HomeTeam"], row["AwayTeam"], row["FTR"]
            # Form = points earned over each side's last five completed results.
            ht_form = sum(form_value.get(r, 0) for r in results_history[ht][-5:])
            at_form = sum(form_value.get(r, 0) for r in results_history[at][-5:])
            # Record the feature row BEFORE folding this match into the running
            # state, so every feature reflects only matches already completed.
            all_rows.append({
                "Season": season, "Date": row["Date"],
                "HomeTeam": ht, "AwayTeam": at, "FTR": ftr,
                "HTP": points[ht], "ATP": points[at],
                "HTGD": goals_scored[ht] - goals_conceded[ht],
                "ATGD": goals_scored[at] - goals_conceded[at],
                "HTGS": goals_scored[ht], "ATGS": goals_scored[at],
                "HTGC": goals_conceded[ht], "ATGC": goals_conceded[at],
                "HTFormPts": ht_form, "ATFormPts": at_form,
                "DiffPts": points[ht] - points[at],
                "DiffFormPts": ht_form - at_form,
                "MatchesPlayed": len(results_history[ht]),
            })
            # Update season state with this match's outcome.
            hg, ag = int(row["FTHG"]), int(row["FTAG"])
            goals_scored[ht] += hg
            goals_scored[at] += ag
            goals_conceded[ht] += ag
            goals_conceded[at] += hg
            if ftr == "H":
                points[ht] += 3
                results_history[ht].append("W")
                results_history[at].append("L")
            elif ftr == "A":
                points[at] += 3
                results_history[ht].append("L")
                results_history[at].append("W")
            else:
                points[ht] += 1
                points[at] += 1
                results_history[ht].append("D")
                results_history[at].append("D")
    return pd.DataFrame(all_rows)
def main() -> None:
    """Load all seasons, engineer pre-match features, cross-validate the
    classifier suite, and evaluate a Random Forest on the temporal hold-out."""
    OUT_DIR.mkdir(exist_ok=True)

    # Read each season file, tagging its rows before combining.
    season_frames = []
    for csv_path, season_label in zip(SEASON_FILES, SEASON_LABELS):
        frame = pd.read_csv(csv_path)
        frame["Season"] = season_label
        frame["Date"] = pd.to_datetime(frame["Date"], dayfirst=True, errors="coerce")
        season_frames.append(frame)
    keep = ["Date", "Season", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR"]
    data = pd.concat(season_frames)[keep].dropna()

    # Pre-match features; discard rows with too little season history.
    features_df = engineer_features(data)
    features_df = features_df[features_df["MatchesPlayed"] >= MIN_MATCHWEEK]

    result_encoder = LabelEncoder()
    target = result_encoder.fit_transform(features_df["FTR"])
    feature_matrix = features_df[FEATURE_COLS].values

    # Temporal split: train on everything before the 2019-20 season.
    is_train = features_df["Date"] < pd.Timestamp("2019-08-01")
    X_train, y_train = feature_matrix[is_train], target[is_train]
    X_test, y_test = feature_matrix[~is_train], target[~is_train]

    # Stratified 5-fold CV over the training window for every classifier;
    # scaler lives inside the pipeline so it is fit per fold on training data.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_rows = []
    for name, clf in CLASSIFIERS.items():
        model = Pipeline([("scaler", StandardScaler()), ("clf", clf)])
        scores = cross_val_score(model, X_train, y_train, cv=folds, scoring="accuracy")
        cv_rows.append({"Classifier": name, "CV Mean Acc": scores.mean(), "CV Std": scores.std()})
    cv_table = pd.DataFrame(cv_rows).sort_values("CV Mean Acc", ascending=False)

    chart = px.bar(cv_table, x="Classifier", y="CV Mean Acc", error_y="CV Std",
                   title="5-Fold CV Accuracy — Premier League Result Prediction",
                   template="plotly_dark")
    chart.write_html(OUT_DIR / "cv_results.html")

    # Fit the Random Forest pipeline on the full training window, then report
    # hold-out accuracy and the per-class breakdown.
    final_model = Pipeline([("scaler", StandardScaler()),
                            ("clf", RandomForestClassifier(n_estimators=200, random_state=42))])
    final_model.fit(X_train, y_train)
    predictions = final_model.predict(X_test)
    print(f"Test accuracy: {accuracy_score(y_test, predictions):.4f}")
    print(classification_report(y_test, predictions, target_names=result_encoder.classes_))


if __name__ == "__main__":
    main()