This repository has been archived by the owner on Jan 2, 2023. It is now read-only.
forked from AndrewCarterUK/football-predictor
-
Notifications
You must be signed in to change notification settings - Fork 4
/
dataset.py
98 lines (75 loc) · 3.66 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import csv
import datetime
from functools import reduce
class Dataset:
    """Football results read from a CSV file, plus per-match form features.

    The CSV must provide these columns: ``Date``, ``HomeTeam``, ``AwayTeam``,
    ``FTR`` (``'H'``, ``'D'`` or ``'A'``), ``FTHG``, ``FTAG``, ``HS``, ``AS``,
    ``HST``, ``AST``, ``B365H``, ``B365D`` and ``B365A``.

    Attributes:
        raw_results: Every CSV row as a dict, in file order, with ``Date``
            replaced by a ``datetime.datetime``.
        processed_results: One dict per match for which both teams already
            have enough earlier matches. Holds ``date``, ``result``, the
            three ``odds-*`` values and the ``home-*`` / ``away-*`` totals
            produced by ``get_statistics``.
    """

    # Date layouts accepted for the ``Date`` column, tried in this order.
    _DATE_FORMATS = ('%d/%m/%y', '%d/%m/%Y')

    # Keys (and key order) of the totals returned by ``get_statistics``.
    _STATISTIC_KEYS = (
        'wins',
        'draws',
        'losses',
        'goals',
        'opposition-goals',
        'shots',
        'shots-on-target',
        'opposition-shots',
        'opposition-shots-on-target',
    )

    def __init__(self, file_path):
        """Load ``file_path`` and build ``raw_results`` and ``processed_results``.

        Raises:
            ValueError: If a date, odds value or match statistic cannot be
                parsed.
            KeyError: If a required column is missing.
        """
        self.raw_results = []
        self.processed_results = []

        # The csv module asks for newline='' so that it, not the file object,
        # decides how newlines inside quoted fields are treated.
        with open(file_path, newline='') as stream:
            for row in csv.DictReader(stream):
                row['Date'] = self._parse_date(row['Date'])
                self.raw_results.append(row)

        for result in self.raw_results:
            home_statistics = self.get_statistics(result['HomeTeam'], result['Date'])
            if home_statistics is None:
                continue

            away_statistics = self.get_statistics(result['AwayTeam'], result['Date'])
            if away_statistics is None:
                continue

            processed_result = {
                'date': result['Date'],
                'result': result['FTR'],
                'odds-home': float(result['B365H']),
                'odds-draw': float(result['B365D']),
                'odds-away': float(result['B365A']),
            }

            for label, statistics in (('home', home_statistics), ('away', away_statistics)):
                for key, value in statistics.items():
                    processed_result[label + '-' + key] = value

            self.processed_results.append(processed_result)

    @classmethod
    def _parse_date(cls, text):
        """Parse a ``DD/MM/YY`` or ``DD/MM/YYYY`` string into a datetime."""
        for date_format in cls._DATE_FORMATS:
            try:
                return datetime.datetime.strptime(text, date_format)
            except ValueError:
                continue
        raise ValueError(
            'Unrecognised date {!r}; expected DD/MM/YY or DD/MM/YYYY'.format(text)
        )

    def filter(self, team, date):
        """Return the raw results involving ``team`` that are dated before ``date``.

        The order of ``raw_results`` is preserved.
        """
        return [
            result
            for result in self.raw_results
            if (result['HomeTeam'] == team or result['AwayTeam'] == team)
            and result['Date'] < date
        ]

    @staticmethod
    def _match_statistics(result, team):
        """Return one match's raw counts from the point of view of ``team``."""
        # Any team that is not the home side is treated as the away side;
        # callers only pass results that ``filter`` selected for ``team``.
        if result['HomeTeam'] == team:
            team_letter, opposition_letter = 'H', 'A'
        else:
            team_letter, opposition_letter = 'A', 'H'

        return {
            'wins': 1 if result['FTR'] == team_letter else 0,
            'draws': 1 if result['FTR'] == 'D' else 0,
            'losses': 1 if result['FTR'] == opposition_letter else 0,
            'goals': int(result['FT{}G'.format(team_letter)]),
            'opposition-goals': int(result['FT{}G'.format(opposition_letter)]),
            'shots': int(result['{}S'.format(team_letter)]),
            'shots-on-target': int(result['{}ST'.format(team_letter)]),
            'opposition-shots': int(result['{}S'.format(opposition_letter)]),
            'opposition-shots-on-target': int(result['{}ST'.format(opposition_letter)]),
        }

    def get_statistics(self, team, date, matches=10):
        """Sum ``team``'s raw counts over its last ``matches`` games before ``date``.

        "Last" means the final ``matches`` entries of ``filter(team, date)``,
        i.e. it follows the order of ``raw_results``.

        Returns:
            A dict of integer totals keyed by ``wins``, ``draws``, ``losses``,
            ``goals``, ``opposition-goals``, ``shots``, ``shots-on-target``,
            ``opposition-shots`` and ``opposition-shots-on-target``; or
            ``None`` when the team has fewer than ``matches`` earlier games.
            ``matches=0`` yields all-zero totals.

        Raises:
            ValueError: If ``matches`` is negative.
        """
        if matches < 0:
            raise ValueError('matches must be non-negative')

        recent_results = self.filter(team, date)
        if len(recent_results) < matches:
            return None

        # Slice from an explicit start index: ``recent_results[-matches:]``
        # would select the whole history when matches == 0, because -0 == 0.
        window = recent_results[len(recent_results) - matches:]

        totals = dict.fromkeys(self._STATISTIC_KEYS, 0)
        for result in window:
            for key, value in self._match_statistics(result, team).items():
                totals[key] += value
        return totals