-
Notifications
You must be signed in to change notification settings - Fork 2
/
simulation_parameters_dixon.py
132 lines (94 loc) · 5.04 KB
/
simulation_parameters_dixon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
'''
This is the script that should be used to generate team ratings for an entire season.
It doesn't work during the season, as not every team has played each other.
'''
import numpy as np
import pandas as pd
from math import factorial as fact
from scipy.optimize import minimize
import datetime
def create_team_map(teams):
    """Assign a zero-based integer id to every distinct value in ``teams``.

    Returns a ``(mapping, count)`` tuple. ``mapping`` goes from each distinct
    value (in the sorted order produced by ``np.unique``) to its position in
    that order; ``count`` is the number of distinct values.
    """
    distinct_teams = np.unique(teams)
    team_ids = dict(zip(distinct_teams, range(len(distinct_teams))))
    return team_ids, len(distinct_teams)
def tau(x, y, lambdaa, mu, rho):
    """Return the low-score adjustment factor for the scoreline ``x``-``y``.

    Only the four scorelines 0-0, 0-1, 1-0 and 1-1 are adjusted:

        0-0 -> 1 - lambdaa * mu * rho
        0-1 -> 1 + lambdaa * rho
        1-0 -> 1 + mu * rho
        1-1 -> 1 - rho

    Every other scoreline gets a factor of 1.
    """
    # Anything outside the 2x2 grid of low scores is left unadjusted.
    if x not in (0, 1) or y not in (0, 1):
        return 1

    if x == 0:
        return 1 - (lambdaa * mu * rho) if y == 0 else 1 + (lambdaa * rho)
    return 1 + (mu * rho) if y == 0 else 1 - rho
def log_likelihood_function(theta, data):
    """Return the time-weighted log-likelihood of the matches in ``data``.

    Parameters
    ----------
    theta : sequence of float
        Flat parameter vector made of four equally sized blocks, in this
        order: home attack, home defence, away attack, away defence (the
        layout built by ``get_parameters``). The number of teams is taken
        to be ``len(theta) // 4``.
    data : pandas.DataFrame
        Must provide the columns ``HomeTeamNumber`` and ``AwayTeamNumber``
        (zero-based team ids), ``HomeMeasure``, ``AwayMeasure`` and ``Date``.

    Returns
    -------
    float
        Sum over all matches of

            w * (log(tau) - lam + x*log(lam) - log(round(x)!)
                          - mu  + y*log(mu)  - log(round(y)!))

        where ``x``/``y`` are the home/away measures,
        ``lam = home_attack[home] * away_defence[away]``,
        ``mu = home_defence[home] * away_attack[away]``,
        ``tau`` is the same piecewise low-score factor as ``tau()`` in this
        file, and ``w = exp(-0.002 * days)`` with ``days`` the gap between the
        match and the most recent match in ``data``.
    """
    # log(n!) == gammaln(n + 1). This avoids math.factorial, which rejects
    # float arguments (such as the result of np.round) on Python 3.10+.
    from scipy.special import gammaln

    rho = 0.15
    time_decay = 0.002

    theta = np.asarray(theta, dtype=float)
    # Derive the league size from theta instead of assuming 24 teams.
    no_of_teams = len(theta) // 4

    home = data['HomeTeamNumber'].to_numpy(dtype=int)
    away = data['AwayTeamNumber'].to_numpy(dtype=int)
    home_measure = data['HomeMeasure'].to_numpy(dtype=float)
    away_measure = data['AwayMeasure'].to_numpy(dtype=float)

    # Older matches are exponentially down-weighted relative to the latest one.
    dates = pd.to_datetime(data['Date'])
    days = (dates.max() - dates).dt.days.to_numpy()
    weights = np.exp(-time_decay * days)

    lam = theta[home] * theta[away + 3 * no_of_teams]
    mu = theta[home + no_of_teams] * theta[away + 2 * no_of_teams]

    # Vectorised form of the piecewise low-score factor (0-0, 0-1, 1-0, 1-1).
    home_zero, home_one = home_measure == 0, home_measure == 1
    away_zero, away_one = away_measure == 0, away_measure == 1
    tau_values = np.select(
        [
            home_zero & away_zero,
            home_zero & away_one,
            home_one & away_zero,
            home_one & away_one,
        ],
        [
            1 - lam * mu * rho,
            1 + lam * rho,
            1 + mu * rho,
            np.full_like(lam, 1 - rho),
        ],
        default=1.0,
    )

    per_match = (
        np.log(tau_values)
        - lam + home_measure * np.log(lam) - gammaln(np.round(home_measure) + 1)
        - mu + away_measure * np.log(mu) - gammaln(np.round(away_measure) + 1)
    )
    return np.sum(weights * per_match)
_DEFAULT_RESULTS_CSV = '/Users/BradleyGrantham/Documents/Python/FootballPredictions/xG model/Football-data.co.uk/E1/15-16.csv'


def read_in_data(path=_DEFAULT_RESULTS_CSV):
    """Load a results CSV and add the columns used by the rating model.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. It must contain the columns ``HomeTeam``,
        ``AwayTeam``, ``FTHG``, ``FTAG`` and ``Date`` (``dd/mm/yy``).
        Defaults to the path this script has always read.

    Returns
    -------
    tuple
        ``(data, team_map, no_of_teams)`` where ``data`` is the loaded frame
        with ``HomeTeamNumber``, ``AwayTeamNumber``, ``HomeMeasure`` and
        ``AwayMeasure`` added and ``Date`` parsed to datetimes, ``team_map``
        maps team name to integer id, and ``no_of_teams`` is the number of
        distinct home teams.

    Raises
    ------
    ValueError
        If a match involves a team that is missing from ``team_map`` (for
        example a team that appears only as an away side).
    """
    data = pd.read_csv(path)
    team_map, no_of_teams = create_team_map(data['HomeTeam'])

    # Build the id columns with map() and assign them explicitly. Calling
    # ``data[col].replace(..., inplace=True)`` modifies a temporary Series,
    # which is not written back to ``data`` under pandas Copy-on-Write.
    home_numbers = data['HomeTeam'].map(team_map)
    away_numbers = data['AwayTeam'].map(team_map)
    unmapped = home_numbers.isna() | away_numbers.isna()
    if unmapped.any():
        raise ValueError(
            'Found %d match(es) involving a team with no entry in the team map'
            % int(unmapped.sum())
        )
    data['HomeTeamNumber'] = home_numbers.astype(int)
    data['AwayTeamNumber'] = away_numbers.astype(int)

    # The measures are currently the full-time goals only.
    data['HomeMeasure'] = 1.0 * data['FTHG']
    data['AwayMeasure'] = 1.0 * data['FTAG']

    data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%y')
    return data, team_map, no_of_teams
def initial_parameter_estimates(data, no_of_teams):
    """Compute starting values for the four per-team rating vectors.

    Each rating is a team's total of one measure in its home (or away)
    matches, divided by the square root of the league-wide total of that
    measure:

    * home attack  - home measure in the team's home matches
    * home defence - away measure in the team's home matches
    * away attack  - away measure in the team's away matches
    * away defence - home measure in the team's away matches

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``HomeTeamNumber``, ``AwayTeamNumber``,
        ``HomeMeasure`` and ``AwayMeasure``.
    no_of_teams : int
        Number of teams; team ids are expected to be ``0 .. no_of_teams - 1``.

    Returns
    -------
    tuple of list
        ``(home_attack_rating, home_defense_rating, away_attack_rating,
        away_defense_rating)``, each of length ``no_of_teams`` and indexed by
        team id.
    """
    # Iterate over the ids directly rather than reading a module-level
    # ``team_map``: the ids are 0..no_of_teams-1, and this keeps the function
    # usable without any global state.
    team_ids = range(no_of_teams)

    def scaled_totals(team_column, measure_column, scale):
        # Per-team total of ``measure_column`` over the rows where the team
        # appears in ``team_column``, divided by the shared scale.
        return [
            data.loc[data[team_column] == team, measure_column].sum() / scale
            for team in team_ids
        ]

    home_scale = np.sqrt(data['HomeMeasure'].sum())
    away_scale = np.sqrt(data['AwayMeasure'].sum())

    home_attack_rating = scaled_totals('HomeTeamNumber', 'HomeMeasure', home_scale)
    away_defense_rating = scaled_totals('AwayTeamNumber', 'HomeMeasure', home_scale)
    away_attack_rating = scaled_totals('AwayTeamNumber', 'AwayMeasure', away_scale)
    home_defense_rating = scaled_totals('HomeTeamNumber', 'AwayMeasure', away_scale)

    return home_attack_rating, home_defense_rating, away_attack_rating, away_defense_rating
def get_parameters(data, team_map, home_attack_rating, home_defense_rating, away_attack_rating, away_defense_rating, no_of_teams):
    """Fit the team ratings by maximising the log-likelihood.

    The four initial rating sequences are concatenated (home attack, home
    defence, away attack, away defence) into one starting vector, which is
    handed to ``scipy.optimize.minimize`` together with the negated
    ``log_likelihood_function``.

    Returns a ``(message, dataframe)`` tuple: the optimiser's status message
    and a DataFrame with the columns ``HomeAttack``, ``HomeDefense``,
    ``AwayAttack`` and ``AwayDefense``, indexed by the keys of ``team_map``.
    """
    starting_point = [
        *home_attack_rating,
        *home_defense_rating,
        *away_attack_rating,
        *away_defense_rating,
    ]

    def negative_log_likelihood(*args):
        # minimize() searches for a minimum, so flip the sign of the likelihood.
        return -log_likelihood_function(*args)

    fit = minimize(negative_log_likelihood, starting_point, args=(data,), tol=0.01)
    fitted = fit["x"]

    # Cut the flat solution back into one chunk per rating type.
    chunks = [
        fitted[start:start + no_of_teams]
        for start in range(0, len(fitted), no_of_teams)
    ]
    home_attack, home_defense, away_attack, away_defense = chunks

    ratings = pd.DataFrame(
        {
            'HomeAttack': home_attack,
            'HomeDefense': home_defense,
            'AwayAttack': away_attack,
            'AwayDefense': away_defense,
        },
        index=list(team_map),
    )
    return fit['message'], ratings
# Driver: load the season, build starting ratings, fit them, then report the
# elapsed time, write the ratings to CSV and print the optimiser's message.
now = datetime.datetime.now()

data, team_map, no_of_teams = read_in_data()

(
    home_attack_rating,
    home_defense_rating,
    away_attack_rating,
    away_defense_rating,
) = initial_parameter_estimates(data, no_of_teams)

message, results = get_parameters(
    data=data,
    team_map=team_map,
    home_attack_rating=home_attack_rating,
    home_defense_rating=home_defense_rating,
    away_attack_rating=away_attack_rating,
    away_defense_rating=away_defense_rating,
    no_of_teams=no_of_teams,
)

elapsed = datetime.datetime.now() - now
print(elapsed)

results.to_csv('./Team ratings/E1/teamratings_15-16_testing.csv')
print(message)