-
Notifications
You must be signed in to change notification settings - Fork 94
/
collectdata.py
125 lines (100 loc) · 3.48 KB
/
collectdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# coding: utf-8
# In[6]:
import time
from urllib2 import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from reverselines import reverse_lines
# URL templates of the results pages to scrape.  Each entry contains one '{}'
# placeholder; generate_data() fills it with a page number via str.format().
url = ['http://www.oddsportal.com/baseball/usa/mlb-2015/results//#/page/{}/']
# The integers 1 through 50 (the upper bound of range() is exclusive).
# Presumably these are the result-page numbers to visit -- confirm against
# the script entry point at the bottom of the file.
num_pages = range(1, 51)
def generate_data(page, f, url):
    """Scrape one results page and write one line per game row to ``f``.

    The page is loaded twice: once with ``urlopen`` (parsed with
    BeautifulSoup for the participant and score cells) and once in a Chrome
    WebDriver session (for the text of the ``odds_nowrp`` elements).

    Every written line has the form::

        <span team> > <other team> |H ^<odds> <odds> & <score>

    ``|H`` is written when the other team is the second string of the
    participant link, ``|A`` otherwise; in the ``|A`` case the two odds are
    swapped.  Rows whose participant cell has no usable ``<span>`` are
    skipped without writing anything.

    Args:
        page: Value substituted into ``url`` with ``str.format``.
        f: Writable file-like object that receives the output.
        url: URL template containing one ``{}`` placeholder.
    """
    page_url = url.format(page)

    # Static HTML: source of the participant and score cells.  Close the
    # response explicitly so the connection is released even if read() fails.
    response = urlopen(page_url)
    try:
        html = response.read()
    finally:
        response.close()
    pg = BeautifulSoup(html)

    # Rendered page: source of the odds text.  The browser is always shut
    # down, even when loading or querying the page raises, so that no Chrome
    # process is left behind.
    # NOTE(review): this is a second, independent fetch of the same URL; the
    # rows and the odds only line up if both fetches return the same games.
    browser = webdriver.Chrome()
    try:
        browser.get(page_url)
        odds = [element.text
                for element in browser.find_elements_by_class_name('odds_nowrp')]
    finally:
        browser.quit()
    oddsindex = 0  # index in ``odds`` of the current row's first odds value

    data = pg.find_all('td', {'class': 'name table-participant'})
    scores = pg.find_all('td', {'class': 'center bold table-odds'})
    for t, score in zip(data, scores):
        try:
            span_name = t.span.string.strip()
            span_text = fix_str(t.span.string)
        except AttributeError:  # Rare case where neither is bolded (ex: game cancelled)
            # NOTE(review): oddsindex is not advanced for a skipped row; if
            # such rows still have odds cells, later rows would be
            # misaligned -- verify against a page with a cancelled game.
            continue
        f.write(span_text)
        f.write(' > ')

        # ``order`` is the position of the string inside the participant
        # link; the first string that is not the span team is the other team.
        for order, s in enumerate(t.a.stripped_strings):
            if s.strip() == span_name:
                continue
            f.write(fix_str(s))
            if order == 1:  # the other team is on the right side
                f.write(' |H')
                f.write(' ^{} {}'.format(fix_odds(odds[oddsindex]),
                                         fix_odds(odds[oddsindex + 1])))
            else:  # the other team is on the left side
                f.write(' |A')
                f.write(' ^{} {}'.format(fix_odds(odds[oddsindex + 1]),
                                         fix_odds(odds[oddsindex])))
            oddsindex += 2
            break

        f.write(' & ')
        f.write(fix_str(score.string))
        f.write('\n')
def fix_odds(s):
    """Convert an odds string to decimal odds.

    Recognised formats, checked in this order:

    * contains ``+`` -- ``+150`` becomes ``'2.5'`` (value / 100 + 1)
    * contains ``-`` -- ``-200`` becomes ``'1.5'`` ((value + 100) / value)
    * contains ``/`` -- ``5/2`` becomes ``'3.5'`` (numerator / denominator + 1)
    * contains ``.`` -- returned as is (with surrounding whitespace removed)

    For the ``+`` and ``-`` forms the first character is taken to be the sign
    and everything after it the number.

    Args:
        s: Odds text.  Leading and trailing whitespace is ignored.

    Returns:
        The decimal odds as a string.  When the text matches none of the
        formats (it is then also printed) or cannot be parsed, the float
        ``1.0`` is returned instead; that fallback type is kept for backward
        compatibility.
    """
    try:
        # Strip first: the sign-based branches slice off the first character,
        # so a leading space would otherwise leave the sign inside the number.
        s = s.strip()
        if '+' in s:
            return str(float(s[1:]) / 100 + 1)
        elif '-' in s:
            return str((float(s[1:]) + 100) / float(s[1:]))
        elif '/' in s:
            slash = s.find('/')
            return str(float(s[:slash]) / float(s[slash + 1:]) + 1.0)
        elif '.' in s:
            return s
        else:
            # Unknown format: show it so it can be noticed, then fall back.
            print(s)
            return 1.0
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        # Non-string input, unparsable number, or a zero denominator.
        return 1.0
def fix_str(s):
    """Normalise a scraped string and return it encoded as UTF-8.

    Text containing ``:`` is treated as a score: anything from the first
    ``OT`` onwards is dropped, the colon-separated numbers are sorted from
    highest to lowest and joined with single spaces.  Any other text has
    every ``'- '`` and then every remaining ``'-'`` removed.

    Args:
        s: The text to clean up; surrounding whitespace is stripped first.

    Returns:
        The cleaned text, encoded with ``UTF-8``.
    """
    text = s.strip()

    if ':' not in text:
        # Not a score: just remove the dash separators.
        return text.replace('- ', '').replace('-', '').encode('UTF-8')

    if 'OT' in text:
        # Used for basketball model
        text = text[:text.index('OT')].strip()

    points = sorted((int(part) for part in text.split(':')), reverse=True)
    return ' '.join(str(value) for value in points).encode('UTF-8')
def readcommand():
    """Parse the command line and return the output file name.

    The file name is given with ``-f``, ``-o`` or ``--file``.  ``-o`` is
    accepted because the usage text used to advertise it although only
    ``-f``/``--file`` was registered.  Without any of these options a
    timestamped default name of the form ``out<mm_dd_yy_HH_MM_SS>.txt`` is
    returned.

    Returns:
        The output file name as a string.
    """
    # Processes the command used to output to a different file
    import sys
    from optparse import OptionParser

    argv = sys.argv[1:]
    usage_str = """
    USAGE: python collectdata.py -f output.txt
    """
    parser = OptionParser(usage_str)
    default_fn = 'out' + time.strftime('%m_%d_%y_%H_%M_%S', time.localtime()) + '.txt'
    parser.add_option('-f', '-o', '--file', dest='fn',
                      help='the output file (default is timestamped)', metavar='FILE',
                      default=default_fn)
    options, otherjunk = parser.parse_args(argv)
    return options.fn
if __name__ == '__main__':
    # Scrape every configured page into the scratch file 'temp', then pass
    # that file and the requested output file to reverse_lines().
    fn = readcommand()

    with open('temp', 'w+') as f:
        # Visit every page number for every URL template.  Pairing the two
        # sequences with zip() would stop after the first URL/page pair and
        # scrape nothing, and reusing ``url`` as the loop variable would
        # overwrite the module-level list.
        for page_url in url:
            for i in num_pages:
                print(i)  # progress indicator
                generate_data(i, f, page_url)

    # 'a' appends, so results of earlier runs in the same file are kept.
    # Presumably reverse_lines() copies the lines in reverse order -- see the
    # reverselines module.
    with open('temp', 'r') as f:
        with open(fn, 'a') as fout:
            reverse_lines(f, fout)
# In[ ]: