-
Notifications
You must be signed in to change notification settings - Fork 0
/
webscrapU.py
145 lines (118 loc) · 5.46 KB
/
webscrapU.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from datetime import datetime as dt
import csv
import datetime
import logging
import os
import pandas as pd
import requests
import shutil
import sys
from bs4 import BeautifulSoup
# Endpoints of the irrigation-community web portal.
LOGIN_URL = "https://tarragon.gootem.com/autentica.html"  # credential POST target
LOGIN_URL_REF = "https://tarragon.gootem.com/login.html"  # sent as the Referer header when logging in
LOGOUT_URL = "https://tarragon.gootem.com/logout.html"
# Watering-history page template: {0} = meter id (idContador),
# {1} = month+year selector (note the literal 'F' prefix in "mes=F{1}").
BASE_URL = "https://tarragon.gootem.com/index.php?lang=en&page=historial_riego&idContador={0}&mes=F{1}"
# Login name -> single-letter owner code used in the 'user' column of contadores.tsv.
_USER_INITIALS = {
    "marcocerveraborja": "B",
    "marcozanonteofilo": "F",
    "isaac": "I",
}


def get_dataframe_filtered_by_user(dataframe, user):
    """Return the rows of *dataframe* owned by *user*, without the 'user' column.

    Parameters:
        dataframe: pandas DataFrame with a 'user' column of single-letter owner codes.
        user: login name; an unknown name maps to "" and therefore matches no rows.

    Exits the process with status 1 if the dataframe has no 'user' column.
    """
    # Dict lookup replaces the original if/elif chain; unknown users keep
    # the original behavior of filtering on "" (empty result).
    inicial = _USER_INITIALS.get(user, "")
    try:
        return dataframe.loc[dataframe['user'] == inicial].drop(['user'], axis=1)
    except KeyError:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; a missing 'user' column raises KeyError.
        logging.error("error al leer usuario")
        sys.exit(1)
# Seconds to wait on any HTTP call; the handlers below already expect
# requests.exceptions.Timeout, but the original never set a timeout and
# could hang forever.
REQUEST_TIMEOUT = 30


def _logged_request(session, method, url, **kwargs):
    """Perform an HTTP request through *session*, logging failures instead of raising.

    Returns the Response on success, or None if any request error occurred
    (HTTP status error, connection error, timeout, or other request failure).
    """
    try:
        result = session.request(method, url, timeout=REQUEST_TIMEOUT, **kwargs)
        result.raise_for_status()
        return result
    except requests.exceptions.HTTPError as errh:
        logging.exception("Error HTTP: {0}".format(errh.response.status_code))
    except requests.exceptions.ConnectionError:
        logging.exception("Error de conexión")
    except requests.exceptions.Timeout:
        logging.exception("Error de Timeout")  # en español Timeout queda feo
    except requests.exceptions.RequestException:
        logging.exception("Error")
    return None


def main():
    """Log in as each configured user, download every field's monthly watering
    history table, and write it to <partida>.csv; on the 1st of the month the
    previous CSV is archived under ./<year>/ first."""
    log_filename = "{}.log".format(os.path.basename(__file__).split('.')[0])
    logging.basicConfig(filename=log_filename, format='%(asctime)s %(message)s', datefmt='%d/%m/%Y %I:%M:%S %p',
                        filemode="w+", level=logging.DEBUG)
    logging.info("Empezando")
    try:
        # contadores.tsv: one row per field -> (owner code, field name, meter id, surface).
        contadores = pd.read_csv("contadores.tsv", sep="\t",
                                 header=None, names=['user', 'partida', 'contador', 'hanegadas'])
        # userpass.tsv: one row per login -> (user, password, display name).
        usuarios = pd.read_csv("userpass.tsv", sep="\t", header=None, names=['user', 'psswd', 'name'])
    except Exception:
        # Was a bare `except:`; Exception still covers missing/corrupt files
        # without swallowing KeyboardInterrupt/SystemExit.
        logging.error("Error leyendo ficheros de configuración")
        sys.exit(1)
    month = dt.now().strftime('%m')
    year = dt.now().strftime('%Y')
    # Year+month of the *previous* month, used to prefix archived copies.
    archive = (dt.utcnow().replace(day=1) - datetime.timedelta(days=1)).strftime("%Y%m")
    # Archive folder is ./<current year>; create it once up front.
    archive_folder_path = os.path.abspath(os.getcwd()) + os.path.sep + year
    if not os.path.isdir(archive_folder_path):
        os.mkdir(archive_folder_path)
    for usuario in usuarios.itertuples():
        session_requests = requests.Session()
        # Hit the login page first (the site appears to expect this GET
        # before the credential POST, e.g. to set session cookies).
        _logged_request(session_requests, "GET", LOGIN_URL)
        payload = {
            "usuario": usuario.user,
            "password": usuario.psswd
        }
        # Perform login.
        _logged_request(session_requests, "POST", LOGIN_URL, data=payload,
                        headers=dict(referer=LOGIN_URL_REF))
        # Fallback referer for the logout call: the original raised NameError
        # at logout when a user had zero fields, because download_data_url
        # was only assigned inside the loop below.
        download_data_url = LOGIN_URL_REF
        for campo in get_dataframe_filtered_by_user(contadores, usuario.user).itertuples():
            nombrecsv = campo.partida + ".csv"
            csv_working_file = '{0}{1}{2}'.format(os.path.abspath(os.getcwd()), os.path.sep, nombrecsv)
            # On the 1st of the month, archive last month's CSV before overwriting it.
            if dt.now().day == 1:
                shutil.copy2(csv_working_file,
                             '{0}{1}{2}{3}'.format(archive_folder_path, os.path.sep, archive, nombrecsv))
            download_data_url = BASE_URL.format(campo.contador, month + year)
            result = _logged_request(session_requests, "GET", download_data_url,
                                     headers=dict(referer=download_data_url))
            if result is None:
                # Download failed (already logged); skip this field instead of
                # crashing on the parse below.
                continue
            # The page contains a single table, so find() is enough.
            table = BeautifulSoup(result.content, 'lxml').find('table')
            if table is None:
                # No table usually means the login did not take; don't crash.
                logging.error("No se ha encontrado la tabla en la respuesta")
                continue
            # <th> cells become the CSV header, each <tr>'s <td> cells a row.
            table_headers = [header.text for header in table.find_all('th')]
            rows = [[val.text for val in row.find_all('td')] for row in table.find_all('tr')]
            # newline='' is required by the csv module (otherwise blank lines
            # appear between rows on Windows).
            with open(csv_working_file, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(table_headers)
                # The header <tr> has no <td>s and yields an empty list; skip it.
                writer.writerows(row for row in rows if row)
        # BUG FIX: the original called requests.get() here, outside the
        # session, so the logout request carried no session cookies and
        # never actually ended the session.
        _logged_request(session_requests, "GET", LOGOUT_URL, headers=dict(referer=download_data_url))
    logging.info("Terminando")


if __name__ == '__main__':
    main()