-
Notifications
You must be signed in to change notification settings - Fork 3
/
run.py
120 lines (98 loc) · 4.14 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
import os
import csv
import json
import hashlib
from urllib.request import urlopen
import requests
from datetime import datetime
from dateutil import relativedelta
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor,as_completed
import zipfile
def get_time_now():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    current = datetime.now()
    return current.strftime('%Y-%m-%d %H:%M:%S')
def get_last_month():
    """Return the current local datetime moved back by one month."""
    one_month = relativedelta.relativedelta(months=1)
    return datetime.today() - one_month
# Names given to the first eight CSV columns, in file order.  They are also
# the keys of every exported JSON object.
_BF_FIELDS = (
    'ms_referncia',
    'ms_competncia',
    'uf',
    'cdigo_municpio_siafi',
    'nome_municpio',
    'nis_favorecido',
    'nome_favorecido',
    'valor_parcela',
)

# Fields whose values are concatenated (no separator) and hashed to build the
# name of each JSON file.
_BF_HASH_FIELDS = (
    'ms_referncia',
    'ms_competncia',
    'uf',
    'cdigo_municpio_siafi',
    'nis_favorecido',
    'valor_parcela',
)


def _fetch_zip(url, zip_path):
    """Stream the archive at *url* into *zip_path* with a progress bar."""
    # Download to a side file first so that an interrupted transfer never
    # leaves a truncated archive that a later run would mistake for complete.
    partial_path = f'{zip_path}.part'
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # Read the size from the response already in hand instead of opening
        # a second connection; None tells tqdm the total is unknown.
        length = r.headers.get('Content-Length')
        total = int(length) if length is not None else None
        with open(partial_path, 'wb') as zip_out, tqdm(total=total) as pbar:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    zip_out.write(chunk)
                    # Chunks can be shorter than the requested size.
                    pbar.update(len(chunk))
    os.replace(partial_path, zip_path)


def _export_rows(csv_path, out_dir):
    """Write one JSON file into *out_dir* for every data row of *csv_path*."""
    os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): errors='ignore' silently drops every byte sequence that is
    # not valid UTF-8.  The accent-less key names above suggest the source may
    # use a different encoding - confirm before changing it.
    with open(csv_path, encoding='utf-8', errors='ignore',
              newline='') as csv_in:
        reader = csv.reader(csv_in, delimiter=';')
        next(reader, None)  # the first row is the header
        for row in reader:
            if not row:
                continue  # csv yields an empty list for a blank line
            if len(row) < len(_BF_FIELDS):
                raise IndexError(
                    f'line {reader.line_num}: expected at least '
                    f'{len(_BF_FIELDS)} columns, got {len(row)}'
                )
            record = dict(zip(_BF_FIELDS, row))

            # hashing json file name
            key = ''.join(record[name] for name in _BF_HASH_FIELDS)
            digest = hashlib.sha224(key.encode('utf-8')).hexdigest()
            file_json = os.path.join(out_dir, f'{digest.lower()}.json')

            # save file
            with open(file_json, mode='w', encoding='utf-8') as json_out:
                json_out.write(json.dumps(record, indent=4))
            nome_favorecido = record['nome_favorecido']
            print(f'{get_time_now()} [ Ok ] {digest} {nome_favorecido}')


def download_file(url):
    """Download one monthly payments archive and export its rows as JSON.

    The last path segment of *url* is used to build the archive and CSV file
    names.  Both live in the ``bolsafamilia`` directory located next to the
    parent directory of this script; the JSON files go into its ``data``
    sub-directory.  The download is skipped when the archive already exists
    and the extraction is skipped when the CSV already exists.

    Args:
        url: Address of the zip archive; its last ``/``-separated segment
            becomes the file-name prefix.

    Returns:
        The CSV file name (without directory).

    Raises:
        IndexError: If a non-blank CSV row has fewer than eight columns.
        Exception: Whatever ``requests``, ``zipfile`` or the file system
            raise while downloading, extracting or writing.
    """
    file_name = f"{url.split('/')[-1]}_BolsaFamilia_Pagamentos"
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    bsafml_root = os.path.join(base_dir, "bolsafamilia")
    zip_name = f'{file_name}.zip'
    csv_name = f'{file_name}.csv'
    zip_path = os.path.join(bsafml_root, zip_name)
    csv_path = os.path.join(bsafml_root, csv_name)

    os.makedirs(bsafml_root, exist_ok=True)

    # download big file if the zip does not exist; it is stored where the
    # extraction step below looks for it, not in the working directory
    if not os.path.isfile(zip_path):
        _fetch_zip(url, zip_path)

    # extract if the csv does not exist
    if not os.path.isfile(csv_path):
        with zipfile.ZipFile(zip_path, 'r') as fzip:
            fzip.extractall(bsafml_root)

    # reading data
    _export_rows(csv_path, os.path.join(bsafml_root, 'data'))
    return csv_name
#===============================================================================
if __name__ == '__main__':
    # Address of the payments download area, assembled in two steps.
    URL_BOLSA_FAMI_PAGTOS = 'http://transparencia.gov.br/download-de-dados'
    URL_BOLSA_FAMI_PAGTOS = f'{URL_BOLSA_FAMI_PAGTOS}/bolsa-familia-pagamentos'

    quantityOfMonths = 1  # just one

    # Reference months as YYYYMM strings: last month first, then further back.
    monthList = []
    for offset in range(quantityOfMonths):
        reference = get_last_month() - relativedelta.relativedelta(
            months=offset)
        monthList.append(reference.strftime('%Y%m'))

    # One pool task per reference month; outcomes are handled as they finish.
    with ThreadPoolExecutor(max_workers=3) as executor:
        pending = {}
        for yearMonthReferency in monthList:
            job = executor.submit(
                download_file,
                f'{URL_BOLSA_FAMI_PAGTOS}/{yearMonthReferency}'
            )
            pending[job] = yearMonthReferency

        for thread in as_completed(pending):
            try:
                thread.result()
            except Exception as e:
                # A failed month is reported and the remaining ones continue.
                print(e)
    exit(0)