Skip to content

Commit

Permalink
Exception in HTTP response + "None" test
Browse files Browse the repository at this point in the history
- Handle a rare exception that might occur while reading an HTTP response (when the response is empty)
- Multiple "if (var == None):" tests are replaced by "if (var is None):", the idiomatic identity comparison with None
  • Loading branch information
metalbobinou authored Aug 8, 2023
1 parent 72fa21f commit 457501e
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 43 deletions.
30 changes: 19 additions & 11 deletions src/2-Get_Ark_ID.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,14 +209,22 @@ def get_ressource_url(url):
else:
print("OK")

#data = response.read()
#content = str(data.decode('utf-8'))
#text = data.decode(info.get_param('charset', 'utf-8'))
html = response.read().decode(response.info().get_param('charset') or 'utf-8')
info = response.info()
url_new = response.url
headers = response.headers
status = response.status
# Read the HTTP response
try:
#data = response.read()
#content = str(data.decode('utf-8'))
#text = data.decode(info.get_param('charset', 'utf-8'))
html = response.read().decode(response.info().get_param('charset') or 'utf-8')
info = response.info()
url_new = response.url
headers = response.headers
status = response.status
except Exception as e:
print("--- UNKNOWN ERROR WHILE READING HTTP RESPONSE: ---")
print(str(e))
print("#############")
logging.error(traceback.format_exc())
return (None, None)

print("## url_new : " + str(url_new))
#print("## headers : " + str(headers))
Expand Down Expand Up @@ -476,7 +484,7 @@ def main():
print("File list_of_URLs format: [one URL per line]")
print("[date] [URL]")
print("date : YYYY-MM-DD URL : https://gallica.bnf.fr/ark:/...")
exit(-1)
sys.exit(-1)
else:
url_filename_input = sys.argv[1]
# Check if file is readable
Expand All @@ -498,13 +506,13 @@ def main():
print("File list_of_URLs format: [one URL per line]")
print("[date] [URL]")
print("date : YYYY-MM-DD URL : https://gallica.bnf.fr/ark:/...")
exit(-2)
sys.exit(-2)

# Otherwise, when everything is fine, let's process the lines
MyCommonTools.print_time("%%%% BEGIN PROCESSING")
ret = process_lines(lines)
MyCommonTools.print_time("%%%% END PROCESSING")

exit(ret)
sys.exit(ret)

main()
12 changes: 6 additions & 6 deletions src/2bis-Get_Ark_ID_NO_BS4.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def get_ark_id_from_date_URL(url_date):
external_gallica = False

# if resolved URL is empty, let's write it
if (url_resolved == None):
if (url_resolved is None):
print("## Error :")
print(" no ressource found")
print("url_date : --" + url_date + "--")
Expand All @@ -241,7 +241,7 @@ def get_ark_id_from_date_URL(url_date):
url_no_suffix = remove_suffix_item_from_url(url_resolved)
## next, let's remove the prefix : the http[s]://...
ark_id = remove_prefix_https_from_url(url_no_suffix)
if (ark_id == None):
if (ark_id is None):
print("## Error :")
print(" can't find the Ark ID in the URL")
print("url_date : --" + url_date + "--")
Expand Down Expand Up @@ -298,7 +298,7 @@ def process_lines(lines):
external_gallica = answers[1]

# if ark_id was not found, write down the number where it failed and stop
if (ark_id == None):
if (ark_id is None):
#print("ERROR: Failed at line " + str(cur_line))
#print("DATE : " + date)
#print("URL : " + url)
Expand Down Expand Up @@ -400,7 +400,7 @@ def main():
print("File list_of_URLs format: [one URL per line]")
print("[date] [URL]")
print("date : YYYY-MM-DD URL : https://gallica.bnf.fr/ark:/...")
exit(-1)
sys.exit(-1)
else:
url_filename_input = sys.argv[1]
# Check if file is readable
Expand All @@ -422,13 +422,13 @@ def main():
print("File list_of_URLs format: [one URL per line]")
print("[date] [URL]")
print("date : YYYY-MM-DD URL : https://gallica.bnf.fr/ark:/...")
exit(-2)
sys.exit(-2)

# Otherwise, when everything is fine, let's process the lines
MyCommonTools.print_time("%%%% BEGIN PROCESSING")
ret = process_lines(lines)
MyCommonTools.print_time("%%%% END PROCESSING")

exit(ret)
sys.exit(ret)

main()
6 changes: 3 additions & 3 deletions src/3-Download_Multiple_Docs_Same_Date.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def main():
print("File list_of_URLs format: [one URL per line]")
print("[date] [URL]")
print("date : YYYY-MM-DD URL : https://gallica.bnf.fr/ark:/...")
exit(-1)
sys.exit(-1)
else:
url_filename_input = sys.argv[1]
# Check if file is readable
Expand All @@ -451,13 +451,13 @@ def main():
print("File list_of_URLs format: [one URL per line]")
print("[date] [URL]")
print("date : YYYY-MM-DD URL : https://gallica.bnf.fr/ark:/...")
exit(-2)
sys.exit(-2)

# Otherwise, when everything is fine, let's process the lines
MyCommonTools.print_time("%%%% BEGIN PROCESSING")
ret = process_lines(lines)
MyCommonTools.print_time("%%%% END PROCESSING")

exit(ret)
sys.exit(ret)

main()
29 changes: 18 additions & 11 deletions src/4-Download_Date_to_JPEG.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,17 +221,24 @@ def get_document_debat_parlementaire(ark_id, directory_output, filename_prefix):
logging.error(traceback.format_exc())
return (None, error_503)

# Everything is fine
# Everything is fine during the connection
else:
print("OK - p." + str(page))

# Get HTTP response
# Read the HTTP response
try:
data = response.read()
info = response.info()
url_new = response.url
headers = response.headers
status = response.status
except Exception as e:
print("--- UNKNOWN ERROR WHILE READING HTTP RESPONSE: ---")
print(str(e))
print("#############")
logging.error(traceback.format_exc())
return (None, error_503)

data = response.read()
info = response.info()
url_new = response.url
headers = response.headers
status = response.status
#text = data.decode(info.get_param('charset', 'utf-8'))
#text = data.decode('utf-8')
print("## url_new : " + str(url_new))
Expand Down Expand Up @@ -311,7 +318,7 @@ def process_lines(lines):

## If an error occurred, let's save where we were
## Error => when no pages were downloaded + no error 503 happened
if ((pages_written == None) and (error_503 == False)):
if ((pages_written is None) and (error_503 == False)):
print("=> No document to download found")
try:
line_undownloaded = date + " " + ark_id
Expand Down Expand Up @@ -360,7 +367,7 @@ def main():
print("File list_of_Ark_IDs format: [one Ark ID per line]")
print("[date] [Ark ID]")
print("date : YYYY-MM-DD Ark ID : /12148/bpt6k64490143")
exit(-1)
sys.exit(-1)
else:
ark_id_filename_input = sys.argv[2]
# Check if file is readable
Expand All @@ -382,14 +389,14 @@ def main():
print("File list_of_Ark_IDs format: [one Ark ID per line]")
print("[date] [Ark ID]")
print("date : YYYY-MM-DD Ark ID : /12148/bpt6k64490143")
exit(-2)
sys.exit(-2)


# Otherwise, when everything is fine, let's process the lines
MyCommonTools.print_time("%%%% BEGIN PROCESSING")
ret = process_lines(lines)
MyCommonTools.print_time("%%%% END PROCESSING")

exit(ret)
sys.exit(ret)

main()
37 changes: 25 additions & 12 deletions src/4-Download_Date_to_PDF.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,12 @@ def get_document_PDF_debat_parlementaire(ark_id, directory_output, filename_pref

# Exception HTTP Error
except urllib.error.HTTPError as e:
print("### HTTP ERROR:")
## Error 503 : we reached the end of the document
if (page == 1):
print("### HTTP ERROR ON PAGE 1:")
else:
print("### HTTP ERROR:")

if hasattr(e, 'reason'):
print('Failed to reach a server.')
print('Reason: ', e.reason)
Expand Down Expand Up @@ -197,12 +202,20 @@ def get_document_PDF_debat_parlementaire(ark_id, directory_output, filename_pref
else:
print("OK")

# Get HTTP response
data = response.read()
info = response.info()
url_new = response.url
headers = response.headers
status = response.status
# Read HTTP response
try:
data = response.read()
info = response.info()
url_new = response.url
headers = response.headers
status = response.status
except Exception as e:
print("--- UNKNOWN ERROR WHILE READING HTTP RESPONSE: ---")
print(str(e))
print("#############")
logging.error(traceback.format_exc())
return (None)

#text = data.decode(info.get_param('charset', 'utf-8'))
#text = data.decode('utf-8')
print("## url_new : " + str(url_new))
Expand All @@ -218,7 +231,7 @@ def get_document_PDF_debat_parlementaire(ark_id, directory_output, filename_pref
except IOError:
print("++++ IOError : Couldn't write the PDF output file ++++")
print(" output filename : " + str(directory_output + "/" + pdffile))
return (None, error_503)
return (None)

print("###################################")

Expand Down Expand Up @@ -277,7 +290,7 @@ def process_lines(lines):
filename_prefix)

## If an error occurred, let's save where we were
if (pages_written == None):
if (pages_written is None):
print("=> No document to download found")
try:
line_undownloaded = date + " " + ark_id
Expand Down Expand Up @@ -326,7 +339,7 @@ def main():
print("File list_of_Ark_IDs format: [one Ark ID per line]")
print("[date] [Ark ID]")
print("date : YYYY-MM-DD Ark ID : /12148/bpt6k64490143")
exit(-1)
sys.exit(-1)
else:
ark_id_filename_input = sys.argv[2]
# Check if file is readable
Expand All @@ -348,14 +361,14 @@ def main():
print("File list_of_Ark_IDs format: [one Ark ID per line]")
print("[date] [Ark ID]")
print("date : YYYY-MM-DD Ark ID : /12148/bpt6k64490143")
exit(-2)
sys.exit(-2)


# Otherwise, when everything is fine, let's process the lines
MyCommonTools.print_time("%%%% BEGIN PROCESSING")
ret = process_lines(lines)
MyCommonTools.print_time("%%%% END PROCESSING")

exit(ret)
sys.exit(ret)

main()

0 comments on commit 457501e

Please sign in to comment.