
Commit

Move back to requests module and implement backoff retry mechanism to help solve 403 errors (#78)
jadchaar authored May 20, 2021
1 parent b0adb62 commit 30e6afb
Showing 5 changed files with 30 additions and 16 deletions.
requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -1,8 +1,8 @@
 bs4
 Faker
-httpx
 lxml
 pre-commit
 pytest
 pytest-cov
+requests
 sphinx-autodoc-typehints
sec_edgar_downloader/_constants.py (2 changes: 1 addition & 1 deletion)
@@ -11,7 +11,7 @@
 SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1
 
 # Number of times to retry a request to sec.gov
-MAX_RETRIES = 5
+MAX_RETRIES = 10
 
 DATE_FORMAT_TOKENS = "%Y-%m-%d"
 DEFAULT_BEFORE_DATE = date.today()
sec_edgar_downloader/_utils.py (38 changes: 26 additions & 12 deletions)
@@ -7,9 +7,11 @@
 from typing import List
 from urllib.parse import urljoin
 
-import httpx
+import requests
 from bs4 import BeautifulSoup
 from faker import Faker
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
 from ._constants import (
     DATE_FORMAT_TOKENS,
@@ -42,7 +44,12 @@ class EdgarSearchApiError(Exception):
 fake = Faker()
 
 # Specify max number of request retries
-transport = httpx.HTTPTransport(retries=MAX_RETRIES)
+# https://stackoverflow.com/a/35504626/3820660
+retries = Retry(
+    total=MAX_RETRIES,
+    backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL,
+    status_forcelist=[403, 500, 502, 503, 504],
+)
 
 
 def validate_date_format(date_format: str) -> None:
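
With these settings, urllib3 retries a request up to MAX_RETRIES times whenever the response status is in status_forcelist (including the 403s from #78), sleeping longer before each successive attempt. A rough sketch of the schedule these values imply, using urllib3's documented formula of backoff_factor * 2 ** (retry number - 1); the exact timing, and whether the very first retry sleeps at all, depends on the urllib3 version:

```python
# Illustration only: the backoff schedule implied by MAX_RETRIES = 10 and a
# backoff factor of 0.1 (SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL). The real sleeps
# are computed inside urllib3's Retry and capped at Retry.BACKOFF_MAX.
MAX_RETRIES = 10
BACKOFF_FACTOR = 0.1

total_wait = 0.0
for retry_number in range(1, MAX_RETRIES + 1):
    sleep_s = BACKOFF_FACTOR * 2 ** (retry_number - 1)
    total_wait += sleep_s
    print(f"retry {retry_number}: sleep ~{sleep_s:.1f}s")
print(f"worst-case total backoff: ~{total_wait:.1f}s")  # roughly 102s
```

In other words, a request that keeps being rejected backs off from about 0.1 s up to about 51 s before the final attempt instead of hammering sec.gov and failing immediately.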
@@ -133,9 +140,10 @@ def get_filing_urls_to_download(
     filings_to_fetch: List[FilingMetadata] = []
     start_index = 0
 
-    with httpx.Client(
-        headers={"User-Agent": fake.chrome()}, transport=transport
-    ) as client:
+    client = requests.Session()
+    client.mount("http://", HTTPAdapter(max_retries=retries))
+    client.mount("https://", HTTPAdapter(max_retries=retries))
+    try:
         while len(filings_to_fetch) < num_filings_to_download:
             payload = form_request_payload(
                 ticker_or_cik,
@@ -148,6 +156,7 @@ def get_filing_urls_to_download(
             resp = client.post(
                 SEC_EDGAR_SEARCH_API_ENDPOINT,
                 json=payload,
+                headers={"User-Agent": fake.chrome()},
             )
             resp.raise_for_status()
             search_query_results = resp.json()
@@ -200,6 +209,8 @@ def get_filing_urls_to_download(
 
             # Prevent rate limiting
             time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL)
+    finally:
+        client.close()
 
     return filings_to_fetch
 
@@ -220,7 +231,7 @@ def resolve_relative_urls_in_filing(filing_text: str, base_url: str) -> str:
 
 
 def download_and_save_filing(
-    client: httpx.Client,
+    client: requests.Session,
     download_folder: Path,
     ticker_or_cik: str,
     accession_number: str,
@@ -230,7 +241,7 @@ def download_and_save_filing(
     *,
     resolve_urls: bool = False,
 ) -> None:
-    resp = client.get(download_url)
+    resp = client.get(download_url, headers={"User-Agent": fake.chrome()})
     resp.raise_for_status()
     filing_text = resp.content
 
@@ -262,9 +273,10 @@ def download_filings(
     filings_to_fetch: List[FilingMetadata],
     include_filing_details: bool,
 ) -> None:
-    with httpx.Client(
-        headers={"User-Agent": fake.chrome()}, transport=transport
-    ) as client:
+    client = requests.Session()
+    client.mount("http://", HTTPAdapter(max_retries=retries))
+    client.mount("https://", HTTPAdapter(max_retries=retries))
+    try:
         for filing in filings_to_fetch:
             try:
                 download_and_save_filing(
@@ -276,7 +288,7 @@ def download_filings(
                     filing.full_submission_url,
                     FILING_FULL_SUBMISSION_FILENAME,
                 )
-            except httpx.HTTPError as e:  # pragma: no cover
+            except requests.exceptions.HTTPError as e:  # pragma: no cover
                 print(
                     "Skipping full submission download for "
                     f"'{filing.accession_number}' due to network error: {e}."
@@ -294,11 +306,13 @@ def download_filings(
                     filing.filing_details_filename,
                     resolve_urls=True,
                 )
-            except httpx.HTTPError as e:  # pragma: no cover
+            except requests.exceptions.HTTPError as e:  # pragma: no cover
                 print(
                     f"Skipping filing detail download for "
                     f"'{filing.accession_number}' due to network error: {e}."
                 )
+    finally:
+        client.close()
 
 
 def get_number_of_unique_filings(filings: List[FilingMetadata]) -> int:
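
Taken together, the _utils.py changes swap httpx's transport-level retries for a requests.Session whose mounted HTTPAdapters perform the backoff retries, and they move the randomized User-Agent header from the client constructor onto every individual request. A condensed, self-contained sketch of the resulting pattern (the target URL below is a placeholder, not one of the SEC endpoints used in the module):

```python
import requests
from faker import Faker
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

fake = Faker()
retries = Retry(
    total=10,  # MAX_RETRIES in _constants.py
    backoff_factor=0.1,  # SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL
    status_forcelist=[403, 500, 502, 503, 504],
)

client = requests.Session()
# Mount the retrying adapter for both schemes so every request made through
# this session gets the same backoff behavior.
client.mount("http://", HTTPAdapter(max_retries=retries))
client.mount("https://", HTTPAdapter(max_retries=retries))
try:
    url = "https://example.com/"  # placeholder URL for illustration only
    # A fresh randomized User-Agent per request, as in the diff above.
    resp = client.get(url, headers={"User-Agent": fake.chrome()})
    resp.raise_for_status()
finally:
    client.close()
```

Closing the session in a finally block mirrors what httpx's context manager previously did, so pooled connections are released even if a download raises.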
setup.py (2 changes: 1 addition & 1 deletion)
@@ -19,7 +19,7 @@
     url="https://github.com/jadchaar/sec-edgar-downloader",
     packages=["sec_edgar_downloader"],
     zip_safe=False,
-    install_requires=["httpx", "bs4", "lxml", "Faker"],
+    install_requires=["requests", "bs4", "lxml", "Faker"],
     python_requires=">=3.6",
     classifiers=[
         "Development Status :: 5 - Production/Stable",
tox.ini (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@ deps =
     doc8
     sphinx
     sphinx_autodoc_typehints
-    httpx
+    requests
     bs4
     Faker
 allowlist_externals = make
