
Commit

Move back to requests module and implement backoff retry mechanism to help solve 403 errors (#78)
jadchaar authored May 20, 2021
1 parent b0adb62 commit 30e6afb
Showing 5 changed files with 30 additions and 16 deletions.
requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -1,8 +1,8 @@
 bs4
 Faker
-httpx
 lxml
 pre-commit
 pytest
 pytest-cov
+requests
 sphinx-autodoc-typehints
sec_edgar_downloader/_constants.py (2 changes: 1 addition & 1 deletion)
@@ -11,7 +11,7 @@
 SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1
 
 # Number of times to retry a request to sec.gov
-MAX_RETRIES = 5
+MAX_RETRIES = 10
 
 DATE_FORMAT_TOKENS = "%Y-%m-%d"
 DEFAULT_BEFORE_DATE = date.today()
sec_edgar_downloader/_utils.py (38 changes: 26 additions & 12 deletions)
@@ -7,9 +7,11 @@
 from typing import List
 from urllib.parse import urljoin
 
-import httpx
+import requests
 from bs4 import BeautifulSoup
 from faker import Faker
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
 from ._constants import (
     DATE_FORMAT_TOKENS,
@@ -42,7 +44,12 @@ class EdgarSearchApiError(Exception):
 fake = Faker()
 
 # Specify max number of request retries
-transport = httpx.HTTPTransport(retries=MAX_RETRIES)
+# https://stackoverflow.com/a/35504626/3820660
+retries = Retry(
+    total=MAX_RETRIES,
+    backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL,
+    status_forcelist=[403, 500, 502, 503, 504],
+)
 
 
 def validate_date_format(date_format: str) -> None:
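
With these settings, urllib3 retries a request up to MAX_RETRIES times whenever the response status is in status_forcelist (including the 403s from #78), sleeping longer before each successive attempt. A rough sketch of the schedule these values imply, using urllib3's documented formula of backoff_factor * 2 ** (retry number - 1); the exact timing, and whether the very first retry sleeps at all, depends on the urllib3 version:

```python
# Illustration only: the backoff schedule implied by MAX_RETRIES = 10 and a
# backoff factor of 0.1 (SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL). The real sleeps
# are computed inside urllib3's Retry and capped at Retry.BACKOFF_MAX.
MAX_RETRIES = 10
BACKOFF_FACTOR = 0.1

total_wait = 0.0
for retry_number in range(1, MAX_RETRIES + 1):
    sleep_s = BACKOFF_FACTOR * 2 ** (retry_number - 1)
    total_wait += sleep_s
    print(f"retry {retry_number}: sleep ~{sleep_s:.1f}s")
print(f"worst-case total backoff: ~{total_wait:.1f}s")  # roughly 102s
```

In other words, a request that keeps being rejected backs off from about 0.1 s up to about 51 s before the final attempt instead of hammering sec.gov and failing immediately.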
@@ -133,9 +140,10 @@ def get_filing_urls_to_download(
     filings_to_fetch: List[FilingMetadata] = []
     start_index = 0
 
-    with httpx.Client(
-        headers={"User-Agent": fake.chrome()}, transport=transport
-    ) as client:
+    client = requests.Session()
+    client.mount("http://", HTTPAdapter(max_retries=retries))
+    client.mount("https://", HTTPAdapter(max_retries=retries))
+    try:
         while len(filings_to_fetch) < num_filings_to_download:
             payload = form_request_payload(
                 ticker_or_cik,
@@ -148,6 +156,7 @@ def get_filing_urls_to_download(
             resp = client.post(
                 SEC_EDGAR_SEARCH_API_ENDPOINT,
                 json=payload,
+                headers={"User-Agent": fake.chrome()},
             )
             resp.raise_for_status()
             search_query_results = resp.json()
@@ -200,6 +209,8 @@ def get_filing_urls_to_download(
 
             # Prevent rate limiting
             time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL)
+    finally:
+        client.close()
 
     return filings_to_fetch
 
@@ -220,7 +231,7 @@ def resolve_relative_urls_in_filing(filing_text: str, base_url: str) -> str:
 
 
 def download_and_save_filing(
-    client: httpx.Client,
+    client: requests.Session,
     download_folder: Path,
     ticker_or_cik: str,
     accession_number: str,
@@ -230,7 +241,7 @@ def download_and_save_filing(
     *,
     resolve_urls: bool = False,
 ) -> None:
-    resp = client.get(download_url)
+    resp = client.get(download_url, headers={"User-Agent": fake.chrome()})
     resp.raise_for_status()
     filing_text = resp.content
 
@@ -262,9 +273,10 @@ def download_filings(
     filings_to_fetch: List[FilingMetadata],
     include_filing_details: bool,
 ) -> None:
-    with httpx.Client(
-        headers={"User-Agent": fake.chrome()}, transport=transport
-    ) as client:
+    client = requests.Session()
+    client.mount("http://", HTTPAdapter(max_retries=retries))
+    client.mount("https://", HTTPAdapter(max_retries=retries))
+    try:
         for filing in filings_to_fetch:
             try:
                 download_and_save_filing(
@@ -276,7 +288,7 @@ def download_filings(
                     filing.full_submission_url,
                     FILING_FULL_SUBMISSION_FILENAME,
                 )
-            except httpx.HTTPError as e:  # pragma: no cover
+            except requests.exceptions.HTTPError as e:  # pragma: no cover
                 print(
                     "Skipping full submission download for "
                     f"'{filing.accession_number}' due to network error: {e}."
@@ -294,11 +306,13 @@ def download_filings(
                     filing.filing_details_filename,
                     resolve_urls=True,
                 )
-            except httpx.HTTPError as e:  # pragma: no cover
+            except requests.exceptions.HTTPError as e:  # pragma: no cover
                 print(
                     f"Skipping filing detail download for "
                     f"'{filing.accession_number}' due to network error: {e}."
                 )
+    finally:
+        client.close()
 
 
 def get_number_of_unique_filings(filings: List[FilingMetadata]) -> int:
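
Taken together, the _utils.py changes swap httpx's transport-level retries for a requests.Session whose mounted HTTPAdapters perform the backoff retries, and they move the randomized User-Agent header from the client constructor onto every individual request. A condensed, self-contained sketch of the resulting pattern (the target URL below is a placeholder, not one of the SEC endpoints used in the module):

```python
import requests
from faker import Faker
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

fake = Faker()
retries = Retry(
    total=10,  # MAX_RETRIES in _constants.py
    backoff_factor=0.1,  # SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL
    status_forcelist=[403, 500, 502, 503, 504],
)

client = requests.Session()
# Mount the retrying adapter for both schemes so every request made through
# this session gets the same backoff behavior.
client.mount("http://", HTTPAdapter(max_retries=retries))
client.mount("https://", HTTPAdapter(max_retries=retries))
try:
    url = "https://example.com/"  # placeholder URL for illustration only
    # A fresh randomized User-Agent per request, as in the diff above.
    resp = client.get(url, headers={"User-Agent": fake.chrome()})
    resp.raise_for_status()
finally:
    client.close()
```

Closing the session in a finally block mirrors what httpx's context manager previously did, so pooled connections are released even if a download raises.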
setup.py (2 changes: 1 addition & 1 deletion)
@@ -19,7 +19,7 @@
     url="https://github.com/jadchaar/sec-edgar-downloader",
     packages=["sec_edgar_downloader"],
     zip_safe=False,
-    install_requires=["httpx", "bs4", "lxml", "Faker"],
+    install_requires=["requests", "bs4", "lxml", "Faker"],
     python_requires=">=3.6",
     classifiers=[
         "Development Status :: 5 - Production/Stable",
tox.ini (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@ deps =
     doc8
     sphinx
     sphinx_autodoc_typehints
-    httpx
+    requests
     bs4
     Faker
 allowlist_externals = make
