Skip to content

Commit

Permalink
fix scrapers
Browse files (browse the repository at this point in the history)
  • Loading branch information
seangreene committed Nov 23, 2024
1 parent 6212d39 commit 42aa17a
Show file tree
Hide file tree
Showing 6 changed files with 178,534 additions and 178,015 deletions.
177,167 changes: 88,584 additions & 88,583 deletions hospital-patients/data/latest.csv

Large diffs are not rendered by default.

20 changes: 17 additions & 3 deletions hospital-patients/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
import pathlib
import pandas as pd
import requests as re

# Pathing
THIS_DIR = pathlib.Path(__file__).parent.absolute()
Expand All @@ -13,11 +14,24 @@

def main():
"""
Download the Tableau export as a CSV.
Download and export as CSV
"""
# Download the data

# Request data from URL
url = "https://data.chhs.ca.gov/dataset/2df3e19e-9ee4-42a6-a087-9761f82033f6/resource/47af979d-8685-4981-bced-96a6b79d3ed5/download/covid19hospitalbycounty.csv"
df = pd.read_csv(url)
headers={'User-Agent': 'Mozilla/5.0'}
response = re.get(url, headers=headers)
response_str = response.text

# Parse the response string to dataframe
df = pd.DataFrame([row.split(',') for row in response_str.split('\r\n')])
# Dataframe will have column names as first row
# move them to column headers
df.columns = df.iloc[0] # Set the first row as column names
df = df[1:]
# lowercase headers
df.columns = map(str.lower, df.columns)

# Save it to the raw data folder
df.to_csv(DATA_DIR / "latest.csv", index=False)

Expand Down
1 change: 1 addition & 0 deletions respiratory-virus-deaths/data/latest.csv
Original file line number Diff line number Diff line change
Expand Up @@ -1767,3 +1767,4 @@ date,area,area_type,deaths_dc_dod_covid,deaths_dc_dod_influenza,deaths_dc_dod_al
2020-01-03,California,State,0,11,853
2020-01-02,California,State,0,8,847
2020-01-01,California,State,0,4,845
,,,,,
20 changes: 6 additions & 14 deletions respiratory-virus-deaths/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
Source: https://data.ca.gov/dataset/respiratory-virus-dashboard-metrics
"""
import json
import pathlib
import pandas as pd
import requests as re

# Pathing
THIS_DIR = pathlib.Path(__file__).parent.absolute()
Expand All @@ -17,29 +17,21 @@ def main():
Download and export as a CSV.
"""


url = 'https://data.ca.gov/api/3/action/datastore_search?resource_id=403d181e-9bc9-4e48-8e3a-efc4bc7cbd9d&limit=5&q=title:jones'
fileobj = urllib.request.urlopen(url)
response_dict = json.loads(fileobj.read())
print(response_dict)


# Request data from URL
url = "https://data.chhs.ca.gov/dataset/fb0e792f-0165-414d-af91-130a4309505f/resource/858a3393-7c51-4377-9167-405eb1591d97/download/outputfile.csv"
response = re.get(url)
headers={'User-Agent': 'Mozilla/5.0'}
response = re.get(url, headers=headers)
response_str = response.text

# Parse the response string to dataframe
df = pd.DataFrame([row.split(',') for row in response_str.split('\r\n')])
# Dataframe will have column names as first row
# move them to column headers
df.columns = df.iloc[0] # Set the first row as column names
df = df[1:]

# Create dataframe and clean
df = pd.read_csv(url)
# lowercase headers
df.columns = map(str.lower, df.columns)

# Save it to the data folder
df.to_csv(DATA_DIR / "latest.csv", index=False)

Expand Down
Loading

0 comments on commit 42aa17a

Please sign in to comment.