fix scrapers

datadesk · Nov 23, 2024 · 42aa17a
1 parent 6212d39
commit 42aa17a
Show file tree

Hide file tree

Showing 6 changed files with 178,534 additions and 178,015 deletions.
diff --git a/hospital-patients/data/latest.csv b/hospital-patients/data/latest.csv
diff --git a/hospital-patients/scrape.py b/hospital-patients/scrape.py
@@ -5,6 +5,7 @@
 """
 import pathlib
 import pandas as pd
+import requests as re
 
 # Pathing
 THIS_DIR = pathlib.Path(__file__).parent.absolute()
@@ -13,11 +14,24 @@
 
 def main():
     """
-    Download the Tableau export as a CSV.
+    Download and export as CSV
     """
-    # Download the data
+
+    # Request data from URL
     url = "https://data.chhs.ca.gov/dataset/2df3e19e-9ee4-42a6-a087-9761f82033f6/resource/47af979d-8685-4981-bced-96a6b79d3ed5/download/covid19hospitalbycounty.csv"
-    df = pd.read_csv(url)
+    headers={'User-Agent': 'Mozilla/5.0'}
+    response = re.get(url, headers=headers)
+    response_str = response.text
+
+    # Parse the response string to dataframe
+    df = pd.DataFrame([row.split(',') for row in response_str.split('\r\n')])
+    # Dataframe will have column names as first row
+    # move them to column headers
+    df.columns = df.iloc[0]  # Set the first row as column names
+    df = df[1:] 
+    # lowercase headers
+    df.columns = map(str.lower, df.columns)
+
     # Save it to the raw data folder
     df.to_csv(DATA_DIR / "latest.csv", index=False)
 

diff --git a/respiratory-virus-deaths/data/latest.csv b/respiratory-virus-deaths/data/latest.csv
@@ -1767,3 +1767,4 @@ date,area,area_type,deaths_dc_dod_covid,deaths_dc_dod_influenza,deaths_dc_dod_al
 2020-01-03,California,State,0,11,853
 2020-01-02,California,State,0,8,847
 2020-01-01,California,State,0,4,845
+,,,,,
diff --git a/respiratory-virus-deaths/scrape.py b/respiratory-virus-deaths/scrape.py
@@ -3,9 +3,9 @@
 
 Source: https://data.ca.gov/dataset/respiratory-virus-dashboard-metrics
 """
-import json
 import pathlib
 import pandas as pd
+import requests as re
 
 # Pathing
 THIS_DIR = pathlib.Path(__file__).parent.absolute()
@@ -17,29 +17,21 @@ def main():
     Download and export as a CSV.
     """
 
-
-url = 'https://data.ca.gov/api/3/action/datastore_search?resource_id=403d181e-9bc9-4e48-8e3a-efc4bc7cbd9d&limit=5&q=title:jones'  
-fileobj = urllib.request.urlopen(url)
-response_dict = json.loads(fileobj.read())
-print(response_dict)
-
-
     # Request data from URL
     url = "https://data.chhs.ca.gov/dataset/fb0e792f-0165-414d-af91-130a4309505f/resource/858a3393-7c51-4377-9167-405eb1591d97/download/outputfile.csv"
-    response = re.get(url)
+    headers={'User-Agent': 'Mozilla/5.0'}
+    response = re.get(url, headers=headers)
     response_str = response.text
-    
+
     # Parse the response string to dataframe
     df = pd.DataFrame([row.split(',') for row in response_str.split('\r\n')])
     # Dataframe will have column names as first row
     # move them to column headers
     df.columns = df.iloc[0]  # Set the first row as column names
     df = df[1:] 
-
-    # Create dataframe and clean
-    df = pd.read_csv(url)
+    # lowercase headers
     df.columns = map(str.lower, df.columns)
-    
+
     # Save it to the data folder
     df.to_csv(DATA_DIR / "latest.csv", index=False)