From 380464e6415e66f8925f55e2831472e8789cee0f Mon Sep 17 00:00:00 2001 From: Jaroslaw Michalski Date: Tue, 28 Feb 2023 22:10:18 +0000 Subject: [PATCH 1/3] initial changes (WIP) --- app/engine/from_db/nested.py | 23 +++++++++++++++-------- app/utils/assets.py | 12 ++++-------- app/utils/operations/response.py | 5 +++-- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/app/engine/from_db/nested.py b/app/engine/from_db/nested.py index 44c7b6d..52d3f4e 100644 --- a/app/engine/from_db/nested.py +++ b/app/engine/from_db/nested.py @@ -23,19 +23,26 @@ def process_nested_data(results: Iterable[Record], request: Request) -> DataFram ] if request.format == "csv": + columns = [*base_columns, *MetricData.nested_struct[nested_metric_name]] + df = json_normalize( map(dict, results), nested_metric_name, - [*base_columns, MetricData.nested_struct[nested_metric_name]], - errors='ignore' + base_columns, + # meta_prefix=f"{nested_metric_name}.", + # errors='ignore' ) + # df.drop( + # axis=1, + # columns=[f"{nested_metric_name}.{column}" for column in MetricData.nested_struct[nested_metric_name]], + # inplace=True + # ) - df.rename( - columns={col: col.removeprefix(f"{nested_metric_name}.") for col in df.columns}, - inplace=True - ) + # df.rename( + # columns={col: col.removeprefix(f"{nested_metric_name}.") for col in df.columns}, + # inplace=True + # ) - columns = [*base_columns, *MetricData.nested_struct[nested_metric_name]] else: df = DataFrame( results, @@ -46,7 +53,7 @@ def process_nested_data(results: Iterable[Record], request: Request) -> DataFram payload = ( df .sort_values(["date", "areaCode"], ascending=[False, True]) - .loc[:, columns] + # .loc[:, columns] ) payload = payload.where(payload.notnull(), None) diff --git a/app/utils/assets.py b/app/utils/assets.py index cbcceed..597f3c8 100644 --- a/app/utils/assets.py +++ b/app/utils/assets.py @@ -38,7 +38,7 @@ query = """\ -SELECT rr.timestamp +SELECT rr.timestamp FROM covid19.release_reference AS rr JOIN 
covid19.release_category AS rc ON rc.release_id = rr.id WHERE DATE(rr.timestamp) = $1 @@ -146,16 +146,12 @@ class MetricData: "cumVaccinationThirdInjectionUptakeByVaccinationDatePercentage", "cumVaccinationCompleteCoverageByVaccinationDatePercentage", + "newPeopleVaccinatedAutumn22ByVaccinationDate", + "cumPeopleVaccinatedAutumn22ByVaccinationDate", + "cumVaccinationAutumn22UptakeByVaccinationDatePercentage", "newPeopleVaccinatedSpring22ByVaccinationDate", "cumPeopleVaccinatedSpring22ByVaccinationDate", "cumVaccinationSpring22UptakeByVaccinationDatePercentage", - - 'newPeopleVaccinatedSpring22ByVaccinationDate', - 'newPeopleVaccinatedAutumn22ByVaccinationDate', - 'cumPeopleVaccinatedSpring22ByVaccinationDate', - 'cumPeopleVaccinatedAutumn22ByVaccinationDate', - 'cumVaccinationAutumn22UptakeByVaccinationDatePercentage', - 'cumVaccinationSpring22UptakeByVaccinationDatePercentage', ] } diff --git a/app/utils/operations/response.py b/app/utils/operations/response.py index c24f006..735ff84 100644 --- a/app/utils/operations/response.py +++ b/app/utils/operations/response.py @@ -8,7 +8,7 @@ # 3rd party: -# Internal: +# Internal: from ..assets import get_latest_timestamp from .request import Request @@ -38,7 +38,8 @@ def __init__(self, request, container, path): host: str = request.base_request.headers.get("X-Forwarded-Host", API_URL) host = host.removeprefix("https://").removeprefix("api.") - self.location = f"https://api.{host}/downloads/{container}/{path}" + # self.location = f"https://api.{host}/downloads/{container}/{path}" + self.location = f"https://{API_URL}/{container}/{path}" permalink = f"https://{API_URL}/apiv2cache/{request.path}" From 3245b75a97500daa794d795c2a42e0e0784030b9 Mon Sep 17 00:00:00 2001 From: Jaroslaw Michalski Date: Thu, 2 Mar 2023 18:03:36 +0000 Subject: [PATCH 2/3] remove comments, adjust response url for sandbox --- app/engine/from_db/nested.py | 17 ----------------- app/utils/operations/response.py | 12 +++++++++--- 2 files changed, 9 
insertions(+), 20 deletions(-) diff --git a/app/engine/from_db/nested.py b/app/engine/from_db/nested.py index 52d3f4e..637b210 100644 --- a/app/engine/from_db/nested.py +++ b/app/engine/from_db/nested.py @@ -23,37 +23,20 @@ def process_nested_data(results: Iterable[Record], request: Request) -> DataFram ] if request.format == "csv": - columns = [*base_columns, *MetricData.nested_struct[nested_metric_name]] - df = json_normalize( map(dict, results), nested_metric_name, base_columns, - # meta_prefix=f"{nested_metric_name}.", - # errors='ignore' ) - # df.drop( - # axis=1, - # columns=[f"{nested_metric_name}.{column}" for column in MetricData.nested_struct[nested_metric_name]], - # inplace=True - # ) - - # df.rename( - # columns={col: col.removeprefix(f"{nested_metric_name}.") for col in df.columns}, - # inplace=True - # ) - else: df = DataFrame( results, columns=["areaCode", "areaType", "areaName", "date", "metric", nested_metric_name] ) - columns = df.columns payload = ( df .sort_values(["date", "areaCode"], ascending=[False, True]) - # .loc[:, columns] ) payload = payload.where(payload.notnull(), None) diff --git a/app/utils/operations/response.py b/app/utils/operations/response.py index 735ff84..6222776 100644 --- a/app/utils/operations/response.py +++ b/app/utils/operations/response.py @@ -9,6 +9,7 @@ # 3rd party: # Internal: +from app.config import Settings from ..assets import get_latest_timestamp from .request import Request @@ -21,7 +22,7 @@ API_PREFIX = "/api/" -API_URL = "coronavirus.data.gov.uk" +API_URL = Settings.service_domain ResponseContentType = Union[None, bytes, AsyncGenerator[bytes, None]] @@ -38,8 +39,13 @@ def __init__(self, request, container, path): host: str = request.base_request.headers.get("X-Forwarded-Host", API_URL) host = host.removeprefix("https://").removeprefix("api.") - # self.location = f"https://api.{host}/downloads/{container}/{path}" - self.location = f"https://{API_URL}/{container}/{path}" + # TODO: remove this IF statement when 
routing rule for rr-apiv2cache frontend + # domain name has been changed + self.location = ( + f"https://api.{host}/downloads/{container}/{path}" + if not host.startswith("sandbox") + else f"https://api-{host}/downloads/{container}/{path}" + ) permalink = f"https://{API_URL}/apiv2cache/{request.path}" From b92eb4b0c2bc069675515ad4c7f4ccac5e7872f7 Mon Sep 17 00:00:00 2001 From: Jaroslaw Michalski Date: Wed, 15 Mar 2023 10:37:05 +0000 Subject: [PATCH 3/3] handle KeyError for all null values in dataframe Transformed a set to a list in format_dtypes (utils.py) to avoid compatibility issues in newer pandas version. --- app/engine/from_db/generic.py | 34 ++++++++++++++++++++-------------- app/engine/from_db/utils.py | 3 ++- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/app/engine/from_db/generic.py b/app/engine/from_db/generic.py index 8f5d878..e2d6a97 100644 --- a/app/engine/from_db/generic.py +++ b/app/engine/from_db/generic.py @@ -31,20 +31,26 @@ def process_generic_data(results: Iterable[Record], request: Request) -> DataFra for metric in filter(response_metrics.__contains__, MetricData.generic_dtypes) } - payload = ( - df - .pivot_table( - values="value", - index=MetricData.base_metrics, - columns="metric", - aggfunc='first' + try: + payload = ( + df + .pivot_table( + values="value", + index=MetricData.base_metrics, + columns="metric", + aggfunc='first' + ) + .reset_index() + .sort_values(["date", "areaCode"], ascending=[False, True]) + .pipe(format_dtypes, column_types=column_types) + .loc[:, [*MetricData.base_metrics, *response_metrics]] + .pipe(format_msoas, request=request) + .pipe(format_data, response_metrics=response_metrics) ) - .reset_index() - .sort_values(["date", "areaCode"], ascending=[False, True]) - .pipe(format_dtypes, column_types=column_types) - .loc[:, [*MetricData.base_metrics, *response_metrics]] - .pipe(format_msoas, request=request) - .pipe(format_data, response_metrics=response_metrics) - ) + except KeyError as err: + # This 
can happen if there are only null values in the df + # then some operations on the dataframe can't be performed + # Return the expected Dataframe object + payload = DataFrame() return payload diff --git a/app/engine/from_db/utils.py b/app/engine/from_db/utils.py index 2306135..9382c69 100644 --- a/app/engine/from_db/utils.py +++ b/app/engine/from_db/utils.py @@ -106,7 +106,8 @@ async def cache_response(func, *, request: Request, **kwargs) -> bool: def format_dtypes(df: DataFrame, column_types: Dict[str, object]) -> DataFrame: - json_columns = MetricData.json_dtypes.intersection(column_types) + # passing a list instead of a set to avoid compatibility issues with newer pandas versions + json_columns = list(MetricData.json_dtypes.intersection(column_types)) # Replace `null` string with None. This happens because # some DB queries convert `null` to `"null"` for type