From ec498a53856b22f8a47c67a623e42646edefaba0 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Wed, 17 Jul 2024 20:17:39 -0700 Subject: [PATCH] Fix: Coercion of string to integers when converting csv to agate tables (#2918) --- sqlmesh/dbt/seed.py | 25 +++++++++++++------------ tests/dbt/test_transformation.py | 22 +++++++++++++++++++++- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/sqlmesh/dbt/seed.py b/sqlmesh/dbt/seed.py index 50a1ec9d7..b611f82b1 100644 --- a/sqlmesh/dbt/seed.py +++ b/sqlmesh/dbt/seed.py @@ -55,21 +55,22 @@ def to_sqlmesh(self, context: DbtContext) -> Model: ) -class Integer(agate.data_types.DataType): - def cast(self, d: str) -> t.Optional[int]: - if d is None: - return d - try: - return int(d) - except ValueError: - raise agate.exceptions.CastError('Can not parse value "%s" as Integer.' % d) - - def jsonify(self, d: str) -> str: +class Integer(agate_helper.Integer): + def cast(self, d: t.Any) -> t.Optional[int]: + if isinstance(d, str): + # The dbt's implementation doesn't support coercion of strings to integers. + if d.strip().lower() in self.null_values: + return None + try: + return int(d) + except ValueError: + raise agate.exceptions.CastError('Can not parse value "%s" as Integer.' % d) + return super().cast(d) + + def jsonify(self, d: t.Any) -> str: return d -# The dbt version has a bug in which they check whether the type of the input value -# is int, while the input value is actually always a string. agate_helper.Integer = Integer # type: ignore diff --git a/tests/dbt/test_transformation.py b/tests/dbt/test_transformation.py index 1c9e947c2..9356e14f6 100644 --- a/tests/dbt/test_transformation.py +++ b/tests/dbt/test_transformation.py @@ -1,3 +1,5 @@ +import agate +from datetime import datetime import json import logging import typing as t @@ -34,7 +36,7 @@ from sqlmesh.dbt.model import Materialization, ModelConfig from sqlmesh.dbt.project import Project from sqlmesh.dbt.relation import Policy -from sqlmesh.dbt.seed import SeedConfig +from sqlmesh.dbt.seed import SeedConfig, Integer from sqlmesh.dbt.target import BigQueryConfig, DuckDbConfig, SnowflakeConfig from sqlmesh.dbt.test import TestConfig from sqlmesh.utils.errors import ConfigError, MacroEvalError, SQLMeshError @@ -402,6 +404,7 @@ def test_seed_column_inference(tmp_path): fd.write("int_col,double_col,datetime_col,date_col,boolean_col,text_col\n") fd.write("1,1.2,2021-01-01 00:00:00,2021-01-01,true,foo\n") fd.write("2,2.3,2021-01-02 00:00:00,2021-01-02,false,bar\n") + fd.write("null,,null,,,null\n") seed = SeedConfig( name="test_model", @@ -423,6 +426,23 @@ def test_seed_column_inference(tmp_path): } +def test_agate_integer_cast(): + agate_integer = Integer(null_values=("null", "")) + assert agate_integer.cast("1") == 1 + assert agate_integer.cast(1) == 1 + assert agate_integer.cast("null") is None + assert agate_integer.cast("") is None + + with pytest.raises(agate.exceptions.CastError): + agate_integer.cast("1.2") + + with pytest.raises(agate.exceptions.CastError): + agate_integer.cast(1.2) + + with pytest.raises(agate.exceptions.CastError): + agate_integer.cast(datetime.now()) + + @pytest.mark.xdist_group("dbt_manifest") def test_model_dialect(sushi_test_project: Project, assert_exp_eq): model_config = ModelConfig(