Skip to content

Commit

Permalink
Update spec and fix autogenerated headers with skip after (#30123)
Browse files Browse the repository at this point in the history
  • Loading branch information
maxi297 authored Sep 3, 2023
1 parent 399b4d1 commit 5b65367
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
#

from pydantic import BaseModel, Field
from typing_extensions import Literal


class AvroFormat(BaseModel):
class Config:
title = "Avro Format"

filetype: Literal["avro"] = "avro"
filetype: str = Field(
"avro",
const=True,
)

double_as_string: bool = Field(
title="Convert Double Fields to Strings",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import Any, Dict, List, Optional, Set, Union

from pydantic import BaseModel, Field, ValidationError, root_validator, validator
from typing_extensions import Literal


class InferenceType(Enum):
Expand All @@ -25,7 +24,10 @@ class CsvHeaderFromCsv(BaseModel):
class Config:
title = "From CSV"

header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value # type: ignore
header_definition_type: str = Field(
CsvHeaderDefinitionType.FROM_CSV.value,
const=True,
)

def has_header_row(self) -> bool:
return True
Expand All @@ -35,7 +37,10 @@ class CsvHeaderAutogenerated(BaseModel):
class Config:
title = "Autogenerated"

header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value # type: ignore
header_definition_type: str = Field(
CsvHeaderDefinitionType.AUTOGENERATED.value,
const=True,
)

def has_header_row(self) -> bool:
return False
Expand All @@ -45,7 +50,10 @@ class CsvHeaderUserProvided(BaseModel):
class Config:
title = "User Provided"

header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value # type: ignore
header_definition_type: str = Field(
CsvHeaderDefinitionType.USER_PROVIDED.value,
const=True,
)
column_names: List[str] = Field(
title="Column Names",
description="The column names that will be used while emitting the CSV records",
Expand All @@ -69,7 +77,10 @@ class CsvFormat(BaseModel):
class Config:
title = "CSV Format"

filetype: Literal["csv"] = "csv"
filetype: str = Field(
"csv",
const=True,
)
delimiter: str = Field(
title="Delimiter",
description="The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from pydantic import BaseModel
from typing_extensions import Literal
from pydantic import BaseModel, Field


class JsonlFormat(BaseModel):
class Config:
title = "Jsonl Format"

filetype: Literal["jsonl"] = "jsonl"
filetype: str = Field(
"jsonl",
const=True,
)
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
#

from pydantic import BaseModel, Field
from typing_extensions import Literal


class ParquetFormat(BaseModel):
class Config:
title = "Parquet Format"

filetype: Literal["parquet"] = "parquet"
filetype: str = Field(
"parquet",
const=True,
)
# This option is not recommended, but necessary for backwards compatibility
decimal_as_float: bool = Field(
title="Convert Decimal Fields to Floats",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,12 @@ def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str)
if isinstance(config_format.header_definition, CsvHeaderUserProvided):
return config_format.header_definition.column_names # type: ignore # should be CsvHeaderUserProvided given the type

self._skip_rows(fp, config_format.skip_rows_before_header)
if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
self._skip_rows(fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header)
headers = self._auto_generate_headers(fp, dialect_name)
else:
# Then read the header
self._skip_rows(fp, config_format.skip_rows_before_header)
reader = csv.reader(fp, dialect=dialect_name) # type: ignore
headers = list(next(reader))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,21 @@ def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_f

assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]

def test_given_skip_row_before_and_after_and_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
    """Check that autogenerated f0..f6 headers are produced when rows are skipped before and after the header.

    The file holds three throwaway lines followed by one data line; with one
    row skipped "before" and two skipped "after", the test expects only the
    final line to come back as a record.
    """
    rows = [
        "skip before",
        "skip after 1",
        "skip after 2",
        "0,1,2,3,4,5,6",
    ]
    expected_records = [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]

    # Configure the format first, then hand the reader its file content.
    config_format = self._config_format
    config_format.header_definition = CsvHeaderAutogenerated()
    config_format.skip_rows_before_header = 1
    config_format.skip_rows_after_header = 2
    self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(rows).build()

    records = list(self._read_data())

    assert records == expected_records

def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@
"title": "Avro Format",
"type": "object",
"properties": {
"filetype": {"title": "Filetype", "default": "avro", "enum": ["avro"], "type": "string"},
"filetype": {"title": "Filetype", "default": "avro", "const": "avro", "type": "string"},
"double_as_string": {
"title": "Convert Double Fields to Strings",
"description": "Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.",
Expand All @@ -124,7 +124,7 @@
"title": "CSV Format",
"type": "object",
"properties": {
"filetype": {"title": "Filetype", "default": "csv", "enum": ["csv"], "type": "string"},
"filetype": {"title": "Filetype", "default": "csv", "const": "csv", "type": "string"},
"delimiter": {
"title": "Delimiter",
"description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
Expand Down Expand Up @@ -190,21 +190,21 @@
"title": "From CSV",
"type": "object",
"properties": {
"header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
"header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "const": "From CSV", "type": "string"},
},
},
{
"title": "Autogenerated",
"type": "object",
"properties": {
"header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
"header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "const": "Autogenerated", "type": "string"},
},
},
{
"title": "User Provided",
"type": "object",
"properties": {
"header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
"header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "const": "User Provided", "type": "string"},
"column_names": {
"title": "Column Names",
"description": "The column names that will be used while emitting the CSV records",
Expand Down Expand Up @@ -247,7 +247,7 @@
"title": "Jsonl Format",
"type": "object",
"properties": {
"filetype": {"title": "Filetype", "default": "jsonl", "enum": ["jsonl"], "type": "string"}
"filetype": {"title": "Filetype", "default": "jsonl", "const": "jsonl", "type": "string"}
},
},
{
Expand All @@ -257,7 +257,7 @@
"filetype": {
"title": "Filetype",
"default": "parquet",
"enum": ["parquet"],
"const": "parquet",
"type": "string",
},
"decimal_as_float": {
Expand Down

0 comments on commit 5b65367

Please sign in to comment.