Skip to content

Commit

Permalink
Merge pull request #84 from planetlabs/overture
Browse files (browse the repository at this point in the history)
Ensure that Parquet with complex types transforms to valid GeoParquet
  • Loading branch information
tschaub authored Oct 2, 2023
2 parents 6e08eca + f554f5c commit dd967c7
Show file tree
Hide file tree
Showing 6 changed files with 362 additions and 12 deletions.
1 change: 1 addition & 0 deletions internal/geoparquet/geoparquet.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ func getMetadata(fileReader *file.Reader, convertOptions *ConvertOptions) *Metad
primaryColumn = convertOptions.InputPrimaryColumn
}
metadata = &Metadata{
Version: Version,
PrimaryColumn: primaryColumn,
Columns: map[string]*GeometryColumn{
primaryColumn: getDefaultGeometryColumn(),
Expand Down
3 changes: 0 additions & 3 deletions internal/pqutil/transform.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,6 @@ func TransformByColumn(config *TransformConfig) error {
if err != nil {
return err
}
if transformed.DataType() != outputField.Type {
return fmt.Errorf("transform generated an unexpected type, got %s, expected %s", transformed.DataType().Name(), outputField.Type.Name())
}
arr = transformed
}
colWriter, colWriterErr := pqarrow.NewArrowColumnWriter(arr, 0, int64(arr.Len()), outputManifest, rowGroupWriter, fieldNum)
Expand Down
18 changes: 9 additions & 9 deletions internal/validator/rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -432,13 +432,13 @@ func GeometryUngrouped() Rule {
title: "geometry columns must not be grouped",
validate: func(info *FileInfo) error {
metadata := info.Metadata
sc := info.File.MetaData().Schema
root := info.File.MetaData().Schema.Root()
for name := range metadata.Columns {
index := sc.ColumnIndexByName(name)
index := root.FieldIndexByName(name)
if index < 0 {
return fatal("missing geometry column %q", name)
}
_, ok := sc.Root().Field(index).(*schema.PrimitiveNode)
_, ok := root.Field(index).(*schema.PrimitiveNode)
if !ok {
return fmt.Errorf("column %q must not be a group", name)
}
Expand All @@ -454,14 +454,14 @@ func GeometryDataType() Rule {
title: "geometry columns must be stored using the BYTE_ARRAY parquet type",
validate: func(info *FileInfo) error {
metadata := info.Metadata
sc := info.File.MetaData().Schema
root := info.File.MetaData().Schema.Root()
for name := range metadata.Columns {
index := sc.ColumnIndexByName(name)
index := root.FieldIndexByName(name)
if index < 0 {
return fatal("missing geometry column %q", name)
}

field, ok := sc.Root().Field(index).(*schema.PrimitiveNode)
field, ok := root.Field(index).(*schema.PrimitiveNode)
if !ok {
return fatal("expected primitive column for %q", name)
}
Expand All @@ -480,14 +480,14 @@ func GeometryRepetition() Rule {
title: "geometry columns must be required or optional, not repeated",
validate: func(info *FileInfo) error {
metadata := info.Metadata
sc := info.File.MetaData().Schema
root := info.File.MetaData().Schema.Root()
for name := range metadata.Columns {
index := sc.ColumnIndexByName(name)
index := root.FieldIndexByName(name)
if index < 0 {
return fatal("missing geometry column %q", name)
}

repetitionType := sc.Root().Field(index).RepetitionType()
repetitionType := root.Field(index).RepetitionType()
if repetitionType == parquet.Repetitions.Repeated {
return fmt.Errorf("column %q must not be repeated", name)
}
Expand Down
105 changes: 105 additions & 0 deletions internal/validator/testdata/complex-types/expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{
"checks": [
{
"title": "file must include a \"geo\" metadata key",
"run": true,
"passed": true
},
{
"title": "metadata must be a JSON object",
"run": true,
"passed": true
},
{
"title": "metadata must include a \"version\" string",
"run": true,
"passed": true
},
{
"title": "metadata must include a \"primary_column\" string",
"run": true,
"passed": true
},
{
"title": "metadata must include a \"columns\" object",
"run": true,
"passed": true
},
{
"title": "column metadata must include the \"primary_column\" name",
"run": true,
"passed": true
},
{
"title": "column metadata must include a valid \"encoding\" string",
"run": true,
"passed": true
},
{
"title": "column metadata must include a \"geometry_types\" list",
"run": true,
"passed": true
},
{
"title": "optional \"crs\" must be null or a PROJJSON object",
"run": true,
"passed": true
},
{
"title": "optional \"orientation\" must be a valid string",
"run": true,
"passed": true
},
{
"title": "optional \"edges\" must be a valid string",
"run": true,
"passed": true
},
{
"title": "optional \"bbox\" must be an array of 4 or 6 numbers",
"run": true,
"passed": true
},
{
"title": "optional \"epoch\" must be a number",
"run": true,
"passed": true
},
{
"title": "geometry columns must not be grouped",
"run": true,
"passed": true
},
{
"title": "geometry columns must be stored using the BYTE_ARRAY parquet type",
"run": true,
"passed": true
},
{
"title": "geometry columns must be required or optional, not repeated",
"run": true,
"passed": true
},
{
"title": "all geometry values match the \"encoding\" metadata",
"run": true,
"passed": true
},
{
"title": "all geometry types must be included in the \"geometry_types\" metadata (if not empty)",
"run": true,
"passed": true
},
{
"title": "all polygon geometries must follow the \"orientation\" metadata (if present)",
"run": true,
"passed": true
},
{
"title": "all geometries must fall within the \"bbox\" metadata (if present)",
"run": true,
"passed": true
}
],
"metadataOnly": false
}
85 changes: 85 additions & 0 deletions internal/validator/testdata/complex-types/input.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
{
"metadata": {
"version": "1.0.0",
"primary_column": "geometry",
"columns": {
"geometry": {
"encoding": "WKB",
"geometry_types": [
"Point"
],
"orientation": "counterclockwise",
"edges": "planar",
"bbox": [
0,
0,
0,
0
],
"epoch": 2021.47,
"crs": {
"$schema": "https://proj.org/schemas/v0.5/projjson.schema.json",
"type": "GeographicCRS",
"name": "WGS 84 longitude-latitude",
"datum": {
"type": "GeodeticReferenceFrame",
"name": "World Geodetic System 1984",
"ellipsoid": {
"name": "WGS 84",
"semi_major_axis": 6378137,
"inverse_flattening": 298.257223563
}
},
"coordinate_system": {
"subtype": "ellipsoidal",
"axis": [
{
"name": "Geodetic longitude",
"abbreviation": "Lon",
"direction": "east",
"unit": "degree"
},
{
"name": "Geodetic latitude",
"abbreviation": "Lat",
"direction": "north",
"unit": "degree"
}
]
},
"id": {
"authority": "OGC",
"code": "CRS84"
}
}
}
}
},
"data": {
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"properties": {
"numbers": [2, 4, 6, 8],
"strings": ["chicken", "soup"],
"object": {
"name": "Bob"
},
"names": {
"common": [
{"value": "Hello", "language": "en"}
]
}
},
"geometry": {
"type": "Point",
"coordinates": [
0,
0
]
}
}
]
}
}
Loading

0 comments on commit dd967c7

Please sign in to comment.