Skip to content

Commit

Permalink
Avoid adding a NullBuffer when decoding timestamp offsets (#90)
Browse files Browse the repository at this point in the history
Since 60288cd we applied an unary_opt
kernel to decode timezones. This kernel always returns `Some` unless
the date is outside the 1677-2262.

Unfortunately, even though the kernel is unlikely to return `None`,
applying the kernel always causes the resulting array to get a
`NullBuffer`, even if the source array did not have one.

In order to avoid unnecessarily adding a `NullBuffer`, this commit first
tries to apply a non-nullable kernel; and only falls back to `unary_opt`
in the rare case it fails.

An alternative implementation that does not risk running the kernel
twice would be to check the `NullBuffer`'s `null_count` after running
the kernel, then strip it if its `null_count` is zero; but it requires
the unnecessary allocation of a `NullBuffer`.
  • Loading branch information
progval authored May 24, 2024
1 parent f223788 commit ac5a8ab
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/array_decoder/timestamp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,23 @@ impl ArrayBatchDecoder for TimestampOffsetArrayDecoder {
let array = self
.inner
.next_primitive_batch(batch_size, parent_present)?;
let array = array.unary_opt::<_, TimestampNanosecondType>(|ts| {

let convert_timezone = |ts| {
// Convert from writer timezone to reader timezone (which we default to UTC)
// TODO: more efficient way of doing this?
self.writer_tz
.timestamp_nanos(ts)
.naive_local()
.and_utc()
.timestamp_nanos_opt()
});
};
let array = array
// first try to convert all non-nullable batches to non-nullable batches
.try_unary::<_, TimestampNanosecondType, _>(|ts| convert_timezone(ts).ok_or(()))
// in the rare case one of the values was out of the 1677-2262 range (see
// <https://docs.rs/chrono/latest/chrono/struct.DateTime.html#method.timestamp_nanos_opt>),
// try again by allowing a nullable batch as output
.unwrap_or_else(|()| array.unary_opt::<_, TimestampNanosecondType>(convert_timezone));
let array = Arc::new(array) as ArrayRef;
Ok(array)
}
Expand Down
23 changes: 23 additions & 0 deletions tests/integration/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
use std::fs::File;

use arrow::{
array::{Array, AsArray},
compute::concat_batches,
datatypes::TimestampNanosecondType,
ipc::reader::FileReader,
record_batch::{RecordBatch, RecordBatchReader},
};
Expand Down Expand Up @@ -97,6 +99,27 @@ fn test_date_1900() {
test_expected_file("TestOrcFile.testDate1900");
}

#[test]
fn test_date_1900_not_null() {
// Don't use read_orc_file() because it concatenate batches, which would detect
// there are no nulls and remove the NullBuffer, making this test useless
let path = format!(
"{}/tests/integration/data/TestOrcFile.testDate1900.orc",
env!("CARGO_MANIFEST_DIR"),
);
let f = File::open(path).unwrap();
let reader = ArrowReaderBuilder::try_new(f).unwrap().build();
let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();

for batch in batches {
assert!(batch.columns()[0]
.as_primitive_opt::<TimestampNanosecondType>()
.unwrap()
.nulls()
.is_none());
}
}

#[test]
#[ignore]
// TODO: pending https://github.com/chronotope/chrono-tz/issues/155
Expand Down

0 comments on commit ac5a8ab

Please sign in to comment.