Avoid adding a NullBuffer when decoding timestamp offsets (#90)

Since 60288cd we applied an unary_opt kernel to decode timezones. This kernel always returns `Some` unless the date is outside the 1677-2262. Unfortunately, even though the kernel is unlikely to return `None`, applying the kernel always causes the resulting array to get a `NullBuffer`, even if the source array did not have one. In order to avoid unnecessarily adding a `NullBuffer`, this commit first tries to apply a non-nullable kernel; and only falls back to `unary_opt` in the rare case it fails. An alternative implementation that does not risk running the kernel twice would be to check the `NullBuffer`'s `null_count` after running the kernel, then strip it if its `null_count` is zero; but it requires the unnecessary allocation of a `NullBuffer`.
datafusion-contrib · May 24, 2024 · ac5a8ab · ac5a8ab
1 parent f223788
commit ac5a8ab
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 2 deletions.
diff --git a/src/array_decoder/timestamp.rs b/src/array_decoder/timestamp.rs
@@ -109,15 +109,23 @@ impl ArrayBatchDecoder for TimestampOffsetArrayDecoder {
         let array = self
             .inner
             .next_primitive_batch(batch_size, parent_present)?;
-        let array = array.unary_opt::<_, TimestampNanosecondType>(|ts| {
+
+        let convert_timezone = |ts| {
             // Convert from writer timezone to reader timezone (which we default to UTC)
             // TODO: more efficient way of doing this?
             self.writer_tz
                 .timestamp_nanos(ts)
                 .naive_local()
                 .and_utc()
                 .timestamp_nanos_opt()
-        });
+        };
+        let array = array
+            // first try to convert all non-nullable batches to non-nullable batches
+            .try_unary::<_, TimestampNanosecondType, _>(|ts| convert_timezone(ts).ok_or(()))
+            // in the rare case one of the values was out of the 1677-2262 range (see
+            // <https://docs.rs/chrono/latest/chrono/struct.DateTime.html#method.timestamp_nanos_opt>),
+            // try again by allowing a nullable batch as output
+            .unwrap_or_else(|()| array.unary_opt::<_, TimestampNanosecondType>(convert_timezone));
         let array = Arc::new(array) as ArrayRef;
         Ok(array)
     }

diff --git a/tests/integration/main.rs b/tests/integration/main.rs
@@ -3,7 +3,9 @@
 use std::fs::File;
 
 use arrow::{
+    array::{Array, AsArray},
     compute::concat_batches,
+    datatypes::TimestampNanosecondType,
     ipc::reader::FileReader,
     record_batch::{RecordBatch, RecordBatchReader},
 };
@@ -97,6 +99,27 @@ fn test_date_1900() {
     test_expected_file("TestOrcFile.testDate1900");
 }
 
+#[test]
+fn test_date_1900_not_null() {
+    // Don't use read_orc_file() because it concatenate batches, which would detect
+    // there are no nulls and remove the NullBuffer, making this test useless
+    let path = format!(
+        "{}/tests/integration/data/TestOrcFile.testDate1900.orc",
+        env!("CARGO_MANIFEST_DIR"),
+    );
+    let f = File::open(path).unwrap();
+    let reader = ArrowReaderBuilder::try_new(f).unwrap().build();
+    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
+
+    for batch in batches {
+        assert!(batch.columns()[0]
+            .as_primitive_opt::<TimestampNanosecondType>()
+            .unwrap()
+            .nulls()
+            .is_none());
+    }
+}
+
 #[test]
 #[ignore]
 // TODO: pending https://github.com/chronotope/chrono-tz/issues/155