Skip to content

Commit

Permalink
Merge pull request finos#2713 from sinistersnare/feature/better-times…
Browse files Browse the repository at this point in the history
…tamp-parsing

Better Timestamp Parsing
  • Loading branch information
texodus authored Aug 23, 2024
2 parents 22f56cc + bdabded commit e179382
Show file tree
Hide file tree
Showing 5 changed files with 430 additions and 18 deletions.
211 changes: 197 additions & 14 deletions cpp/perspective/src/cpp/arrow_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// ┃ of the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). ┃
// ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛

#include <chrono>
#include <perspective/base.h>
#include <perspective/arrow_csv.h>
#include <arrow/util/value_parsing.h>
Expand Down Expand Up @@ -223,24 +224,67 @@ ParseSSS(const char* s, std::chrono::milliseconds* out) {
}

static inline bool
ParseTZ(const char* s, std::chrono::hours* out) {
uint8_t hours = 0;
ParseSSSSSS(const char* s, std::chrono::microseconds* out) {
uint32_t nanos = 0;
if (ARROW_PREDICT_FALSE(s[0] != '.')) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 1, 6, &nanos)
)) {
return false;
}

if (ARROW_PREDICT_FALSE(s[0] != '+') && ARROW_PREDICT_FALSE(s[0] != '-')) {
if (ARROW_PREDICT_FALSE(nanos >= 999999)) {
return false;
}
*out = std::chrono::microseconds(nanos);
return true;
}

static inline bool
ParseSSSSSSSSS(const char* s, std::chrono::nanoseconds* out) {
uint32_t nanos = 0;
if (ARROW_PREDICT_FALSE(s[0] != '.')) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 1, 9, &nanos)
)) {
return false;
}

if (ARROW_PREDICT_FALSE(nanos >= 999999999)) {
return false;
}
*out = std::chrono::nanoseconds(nanos);
return true;
}

static inline bool
ParseTZ(const char* s, std::chrono::minutes* out) {
uint8_t hours = 0;
uint8_t minutes = 0;
if ((ARROW_PREDICT_FALSE(s[0] != '+') && ARROW_PREDICT_FALSE(s[0] != '-'))
|| ARROW_PREDICT_FALSE(s[3] != ':')) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 1, 2, &hours)
)) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 4, 2, &minutes)
)) {
return false;
}

if (ARROW_PREDICT_FALSE(hours >= 12)) {
if (ARROW_PREDICT_FALSE(hours >= 12)
|| ARROW_PREDICT_FALSE(minutes >= 59)) {
return false;
}
if (s[0] == '-') {
hours = -hours;
int32_t total = hours * 60 + minutes;
if (s[0] == '+') {
total = -total;
}
*out = std::chrono::hours(hours);
*out = std::chrono::minutes(total);
return true;
}

Expand All @@ -254,13 +298,20 @@ class CustomISO8601Parser : public arrow::TimestampParser {
int64_t* out,
bool* out_zone_offset_present = NULLPTR
) const override {
// if we are trying to parse this with seconds, fail
// and it will try to parse this again but as
// nanoseconds :) then it wont truncate the fractional bits.
if (unit == arrow::TimeUnit::SECOND) {
return false;
}

if (!arrow::internal::ParseTimestampISO8601(s, length, unit, out)) {
if (s[length - 1] == 'Z') {
--length;
}
if (length == 23) {
// "YYYY-MM-DD[ T]hh:mm:ss.sss"
// "YYYY-MM-DD[ T]hh:mm:ss.sss" -- millis

arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
Expand All @@ -281,8 +332,9 @@ class CustomISO8601Parser : public arrow::TimestampParser {
);
return true;
}

if (length == 25) {
// "2008-09-15[ T]15:53:00+05:00"
// "2008-09-15[ T]15:53:00+05:00" -- seconds with TZ
arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
Expand All @@ -293,7 +345,7 @@ class CustomISO8601Parser : public arrow::TimestampParser {
))) {
return false;
}
std::chrono::hours tz;
std::chrono::minutes tz;
if (ARROW_PREDICT_FALSE(!ParseTZ(s + 19, &tz))) {
return false;
}
Expand All @@ -303,6 +355,137 @@ class CustomISO8601Parser : public arrow::TimestampParser {
);
return true;
}
if (length == 26) {
// YYYY-MM-DD[ T]hh:mm:ss.ssssss -- micros

arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::seconds seconds;
if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM_SS(
s + 11, &seconds
))) {
return false;
}
std::chrono::microseconds micros;
if (ARROW_PREDICT_FALSE(!ParseSSSSSS(s + 19, &micros))) {
return false;
}
// round the micros into millis as Perspective does not support
// nano precision.
auto millis =
std::chrono::duration_cast<std::chrono::milliseconds>(micros
);
*out = ConvertTimePoint(
arrow_vendored::date::sys_days(ymd) + seconds + millis, unit
);

return true;
}

if (length == 29) {
// YYYY-MM-DD[ T]hh:mm:ss.sssssssss -- nanos
// arrow handles YYYY-MM-DD[ T]hh:mm:ss.sss[+-]HH:MM
std::cout << "DDD WOOHOOOOO!\n";
arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::seconds seconds;
if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM_SS(
s + 11, &seconds
))) {
return false;
}
// we can now be at sss[+-]HH:MM -- millis and TZ
// or sssssssss -- nanos
std::chrono::nanoseconds nanos;
if (ARROW_PREDICT_FALSE(!ParseSSSSSSSSS(s + 19, &nanos))) {
return false;
}
// Truncate the nanos into millis as Perspective does not
// support nano precision.
auto millis =
std::chrono::duration_cast<std::chrono::milliseconds>(nanos
);

*out = ConvertTimePoint(
arrow_vendored::date::sys_days(ymd) + seconds + millis, unit
);

return true;
}
if (length == 32) {
// YYYY-MM-DD[ T]hh:mm:ss.ssssss[+-]HH:MM -- micros with TZ

arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::seconds seconds;
if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM_SS(
s + 11, &seconds
))) {
return false;
}
std::chrono::microseconds micros;
if (ARROW_PREDICT_FALSE(!ParseSSSSSS(s + 19, &micros))) {
return false;
}
// round the micros into millis as Perspective does not support
// nano precision.
auto millis =
std::chrono::duration_cast<std::chrono::milliseconds>(micros
);

std::chrono::minutes tz;
if (ARROW_PREDICT_FALSE(!ParseTZ(s + 26, &tz))) {
return false;
}
*out = ConvertTimePoint(
arrow_vendored::date::sys_days(ymd) + seconds + millis + tz,
unit
);

return true;
}

if (length == 35) {
// YYYY-MM-DD[ T]hh:mm:ss.sssssssss[+-]HH:MM -- nanos with TZ

arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::seconds seconds;
if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM_SS(
s + 11, &seconds
))) {
return false;
}
std::chrono::nanoseconds nanos;
if (ARROW_PREDICT_FALSE(!ParseSSSSSSSSS(s + 19, &nanos))) {
return false;
}
// round the nanos into millis as Perspective does not support
// nano precision.
auto millis =
std::chrono::duration_cast<std::chrono::milliseconds>(nanos
);

std::chrono::minutes tz;
if (ARROW_PREDICT_FALSE(!ParseTZ(s + 29, &tz))) {
return false;
}

*out = ConvertTimePoint(
arrow_vendored::date::sys_days(ymd) + seconds + millis + tz,
unit
);

return true;
}
return false;
}
return true;
Expand Down Expand Up @@ -387,8 +570,8 @@ std::vector<std::shared_ptr<arrow::TimestampParser>> DATE_PARSERS{
std::make_shared<CustomISO8601Parser>(),
std::make_shared<USTimestampParser>(),
arrow::TimestampParser::MakeStrptime("%Y-%m-%d\\D%H:%M:%S.%f"),
arrow::TimestampParser::MakeStrptime("%m/%d/%Y, %I:%M:%S %p"
), // US locale string
arrow::TimestampParser::MakeStrptime("%m/%d/%Y, %I:%M:%S %p"),
// US locale string
arrow::TimestampParser::MakeStrptime("%m-%d-%Y"),
arrow::TimestampParser::MakeStrptime("%m/%d/%Y"),
arrow::TimestampParser::MakeStrptime("%d %m %Y"),
Expand All @@ -401,8 +584,8 @@ std::vector<std::shared_ptr<arrow::TimestampParser>> DATE_READERS{
std::make_shared<CustomISO8601Parser>(),
std::make_shared<USTimestampParser>(),
arrow::TimestampParser::MakeStrptime("%Y-%m-%d\\D%H:%M:%S.%f"),
arrow::TimestampParser::MakeStrptime("%m/%d/%Y, %I:%M:%S %p"
), // US locale string
arrow::TimestampParser::MakeStrptime("%m/%d/%Y, %I:%M:%S %p"),
// US locale
arrow::TimestampParser::MakeStrptime("%m-%d-%Y"),
arrow::TimestampParser::MakeStrptime("%m/%d/%Y"),
arrow::TimestampParser::MakeStrptime("%d %m %Y"),
Expand Down
2 changes: 1 addition & 1 deletion rust/perspective-js/src/ts/emscripten_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ export async function compile_perspective(
const str = Error().stack || "";
const textEncoder = new TextEncoder();
const bytes = textEncoder.encode(str);
const ptr = module._psp_js_alloc(bytes.byteLength + 1);
const ptr = module._psp_alloc(bytes.byteLength + 1);
module.HEAPU8.set(bytes, ptr);
module.HEAPU8[ptr + bytes.byteLength] = 0;
return ptr;
Expand Down
21 changes: 18 additions & 3 deletions rust/perspective-js/test/js/clear.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,29 @@ import type * as psp_types from "@finos/perspective";
{ x: 1, y: 2 },
{ x: 3, y: 4 },
]);
table.replace([{ x: 5, y: 6 }]);
await table.replace([{ x: 5, y: 6 }]);
json = await view.to_json();
expect(json).toHaveLength(1);
expect(json).toEqual([{ x: 5, y: 6 }]);
view.delete();
table.delete();
});

test("Replaces CSV Table with high precision datetimes", async function () {
const a = '"start"\n2024-08-14T14:06:07.826Z';
const b = '"start"\n2024-08-14T14:06:09.876667543Z';
const table = await perspective.table(a);
const view = await table.view();
const csv1 = await view.to_csv();
expect(csv1).toEqual('"start"\n2024-08-14 14:06:07.826\n');

await table.replace(b);
const csv2 = await view.to_csv();
expect(csv2).toEqual('"start"\n2024-08-14 14:06:09.876\n');
view.delete();
table.delete();
});

test("replaces the rows in the table with the input data and fires an on_update", async function () {
const table = await perspective.table([
{ x: 1, y: 2 },
Expand Down Expand Up @@ -95,7 +110,7 @@ import type * as psp_types from "@finos/perspective";
{ x: 3, y: 4 },
]);

table.replace([{ x: 5, y: 6 }]);
await table.replace([{ x: 5, y: 6 }]);
await result;
});

Expand Down Expand Up @@ -139,7 +154,7 @@ import type * as psp_types from "@finos/perspective";
{ x: 3, y: 4 },
]);

table.replace([{ x: 5, y: 6 }]);
await table.replace([{ x: 5, y: 6 }]);
await result;
});

Expand Down
Loading

0 comments on commit e179382

Please sign in to comment.