Skip to content

Commit

Permalink
#62 cli tool for printing file stats (#84)
Browse files Browse the repository at this point in the history
* #62 cli tool for printing file stats

* Update Cargo.toml

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* fixed formatting

---------

Co-authored-by: Weny Xu <wenymedia@gmail.com>
  • Loading branch information
klangner and WenyXu authored Apr 16, 2024
1 parent c90c282 commit b1678ea
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 2 deletions.
9 changes: 7 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ include = ["src/**/*.rs", "Cargo.toml"]
rust-version = "1.70"

[dependencies]
anyhow = { version = "1.0", optional = true }
arrow = { version = "51", features = ["prettyprint"] }
bytes = "1.4"
chrono = { version = "0.4.37", default-features = false, features = ["std"] }
chrono-tz = "0.8.6"
clap = { version = "4.5.3", features = ["derive"], optional = true }
clap = { version = "4.5.4", features = ["derive"], optional = true }
fallible-streaming-iterator = { version = "0.1" }
flate2 = "1"
futures = { version = "0.3", optional = true, default-features = false, features = ["std"] }
Expand Down Expand Up @@ -52,7 +53,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"] }
default = ["async"]

async = ["futures", "futures-util", "tokio"]
cli = ["clap"]
cli = ["anyhow", "clap"]

[[bench]]
name = "arrow_reader"
Expand All @@ -70,3 +71,7 @@ path = "./examples/datafusion_integration.rs"
[[bin]]
name = "orc-metadata"
required-features = ["cli"]

[[bin]]
name = "orc-stats"
required-features = ["cli"]
132 changes: 132 additions & 0 deletions src/bin/orc-stats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
use std::{fs::File, path::PathBuf, sync::Arc};

use anyhow::Result;
use arrow::temporal_conversions::{date32_to_datetime, timestamp_ms_to_datetime};
use clap::Parser;
use orc_rust::{reader::metadata::read_metadata, statistics::ColumnStatistics};

// Command-line arguments of the `orc-stats` binary, parsed with clap's derive API.
// Plain `//` comments are used here on purpose: clap turns `///` doc comments into
// user-visible help text, so only the field below carries one.
#[derive(Parser)]
#[command(
    name = "orc-stats",
    version,
    about = "Print column and stripe stats from the orc file",
    long_about = None
)]
struct Cli {
    /// Path to the orc file
    file: PathBuf,
}

/// Prints one date statistic under `label`.
///
/// `days` is passed to arrow's `date32_to_datetime`. When that conversion
/// returns `None`, the raw value is printed instead so the statistic is never
/// silently dropped from the report.
fn print_date_stat(label: &str, days: i32) {
    match date32_to_datetime(days) {
        Some(dt) => println!("* {label}: {dt}"),
        None => println!("* {label}: {days} (raw date32 value, not convertible)"),
    }
}

/// Prints one millisecond timestamp statistic under `label`.
///
/// `millis` is passed to arrow's `timestamp_ms_to_datetime`. When that
/// conversion returns `None`, the raw value is printed instead so the
/// statistic is never silently dropped from the report.
fn print_timestamp_ms_stat(label: &str, millis: i64) {
    match timestamp_ms_to_datetime(millis) {
        Some(ts) => println!("* {label}: {ts}"),
        None => println!("* {label}: {millis} (raw millisecond value, not convertible)"),
    }
}

/// Prints the statistics of a single column to stdout.
///
/// If the column carries type-specific statistics, a `* Data type ...` header
/// and the fields of that variant are printed first. The number of values and
/// the null flag are always printed, followed by one blank line.
fn print_column_stats(col_stats: &ColumnStatistics) {
    if let Some(tstats) = col_stats.type_statistics() {
        // No catch-all arm: a new `TypeStatistics` variant should fail to
        // compile here rather than be skipped silently.
        match tstats {
            orc_rust::statistics::TypeStatistics::Integer { min, max, sum } => {
                println!("* Data type Integer");
                println!("* Minimum: {}", min);
                println!("* Maximum: {}", max);
                // The sum is optional for this variant; print it only when present.
                if let Some(sum) = sum {
                    println!("* Sum: {}", sum);
                }
            }
            orc_rust::statistics::TypeStatistics::Double { min, max, sum } => {
                println!("* Data type Double");
                println!("* Minimum: {}", min);
                println!("* Maximum: {}", max);
                // The sum is optional for this variant; print it only when present.
                if let Some(sum) = sum {
                    println!("* Sum: {}", sum);
                }
            }
            orc_rust::statistics::TypeStatistics::String { min, max, sum } => {
                println!("* Data type String");
                println!("* Minimum: {}", min);
                println!("* Maximum: {}", max);
                println!("* Sum: {}", sum);
            }
            orc_rust::statistics::TypeStatistics::Bucket { true_count } => {
                println!("* Data type Bucket");
                println!("* True count: {}", true_count);
            }
            orc_rust::statistics::TypeStatistics::Decimal { min, max, sum } => {
                println!("* Data type Decimal");
                println!("* Minimum: {}", min);
                println!("* Maximum: {}", max);
                println!("* Sum: {}", sum);
            }
            orc_rust::statistics::TypeStatistics::Date { min, max } => {
                println!("* Data type Date");
                print_date_stat("Minimum", *min);
                print_date_stat("Maximum", *max);
            }
            orc_rust::statistics::TypeStatistics::Binary { sum } => {
                println!("* Data type Binary");
                println!("* Sum: {}", sum);
            }
            orc_rust::statistics::TypeStatistics::Timestamp {
                min,
                max,
                min_utc,
                max_utc,
            } => {
                println!("* Data type Timestamp");
                // `min`/`max` are printed as stored; only the UTC pair is
                // rendered as a date-time.
                println!("* Minimum: {}", min);
                println!("* Maximum: {}", max);
                print_timestamp_ms_stat("Minimum UTC", *min_utc);
                print_timestamp_ms_stat("Maximum UTC", *max_utc);
            }
            orc_rust::statistics::TypeStatistics::Collection {
                min_children,
                max_children,
                total_children,
            } => {
                println!("* Data type Collection");
                println!("* Minimum children: {}", min_children);
                println!("* Maximum children: {}", max_children);
                println!("* Total children: {}", total_children);
            }
        }
    }

    println!("* Num values: {}", col_stats.number_of_values());
    println!("* Has nulls: {}", col_stats.has_null());
    println!();
}

fn main() -> Result<()> {
let cli = Cli::parse();

let mut f = File::open(&cli.file)?;
let metadata = Arc::new(read_metadata(&mut f)?);

println!("# Column stats");
println!(
"File {:?} has {} columns",
cli.file,
metadata.column_file_statistics().len()
);
println!();
for (idx, col_stats) in metadata.column_file_statistics().iter().enumerate() {
println!("## Column {idx}");
print_column_stats(col_stats);
}

println!("# Stripe stats");
println!(
"File {:?} has {} stripes",
cli.file,
metadata.stripe_metadatas().len()
);
println!();
for (idm, sm) in metadata.stripe_metadatas().iter().enumerate() {
println!("----- Stripe {idm} -----\n");
for (idc, col_stats) in sm.column_statistics().iter().enumerate() {
println!("## Column {idc}");
print_column_stats(col_stats);
}
}

Ok(())
}

0 comments on commit b1678ea

Please sign in to comment.