diff --git a/columnar/columnar-cli/Cargo.toml b/columnar/columnar-cli/Cargo.toml new file mode 100644 index 000000000..0c1fd9b67 --- /dev/null +++ b/columnar/columnar-cli/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "tantivy-columnar-cli" +version = "0.1.0" +edition = "2021" +license = "MIT" + +[dependencies] +columnar = {path="../", package="tantivy-columnar"} +serde_json = "1" +serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"} +serde = "1" + +[workspace] +members = [] + +[profile.release] +debug = true diff --git a/columnar/columnar-cli/src/main.rs b/columnar/columnar-cli/src/main.rs new file mode 100644 index 000000000..82fc65f3d --- /dev/null +++ b/columnar/columnar-cli/src/main.rs @@ -0,0 +1,134 @@ +use columnar::ColumnarWriter; +use columnar::NumericalValue; +use serde_json_borrow; +use std::fs::File; +use std::io; +use std::io::BufRead; +use std::io::BufReader; +use std::time::Instant; + +#[derive(Default)] +struct JsonStack { + path: String, + stack: Vec, +} + +impl JsonStack { + fn push(&mut self, seg: &str) { + let len = self.path.len(); + self.stack.push(len); + self.path.push('.'); + self.path.push_str(seg); + } + + fn pop(&mut self) { + if let Some(len) = self.stack.pop() { + self.path.truncate(len); + } + } + + fn path(&self) -> &str { + &self.path[1..] + } +} + +fn append_json_to_columnar( + doc: u32, + json_value: &serde_json_borrow::Value, + columnar: &mut ColumnarWriter, + stack: &mut JsonStack, +) -> usize { + let mut count = 0; + match json_value { + serde_json_borrow::Value::Null => {} + serde_json_borrow::Value::Bool(val) => { + columnar.record_numerical( + doc, + stack.path(), + NumericalValue::from(if *val { 1u64 } else { 0u64 }), + ); + count += 1; + } + serde_json_borrow::Value::Number(num) => { + let numerical_value: NumericalValue = if let Some(num_i64) = num.as_i64() { + num_i64.into() + } else if let Some(num_u64) = num.as_u64() { + num_u64.into() + } else if let Some(num_f64) = num.as_f64() { + num_f64.into() + } else { + panic!(); + }; + count += 1; + columnar.record_numerical( + doc, + stack.path(), + numerical_value, + ); + } + serde_json_borrow::Value::Str(msg) => { + columnar.record_str( + doc, + stack.path(), + msg, + ); + count += 1; + }, + serde_json_borrow::Value::Array(vals) => { + for val in vals { + count += append_json_to_columnar(doc, val, columnar, stack); + } + }, + serde_json_borrow::Value::Object(json_map) => { + for (child_key, child_val) in json_map { + stack.push(child_key); + count += append_json_to_columnar(doc, child_val, columnar, stack); + stack.pop(); + } + }, + } + count +} + +fn main() -> io::Result<()> { + let file = File::open("gh_small.json")?; + let mut reader = BufReader::new(file); + let mut line = String::with_capacity(100); + let mut columnar = columnar::ColumnarWriter::default(); + let mut doc = 0; + let start = Instant::now(); + let mut stack = JsonStack::default(); + let mut total_count = 0; + + let start_build = Instant::now(); + loop { + line.clear(); + let len = reader.read_line(&mut line)?; + if len == 0 { + break; + } + let Ok(json_value) = serde_json::from_str::(&line) else { continue; }; + total_count += append_json_to_columnar(doc, &json_value, &mut columnar, &mut stack); + doc += 1; + } + println!("Build in {:?}", start_build.elapsed()); + + println!("value count {total_count}"); + + let mut buffer = Vec::new(); + let start_serialize = Instant::now(); + columnar.serialize(doc, None, &mut buffer)?; + println!("Serialized in {:?}", start_serialize.elapsed()); + println!("num docs: {doc}, {:?}", start.elapsed()); + println!("buffer len {} MB", buffer.len() / 1_000_000); + let columnar = columnar::ColumnarReader::open(buffer)?; + for (column_name, dynamic_column) in columnar.list_columns()? { + let num_bytes = dynamic_column.num_bytes(); + let typ = dynamic_column.column_type(); + if num_bytes > 1_000_000 { + println!("{column_name} {typ:?} {} KB", num_bytes / 1_000); + } + } + println!("{} columns", columnar.num_columns()); + Ok(()) +}