From bd5eea98527e74d7f57dd39d62e4ab25ce8a5e5b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 20 Jan 2023 18:17:46 +0900 Subject: [PATCH] Integrated columnar work. --- Cargo.toml | 4 +- TODO.txt | 18 + bitpacker/src/bitpacker.rs | 6 +- columnar/Cargo.toml | 4 - columnar/benches/bench_u128.rs | 124 +++ columnar/benches/bench_u64.rs | 211 +++++ columnar/src/TODO.md | 26 +- columnar/src/column/dictionary_encoded.rs | 30 +- columnar/src/column/mod.rs | 58 +- columnar/src/column/serialize.rs | 47 +- columnar/src/column_index/merge/mod.rs | 136 +++ columnar/src/column_index/merge/shuffled.rs | 171 ++++ columnar/src/column_index/merge/stacked.rs | 154 ++++ columnar/src/column_index/mod.rs | 59 +- .../src/column_index/multivalued_index.rs | 134 ++- .../src/column_index/optional_index/mod.rs | 190 ++-- .../src/column_index/optional_index/set.rs | 25 +- .../optional_index/set_block/dense.rs | 63 +- .../optional_index/set_block/sparse.rs | 37 +- .../optional_index/set_block/tests.rs | 36 +- .../src/column_index/optional_index/tests.rs | 141 +-- columnar/src/column_index/serialize.rs | 26 +- columnar/src/column_values/bitpacked.rs | 115 --- .../src/column_values/blockwise_linear.rs | 188 ---- columnar/src/column_values/column.rs | 29 +- .../column_values/column_with_cardinality.rs | 19 - .../src/column_values/compact_space/mod.rs | 734 ++++++++-------- columnar/src/column_values/gcd.rs | 75 -- columnar/src/column_values/linear.rs | 230 ----- columnar/src/column_values/main.rs | 222 ----- columnar/src/column_values/mod.rs | 175 ++-- .../src/column_values/monotonic_mapping.rs | 3 +- .../column_values/monotonic_mapping_u128.rs | 3 +- columnar/src/column_values/serialize.rs | 272 +----- columnar/src/column_values/stats.rs | 96 +++ .../src/column_values/u64_based/bitpacked.rs | 127 +++ .../u64_based/blockwise_linear.rs | 281 ++++++ .../src/column_values/{ => u64_based}/line.rs | 22 +- .../src/column_values/u64_based/linear.rs | 277 ++++++ 
columnar/src/column_values/u64_based/mod.rs | 182 ++++ .../u64_based/stats_collector.rs | 200 +++++ .../column_values/{ => u64_based}/tests.rs | 196 +++-- columnar/src/columnar/column_type.rs | 17 +- columnar/src/columnar/merge.rs | 176 ---- .../src/columnar/merge/merge_dict_column.rs | 204 +++++ columnar/src/columnar/merge/merge_mapping.rs | 118 +++ columnar/src/columnar/merge/mod.rs | 240 ++++++ .../columnar/merge}/sorted_doc_id_column.rs | 0 columnar/src/columnar/merge/term_merger.rs | 107 +++ columnar/src/columnar/merge/tests.rs | 258 ++++++ columnar/src/columnar/merge_index.rs | 1 + columnar/src/columnar/mod.rs | 3 +- columnar/src/columnar/reader/mod.rs | 61 +- .../src/columnar/writer/column_writers.rs | 51 +- columnar/src/columnar/writer/mod.rs | 235 +++-- columnar/src/columnar/writer/serializer.rs | 6 +- columnar/src/columnar/writer/value_index.rs | 40 +- columnar/src/dynamic_column.rs | 34 +- columnar/src/iterable.rs | 19 + columnar/src/lib.rs | 30 +- columnar/src/tests.rs | 16 +- common/src/serialize.rs | 53 +- common/src/vint.rs | 4 +- .../aggregation.rs | 4 +- .../basic_search.rs | 0 .../custom_collector.rs | 0 .../custom_tokenizer.rs | 0 .../date_time_field.rs | 4 +- .../deleting_updating_documents.rs | 0 .../faceted_search.rs | 0 .../faceted_search_with_tweaked_score.rs | 0 .../integer_range_search.rs | 0 {examples => examples-disabled}/ip_field.rs | 0 .../iterating_docs_and_positions.rs | 0 {examples => examples-disabled}/json_field.rs | 0 .../multiple_producer.rs | 0 .../pre_tokenized_text.rs | 0 {examples => examples-disabled}/snippet.rs | 0 {examples => examples-disabled}/stop_words.rs | 0 {examples => examples-disabled}/warmer.rs | 0 .../working_with_json.rs | 0 fastfield_codecs/Cargo.toml | 33 - fastfield_codecs/README.md | 68 -- fastfield_codecs/benches/bench.rs | 311 ------- fastfield_codecs/src/bitpacked.rs | 116 --- fastfield_codecs/src/blockwise_linear.rs | 188 ---- fastfield_codecs/src/column.rs | 352 -------- 
.../src/compact_space/blank_range.rs | 43 - .../src/compact_space/build_compact_space.rs | 231 ----- fastfield_codecs/src/compact_space/mod.rs | 815 ------------------ fastfield_codecs/src/format_version.rs | 38 - fastfield_codecs/src/gcd.rs | 170 ---- fastfield_codecs/src/lib.rs | 568 ------------ fastfield_codecs/src/line.rs | 222 ----- fastfield_codecs/src/linear.rs | 230 ----- fastfield_codecs/src/main.rs | 222 ----- fastfield_codecs/src/monotonic_mapping.rs | 320 ------- .../src/monotonic_mapping_u128.rs | 43 - fastfield_codecs/src/null_index/dense.rs | 500 ----------- fastfield_codecs/src/null_index/mod.rs | 14 - fastfield_codecs/src/null_index/sparse.rs | 768 ----------------- fastfield_codecs/src/null_index_footer.rs | 145 ---- fastfield_codecs/src/serialize.rs | 427 --------- src/aggregation/agg_req_with_accessor.rs | 100 +-- .../bucket/histogram/date_histogram.rs | 126 +++ src/aggregation/bucket/histogram/histogram.rs | 87 +- src/aggregation/bucket/histogram/mod.rs | 2 + src/aggregation/bucket/range.rs | 57 +- src/aggregation/bucket/term_agg.rs | 299 ++----- src/aggregation/collector.rs | 10 +- src/aggregation/intermediate_agg_result.rs | 15 +- src/aggregation/metric/mod.rs | 4 +- src/aggregation/metric/stats.rs | 95 +- src/aggregation/mod.rs | 113 ++- src/aggregation/segment_agg_result.rs | 209 +++-- src/collector/facet_collector.rs | 331 ++++--- src/collector/filter_collector_wrapper.rs | 31 +- src/collector/histogram_collector.rs | 16 +- src/collector/mod.rs | 1 - src/collector/tests.rs | 51 +- src/collector/top_score_collector.rs | 79 +- src/core/index.rs | 20 +- src/core/segment_reader.rs | 38 +- src/directory/composite_file.rs | 11 +- src/fastfield/bytes/mod.rs | 116 --- src/fastfield/bytes/reader.rs | 58 -- src/fastfield/bytes/writer.rs | 145 ---- src/fastfield/error.rs | 2 +- src/fastfield/facet_reader.rs | 148 ++-- src/fastfield/mod.rs | 690 +++++++-------- src/fastfield/multivalued/index.rs | 149 ---- src/fastfield/multivalued/mod.rs | 619 
------------- src/fastfield/multivalued/reader.rs | 333 ------- src/fastfield/multivalued/writer.rs | 442 ---------- src/fastfield/readers.rs | 427 +++------ src/fastfield/serializer/mod.rs | 122 --- src/fastfield/writer.rs | 649 +++----------- src/indexer/doc_id_mapping.rs | 168 ++-- src/indexer/index_writer.rs | 322 ++++--- src/indexer/json_term_writer.rs | 3 +- src/indexer/merger.rs | 707 ++++----------- src/indexer/merger_sorted_index_test.rs | 310 ++++--- src/indexer/mod.rs | 88 +- src/indexer/segment_serializer.rs | 15 +- src/indexer/segment_writer.rs | 39 +- .../sorted_doc_id_multivalue_column.rs | 169 ---- src/lib.rs | 16 + src/postings/json_postings_writer.rs | 15 +- src/postings/mod.rs | 2 - src/postings/postings_writer.rs | 86 +- src/postings/serializer.rs | 11 +- src/postings/term_info.rs | 2 +- src/query/query_parser/query_parser.rs | 15 +- .../range_query/fast_field_range_query.rs | 83 +- src/query/range_query/mod.rs | 27 +- src/query/range_query/range_query.rs | 47 +- .../range_query/range_query_ip_fastfield.rs | 146 ++-- .../range_query/range_query_u64_fastfield.rs | 205 ++--- src/query/term_query/term_query.rs | 2 +- src/schema/date_time_options.rs | 52 +- src/schema/document.rs | 2 +- src/schema/facet.rs | 10 +- src/schema/field.rs | 2 +- src/schema/field_type.rs | 22 +- src/schema/field_value.rs | 2 +- src/schema/ip_options.rs | 26 +- src/schema/mod.rs | 4 +- src/schema/numeric_options.rs | 56 +- src/schema/schema.rs | 55 +- src/schema/term.rs | 2 +- src/schema/value.rs | 4 +- src/space_usage/mod.rs | 13 +- src/store/footer.rs | 2 +- src/termdict/fst_termdict/term_info_store.rs | 4 +- src/tokenizer/facet_tokenizer.rs | 4 +- sstable/src/dictionary.rs | 25 +- sstable/src/lib.rs | 1 + sstable/src/sstable_index.rs | 4 +- stacker/src/arena_hashmap.rs | 24 +- tests/mod.rs | 1 - 180 files changed, 7228 insertions(+), 13811 deletions(-) create mode 100644 TODO.txt create mode 100644 columnar/benches/bench_u128.rs create mode 100644 
columnar/benches/bench_u64.rs create mode 100644 columnar/src/column_index/merge/mod.rs create mode 100644 columnar/src/column_index/merge/shuffled.rs create mode 100644 columnar/src/column_index/merge/stacked.rs delete mode 100644 columnar/src/column_values/bitpacked.rs delete mode 100644 columnar/src/column_values/blockwise_linear.rs delete mode 100644 columnar/src/column_values/column_with_cardinality.rs delete mode 100644 columnar/src/column_values/gcd.rs delete mode 100644 columnar/src/column_values/linear.rs delete mode 100644 columnar/src/column_values/main.rs create mode 100644 columnar/src/column_values/stats.rs create mode 100644 columnar/src/column_values/u64_based/bitpacked.rs create mode 100644 columnar/src/column_values/u64_based/blockwise_linear.rs rename columnar/src/column_values/{ => u64_based}/line.rs (91%) create mode 100644 columnar/src/column_values/u64_based/linear.rs create mode 100644 columnar/src/column_values/u64_based/mod.rs create mode 100644 columnar/src/column_values/u64_based/stats_collector.rs rename columnar/src/column_values/{ => u64_based}/tests.rs (60%) delete mode 100644 columnar/src/columnar/merge.rs create mode 100644 columnar/src/columnar/merge/merge_dict_column.rs create mode 100644 columnar/src/columnar/merge/merge_mapping.rs create mode 100644 columnar/src/columnar/merge/mod.rs rename {src/indexer => columnar/src/columnar/merge}/sorted_doc_id_column.rs (100%) create mode 100644 columnar/src/columnar/merge/term_merger.rs create mode 100644 columnar/src/columnar/merge/tests.rs create mode 100644 columnar/src/columnar/merge_index.rs create mode 100644 columnar/src/iterable.rs rename {examples => examples-disabled}/aggregation.rs (95%) rename {examples => examples-disabled}/basic_search.rs (100%) rename {examples => examples-disabled}/custom_collector.rs (100%) rename {examples => examples-disabled}/custom_tokenizer.rs (100%) rename {examples => examples-disabled}/date_time_field.rs (94%) rename {examples => 
examples-disabled}/deleting_updating_documents.rs (100%) rename {examples => examples-disabled}/faceted_search.rs (100%) rename {examples => examples-disabled}/faceted_search_with_tweaked_score.rs (100%) rename {examples => examples-disabled}/integer_range_search.rs (100%) rename {examples => examples-disabled}/ip_field.rs (100%) rename {examples => examples-disabled}/iterating_docs_and_positions.rs (100%) rename {examples => examples-disabled}/json_field.rs (100%) rename {examples => examples-disabled}/multiple_producer.rs (100%) rename {examples => examples-disabled}/pre_tokenized_text.rs (100%) rename {examples => examples-disabled}/snippet.rs (100%) rename {examples => examples-disabled}/stop_words.rs (100%) rename {examples => examples-disabled}/warmer.rs (100%) rename {examples => examples-disabled}/working_with_json.rs (100%) delete mode 100644 fastfield_codecs/Cargo.toml delete mode 100644 fastfield_codecs/README.md delete mode 100644 fastfield_codecs/benches/bench.rs delete mode 100644 fastfield_codecs/src/bitpacked.rs delete mode 100644 fastfield_codecs/src/blockwise_linear.rs delete mode 100644 fastfield_codecs/src/column.rs delete mode 100644 fastfield_codecs/src/compact_space/blank_range.rs delete mode 100644 fastfield_codecs/src/compact_space/build_compact_space.rs delete mode 100644 fastfield_codecs/src/compact_space/mod.rs delete mode 100644 fastfield_codecs/src/format_version.rs delete mode 100644 fastfield_codecs/src/gcd.rs delete mode 100644 fastfield_codecs/src/lib.rs delete mode 100644 fastfield_codecs/src/line.rs delete mode 100644 fastfield_codecs/src/linear.rs delete mode 100644 fastfield_codecs/src/main.rs delete mode 100644 fastfield_codecs/src/monotonic_mapping.rs delete mode 100644 fastfield_codecs/src/monotonic_mapping_u128.rs delete mode 100644 fastfield_codecs/src/null_index/dense.rs delete mode 100644 fastfield_codecs/src/null_index/mod.rs delete mode 100644 fastfield_codecs/src/null_index/sparse.rs delete mode 100644 
fastfield_codecs/src/null_index_footer.rs delete mode 100644 fastfield_codecs/src/serialize.rs create mode 100644 src/aggregation/bucket/histogram/date_histogram.rs delete mode 100644 src/fastfield/bytes/mod.rs delete mode 100644 src/fastfield/bytes/reader.rs delete mode 100644 src/fastfield/bytes/writer.rs delete mode 100644 src/fastfield/multivalued/index.rs delete mode 100644 src/fastfield/multivalued/mod.rs delete mode 100644 src/fastfield/multivalued/reader.rs delete mode 100644 src/fastfield/multivalued/writer.rs delete mode 100644 src/fastfield/serializer/mod.rs delete mode 100644 src/indexer/sorted_doc_id_multivalue_column.rs delete mode 100644 tests/mod.rs diff --git a/Cargo.toml b/Cargo.toml index d871aeae6..cf9becaf1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,8 +59,8 @@ sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optiona stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" } tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" } tantivy-bitpacker = { version= "0.3", path="./bitpacker" } +columnar = { version= "0.1", path="./columnar", package="tantivy-columnar" } common = { version= "0.5", path = "./common/", package = "tantivy-common" } -fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false } tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" } [target.'cfg(windows)'.dependencies] @@ -107,7 +107,7 @@ unstable = [] # useful for benches. 
quickwit = ["sstable"] [workspace] -members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "tokenizer-api"] +members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"] # Following the "fail" crate best practises, we isolate # tests that define specific behavior in fail check points diff --git a/TODO.txt b/TODO.txt new file mode 100644 index 000000000..64547834d --- /dev/null +++ b/TODO.txt @@ -0,0 +1,18 @@ +Make schema_builder API fluent. +fix doc serialization and prevent compression problems + +u64, etc. should return Result