From a141c3ec5911edfbc09aadfad7c9aba0083ecf58 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 13 Jun 2024 16:04:52 +0900 Subject: [PATCH] add columnar format compatibiliy tests (#2433) * add columnar format compatibiliy tests * always try to write current format --- columnar/compat_tests_data/v1.columnar | Bin 0 -> 32092 bytes columnar/src/columnar/format_version.rs | 13 ++++ columnar/src/columnar/mod.rs | 1 + columnar/src/compat_tests.rs | 85 ++++++++++++++++++++++++ columnar/src/lib.rs | 5 +- 5 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 columnar/compat_tests_data/v1.columnar create mode 100644 columnar/src/compat_tests.rs diff --git a/columnar/compat_tests_data/v1.columnar b/columnar/compat_tests_data/v1.columnar new file mode 100644 index 0000000000000000000000000000000000000000..512b4394e00228b0ace57f59879ef59d02c0d70a GIT binary patch literal 32092 zcmeI)aa7)A`N#2l6BQRtR8(9v9~BiREu5&hXx>g#RGK)MqSCybI8kxpL`B8zp2MU< zB_pFkyE8H>R5CIu$~z;Y#3Cc3LcKLIGAc4MGRoMxulMQqZ_i)9bH3j{p3iH~4xh7g zo$Gq|XXjb8ef!Q0@li^YLK+!lkwYE@q!!VKG&0B{hdc^MEv64?WROJ;c@&V^gFd8@ zK^8gWQ9vq#KBSRB7CGcmKx$9=kVXbs%2A)Z6Go8X07fLmmaBvgkt^8Dx<| z9tEV{P9M_9Ad4LGC?K_zKBSRB7CGcmKx$w5kVXbs(`uIpk45>OlICMh02rkVgTj zgEqt`g)}nAB8NN*NF7Wc(#Rl-9P%h2l}jJe$RLXx@+cs62z^K+gDi5$qkz;h`jAEj zS>%vM0jYP;hcq(CB8NN*NafLoG&0B{hdc^My^}tqkwF$Y%2A)Uos-jSRBLA&&x5h4dkf46?`}j{;Kf zq7P|gkVOu86p&g;AJWJmiyZPOAoXtgkVXbsGKAJWJmiyZPOAXQ8s(#Rl-9P%h2bpm}zBZDk*$fJPNi5udR zLK+!lkwYE@q)ws_X=IQ^4tW%iDxnW)WROJ;c@&U3nLebEK^8gWQ9x=neMlpNEON-B zfYf{FLmC-mkwYE@q)O>S8X07fLmmaB-b)|S$RLXx@+cs+hCZZ`K^8gWQ9$Yx`jAEj zS>%vM0Vz!%(#Rl-9P%h2bt-*GBZDk*$fJPNTKbSi23h2gM**qR=tCMAWRXK21*FR8 zLmC-mkwYE@q)w*~X=IQ^4tW%iI)gr>kwF$Y%vM z0jaa-LmC-mkwYE@q|T-fX=IQ^4tW%iI)^@_kwF$Yqz`FikVOu86p*T>4{2nOMGkoskh+LIq>(`uIpk45>SFqkMh02rkVgTj zOXx!y8Dx<|9tEUo=tCMAWRXK21*G0jAJWJmiyZPOAhnS`q>(`uIpk45>QefUMh02r zkVgS2Lm$$}Ad4LGC?ItieMlpNEON-BfYc`XkVXbs%2AR2_XtBZDk*$fJPN zRrDc^46?`}j{;Ix(}y%N$RdY43P@c;AJWJmiyZPOAXQHv(#Rl-9P%h2buE2JBZDk* z$fJPN7W$Az23h2gM**qp=tCMAWRXK21*972LmC-mkwYE@q^_qAX=IQ^4tW%i+DaeN z$RLXx@+cs61ARy%gDi5$qkvQ+eMlpNEON-BfYgojA&m^O$RUpcQa5dgPYP*dkVOu8 z6p*@^KBSRB7CGcmK&pv8q>(`uIpk45>K6KtMh02rkVgTjTj@g@8Dx<|9tEUsqYr6h zkVOu86p(7B4{2nOMGkoskh+~dq>(`uIpk45Y8!n>BZDk*$fJPNcKVP;23h2gM`62C zZ*5*D9Wgs&cExltyJPmm?2XwMvp?oQ z%)yvG=1|Pxm?JSqV~)ifk2w)D#GH&d6>~b~Ow8Gsb1~;*s)b1`Jnbq&kSWL#WD9Zx zxq>`FN{}xo5EKfE1jT|9L8(9s$^_+t3PGiyN>DAR5g0+OpiWRPXb?0Cngq=|)S(L- zv!q~QHvhFbJGO1#x$r+qcTddjdR-V?vN`FN{}xo5EKfE1jT|9L8(9s$^_+t3PGiyN>DAR5g0+OpiWRPXb?0Cngq=P zwfl^A%}0f$Sfa&??82|Ag}>PS5#y{{ynD5Fz3kZaNwjVIT~+cc@_);p zB|rba4KDsvGQYIOw;`EdGQV^tknB%7)7y~TU-J7etp!T%FRk$<_m|vXS__oiUs~f! z?k~B&v=%72zqH1e++T8kX)RE4e`$>`xxeK8(psS8{?Zy>a(~JFrL{oG{iQX&%kke|~=_TzAU$+wZcAc89tnW=qTu#B7cEp_pwk?}^zS^S+oJ zG4GGr8S|4dyJ9{V)5Ux!W_QeoWA?=SV$9x{kH+ka`FPC!m`}tUi203}gE7Ay)5m-& z=1|P1V-CmsVa$=3KZ-dT^SPK~F`th)9`lzmCt|)BGsJu)=48xQV@}0jhPU3k21Iiv=};jRGUsB&Zc^7Ssu@7Ss#22pR-i1&xB6 z1Wkfl1Oa6fflS4lnKrdlnc%lR0!4yDh2BWRe}oy)q;x!HG+)- zBiJOU6>JvN39c5@3$_Rv1X~4-f|~?Qf?Ea63s|%5_Pf-!or|8#RBFG4uQOV-`)iJB z)vzjD)V^qTQT5`%#i>17_L$nEBBL)OXV0cRC-&5Pb?=qAcm3X@dlzSRX3l4teTMca zSkks+W=YlC2HuvJWwR!;%HQ7m_UxsNOUIX%?(6o=$gayC$u8QjW52omYWDa0=O55| z!1Mu?IsG}g2R0uV4lFyU=b)^E8x9^jxFokLR~=G&$nYVB%i5RCE~|dW;5$-zEqPOU z74PhOXU_7bd|zr^%etv`73cPyn^W0TIZ>(4>pm}Yef|2;^~L9RowB-y-rBfzd~4|q z?uLxUy2g>lq8mGIoV&5+CVx}@&8;_2-(1<$-;{ex^DW_)vRiv@&AP4Owz1nvn!B3S z?X|ZL-(I+_ecSA|>g|KuQ?|uU*@_)~J96%5x?|!Fy|a5~W=nm`XiM>(op;XPX+ALY zfr7i*?wYx)s&$|>?}PS(lOHVqQ16GbKiv4?@eh}_xweeE>+T-8yXc;dd*<$`x!2#D z-`?6j-Cp^T{*UC|*L+{Nuk52eAI<7$=osrL`B>M-)cv*h58q$-@%E3;e!RMKuru|E zmQPH5qT-W%pUnAG)2AjrrMtSjG9RdaVDy3F2Rk2}f6#n-=+gzR&CR%~&kTGf?;-op zcj@Qc=Q19ydwAsGqR)4He(v)%J-#RZ3$0(6{zBy!`@fj`rRFb% zFO~K7^kzNM@W|LBC69JJsvfI-Z1}OlzV^P^zUnUzemV7c%i~jzSA3=KD>?m5{S*EA ztKDDCe4_q|(I<+(*7>#hubF|Nfr78MeSPNZRo@u+M&38=Hz&VYKG-{${jJ7tjeo23 z+wR*LPu4v-@???k@N>TAJN`TQPqjWZ{Z!?5`@fqz)I1c1%D&h0y{xAjo*sL;&8dMi+<7Zi@9Ia z{L=q2|5vTQn*LSgME^wY3(YTt7s_7jc`@syhL^@(DhXXdydb1+wR|HPSsD1P8I*I^LO*VGp`N3R`C0_-_QKMYI~>xGPriT>viFq^jg%u3%O!Xr3!bwv8VcO)ti~GePq#3 zmNe8}vE_y(); @@ -20,12 +23,22 @@ pub fn parse_footer(footer_bytes: [u8; VERSION_FOOTER_NUM_BYTES]) -> Result fmt::Result { + match self { + Version::V1 => write!(f, "v1"), + } + } +} + impl Version { fn to_bytes(self) -> [u8; 4] { (self as u32).to_le_bytes() diff --git a/columnar/src/columnar/mod.rs b/columnar/src/columnar/mod.rs index 12a7084e7..bb9e485be 100644 --- a/columnar/src/columnar/mod.rs +++ b/columnar/src/columnar/mod.rs @@ -5,6 +5,7 @@ mod reader; mod writer; pub use column_type::{ColumnType, HasAssociatedColumnType}; +pub use format_version::{Version, CURRENT_VERSION}; #[cfg(test)] pub(crate) use merge::ColumnTypeCategory; pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder}; diff --git a/columnar/src/compat_tests.rs b/columnar/src/compat_tests.rs new file mode 100644 index 000000000..52950fe15 --- /dev/null +++ b/columnar/src/compat_tests.rs @@ -0,0 +1,85 @@ +use std::path::PathBuf; + +use crate::{Column, ColumnarReader, DynamicColumn, CURRENT_VERSION}; + +const NUM_DOCS: u32 = u16::MAX as u32; + +fn generate_columnar(num_docs: u32) -> Vec { + use crate::ColumnarWriter; + + let mut columnar_writer = ColumnarWriter::default(); + + for i in 0..num_docs { + if i % 100 == 0 { + columnar_writer.record_numerical(i, "sparse", i as u64); + } + if i % 2 == 0 { + columnar_writer.record_numerical(i, "dense", i as u64); + } + columnar_writer.record_numerical(i, "full", i as u64); + columnar_writer.record_numerical(i, "multi", i as u64); + columnar_writer.record_numerical(i, "multi", i as u64); + } + + let mut wrt: Vec = Vec::new(); + columnar_writer.serialize(num_docs, None, &mut wrt).unwrap(); + + wrt +} + +#[test] +/// Writes a columnar for the CURRENT_VERSION to disk. +fn create_format() { + let version = CURRENT_VERSION.to_string(); + let file_path = path_for_version(&version); + if PathBuf::from(file_path.clone()).exists() { + return; + } + let columnar = generate_columnar(NUM_DOCS); + std::fs::write(file_path, columnar).unwrap(); +} + +fn path_for_version(version: &str) -> String { + format!("./compat_tests_data/{}.columnar", version) +} + +#[test] +fn test_format_v1() { + let path = path_for_version("v1"); + test_format(&path); +} + +fn test_format(path: &str) { + let file_content = std::fs::read(path).unwrap(); + let reader = ColumnarReader::open(file_content).unwrap(); + + let column = open_column(&reader, "full"); + assert_eq!(column.first(0).unwrap(), 0); + assert_eq!(column.first(NUM_DOCS - 1).unwrap(), NUM_DOCS as u64 - 1); + + let column = open_column(&reader, "multi"); + assert_eq!(column.first(0).unwrap(), 0); + assert_eq!(column.first(NUM_DOCS - 1).unwrap(), NUM_DOCS as u64 - 1); + + let column = open_column(&reader, "sparse"); + assert_eq!(column.first(0).unwrap(), 0); + assert_eq!(column.first(NUM_DOCS - 1), None); + assert_eq!(column.first(65000), Some(65000)); + + let column = open_column(&reader, "dense"); + assert_eq!(column.first(0).unwrap(), 0); + assert_eq!(column.first(NUM_DOCS - 1).unwrap(), NUM_DOCS as u64 - 1); + assert_eq!(column.first(NUM_DOCS - 2), None); +} + +fn open_column(reader: &ColumnarReader, name: &str) -> Column { + let column = reader.read_columns(name).unwrap()[0] + .open() + .unwrap() + .coerce_numerical(crate::NumericalType::U64) + .unwrap(); + let DynamicColumn::U64(column) = column else { + panic!(); + }; + column +} diff --git a/columnar/src/lib.rs b/columnar/src/lib.rs index 7236ea5bc..2b7a60b3a 100644 --- a/columnar/src/lib.rs +++ b/columnar/src/lib.rs @@ -48,7 +48,7 @@ pub use column_values::{ }; pub use columnar::{ merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType, - MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, + MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION, }; use sstable::VoidSSTable; pub use value::{NumericalType, NumericalValue}; @@ -131,3 +131,6 @@ impl Cardinality { #[cfg(test)] mod tests; + +#[cfg(test)] +mod compat_tests;