diff --git a/src/common/function/src/scalars/matches.rs b/src/common/function/src/scalars/matches.rs index 1bd9e8e1b5..1b515046ab 100644 --- a/src/common/function/src/scalars/matches.rs +++ b/src/common/function/src/scalars/matches.rs @@ -725,7 +725,8 @@ struct Tokenizer { impl Tokenizer { pub fn tokenize(mut self, pattern: &str) -> Result> { let mut tokens = vec![]; - while self.cursor < pattern.len() { + let char_len = pattern.chars().count(); + while self.cursor < char_len { // TODO: collect pattern into Vec if this tokenizer is bottleneck in the future let c = pattern.chars().nth(self.cursor).unwrap(); match c { @@ -794,7 +795,8 @@ impl Tokenizer { let mut phase = String::new(); let mut is_quote_present = false; - while self.cursor < pattern.len() { + let char_len = pattern.chars().count(); + while self.cursor < char_len { let mut c = pattern.chars().nth(self.cursor).unwrap(); match c { @@ -899,6 +901,26 @@ mod test { Phase("c".to_string()), ], ), + ( + r#"中文 测试"#, + vec![Phase("中文".to_string()), Phase("测试".to_string())], + ), + ( + r#"中文 AND 测试"#, + vec![Phase("中文".to_string()), And, Phase("测试".to_string())], + ), + ( + r#"中文 +测试"#, + vec![Phase("中文".to_string()), Must, Phase("测试".to_string())], + ), + ( + r#"中文 -测试"#, + vec![ + Phase("中文".to_string()), + Negative, + Phase("测试".to_string()), + ], + ), ]; for (query, expected) in cases { @@ -1030,6 +1052,61 @@ mod test { ], }, ), + ( + r#"中文 测试"#, + PatternAst::Binary { + op: BinaryOp::Or, + children: vec![ + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "中文".to_string(), + }, + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "测试".to_string(), + }, + ], + }, + ), + ( + r#"中文 AND 测试"#, + PatternAst::Binary { + op: BinaryOp::And, + children: vec![ + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "中文".to_string(), + }, + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "测试".to_string(), + }, + ], + }, + ), + ( + r#"中文 +测试"#, + PatternAst::Literal { + op: UnaryOp::Must, + pattern: "测试".to_string(), + }, + ), + ( + r#"中文 -测试"#, + PatternAst::Binary { + op: BinaryOp::And, + children: vec![ + PatternAst::Literal { + op: UnaryOp::Negative, + pattern: "测试".to_string(), + }, + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "中文".to_string(), + }, + ], + }, + ), ]; for (query, expected) in cases { diff --git a/tests/cases/standalone/common/select/matches.result b/tests/cases/standalone/common/select/matches.result index a3e4cad9a2..084cec8cd8 100644 --- a/tests/cases/standalone/common/select/matches.result +++ b/tests/cases/standalone/common/select/matches.result @@ -257,3 +257,149 @@ drop table fox; Affected Rows: 0 +create table fox_zh ( + ts timestamp time index, + fox string, +); + +Affected Rows: 0 + +insert into fox_zh values + (1, '快速的棕色狐狸跳过了懒狗'), + (2, '这只狐狸非常聪明,跳过了高高的栅栏'), + (3, '狐狸和狗是好朋友,它们一起玩耍'), + (4, '狐狸跳过了一条小溪,狗在后面追赶'), + (5, '狐狸和狗都喜欢在森林里探险'), + (6, '狐狸跳过了一个大石头,狗却没有跳过去'), + (7, '狐狸和狗在阳光下休息,享受着温暖的时光'), + (8, '狐狸跳过了一个小山坡,狗在后面慢慢地走'), + (9, '狐狸和狗一起找到了一颗闪闪发光的宝石'), + (10, '狐狸跳过了一个小水坑,狗在旁边看着'); + +Affected Rows: 10 + +select fox from fox_zh where matches(fox, '狐狸 AND 跳过') order by ts; + ++----------------------------------------+ +| fox | ++----------------------------------------+ +| 快速的棕色狐狸跳过了懒狗 | +| 这只狐狸非常聪明,跳过了高高的栅栏 | +| 狐狸跳过了一条小溪,狗在后面追赶 | +| 狐狸跳过了一个大石头,狗却没有跳过去 | +| 狐狸跳过了一个小山坡,狗在后面慢慢地走 | +| 狐狸跳过了一个小水坑,狗在旁边看着 | ++----------------------------------------+ + +select fox from fox_zh where matches(fox, '狐狸 OR 狗') order by ts; + ++----------------------------------------+ +| fox | ++----------------------------------------+ +| 快速的棕色狐狸跳过了懒狗 | +| 这只狐狸非常聪明,跳过了高高的栅栏 | +| 狐狸和狗是好朋友,它们一起玩耍 | +| 狐狸跳过了一条小溪,狗在后面追赶 | +| 狐狸和狗都喜欢在森林里探险 | +| 狐狸跳过了一个大石头,狗却没有跳过去 | +| 狐狸和狗在阳光下休息,享受着温暖的时光 | +| 狐狸跳过了一个小山坡,狗在后面慢慢地走 | +| 狐狸和狗一起找到了一颗闪闪发光的宝石 | +| 狐狸跳过了一个小水坑,狗在旁边看着 | ++----------------------------------------+ + +select fox from fox_zh where matches(fox, '狐狸 AND 狗') order by ts; + ++----------------------------------------+ +| fox | ++----------------------------------------+ +| 快速的棕色狐狸跳过了懒狗 | +| 狐狸和狗是好朋友,它们一起玩耍 | +| 狐狸跳过了一条小溪,狗在后面追赶 | +| 狐狸和狗都喜欢在森林里探险 | +| 狐狸跳过了一个大石头,狗却没有跳过去 | +| 狐狸和狗在阳光下休息,享受着温暖的时光 | +| 狐狸跳过了一个小山坡,狗在后面慢慢地走 | +| 狐狸和狗一起找到了一颗闪闪发光的宝石 | +| 狐狸跳过了一个小水坑,狗在旁边看着 | ++----------------------------------------+ + +select fox from fox_zh where matches(fox, '狐狸 -跳过') order by ts; + ++----------------------------------------+ +| fox | ++----------------------------------------+ +| 狐狸和狗是好朋友,它们一起玩耍 | +| 狐狸和狗都喜欢在森林里探险 | +| 狐狸和狗在阳光下休息,享受着温暖的时光 | +| 狐狸和狗一起找到了一颗闪闪发光的宝石 | ++----------------------------------------+ + +select fox from fox_zh where matches(fox, '狐狸 AND 跳过 -石头') order by ts; + ++----------------------------------------+ +| fox | ++----------------------------------------+ +| 快速的棕色狐狸跳过了懒狗 | +| 这只狐狸非常聪明,跳过了高高的栅栏 | +| 狐狸跳过了一条小溪,狗在后面追赶 | +| 狐狸跳过了一个小山坡,狗在后面慢慢地走 | +| 狐狸跳过了一个小水坑,狗在旁边看着 | ++----------------------------------------+ + +select fox from fox_zh where matches(fox, '(狐狸 OR 狗) AND 森林') order by ts; + ++----------------------------+ +| fox | ++----------------------------+ +| 狐狸和狗都喜欢在森林里探险 | ++----------------------------+ + +select fox from fox_zh where matches(fox, '狐狸 AND (跳过 OR 追赶)') order by ts; + ++----------------------------------------+ +| fox | ++----------------------------------------+ +| 快速的棕色狐狸跳过了懒狗 | +| 这只狐狸非常聪明,跳过了高高的栅栏 | +| 狐狸跳过了一条小溪,狗在后面追赶 | +| 狐狸跳过了一个大石头,狗却没有跳过去 | +| 狐狸跳过了一个小山坡,狗在后面慢慢地走 | +| 狐狸跳过了一个小水坑,狗在旁边看着 | ++----------------------------------------+ + +select fox from fox_zh where matches(fox, '狐狸 AND -(跳过 OR 追赶)') order by ts; + ++----------------------------------------+ +| fox | ++----------------------------------------+ +| 狐狸和狗是好朋友,它们一起玩耍 | +| 狐狸和狗都喜欢在森林里探险 | +| 狐狸和狗在阳光下休息,享受着温暖的时光 | +| 狐狸和狗一起找到了一颗闪闪发光的宝石 | ++----------------------------------------+ + +select fox from fox_zh where matches(fox, '狐狸 AND 跳过 AND (小溪 OR 石头)') order by ts; + ++--------------------------------------+ +| fox | ++--------------------------------------+ +| 狐狸跳过了一条小溪,狗在后面追赶 | +| 狐狸跳过了一个大石头,狗却没有跳过去 | ++--------------------------------------+ + +select fox from fox_zh where matches(fox, '狐狸 AND 跳过 AND -(石头 OR 栅栏)') order by ts; + ++----------------------------------------+ +| fox | ++----------------------------------------+ +| 快速的棕色狐狸跳过了懒狗 | +| 狐狸跳过了一条小溪,狗在后面追赶 | +| 狐狸跳过了一个小山坡,狗在后面慢慢地走 | +| 狐狸跳过了一个小水坑,狗在旁边看着 | ++----------------------------------------+ + +drop table fox_zh; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/select/matches.sql b/tests/cases/standalone/common/select/matches.sql index 41357a8ff8..d7f9e3e8ab 100644 --- a/tests/cases/standalone/common/select/matches.sql +++ b/tests/cases/standalone/common/select/matches.sql @@ -55,3 +55,42 @@ select fox from fox where matches(fox, 'over -(fox AND jumps)') order by ts; select fox from fox where matches(fox, 'over AND -(-(fox OR jumps))') order by ts; drop table fox; + +create table fox_zh ( + ts timestamp time index, + fox string, +); + +insert into fox_zh values + (1, '快速的棕色狐狸跳过了懒狗'), + (2, '这只狐狸非常聪明,跳过了高高的栅栏'), + (3, '狐狸和狗是好朋友,它们一起玩耍'), + (4, '狐狸跳过了一条小溪,狗在后面追赶'), + (5, '狐狸和狗都喜欢在森林里探险'), + (6, '狐狸跳过了一个大石头,狗却没有跳过去'), + (7, '狐狸和狗在阳光下休息,享受着温暖的时光'), + (8, '狐狸跳过了一个小山坡,狗在后面慢慢地走'), + (9, '狐狸和狗一起找到了一颗闪闪发光的宝石'), + (10, '狐狸跳过了一个小水坑,狗在旁边看着'); + +select fox from fox_zh where matches(fox, '狐狸 AND 跳过') order by ts; + +select fox from fox_zh where matches(fox, '狐狸 OR 狗') order by ts; + +select fox from fox_zh where matches(fox, '狐狸 AND 狗') order by ts; + +select fox from fox_zh where matches(fox, '狐狸 -跳过') order by ts; + +select fox from fox_zh where matches(fox, '狐狸 AND 跳过 -石头') order by ts; + +select fox from fox_zh where matches(fox, '(狐狸 OR 狗) AND 森林') order by ts; + +select fox from fox_zh where matches(fox, '狐狸 AND (跳过 OR 追赶)') order by ts; + +select fox from fox_zh where matches(fox, '狐狸 AND -(跳过 OR 追赶)') order by ts; + +select fox from fox_zh where matches(fox, '狐狸 AND 跳过 AND (小溪 OR 石头)') order by ts; + +select fox from fox_zh where matches(fox, '狐狸 AND 跳过 AND -(石头 OR 栅栏)') order by ts; + +drop table fox_zh;