fix: matches incorrectly uses byte len as char len (#5411)

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
This commit is contained in:
Zhenchi
2025-01-21 10:34:36 +08:00
committed by Yingwen
parent c12fbcda9f
commit 5f67f2b58e
3 changed files with 264 additions and 2 deletions

View File

@@ -725,7 +725,8 @@ struct Tokenizer {
impl Tokenizer {
pub fn tokenize(mut self, pattern: &str) -> Result<Vec<Token>> {
let mut tokens = vec![];
while self.cursor < pattern.len() {
let char_len = pattern.chars().count();
while self.cursor < char_len {
// TODO: collect pattern into Vec<char> if this tokenizer is bottleneck in the future
let c = pattern.chars().nth(self.cursor).unwrap();
match c {
@@ -794,7 +795,8 @@ impl Tokenizer {
let mut phase = String::new();
let mut is_quote_present = false;
while self.cursor < pattern.len() {
let char_len = pattern.chars().count();
while self.cursor < char_len {
let mut c = pattern.chars().nth(self.cursor).unwrap();
match c {
@@ -899,6 +901,26 @@ mod test {
Phase("c".to_string()),
],
),
(
r#"中文 测试"#,
vec![Phase("中文".to_string()), Phase("测试".to_string())],
),
(
r#"中文 AND 测试"#,
vec![Phase("中文".to_string()), And, Phase("测试".to_string())],
),
(
r#"中文 +测试"#,
vec![Phase("中文".to_string()), Must, Phase("测试".to_string())],
),
(
r#"中文 -测试"#,
vec![
Phase("中文".to_string()),
Negative,
Phase("测试".to_string()),
],
),
];
for (query, expected) in cases {
@@ -1030,6 +1052,61 @@ mod test {
],
},
),
(
r#"中文 测试"#,
PatternAst::Binary {
op: BinaryOp::Or,
children: vec![
PatternAst::Literal {
op: UnaryOp::Optional,
pattern: "中文".to_string(),
},
PatternAst::Literal {
op: UnaryOp::Optional,
pattern: "测试".to_string(),
},
],
},
),
(
r#"中文 AND 测试"#,
PatternAst::Binary {
op: BinaryOp::And,
children: vec![
PatternAst::Literal {
op: UnaryOp::Optional,
pattern: "中文".to_string(),
},
PatternAst::Literal {
op: UnaryOp::Optional,
pattern: "测试".to_string(),
},
],
},
),
(
r#"中文 +测试"#,
PatternAst::Literal {
op: UnaryOp::Must,
pattern: "测试".to_string(),
},
),
(
r#"中文 -测试"#,
PatternAst::Binary {
op: BinaryOp::And,
children: vec![
PatternAst::Literal {
op: UnaryOp::Negative,
pattern: "测试".to_string(),
},
PatternAst::Literal {
op: UnaryOp::Optional,
pattern: "中文".to_string(),
},
],
},
),
];
for (query, expected) in cases {

View File

@@ -257,3 +257,149 @@ drop table fox;
Affected Rows: 0
create table fox_zh (
ts timestamp time index,
fox string,
);
Affected Rows: 0
insert into fox_zh values
(1, '快速的棕色狐狸跳过了懒狗'),
(2, '这只狐狸非常聪明,跳过了高高的栅栏'),
(3, '狐狸和狗是好朋友,它们一起玩耍'),
(4, '狐狸跳过了一条小溪,狗在后面追赶'),
(5, '狐狸和狗都喜欢在森林里探险'),
(6, '狐狸跳过了一个大石头,狗却没有跳过去'),
(7, '狐狸和狗在阳光下休息,享受着温暖的时光'),
(8, '狐狸跳过了一个小山坡,狗在后面慢慢地走'),
(9, '狐狸和狗一起找到了一颗闪闪发光的宝石'),
(10, '狐狸跳过了一个小水坑,狗在旁边看着');
Affected Rows: 10
select fox from fox_zh where matches(fox, '狐狸 AND 跳过') order by ts;
+----------------------------------------+
| fox |
+----------------------------------------+
| 快速的棕色狐狸跳过了懒狗 |
| 这只狐狸非常聪明,跳过了高高的栅栏 |
| 狐狸跳过了一条小溪,狗在后面追赶 |
| 狐狸跳过了一个大石头,狗却没有跳过去 |
| 狐狸跳过了一个小山坡,狗在后面慢慢地走 |
| 狐狸跳过了一个小水坑,狗在旁边看着 |
+----------------------------------------+
select fox from fox_zh where matches(fox, '狐狸 OR 狗') order by ts;
+----------------------------------------+
| fox |
+----------------------------------------+
| 快速的棕色狐狸跳过了懒狗 |
| 这只狐狸非常聪明,跳过了高高的栅栏 |
| 狐狸和狗是好朋友,它们一起玩耍 |
| 狐狸跳过了一条小溪,狗在后面追赶 |
| 狐狸和狗都喜欢在森林里探险 |
| 狐狸跳过了一个大石头,狗却没有跳过去 |
| 狐狸和狗在阳光下休息,享受着温暖的时光 |
| 狐狸跳过了一个小山坡,狗在后面慢慢地走 |
| 狐狸和狗一起找到了一颗闪闪发光的宝石 |
| 狐狸跳过了一个小水坑,狗在旁边看着 |
+----------------------------------------+
select fox from fox_zh where matches(fox, '狐狸 AND 狗') order by ts;
+----------------------------------------+
| fox |
+----------------------------------------+
| 快速的棕色狐狸跳过了懒狗 |
| 狐狸和狗是好朋友,它们一起玩耍 |
| 狐狸跳过了一条小溪,狗在后面追赶 |
| 狐狸和狗都喜欢在森林里探险 |
| 狐狸跳过了一个大石头,狗却没有跳过去 |
| 狐狸和狗在阳光下休息,享受着温暖的时光 |
| 狐狸跳过了一个小山坡,狗在后面慢慢地走 |
| 狐狸和狗一起找到了一颗闪闪发光的宝石 |
| 狐狸跳过了一个小水坑,狗在旁边看着 |
+----------------------------------------+
select fox from fox_zh where matches(fox, '狐狸 -跳过') order by ts;
+----------------------------------------+
| fox |
+----------------------------------------+
| 狐狸和狗是好朋友,它们一起玩耍 |
| 狐狸和狗都喜欢在森林里探险 |
| 狐狸和狗在阳光下休息,享受着温暖的时光 |
| 狐狸和狗一起找到了一颗闪闪发光的宝石 |
+----------------------------------------+
select fox from fox_zh where matches(fox, '狐狸 AND 跳过 -石头') order by ts;
+----------------------------------------+
| fox |
+----------------------------------------+
| 快速的棕色狐狸跳过了懒狗 |
| 这只狐狸非常聪明,跳过了高高的栅栏 |
| 狐狸跳过了一条小溪,狗在后面追赶 |
| 狐狸跳过了一个小山坡,狗在后面慢慢地走 |
| 狐狸跳过了一个小水坑,狗在旁边看着 |
+----------------------------------------+
select fox from fox_zh where matches(fox, '(狐狸 OR 狗) AND 森林') order by ts;
+----------------------------+
| fox |
+----------------------------+
| 狐狸和狗都喜欢在森林里探险 |
+----------------------------+
select fox from fox_zh where matches(fox, '狐狸 AND (跳过 OR 追赶)') order by ts;
+----------------------------------------+
| fox |
+----------------------------------------+
| 快速的棕色狐狸跳过了懒狗 |
| 这只狐狸非常聪明,跳过了高高的栅栏 |
| 狐狸跳过了一条小溪,狗在后面追赶 |
| 狐狸跳过了一个大石头,狗却没有跳过去 |
| 狐狸跳过了一个小山坡,狗在后面慢慢地走 |
| 狐狸跳过了一个小水坑,狗在旁边看着 |
+----------------------------------------+
select fox from fox_zh where matches(fox, '狐狸 AND -(跳过 OR 追赶)') order by ts;
+----------------------------------------+
| fox |
+----------------------------------------+
| 狐狸和狗是好朋友,它们一起玩耍 |
| 狐狸和狗都喜欢在森林里探险 |
| 狐狸和狗在阳光下休息,享受着温暖的时光 |
| 狐狸和狗一起找到了一颗闪闪发光的宝石 |
+----------------------------------------+
select fox from fox_zh where matches(fox, '狐狸 AND 跳过 AND (小溪 OR 石头)') order by ts;
+--------------------------------------+
| fox |
+--------------------------------------+
| 狐狸跳过了一条小溪,狗在后面追赶 |
| 狐狸跳过了一个大石头,狗却没有跳过去 |
+--------------------------------------+
select fox from fox_zh where matches(fox, '狐狸 AND 跳过 AND -(石头 OR 栅栏)') order by ts;
+----------------------------------------+
| fox |
+----------------------------------------+
| 快速的棕色狐狸跳过了懒狗 |
| 狐狸跳过了一条小溪,狗在后面追赶 |
| 狐狸跳过了一个小山坡,狗在后面慢慢地走 |
| 狐狸跳过了一个小水坑,狗在旁边看着 |
+----------------------------------------+
drop table fox_zh;
Affected Rows: 0

View File

@@ -55,3 +55,42 @@ select fox from fox where matches(fox, 'over -(fox AND jumps)') order by ts;
select fox from fox where matches(fox, 'over AND -(-(fox OR jumps))') order by ts;
drop table fox;
-- Full-text `matches` coverage for patterns written in Chinese, i.e. terms made of multi-byte UTF-8 characters.
-- Glossary for the search terms used below: 狐狸 = fox, 狗 = dog, 跳过 = jump over, 追赶 = chase,
-- 石头 = stone, 栅栏 = fence, 小溪 = stream, 森林 = forest.
-- NOTE(review): the paired .result file echoes every statement of this script; presumably it has to be
-- regenerated whenever this file changes (comments included) - confirm with the test runner.
create table fox_zh (
ts timestamp time index,
fox string,
);
-- Ten sentences with ts 1 through 10; every sentence contains 狐狸, so the queries below are
-- effectively distinguished by their other terms.
insert into fox_zh values
(1, '快速的棕色狐狸跳过了懒狗'),
(2, '这只狐狸非常聪明,跳过了高高的栅栏'),
(3, '狐狸和狗是好朋友,它们一起玩耍'),
(4, '狐狸跳过了一条小溪,狗在后面追赶'),
(5, '狐狸和狗都喜欢在森林里探险'),
(6, '狐狸跳过了一个大石头,狗却没有跳过去'),
(7, '狐狸和狗在阳光下休息,享受着温暖的时光'),
(8, '狐狸跳过了一个小山坡,狗在后面慢慢地走'),
(9, '狐狸和狗一起找到了一颗闪闪发光的宝石'),
(10, '狐狸跳过了一个小水坑,狗在旁边看着');
-- Explicit AND: both terms are required.
select fox from fox_zh where matches(fox, '狐狸 AND 跳过') order by ts;
-- Explicit OR: either term is enough.
select fox from fox_zh where matches(fox, '狐狸 OR 狗') order by ts;
-- Explicit AND with a single-character second term (狗).
select fox from fox_zh where matches(fox, '狐狸 AND 狗') order by ts;
-- Leading '-' negates the term that follows it: rows with 跳过 are excluded.
select fox from fox_zh where matches(fox, '狐狸 -跳过') order by ts;
-- AND combined with a trailing negated term.
select fox from fox_zh where matches(fox, '狐狸 AND 跳过 -石头') order by ts;
-- Parenthesised OR group on the left-hand side of AND.
select fox from fox_zh where matches(fox, '(狐狸 OR 狗) AND 森林') order by ts;
-- Parenthesised OR group on the right-hand side of AND.
select fox from fox_zh where matches(fox, '狐狸 AND (跳过 OR 追赶)') order by ts;
-- Negation applied to a whole parenthesised group.
select fox from fox_zh where matches(fox, '狐狸 AND -(跳过 OR 追赶)') order by ts;
-- Chain of two ANDs ending in an OR group.
select fox from fox_zh where matches(fox, '狐狸 AND 跳过 AND (小溪 OR 石头)') order by ts;
-- Chain of two ANDs ending in a negated OR group.
select fox from fox_zh where matches(fox, '狐狸 AND 跳过 AND -(石头 OR 栅栏)') order by ts;
-- Clean up so later cases start without this table.
drop table fox_zh;