feat(index): support SQL to specify inverted index columns (#4929)

* feat(index): support building inverted index for the field column

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* feat(index): support SQL to specify inverted index columns

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: fix sqlness

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: consider compatibility

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* polish

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* compatibility

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: ignore case

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* refactor: reduce dup

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: clippy

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
This commit is contained in:
Zhenchi
2024-11-11 16:06:23 +08:00
committed by GitHub
parent 0e0c4faf0d
commit 6248a6ccf5
14 changed files with 442 additions and 212 deletions

View File

@@ -28,4 +28,3 @@ pub use parsers::create_parser::{
COLUMN_FULLTEXT_OPT_KEY_ANALYZER, COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE, ENGINE, MAXVALUE,
};
pub use parsers::tql_parser::TQL;
pub use statements::create::TIME_INDEX;

View File

@@ -20,7 +20,7 @@ use datatypes::arrow::datatypes::{DataType as ArrowDataType, IntervalUnit};
use datatypes::data_type::ConcreteDataType;
use itertools::Itertools;
use snafu::{ensure, OptionExt, ResultExt};
use sqlparser::ast::{ColumnOption, ColumnOptionDef, DataType, Expr, KeyOrIndexDisplay};
use sqlparser::ast::{ColumnOption, ColumnOptionDef, DataType, Expr};
use sqlparser::dialect::keywords::Keyword;
use sqlparser::keywords::ALL_KEYWORDS;
use sqlparser::parser::IsOptional::Mandatory;
@@ -29,7 +29,7 @@ use sqlparser::tokenizer::{Token, TokenWithLocation, Word};
use table::requests::validate_table_option;
use super::utils;
use crate::ast::{ColumnDef, Ident, TableConstraint};
use crate::ast::{ColumnDef, Ident};
use crate::error::{
self, InvalidColumnOptionSnafu, InvalidDatabaseOptionSnafu, InvalidIntervalSnafu,
InvalidSqlSnafu, InvalidTableOptionSnafu, InvalidTimeIndexSnafu, MissingTimeIndexSnafu, Result,
@@ -38,7 +38,7 @@ use crate::error::{
use crate::parser::{ParserContext, FLOW};
use crate::statements::create::{
Column, ColumnExtensions, CreateDatabase, CreateExternalTable, CreateFlow, CreateTable,
CreateTableLike, CreateView, Partitions, TIME_INDEX,
CreateTableLike, CreateView, Partitions, TableConstraint,
};
use crate::statements::statement::Statement;
use crate::statements::{
@@ -51,6 +51,7 @@ pub const MAXVALUE: &str = "MAXVALUE";
pub const SINK: &str = "SINK";
pub const EXPIRE: &str = "EXPIRE";
pub const AFTER: &str = "AFTER";
pub const INVERTED: &str = "INVERTED";
const DB_OPT_KEY_TTL: &str = "ttl";
@@ -500,20 +501,11 @@ impl<'a> ParserContext<'a> {
);
time_index_opt_idx = Some(index);
let constraint = TableConstraint::Unique {
name: Some(Ident {
value: TIME_INDEX.to_owned(),
quote_style: None,
}),
columns: vec![Ident {
let constraint = TableConstraint::TimeIndex {
column: Ident {
value: column.name().value.clone(),
quote_style: None,
}],
characteristics: None,
index_name: None,
index_type_display: KeyOrIndexDisplay::None,
index_type: None,
index_options: vec![],
},
};
constraints.push(constraint);
}
@@ -730,12 +722,6 @@ impl<'a> ParserContext<'a> {
}
fn parse_optional_table_constraint(&mut self) -> Result<Option<TableConstraint>> {
let name = if self.parser.parse_keyword(Keyword::CONSTRAINT) {
let raw_name = self.parse_identifier().context(SyntaxSnafu)?;
Some(Self::canonicalize_identifier(raw_name))
} else {
None
};
match self.parser.next_token() {
TokenWithLocation {
token: Token::Word(w),
@@ -755,14 +741,7 @@ impl<'a> ParserContext<'a> {
.into_iter()
.map(Self::canonicalize_identifier)
.collect();
Ok(Some(TableConstraint::PrimaryKey {
name,
index_name: None,
index_type: None,
columns,
index_options: vec![],
characteristics: None,
}))
Ok(Some(TableConstraint::PrimaryKey { columns }))
}
TokenWithLocation {
token: Token::Word(w),
@@ -779,7 +758,7 @@ impl<'a> ParserContext<'a> {
.parser
.parse_parenthesized_column_list(Mandatory, false)
.context(error::SyntaxSnafu)?;
let columns = raw_columns
let mut columns = raw_columns
.into_iter()
.map(Self::canonicalize_identifier)
.collect::<Vec<_>>();
@@ -791,28 +770,35 @@ impl<'a> ParserContext<'a> {
}
);
// TODO(dennis): TableConstraint doesn't support dialect right now,
// so we use unique constraint with special key to represent TIME INDEX.
Ok(Some(TableConstraint::Unique {
name: Some(Ident {
value: TIME_INDEX.to_owned(),
quote_style: None,
}),
columns,
characteristics: None,
index_name: None,
index_type_display: KeyOrIndexDisplay::None,
index_type: None,
index_options: vec![],
Ok(Some(TableConstraint::TimeIndex {
column: columns.pop().unwrap(),
}))
}
unexpected => {
if name.is_some() {
self.expected("PRIMARY, TIME", unexpected)
} else {
self.parser.prev_token();
Ok(None)
}
TokenWithLocation {
token: Token::Word(w),
..
} if w.value == INVERTED => {
self.parser
.expect_keyword(Keyword::INDEX)
.context(error::UnexpectedSnafu {
expected: "INDEX",
actual: self.peek_token_as_string(),
})?;
let raw_columns = self
.parser
// allow empty list to unset inverted index
.parse_parenthesized_column_list(Mandatory, true)
.context(error::SyntaxSnafu)?;
let columns = raw_columns
.into_iter()
.map(Self::canonicalize_identifier)
.collect::<Vec<_>>();
Ok(Some(TableConstraint::InvertedIndex { columns }))
}
_ => {
self.parser.prev_token();
Ok(None)
}
}
}
@@ -842,21 +828,9 @@ impl<'a> ParserContext<'a> {
fn validate_time_index(columns: &[Column], constraints: &[TableConstraint]) -> Result<()> {
let time_index_constraints: Vec<_> = constraints
.iter()
.filter_map(|c| {
if let TableConstraint::Unique {
name: Some(ident),
columns,
..
} = c
{
if ident.value == TIME_INDEX {
Some(columns)
} else {
None
}
} else {
None
}
.filter_map(|c| match c {
TableConstraint::TimeIndex { column } => Some(column),
_ => None,
})
.unique()
.collect();
@@ -871,16 +845,10 @@ fn validate_time_index(columns: &[Column], constraints: &[TableConstraint]) -> R
),
}
);
ensure!(
time_index_constraints[0].len() == 1,
InvalidTimeIndexSnafu {
msg: "it should contain only one column in time index",
}
);
// It's safe to use time_index_constraints[0][0],
// we already check the bound above.
let time_index_column_ident = &time_index_constraints[0][0];
let time_index_column_ident = &time_index_constraints[0];
let time_index_column = columns
.iter()
.find(|c| c.name().value == *time_index_column_ident.value)
@@ -1120,7 +1088,8 @@ mod tests {
cpu float32 default 0,
memory float64,
TIME INDEX (ts),
PRIMARY KEY(ts, host)
PRIMARY KEY(ts, host),
INVERTED INDEX(host)
) with(location='/var/data/city.csv',format='csv');";
let options = HashMap::from([
@@ -1144,11 +1113,24 @@ mod tests {
assert_column_def(&columns[3].column_def, "memory", "FLOAT64");
let constraints = &c.constraints;
assert!(matches!(&constraints[0], TableConstraint::Unique {
name: Some(name),
..
} if name.value == TIME_INDEX));
assert_matches!(&constraints[1], TableConstraint::PrimaryKey { .. });
assert_eq!(
&constraints[0],
&TableConstraint::TimeIndex {
column: Ident::new("ts"),
}
);
assert_eq!(
&constraints[1],
&TableConstraint::PrimaryKey {
columns: vec![Ident::new("ts"), Ident::new("host")]
}
);
assert_eq!(
&constraints[2],
&TableConstraint::InvertedIndex {
columns: vec![Ident::new("host")]
}
);
}
_ => unreachable!(),
}
@@ -1478,10 +1460,8 @@ ENGINE=mito";
assert_eq!(c.constraints.len(), 2);
let tc = c.constraints[0].clone();
match tc {
TableConstraint::Unique { name, columns, .. } => {
assert_eq!(name.unwrap().to_string(), "__time_index");
assert_eq!(columns.len(), 1);
assert_eq!(&columns[0].value, "ts");
TableConstraint::TimeIndex { column } => {
assert_eq!(&column.value, "ts");
}
_ => panic!("should be time index constraint"),
};
@@ -1679,10 +1659,8 @@ ENGINE=mito";
if let Statement::CreateTable(c) = &result[0] {
let tc = c.constraints[0].clone();
match tc {
TableConstraint::Unique { name, columns, .. } => {
assert_eq!(name.unwrap().to_string(), "__time_index");
assert_eq!(columns.len(), 1);
assert_eq!(&columns[0].value, "ts");
TableConstraint::TimeIndex { column } => {
assert_eq!(&column.value, "ts");
}
_ => panic!("should be time index constraint"),
}
@@ -1769,7 +1747,9 @@ ENGINE=mito";
cpu float32 default 0,
memory float64,
TIME INDEX (ts),
PRIMARY KEY(ts, host)) engine=mito
PRIMARY KEY(ts, host),
INVERTED INDEX(host)
) engine=mito
with(ttl='10s');
";
let result =
@@ -1789,11 +1769,24 @@ ENGINE=mito";
assert_column_def(&columns[3].column_def, "memory", "FLOAT64");
let constraints = &c.constraints;
assert!(matches!(&constraints[0], TableConstraint::Unique {
name: Some(name),
..
} if name.value == TIME_INDEX));
assert_matches!(&constraints[1], TableConstraint::PrimaryKey { .. });
assert_eq!(
&constraints[0],
&TableConstraint::TimeIndex {
column: Ident::new("ts"),
}
);
assert_eq!(
&constraints[1],
&TableConstraint::PrimaryKey {
columns: vec![Ident::new("ts"), Ident::new("host")]
}
);
assert_eq!(
&constraints[2],
&TableConstraint::InvertedIndex {
columns: vec![Ident::new("host")]
}
);
assert_eq!(1, c.options.len());
assert_eq!(
[("ttl", "10s")].into_iter().collect::<HashMap<_, _>>(),
@@ -1851,6 +1844,33 @@ ENGINE=mito";
assert_matches!(result, Err(crate::error::Error::InvalidTimeIndex { .. }));
}
#[test]
fn test_inverted_index_empty_list() {
    // An empty `INVERTED INDEX()` clause must still parse and yield an
    // `InvertedIndex` constraint that carries no columns.
    let sql = r"create table demo(
host string,
ts timestamp time index,
cpu float64 default 0,
memory float64,
TIME INDEX (ts),
INVERTED INDEX()
) engine=mito;
";
    let statements =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap();

    let create_table = match &statements[0] {
        Statement::CreateTable(stmt) => stmt,
        _ => unreachable!("should be create table statement"),
    };

    // Locate the inverted index constraint regardless of its position in the list.
    let inverted_index = create_table
        .constraints
        .iter()
        .find(|constraint| matches!(constraint, TableConstraint::InvertedIndex { .. }))
        .unwrap();
    assert_eq!(
        inverted_index,
        &TableConstraint::InvertedIndex { columns: vec![] }
    );
}
#[test]
fn test_invalid_column_name() {
let sql = "create table foo(user string, i timestamp time index)";

View File

@@ -453,9 +453,13 @@ pub fn has_primary_key_option(column_def: &ColumnDef) -> bool {
/// Create a `ColumnSchema` from `Column`.
pub fn column_to_schema(
column: &Column,
is_time_index: bool,
time_index: &str,
invereted_index_cols: &Option<Vec<String>>,
primary_keys: &[String],
timezone: Option<&Timezone>,
) -> Result<ColumnSchema> {
let is_time_index = column.name().value == time_index;
let is_nullable = column
.options()
.iter()
@@ -474,6 +478,20 @@ pub fn column_to_schema(
column: &column.name().value,
})?;
// To keep compatibility:
// 1. if the inverted index columns are not set, leave the column untouched, meaning the primary key columns will be used
// 2. if the inverted index columns are set and non-empty, mark the selected columns as inverted indexed
// 3. if the inverted index columns are set but empty, explicitly mark the primary key columns as not inverted indexed
if let Some(inverted_index_cols) = invereted_index_cols {
if inverted_index_cols.is_empty() {
if primary_keys.contains(&column.name().value) {
column_schema = column_schema.set_inverted_index(false);
}
} else if inverted_index_cols.contains(&column.name().value) {
column_schema = column_schema.set_inverted_index(true);
}
}
if let Some(ColumnOption::Comment(c)) = column.options().iter().find_map(|o| {
if matches!(o.option, ColumnOption::Comment(_)) {
Some(&o.option)
@@ -1337,7 +1355,7 @@ mod tests {
extensions: ColumnExtensions::default(),
};
let column_schema = column_to_schema(&column_def, false, None).unwrap();
let column_schema = column_to_schema(&column_def, "ts", &None, &[], None).unwrap();
assert_eq!("col", column_schema.name);
assert_eq!(
@@ -1347,7 +1365,7 @@ mod tests {
assert!(column_schema.is_nullable());
assert!(!column_schema.is_time_index());
let column_schema = column_to_schema(&column_def, true, None).unwrap();
let column_schema = column_to_schema(&column_def, "col", &None, &[], None).unwrap();
assert_eq!("col", column_schema.name);
assert_eq!(
@@ -1376,7 +1394,7 @@ mod tests {
extensions: ColumnExtensions::default(),
};
let column_schema = column_to_schema(&column_def, false, None).unwrap();
let column_schema = column_to_schema(&column_def, "ts", &None, &[], None).unwrap();
assert_eq!("col2", column_schema.name);
assert_eq!(ConcreteDataType::string_datatype(), column_schema.data_type);
@@ -1410,7 +1428,9 @@ mod tests {
let column_schema = column_to_schema(
&column,
false,
"ts",
&None,
&[],
Some(&Timezone::from_tz_string("Asia/Shanghai").unwrap()),
)
.unwrap();
@@ -1429,7 +1449,7 @@ mod tests {
);
// without timezone
let column_schema = column_to_schema(&column, false, None).unwrap();
let column_schema = column_to_schema(&column, "ts", &None, &[], None).unwrap();
assert_eq!("col", column_schema.name);
assert_eq!(
@@ -1471,7 +1491,7 @@ mod tests {
},
};
let column_schema = column_to_schema(&column, false, None).unwrap();
let column_schema = column_to_schema(&column, "ts", &None, &[], None).unwrap();
assert_eq!("col", column_schema.name);
assert_eq!(ConcreteDataType::string_datatype(), column_schema.data_type);
let fulltext_options = column_schema.fulltext_options().unwrap().unwrap();

View File

@@ -20,7 +20,7 @@ use itertools::Itertools;
use sqlparser::ast::{ColumnOptionDef, DataType, Expr, Query};
use sqlparser_derive::{Visit, VisitMut};
use crate::ast::{ColumnDef, Ident, ObjectName, TableConstraint, Value as SqlValue};
use crate::ast::{ColumnDef, Ident, ObjectName, Value as SqlValue};
use crate::error::{FulltextInvalidOptionSnafu, Result};
use crate::statements::statement::Statement;
use crate::statements::OptionMap;
@@ -52,31 +52,34 @@ macro_rules! format_list_comma {
}
fn format_table_constraint(constraints: &[TableConstraint]) -> String {
constraints
.iter()
.map(|c| {
if is_time_index(c) {
let TableConstraint::Unique { columns, .. } = c else {
unreachable!()
};
format_indent!("{}TIME INDEX ({})", format_list_comma!(columns))
} else {
format_indent!(c)
}
})
.join(LINE_SEP)
constraints.iter().map(|c| format_indent!(c)).join(LINE_SEP)
}
/// Time index name, used in table constraints.
pub const TIME_INDEX: &str = "__time_index";
/// Table-level constraint that can appear in a `CREATE TABLE` statement.
///
/// The SQL text of each variant is produced by the `Display` implementation
/// of this type.
#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)]
pub enum TableConstraint {
/// Primary key constraint: `PRIMARY KEY (...)` over the listed columns.
PrimaryKey { columns: Vec<Ident> },
/// Time index constraint: `TIME INDEX (...)` on exactly one column.
TimeIndex { column: Ident },
/// Inverted index constraint: `INVERTED INDEX (...)` over the listed columns.
/// The list may be empty (`INVERTED INDEX()`).
InvertedIndex { columns: Vec<Ident> },
}
#[inline]
pub fn is_time_index(constraint: &TableConstraint) -> bool {
matches!(constraint, TableConstraint::Unique {
name: Some(name),
..
} if name.value == TIME_INDEX)
impl Display for TableConstraint {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
TableConstraint::PrimaryKey { columns } => {
write!(f, "PRIMARY KEY ({})", format_list_comma!(columns))
}
TableConstraint::TimeIndex { column } => {
write!(f, "TIME INDEX ({})", column)
}
TableConstraint::InvertedIndex { columns } => {
write!(f, "INVERTED INDEX ({})", format_list_comma!(columns))
}
}
}
}
#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)]