mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-13 08:22:55 +00:00
completely rewrite array text parsing based on spec
This commit is contained in:
@@ -70,6 +70,8 @@ pub(crate) enum JsonConversionError {
|
||||
ParseJsonError(#[from] serde_json::Error),
|
||||
#[error("unbalanced array")]
|
||||
UnbalancedArray,
|
||||
#[error("unbalanced quoted string")]
|
||||
UnbalancedString,
|
||||
}
|
||||
|
||||
enum OutputMode {
|
||||
@@ -80,10 +82,7 @@ enum OutputMode {
|
||||
impl OutputMode {
|
||||
fn key(&mut self, key: &str) -> &mut Value {
|
||||
match self {
|
||||
OutputMode::Array(values) => {
|
||||
values.push(Value::Null);
|
||||
values.last_mut().expect("a value was just inserted")
|
||||
}
|
||||
OutputMode::Array(values) => push_entry(values, Value::Null),
|
||||
OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null),
|
||||
}
|
||||
}
|
||||
@@ -96,6 +95,11 @@ impl OutputMode {
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends `t` to `arr` and hands back a mutable reference to the element that
/// was just stored, so the caller can fill it in place.
fn push_entry<T>(arr: &mut Vec<T>, t: T) -> &mut T {
    arr.push(t);
    // the vector is non-empty after the push, so there is always a last element.
    arr.split_last_mut()
        .map(|(newest, _earlier)| newest)
        .expect("a value was just inserted")
}
|
||||
|
||||
//
|
||||
// Convert postgres row with text-encoded values to JSON object
|
||||
//
|
||||
@@ -134,8 +138,11 @@ fn pg_text_to_json(
|
||||
pg_type: &Type,
|
||||
) -> Result<(), JsonConversionError> {
|
||||
if let Kind::Array(elem_type) = pg_type.kind() {
|
||||
// todo: we should fetch this from postgres.
|
||||
let delimiter = ',';
|
||||
|
||||
let mut array = vec![];
|
||||
pg_array_parse(&mut array, val, elem_type)?;
|
||||
pg_array_parse(&mut array, val, elem_type, delimiter)?;
|
||||
*output = Value::Array(array);
|
||||
return Ok(());
|
||||
}
|
||||
@@ -165,121 +172,209 @@ fn pg_text_to_json(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Parse postgres array into JSON array.
|
||||
//
|
||||
// This is a bit involved because we need to handle nested arrays and quoted
|
||||
// values. Unlike postgres we don't check that all nested arrays have the same
|
||||
// dimensions, we just return them as is.
|
||||
//
|
||||
/// Parse postgres array into JSON array.
|
||||
///
|
||||
/// This is a bit involved because we need to handle nested arrays and quoted
|
||||
/// values. Unlike postgres we don't check that all nested arrays have the same
|
||||
/// dimensions, we just return them as is.
|
||||
///
|
||||
/// <https://www.postgresql.org/docs/current/arrays.html#ARRAYS-IO>
|
||||
///
|
||||
/// The external text representation of an array value consists of items that are interpreted
|
||||
/// according to the I/O conversion rules for the array's element type, plus decoration that
|
||||
/// indicates the array structure. The decoration consists of curly braces (`{` and `}`) around
|
||||
/// the array value plus delimiter characters between adjacent items. The delimiter character
|
||||
/// is usually a comma (,) but can be something else: it is determined by the typdelim setting
|
||||
/// for the array's element type. Among the standard data types provided in the PostgreSQL
|
||||
/// distribution, all use a comma, except for type box, which uses a semicolon (;).
|
||||
///
|
||||
/// In a multidimensional array, each dimension (row, plane, cube, etc.)
|
||||
/// gets its own level of curly braces, and delimiters must be written between adjacent
|
||||
/// curly-braced entities of the same level.
|
||||
fn pg_array_parse(
|
||||
entries: &mut Vec<Value>,
|
||||
pg_array: &str,
|
||||
elem_type: &Type,
|
||||
elements: &mut Vec<Value>,
|
||||
mut pg_array: &str,
|
||||
elem: &Type,
|
||||
delim: char,
|
||||
) -> Result<(), JsonConversionError> {
|
||||
pg_array_parse_inner(entries, pg_array, elem_type, false).map(|_| ())
|
||||
}
|
||||
|
||||
fn pg_array_parse_inner(
|
||||
entries: &mut Vec<Value>,
|
||||
pg_array: &str,
|
||||
elem_type: &Type,
|
||||
nested: bool,
|
||||
) -> Result<usize, JsonConversionError> {
|
||||
let mut pg_array_chr = pg_array.char_indices();
|
||||
let mut level = 0;
|
||||
let mut quote = false;
|
||||
let mut entry = String::new();
|
||||
|
||||
// skip bounds decoration
|
||||
// skip bounds decoration, eg:
|
||||
// `[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}`
|
||||
// technically these are significant, but we have no way to represent them in json.
|
||||
if let Some('[') = pg_array.chars().next() {
|
||||
for (_, c) in pg_array_chr.by_ref() {
|
||||
if c == '=' {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let Some((_bounds, array)) = pg_array.split_once('=') else {
|
||||
return Err(JsonConversionError::UnbalancedArray);
|
||||
};
|
||||
pg_array = array;
|
||||
}
|
||||
|
||||
fn push_checked(
|
||||
entry: &mut String,
|
||||
entries: &mut Vec<Value>,
|
||||
elem_type: &Type,
|
||||
) -> Result<(), JsonConversionError> {
|
||||
if !entry.is_empty() {
|
||||
// While in usual postgres response we get nulls as None and everything else
|
||||
// as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
|
||||
// string with value 'NULL' will be represented by '"NULL"'). So catch NULLs
|
||||
// here while we have quotation info and convert them to None.
|
||||
if entry == "NULL" {
|
||||
entries.push(Value::Null);
|
||||
} else {
|
||||
let mut val = Value::Null;
|
||||
pg_text_to_json(&mut val, entry, elem_type)?;
|
||||
entries.push(val);
|
||||
}
|
||||
entry.clear();
|
||||
}
|
||||
// whitespace might preceed a `{`.
|
||||
let pg_array = pg_array.trim_start();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
while let Some((mut i, mut c)) = pg_array_chr.next() {
|
||||
let mut escaped = false;
|
||||
|
||||
if c == '\\' {
|
||||
escaped = true;
|
||||
let Some(x) = pg_array_chr.next() else {
|
||||
return Err(JsonConversionError::UnbalancedArray);
|
||||
};
|
||||
(i, c) = x;
|
||||
}
|
||||
|
||||
match c {
|
||||
'{' if !quote => {
|
||||
level += 1;
|
||||
if level > 1 {
|
||||
let mut array = vec![];
|
||||
let off = pg_array_parse_inner(&mut array, &pg_array[i..], elem_type, true)?;
|
||||
entries.push(Value::Array(array));
|
||||
for _ in 0..off - 1 {
|
||||
pg_array_chr.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
'}' if !quote => {
|
||||
level -= 1;
|
||||
if level == 0 {
|
||||
push_checked(&mut entry, entries, elem_type)?;
|
||||
if nested {
|
||||
return Ok(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
'"' if !escaped => {
|
||||
if quote {
|
||||
// end of quoted string, so push it manually without any checks
|
||||
// for emptiness or nulls
|
||||
let mut val = Value::Null;
|
||||
pg_text_to_json(&mut val, &entry, elem_type)?;
|
||||
entries.push(val);
|
||||
entry.clear();
|
||||
}
|
||||
quote = !quote;
|
||||
}
|
||||
',' if !quote => {
|
||||
push_checked(&mut entry, entries, elem_type)?;
|
||||
}
|
||||
_ => {
|
||||
entry.push(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if level != 0 {
|
||||
let rest = pg_array_parse_inner(elements, pg_array, elem, delim)?;
|
||||
if !rest.is_empty() {
|
||||
return Err(JsonConversionError::UnbalancedArray);
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// reads a single array from the `pg_array` string and pushes each values to `elements`.
|
||||
/// returns the rest of the `pg_array` string that was not read.
|
||||
fn pg_array_parse_inner<'a>(
|
||||
elements: &mut Vec<Value>,
|
||||
mut pg_array: &'a str,
|
||||
elem: &Type,
|
||||
delim: char,
|
||||
) -> Result<&'a str, JsonConversionError> {
|
||||
// array should have a `{` prefix.
|
||||
pg_array = pg_array
|
||||
.strip_prefix('{')
|
||||
.ok_or(JsonConversionError::UnbalancedArray)?;
|
||||
|
||||
let mut q = String::new();
|
||||
|
||||
loop {
|
||||
let value = push_entry(elements, Value::Null);
|
||||
pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?;
|
||||
|
||||
// check for separator.
|
||||
if let Some(next) = pg_array.strip_prefix(delim) {
|
||||
// next item.
|
||||
pg_array = next;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let Some(next) = pg_array.strip_prefix('}') else {
|
||||
// missing `}` terminator.
|
||||
return Err(JsonConversionError::UnbalancedArray);
|
||||
};
|
||||
|
||||
// whitespace might follow a `}`.
|
||||
Ok(next.trim_start())
|
||||
}
|
||||
|
||||
/// reads a single item from the `pg_array` string.
|
||||
/// returns the rest of the `pg_array` string that was not read.
|
||||
///
|
||||
/// `quoted` is a scratch allocation that has no defined output.
|
||||
fn pg_array_parse_item<'a>(
|
||||
output: &mut Value,
|
||||
quoted: &mut String,
|
||||
mut pg_array: &'a str,
|
||||
elem: &Type,
|
||||
delim: char,
|
||||
) -> Result<&'a str, JsonConversionError> {
|
||||
// We are trying to parse an array item.
|
||||
// This could be a new array, if this is a multi-dimentional array.
|
||||
// This could be a quoted string representing `elem`.
|
||||
// This could be an unquoted string representing `elem`.
|
||||
|
||||
// whitespace might preceed an item.
|
||||
pg_array = pg_array.trim_start();
|
||||
|
||||
if pg_array.strip_prefix('{').is_some() {
|
||||
// nested array.
|
||||
let mut nested = vec![];
|
||||
pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?;
|
||||
*output = Value::Array(nested);
|
||||
return Ok(pg_array);
|
||||
}
|
||||
|
||||
if let Some(mut pg_array) = pg_array.strip_prefix('"') {
|
||||
pg_array = pg_array_parse_quoted(quoted, pg_array)?;
|
||||
|
||||
// we have unquoted an item string:
|
||||
pg_text_to_json(output, quoted, elem)?;
|
||||
|
||||
quoted.clear();
|
||||
|
||||
return Ok(pg_array);
|
||||
}
|
||||
|
||||
// we need to parse an item. read until we find a delimiter or `}`.
|
||||
let index = pg_array
|
||||
.find([delim, '}'])
|
||||
.ok_or(JsonConversionError::UnbalancedArray)?;
|
||||
|
||||
let item;
|
||||
(item, pg_array) = pg_array.split_at(index);
|
||||
|
||||
// item might have trailing whitespace that we need to ignore.
|
||||
let item = item.trim_end();
|
||||
|
||||
// we might have an item string:
|
||||
// check for null
|
||||
if item == "NULL" {
|
||||
*output = Value::Null;
|
||||
} else {
|
||||
pg_text_to_json(output, item, elem)?;
|
||||
}
|
||||
|
||||
Ok(pg_array)
|
||||
}
|
||||
|
||||
/// reads a single quoted item from the `pg_array` string.
|
||||
///
|
||||
/// Returns the rest of the `pg_array` string that was not read.
|
||||
/// The output is written into `quoted`.
|
||||
///
|
||||
/// The pg_array string must have a `"` terminator, but the `"` initial value
|
||||
/// must have already been removed from the input. The terminator is removed.
|
||||
fn pg_array_parse_quoted<'a>(
|
||||
quoted: &mut String,
|
||||
mut pg_array: &'a str,
|
||||
) -> Result<&'a str, JsonConversionError> {
|
||||
// The array output routine will put double quotes around element values if they are empty strings,
|
||||
// contain curly braces, delimiter characters, double quotes, backslashes, or white space,
|
||||
// or match the word `NULL`. Double quotes and backslashes embedded in element values will be backslash-escaped.
|
||||
// For numeric data types it is safe to assume that double quotes will never appear,
|
||||
// but for textual data types one should be prepared to cope with either the presence or absence of quotes.
|
||||
|
||||
// We write to quoted in chunks terminated by an escape character.
|
||||
// Eg if we have the input `foo\"bar"`, then we write `foo`, then `"`, then finally `bar`.
|
||||
|
||||
loop {
|
||||
// we need to parse an chunk. read until we find a '\\' or `"`.
|
||||
let i = pg_array
|
||||
.find(['\\', '"'])
|
||||
.ok_or(JsonConversionError::UnbalancedString)?;
|
||||
|
||||
let chunk: &str;
|
||||
(chunk, pg_array) = pg_array
|
||||
.split_at_checked(i)
|
||||
.expect("i is guaranteed to be in-bounds of pg_array");
|
||||
|
||||
// push the chunk.
|
||||
quoted.push_str(chunk);
|
||||
|
||||
// consume the chunk_end character.
|
||||
let chunk_end: char;
|
||||
(chunk_end, pg_array) =
|
||||
split_first_char(pg_array).expect("pg_array should start with either '\\\\' or '\"'");
|
||||
|
||||
// finished.
|
||||
if chunk_end == '"' {
|
||||
// whitespace might follow the '"'.
|
||||
pg_array = pg_array.trim_start();
|
||||
|
||||
break Ok(pg_array);
|
||||
}
|
||||
|
||||
// consume the escaped character.
|
||||
let escaped: char;
|
||||
(escaped, pg_array) =
|
||||
split_first_char(pg_array).ok_or(JsonConversionError::UnbalancedString)?;
|
||||
|
||||
quoted.push(escaped);
|
||||
}
|
||||
}
|
||||
|
||||
/// Splits `s` into its first character and the remainder of the string.
/// Returns `None` when `s` is empty.
fn split_first_char(s: &str) -> Option<(char, &str)> {
    let first = s.chars().next()?;
    // slicing at the encoded width of `first` always lands on a char boundary.
    Some((first, &s[first.len_utf8()..]))
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -351,7 +446,7 @@ mod tests {
|
||||
|
||||
fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value {
|
||||
let mut array = vec![];
|
||||
super::pg_array_parse(&mut array, pg_array, pg_type).unwrap();
|
||||
super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap();
|
||||
Value::Array(array)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user