From 4345fdf07b2e225afbfe7ca4a89f859cdf3e756f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 26 May 2025 12:35:39 +0100 Subject: [PATCH] completely rewrite array text parsing based on spec --- proxy/src/serverless/json.rs | 319 +++++++++++++++++++++++------------ 1 file changed, 207 insertions(+), 112 deletions(-) diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 48001d1d7d..4c177857e3 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -70,6 +70,8 @@ pub(crate) enum JsonConversionError { ParseJsonError(#[from] serde_json::Error), #[error("unbalanced array")] UnbalancedArray, + #[error("unbalanced quoted string")] + UnbalancedString, } enum OutputMode { @@ -80,10 +82,7 @@ enum OutputMode { impl OutputMode { fn key(&mut self, key: &str) -> &mut Value { match self { - OutputMode::Array(values) => { - values.push(Value::Null); - values.last_mut().expect("a value was just inserted") - } + OutputMode::Array(values) => push_entry(values, Value::Null), OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null), } } @@ -96,6 +95,11 @@ impl OutputMode { } } +fn push_entry(arr: &mut Vec, t: T) -> &mut T { + arr.push(t); + arr.last_mut().expect("a value was just inserted") +} + // // Convert postgres row with text-encoded values to JSON object // @@ -134,8 +138,11 @@ fn pg_text_to_json( pg_type: &Type, ) -> Result<(), JsonConversionError> { if let Kind::Array(elem_type) = pg_type.kind() { + // todo: we should fetch this from postgres. + let delimiter = ','; + let mut array = vec![]; - pg_array_parse(&mut array, val, elem_type)?; + pg_array_parse(&mut array, val, elem_type, delimiter)?; *output = Value::Array(array); return Ok(()); } @@ -165,121 +172,209 @@ fn pg_text_to_json( Ok(()) } -// -// Parse postgres array into JSON array. -// -// This is a bit involved because we need to handle nested arrays and quoted -// values. Unlike postgres we don't check that all nested arrays have the same -// dimensions, we just return them as is. -// +/// Parse postgres array into JSON array. +/// +/// This is a bit involved because we need to handle nested arrays and quoted +/// values. Unlike postgres we don't check that all nested arrays have the same +/// dimensions, we just return them as is. +/// +/// +/// +/// The external text representation of an array value consists of items that are interpreted +/// according to the I/O conversion rules for the array's element type, plus decoration that +/// indicates the array structure. The decoration consists of curly braces (`{` and `}`) around +/// the array value plus delimiter characters between adjacent items. The delimiter character +/// is usually a comma (,) but can be something else: it is determined by the typdelim setting +/// for the array's element type. Among the standard data types provided in the PostgreSQL +/// distribution, all use a comma, except for type box, which uses a semicolon (;). +/// +/// In a multidimensional array, each dimension (row, plane, cube, etc.) +/// gets its own level of curly braces, and delimiters must be written between adjacent +/// curly-braced entities of the same level. fn pg_array_parse( - entries: &mut Vec, - pg_array: &str, - elem_type: &Type, + elements: &mut Vec, + mut pg_array: &str, + elem: &Type, + delim: char, ) -> Result<(), JsonConversionError> { - pg_array_parse_inner(entries, pg_array, elem_type, false).map(|_| ()) -} - -fn pg_array_parse_inner( - entries: &mut Vec, - pg_array: &str, - elem_type: &Type, - nested: bool, -) -> Result { - let mut pg_array_chr = pg_array.char_indices(); - let mut level = 0; - let mut quote = false; - let mut entry = String::new(); - - // skip bounds decoration + // skip bounds decoration, eg: + // `[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}` + // technically these are significant, but we have no way to represent them in json. if let Some('[') = pg_array.chars().next() { - for (_, c) in pg_array_chr.by_ref() { - if c == '=' { - break; - } - } + let Some((_bounds, array)) = pg_array.split_once('=') else { + return Err(JsonConversionError::UnbalancedArray); + }; + pg_array = array; } - fn push_checked( - entry: &mut String, - entries: &mut Vec, - elem_type: &Type, - ) -> Result<(), JsonConversionError> { - if !entry.is_empty() { - // While in usual postgres response we get nulls as None and everything else - // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while - // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs - // here while we have quotation info and convert them to None. - if entry == "NULL" { - entries.push(Value::Null); - } else { - let mut val = Value::Null; - pg_text_to_json(&mut val, entry, elem_type)?; - entries.push(val); - } - entry.clear(); - } + // whitespace might preceed a `{`. + let pg_array = pg_array.trim_start(); - Ok(()) - } - - while let Some((mut i, mut c)) = pg_array_chr.next() { - let mut escaped = false; - - if c == '\\' { - escaped = true; - let Some(x) = pg_array_chr.next() else { - return Err(JsonConversionError::UnbalancedArray); - }; - (i, c) = x; - } - - match c { - '{' if !quote => { - level += 1; - if level > 1 { - let mut array = vec![]; - let off = pg_array_parse_inner(&mut array, &pg_array[i..], elem_type, true)?; - entries.push(Value::Array(array)); - for _ in 0..off - 1 { - pg_array_chr.next(); - } - } - } - '}' if !quote => { - level -= 1; - if level == 0 { - push_checked(&mut entry, entries, elem_type)?; - if nested { - return Ok(i); - } - } - } - '"' if !escaped => { - if quote { - // end of quoted string, so push it manually without any checks - // for emptiness or nulls - let mut val = Value::Null; - pg_text_to_json(&mut val, &entry, elem_type)?; - entries.push(val); - entry.clear(); - } - quote = !quote; - } - ',' if !quote => { - push_checked(&mut entry, entries, elem_type)?; - } - _ => { - entry.push(c); - } - } - } - - if level != 0 { + let rest = pg_array_parse_inner(elements, pg_array, elem, delim)?; + if !rest.is_empty() { return Err(JsonConversionError::UnbalancedArray); } - Ok(0) + Ok(()) +} + +/// reads a single array from the `pg_array` string and pushes each values to `elements`. +/// returns the rest of the `pg_array` string that was not read. +fn pg_array_parse_inner<'a>( + elements: &mut Vec, + mut pg_array: &'a str, + elem: &Type, + delim: char, +) -> Result<&'a str, JsonConversionError> { + // array should have a `{` prefix. + pg_array = pg_array + .strip_prefix('{') + .ok_or(JsonConversionError::UnbalancedArray)?; + + let mut q = String::new(); + + loop { + let value = push_entry(elements, Value::Null); + pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?; + + // check for separator. + if let Some(next) = pg_array.strip_prefix(delim) { + // next item. + pg_array = next; + } else { + break; + } + } + + let Some(next) = pg_array.strip_prefix('}') else { + // missing `}` terminator. + return Err(JsonConversionError::UnbalancedArray); + }; + + // whitespace might follow a `}`. + Ok(next.trim_start()) +} + +/// reads a single item from the `pg_array` string. +/// returns the rest of the `pg_array` string that was not read. +/// +/// `quoted` is a scratch allocation that has no defined output. +fn pg_array_parse_item<'a>( + output: &mut Value, + quoted: &mut String, + mut pg_array: &'a str, + elem: &Type, + delim: char, +) -> Result<&'a str, JsonConversionError> { + // We are trying to parse an array item. + // This could be a new array, if this is a multi-dimentional array. + // This could be a quoted string representing `elem`. + // This could be an unquoted string representing `elem`. + + // whitespace might preceed an item. + pg_array = pg_array.trim_start(); + + if pg_array.strip_prefix('{').is_some() { + // nested array. + let mut nested = vec![]; + pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?; + *output = Value::Array(nested); + return Ok(pg_array); + } + + if let Some(mut pg_array) = pg_array.strip_prefix('"') { + pg_array = pg_array_parse_quoted(quoted, pg_array)?; + + // we have unquoted an item string: + pg_text_to_json(output, quoted, elem)?; + + quoted.clear(); + + return Ok(pg_array); + } + + // we need to parse an item. read until we find a delimiter or `}`. + let index = pg_array + .find([delim, '}']) + .ok_or(JsonConversionError::UnbalancedArray)?; + + let item; + (item, pg_array) = pg_array.split_at(index); + + // item might have trailing whitespace that we need to ignore. + let item = item.trim_end(); + + // we might have an item string: + // check for null + if item == "NULL" { + *output = Value::Null; + } else { + pg_text_to_json(output, item, elem)?; + } + + Ok(pg_array) +} + +/// reads a single quoted item from the `pg_array` string. +/// +/// Returns the rest of the `pg_array` string that was not read. +/// The output is written into `quoted`. +/// +/// The pg_array string must have a `"` terminator, but the `"` initial value +/// must have already been removed from the input. The terminator is removed. +fn pg_array_parse_quoted<'a>( + quoted: &mut String, + mut pg_array: &'a str, +) -> Result<&'a str, JsonConversionError> { + // The array output routine will put double quotes around element values if they are empty strings, + // contain curly braces, delimiter characters, double quotes, backslashes, or white space, + // or match the word `NULL`. Double quotes and backslashes embedded in element values will be backslash-escaped. + // For numeric data types it is safe to assume that double quotes will never appear, + // but for textual data types one should be prepared to cope with either the presence or absence of quotes. + + // We write to quoted in chunks terminated by an escape character. + // Eg if we have the input `foo\"bar"`, then we write `foo`, then `"`, then finally `bar`. + + loop { + // we need to parse an chunk. read until we find a '\\' or `"`. + let i = pg_array + .find(['\\', '"']) + .ok_or(JsonConversionError::UnbalancedString)?; + + let chunk: &str; + (chunk, pg_array) = pg_array + .split_at_checked(i) + .expect("i is guaranteed to be in-bounds of pg_array"); + + // push the chunk. + quoted.push_str(chunk); + + // consume the chunk_end character. + let chunk_end: char; + (chunk_end, pg_array) = + split_first_char(pg_array).expect("pg_array should start with either '\\\\' or '\"'"); + + // finished. + if chunk_end == '"' { + // whitespace might follow the '"'. + pg_array = pg_array.trim_start(); + + break Ok(pg_array); + } + + // consume the escaped character. + let escaped: char; + (escaped, pg_array) = + split_first_char(pg_array).ok_or(JsonConversionError::UnbalancedString)?; + + quoted.push(escaped); + } +} + +fn split_first_char(s: &str) -> Option<(char, &str)> { + let mut chars = s.chars(); + let c = chars.next()?; + Some((c, chars.as_str())) } #[cfg(test)] @@ -351,7 +446,7 @@ mod tests { fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value { let mut array = vec![]; - super::pg_array_parse(&mut array, pg_array, pg_type).unwrap(); + super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap(); Value::Array(array) }