From 306e8398cf441ab9041da8297144000eca4657b6 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 16 Mar 2026 11:01:02 +0800 Subject: [PATCH] fix: correct unicode representation for jsonb_to_string (#7810) * fix: correct unicode representation for jsonb_to_string * refactor: correct function name and behavior * fix: fix json_to_string and provide tests --- .../src/scalars/json/json_to_string.rs | 3 +- src/datatypes/src/types/json_type.rs | 146 ++++++++---------- .../standalone/common/types/json/json.result | 64 ++++---- .../standalone/common/types/json/json.sql | 30 ++-- 4 files changed, 119 insertions(+), 124 deletions(-) diff --git a/src/common/function/src/scalars/json/json_to_string.rs b/src/common/function/src/scalars/json/json_to_string.rs index 6c0cc260b2..6364dff4de 100644 --- a/src/common/function/src/scalars/json/json_to_string.rs +++ b/src/common/function/src/scalars/json/json_to_string.rs @@ -19,6 +19,7 @@ use datafusion_common::DataFusionError; use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder}; use datafusion_common::arrow::datatypes::DataType; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use datatypes::types::jsonb_to_string; use crate::function::{Function, extract_args}; @@ -74,7 +75,7 @@ impl Function for JsonToStringFunction { for i in 0..size { let json = jsons.is_valid(i).then(|| jsons.value(i)); let result = json - .map(|json| jsonb::from_slice(json).map(|x| x.to_string())) + .map(jsonb_to_string) .transpose() .map_err(|e| DataFusionError::Execution(format!("invalid json binary: {e}")))?; diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 61586fc460..912bbfca54 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -396,7 +396,7 @@ pub fn jsonb_to_string(val: &[u8]) -> Result { match jsonb::from_slice(val) { Ok(jsonb_value) => { let serialized = jsonb_value.to_string(); - Ok(serialized) + fix_unicode_point(&serialized) } Err(e) => InvalidJsonbSnafu { error: e }.fail(), } @@ -405,18 +405,12 @@ pub fn jsonb_to_string(val: &[u8]) -> Result { /// Converts a json type value to serde_json::Value pub fn jsonb_to_serde_json(val: &[u8]) -> Result { let json_string = jsonb_to_string(val)?; - jsonb_string_to_serde_value(&json_string) + serde_json::Value::from_str(&json_string).context(DeserializeSnafu { json: json_string }) } -/// Attempts to deserialize a JSON text into `serde_json::Value`, with a best-effort -/// fallback for Rust-style Unicode escape sequences. +/// Normalizes a JSON string by converting Rust-style Unicode escape sequences to JSON-compatible format. /// -/// This function is intended to be used on JSON strings produced from the internal -/// JSONB representation (e.g. via [`jsonb_to_string`]). It first calls -/// `serde_json::Value::from_str` directly. If that succeeds, the parsed value is -/// returned as-is. -/// -/// If the initial parse fails, the input is scanned for Rust-style Unicode code +/// The input is scanned for Rust-style Unicode code /// point escapes of the form `\\u{H...}` (a backslash, `u`, an opening brace, /// followed by 1–6 hexadecimal digits, and a closing brace). Each such escape is /// converted into JSON-compatible UTF‑16 escape sequences: @@ -427,59 +421,44 @@ pub fn jsonb_to_serde_json(val: &[u8]) -> Result { /// the code point is encoded as a UTF‑16 surrogate pair and emitted as two consecutive /// `\\uXXXX` sequences (as JSON format required). /// -/// After this normalization, the function retries parsing the resulting string as -/// JSON and returns the deserialized value or a `DeserializeSnafu` error if it -/// still cannot be parsed. -fn jsonb_string_to_serde_value(json: &str) -> Result { - match serde_json::Value::from_str(json) { - Ok(v) => Ok(v), - Err(e) => { - // If above deserialization is failed, the JSON string might contain some Rust chars - // that are somehow incorrectly represented as Unicode code point literal. For example, - // "\u{fe0f}". We have to convert them to JSON compatible format, like "\uFE0F", then - // try to deserialize the JSON string again. - if !e.is_syntax() || !e.to_string().contains("invalid escape") { - return Err(e).context(DeserializeSnafu { json }); - } +/// After this normalization, the function returns the normalized string +fn fix_unicode_point(json: &str) -> Result { + static UNICODE_CODE_POINT_PATTERN: LazyLock = LazyLock::new(|| { + // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits + // inside braces. + Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e)) + }); - static UNICODE_CODE_POINT_PATTERN: LazyLock = LazyLock::new(|| { - // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits - // inside braces. - Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e)) - }); + let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| { + // Extract the hex payload (without braces) and parse to a code point. + let hex = &caps[1]; + let Ok(code) = u32::from_str_radix(hex, 16) else { + // On parse failure, leave the original escape sequence unchanged. + return caps[0].to_string(); + }; - let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| { - // Extract the hex payload (without braces) and parse to a code point. - let hex = &caps[1]; - let Ok(code) = u32::from_str_radix(hex, 16) else { - // On parse failure, leave the original escape sequence unchanged. - return caps[0].to_string(); - }; + if code <= 0xFFFF { + // Basic Multilingual Plane: JSON can represent this directly as \uXXXX. + format!("\\u{:04X}", code) + } else if code > 0x10FFFF { + // Beyond max Unicode code point + caps[0].to_string() + } else { + // Supplementary planes: JSON needs UTF-16 surrogate pairs. + // Convert the code point to a 20-bit value. + let code = code - 0x10000; - if code <= 0xFFFF { - // Basic Multilingual Plane: JSON can represent this directly as \uXXXX. - format!("\\u{:04X}", code) - } else if code > 0x10FFFF { - // Beyond max Unicode code point - caps[0].to_string() - } else { - // Supplementary planes: JSON needs UTF-16 surrogate pairs. - // Convert the code point to a 20-bit value. - let code = code - 0x10000; + // High surrogate: top 10 bits, offset by 0xD800. + let high = 0xD800 + ((code >> 10) & 0x3FF); - // High surrogate: top 10 bits, offset by 0xD800. - let high = 0xD800 + ((code >> 10) & 0x3FF); + // Low surrogate: bottom 10 bits, offset by 0xDC00. + let low = 0xDC00 + (code & 0x3FF); - // Low surrogate: bottom 10 bits, offset by 0xDC00. - let low = 0xDC00 + (code & 0x3FF); - - // Emit two \uXXXX escapes in sequence. - format!("\\u{:04X}\\u{:04X}", high, low) - } - }); - serde_json::Value::from_str(&v).context(DeserializeSnafu { json }) + // Emit two \uXXXX escapes in sequence. + format!("\\u{:04X}\\u{:04X}", high, low) } - } + }); + Ok(v.to_string()) } /// Parses a string to a json type value @@ -495,45 +474,54 @@ mod tests { use crate::json::JsonStructureSettings; #[test] - fn test_jsonb_string_to_serde_value() -> Result<()> { + fn test_fix_unicode_point() -> Result<()> { let valid_cases = vec![ - (r#"{"data": "simple ascii"}"#, r#"{"data":"simple ascii"}"#), + (r#"{"data": "simple ascii"}"#, r#"{"data": "simple ascii"}"#), ( - r#"{"data": "Greek sigma: \u{03a3}"}"#, - r#"{"data":"Greek sigma: Σ"}"#, + r#"{"data":"Greek sigma: \u{03a3}"}"#, + r#"{"data":"Greek sigma: \u03A3"}"#, ), ( - r#"{"data": "Joker card: \u{1f0df}"}"#, - r#"{"data":"Joker card: 🃟"}"#, + r#"{"data":"Joker card: \u{1f0df}"}"#, + r#"{"data":"Joker card: \uD83C\uDCDF"}"#, ), ( - r#"{"data": "BMP boundary: \u{ffff}"}"#, - r#"{"data":"BMP boundary: ￿"}"#, + r#"{"data":"BMP boundary: \u{ffff}"}"#, + r#"{"data":"BMP boundary: \uFFFF"}"#, ), ( - r#"{"data": "Supplementary min: \u{10000}"}"#, - r#"{"data":"Supplementary min: 𐀀"}"#, + r#"{"data":"Supplementary min: \u{10000}"}"#, + r#"{"data":"Supplementary min: \uD800\uDC00"}"#, ), ( - r#"{"data": "Supplementary max: \u{10ffff}"}"#, - r#"{"data":"Supplementary max: 􏿿"}"#, + r#"{"data":"Supplementary max: \u{10ffff}"}"#, + r#"{"data":"Supplementary max: \uDBFF\uDFFF"}"#, ), ]; for (input, expect) in valid_cases { - let v = jsonb_string_to_serde_value(input)?; - assert_eq!(v.to_string(), expect); + let v = fix_unicode_point(input)?; + assert_eq!(v, expect); } - let invalid_cases = vec![ - r#"{"data": "Invalid hex: \u{gggg}"}"#, - r#"{"data": "Beyond max Unicode code point: \u{110000}"}"#, - r#"{"data": "Out of range: \u{1100000}"}"#, // 7 digit - r#"{"data": "Empty braces: \u{}"}"#, + let invalid_escape_cases = vec![ + ( + r#"{"data": "Invalid hex: \u{gggg}"}"#, + r#"{"data": "Invalid hex: \u{gggg}"}"#, + ), + ( + r#"{"data": "Empty braces: \u{}"}"#, + r#"{"data": "Empty braces: \u{}"}"#, + ), + ( + r#"{"data": "Out of range: \u{1100000}"}"#, + r#"{"data": "Out of range: \u{1100000}"}"#, + ), ]; - for input in invalid_cases { - let result = jsonb_string_to_serde_value(input); - assert!(result.is_err()); + for (input, expect) in invalid_escape_cases { + let v = fix_unicode_point(input)?; + assert_eq!(v, expect); } + Ok(()) } diff --git a/tests/cases/standalone/common/types/json/json.result b/tests/cases/standalone/common/types/json/json.result index 8c4755f4ae..8fad9632b1 100644 --- a/tests/cases/standalone/common/types/json/json.result +++ b/tests/cases/standalone/common/types/json/json.result @@ -37,22 +37,23 @@ INSERT INTO jsons VALUES('[null]', 0), } ] } -}}', 11); +}}', 11), +('{"a":"abc\u2028tom"}', 12); -Affected Rows: 12 +Affected Rows: 13 -INSERT INTO jsons VALUES(parse_json('[null]'), 12), -(parse_json('[true]'), 13), -(parse_json('[false]'), 14), -(parse_json('[0]'), 15), -(parse_json('["foo"]'), 16), -(parse_json('[]'), 17), -(parse_json('{}'), 18), -(parse_json('[0,1]'), 19), -(parse_json('{"foo":"bar"}'), 20), -(parse_json('{"a":null,"foo":"bar"}'), 21), -(parse_json('[-1]'), 22), -(parse_json('[-2147483648]'), 23), +INSERT INTO jsons VALUES(parse_json('[null]'), 1000), +(parse_json('[true]'), 1001), +(parse_json('[false]'), 1002), +(parse_json('[0]'), 1003), +(parse_json('["foo"]'), 1004), +(parse_json('[]'), 1005), +(parse_json('{}'), 1006), +(parse_json('[0,1]'), 1007), +(parse_json('{"foo":"bar"}'), 1008), +(parse_json('{"a":null,"foo":"bar"}'), 1009), +(parse_json('[-1]'), 1010), +(parse_json('[-2147483648]'), 1011), (parse_json('{"entities": { "description": { "urls": [ @@ -76,9 +77,10 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12), } ] } - }}'), 24); + }}'), 1012), +(parse_json('{"a":"abc\u2028tom"}'), 1013); -Affected Rows: 13 +Affected Rows: 14 SELECT json_to_string(j), t FROM jsons; @@ -97,25 +99,27 @@ SELECT json_to_string(j), t FROM jsons; | {"a":null,"foo":"bar"} | 1970-01-01T00:00:00.009 | | [-1] | 1970-01-01T00:00:00.010 | | {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.011 | -| [null] | 1970-01-01T00:00:00.012 | -| [true] | 1970-01-01T00:00:00.013 | -| [false] | 1970-01-01T00:00:00.014 | -| [0] | 1970-01-01T00:00:00.015 | -| ["foo"] | 1970-01-01T00:00:00.016 | -| [] | 1970-01-01T00:00:00.017 | -| {} | 1970-01-01T00:00:00.018 | -| [0,1] | 1970-01-01T00:00:00.019 | -| {"foo":"bar"} | 1970-01-01T00:00:00.020 | -| {"a":null,"foo":"bar"} | 1970-01-01T00:00:00.021 | -| [-1] | 1970-01-01T00:00:00.022 | -| [-2147483648] | 1970-01-01T00:00:00.023 | -| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.024 | +| {"a":"abc\u2028tom"} | 1970-01-01T00:00:00.012 | +| [null] | 1970-01-01T00:00:01 | +| [true] | 1970-01-01T00:00:01.001 | +| [false] | 1970-01-01T00:00:01.002 | +| [0] | 1970-01-01T00:00:01.003 | +| ["foo"] | 1970-01-01T00:00:01.004 | +| [] | 1970-01-01T00:00:01.005 | +| {} | 1970-01-01T00:00:01.006 | +| [0,1] | 1970-01-01T00:00:01.007 | +| {"foo":"bar"} | 1970-01-01T00:00:01.008 | +| {"a":null,"foo":"bar"} | 1970-01-01T00:00:01.009 | +| [-1] | 1970-01-01T00:00:01.010 | +| [-2147483648] | 1970-01-01T00:00:01.011 | +| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:01.012 | +| {"a":"abc\u2028tom"} | 1970-01-01T00:00:01.013 | +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+ --Insert invalid json strings-- DELETE FROM jsons; -Affected Rows: 25 +Affected Rows: 27 INSERT INTO jsons VALUES(parse_json('{"a":1, "b":2, "c":3'), 4); diff --git a/tests/cases/standalone/common/types/json/json.sql b/tests/cases/standalone/common/types/json/json.sql index 868edc59e8..5a521ee1c6 100644 --- a/tests/cases/standalone/common/types/json/json.sql +++ b/tests/cases/standalone/common/types/json/json.sql @@ -35,20 +35,21 @@ INSERT INTO jsons VALUES('[null]', 0), } ] } -}}', 11); +}}', 11), +('{"a":"abc\u2028tom"}', 12); -INSERT INTO jsons VALUES(parse_json('[null]'), 12), -(parse_json('[true]'), 13), -(parse_json('[false]'), 14), -(parse_json('[0]'), 15), -(parse_json('["foo"]'), 16), -(parse_json('[]'), 17), -(parse_json('{}'), 18), -(parse_json('[0,1]'), 19), -(parse_json('{"foo":"bar"}'), 20), -(parse_json('{"a":null,"foo":"bar"}'), 21), -(parse_json('[-1]'), 22), -(parse_json('[-2147483648]'), 23), +INSERT INTO jsons VALUES(parse_json('[null]'), 1000), +(parse_json('[true]'), 1001), +(parse_json('[false]'), 1002), +(parse_json('[0]'), 1003), +(parse_json('["foo"]'), 1004), +(parse_json('[]'), 1005), +(parse_json('{}'), 1006), +(parse_json('[0,1]'), 1007), +(parse_json('{"foo":"bar"}'), 1008), +(parse_json('{"a":null,"foo":"bar"}'), 1009), +(parse_json('[-1]'), 1010), +(parse_json('[-2147483648]'), 1011), (parse_json('{"entities": { "description": { "urls": [ @@ -72,7 +73,8 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12), } ] } - }}'), 24); + }}'), 1012), +(parse_json('{"a":"abc\u2028tom"}'), 1013); SELECT json_to_string(j), t FROM jsons;