From 0a3aa9913b2fde1ae9135a9f8f8b85bb0118a2aa Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Mon, 19 Jan 2026 11:50:02 +0800 Subject: [PATCH] fix: correctly parse json string that contain unicode code point literal (#7520) * fix: correctly parse some json strings Signed-off-by: luofucong * resolve PR comments Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: luofucong --------- Signed-off-by: luofucong --- Cargo.lock | 1 + src/datatypes/Cargo.toml | 1 + src/datatypes/src/types/json_type.rs | 123 ++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c25d495a97..5477d40cf8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4355,6 +4355,7 @@ dependencies = [ "num-traits", "ordered-float 4.6.0", "paste", + "regex", "serde", "serde_json", "snafu 0.8.6", diff --git a/src/datatypes/Cargo.toml b/src/datatypes/Cargo.toml index 8eb642f5d6..3bf2a5d8ee 100644 --- a/src/datatypes/Cargo.toml +++ b/src/datatypes/Cargo.toml @@ -28,6 +28,7 @@ num = "0.4" num-traits = "0.2" ordered-float.workspace = true paste.workspace = true +regex.workspace = true serde.workspace = true serde_json.workspace = true snafu.workspace = true diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 7e8d69252e..67c3ec951e 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -15,10 +15,11 @@ use std::collections::BTreeMap; use std::fmt::{Debug, Display, Formatter}; use std::str::FromStr; -use std::sync::Arc; +use std::sync::{Arc, LazyLock}; use arrow::datatypes::DataType as ArrowDataType; use common_base::bytes::Bytes; +use regex::{Captures, Regex}; use serde::{Deserialize, Serialize}; use snafu::ResultExt; @@ -401,8 +402,81 @@ pub fn jsonb_to_string(val: &[u8]) -> Result { /// Converts a json type value to serde_json::Value pub fn jsonb_to_serde_json(val: &[u8]) -> Result { let 
json_string = jsonb_to_string(val)?; - serde_json::Value::from_str(json_string.as_str()) - .context(DeserializeSnafu { json: json_string }) + jsonb_string_to_serde_value(&json_string) +} + +/// Attempts to deserialize a JSON text into `serde_json::Value`, with a best-effort +/// fallback for Rust-style Unicode escape sequences. +/// +/// This function is intended to be used on JSON strings produced from the internal +/// JSONB representation (e.g. via [`jsonb_to_string`]). It first calls +/// `serde_json::Value::from_str` directly. If that succeeds, the parsed value is +/// returned as-is. +/// +/// If the initial parse fails with an "invalid escape" syntax error, the input is scanned for Rust-style Unicode code +/// point escapes of the form `\u{H...}` (a backslash, `u`, an opening brace, +/// followed by 1–6 hexadecimal digits, and a closing brace). Each such escape is +/// converted into JSON-compatible UTF‑16 escape sequences: +/// +/// - For code points in the Basic Multilingual Plane (≤ `0xFFFF`), the escape is +/// converted to a single JSON `\uXXXX` sequence with four uppercase hex digits. +/// - For code points above `0xFFFF` and up to the Unicode maximum code point `0x10FFFF`, +/// the code point is encoded as a UTF‑16 surrogate pair and emitted as two consecutive +/// `\uXXXX` sequences (as the JSON format requires). +/// +/// After this normalization, the function retries parsing the resulting string as +/// JSON and returns the deserialized value or a `DeserializeSnafu` error if it +/// still cannot be parsed. +fn jsonb_string_to_serde_value(json: &str) -> Result { + match serde_json::Value::from_str(json) { + Ok(v) => Ok(v), + Err(e) => { + // If the deserialization above fails, the JSON string might contain some Rust chars + // that are somehow incorrectly represented as Unicode code point literals. For example, + // "\u{fe0f}". We have to convert them to a JSON-compatible format, like "\uFE0F", then + // try to deserialize the JSON string again. 
+ if !e.is_syntax() || !e.to_string().contains("invalid escape") { + return Err(e).context(DeserializeSnafu { json }); + } + + static UNICODE_CODE_POINT_PATTERN: LazyLock = LazyLock::new(|| { + // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits + // inside braces. + Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e)) + }); + + let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| { + // Extract the hex payload (without braces) and parse to a code point. + let hex = &caps[1]; + let Ok(code) = u32::from_str_radix(hex, 16) else { + // On parse failure, leave the original escape sequence unchanged. + return caps[0].to_string(); + }; + + if code <= 0xFFFF { + // Basic Multilingual Plane: JSON can represent this directly as \uXXXX. + format!("\\u{:04X}", code) + } else if code > 0x10FFFF { + // Beyond max Unicode code point + caps[0].to_string() + } else { + // Supplementary planes: JSON needs UTF-16 surrogate pairs. + // Convert the code point to a 20-bit value. + let code = code - 0x10000; + + // High surrogate: top 10 bits, offset by 0xD800. + let high = 0xD800 + ((code >> 10) & 0x3FF); + + // Low surrogate: bottom 10 bits, offset by 0xDC00. + let low = 0xDC00 + (code & 0x3FF); + + // Emit two \uXXXX escapes in sequence. 
+ format!("\\u{:04X}\\u{:04X}", high, low) + } + }); + serde_json::Value::from_str(&v).context(DeserializeSnafu { json }) + } + } } /// Parses a string to a json type value @@ -417,6 +491,49 @@ mod tests { use super::*; use crate::json::JsonStructureSettings; + #[test] + fn test_jsonb_string_to_serde_value() -> Result<()> { + let valid_cases = vec![ + (r#"{"data": "simple ascii"}"#, r#"{"data":"simple ascii"}"#), + ( + r#"{"data": "Greek sigma: \u{03a3}"}"#, + r#"{"data":"Greek sigma: Σ"}"#, + ), + ( + r#"{"data": "Joker card: \u{1f0df}"}"#, + r#"{"data":"Joker card: 🃟"}"#, + ), + ( + r#"{"data": "BMP boundary: \u{ffff}"}"#, + r#"{"data":"BMP boundary: ￿"}"#, + ), + ( + r#"{"data": "Supplementary min: \u{10000}"}"#, + r#"{"data":"Supplementary min: 𐀀"}"#, + ), + ( + r#"{"data": "Supplementary max: \u{10ffff}"}"#, + r#"{"data":"Supplementary max: 􏿿"}"#, + ), + ]; + for (input, expect) in valid_cases { + let v = jsonb_string_to_serde_value(input)?; + assert_eq!(v.to_string(), expect); + } + + let invalid_cases = vec![ + r#"{"data": "Invalid hex: \u{gggg}"}"#, + r#"{"data": "Beyond max Unicode code point: \u{110000}"}"#, + r#"{"data": "Out of range: \u{1100000}"}"#, // 7 digit + r#"{"data": "Empty braces: \u{}"}"#, + ]; + for input in invalid_cases { + let result = jsonb_string_to_serde_value(input); + assert!(result.is_err()); + } + Ok(()) + } + #[test] fn test_json_type_include() { fn test(this: &JsonNativeType, that: &JsonNativeType, expected: bool) {