Compare commits

...

2 Commits

Author SHA1 Message Date
LFC
f40d777dc0 resolve PR comments
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

Signed-off-by: luofucong <luofc@foxmail.com>
2026-01-06 11:16:24 +08:00
luofucong
911ee49a11 fix: correctly parse some json strings
Signed-off-by: luofucong <luofc@foxmail.com>
2026-01-05 21:20:52 +08:00
3 changed files with 122 additions and 3 deletions

1
Cargo.lock generated
View File

@@ -4100,6 +4100,7 @@ dependencies = [
"num-traits",
"ordered-float 4.6.0",
"paste",
"regex",
"serde",
"serde_json",
"snafu 0.8.6",

View File

@@ -28,6 +28,7 @@ num = "0.4"
num-traits = "0.2"
ordered-float.workspace = true
paste.workspace = true
regex.workspace = true
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true

View File

@@ -15,10 +15,11 @@
use std::collections::BTreeMap;
use std::fmt::{Debug, Display, Formatter};
use std::str::FromStr;
use std::sync::Arc;
use std::sync::{Arc, LazyLock};
use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::Bytes;
use regex::{Captures, Regex};
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
@@ -401,8 +402,81 @@ pub fn jsonb_to_string(val: &[u8]) -> Result<String> {
/// Converts a json type value to serde_json::Value
///
/// The JSONB bytes are first rendered to JSON text with [`jsonb_to_string`], and
/// that text is then parsed by [`jsonb_string_to_serde_value`].
///
/// # Errors
///
/// Returns the error from [`jsonb_to_string`] if the bytes cannot be rendered, or
/// the error from [`jsonb_string_to_serde_value`] if the rendered text cannot be
/// parsed.
pub fn jsonb_to_serde_json(val: &[u8]) -> Result<serde_json::Value> {
    let json_string = jsonb_to_string(val)?;
    jsonb_string_to_serde_value(&json_string)
}
/// Attempts to deserialize a JSON text into `serde_json::Value`, with a best-effort
/// fallback for Rust-style Unicode escape sequences.
///
/// This function is intended to be used on JSON strings produced from the internal
/// JSONB representation (e.g. via [`jsonb_to_string`]). It first calls
/// `serde_json::Value::from_str` directly. If that succeeds, the parsed value is
/// returned as-is.
///
/// If the initial parse fails with a syntax error whose message contains
/// "invalid escape", the input is scanned for Rust-style Unicode code point escapes
/// of the form `\u{H...}` (a backslash, `u`, an opening brace, 1 to 6 hexadecimal
/// digits, and a closing brace). Each such escape is converted into JSON-compatible
/// UTF-16 escape sequences:
///
/// - For code points in the Basic Multilingual Plane (≤ `0xFFFF`), the escape is
///   converted to a single JSON `\uXXXX` sequence with four uppercase hex digits.
/// - For code points above `0xFFFF` and up to the Unicode maximum `0x10FFFF`, the
///   code point is encoded as a UTF-16 surrogate pair and emitted as two consecutive
///   `\uXXXX` sequences (as the JSON format requires).
/// - Values above `0x10FFFF` are left unchanged, so the retry fails on them.
///
/// An escaped backslash (`\\`) is always skipped as a unit, so the literal text
/// `\\u{41}` (a backslash character followed by `u{41}`) is never rewritten.
///
/// After this normalization, the function retries parsing the resulting string.
///
/// # Errors
///
/// Returns a `DeserializeSnafu` error carrying the original `json` text when the
/// first parse fails for any other reason, or when the normalized text still
/// cannot be parsed.
fn jsonb_string_to_serde_value(json: &str) -> Result<serde_json::Value> {
    // Matches either an escaped backslash (`\\`) or a literal `\u{...}` sequence with
    // 1 to 6 hex digits captured in group 1. Consuming `\\` pairs keeps the scan aligned
    // on real escape boundaries, since the regex engine offers no look-behind.
    static ESCAPE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"\\(?:\\|u\{([0-9a-fA-F]{1,6})\})").unwrap_or_else(|e| panic!("{}", e))
    });

    let first_error = match serde_json::Value::from_str(json) {
        Ok(value) => return Ok(value),
        Err(e) => e,
    };

    // The JSON string might contain some Rust chars that are somehow represented as
    // Unicode code point literals, for example "\u{fe0f}". Only that failure mode is
    // worth a retry; everything else is reported immediately.
    if !first_error.is_syntax() || !first_error.to_string().contains("invalid escape") {
        return Err(first_error).context(DeserializeSnafu { json });
    }

    let normalized = ESCAPE_PATTERN.replace_all(json, |caps: &Captures| {
        // No hex group means the match is an escaped backslash: keep it verbatim.
        let Some(hex) = caps.get(1) else {
            return caps[0].to_string();
        };
        let Ok(code) = u32::from_str_radix(hex.as_str(), 16) else {
            // On parse failure, leave the original escape sequence unchanged.
            return caps[0].to_string();
        };
        match code {
            // Basic Multilingual Plane: JSON can represent this directly as \uXXXX.
            0..=0xFFFF => format!("\\u{:04X}", code),
            // Supplementary planes: JSON needs a UTF-16 surrogate pair.
            0x1_0000..=0x10_FFFF => {
                // The offset fits in 20 bits: the top 10 go to the high surrogate,
                // the bottom 10 to the low surrogate.
                let offset = code - 0x1_0000;
                let high = 0xD800 + (offset >> 10);
                let low = 0xDC00 + (offset & 0x3FF);
                format!("\\u{:04X}\\u{:04X}", high, low)
            }
            // Beyond the max Unicode code point: keep as-is so the retry reports it.
            _ => caps[0].to_string(),
        }
    });

    serde_json::Value::from_str(&normalized).context(DeserializeSnafu { json })
}
/// Parses a string to a json type value
@@ -417,6 +491,49 @@ mod tests {
use super::*;
use crate::json::JsonStructureSettings;
#[test]
fn test_jsonb_string_to_serde_value() -> Result<()> {
    // Each pair is (input JSON text, expected compact rendering of the parsed value).
    let accepted = [
        (r#"{"data": "simple ascii"}"#, r#"{"data":"simple ascii"}"#),
        (
            r#"{"data": "Greek sigma: \u{03a3}"}"#,
            r#"{"data":"Greek sigma: Σ"}"#,
        ),
        (
            r#"{"data": "Joker card: \u{1f0df}"}"#,
            r#"{"data":"Joker card: 🃟"}"#,
        ),
        (
            r#"{"data": "BMP boundary: \u{ffff}"}"#,
            r#"{"data":"BMP boundary: ￿"}"#,
        ),
        (
            r#"{"data": "Supplementary min: \u{10000}"}"#,
            r#"{"data":"Supplementary min: 𐀀"}"#,
        ),
        (
            r#"{"data": "Supplementary max: \u{10ffff}"}"#,
            r#"{"data":"Supplementary max: 􏿿"}"#,
        ),
    ];
    for (raw, rendered) in accepted {
        let parsed = jsonb_string_to_serde_value(raw)?;
        assert_eq!(parsed.to_string(), rendered);
    }

    // Inputs whose brace escapes cannot be normalized must still be rejected.
    let rejected = [
        r#"{"data": "Invalid hex: \u{gggg}"}"#,
        r#"{"data": "Beyond max Unicode code point: \u{110000}"}"#,
        r#"{"data": "Out of range: \u{1100000}"}"#, // 7 digit
        r#"{"data": "Empty braces: \u{}"}"#,
    ];
    for raw in rejected {
        let result = jsonb_string_to_serde_value(raw);
        assert!(result.is_err());
    }

    Ok(())
}
#[test]
fn test_json_type_include() {
fn test(this: &JsonNativeType, that: &JsonNativeType, expected: bool) {