From 032f1994ca8188913ae713a24dd51cfd8bf6031d Mon Sep 17 00:00:00 2001 From: David Sherret Date: Wed, 12 Nov 2025 10:57:49 -0500 Subject: [PATCH] refactor: extract parse_hex_char function --- src/string.rs | 225 ++++++++++++++++++++++---------------------------- 1 file changed, 101 insertions(+), 124 deletions(-) diff --git a/src/string.rs b/src/string.rs index cf4a17f..ebbf7d9 100644 --- a/src/string.rs +++ b/src/string.rs @@ -136,130 +136,10 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>( let text = text.as_mut().unwrap(); text.push_str(previous_text); if current_char == 'u' { - let mut hex_text = String::new(); - // expect four hex values - for _ in 0..4 { - let current_char = chars.move_next_char(); - if !is_hex(current_char) { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::ExpectedFourHexDigits, - }); - } - if let Some(current_char) = current_char { - hex_text.push(current_char); - } - } - - let hex_value = match u32::from_str_radix(&hex_text, 16) { - Ok(v) => v, - Err(_) => { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text), - }); - } - }; - - // Check if this is a high surrogate (0xD800-0xDBFF) - let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) { - // High surrogate - must be followed by low surrogate - // Peek ahead for \uXXXX pattern - let next_char = chars.move_next_char(); - if next_char != Some('\\') { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( - "{} (unpaired high surrogate)", - hex_text - )), - }); - } - - let next_char = chars.move_next_char(); - if next_char != Some('u') { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( - "{} (unpaired high surrogate)", - hex_text - )), - }); - } - - // Parse the second \uXXXX - let mut hex_text2 = String::new(); - for _ in 0..4 { - let current_char = chars.move_next_char(); - if !is_hex(current_char) { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::ExpectedFourHexDigits, - }); - } - if let Some(current_char) = current_char { - hex_text2.push(current_char); - } - } - - let hex_value2 = match u32::from_str_radix(&hex_text2, 16) { - Ok(v) => v, - Err(_) => { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2), - }); - } - }; - - // Verify it's a low surrogate (0xDC00-0xDFFF) - if !(0xDC00..=0xDFFF).contains(&hex_value2) { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( - "{} (high surrogate not followed by low surrogate)", - hex_text - )), - }); - } - - // Combine surrogate pair using RFC 8259 formula - let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000; - - match std::char::from_u32(code_point) { - Some(c) => c, - None => { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( - "{}\\u{} (invalid surrogate pair)", - hex_text, hex_text2 - )), - }); - } - } - } else if (0xDC00..=0xDFFF).contains(&hex_value) { - // Low surrogate without high surrogate - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( - "{} (unpaired low surrogate)", - hex_text - )), - }); - } else { - // Normal unicode escape - match std::char::from_u32(hex_value) { - Some(hex_char) => hex_char, - None => { - return Err(ParseStringError { - byte_index: escape_start, - kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text), - }); - } - } - }; - + let hex_char = parse_hex_char(chars).map_err(|kind| ParseStringError { + byte_index: escape_start, + kind, + })?; text.push(hex_char); last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0); } else { @@ -308,6 +188,103 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>( } } +fn parse_hex_char<'a, T: CharProvider<'a>>(chars: &mut T) -> Result { + let mut hex_text = String::new(); + // expect four hex values + for _ in 0..4 { + let current_char = chars.move_next_char(); + if !is_hex(current_char) { + return Err(ParseStringErrorKind::ExpectedFourHexDigits); + } + if let Some(current_char) = current_char { + hex_text.push(current_char); + } + } + + let hex_value = match u32::from_str_radix(&hex_text, 16) { + Ok(v) => v, + Err(_) => { + return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text)); + } + }; + + // Check if this is a high surrogate (0xD800-0xDBFF) + let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) { + // High surrogate - must be followed by low surrogate + // Peek ahead for \uXXXX pattern + let next_char = chars.move_next_char(); + if next_char != Some('\\') { + return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{} (unpaired high surrogate)", + hex_text + ))); + } + + let next_char = chars.move_next_char(); + if next_char != Some('u') { + return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{} (unpaired high surrogate)", + hex_text + ))); + } + + // Parse the second \uXXXX + let mut hex_text2 = String::new(); + for _ in 0..4 { + let current_char = chars.move_next_char(); + if !is_hex(current_char) { + return Err(ParseStringErrorKind::ExpectedFourHexDigits); + } + if let Some(current_char) = current_char { + hex_text2.push(current_char); + } + } + + let hex_value2 = match u32::from_str_radix(&hex_text2, 16) { + Ok(v) => v, + Err(_) => { + return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2)); + } + }; + + // Verify it's a low surrogate (0xDC00-0xDFFF) + if !(0xDC00..=0xDFFF).contains(&hex_value2) { + return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{} (high surrogate not followed by low surrogate)", + hex_text + ))); + } + + // Combine surrogate pair using RFC 8259 formula + let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000; + + match std::char::from_u32(code_point) { + Some(c) => c, + None => { + return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{}\\u{} (invalid surrogate pair)", + hex_text, hex_text2 + ))); + } + } + } else if (0xDC00..=0xDFFF).contains(&hex_value) { + // Low surrogate without high surrogate + return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{} (unpaired low surrogate)", + hex_text + ))); + } else { + // Normal unicode escape + match std::char::from_u32(hex_value) { + Some(hex_char) => hex_char, + None => { + return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text)); + } + } + }; + Ok(hex_char) +} + fn is_hex(c: Option) -> bool { let Some(c) = c else { return false;