Skip to content

Commit 032f199

Browse files
committed
refactor: extract parse_hex_char function
1 parent d9e9430 commit 032f199

File tree

1 file changed

+101
-124
lines changed

1 file changed

+101
-124
lines changed

src/string.rs

Lines changed: 101 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -136,130 +136,10 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
136136
let text = text.as_mut().unwrap();
137137
text.push_str(previous_text);
138138
if current_char == 'u' {
139-
let mut hex_text = String::new();
140-
// expect four hex values
141-
for _ in 0..4 {
142-
let current_char = chars.move_next_char();
143-
if !is_hex(current_char) {
144-
return Err(ParseStringError {
145-
byte_index: escape_start,
146-
kind: ParseStringErrorKind::ExpectedFourHexDigits,
147-
});
148-
}
149-
if let Some(current_char) = current_char {
150-
hex_text.push(current_char);
151-
}
152-
}
153-
154-
let hex_value = match u32::from_str_radix(&hex_text, 16) {
155-
Ok(v) => v,
156-
Err(_) => {
157-
return Err(ParseStringError {
158-
byte_index: escape_start,
159-
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
160-
});
161-
}
162-
};
163-
164-
// Check if this is a high surrogate (0xD800-0xDBFF)
165-
let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
166-
// High surrogate - must be followed by low surrogate
167-
// Peek ahead for \uXXXX pattern
168-
let next_char = chars.move_next_char();
169-
if next_char != Some('\\') {
170-
return Err(ParseStringError {
171-
byte_index: escape_start,
172-
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
173-
"{} (unpaired high surrogate)",
174-
hex_text
175-
)),
176-
});
177-
}
178-
179-
let next_char = chars.move_next_char();
180-
if next_char != Some('u') {
181-
return Err(ParseStringError {
182-
byte_index: escape_start,
183-
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
184-
"{} (unpaired high surrogate)",
185-
hex_text
186-
)),
187-
});
188-
}
189-
190-
// Parse the second \uXXXX
191-
let mut hex_text2 = String::new();
192-
for _ in 0..4 {
193-
let current_char = chars.move_next_char();
194-
if !is_hex(current_char) {
195-
return Err(ParseStringError {
196-
byte_index: escape_start,
197-
kind: ParseStringErrorKind::ExpectedFourHexDigits,
198-
});
199-
}
200-
if let Some(current_char) = current_char {
201-
hex_text2.push(current_char);
202-
}
203-
}
204-
205-
let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
206-
Ok(v) => v,
207-
Err(_) => {
208-
return Err(ParseStringError {
209-
byte_index: escape_start,
210-
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2),
211-
});
212-
}
213-
};
214-
215-
// Verify it's a low surrogate (0xDC00-0xDFFF)
216-
if !(0xDC00..=0xDFFF).contains(&hex_value2) {
217-
return Err(ParseStringError {
218-
byte_index: escape_start,
219-
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
220-
"{} (high surrogate not followed by low surrogate)",
221-
hex_text
222-
)),
223-
});
224-
}
225-
226-
// Combine surrogate pair using RFC 8259 formula
227-
let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;
228-
229-
match std::char::from_u32(code_point) {
230-
Some(c) => c,
231-
None => {
232-
return Err(ParseStringError {
233-
byte_index: escape_start,
234-
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
235-
"{}\\u{} (invalid surrogate pair)",
236-
hex_text, hex_text2
237-
)),
238-
});
239-
}
240-
}
241-
} else if (0xDC00..=0xDFFF).contains(&hex_value) {
242-
// Low surrogate without high surrogate
243-
return Err(ParseStringError {
244-
byte_index: escape_start,
245-
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
246-
"{} (unpaired low surrogate)",
247-
hex_text
248-
)),
249-
});
250-
} else {
251-
// Normal unicode escape
252-
match std::char::from_u32(hex_value) {
253-
Some(hex_char) => hex_char,
254-
None => {
255-
return Err(ParseStringError {
256-
byte_index: escape_start,
257-
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
258-
});
259-
}
260-
}
261-
};
262-
139+
let hex_char = parse_hex_char(chars).map_err(|kind| ParseStringError {
140+
byte_index: escape_start,
141+
kind,
142+
})?;
263143
text.push(hex_char);
264144
last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0);
265145
} else {
@@ -308,6 +188,103 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
308188
}
309189
}
310190

191+
fn parse_hex_char<'a, T: CharProvider<'a>>(chars: &mut T) -> Result<char, ParseStringErrorKind> {
192+
let mut hex_text = String::new();
193+
// expect four hex values
194+
for _ in 0..4 {
195+
let current_char = chars.move_next_char();
196+
if !is_hex(current_char) {
197+
return Err(ParseStringErrorKind::ExpectedFourHexDigits);
198+
}
199+
if let Some(current_char) = current_char {
200+
hex_text.push(current_char);
201+
}
202+
}
203+
204+
let hex_value = match u32::from_str_radix(&hex_text, 16) {
205+
Ok(v) => v,
206+
Err(_) => {
207+
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text));
208+
}
209+
};
210+
211+
// Check if this is a high surrogate (0xD800-0xDBFF)
212+
let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
213+
// High surrogate - must be followed by low surrogate
214+
// Peek ahead for \uXXXX pattern
215+
let next_char = chars.move_next_char();
216+
if next_char != Some('\\') {
217+
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
218+
"{} (unpaired high surrogate)",
219+
hex_text
220+
)));
221+
}
222+
223+
let next_char = chars.move_next_char();
224+
if next_char != Some('u') {
225+
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
226+
"{} (unpaired high surrogate)",
227+
hex_text
228+
)));
229+
}
230+
231+
// Parse the second \uXXXX
232+
let mut hex_text2 = String::new();
233+
for _ in 0..4 {
234+
let current_char = chars.move_next_char();
235+
if !is_hex(current_char) {
236+
return Err(ParseStringErrorKind::ExpectedFourHexDigits);
237+
}
238+
if let Some(current_char) = current_char {
239+
hex_text2.push(current_char);
240+
}
241+
}
242+
243+
let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
244+
Ok(v) => v,
245+
Err(_) => {
246+
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2));
247+
}
248+
};
249+
250+
// Verify it's a low surrogate (0xDC00-0xDFFF)
251+
if !(0xDC00..=0xDFFF).contains(&hex_value2) {
252+
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
253+
"{} (high surrogate not followed by low surrogate)",
254+
hex_text
255+
)));
256+
}
257+
258+
// Combine surrogate pair using RFC 8259 formula
259+
let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;
260+
261+
match std::char::from_u32(code_point) {
262+
Some(c) => c,
263+
None => {
264+
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
265+
"{}\\u{} (invalid surrogate pair)",
266+
hex_text, hex_text2
267+
)));
268+
}
269+
}
270+
} else if (0xDC00..=0xDFFF).contains(&hex_value) {
271+
// Low surrogate without high surrogate
272+
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
273+
"{} (unpaired low surrogate)",
274+
hex_text
275+
)));
276+
} else {
277+
// Normal unicode escape
278+
match std::char::from_u32(hex_value) {
279+
Some(hex_char) => hex_char,
280+
None => {
281+
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text));
282+
}
283+
}
284+
};
285+
Ok(hex_char)
286+
}
287+
311288
fn is_hex(c: Option<char>) -> bool {
312289
let Some(c) = c else {
313290
return false;

0 commit comments

Comments
 (0)