Skip to content

Commit d9e9430

Browse files
authored
fix: support UTF-16 surrogate pairs in unicode escape sequences (#59)
1 parent 52b0ca7 commit d9e9430

File tree

3 files changed

+168
-5
lines changed

3 files changed

+168
-5
lines changed

src/parse_to_value.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,4 +155,68 @@ mod tests {
155155
assert_eq!(err.range().end, 11);
156156
assert_eq!(err.kind().clone(), ParseErrorKind::UnexpectedToken);
157157
}
158+
159+
#[test]
160+
fn it_should_parse_surrogate_pair() {
161+
// RFC 8259 § 7: when escaped, the non-BMP character 𝄞 (U+1D11E) is written as the surrogate pair \uD834\uDD1E
162+
let src = r#""\uD834\uDD1E""#;
163+
let v = parse_to_value(src, &Default::default()).unwrap().unwrap();
164+
if let JsonValue::String(s) = v {
165+
assert_eq!("\u{1D11E}", s.as_ref());
166+
} else {
167+
panic!("Expected string value, got {:?}", v);
168+
}
169+
}
170+
171+
#[test]
172+
fn it_should_parse_multiple_surrogate_pairs() {
173+
let src = r#""\uD834\uDD1E\uD834\uDD1E""#;
174+
let v = parse_to_value(src, &Default::default()).unwrap().unwrap();
175+
if let JsonValue::String(s) = v {
176+
assert_eq!("\u{1D11E}\u{1D11E}", s.as_ref());
177+
} else {
178+
panic!("Expected string value, got {:?}", v);
179+
}
180+
}
181+
182+
#[test]
183+
fn it_should_parse_mixed_escapes_with_surrogate_pairs() {
184+
// "A𝄞B" where 𝄞 is encoded as surrogate pair
185+
let src = r#""\u0041\uD834\uDD1E\u0042""#;
186+
let v = parse_to_value(src, &Default::default()).unwrap().unwrap();
187+
if let JsonValue::String(s) = v {
188+
assert_eq!("A\u{1D11E}B", s.as_ref());
189+
} else {
190+
panic!("Expected string value, got {:?}", v);
191+
}
192+
}
193+
194+
#[test]
195+
fn it_should_error_on_unpaired_high_surrogate_with_text() {
196+
let src = r#""\uD834x""#;
197+
let err = parse_to_value(src, &Default::default()).err().unwrap();
198+
assert!(err.to_string().contains("unpaired high surrogate"));
199+
}
200+
201+
#[test]
202+
fn it_should_error_on_unpaired_high_surrogate_at_eof() {
203+
let src = r#""\uD834""#;
204+
let err = parse_to_value(src, &Default::default()).err().unwrap();
205+
assert!(err.to_string().contains("unpaired high surrogate"));
206+
}
207+
208+
#[test]
209+
fn it_should_error_on_high_surrogate_followed_by_non_low_surrogate() {
210+
let src = r#""\uD834\u0041""#;
211+
let err = parse_to_value(src, &Default::default()).err().unwrap();
212+
assert!(err.to_string().contains("not followed by low surrogate"));
213+
}
214+
215+
#[test]
216+
fn it_should_error_on_unpaired_low_surrogate() {
217+
// A lone low surrogate was already rejected before this change; the error now also reports "unpaired low surrogate"
218+
let src = r#""\uDC00""#;
219+
let err = parse_to_value(src, &Default::default()).err().unwrap();
220+
assert!(err.to_string().contains("unpaired low surrogate"));
221+
}
158222
}

src/scanner.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,7 @@ mod tests {
620620
fn it_errors_on_invalid_utf8_char_for_issue_6() {
621621
assert_has_error(
622622
"\"\\uDF06\"",
623-
"Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2",
623+
"Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",
624624
);
625625
}
626626

src/string.rs

Lines changed: 103 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,16 +151,115 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
151151
}
152152
}
153153

154-
let hex_u32 = u32::from_str_radix(&hex_text, 16);
155-
let hex_char = match hex_u32.ok().and_then(std::char::from_u32) {
156-
Some(hex_char) => hex_char,
157-
None => {
154+
let hex_value = match u32::from_str_radix(&hex_text, 16) {
155+
Ok(v) => v,
156+
Err(_) => {
158157
return Err(ParseStringError {
159158
byte_index: escape_start,
160159
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
161160
});
162161
}
163162
};
163+
164+
// Check if this is a high surrogate (0xD800-0xDBFF)
165+
let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
166+
// High surrogate - must be followed by low surrogate
167+
// Consume the next characters, which must form a second \uXXXX escape
168+
let next_char = chars.move_next_char();
169+
if next_char != Some('\\') {
170+
return Err(ParseStringError {
171+
byte_index: escape_start,
172+
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
173+
"{} (unpaired high surrogate)",
174+
hex_text
175+
)),
176+
});
177+
}
178+
179+
let next_char = chars.move_next_char();
180+
if next_char != Some('u') {
181+
return Err(ParseStringError {
182+
byte_index: escape_start,
183+
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
184+
"{} (unpaired high surrogate)",
185+
hex_text
186+
)),
187+
});
188+
}
189+
190+
// Parse the second \uXXXX
191+
let mut hex_text2 = String::new();
192+
for _ in 0..4 {
193+
let current_char = chars.move_next_char();
194+
if !is_hex(current_char) {
195+
return Err(ParseStringError {
196+
byte_index: escape_start,
197+
kind: ParseStringErrorKind::ExpectedFourHexDigits,
198+
});
199+
}
200+
if let Some(current_char) = current_char {
201+
hex_text2.push(current_char);
202+
}
203+
}
204+
205+
let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
206+
Ok(v) => v,
207+
Err(_) => {
208+
return Err(ParseStringError {
209+
byte_index: escape_start,
210+
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2),
211+
});
212+
}
213+
};
214+
215+
// Verify it's a low surrogate (0xDC00-0xDFFF)
216+
if !(0xDC00..=0xDFFF).contains(&hex_value2) {
217+
return Err(ParseStringError {
218+
byte_index: escape_start,
219+
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
220+
"{} (high surrogate not followed by low surrogate)",
221+
hex_text
222+
)),
223+
});
224+
}
225+
226+
// Combine the surrogate pair into one code point (standard UTF-16 decoding; RFC 8259 § 7 only describes the escaped form)
227+
let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;
228+
229+
match std::char::from_u32(code_point) {
230+
Some(c) => c,
231+
None => {
232+
return Err(ParseStringError {
233+
byte_index: escape_start,
234+
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
235+
"{}\\u{} (invalid surrogate pair)",
236+
hex_text, hex_text2
237+
)),
238+
});
239+
}
240+
}
241+
} else if (0xDC00..=0xDFFF).contains(&hex_value) {
242+
// Low surrogate without high surrogate
243+
return Err(ParseStringError {
244+
byte_index: escape_start,
245+
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
246+
"{} (unpaired low surrogate)",
247+
hex_text
248+
)),
249+
});
250+
} else {
251+
// Normal unicode escape
252+
match std::char::from_u32(hex_value) {
253+
Some(hex_char) => hex_char,
254+
None => {
255+
return Err(ParseStringError {
256+
byte_index: escape_start,
257+
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
258+
});
259+
}
260+
}
261+
};
262+
164263
text.push(hex_char);
165264
last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0);
166265
} else {

0 commit comments

Comments
 (0)