fix: support UTF-16 surrogate pairs in unicode escape sequences (#59)

Maaarcocr · web-flow · commit d9e943083fd2 · 2025-11-12T10:27:37.000-05:00
diff --git a/src/parse_to_value.rs b/src/parse_to_value.rs
@@ -155,4 +155,68 @@ mod tests {
     assert_eq!(err.range().end, 11);
     assert_eq!(err.kind().clone(), ParseErrorKind::UnexpectedToken);
   }
+
+  #[test]
+  fn it_should_parse_surrogate_pair() {
+    // RFC 8259 § 7: the non-BMP character 𝄞 (U+1D11E) may be escaped as the surrogate pair \uD834\uDD1E, which must decode back to it
+    let src = r#""\uD834\uDD1E""#;
+    let v = parse_to_value(src, &Default::default()).unwrap().unwrap();
+    if let JsonValue::String(s) = v {
+      assert_eq!("\u{1D11E}", s.as_ref());
+    } else {
+      panic!("Expected string value, got {:?}", v);
+    }
+  }
+
+  #[test]
+  fn it_should_parse_multiple_surrogate_pairs() {
+    let src = r#""\uD834\uDD1E\uD834\uDD1E""#;
+    let v = parse_to_value(src, &Default::default()).unwrap().unwrap();
+    if let JsonValue::String(s) = v {
+      assert_eq!("\u{1D11E}\u{1D11E}", s.as_ref());
+    } else {
+      panic!("Expected string value, got {:?}", v);
+    }
+  }
+
+  #[test]
+  fn it_should_parse_mixed_escapes_with_surrogate_pairs() {
+    // "A𝄞B" where 𝄞 is encoded as surrogate pair
+    let src = r#""\u0041\uD834\uDD1E\u0042""#;
+    let v = parse_to_value(src, &Default::default()).unwrap().unwrap();
+    if let JsonValue::String(s) = v {
+      assert_eq!("A\u{1D11E}B", s.as_ref());
+    } else {
+      panic!("Expected string value, got {:?}", v);
+    }
+  }
+
+  #[test]
+  fn it_should_error_on_unpaired_high_surrogate_with_text() {
+    let src = r#""\uD834x""#;
+    let err = parse_to_value(src, &Default::default()).err().unwrap();
+    assert!(err.to_string().contains("unpaired high surrogate"));
+  }
+
+  #[test]
+  fn it_should_error_on_unpaired_high_surrogate_at_eof() {
+    let src = r#""\uD834""#;
+    let err = parse_to_value(src, &Default::default()).err().unwrap();
+    assert!(err.to_string().contains("unpaired high surrogate"));
+  }
+
+  #[test]
+  fn it_should_error_on_high_surrogate_followed_by_non_low_surrogate() {
+    let src = r#""\uD834\u0041""#;
+    let err = parse_to_value(src, &Default::default()).err().unwrap();
+    assert!(err.to_string().contains("not followed by low surrogate"));
+  }
+
+  #[test]
+  fn it_should_error_on_unpaired_low_surrogate() {
+    // A lone low surrogate was already rejected before this change; only the "(unpaired low surrogate)" detail in the message is new
+    let src = r#""\uDC00""#;
+    let err = parse_to_value(src, &Default::default()).err().unwrap();
+    assert!(err.to_string().contains("unpaired low surrogate"));
+  }
 }
diff --git a/src/scanner.rs b/src/scanner.rs
@@ -620,7 +620,7 @@ mod tests {
   fn it_errors_on_invalid_utf8_char_for_issue_6() {
     assert_has_error(
       "\"\\uDF06\"",
-      "Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2",
+      "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",
     );
   }
 
diff --git a/src/string.rs b/src/string.rs
@@ -151,16 +151,115 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
               }
             }
 
-            let hex_u32 = u32::from_str_radix(&hex_text, 16);
-            let hex_char = match hex_u32.ok().and_then(std::char::from_u32) {
-              Some(hex_char) => hex_char,
-              None => {
+            let hex_value = match u32::from_str_radix(&hex_text, 16) {
+              Ok(v) => v,
+              Err(_) => {
                 return Err(ParseStringError {
                   byte_index: escape_start,
                   kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
                 });
               }
             };
+
+            // Check if this is a high surrogate (0xD800-0xDBFF)
+            let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
+              // High surrogate - must be followed by low surrogate
+              // Consume the next two characters, which must be the `\u` that opens the low surrogate escape
+              let next_char = chars.move_next_char();
+              if next_char != Some('\\') {
+                return Err(ParseStringError {
+                  byte_index: escape_start,
+                  kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
+                    "{} (unpaired high surrogate)",
+                    hex_text
+                  )),
+                });
+              }
+
+              let next_char = chars.move_next_char();
+              if next_char != Some('u') {
+                return Err(ParseStringError {
+                  byte_index: escape_start,
+                  kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
+                    "{} (unpaired high surrogate)",
+                    hex_text
+                  )),
+                });
+              }
+
+              // Parse the second \uXXXX
+              let mut hex_text2 = String::new();
+              for _ in 0..4 {
+                let current_char = chars.move_next_char();
+                if !is_hex(current_char) {
+                  return Err(ParseStringError {
+                    byte_index: escape_start,
+                    kind: ParseStringErrorKind::ExpectedFourHexDigits,
+                  });
+                }
+                if let Some(current_char) = current_char {
+                  hex_text2.push(current_char);
+                }
+              }
+
+              let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
+                Ok(v) => v,
+                Err(_) => {
+                  return Err(ParseStringError {
+                    byte_index: escape_start,
+                    kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2),
+                  });
+                }
+              };
+
+              // Verify it's a low surrogate (0xDC00-0xDFFF)
+              if !(0xDC00..=0xDFFF).contains(&hex_value2) {
+                return Err(ParseStringError {
+                  byte_index: escape_start,
+                  kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
+                    "{} (high surrogate not followed by low surrogate)",
+                    hex_text
+                  )),
+                });
+              }
+
+              // Combine the pair into one code point using the standard UTF-16 decoding formula (RFC 2781 § 2.2)
+              let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;
+
+              match std::char::from_u32(code_point) {
+                Some(c) => c,
+                None => {
+                  return Err(ParseStringError {
+                    byte_index: escape_start,
+                    kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
+                      "{}\\u{} (invalid surrogate pair)",
+                      hex_text, hex_text2
+                    )),
+                  });
+                }
+              }
+            } else if (0xDC00..=0xDFFF).contains(&hex_value) {
+              // Low surrogate without high surrogate
+              return Err(ParseStringError {
+                byte_index: escape_start,
+                kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
+                  "{} (unpaired low surrogate)",
+                  hex_text
+                )),
+              });
+            } else {
+              // Normal unicode escape
+              match std::char::from_u32(hex_value) {
+                Some(hex_char) => hex_char,
+                None => {
+                  return Err(ParseStringError {
+                    byte_index: escape_start,
+                    kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
+                  });
+                }
+              }
+            };
+
             text.push(hex_char);
             last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0);
           } else {

Original file line number	Diff line number	Diff line change
`@@ -620,7 +620,7 @@ mod tests {`
`620`	`620`	`fn it_errors_on_invalid_utf8_char_for_issue_6() {`
`621`	`621`	`assert_has_error(`
`622`	`622`	`"\"\\uDF06\"",`
`623`		`- "Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2",`
	`623`	`+ "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",`
`624`	`624`	`);`
`625`	`625`	`}`
`626`	`626`