@@ -151,16 +151,115 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
151151 }
152152 }
153153
154- let hex_u32 = u32:: from_str_radix ( & hex_text, 16 ) ;
155- let hex_char = match hex_u32. ok ( ) . and_then ( std:: char:: from_u32) {
156- Some ( hex_char) => hex_char,
157- None => {
154+ let hex_value = match u32:: from_str_radix ( & hex_text, 16 ) {
155+ Ok ( v) => v,
156+ Err ( _) => {
158157 return Err ( ParseStringError {
159158 byte_index : escape_start,
160159 kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text) ,
161160 } ) ;
162161 }
163162 } ;
163+
164+ // Check if this is a high surrogate (0xD800-0xDBFF)
165+ let hex_char = if ( 0xD800 ..=0xDBFF ) . contains ( & hex_value) {
166+ // High surrogate - must be followed by low surrogate
167+ // Advance past the expected `\u` of the second escape (this consumes input; it is not a peek)
168+ let next_char = chars. move_next_char ( ) ;
169+ if next_char != Some ( '\\' ) {
170+ return Err ( ParseStringError {
171+ byte_index : escape_start,
172+ kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
173+ "{} (unpaired high surrogate)" ,
174+ hex_text
175+ ) ) ,
176+ } ) ;
177+ }
178+
179+ let next_char = chars. move_next_char ( ) ;
180+ if next_char != Some ( 'u' ) {
181+ return Err ( ParseStringError {
182+ byte_index : escape_start,
183+ kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
184+ "{} (unpaired high surrogate)" ,
185+ hex_text
186+ ) ) ,
187+ } ) ;
188+ }
189+
190+ // Parse the second \uXXXX
191+ let mut hex_text2 = String :: new ( ) ;
192+ for _ in 0 ..4 {
193+ let current_char = chars. move_next_char ( ) ;
194+ if !is_hex ( current_char) {
195+ return Err ( ParseStringError {
196+ byte_index : escape_start,
197+ kind : ParseStringErrorKind :: ExpectedFourHexDigits ,
198+ } ) ;
199+ }
200+ if let Some ( current_char) = current_char {
201+ hex_text2. push ( current_char) ;
202+ }
203+ }
204+
205+ let hex_value2 = match u32:: from_str_radix ( & hex_text2, 16 ) {
206+ Ok ( v) => v,
207+ Err ( _) => {
208+ return Err ( ParseStringError {
209+ byte_index : escape_start,
210+ kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text2) ,
211+ } ) ;
212+ }
213+ } ;
214+
215+ // Verify it's a low surrogate (0xDC00-0xDFFF)
216+ if !( 0xDC00 ..=0xDFFF ) . contains ( & hex_value2) {
217+ return Err ( ParseStringError {
218+ byte_index : escape_start,
219+ kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
220+ "{} (high surrogate not followed by low surrogate)" ,
221+ hex_text
222+ ) ) ,
223+ } ) ;
224+ }
225+
226+ // Combine the surrogate pair (required by RFC 8259 section 7) using the UTF-16 decoding formula
227+ let code_point = ( ( hex_value - 0xD800 ) * 0x400 ) + ( hex_value2 - 0xDC00 ) + 0x10000 ;
228+
229+ match std:: char:: from_u32 ( code_point) {
230+ Some ( c) => c,
231+ None => {
232+ return Err ( ParseStringError {
233+ byte_index : escape_start,
234+ kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
235+ "{}\\ u{} (invalid surrogate pair)" ,
236+ hex_text, hex_text2
237+ ) ) ,
238+ } ) ;
239+ }
240+ }
241+ } else if ( 0xDC00 ..=0xDFFF ) . contains ( & hex_value) {
242+ // Low surrogate without high surrogate
243+ return Err ( ParseStringError {
244+ byte_index : escape_start,
245+ kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
246+ "{} (unpaired low surrogate)" ,
247+ hex_text
248+ ) ) ,
249+ } ) ;
250+ } else {
251+ // Normal unicode escape
252+ match std:: char:: from_u32 ( hex_value) {
253+ Some ( hex_char) => hex_char,
254+ None => {
255+ return Err ( ParseStringError {
256+ byte_index : escape_start,
257+ kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text) ,
258+ } ) ;
259+ }
260+ }
261+ } ;
262+
164263 text. push ( hex_char) ;
165264 last_start_byte_index = chars. byte_index ( ) + chars. current_char ( ) . map ( |c| c. len_utf8 ( ) ) . unwrap_or ( 0 ) ;
166265 } else {
0 commit comments