@@ -136,130 +136,10 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
136136 let text = text. as_mut ( ) . unwrap ( ) ;
137137 text. push_str ( previous_text) ;
138138 if current_char == 'u' {
139- let mut hex_text = String :: new ( ) ;
140- // expect four hex values
141- for _ in 0 ..4 {
142- let current_char = chars. move_next_char ( ) ;
143- if !is_hex ( current_char) {
144- return Err ( ParseStringError {
145- byte_index : escape_start,
146- kind : ParseStringErrorKind :: ExpectedFourHexDigits ,
147- } ) ;
148- }
149- if let Some ( current_char) = current_char {
150- hex_text. push ( current_char) ;
151- }
152- }
153-
154- let hex_value = match u32:: from_str_radix ( & hex_text, 16 ) {
155- Ok ( v) => v,
156- Err ( _) => {
157- return Err ( ParseStringError {
158- byte_index : escape_start,
159- kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text) ,
160- } ) ;
161- }
162- } ;
163-
164- // Check if this is a high surrogate (0xD800-0xDBFF)
165- let hex_char = if ( 0xD800 ..=0xDBFF ) . contains ( & hex_value) {
166- // High surrogate - must be followed by low surrogate
167- // Peek ahead for \uXXXX pattern
168- let next_char = chars. move_next_char ( ) ;
169- if next_char != Some ( '\\' ) {
170- return Err ( ParseStringError {
171- byte_index : escape_start,
172- kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
173- "{} (unpaired high surrogate)" ,
174- hex_text
175- ) ) ,
176- } ) ;
177- }
178-
179- let next_char = chars. move_next_char ( ) ;
180- if next_char != Some ( 'u' ) {
181- return Err ( ParseStringError {
182- byte_index : escape_start,
183- kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
184- "{} (unpaired high surrogate)" ,
185- hex_text
186- ) ) ,
187- } ) ;
188- }
189-
190- // Parse the second \uXXXX
191- let mut hex_text2 = String :: new ( ) ;
192- for _ in 0 ..4 {
193- let current_char = chars. move_next_char ( ) ;
194- if !is_hex ( current_char) {
195- return Err ( ParseStringError {
196- byte_index : escape_start,
197- kind : ParseStringErrorKind :: ExpectedFourHexDigits ,
198- } ) ;
199- }
200- if let Some ( current_char) = current_char {
201- hex_text2. push ( current_char) ;
202- }
203- }
204-
205- let hex_value2 = match u32:: from_str_radix ( & hex_text2, 16 ) {
206- Ok ( v) => v,
207- Err ( _) => {
208- return Err ( ParseStringError {
209- byte_index : escape_start,
210- kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text2) ,
211- } ) ;
212- }
213- } ;
214-
215- // Verify it's a low surrogate (0xDC00-0xDFFF)
216- if !( 0xDC00 ..=0xDFFF ) . contains ( & hex_value2) {
217- return Err ( ParseStringError {
218- byte_index : escape_start,
219- kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
220- "{} (high surrogate not followed by low surrogate)" ,
221- hex_text
222- ) ) ,
223- } ) ;
224- }
225-
226- // Combine surrogate pair using RFC 8259 formula
227- let code_point = ( ( hex_value - 0xD800 ) * 0x400 ) + ( hex_value2 - 0xDC00 ) + 0x10000 ;
228-
229- match std:: char:: from_u32 ( code_point) {
230- Some ( c) => c,
231- None => {
232- return Err ( ParseStringError {
233- byte_index : escape_start,
234- kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
235- "{}\\ u{} (invalid surrogate pair)" ,
236- hex_text, hex_text2
237- ) ) ,
238- } ) ;
239- }
240- }
241- } else if ( 0xDC00 ..=0xDFFF ) . contains ( & hex_value) {
242- // Low surrogate without high surrogate
243- return Err ( ParseStringError {
244- byte_index : escape_start,
245- kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
246- "{} (unpaired low surrogate)" ,
247- hex_text
248- ) ) ,
249- } ) ;
250- } else {
251- // Normal unicode escape
252- match std:: char:: from_u32 ( hex_value) {
253- Some ( hex_char) => hex_char,
254- None => {
255- return Err ( ParseStringError {
256- byte_index : escape_start,
257- kind : ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text) ,
258- } ) ;
259- }
260- }
261- } ;
262-
139+ let hex_char = parse_hex_char ( chars) . map_err ( |kind| ParseStringError {
140+ byte_index : escape_start,
141+ kind,
142+ } ) ?;
263143 text. push ( hex_char) ;
264144 last_start_byte_index = chars. byte_index ( ) + chars. current_char ( ) . map ( |c| c. len_utf8 ( ) ) . unwrap_or ( 0 ) ;
265145 } else {
@@ -308,6 +188,103 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
308188 }
309189}
310190
191+ fn parse_hex_char < ' a , T : CharProvider < ' a > > ( chars : & mut T ) -> Result < char , ParseStringErrorKind > {
192+ let mut hex_text = String :: new ( ) ;
193+ // expect four hex values
194+ for _ in 0 ..4 {
195+ let current_char = chars. move_next_char ( ) ;
196+ if !is_hex ( current_char) {
197+ return Err ( ParseStringErrorKind :: ExpectedFourHexDigits ) ;
198+ }
199+ if let Some ( current_char) = current_char {
200+ hex_text. push ( current_char) ;
201+ }
202+ }
203+
204+ let hex_value = match u32:: from_str_radix ( & hex_text, 16 ) {
205+ Ok ( v) => v,
206+ Err ( _) => {
207+ return Err ( ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text) ) ;
208+ }
209+ } ;
210+
211+ // Check if this is a high surrogate (0xD800-0xDBFF)
212+ let hex_char = if ( 0xD800 ..=0xDBFF ) . contains ( & hex_value) {
213+ // High surrogate - must be followed by low surrogate
214+ // Peek ahead for \uXXXX pattern
215+ let next_char = chars. move_next_char ( ) ;
216+ if next_char != Some ( '\\' ) {
217+ return Err ( ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
218+ "{} (unpaired high surrogate)" ,
219+ hex_text
220+ ) ) ) ;
221+ }
222+
223+ let next_char = chars. move_next_char ( ) ;
224+ if next_char != Some ( 'u' ) {
225+ return Err ( ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
226+ "{} (unpaired high surrogate)" ,
227+ hex_text
228+ ) ) ) ;
229+ }
230+
231+ // Parse the second \uXXXX
232+ let mut hex_text2 = String :: new ( ) ;
233+ for _ in 0 ..4 {
234+ let current_char = chars. move_next_char ( ) ;
235+ if !is_hex ( current_char) {
236+ return Err ( ParseStringErrorKind :: ExpectedFourHexDigits ) ;
237+ }
238+ if let Some ( current_char) = current_char {
239+ hex_text2. push ( current_char) ;
240+ }
241+ }
242+
243+ let hex_value2 = match u32:: from_str_radix ( & hex_text2, 16 ) {
244+ Ok ( v) => v,
245+ Err ( _) => {
246+ return Err ( ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text2) ) ;
247+ }
248+ } ;
249+
250+ // Verify it's a low surrogate (0xDC00-0xDFFF)
251+ if !( 0xDC00 ..=0xDFFF ) . contains ( & hex_value2) {
252+ return Err ( ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
253+ "{} (high surrogate not followed by low surrogate)" ,
254+ hex_text
255+ ) ) ) ;
256+ }
257+
258+ // Combine surrogate pair using RFC 8259 formula
259+ let code_point = ( ( hex_value - 0xD800 ) * 0x400 ) + ( hex_value2 - 0xDC00 ) + 0x10000 ;
260+
261+ match std:: char:: from_u32 ( code_point) {
262+ Some ( c) => c,
263+ None => {
264+ return Err ( ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
265+ "{}\\ u{} (invalid surrogate pair)" ,
266+ hex_text, hex_text2
267+ ) ) ) ;
268+ }
269+ }
270+ } else if ( 0xDC00 ..=0xDFFF ) . contains ( & hex_value) {
271+ // Low surrogate without high surrogate
272+ return Err ( ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( format ! (
273+ "{} (unpaired low surrogate)" ,
274+ hex_text
275+ ) ) ) ;
276+ } else {
277+ // Normal unicode escape
278+ match std:: char:: from_u32 ( hex_value) {
279+ Some ( hex_char) => hex_char,
280+ None => {
281+ return Err ( ParseStringErrorKind :: InvalidUnicodeEscapeSequence ( hex_text) ) ;
282+ }
283+ }
284+ } ;
285+ Ok ( hex_char)
286+ }
287+
311288fn is_hex ( c : Option < char > ) -> bool {
312289 let Some ( c) = c else {
313290 return false ;
0 commit comments