@@ -133,7 +133,11 @@ private function evaluateBracket(string $expr, mixed $value): array
133133 return [];
134134 }
135135
136- if ('* ' === $ expr ) {
136+ if (str_contains ($ expr , ', ' ) && (str_starts_with ($ trimmed = trim ($ expr ), ', ' ) || str_ends_with ($ trimmed , ', ' ))) {
137+ throw new JsonCrawlerException ($ expr , 'Expression cannot have leading or trailing commas ' );
138+ }
139+
140+ if ('* ' === $ expr = JsonPathUtils::normalizeWhitespace ($ expr )) {
137141 return array_values ($ value );
138142 }
139143
@@ -168,8 +172,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168172 return $ result ;
169173 }
170174
171- // start, end and step
172- if (preg_match ('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/ ' , $ expr , $ matches )) {
175+ if (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ expr , $ matches )) {
173176 if (!array_is_list ($ value )) {
174177 return [];
175178 }
@@ -217,14 +220,12 @@ private function evaluateBracket(string $expr, mixed $value): array
217220
218221 // filter expressions
219222 if (preg_match ('/^\?(.*)$/ ' , $ expr , $ matches )) {
220- $ filterExpr = $ matches [1 ];
221-
222- if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr )) {
223+ if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr = trim ($ matches [1 ]))) {
223224 $ filterExpr = "( $ filterExpr) " ;
224225 }
225226
226227 if (!str_starts_with ($ filterExpr , '( ' )) {
227- throw new JsonCrawlerException ( $ expr , ' Invalid filter expression ' ) ;
228+ $ filterExpr = " ( $ filterExpr ) " ;
228229 }
229230
230231 // remove outer filter parentheses
@@ -235,30 +236,30 @@ private function evaluateBracket(string $expr, mixed $value): array
235236
236237 // comma-separated values, e.g. `['key1', 'key2', 123]` or `[0, 1, 'key']`
237238 if (str_contains ($ expr , ', ' )) {
238- $ parts = $ this -> parseCommaSeparatedValues ($ expr );
239+ $ parts = JsonPathUtils:: parseCommaSeparatedValues ($ expr );
239240
240241 $ result = [];
241- $ keysIndices = array_keys ($ value );
242- $ isList = array_is_list ($ value );
243242
244243 foreach ($ parts as $ part ) {
245244 $ part = trim ($ part );
246245
247- if (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
246+ if ('* ' === $ part ) {
247+ $ result = array_merge ($ result , array_values ($ value ));
248+ } elseif (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ part , $ matches )) {
249+ // slice notation
250+ $ sliceResult = $ this ->evaluateBracket ($ part , $ value );
251+ $ result = array_merge ($ result , $ sliceResult );
252+ } elseif (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
248253 $ key = JsonPathUtils::unescapeString ($ matches [2 ], $ matches [1 ]);
249254
250- if ($ isList ) {
255+ if (array_is_list ($ value )) {
256+ // for arrays, find ALL objects that contain this key
251257 foreach ($ value as $ item ) {
252258 if (\is_array ($ item ) && \array_key_exists ($ key , $ item )) {
253259 $ result [] = $ item ;
254- break ;
255260 }
256261 }
257-
258- continue ; // no results here
259- }
260-
261- if (\array_key_exists ($ key , $ value )) {
262+ } elseif (\array_key_exists ($ key , $ value )) { // for objects, get the value for this key
262263 $ result [] = $ value [$ key ];
263264 }
264265 } elseif (preg_match ('/^-?\d+$/ ' , $ part )) {
@@ -268,14 +269,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268269 $ index = \count ($ value ) + $ index ;
269270 }
270271
271- if ($ isList && \array_key_exists ($ index , $ value )) {
272+ if (array_is_list ( $ value ) && \array_key_exists ($ index , $ value )) {
272273 $ result [] = $ value [$ index ];
273- continue ;
274- }
275-
276- // numeric index on a hashmap
277- if ( isset ( $ keysIndices [ $ index ]) && isset ( $ value [$ keysIndices [$ index ]])) {
278- $ result [] = $ value [ $ keysIndices [ $ index ]];
274+ } else {
275+ // numeric index on a hashmap
276+ $ keysIndices = array_keys ( $ value );
277+ if ( isset ( $ keysIndices [ $ index]) && isset ( $ value [ $ keysIndices [ $ index ]])) {
278+ $ result [] = $ value [$ keysIndices [$ index ]];
279+ }
279280 }
280281 }
281282 }
@@ -310,7 +311,29 @@ private function evaluateFilter(string $expr, mixed $value): array
310311
311312 private function evaluateFilterExpression (string $ expr , mixed $ context ): bool
312313 {
313- $ expr = trim ($ expr );
314+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
315+
316+ // remove outer parentheses if they wrap the entire expression
317+ if (str_starts_with ($ expr , '( ' ) && str_ends_with ($ expr , ') ' )) {
318+ $ depth = 0 ;
319+ $ isWrapped = true ;
320+ $ i = -1 ;
321+ while (null !== $ char = $ expr [++$ i ] ?? null ) {
322+ if ('( ' === $ char ) {
323+ ++$ depth ;
324+ } elseif (') ' === $ char && 0 === --$ depth && isset ($ expr [$ i + 1 ])) {
325+ $ isWrapped = false ;
326+ break ;
327+ }
328+ }
329+ if ($ isWrapped ) {
330+ $ expr = trim (substr ($ expr , 1 , -1 ));
331+ }
332+ }
333+
334+ if (str_starts_with ($ expr , '! ' )) {
335+ return !$ this ->evaluateFilterExpression (trim (substr ($ expr , 1 )), $ context );
336+ }
314337
315338 if (str_contains ($ expr , '&& ' )) {
316339 $ parts = array_map ('trim ' , explode ('&& ' , $ expr ));
@@ -353,8 +376,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353376 }
354377
355378 // function calls
356- if (preg_match ('/^(\w+) \((.*)\)$/ ' , $ expr , $ matches )) {
357- $ functionName = $ matches [1 ];
379+ if (preg_match ('/^(\w++)\s*+ \((.*)\)$/ ' , $ expr , $ matches )) {
380+ $ functionName = trim ( $ matches [1 ]) ;
358381 if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
359382 throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
360383 }
@@ -369,8 +392,15 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369392
370393 private function evaluateScalar (string $ expr , mixed $ context ): mixed
371394 {
372- if (is_numeric ($ expr )) {
373- return str_contains ($ expr , '. ' ) ? (float ) $ expr : (int ) $ expr ;
395+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
396+
397+ if (JsonPathUtils::isJsonNumber ($ expr )) {
398+ return str_contains ($ expr , '. ' ) || str_contains (strtolower ($ expr ), 'e ' ) ? (float ) $ expr : (int ) $ expr ;
399+ }
400+
401+ // only validate tokens that look like standalone numbers
402+ if (preg_match ('/^[\d+\-.eE]+$/ ' , $ expr ) && preg_match ('/\d/ ' , $ expr )) {
403+ throw new JsonCrawlerException ($ expr , \sprintf ('Invalid number format "%s" ' , $ expr ));
374404 }
375405
376406 if ('@ ' === $ expr ) {
@@ -404,9 +434,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404434 }
405435
406436 // function calls
407- if (preg_match ('/^(\w+)\((.*)\)$/ ' , $ expr , $ matches )) {
408- $ functionName = $ matches [1 ];
409- if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
437+ if (preg_match ('/^(\w++)\((.*)\)$/ ' , $ expr , $ matches )) {
438+ if (!isset (self ::RFC9535_FUNCTIONS [$ functionName = trim ($ matches [1 ])])) {
410439 throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
411440 }
412441
@@ -416,31 +445,60 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416445 return null ;
417446 }
418447
/**
 * Evaluates a JSONPath function extension (length, count, match, search, value).
 *
 * Each comma-separated argument is resolved to a value and, alongside it, the
 * size of the nodelist it came from:
 *  - arguments starting with "$" are evaluated as absolute paths against the document;
 *  - arguments not starting with "@" are evaluated as scalars (nodelist size 1);
 *  - a bare "@" is the current context (nodelist size 1);
 *  - "@[...]" and "@.x" are evaluated relative to the context, or yield
 *    null / size 0 when the context is not an array.
 *
 * Only the first argument's nodelist size is consulted (by count() and value()).
 *
 * @param string $name    the function name
 * @param string $args    the raw, unparsed argument list found between the parentheses
 * @param mixed  $context the current node ("@") the function is evaluated against
 *
 * @return mixed the function result, or null for an unknown function name
 */
private function evaluateFunction(string $name, string $args, mixed $context): mixed
{
    $argList = [];
    $nodelistSizes = [];

    // Compare against '' explicitly: a lone "0" argument is falsy in PHP and
    // would otherwise be treated as an empty argument list.
    if ('' !== $args = trim($args)) {
        foreach (JsonPathUtils::parseCommaSeparatedValues($args) as $arg) {
            $arg = trim($arg);

            if (str_starts_with($arg, '$')) { // special handling for absolute paths
                $results = $this->evaluate(new JsonPath($arg));
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            } elseif (!str_starts_with($arg, '@')) { // literals and nested expressions
                $argList[] = $this->evaluateScalar($arg, $context);
                $nodelistSizes[] = 1;
            } elseif ('@' === $arg) {
                $argList[] = $context;
                $nodelistSizes[] = 1;
            } elseif (!\is_array($context)) {
                // a relative path cannot select anything below a non-array context
                $argList[] = null;
                $nodelistSizes[] = 0;
            } elseif (str_starts_with($pathPart = substr($arg, 1), '[')) {
                // handle bracket expressions like @['a','d']
                $results = $this->evaluateBracket(substr($pathPart, 1, -1), $context);
                $argList[] = $results;
                $nodelistSizes[] = \count($results);
            } else {
                // handle dot notation like @.a
                $results = $this->evaluateTokensOnDecodedData(JsonPathTokenizer::tokenize(new JsonPath('$'.$pathPart)), $context);
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            }
        }
    }

    $value = $argList[0] ?? null;
    $nodelistSize = $nodelistSizes[0] ?? 0;

    return match ($name) {
        'length' => match (true) {
            \is_string($value) => mb_strlen($value),
            \is_array($value) => \count($value),
            default => 0,
        },
        'count' => $nodelistSize,
        'match' => match (true) {
            // The pattern is wrapped in a non-capturing group so that a top-level
            // alternation ("a|b") stays anchored on both sides, and the D modifier
            // stops "$" from also matching before a trailing newline.
            \is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match(\sprintf('/^(?:%s)$/Du', $this->transformJsonPathRegex($argList[1])), $value),
            default => false,
        },
        'search' => match (true) {
            \is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match("/{$this->transformJsonPathRegex($argList[1])}/u", $value),
            default => false,
        },
        'value' => match (true) {
            // more than one node: no single value can be returned
            1 < $nodelistSize => null,
            // exactly one node held in a list: unwrap it
            1 === $nodelistSize && \is_array($value) => $value[0] ?? null,
            default => $value,
        },
        default => null,
    };
}
@@ -474,43 +532,51 @@ private function compare(mixed $left, mixed $right, string $operator): bool
474532 };
475533 }
476534
/**
 * Transforms a JSONPath regex pattern into a PCRE fragment that complies with RFC 9535.
 *
 * Two rewrites are applied while scanning the pattern byte by byte:
 *  - an unescaped "." outside a character class is replaced so that it does not
 *    match \r or \n but does match the Unicode line separators U+2028 and U+2029;
 *  - an unescaped "/" is escaped, because the result is embedded between "/"
 *    delimiters and a bare "/" would terminate the regex prematurely.
 *
 * Escaped characters (anything following a backslash) are copied through verbatim.
 *
 * @param string $pattern the regex exactly as written in the JSONPath expression
 *
 * @return string the pattern, safe to embed in a "/.../u" PCRE regex
 */
private function transformJsonPathRegex(string $pattern): string
{
    $result = '';
    $inCharClass = false;
    $escaped = false;
    $i = -1;

    while (null !== $char = $pattern[++$i] ?? null) {
        if ($escaped) {
            // the previous character was a backslash: keep this one untouched
            $result .= $char;
            $escaped = false;
            continue;
        }

        if ('\\' === $char) {
            $result .= $char;
            $escaped = true;
            continue;
        }

        if ('/' === $char) {
            // The delimiter must be escaped everywhere, including inside a
            // character class: PHP looks for the closing delimiter before PCRE
            // gets to interpret the brackets.
            $result .= '\\/';
            continue;
        }

        if ('[' === $char && !$inCharClass) {
            $inCharClass = true;
            $result .= $char;
            continue;
        }

        if (']' === $char && $inCharClass) {
            $inCharClass = false;
            $result .= $char;
            continue;
        }

        if ('.' === $char && !$inCharClass) {
            $result .= '(?:[^\r\n]|\x{2028}|\x{2029})';
        } else {
            $result .= $char;
        }
    }

    return $result;
}
516582}
0 commit comments