@@ -69,6 +69,19 @@ PathChars()
6969 return pathValid;
7070}
7171
72+ // / Characters which are valid within a URI query section
73+ static const CharacterSet &
74+ QueryChars ()
75+ {
76+ /*
77+ * RFC 3986 section 3.4
78+ *
79+ * query = *( pchar / "/" / "?" )
80+ */
81+ static const auto queryValid = CharacterSet (" query" , " /?" ) + PathChars ();
82+ return queryValid;
83+ }
84+
7285/* *
7386 * Governed by RFC 3986 section 2.1
7487 */
@@ -208,6 +221,22 @@ AnyP::Uri::path() const
208221 return path_;
209222}
210223
224+ const SBuf &
225+ AnyP::Uri::query () const
226+ {
227+ /*
228+ * RFC 3986 section 3.4:
229+ *
230+ * query = *( pchar / "/" / "?" )
231+ */
232+
233+ static SBuf empty;
234+ if (scheme_ == AnyP::PROTO_FTP)
235+ return empty; // FTP does not have a query section
236+
237+ return query_;
238+ }
239+
211240void
212241urlInitialize (void )
213242{
@@ -310,6 +339,47 @@ urlAppendDomain(char *host)
310339 return true ;
311340}
312341
342+ // / returns whether whitespace has 'chopped' the URL apart.
343+ static bool
344+ StripAnyWsp (char *buf)
345+ {
346+ if (stringHasWhitespace (buf)) {
347+ switch (Config.uri_whitespace ) {
348+
349+ case URI_WHITESPACE_DENY:
350+ throw TextException (" URI has whitespace" , Here ());
351+
352+ case URI_WHITESPACE_ALLOW:
353+ break ;
354+
355+ case URI_WHITESPACE_ENCODE: {
356+ auto *t = rfc1738_escape_unescaped (buf);
357+ xstrncpy (buf, t, MAX_URL);
358+ }
359+ break ;
360+
361+ case URI_WHITESPACE_CHOP:
362+ *(buf + strcspn (buf, w_space)) = ' \0 ' ;
363+ return true ;
364+
365+ case URI_WHITESPACE_STRIP:
366+ default : {
367+ char *t = buf;
368+ char *q = buf;
369+ while (*t) {
370+ if (!xisspace (*t)) {
371+ *q = *t;
372+ ++q;
373+ }
374+ ++t;
375+ }
376+ *q = ' \0 ' ;
377+ }
378+ }
379+ }
380+ return false ;
381+ }
382+
313383/*
314384 * Parse a URI/URL.
315385 *
@@ -330,14 +400,14 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
330400 LOCAL_ARRAY (char , login, MAX_URL);
331401 LOCAL_ARRAY (char , foundHost, MAX_URL);
332402 LOCAL_ARRAY (char , urlpath, MAX_URL);
403+ LOCAL_ARRAY (char , foundQuery, MAX_URL);
333404 char *t = nullptr ;
334- char *q = nullptr ;
335405 int foundPort;
336406 int l;
337407 int i;
338408 const char *src;
339409 char *dst;
340- foundHost[0 ] = urlpath[0 ] = login[0 ] = ' \0 ' ;
410+ foundHost[0 ] = urlpath[0 ] = foundQuery[ 0 ] = login[0 ] = ' \0 ' ;
341411
342412 if ((l = rawUrl.length ()) + Config.appendDomainLen > (MAX_URL - 1 )) {
343413 debugs (23 , DBG_IMPORTANT, MYNAME << " URL too large (" << l << " bytes)" );
@@ -411,23 +481,56 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
411481 return false ;
412482 *dst = ' \0 ' ;
413483
414- // We are looking at path-abempty.
415- if (*src != ' /' ) {
416- // path-empty, including the end of the `src` c-string cases
417- urlpath[0 ] = ' /' ;
418- dst = &urlpath[1 ];
419- } else {
484+ bool chopped = false ;
485+ if (*src == ' /' ) {
420486 dst = urlpath;
421- }
422- /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
423- for (; i < l && *src != ' \r ' && *src != ' \n ' && *src != ' \0 ' ; ++i, ++src, ++dst) {
424- *dst = *src;
487+ /* Then everything from / (inclusive) until '?', '#', '\r\n' or '\0' - that's urlpath */
488+ for (; i < l && *src != ' ?' && *src != ' #' && *src != ' \r ' && *src != ' \n ' && *src != ' \0 ' ; ++i, ++src, ++dst) {
489+ *dst = *src;
490+ }
491+ *dst = ' \0 ' ;
492+ chopped = StripAnyWsp (urlpath);
493+ } else {
494+ // We should be looking at path-abempty. relative-path is not supported (yet).
495+ urlpath[0 ] = ' /' ;
496+ urlpath[1 ] = ' \0 ' ;
425497 }
426498
427499 /* We -could- be at the end of the buffer here */
428500 if (i > l)
429501 return false ;
430- *dst = ' \0 ' ;
502+
503+ if (*src == ' ?' ) {
504+ dst = foundQuery;
505+ ++src; // skip '?' delimiter
506+ ++i; // keep track of bytes 'consumed' from src
507+ /* Then everything from '?' (exclusive) until '#', '\r\n' or '\0' - that's query */
508+ for (; i < l && *src != ' #' && *src != ' \r ' && *src != ' \n ' && *src != ' \0 ' ; ++i, ++src, ++dst) {
509+ *dst = *src;
510+ }
511+ *dst = ' \0 ' ;
512+ if (chopped)
513+ *foundQuery = ' \0 ' ;
514+ else
515+ chopped = StripAnyWsp (foundQuery);
516+ }
517+ if (i > l)
518+ return false ;
519+
520+ if (*src == ' #' ) {
521+ ++src; // skip '#' delimiter
522+ ++i; // keep track of bytes 'consumed' from src
523+ /* Then everything from '#' (exclusive) until '\r\n' or '\0' - that's fragment */
524+ for (; i < l && *src != ' \r ' && *src != ' \n ' && *src != ' \0 ' ; ++i, ++src) {
525+ ; // fragment component is not expected in network protocols. discard for now.
526+ }
527+ }
528+
529+ if (i > l)
530+ return false ;
531+
532+ if (*src != ' \r ' && *src != ' \n ' && *src != ' \0 ' )
533+ throw TextException (" invalid URL" , Here ());
431534
432535 // If the parsed scheme has no (known) default port, and there is no
433536 // explicit port, then we will reject the zero port during foundPort
@@ -490,27 +593,20 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
490593 *t = ' \0 ' ;
491594 ++t;
492595 foundPort = atoi (t);
596+ // BUG: fails to detect non-DIGIT garbage like "http://localhost:80fubar"
493597 }
494598 }
495599
496600 for (t = foundHost; *t; ++t)
497601 *t = xtolower (*t);
498602
499- if (stringHasWhitespace (foundHost)) {
500- if (URI_WHITESPACE_STRIP == Config.uri_whitespace ) {
501- t = q = foundHost;
502- while (*t) {
503- if (!xisspace (*t)) {
504- *q = *t;
505- ++q;
506- }
507- ++t;
508- }
509- *q = ' \0 ' ;
510- }
603+ if (StripAnyWsp (foundHost)) {
604+ urlpath[0 ] = ' /' ;
605+ urlpath[1 ] = ' \0 ' ;
606+ *foundQuery = ' \0 ' ;
511607 }
512608
513- debugs (23 , 3 , " Split URL '" << rawUrl << " ' into proto='" << scheme.image () << " ', host='" << foundHost << " ', port='" << foundPort << " ', path='" << urlpath << " '" );
609+ debugs (23 , 3 , " Split URL '" << rawUrl << " ' into proto='" << scheme.image () << " ', host='" << foundHost << " ', port='" << foundPort << " ', path='" << urlpath << " ', query=' " << foundQuery << " ' " );
514610
515611 if (Config.onoff .check_hostnames &&
516612 strspn (foundHost, Config.onoff .allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen (foundHost)) {
@@ -536,41 +632,8 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
536632 return false ;
537633 }
538634
539- if (stringHasWhitespace (urlpath)) {
540- debugs (23 , 2 , " URI has whitespace: {" << rawUrl << " }" );
541-
542- switch (Config.uri_whitespace ) {
543-
544- case URI_WHITESPACE_DENY:
545- return false ;
546-
547- case URI_WHITESPACE_ALLOW:
548- break ;
549-
550- case URI_WHITESPACE_ENCODE:
551- t = rfc1738_escape_unescaped (urlpath);
552- xstrncpy (urlpath, t, MAX_URL);
553- break ;
554-
555- case URI_WHITESPACE_CHOP:
556- *(urlpath + strcspn (urlpath, w_space)) = ' \0 ' ;
557- break ;
558-
559- case URI_WHITESPACE_STRIP:
560- default :
561- t = q = urlpath;
562- while (*t) {
563- if (!xisspace (*t)) {
564- *q = *t;
565- ++q;
566- }
567- ++t;
568- }
569- *q = ' \0 ' ;
570- }
571- }
572-
573635 setScheme (scheme);
636+ query (SBuf (foundQuery));
574637 path (urlpath);
575638 host (foundHost);
576639 userInfo (SBuf (login));
@@ -775,8 +838,12 @@ SBuf &
775838AnyP::Uri::absolutePath () const
776839{
777840 if (absolutePath_.isEmpty ()) {
778- // TODO: Encode each URI subcomponent in path_ as needed.
779- absolutePath_ = Encode (path (), PathChars ());
841+ if (!path ().isEmpty ())
842+ absolutePath_ = Encode (path (), PathChars ());
843+ if (!query ().isEmpty ()) {
844+ absolutePath_.append (" ?" );
845+ absolutePath_.append (Encode (query (), QueryChars ()));
846+ }
780847 }
781848
782849 return absolutePath_;
0 commit comments