2929# Stop on errors and on usage of unset variables.
3030set -eu
3131
32- VERSION=" 2025.05.26 "
32+ VERSION=" 2025.11.04 "
3333
3434PROGRAM_NAME=" $( basename " $0 " ) "
3535readonly PROGRAM_NAME
@@ -65,7 +65,7 @@ Options:
6565 multiple times, only the last value is considered.
6666
6767 --no-decode-filename: Don't percent-decode the output filename, even if the percent-encoding in
68- the URL was done by wcurl, e.g.: The URL contained whitespaces .
68+ the URL was done by wcurl, e.g.: The URL contained whitespace .
6969
7070 --dry-run: Don't actually execute curl, just print what would be invoked.
7171
@@ -77,7 +77,7 @@ Options:
7777 instead forwarded to the curl invocation.
7878
7979 <URL>: URL to be downloaded. Anything that is not a parameter is considered
80- an URL. Whitespaces are percent-encoded and the URL is passed to curl, which
80+ an URL. Whitespace is percent-encoded and the URL is passed to curl, which
8181 then performs the parsing. May be specified more than once.
8282_EOF_
8383}
@@ -91,7 +91,7 @@ error()
9191
9292# Extra curl options provided by the user.
9393# This is set per-URL for every URL provided.
94- # Some options are global, but we are erroring on the side of needlesly setting
94+ # Some options are global, but we are erring on the side of needlessly setting
9595# them multiple times instead of causing issues with parameters that need to
9696# be set per-URL.
9797CURL_OPTIONS=" "
@@ -113,6 +113,13 @@ readonly PER_URL_PARAMETERS="\
113113 --remote-time \
114114 --retry 5 "
115115
116+ # Valid percent-encode codes that are considered unsafe to be decoded.
117+ # This is a list of space-separated percent-encoded uppercase
118+ # characters.
119+ # 2F = /
120+ # 5C = \
121+ readonly UNSAFE_PERCENT_ENCODE=" 2F 5C"
122+
116123# Whether to invoke curl or not.
117124DRY_RUN=" false"
118125
@@ -133,10 +140,24 @@ sanitize()
# Indicate via exit code whether every character of the first parameter
# belongs to the set of characters given in the second parameter.
# An empty first parameter is never considered a subset.
#
# Returns 0 when ${1} is non-empty and made only of characters from ${2},
# 1 otherwise.
is_subset_of()
{
    case "${1}" in
        # At least one character is outside of the allowed set.
        *[!${2}]*)
            return 1
            ;;
        # Nothing to check: reject the empty string.
        '')
            return 1
            ;;
    esac

    return 0
}
139146
# Indicate via exit code whether the percent-encode code given in the
# first parameter is safe to be decoded.
#
# Arguments:
#   $1 - the code to check, in either letter case, with or without the
#        leading "%" (e.g.: "%2f", "%2F", "2f" and "2F" are equivalent).
# Globals:
#   UNSAFE_PERCENT_ENCODE (read) - space-separated list of uppercase codes
#                                  that must not be decoded; entries may
#                                  be written with or without the "%".
# Returns:
#   0 when the code is safe to decode, 1 when it is listed as unsafe.
is_safe_percent_encode()
{
    # Normalize the input: uppercase it, then drop the optional leading "%"
    # so the comparison does not depend on how the caller spelled the code.
    upper_str=$(printf "%s" "${1}" | tr "[:lower:]" "[:upper:]")
    upper_str="${upper_str#%}"

    # Word-splitting of the list is intended here.
    for unsafe in ${UNSAFE_PERCENT_ENCODE}; do
        # Apply the same "%" normalization to the list entry.
        if [ "${unsafe#%}" = "${upper_str}" ]; then
            return 1
        fi
    done

    return 0
}
160+
140161# Print the given string percent-decoded.
141162percent_decode ()
142163{
@@ -151,9 +172,10 @@ percent_decode()
151172 decode_out=" ${decode_out}${decode_hex2} "
152173 # Skip decoding if this is a control character (00-1F).
153174 # Skip decoding if DECODE_FILENAME is not "true".
154- if is_subset_of " ${decode_hex1} " " 23456789abcdefABCDEF" && \
155- is_subset_of " ${decode_hex2} " " 0123456789abcdefABCDEF" && \
156- [ " ${DECODE_FILENAME} " = " true" ]; then
175+ if [ " ${DECODE_FILENAME} " = " true" ] \
176+ && is_subset_of " ${decode_hex1} " " 23456789abcdefABCDEF" \
177+ && is_subset_of " ${decode_hex2} " " 0123456789abcdefABCDEF" \
178+ && is_safe_percent_encode " ${decode_out} " ; then
157179 # Use printf to decode it into octal and then decode it to the final format.
158180 decode_out=" $( printf " %b" " \\ $( printf %o " 0x${decode_hex1}${decode_hex2} " ) " ) "
159181 fi
@@ -171,7 +193,7 @@ get_url_filename()
171193 # If what remains contains a slash, there's a path; return it percent-decoded.
172194 case " ${hostname_and_path} " in
173195 # sed to remove everything preceding the last '/', e.g.: "example/something" becomes "something"
174- * /* ) percent_decode " $( printf %s " ${hostname_and_path} " | sed -e ' s,^.*/,,' ) " ;;
196+ * /* ) percent_decode " $( printf %s " ${hostname_and_path} " | sed -e ' s,^.*/,,' ) " ;;
175197 esac
176198 # No slash means there was just a hostname and no path; return empty string.
177199}
@@ -181,35 +203,38 @@ exec_curl()
181203{
182204 CMD=" curl "
183205
184- # Store version to check if it supports --no-clobber and --parallel.
206+ # Store version to check if it supports --no-clobber, --parallel and --parallel-max-host .
185207 curl_version=$( $CMD --version | cut -f2 -d' ' | head -n1)
186208 curl_version_major=$( echo " $curl_version " | cut -f1 -d.)
187209 curl_version_minor=$( echo " $curl_version " | cut -f2 -d.)
188210
189- CURL_HAS_NO_CLOBBER =" "
190- CURL_HAS_PARALLEL =" "
211+ CURL_NO_CLOBBER =" "
212+ CURL_PARALLEL =" "
191213 # --no-clobber is only supported since 7.83.0.
192214 # --parallel is only supported since 7.66.0.
215+ # --parallel-max-host is only supported since 8.16.0.
193216 if [ " ${curl_version_major} " -ge 8 ]; then
194- CURL_HAS_NO_CLOBBER=" --no-clobber"
195- CURL_HAS_PARALLEL=" --parallel"
196- elif [ " ${curl_version_major} " -eq 7 ]; then
217+ CURL_NO_CLOBBER=" --no-clobber"
218+ CURL_PARALLEL=" --parallel"
219+ if [ " ${curl_version_minor} " -ge 16 ]; then
220+ CURL_PARALLEL=" --parallel --parallel-max-host 5"
221+ fi
222+ elif [ " ${curl_version_major} " -eq 7 ]; then
197223 if [ " ${curl_version_minor} " -ge 83 ]; then
198- CURL_HAS_NO_CLOBBER =" --no-clobber"
224+ CURL_NO_CLOBBER =" --no-clobber"
199225 fi
200226 if [ " ${curl_version_minor} " -ge 66 ]; then
201- CURL_HAS_PARALLEL =" --parallel"
227+ CURL_PARALLEL =" --parallel"
202228 fi
203229 fi
204230
205- # Detecting whether we need --parallel. It's easier to rely on
231+ # Detecting whether we need --parallel. It's easier to rely on
206232 # the shell's argument parsing.
207233 # shellcheck disable=SC2086
208234 set -- $URLS
209235
210- if [ " $# " -gt 1 ]; then
211- CURL_PARALLEL=" $CURL_HAS_PARALLEL "
212- else
236+ # If there are less than two URLs, don't set the parallel flag.
237+ if [ " $# " -lt 2 ]; then
213238 CURL_PARALLEL=" "
214239 fi
215240
@@ -231,7 +256,7 @@ exec_curl()
231256 [ -z " ${OUTPUT_PATH} " ] && OUTPUT_PATH=index.html
232257 fi
233258 # shellcheck disable=SC2086
234- set -- " $@ " ${NEXT_PARAMETER} ${PER_URL_PARAMETERS} ${CURL_HAS_NO_CLOBBER} ${CURL_OPTIONS} --output " ${OUTPUT_PATH} " " ${url} "
259+ set -- " $@ " ${NEXT_PARAMETER} ${PER_URL_PARAMETERS} ${CURL_NO_CLOBBER} --output " ${OUTPUT_PATH} " ${CURL_OPTIONS} " ${url} "
235260 NEXT_PARAMETER=" --next"
236261 done
237262
@@ -268,13 +293,13 @@ while [ -n "${1-}" ]; do
268293 OUTPUT_PATH=" ${opt} "
269294 ;;
270295
271- -o|-O| --output)
296+ -o | -O | --output)
272297 shift
273298 HAS_USER_SET_OUTPUT=" true"
274299 OUTPUT_PATH=" ${1} "
275300 ;;
276301
277- -o* | -O* )
302+ -o* | -O* )
278303 opt=$( printf " %s\n" " ${1} " | sed ' s/^-[oO]//' )
279304 HAS_USER_SET_OUTPUT=" true"
280305 OUTPUT_PATH=" ${opt} "
@@ -284,12 +309,12 @@ while [ -n "${1-}" ]; do
284309 DECODE_FILENAME=" false"
285310 ;;
286311
287- -h| --help)
312+ -h | --help)
288313 usage
289314 exit 0
290315 ;;
291316
292- -V| --version)
317+ -V | --version)
293318 print_version
294319 exit 0
295320 ;;
@@ -298,7 +323,7 @@ while [ -n "${1-}" ]; do
298323 # This is the start of the list of URLs.
299324 shift
300325 for url in " $@ " ; do
301- # Encode whitespaces into %20, since wget supports those URLs.
326+ # Encode whitespace into %20, since wget supports those URLs.
302327 newurl=$( printf " %s\n" " ${url} " | sed ' s/ /%20/g' )
303328 URLS=" ${URLS} ${newurl} "
304329 done
@@ -311,7 +336,7 @@ while [ -n "${1-}" ]; do
311336
312337 * )
313338 # This must be a URL.
314- # Encode whitespaces into %20, since wget supports those URLs.
339+ # Encode whitespace into %20, since wget supports those URLs.
315340 newurl=$( printf " %s\n" " ${1} " | sed ' s/ /%20/g' )
316341 URLS=" ${URLS} ${newurl} "
317342 ;;
0 commit comments