Skip to content

Commit af3ade8

Browse files
committed
Use download.sh script instead of wget, refs #2
1 parent 88d79bc commit af3ade8

File tree

2 files changed

+82
-2
lines changed

2 files changed

+82
-2
lines changed

.github/workflows/scrape.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ jobs:
2222
if [ ! -f "scrape.sh" ]; then
2323
echo '#!/bin/bash' > scrape.sh
2424
if [[ "$REPO_DESC" == http://* ]] || [[ "$REPO_DESC" == https://* ]]; then
25-
echo "wget $REPO_DESC" >> scrape.sh
25+
echo "./download.sh $REPO_DESC" >> scrape.sh
2626
else
27-
echo '# wget https://www.example.com/' >> scrape.sh
27+
echo '# ./download.sh https://www.example.com/' >> scrape.sh
2828
fi
2929
chmod +x scrape.sh
3030
echo "Created scrape.sh"

download.sh

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/usr/bin/env bash
2+
#
3+
# download - Simple downloader that always constructs the filename from the URL
4+
# Usage: ./download.sh URL
5+
6+
set -e
7+
8+
#######################################
# Map a file's detected MIME type to a filename extension.
# Arguments: $1 - path of the file to inspect
# Outputs:   the extension, including the leading dot, on stdout
# Notes:     any MIME type not listed below (including an empty result
#            from `file`) yields ".html"
#######################################
get_file_extension() {
  local target="$1"
  local detected=""
  local suffix=".html"   # text/html and every unrecognised type

  # A non-zero status from `file` is deliberately ignored: whatever it
  # printed (possibly nothing) is matched below and falls back to ".html".
  detected=$(file --mime-type -b "$target") || true

  case "$detected" in
    application/json)
      suffix=".json"
      ;;
    text/plain)
      suffix=".txt"
      ;;
    application/javascript)
      suffix=".js"
      ;;
    application/xml | text/xml)
      suffix=".xml"
      ;;
    application/pdf)
      suffix=".pdf"
      ;;
    image/jpeg)
      suffix=".jpg"
      ;;
    image/png)
      suffix=".png"
      ;;
    image/gif)
      suffix=".gif"
      ;;
    image/svg+xml)
      suffix=".svg"
      ;;
    application/zip)
      suffix=".zip"
      ;;
    application/gzip)
      suffix=".gz"
      ;;
    application/x-tar)
      suffix=".tar"
      ;;
    application/x-bzip2)
      suffix=".bz2"
      ;;
  esac

  printf '%s\n' "$suffix"
}
34+
35+
# Main flow: validate the single URL argument, download it to a temporary
# file, derive a filename from the URL plus an extension chosen from the
# downloaded content, then move the result into the current directory.

# Exactly one argument (the URL) is required.
if [[ $# -ne 1 ]]; then
  echo "Usage: $0 URL" >&2
  exit 1
fi

URL="$1"

# Validate URL format (must start with http:// or https://)
if [[ ! "$URL" =~ ^https?:// ]]; then
  echo "Error: URL must start with http:// or https://" >&2
  exit 1
fi

# Download into a temporary file first; the trap removes it on every exit
# path, so a failure in a later step does not leave a stray temp file.
# After the final mv the file no longer exists and rm -f is a no-op.
TEMP_FILE=$(mktemp)
trap 'rm -f -- "$TEMP_FILE"' EXIT

# Download the file.
# --fail makes HTTP 4xx/5xx responses count as errors instead of saving the
# server's error page as if it were the content; --show-error keeps curl's
# own diagnostic visible despite --silent.
echo "Downloading $URL..."
if ! curl --fail --silent --show-error --location --output "$TEMP_FILE" "$URL"; then
  echo "Error: Failed to download $URL" >&2
  exit 1
fi

# Get file extension based on MIME type
EXTENSION=$(get_file_extension "$TEMP_FILE")

# Always construct the filename from the URL: drop the scheme, a leading
# "www.", and one trailing slash, then replace remaining slashes with hyphens.
case "$URL" in
  https://*) FILENAME=${URL#https://} ;;
  *) FILENAME=${URL#http://} ;;
esac
FILENAME=${FILENAME#www.}
FILENAME=${FILENAME%/}
FILENAME=${FILENAME//\//-}

# Make sure we don't end up with just an extension
if [[ -z "$FILENAME" ]]; then
  FILENAME="index"
fi

# Add extension to the filename
FILENAME="${FILENAME}${EXTENSION}"

# Get the current directory to ensure we save to this location
CURRENT_DIR="$(pwd)"
FULL_PATH="${CURRENT_DIR}/${FILENAME}"

# Move to final destination
mv -- "$TEMP_FILE" "$FULL_PATH"
80+

0 commit comments

Comments
 (0)