Commit 659c8f8

Implemented initial setup script and wrote README
1 parent 01e0907

2 files changed: +99 -0

.github/workflows/scrape.yml

Lines changed: 81 additions & 0 deletions

```yaml
name: Scrape

on:
  push:
  workflow_dispatch:
  schedule:
    # Daily at 6:23 AM UTC
    - cron: '23 6 * * *'
    # For hourly at 42 minutes past the hour: '42 * * * *'

permissions:
  contents: write

jobs:
  setup: # Delete this job after it first runs if you like
    runs-on: ubuntu-latest
    if: ${{ !github.event.repository.is_template }}
    steps:
      - uses: actions/checkout@v4
        if: ${{ always() && !hashFiles('scrape.sh') }}
      - name: Create scrape.sh (using github context)
        if: ${{ always() && !hashFiles('scrape.sh') }}
        run: |
          if [ ! -f "scrape.sh" ]; then
            echo '#!/bin/bash' > scrape.sh
            if [[ "$REPO_DESC" == http://* ]] || [[ "$REPO_DESC" == https://* ]]; then
              echo "wget $REPO_DESC" >> scrape.sh
            else
              echo '# wget https://www.example.com/' >> scrape.sh
            fi
            chmod +x scrape.sh
          fi
          # Now push that to git
          git config user.name "Automated"
          git config user.email "[email protected]"
          git add scrape.sh
          timestamp=$(date -u)
          git commit -m "${timestamp}" || exit 0
          git pull --rebase
          git push
        env:
          REPO_DESC: ${{ github.event.repository.description }}

  scrape:
    runs-on: ubuntu-latest
    if: ${{ !github.event.repository.is_template }}
    steps:
      - uses: actions/checkout@v4
      # Uncomment to use Python:
      # - name: Set up Python 3.13
      #   uses: actions/setup-python@v5
      #   with:
      #     python-version: "3.13"
      #     cache: "pip"
      # - name: Install dependencies
      #   run: |
      #     pip install -r requirements.txt
      # Uncomment to use Playwright via shot-scraper (put shot-scraper in requirements.txt):
      # - name: Cache Playwright browsers
      #   uses: actions/cache@v4
      #   with:
      #     path: ~/.cache/ms-playwright/
      #     key: ${{ runner.os }}-browsers
      # - name: Install Playwright dependencies
      #   run: |
      #     shot-scraper install
      - name: Run the scraper
        run: |
          if [ ! -x scrape.sh ]; then
            chmod 755 scrape.sh
          fi
          ./scrape.sh
      - name: Commit and push
        run: |-
          git config user.name "Automated"
          git config user.email "[email protected]"
          git add -A
          timestamp=$(date -u)
          git commit -m "${timestamp}" || exit 0
          git pull --rebase
          git push
```
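
For example, if the repository description were set to a hypothetical URL such as `https://www.example.com/data.json`, the setup job above would generate a `scrape.sh` along these lines:

```bash
#!/bin/bash
wget https://www.example.com/data.json  # example URL taken from the repository description
```

If the description is not a URL, the script is instead created with a commented-out placeholder `wget` line for you to edit.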

README.md

Lines changed: 18 additions & 0 deletions

# git-scraper-template

Template repository for setting up a new [git scraper](https://simonwillison.net/2020/Oct/9/git-scraping/) using GitHub Actions.

## How to use this

Visit https://github.com/simonw/git-scraper-template/generate

Pick a name for your new repository, then paste **the URL** of the page you would like to scrape into the **description field** (including the `http://` or `https://`). JSON works best, but any URL will be fetched and saved.

Then click **Create repository from template**.

Your new repository will be created, and a script will run which will do the following:

- Add a `scrape.sh` script to your repository which uses `wget` to fetch the URL you requested
- Run that `wget` command, write the result to the repository and commit it
- Configure a schedule to run this script once every 24 hours

You can edit `scrape.sh` to customize what is scraped, and you can edit `.github/workflows/scrape.yml` to change how often the scraping happens.
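
As an illustrative sketch (the URL and the use of `jq` are assumptions, not part of the template), a customized `scrape.sh` might fetch a JSON endpoint and pretty-print it so each run produces a readable, line-oriented diff:

```bash
#!/bin/bash
# Hypothetical customization: fetch a JSON API and pretty-print it
# with jq (preinstalled on GitHub's ubuntu-latest runners) so that
# changes show up as small, reviewable diffs in the commit history.
curl -s https://www.example.com/api/items.json | jq . > items.json
```
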
If you want to use Python in your scraper, you can uncomment the relevant block in `scrape.yml` and add a `requirements.txt` file to your repository containing any dependencies you need.
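
As a minimal sketch (assuming a hypothetical `scrape.py` that you write yourself), `scrape.sh` could then simply delegate to Python:

```bash
#!/bin/bash
# Hypothetical: hand the actual scraping off to your own Python script.
# List its dependencies (e.g. requests) in requirements.txt so the
# workflow's "Install dependencies" step installs them.
python scrape.py
```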
