Skip to content

Commit 008c61b

Browse files
committed
scripts for managing the archive and mirror machines
1 parent 6b0bf44 commit 008c61b

File tree

7 files changed

+164
-0
lines changed

7 files changed

+164
-0
lines changed

machine/archive/README.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Archive
2+
3+
This machine contains a local copy of all wort signatures,
4+
synced from AWS S3 buckets.
5+
6+
```
7+
.
8+
├── branchwater-index
9+
├── filelists
10+
│   ├── genomes_20250829
11+
│   ├── img_20250829
12+
│   └── sra_20250829
13+
├── manifests
14+
│   ├── genomes_20250829.csv
15+
│   ├── img_20250829.csv
16+
│   └── sra_20250829.csv
17+
├── s3_sync.sh
18+
├── scripts
19+
│   ├── csv_to_parquet.py
20+
│   ├── genomes.sbatch
21+
│   ├── img.sbatch
22+
│   └── sra.sbatch
23+
├── slurm_logs/
24+
├── update_logs/
25+
│   ├── 20250825_genomes
26+
│   ├── 20250825_img
27+
│   └── 20250825_sra
28+
├── wort-genomes
29+
│   └── sigs
30+
├── wort-img
31+
│   └── sigs
32+
└── wort-sra
33+
    └── sigs
34+
```
35+
36+
## The `s3_sync.sh` script
37+
38+
## Updating manifests
39+

machine/archive/s3_sync.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/bin/bash
# Sync the wort-sra, wort-genomes and wort-img S3 buckets into local
# directories of the same name, running the three syncs in parallel and
# writing each one's output to update_logs/<YYYYMMDD>_<name>.

set -ex

mkdir -p update_logs/

# Compute the date stamp once so all three logs share it, even if the
# script happens to start right before midnight.
stamp="$(date +%Y%m%d)"

# using s5cmd
pids=()
for name in sra genomes img; do
    pixi exec s5cmd --stat --json sync --size-only "s3://wort-${name}/*" "wort-${name}/" &> "update_logs/${stamp}_${name}" &
    pids+=("$!")
done

# A bare `wait` always returns 0 and `set -e` does not apply to background
# jobs, so wait on each PID and remember any failure.
status=0
for pid in "${pids[@]}"; do
    wait "$pid" || status=$?
done

exit "$status"
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# /// script
2+
# requires-python = ">=3.10"
3+
# dependencies = [
4+
# "polars",
5+
# ]
6+
# ///
7+
8+
import polars as pl
9+
10+
11+
def main(args):
    """Join a manifest CSV with a sha256 listing and write the result.

    Args:
        args: Parsed command-line namespace. Reads ``manifest`` and
            ``sha256`` (inputs handed to ``pl.scan_csv``), ``basepath``
            (optional prefix stripped from ``internal_location``),
            ``format`` (``"parquet"`` or ``"csv"``) and ``output`` (the
            destination handed to the polars sink).

    Raises:
        ValueError: If ``args.format`` is neither ``"parquet"`` nor ``"csv"``.
    """
    # Reject a bad format before any input is scanned, instead of printing
    # a message and returning as if the conversion had succeeded.
    if args.format not in ("parquet", "csv"):
        raise ValueError(f"Unknown output format: {args.format!r}")

    # The first line of the manifest is skipped; presumably it is the
    # "# SOURMASH-MANIFEST-VERSION" comment line -- confirm with the producer.
    manifest_df = pl.scan_csv(args.manifest, skip_rows=1)

    # Splitting on a single space yields three fields per line; the middle
    # one is discarded. Presumably the input is `sha256sum` output, which
    # separates hash and path with two characters -- TODO confirm.
    sha256_df = pl.scan_csv(
        args.sha256,
        has_header=False,
        separator=" ",
        new_columns=["sha256", "sep", "path"],
    ).drop("sep")

    # Match on the full path first; the prefix is only stripped afterwards.
    df = manifest_df.join(sha256_df, left_on="internal_location", right_on="path")
    if args.basepath:
        df = df.with_columns(
            pl.col("internal_location").str.strip_prefix(args.basepath)
        )

    if args.format == "parquet":
        df.sink_parquet(
            args.output,
            compression_level=22,
            metadata={"SOURMASH-MANIFEST-VERSION": "1.0"},
        )
    else:
        args.output.writelines(["# SOURMASH-MANIFEST-VERSION: 1.0\n"])
        # Push the header out of Python's buffer before polars writes to
        # the same handle, so the version line is guaranteed to come first.
        args.output.flush()
        df.sink_csv(args.output)
37+
38+
39+
if __name__ == "__main__":
    import argparse

    # Command line: [-b BASEPATH] [-F {parquet,csv}] manifest sha256 output
    cli = argparse.ArgumentParser()
    argument_specs = (
        (("-b", "--basepath"), {"default": None}),
        (("-F", "--format"), {"choices": ["parquet", "csv"], "default": "parquet"}),
        (("manifest",), {}),
        (("sha256",), {}),
        # The output is opened for writing as UTF-8 text by argparse itself.
        (("output",), {"type": argparse.FileType("w", encoding="utf-8")}),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    main(cli.parse_args())
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=40:00:00
#SBATCH --mem=20GB
#SBATCH --partition=high2
#SBATCH -A ctbrowngrp
#SBATCH -o slurm_logs/genomes-%j.log

# Build manifests/genomes_20250829.csv from filelists/genomes_20250829
# using `branchwater-index manifest`.

set -exu -o pipefail

# Quoted so a submit directory containing spaces or glob characters works.
cd "$SLURM_SUBMIT_DIR"

# The file list is read below but not created here; presumably it is
# generated beforehand with the following command:
#find wort-genomes/ -iname "*.sig" > filelists/genomes_20250829

# Follow the CPU count Slurm allocated (falling back to the 16 requested
# above) so the thread count cannot drift from --cpus-per-task.
RAYON_NUM_THREADS="${SLURM_CPUS_PER_TASK:-16}" RUST_LOG=info ./branchwater-index manifest -o manifests/genomes_20250829.csv filelists/genomes_20250829

machine/archive/scripts/img.sbatch

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --time=01:00:00
#SBATCH --mem=10GB
#SBATCH --partition=bmm
#SBATCH -o slurm_logs/img-%j.log

# Build manifests/img_20250829.csv from filelists/img_20250829
# using `branchwater-index manifest`.

set -exu -o pipefail

# Quoted so a submit directory containing spaces or glob characters works.
cd "$SLURM_SUBMIT_DIR"

# The file list is read below but not created here; presumably it is
# generated beforehand with the following command:
#find wort-img/ -iname "*.sig" > filelists/img_20250829

# Follow the CPU count Slurm allocated (falling back to the 32 requested
# above) so the thread count cannot drift from --cpus-per-task.
RAYON_NUM_THREADS="${SLURM_CPUS_PER_TASK:-32}" RUST_LOG=info ./branchwater-index manifest -o manifests/img_20250829.csv filelists/img_20250829
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=40:00:00
#SBATCH --mem=20GB
#SBATCH --partition=high2
#SBATCH -A ctbrowngrp
#SBATCH -o slurm_logs/sha256-%j.log

# Compute a sha256 checksum for every path listed in the img, genomes and
# sra file lists, writing one <filelist>.sha256 file per list.

set -exu -o pipefail

# Quoted so a submit directory containing spaces or glob characters works.
cd "$SLURM_SUBMIT_DIR"

# Follow the CPU count Slurm allocated (falling back to the 16 requested
# above) so the job count cannot drift from --cpus-per-task.
jobs="${SLURM_CPUS_PER_TASK:-16}"

for name in img genomes sra; do
    parallel -j"$jobs" sha256sum :::: "filelists/${name}_20250829" > "filelists/${name}_20250829.sha256"
done

machine/archive/scripts/sra.sbatch

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --time=100:00:00
#SBATCH --mem=200GB
#SBATCH --partition=high2
#SBATCH -A ctbrowngrp
#SBATCH -o slurm_logs/sra-%j.log

# Build manifests/sra_20250829.csv from filelists/sra_20250829
# using `branchwater-index manifest`.

set -exu -o pipefail

# Quoted so a submit directory containing spaces or glob characters works.
cd "$SLURM_SUBMIT_DIR"

# The file list is read below but not created here; presumably it is
# generated beforehand with the following command:
#find wort-sra/ -iname "*.sig" > filelists/sra_20250829

# Follow the CPU count Slurm allocated (falling back to the 32 requested
# above) so the thread count cannot drift from --cpus-per-task.
RAYON_NUM_THREADS="${SLURM_CPUS_PER_TASK:-32}" RUST_LOG=info ./branchwater-index manifest -o manifests/sra_20250829.csv filelists/sra_20250829

0 commit comments

Comments
 (0)