From e2a6125a05b2b17046e91584f917e4957dd3f928 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 2 Mar 2024 19:46:27 -0800 Subject: [PATCH 1/8] add benchmarks --- src/core/benches/minhash.rs | 158 +++++++++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/src/core/benches/minhash.rs b/src/core/benches/minhash.rs index c2e27b0249..9a1975cf47 100644 --- a/src/core/benches/minhash.rs +++ b/src/core/benches/minhash.rs @@ -107,5 +107,161 @@ fn intersection(c: &mut Criterion) { }); } -criterion_group!(minhash, intersection); +fn jaccard(c: &mut Criterion) { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/gather-abund/genome-s10.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let mut sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + let mh = if let Sketch::MinHash(mh) = &sigs.swap_remove(0).sketches()[0] { + mh.clone() + } else { + unimplemented!() + }; + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/gather-abund/genome-s11.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let mut sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + let mh2 = if let Sketch::MinHash(mh) = &sigs.swap_remove(0).sketches()[0] { + mh.clone() + } else { + unimplemented!() + }; + + let mut group = c.benchmark_group("minhash"); + group.sample_size(10); + + group.bench_function("jaccard", |b| { + b.iter(|| { + mh.jaccard(&mh2).unwrap(); + }); + }); + + let mut mh1 = KmerMinHash::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + let mut mh2 = KmerMinHash::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + + let mut mh1_btree = KmerMinHashBTree::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + let mut mh2_btree = KmerMinHashBTree::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + + for i in 0..=1_000_000 { + if i % 2 == 0 { + mh1.add_hash(i); + mh1_btree.add_hash(i); + } + if i % 45 == 0 { + mh2.add_hash(i); + mh2_btree.add_hash(i); + } + } + + group.bench_function("large jaccard", |b| { + b.iter(|| { + mh1.jaccard(&mh2).unwrap(); + }); + }); + + group.bench_function("large jaccard btree", |b| { + b.iter(|| { + mh1_btree.jaccard(&mh2_btree).unwrap(); + }); + }); +} + +fn count_common(c: &mut Criterion) { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/gather-abund/genome-s10.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let mut sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + let mh = if let Sketch::MinHash(mh) = &sigs.swap_remove(0).sketches()[0] { + mh.clone() + } else { + unimplemented!() + }; + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/gather-abund/genome-s11.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let mut sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + let mh2 = if let Sketch::MinHash(mh) = &sigs.swap_remove(0).sketches()[0] { + mh.clone() + } else { + unimplemented!() + }; + + let mut group = c.benchmark_group("minhash"); + group.sample_size(10); + + group.bench_function("count_common", |b| { + b.iter(|| { + mh.count_common(&mh2, false).unwrap(); + }); + }); + + let mut mh1 = KmerMinHash::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + let mut mh2 = KmerMinHash::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + + let mut mh1_btree = KmerMinHashBTree::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + let mut mh2_btree = KmerMinHashBTree::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + + for i in 0..=1_000_000 { + if i % 2 == 0 { + mh1.add_hash(i); + mh1_btree.add_hash(i); + } + if i % 45 == 0 { + mh2.add_hash(i); + mh2_btree.add_hash(i); + } + } + + group.bench_function("large count_common", |b| { + b.iter(|| { + mh1.count_common(&mh2, false).unwrap(); + }); + }); + + group.bench_function("large count_common btree", |b| { + b.iter(|| { + mh1_btree.count_common(&mh2_btree, false).unwrap(); + }); + }); +} + +criterion_group!(minhash, intersection, jaccard, count_common); criterion_main!(minhash); From 771bf3c60502d79199088bca5637f7573b450fb0 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 2 Mar 2024 19:58:13 -0800 Subject: [PATCH 2/8] make add_many take an iterator --- src/core/src/ffi/minhash.rs | 2 +- src/core/src/sketch/minhash.rs | 12 ++++++------ src/core/tests/minhash.rs | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 7d7c050fb0..79a66e623f 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -434,7 +434,7 @@ unsafe fn kmerminhash_intersection(ptr: *const SourmashKmerMinHash, other: *cons let isect = mh.intersection(other_mh)?; let mut new_mh = mh.clone(); new_mh.clear(); - new_mh.add_many(&isect.0)?; + new_mh.add_many(isect.0.iter().copied())?; Ok(SourmashKmerMinHash::from_rust(new_mh)) } diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 1ee747745a..5d6b601b39 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -522,9 +522,9 @@ impl KmerMinHash { Ok(()) } - pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { + pub fn add_many>(&mut self, hashes: T) -> Result<(), Error> { for min in hashes { - self.add_hash(*min); + self.add_hash(min); } Ok(()) } @@ -735,7 +735,7 @@ impl KmerMinHash { if self.abunds.is_some() { new_mh.add_many_with_abund(&self.to_vec_abunds())?; } else { - new_mh.add_many(&self.mins)?; + new_mh.add_many(self.mins.iter().copied())?; } Ok(new_mh) } @@ -1341,9 +1341,9 @@ impl KmerMinHashBTree { Ok(()) } - pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { + pub fn add_many>(&mut self, hashes: T) -> Result<(), Error> { for min in hashes { - self.add_hash(*min); + self.add_hash(min); } Ok(()) } @@ -1543,7 +1543,7 @@ impl KmerMinHashBTree { if self.abunds.is_some() { new_mh.add_many_with_abund(&self.to_vec_abunds())?; } else { - new_mh.add_many(&self.mins())?; + new_mh.add_many(self.mins.iter().copied())?; } Ok(new_mh) } diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index 195091056f..dc0484942a 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -242,8 +242,8 @@ fn oracle_mins_scaled(hashes in vec(u64::ANY, 1..10000)) { } } - c.add_many(&hashes).unwrap(); - d.add_many(&hashes).unwrap(); + c.add_many(hashes.iter().copied()).unwrap(); + d.add_many(hashes.iter().copied()).unwrap(); c.remove_many(to_remove.iter().copied()).unwrap(); d.remove_many(to_remove.iter().copied()).unwrap(); From 240d727820c0c90bcd5d6f2047d13d2b3c6688f1 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 2 Mar 2024 19:58:25 -0800 Subject: [PATCH 3/8] add size_hint to Intersection iterator --- src/core/src/sketch/minhash.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 5d6b601b39..75c0c88e6c 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -940,6 +940,15 @@ impl> Iterator for Intersection { } } } + #[inline(always)] + fn size_hint(&self) -> (usize, Option) { + let min = match (self.iter.size_hint().1, self.other.size_hint().1) { + (Some(a), Some(b)) => Some(std::cmp::min(a, b)), + (Some(a), None) | (None, Some(a)) => Some(a), + (None, None) => None, + }; + (0, min) + } } //############# From 77fa8bede0f0f753d7277c18347b08085659734a Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 2 Mar 2024 20:23:35 -0800 Subject: [PATCH 4/8] remove sketches method on Signature, use iter instead --- src/core/src/ffi/signature.rs | 2 +- src/core/src/index/linear.rs | 2 +- src/core/src/index/revindex/disk_revindex.rs | 2 +- src/core/src/index/revindex/mod.rs | 9 +-------- src/core/src/signature.rs | 4 ---- src/core/tests/minhash.rs | 8 ++------ src/core/tests/storage.rs | 3 +-- 7 files changed, 7 insertions(+), 23 deletions(-) diff --git a/src/core/src/ffi/signature.rs b/src/core/src/ffi/signature.rs index 06a0bd9fe5..d96e4c6493 100644 --- a/src/core/src/ffi/signature.rs +++ b/src/core/src/ffi/signature.rs @@ -202,7 +202,7 @@ ffi_fn! { unsafe fn signature_get_mhs(ptr: *const SourmashSignature, size: *mut usize) -> Result<*mut *mut SourmashKmerMinHash> { let sig = SourmashSignature::as_rust(ptr); - let output = sig.sketches(); + let output = sig.iter(); // FIXME: how to fit this into the ForeignObject trait? let ptr_sigs: Vec<*mut Signature> = output.into_iter().map(|x| { diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index f58bbd1542..64bfb67d74 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -25,7 +25,7 @@ pub struct LinearIndex { impl LinearIndex { pub fn from_collection(collection: CollectionSet) -> Self { let sig = collection.sig_for_dataset(0).unwrap(); - let template = sig.sketches().swap_remove(0); + let template = sig.iter().next().unwrap().clone(); Self { collection, template, diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index d8287d1bf3..04f4b3f72c 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -191,7 +191,7 @@ impl RevIndex { .collection .sig_for_dataset(dataset_id) .expect("Couldn't find a compatible Signature"); - let search_mh = &search_sig.sketches()[0]; + let search_mh = search_sig.iter().next().unwrap(); let colors = Datasets::new(&[dataset_id]).as_bytes().unwrap(); diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index a9b7091c8b..040582c008 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -19,7 +19,6 @@ use crate::index::{GatherResult, SigCounter}; use crate::prelude::*; use crate::signature::Signature; use crate::sketch::minhash::KmerMinHash; -use crate::sketch::Sketch; use crate::HashIntoType; use crate::Result; @@ -225,13 +224,7 @@ impl RevIndex { pub fn prepare_query(search_sig: Signature, selection: &Selection) -> Option { let sig = search_sig.select(selection).ok(); - sig.and_then(|sig| { - if let Sketch::MinHash(mh) = sig.sketches().swap_remove(0) { - Some(mh) - } else { - None - } - }) + sig.and_then(|sig| sig.minhash().cloned()) } #[derive(Debug, Default, Clone)] diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 0ab8190f98..83b98794f9 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -475,10 +475,6 @@ impl Signature { self.signatures.len() } - pub fn sketches(&self) -> Vec { - self.signatures.clone() - } - pub fn reset_sketches(&mut self) { self.signatures = vec![]; } diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index dc0484942a..ae9d6a0a1a 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -386,10 +386,9 @@ fn load_save_minhash_sketches() { let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); let sig = sigs.get(0).unwrap(); - let sketches = sig.sketches(); + let mh = sig.minhash().unwrap(); let mut buffer = vec![]; - if let Sketch::MinHash(mh) = &sketches[0] { let bmh: KmerMinHashBTree = mh.clone().into(); { serde_json::to_writer(&mut buffer, &bmh).unwrap(); @@ -474,7 +473,6 @@ fn load_save_minhash_sketches() { .abs() < EPSILON ); - } } #[test] @@ -487,10 +485,9 @@ fn load_save_minhash_sketches_abund() { let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); let sig = sigs.get(0).unwrap(); - let sketches = sig.sketches(); + let mh = sig.minhash().unwrap(); let mut buffer = vec![]; - if let Sketch::MinHash(mh) = &sketches[0] { let bmh: KmerMinHashBTree = mh.clone().into(); { serde_json::to_writer(&mut buffer, &bmh).unwrap(); @@ -575,7 +572,6 @@ fn load_save_minhash_sketches_abund() { .abs() < EPSILON ); - } } #[test] diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs index e0d355d6b0..79b55f43af 100644 --- a/src/core/tests/storage.rs +++ b/src/core/tests/storage.rs @@ -50,7 +50,6 @@ fn zipstorage_list_sbts() -> Result<(), Box> { #[test] fn zipstorage_parallel_access() -> Result<(), Box> { use rayon::prelude::*; - use sourmash::signature::SigsTrait; let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); filename.push("../../tests/test-data/v6.sbt.zip"); @@ -71,7 +70,7 @@ fn zipstorage_parallel_access() -> Result<(), Box> { let data = zs.load(path).unwrap(); let sigs: Vec = serde_json::from_reader(&data[..]).expect("Loading error"); sigs.iter() - .map(|v| v.sketches().iter().map(|mh| mh.size()).sum::()) + .map(|v| v.iter().map(|mh| mh.size()).sum::()) .sum::() }) .sum(); From c1a8526cac7ac0447aca6c217057b8a4b2915c17 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 2 Mar 2024 20:46:48 -0800 Subject: [PATCH 5/8] remove mins method from KmerMinHash, use iter_mins instead --- src/core/src/ffi/minhash.rs | 3 ++- src/core/src/index/revindex/disk_revindex.rs | 6 +++--- src/core/src/index/revindex/mem_revindex.rs | 2 +- src/core/src/sketch/hyperloglog/mod.rs | 4 ++-- src/core/src/sketch/minhash.rs | 10 +--------- src/core/src/sketch/nodegraph.rs | 8 ++++---- 6 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 79a66e623f..a99e3a02ed 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -8,6 +8,7 @@ use crate::ffi::HashFunctions; use crate::signature::SeqToHashes; use crate::signature::SigsTrait; use crate::sketch::minhash::KmerMinHash; +use crate::HashIntoType; pub struct SourmashKmerMinHash; @@ -206,7 +207,7 @@ pub unsafe extern "C" fn kmerminhash_remove_many( ffi_fn! { unsafe fn kmerminhash_get_mins(ptr: *const SourmashKmerMinHash, size: *mut usize) -> Result<*const u64> { let mh = SourmashKmerMinHash::as_rust(ptr); - let output = mh.mins(); + let output: Vec = mh.iter_mins().copied().collect(); *size = output.len(); // FIXME: make a SourmashSlice_u64 type? diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 04f4b3f72c..f2501d2931 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -197,9 +197,9 @@ impl RevIndex { let cf_hashes = self.db.cf_handle(HASHES).unwrap(); - let hashes = match search_mh { - Sketch::MinHash(mh) => mh.mins(), - Sketch::LargeMinHash(mh) => mh.mins(), + let hashes: Vec<_> = match search_mh { + Sketch::MinHash(mh) => mh.iter_mins().copied().collect(), + Sketch::LargeMinHash(mh) => mh.iter_mins().copied().collect(), _ => unimplemented!(), }; diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs index 20f4c3aabd..10d95b152a 100644 --- a/src/core/src/index/revindex/mem_revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -175,7 +175,7 @@ impl RevIndex { } } } else { - let matched = search_mh.mins(); + let matched: Vec<_> = search_mh.iter_mins().copied().collect(); let size = matched.len() as u64; if !matched.is_empty() || size > threshold as u64 { hash_to_color.add_to(&mut colors, dataset_id, matched); diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs index ee09caa6e5..f73ff8554c 100644 --- a/src/core/src/sketch/hyperloglog/mod.rs +++ b/src/core/src/sketch/hyperloglog/mod.rs @@ -214,8 +214,8 @@ impl SigsTrait for HyperLogLog { impl Update for KmerMinHash { fn update(&self, other: &mut HyperLogLog) -> Result<(), Error> { - for h in self.mins() { - other.add_hash(h); + for h in self.iter_mins() { + other.add_hash(*h); } Ok(()) } diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 75c0c88e6c..3e12979e1a 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -708,10 +708,6 @@ impl KmerMinHash { self.hash_function == HashFunctions::Murmur64Hp } - pub fn mins(&self) -> Vec { - self.mins.clone() - } - pub fn iter_mins(&self) -> impl Iterator { self.mins.iter() } @@ -1523,10 +1519,6 @@ impl KmerMinHashBTree { self.hash_function.clone() } - pub fn mins(&self) -> Vec { - self.mins.iter().cloned().collect() - } - pub fn iter_mins(&self) -> impl Iterator { self.mins.iter() } @@ -1590,7 +1582,7 @@ impl SigsTrait for KmerMinHashBTree { } fn to_vec(&self) -> Vec { - self.mins() + self.iter_mins().copied().collect() } fn ksize(&self) -> usize { diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs index bbfef5cd0d..e2878d7748 100644 --- a/src/core/src/sketch/nodegraph.rs +++ b/src/core/src/sketch/nodegraph.rs @@ -51,8 +51,8 @@ impl Update for Nodegraph { impl Update for KmerMinHash { fn update(&self, other: &mut Nodegraph) -> Result<(), Error> { - for h in self.mins() { - other.count(h); + for h in self.iter_mins() { + other.count(*h); } Ok(()) } @@ -60,8 +60,8 @@ impl Update for KmerMinHash { impl Update for KmerMinHashBTree { fn update(&self, other: &mut Nodegraph) -> Result<(), Error> { - for h in self.mins() { - other.count(h); + for h in self.iter_mins() { + other.count(*h); } Ok(()) } From 29a06638b3088e352ed65d58a8d658a00a1f6bed Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 3 Mar 2024 10:35:38 -0800 Subject: [PATCH 6/8] fmt --- src/core/tests/minhash.rs | 328 +++++++++++++++++++------------------- 1 file changed, 164 insertions(+), 164 deletions(-) diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index ae9d6a0a1a..1ff3f51719 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -389,90 +389,90 @@ fn load_save_minhash_sketches() { let mh = sig.minhash().unwrap(); let mut buffer = vec![]; - let bmh: KmerMinHashBTree = mh.clone().into(); - { - serde_json::to_writer(&mut buffer, &bmh).unwrap(); - } + let bmh: KmerMinHashBTree = mh.clone().into(); + { + serde_json::to_writer(&mut buffer, &bmh).unwrap(); + } - let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); - let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); - - assert_eq!(mh.md5sum(), new_mh.md5sum()); - assert_eq!(bmh.md5sum(), new_bmh.md5sum()); - assert_eq!(bmh.md5sum(), new_mh.md5sum()); - assert_eq!(mh.md5sum(), new_bmh.md5sum()); - - assert_eq!(mh.mins(), new_mh.mins()); - assert_eq!(bmh.mins(), new_bmh.mins()); - assert_eq!(bmh.mins(), new_mh.mins()); - assert_eq!(mh.mins(), new_bmh.mins()); - - assert_eq!(mh.abunds(), new_mh.abunds()); - assert_eq!(bmh.abunds(), new_bmh.abunds()); - assert_eq!(bmh.abunds(), new_mh.abunds()); - assert_eq!(mh.abunds(), new_bmh.abunds()); - - assert!( - (mh.similarity(&new_mh, false, false).unwrap() - - bmh.similarity(&new_bmh, false, false).unwrap()) - .abs() - < EPSILON - ); - - assert!( - (mh.similarity(&new_mh, true, false).unwrap() - - bmh.similarity(&new_bmh, true, false).unwrap()) - .abs() - < EPSILON - ); - - buffer.clear(); - let imh: KmerMinHash = bmh.clone().into(); - { - serde_json::to_writer(&mut buffer, &imh).unwrap(); - } + let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); + let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); + + assert_eq!(mh.md5sum(), new_mh.md5sum()); + assert_eq!(bmh.md5sum(), new_bmh.md5sum()); + assert_eq!(bmh.md5sum(), new_mh.md5sum()); + assert_eq!(mh.md5sum(), new_bmh.md5sum()); + + assert_eq!(mh.mins(), new_mh.mins()); + assert_eq!(bmh.mins(), new_bmh.mins()); + assert_eq!(bmh.mins(), new_mh.mins()); + assert_eq!(mh.mins(), new_bmh.mins()); + + assert_eq!(mh.abunds(), new_mh.abunds()); + assert_eq!(bmh.abunds(), new_bmh.abunds()); + assert_eq!(bmh.abunds(), new_mh.abunds()); + assert_eq!(mh.abunds(), new_bmh.abunds()); - let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); - let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); - - assert_eq!(mh.md5sum(), new_mh.md5sum()); - assert_eq!(bmh.md5sum(), new_bmh.md5sum()); - assert_eq!(bmh.md5sum(), new_mh.md5sum()); - assert_eq!(mh.md5sum(), new_bmh.md5sum()); - - assert_eq!(mh.mins(), new_mh.mins()); - assert_eq!(bmh.mins(), new_bmh.mins()); - assert_eq!(bmh.mins(), new_mh.mins()); - assert_eq!(mh.mins(), new_bmh.mins()); - - assert_eq!(mh.abunds(), new_mh.abunds()); - assert_eq!(bmh.abunds(), new_bmh.abunds()); - assert_eq!(bmh.abunds(), new_mh.abunds()); - assert_eq!(mh.abunds(), new_bmh.abunds()); - - assert_eq!(mh.to_vec(), new_mh.to_vec()); - assert_eq!(bmh.to_vec(), new_bmh.to_vec()); - assert_eq!(bmh.to_vec(), new_mh.to_vec()); - assert_eq!(mh.to_vec(), new_bmh.to_vec()); - - assert_eq!(mh.to_vec_abunds(), new_mh.to_vec_abunds()); - assert_eq!(bmh.to_vec_abunds(), new_bmh.to_vec_abunds()); - assert_eq!(bmh.to_vec_abunds(), new_mh.to_vec_abunds()); - assert_eq!(mh.to_vec_abunds(), new_bmh.to_vec_abunds()); - - assert!( - (mh.similarity(&new_mh, false, false).unwrap() - - bmh.similarity(&new_bmh, false, false).unwrap()) - .abs() - < EPSILON - ); - - assert!( - (mh.similarity(&new_mh, true, false).unwrap() - - bmh.similarity(&new_bmh, true, false).unwrap()) - .abs() - < EPSILON - ); + assert!( + (mh.similarity(&new_mh, false, false).unwrap() + - bmh.similarity(&new_bmh, false, false).unwrap()) + .abs() + < EPSILON + ); + + assert!( + (mh.similarity(&new_mh, true, false).unwrap() + - bmh.similarity(&new_bmh, true, false).unwrap()) + .abs() + < EPSILON + ); + + buffer.clear(); + let imh: KmerMinHash = bmh.clone().into(); + { + serde_json::to_writer(&mut buffer, &imh).unwrap(); + } + + let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); + let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); + + assert_eq!(mh.md5sum(), new_mh.md5sum()); + assert_eq!(bmh.md5sum(), new_bmh.md5sum()); + assert_eq!(bmh.md5sum(), new_mh.md5sum()); + assert_eq!(mh.md5sum(), new_bmh.md5sum()); + + assert_eq!(mh.mins(), new_mh.mins()); + assert_eq!(bmh.mins(), new_bmh.mins()); + assert_eq!(bmh.mins(), new_mh.mins()); + assert_eq!(mh.mins(), new_bmh.mins()); + + assert_eq!(mh.abunds(), new_mh.abunds()); + assert_eq!(bmh.abunds(), new_bmh.abunds()); + assert_eq!(bmh.abunds(), new_mh.abunds()); + assert_eq!(mh.abunds(), new_bmh.abunds()); + + assert_eq!(mh.to_vec(), new_mh.to_vec()); + assert_eq!(bmh.to_vec(), new_bmh.to_vec()); + assert_eq!(bmh.to_vec(), new_mh.to_vec()); + assert_eq!(mh.to_vec(), new_bmh.to_vec()); + + assert_eq!(mh.to_vec_abunds(), new_mh.to_vec_abunds()); + assert_eq!(bmh.to_vec_abunds(), new_bmh.to_vec_abunds()); + assert_eq!(bmh.to_vec_abunds(), new_mh.to_vec_abunds()); + assert_eq!(mh.to_vec_abunds(), new_bmh.to_vec_abunds()); + + assert!( + (mh.similarity(&new_mh, false, false).unwrap() + - bmh.similarity(&new_bmh, false, false).unwrap()) + .abs() + < EPSILON + ); + + assert!( + (mh.similarity(&new_mh, true, false).unwrap() + - bmh.similarity(&new_bmh, true, false).unwrap()) + .abs() + < EPSILON + ); } #[test] @@ -488,90 +488,90 @@ fn load_save_minhash_sketches_abund() { let mh = sig.minhash().unwrap(); let mut buffer = vec![]; - let bmh: KmerMinHashBTree = mh.clone().into(); - { - serde_json::to_writer(&mut buffer, &bmh).unwrap(); - } + let bmh: KmerMinHashBTree = mh.clone().into(); + { + serde_json::to_writer(&mut buffer, &bmh).unwrap(); + } - let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); - let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); - - assert_eq!(mh.md5sum(), new_mh.md5sum()); - assert_eq!(bmh.md5sum(), new_bmh.md5sum()); - assert_eq!(bmh.md5sum(), new_mh.md5sum()); - assert_eq!(mh.md5sum(), new_bmh.md5sum()); - - assert_eq!(mh.mins(), new_mh.mins()); - assert_eq!(bmh.mins(), new_bmh.mins()); - assert_eq!(bmh.mins(), new_mh.mins()); - assert_eq!(mh.mins(), new_bmh.mins()); - - assert_eq!(mh.abunds(), new_mh.abunds()); - assert_eq!(bmh.abunds(), new_bmh.abunds()); - assert_eq!(bmh.abunds(), new_mh.abunds()); - assert_eq!(mh.abunds(), new_bmh.abunds()); - - assert_eq!(mh.to_vec(), new_mh.to_vec()); - assert_eq!(bmh.to_vec(), new_bmh.to_vec()); - assert_eq!(bmh.to_vec(), new_mh.to_vec()); - assert_eq!(mh.to_vec(), new_bmh.to_vec()); - - assert_eq!(mh.to_vec_abunds(), new_mh.to_vec_abunds()); - assert_eq!(bmh.to_vec_abunds(), new_bmh.to_vec_abunds()); - assert_eq!(bmh.to_vec_abunds(), new_mh.to_vec_abunds()); - assert_eq!(mh.to_vec_abunds(), new_bmh.to_vec_abunds()); - - assert!( - (mh.similarity(&new_mh, false, false).unwrap() - - bmh.similarity(&new_bmh, false, false).unwrap()) - .abs() - < EPSILON - ); - - assert!( - (mh.similarity(&new_mh, true, false).unwrap() - - bmh.similarity(&new_bmh, true, false).unwrap()) - .abs() - < EPSILON - ); - - buffer.clear(); - let imh: KmerMinHash = bmh.clone().into(); - { - serde_json::to_writer(&mut buffer, &imh).unwrap(); - } + let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); + let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); + + assert_eq!(mh.md5sum(), new_mh.md5sum()); + assert_eq!(bmh.md5sum(), new_bmh.md5sum()); + assert_eq!(bmh.md5sum(), new_mh.md5sum()); + assert_eq!(mh.md5sum(), new_bmh.md5sum()); + + assert_eq!(mh.mins(), new_mh.mins()); + assert_eq!(bmh.mins(), new_bmh.mins()); + assert_eq!(bmh.mins(), new_mh.mins()); + assert_eq!(mh.mins(), new_bmh.mins()); + + assert_eq!(mh.abunds(), new_mh.abunds()); + assert_eq!(bmh.abunds(), new_bmh.abunds()); + assert_eq!(bmh.abunds(), new_mh.abunds()); + assert_eq!(mh.abunds(), new_bmh.abunds()); - let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); - let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); - - assert_eq!(mh.md5sum(), new_mh.md5sum()); - assert_eq!(bmh.md5sum(), new_bmh.md5sum()); - assert_eq!(bmh.md5sum(), new_mh.md5sum()); - assert_eq!(mh.md5sum(), new_bmh.md5sum()); - - assert_eq!(mh.mins(), new_mh.mins()); - assert_eq!(bmh.mins(), new_bmh.mins()); - assert_eq!(bmh.mins(), new_mh.mins()); - assert_eq!(mh.mins(), new_bmh.mins()); - - assert_eq!(mh.abunds(), new_mh.abunds()); - assert_eq!(bmh.abunds(), new_bmh.abunds()); - assert_eq!(bmh.abunds(), new_mh.abunds()); - assert_eq!(mh.abunds(), new_bmh.abunds()); - - assert!( - (mh.similarity(&new_mh, false, false).unwrap() - - bmh.similarity(&new_bmh, false, false).unwrap()) - .abs() - < EPSILON - ); - - assert!( - (mh.similarity(&new_mh, true, false).unwrap() - - bmh.similarity(&new_bmh, true, false).unwrap()) - .abs() - < EPSILON - ); + assert_eq!(mh.to_vec(), new_mh.to_vec()); + assert_eq!(bmh.to_vec(), new_bmh.to_vec()); + assert_eq!(bmh.to_vec(), new_mh.to_vec()); + assert_eq!(mh.to_vec(), new_bmh.to_vec()); + + assert_eq!(mh.to_vec_abunds(), new_mh.to_vec_abunds()); + assert_eq!(bmh.to_vec_abunds(), new_bmh.to_vec_abunds()); + assert_eq!(bmh.to_vec_abunds(), new_mh.to_vec_abunds()); + assert_eq!(mh.to_vec_abunds(), new_bmh.to_vec_abunds()); + + assert!( + (mh.similarity(&new_mh, false, false).unwrap() + - bmh.similarity(&new_bmh, false, false).unwrap()) + .abs() + < EPSILON + ); + + assert!( + (mh.similarity(&new_mh, true, false).unwrap() + - bmh.similarity(&new_bmh, true, false).unwrap()) + .abs() + < EPSILON + ); + + buffer.clear(); + let imh: KmerMinHash = bmh.clone().into(); + { + serde_json::to_writer(&mut buffer, &imh).unwrap(); + } + + let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); + let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); + + assert_eq!(mh.md5sum(), new_mh.md5sum()); + assert_eq!(bmh.md5sum(), new_bmh.md5sum()); + assert_eq!(bmh.md5sum(), new_mh.md5sum()); + assert_eq!(mh.md5sum(), new_bmh.md5sum()); + + assert_eq!(mh.mins(), new_mh.mins()); + assert_eq!(bmh.mins(), new_bmh.mins()); + assert_eq!(bmh.mins(), new_mh.mins()); + assert_eq!(mh.mins(), new_bmh.mins()); + + assert_eq!(mh.abunds(), new_mh.abunds()); + assert_eq!(bmh.abunds(), new_bmh.abunds()); + assert_eq!(bmh.abunds(), new_mh.abunds()); + assert_eq!(mh.abunds(), new_bmh.abunds()); + + assert!( + (mh.similarity(&new_mh, false, false).unwrap() + - bmh.similarity(&new_bmh, false, false).unwrap()) + .abs() + < EPSILON + ); + + assert!( + (mh.similarity(&new_mh, true, false).unwrap() + - bmh.similarity(&new_bmh, true, false).unwrap()) + .abs() + < EPSILON + ); } #[test] From a0f43fefe86eb5ffa76469324ea3c77a471d1db7 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 3 Mar 2024 12:42:57 -0800 Subject: [PATCH 7/8] fix tests --- src/core/src/from.rs | 19 +++++----- src/core/tests/minhash.rs | 73 +++++++++++++++++++++++---------------- src/core/tests/storage.rs | 2 ++ 3 files changed, 54 insertions(+), 40 deletions(-) diff --git a/src/core/src/from.rs b/src/core/src/from.rs index dbeeb58a2f..d8222fd42b 100644 --- a/src/core/src/from.rs +++ b/src/core/src/from.rs @@ -65,7 +65,7 @@ mod test { let b_hashes = b.to_vec(); - let s1: HashSet<_> = a.mins().into_iter().collect(); + let s1: HashSet<_> = a.iter_mins().copied().collect(); let s2: HashSet<_> = b_hashes.iter().map(|x| x.hash).collect(); let i1 = &s1 & &s2; @@ -73,8 +73,7 @@ mod test { assert!(i1.len() == b_hashes.len()); if let Some(abunds) = a.abunds() { - let mins = a.mins(); - let smap: HashMap<_, _> = mins.iter().zip(abunds.iter()).collect(); + let smap: HashMap<_, _> = a.iter_mins().zip(abunds.iter()).collect(); println!("{:?}", smap); for item in b_hashes.iter() { assert!(smap.contains_key(&{ item.hash })); @@ -101,19 +100,17 @@ mod test { let c = KmerMinHash::from(b); - let s1: HashSet<_> = a.mins().into_iter().collect(); - let s2: HashSet<_> = c.mins().into_iter().collect(); + let s1: HashSet<_> = a.iter_mins().collect(); + let s2: HashSet<_> = c.iter_mins().collect(); let i1 = &s1 & &s2; - assert!(i1.len() == a.mins().len()); - assert!(i1.len() == c.mins().len()); + assert!(i1.len() == a.iter_mins().count()); + assert!(i1.len() == c.iter_mins().count()); if let Some(a_abunds) = a.abunds() { if let Some(c_abunds) = c.abunds() { - let a_mins = a.mins(); - let a_smap: HashMap<_, _> = a_mins.iter().zip(a_abunds.iter()).collect(); - let c_mins = c.mins(); - let c_smap: HashMap<_, _> = c_mins.iter().zip(c_abunds.iter()).collect(); + let a_smap: HashMap<_, _> = a.iter_mins().zip(a_abunds.iter()).collect(); + let c_smap: HashMap<_, _> = c.iter_mins().zip(c_abunds.iter()).collect(); for item in a_smap.iter() { assert!(c_smap.contains_key(*item.0)); assert!(c_smap.get(*item.0).unwrap() == item.1); diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index 1ff3f51719..0eea20166d 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -11,7 +11,6 @@ use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::{ max_hash_for_scaled, scaled_for_max_hash, KmerMinHash, KmerMinHashBTree, }; -use sourmash::sketch::Sketch; // TODO: use f64::EPSILON when we bump MSRV const EPSILON: f64 = 0.01; @@ -58,11 +57,11 @@ fn invalid_dna() { let mut a = KmerMinHash::new(0, 3, HashFunctions::Murmur64Dna, 42, false, 20); a.add_sequence(b"AAANNCCCTN", true).unwrap(); - assert_eq!(a.mins().len(), 3); + assert_eq!(a.iter_mins().count(), 3); let mut b = KmerMinHash::new(0, 3, HashFunctions::Murmur64Dna, 42, false, 20); b.add_sequence(b"NAAA", true).unwrap(); - assert_eq!(b.mins().len(), 1); + assert_eq!(b.iter_mins().count(), 1); } #[test] @@ -209,8 +208,8 @@ fn oracle_mins(hashes in vec(u64::ANY, 1..10000)) { d.add_from(&b).unwrap(); d.remove_many(to_remove.iter().copied()).unwrap(); - assert_eq!(a.mins(), b.mins()); - assert_eq!(c.mins(), d.mins()); + assert!(a.iter_mins().zip(b.iter_mins()).all(|(a, b)| a == b)); + assert!(c.iter_mins().zip(d.iter_mins()).all(|(a, b)| a == b)); assert_eq!(a.count_common(&c, false).unwrap(), b.count_common(&d, false).unwrap()); assert_eq!(a.count_common(&c, true).unwrap(), b.count_common(&d, true).unwrap()); @@ -251,8 +250,8 @@ fn oracle_mins_scaled(hashes in vec(u64::ANY, 1..10000)) { a.remove_hash(hashes[0]); b.remove_hash(hashes[0]); - assert_eq!(a.mins(), b.mins()); - assert_eq!(c.mins(), d.mins()); + assert!(a.iter_mins().zip(b.iter_mins()).all(|(a, b)| a == b)); + assert!(c.iter_mins().zip(d.iter_mins()).all(|(a, b)| a == b)); assert_eq!(a.md5sum(), b.md5sum()); assert_eq!(c.md5sum(), d.md5sum()); @@ -343,8 +342,8 @@ fn prop_merge(seq1 in "[ACGT]{6,100}", seq2 in "[ACGT]{6,200}") { a.merge(&c).unwrap(); b.merge(&d).unwrap(); - assert_eq!(a.mins(), b.mins()); - assert_eq!(c.mins(), d.mins()); + assert!(a.iter_mins().zip(b.iter_mins()).all(|(a, b)| a == b)); + assert!(c.iter_mins().zip(d.iter_mins()).all(|(a, b)| a == b)); assert_eq!(a.abunds(), b.abunds()); assert_eq!(c.abunds(), d.abunds()); @@ -402,10 +401,13 @@ fn load_save_minhash_sketches() { assert_eq!(bmh.md5sum(), new_mh.md5sum()); assert_eq!(mh.md5sum(), new_bmh.md5sum()); - assert_eq!(mh.mins(), new_mh.mins()); - assert_eq!(bmh.mins(), new_bmh.mins()); - assert_eq!(bmh.mins(), new_mh.mins()); - assert_eq!(mh.mins(), new_bmh.mins()); + assert!(mh.iter_mins().zip(new_mh.iter_mins()).all(|(a, b)| a == b)); + assert!(bmh + .iter_mins() + .zip(new_bmh.iter_mins()) + .all(|(a, b)| a == b)); + assert!(bmh.iter_mins().zip(new_mh.iter_mins()).all(|(a, b)| a == b)); + assert!(mh.iter_mins().zip(new_bmh.iter_mins()).all(|(a, b)| a == b)); assert_eq!(mh.abunds(), new_mh.abunds()); assert_eq!(bmh.abunds(), new_bmh.abunds()); @@ -440,10 +442,13 @@ fn load_save_minhash_sketches() { assert_eq!(bmh.md5sum(), new_mh.md5sum()); assert_eq!(mh.md5sum(), new_bmh.md5sum()); - assert_eq!(mh.mins(), new_mh.mins()); - assert_eq!(bmh.mins(), new_bmh.mins()); - assert_eq!(bmh.mins(), new_mh.mins()); - assert_eq!(mh.mins(), new_bmh.mins()); + assert!(mh.iter_mins().zip(new_mh.iter_mins()).all(|(a, b)| a == b)); + assert!(bmh + .iter_mins() + .zip(new_bmh.iter_mins()) + .all(|(a, b)| a == b)); + assert!(bmh.iter_mins().zip(new_mh.iter_mins()).all(|(a, b)| a == b)); + assert!(mh.iter_mins().zip(new_bmh.iter_mins()).all(|(a, b)| a == b)); assert_eq!(mh.abunds(), new_mh.abunds()); assert_eq!(bmh.abunds(), new_bmh.abunds()); @@ -501,10 +506,13 @@ fn load_save_minhash_sketches_abund() { assert_eq!(bmh.md5sum(), new_mh.md5sum()); assert_eq!(mh.md5sum(), new_bmh.md5sum()); - assert_eq!(mh.mins(), new_mh.mins()); - assert_eq!(bmh.mins(), new_bmh.mins()); - assert_eq!(bmh.mins(), new_mh.mins()); - assert_eq!(mh.mins(), new_bmh.mins()); + assert!(mh.iter_mins().zip(new_mh.iter_mins()).all(|(a, b)| a == b)); + assert!(bmh + .iter_mins() + .zip(new_bmh.iter_mins()) + .all(|(a, b)| a == b)); + assert!(bmh.iter_mins().zip(new_mh.iter_mins()).all(|(a, b)| a == b)); + assert!(mh.iter_mins().zip(new_bmh.iter_mins()).all(|(a, b)| a == b)); assert_eq!(mh.abunds(), new_mh.abunds()); assert_eq!(bmh.abunds(), new_bmh.abunds()); @@ -549,10 +557,13 @@ fn load_save_minhash_sketches_abund() { assert_eq!(bmh.md5sum(), new_mh.md5sum()); assert_eq!(mh.md5sum(), new_bmh.md5sum()); - assert_eq!(mh.mins(), new_mh.mins()); - assert_eq!(bmh.mins(), new_bmh.mins()); - assert_eq!(bmh.mins(), new_mh.mins()); - assert_eq!(mh.mins(), new_bmh.mins()); + assert!(mh.iter_mins().zip(new_mh.iter_mins()).all(|(a, b)| a == b)); + assert!(bmh + .iter_mins() + .zip(new_bmh.iter_mins()) + .all(|(a, b)| a == b)); + assert!(bmh.iter_mins().zip(new_mh.iter_mins()).all(|(a, b)| a == b)); + assert!(mh.iter_mins().zip(new_bmh.iter_mins()).all(|(a, b)| a == b)); assert_eq!(mh.abunds(), new_mh.abunds()); assert_eq!(bmh.abunds(), new_bmh.abunds()); @@ -755,10 +766,12 @@ fn seq_to_hashes(seq in "ACGTGTAGCTAGACACTGACTGACTGAC") { } } - mh.mins().sort_unstable(); hashes.sort_unstable(); - assert_eq!(mh.mins(), hashes); + assert!(mh.iter_mins().zip(hashes.iter()).all(|(a, b)| a == b)); + let mut mins: Vec<_> = mh.iter_mins().copied().collect(); + mins.sort_unstable(); + assert!(mins.iter().zip(hashes.iter()).all(|(a, b)| a == b)); } #[test] @@ -778,10 +791,12 @@ fn seq_to_hashes_2(seq in "QRMTHINK") { } } - mh.mins().sort_unstable(); hashes.sort_unstable(); - assert_eq!(mh.mins(), hashes); + assert!(mh.iter_mins().zip(hashes.iter()).all(|(a, b)| a == b)); + let mut mins: Vec<_> = mh.iter_mins().copied().collect(); + mins.sort_unstable(); + assert!(mins.iter().zip(hashes.iter()).all(|(a, b)| a == b)); } } diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs index 79b55f43af..6331868bd2 100644 --- a/src/core/tests/storage.rs +++ b/src/core/tests/storage.rs @@ -51,6 +51,8 @@ fn zipstorage_list_sbts() -> Result<(), Box> { fn zipstorage_parallel_access() -> Result<(), Box> { use rayon::prelude::*; + use sourmash::signature::SigsTrait; + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); filename.push("../../tests/test-data/v6.sbt.zip"); From 2d0dc0f67e7ea7daa1ea7fc08b80dd751e57f37d Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 3 Mar 2024 12:43:49 -0800 Subject: [PATCH 8/8] fix clippy lint --- src/core/src/from.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/from.rs b/src/core/src/from.rs index d8222fd42b..c6ef6aafa5 100644 --- a/src/core/src/from.rs +++ b/src/core/src/from.rs @@ -16,7 +16,7 @@ impl From for KmerMinHash { let mut new_mh = KmerMinHash::new( 0, - values.get(0).unwrap().kmer.len() as u32, + values.first().unwrap().kmer.len() as u32, HashFunctions::Murmur64Dna, 42, true,