diff --git a/src/sourmash_utils.py b/src/sourmash_utils.py index e5fbf85..ec1cea1 100644 --- a/src/sourmash_utils.py +++ b/src/sourmash_utils.py @@ -1,5 +1,6 @@ "Python utilities for sourmash plugins and scripts." import sourmash +from sourmash.picklist import PickStyle from sourmash import sourmash_args from sourmash.cli import utils as sourmash_cli @@ -76,13 +77,16 @@ def add_standard_minhash_args(parser): sourmash_cli.add_scaled_arg(parser) -def create_minhash_from_args(args, *, track_abundance=False, **defaults): +def create_minhash_from_args(args=None, *, ksize=None, scaled=None, moltype=None, track_abundance=False, **defaults): default_moltype = defaults.get('moltype') - moltype = sourmash_args.calculate_moltype(args, default=default_moltype) - ksize = args.ksize or defaults.get('ksize') + + if args: + moltype = sourmash_args.calculate_moltype(args, default=default_moltype) + ksize = args.ksize or defaults.get('ksize') + scaled = args.scaled or defaults.get('scaled') + if not ksize: ksize = DEFAULTS[moltype]['ksize'] - scaled = args.scaled or defaults.get('scaled') if not scaled: scaled = DEFAULTS[moltype]['scaled'] @@ -92,15 +96,27 @@ def create_minhash_from_args(args, *, track_abundance=False, **defaults): track_abundance=track_abundance) -def load_index_and_select(filename, minhash_obj, *, raise_on_empty=True): +def load_index_and_select(filename, minhash_obj, *, picklist=None, pickfile=None, coltype=None, colname=None, pickstyle=PickStyle.INCLUDE, raise_on_empty=True, args=None): """Load a sourmash Index object from filename, selecting sketches compatible with minhash_obj. """ idx = sourmash.load_file_as_index(filename) + + if args is not None: + pl = sourmash_args.load_picklist(args) + elif pickfile: + picklist_arg = f"{pickfile}:{colname}:{coltype}:{pickstyle.name.lower()}" + pl = sourmash_args.load_picklist(picklist=picklist_arg) + else: + pl = picklist + idx = idx.select(ksize=minhash_obj.ksize, moltype=minhash_obj.moltype, scaled=minhash_obj.scaled, - abund=minhash_obj.track_abundance) + abund=minhash_obj.track_abundance, + picklist=pl + ) + print(idx) if not idx: raise ValueError(f"no matching sketches in '{filename}' for k={minhash_obj.ksize} moltype={minhash_obj.moltype} scaled={minhash_obj.scaled}") return idx diff --git a/tests/podar-ref.picklist.csv b/tests/podar-ref.picklist.csv new file mode 100644 index 0000000..f9b3cba --- /dev/null +++ b/tests/podar-ref.picklist.csv @@ -0,0 +1,10 @@ +accession,taxid,superkingdom,phylum,class,order,family,genus,species,strain +AE000782.1,224325,Archaea,Euryarchaeota,Archaeoglobi,Archaeoglobales,Archaeoglobaceae,Archaeoglobus,Archaeoglobus fulgidus,Archaeoglobus fulgidus DSM 4304 +NC_000909.1,243232,Archaea,Euryarchaeota,Methanococci,Methanococcales,Methanocaldococcaceae,Methanocaldococcus,Methanocaldococcus jannaschii,Methanocaldococcus jannaschii DSM 2661 +NC_003272.1,103690,Bacteria,Cyanobacteria,,Nostocales,Nostocaceae,Nostoc,Nostoc sp. PCC 7120, +AE009441.1,178306,Archaea,Crenarchaeota,Thermoprotei,Thermoproteales,Thermoproteaceae,Pyrobaculum,Pyrobaculum aerophilum,Pyrobaculum aerophilum str. IM2 +AE009950.1,186497,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Pyrococcus,Pyrococcus furiosus,Pyrococcus furiosus DSM 3638 +AE009951.2,190304,Bacteria,Fusobacteria,Fusobacteriia,Fusobacteriales,Fusobacteriaceae,Fusobacterium,Fusobacterium nucleatum, +AE010299.1,188937,Archaea,Euryarchaeota,Methanomicrobia,Methanosarcinales,Methanosarcinaceae,Methanosarcina,Methanosarcina acetivorans,Methanosarcina acetivorans C2A +AE009439.1,190192,Archaea,Euryarchaeota,Methanopyri,Methanopyrales,Methanopyraceae,Methanopyrus,Methanopyrus kandleri,Methanopyrus kandleri AV19 +NC_003911.12,246200,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Ruegeria,Ruegeria pomeroyi,Ruegeria pomeroyi DSS-3 diff --git a/tests/podar-ref.zip b/tests/podar-ref.zip new file mode 100644 index 0000000..0042f12 Binary files /dev/null and b/tests/podar-ref.zip differ diff --git a/tests/test_sourmash_utils.py b/tests/test_sourmash_utils.py index 8e1ab63..0c56f85 100644 --- a/tests/test_sourmash_utils.py +++ b/tests/test_sourmash_utils.py @@ -1,4 +1,7 @@ from sourmash_utils import * +from sourmash_utils import create_minhash_from_args, load_index_and_select + +import os def test_basic(): @@ -59,3 +62,24 @@ def test_minhash_create_skipm2n3(): assert mh.moltype == 'skipm2n3' assert mh.scaled == 1000 assert mh.ksize == 21 + + +def test_load_index_and_select(): + # test loading index and selecting without picklist + select_mh = create_minhash_from_args(ksize=31,scaled=1000,moltype='DNA', dna=True, abund=True) + + here = os.path.dirname(__file__) + fpath = os.path.join(here, 'podar-ref.zip') + idx = load_index_and_select(filename=fpath, minhash_obj=select_mh) + assert idx + +def test_load_index_and_select_picklist(): + # test loading index and selecting with picklist + select_mh = create_minhash_from_args(ksize=31,scaled=1000,moltype='DNA',abund=True) + + here = os.path.dirname(__file__) + fpath = os.path.join(here, 'podar-ref.zip') + ppath = os.path.join(here, 'podar-ref.picklist.csv') + idx = load_index_and_select(filename=fpath, minhash_obj=select_mh, pickfile=ppath, coltype='ident', colname='accession') + assert idx + assert len(idx) == 9