diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py
index 4db0ea0a..ea07e381 100644
--- a/PopPUNK/__init__.py
+++ b/PopPUNK/__init__.py
@@ -3,7 +3,7 @@
 
 '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''
 
-__version__ = '2.7.6'
+__version__ = '2.7.7'
 
 # Minimum sketchlib version
 SKETCHLIB_MAJOR = 2
diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 9cdcbf63..970a0927 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -99,8 +99,6 @@ def get_options():
                          default = default_max_pi_dist, type = float)
     qcGroup.add_argument('--max-zero-dist', help=f"Maximum proportion of zero distances to permit [default = {default_max_zero}]",
                          default = default_max_zero, type = float)
-    qcGroup.add_argument('--type-isolate', help='Isolate from which distances will be calculated for pruning [default = None]',
-                         default = None, type = str)
     qcGroup.add_argument('--length-sigma', help=f"Number of standard deviations of length distribution beyond "
                                                 f"which sequences will be excluded [default = {default_length_sigma}]", default = default_length_sigma, type = int)
     qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded '
@@ -420,7 +418,6 @@
                    'max_pi_dist': args.max_pi_dist,
                    'max_a_dist': args.max_a_dist,
                    'prop_zero': args.max_zero_dist,
-                   'type_isolate': args.type_isolate
                   }
 
     refList, queryList, self, distMat = readPickle(distances, enforce_self=True)
@@ -454,14 +451,10 @@
         pass_list = set(refList) - fail_unconditionally.keys() - fail_assembly_qc.keys() - fail_dist_qc.keys()
         assert(pass_list == (set(refList) - fail_unconditionally.keys()).intersection(set(pass_assembly_qc)).intersection(set(pass_dist_qc)))
         passed = [x for x in refList if x in pass_list]
-        if qc_dict['type_isolate'] is not None and qc_dict['type_isolate'] not in pass_list:
-            raise RuntimeError('Type isolate ' + qc_dict['type_isolate'] + \
-                               ' not found in isolates after QC; check '
-                               'name of type isolate and QC options\n')
-
         sys.stderr.write(f"{len(passed)} samples passed QC\n")
-        if len(passed) < len(refList):
-            remove_qc_fail(qc_dict, refList, passed,
+        # Update distMat and write output files
+        if len(passed) <= len(refList):
+            distMat = remove_qc_fail(qc_dict, refList, passed,
                            [fail_unconditionally, fail_assembly_qc, fail_dist_qc],
                            args.ref_db, distMat, output,
                            args.strand_preserved, args.threads,
@@ -747,7 +740,6 @@
                                             refList,
                                             output,
                                             outSuffix = dist_string,
-                                            type_isolate = args.type_isolate,
                                             threads = args.threads,
                                             use_gpu = args.gpu_graph)
             nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices)
diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index da746d2e..8d5c5a50 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -77,8 +77,6 @@ def get_options():
                          default = default_max_merge, type = int)
     qcGroup.add_argument('--betweenness', default=False, action='store_true',
                          help='Report the betweenness of all the query nodes [default = False]')
-    qcGroup.add_argument('--type-isolate', help='Isolate from which distances can be calculated for pruning [default = None]',
-                         default = None, type = str)
     qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond '
                                                 'which sequences will be excluded [default = 5]', default = None, type = int)
     qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded '
@@ -188,10 +186,9 @@ def main():
                    'prop_zero': args.max_zero_dist,
                    'max_merge': args.max_merge,
                    'betweenness': args.betweenness,
-                   'type_isolate': args.type_isolate
                   }
     else:
-        qc_dict = {'run_qc': False, 'type_isolate': None }
+        qc_dict = {'run_qc': False}
 
     # Dict of DB access functions for assign_query (which is out of scope)
     dbFuncs = setupDBFuncs(args)
@@ -785,7 +782,6 @@ def assign_query_hdf5(dbFuncs,
                                             merged_queries,
                                             outSuffix = file_extension_string,
                                             existingRefs = existing_ref_list,
-                                            type_isolate = qc_dict['type_isolate'],
                                             threads = threads,
                                             use_gpu = gpu_graph,
                                             fast_mode = update_db == "fast")
diff --git a/PopPUNK/lineages.py b/PopPUNK/lineages.py
index 3e02f55a..66fb45f1 100755
--- a/PopPUNK/lineages.py
+++ b/PopPUNK/lineages.py
@@ -326,7 +326,7 @@ def query_db(args):
         external_clustering = clustering_file
 
     # Ignore QC at the moment
-    qc_dict = {'run_qc': False, 'type_isolate': None }
+    qc_dict = {'run_qc': False}
 
     # Check output file
     if args.output is None:
diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 9d227386..b03d6ace 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -280,7 +280,7 @@ def translate_network_indices(G_ref_df, reference_indices):
     G_ref = generate_cugraph(G_ref_df, len(reference_indices) - 1, renumber = True)
     return(G_ref)
 
-def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix = '', type_isolate = None,
+def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix = '',
                       existingRefs = None, threads = 1, use_gpu = False, fast_mode = False):
     """Extract references for each cluster based on cliques
 
@@ -298,8 +298,6 @@
             (used for fast assign, to enrich refs with)
         outSuffix (str)
             Suffix for output file (.refs will be appended)
-        type_isolate (str)
-            Isolate to be included in set of references
         existingRefs (list)
             References that should be used for each clique
         use_gpu (bool)
@@ -328,15 +326,6 @@
     else:
         merged_query_idx = set()
 
-    # Add type isolate, if necessary
-    type_isolate_index = None
-    if type_isolate is not None:
-        if type_isolate in dbOrder:
-            type_isolate_index = dbOrder.index(type_isolate)
-        else:
-            sys.stderr.write('Type isolate ' + type_isolate + ' not found\n')
-            sys.exit(1)
-
     if use_gpu:
         if fast_mode:
             raise RuntimeError("GPU graphs not yet supported with --update-db fast")
@@ -350,10 +339,6 @@
         reference_index_df = partition_assignments.groupby('partition').nth(0)
         reference_indices = reference_index_df['vertex'].to_arrow().to_pylist()
 
-        # Add type isolate if necessary - before edges are added
-        if type_isolate_index is not None and type_isolate_index not in reference_indices:
-            reference_indices.append(type_isolate_index)
-
         # Order found references as in sketchlib database
         reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)]
 
@@ -439,10 +424,6 @@
         # Returns nested lists, which need to be flattened
        reference_indices = set([entry for sublist in ref_lists for entry in sublist])
 
-        # Add type isolate if necessary - before edges are added
-        if type_isolate_index is not None and type_isolate_index not in reference_indices:
-            reference_indices.add(type_isolate_index)
-
         sys.stderr.write("Reconstructing clusters with shortest paths\n")
         if gt.openmp_enabled():
             gt.openmp_set_num_threads(threads)
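
The cuGraph branch of extractReferences above keeps the first vertex of each
partition (groupby('partition').nth(0)) as that cluster's reference. A minimal
graph-tool sketch of that same component-representative step (a hedged
illustration only, not PopPUNK's full clique-based algorithm and not code from
this patch; first_per_component is a hypothetical helper name):

import graph_tool.all as gt

def first_per_component(g, db_order):
    # db_order: sample names in sketchlib order, so vertex i <-> db_order[i]
    comp, _ = gt.label_components(g)   # component label per vertex
    labels = comp.a                    # underlying array, indexed by vertex id
    seen, ref_idx = set(), []
    for v, c in enumerate(labels):
        if c not in seen:              # first vertex seen in this component
            seen.add(c)
            ref_idx.append(v)
    # order found references as in the sketchlib database
    return [db_order[i] for i in sorted(ref_idx)]
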
diff --git a/PopPUNK/qc.py b/PopPUNK/qc.py
index 71e9dcf5..60648f85 100755
--- a/PopPUNK/qc.py
+++ b/PopPUNK/qc.py
@@ -81,11 +81,12 @@ def prune_distance_matrix(refList, remove_seqs_in, distMat, output):
                 else:
                     raise RuntimeError("Row name mismatch. Old: " + ref1 + "," + ref2 + "\n"
                                        "New: " + newRef1 + "," + newRef2 + "\n")
-
-        storePickle(newRefList, newRefList, True, newDistMat, output)
+
     else:
         newRefList = refList
         newDistMat = distMat
+
+    storePickle(newRefList, newRefList, True, newDistMat, output)
 
     # return new distance matrix and sequence lists
     return newRefList, newDistMat
@@ -270,11 +271,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
         names = refList + queryList
         self = False
 
-    # Pick type isolate if not supplied
-    if qc_dict['type_isolate'] is None:
-        qc_dict['type_isolate'] = pickTypeIsolate(ref_db, refList)
-        sys.stderr.write('Selected type isolate for distance QC is ' + qc_dict['type_isolate'] + '\n')
-
     # First check with numpy, which is quicker than iterating over everything
     #long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist()
     long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])],0,1)[0].tolist()
@@ -285,7 +281,6 @@
                                  int_offset = 0)
     failed = prune_edges(long_edges,
-                         type_isolate=names.index(qc_dict['type_isolate']),
                          query_start=len(refList),
                          allow_ref_ref=self)
 
     # Convert the edge IDs back to sample names
@@ -302,7 +297,6 @@
                              num_ref = len(refList),
                              int_offset = 0)
     failed = prune_edges(zero_edges,
-                         type_isolate=names.index(qc_dict['type_isolate']),
                          query_start=len(refList),
                          failed=failed,
                          min_count=zero_count,
@@ -366,18 +360,15 @@
             retained_samples.append(query)
     return retained_samples, failed_samples
 
-def prune_edges(long_edges, type_isolate, query_start,
+def prune_edges(long_edges, query_start,
                 failed=None, min_count=1, allow_ref_ref=True):
     """Gives a list of failed vertices from a list of
     failing edges. Tries to prune by those nodes with highest degree of
-    bad nodes, preferentially removes queries, and doesn't remove
-    the type isolate
+    bad nodes, preferentially removes queries.
 
     Args:
         long_edges (list of tuples)
             List of bad edges as node IDs
-        type_isolate (int)
-            The node ID of the type isolate
        query_start (int)
            The first node ID which corresponds to queries
        failed (set or None)
@@ -405,7 +396,7 @@
         # Do not add any refs if querying
         if r < query_start and q < query_start:
             if allow_ref_ref:
-                if q == type_isolate or (counts[r] > counts[q] and counts[r] >= min_count):
+                if (counts[r] > counts[q] and counts[r] >= min_count):
                     failed.add(r)
                 elif counts[q] >= min_count:
                     failed.add(q)
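
With the type-isolate exemption gone, prune_edges reduces to degree-based
pruning of failing edges. A self-contained sketch of that rule under the names
used above (a simplification: the real function also threads through an
existing failed set and the allow_ref_ref flag; prune_edges_sketch is a
hypothetical name):

from collections import Counter

def prune_edges_sketch(long_edges, query_start, min_count=1):
    # count how often each node appears as an endpoint of a failing edge
    counts = Counter(v for edge in long_edges for v in edge)
    failed = set()
    for r, q in long_edges:
        if r in failed or q in failed:
            continue  # edge already explained by an earlier removal
        if q >= query_start:
            failed.add(q)  # preferentially remove queries
        elif counts[r] > counts[q] and counts[r] >= min_count:
            failed.add(r)  # higher-degree ref endpoint goes first
        elif counts[q] >= min_count:
            failed.add(q)
    return failed
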
@@ -423,7 +414,7 @@
 def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
                    strand_preserved=False, threads=1, use_gpu=False):
     """Removes samples failing QC from the database and distances. Also
-    recalculates random match chances.
+    recalculates random match chances. Returns a new distance matrix.
 
     Args:
         qc_dict (dict)
@@ -449,6 +440,9 @@
             [default = 1].
         use_gpu (bool)
             Whether GPU libraries were used to generate the original network.
+    Returns:
+        newDistMat (numpy.array)
+            Updated version of distMat
     """
     from .sketchlib import removeFromDB, addRandom, readDBParams
 
@@ -467,7 +461,7 @@
                                       passed,
                                       full_names = True)
     # new database file if pruning
-    if failed and not qc_dict['no_remove']:
+    if not qc_dict['no_remove']:
         # sys.stderr.write(f"Removing {len(failed)} samples from database and distances\n")
         tmp_filtered_db_name = f"{prefix}/filtered.{os.path.basename(prefix)}.h5"
         output_db_name = f"{prefix}/{os.path.basename(prefix)}.h5"
@@ -479,7 +473,7 @@
             os.rename(tmp_filtered_db_name, output_db_name)
 
         # Remove from the distMat too
-        prune_distance_matrix(names,
+        (_, newDistMat) = prune_distance_matrix(names,
                               failed,
                               distMat,
                               f"{prefix}/{os.path.basename(prefix)}.dists")
@@ -501,6 +495,8 @@
     # write failing & reasons
     write_qc_failure_report(failed, fail_dicts, prefix)
 
+    return newDistMat
+
 def write_qc_failure_report(failed_samples, fail_dicts, output_prefix):
     """
     Writes a report of failed samples and their reasons to a file.
@@ -535,51 +531,3 @@ def get_failure_reasons(sample, fail_dicts):
         if sample in fail_dict
         for reason in fail_dict[sample]
     ]
-
-def pickTypeIsolate(prefix, refList):
-    """Selects a type isolate as that with a minimal proportion
-    of missing data.
-
-    Args:
-        prefix (str)
-            Prefix for database
-        refList (list)
-            References to pick from
-
-    Returns:
-        type_isolate (str)
-            Name of isolate selected as reference
-    """
-    # open databases
-    import h5py
-    db_name = prefix + '/' + os.path.basename(prefix) + '.h5'
-    hdf_in = h5py.File(db_name, 'r')
-
-    min_prop_n = 1.0
-    type_isolate = None
-
-    try:
-        # process data structures
-        read_grp = hdf_in['sketches']
-        # iterate through sketches
-        for sample in refList:
-            if sample in read_grp:
-                if 'reads' in hdf_in['sketches'][sample].attrs and \
-                    hdf_in['sketches'][sample].attrs['reads']:
-                    sample_prop_n = 1.0
-                else:
-                    sample_prop_n = hdf_in['sketches'][sample].attrs['missing_bases']/hdf_in['sketches'][sample].attrs['length']
-                if sample_prop_n < min_prop_n:
-                    min_prop_n = sample_prop_n
-                    type_isolate = sample
-                if min_prop_n == 0.0:
-                    break
-    # if failure still close files to avoid corruption
-    except:
-        hdf_in.close()
-        sys.stderr.write('Problem processing h5 databases during QC - aborting\n')
-        print("Unexpected error:", sys.exc_info()[0], file = sys.stderr)
-        raise
-
-    return type_isolate
-
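
Net effect of the qc.py changes: prune_distance_matrix now calls storePickle on
every path, and remove_qc_fail hands the pruned matrix back for the caller to
rebind, as __main__.py does above. A schematic of the pass-through case,
assuming the no-removal branch shown above is taken when the removal list is
empty (argument values are illustrative):

from PopPUNK.qc import prune_distance_matrix

# nothing to remove: names and distances pass through unchanged,
# but the .dists pickle is still (re)written under the output prefix
newRefList, newDistMat = prune_distance_matrix(refList, [], distMat,
                                               "example_qc/example_qc.dists")
assert newRefList == refList and newDistMat is distMat
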
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 8ac4ee40..c74932b5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -7,7 +7,7 @@ RUN apt-get update && \
         libarmadillo-dev \
         libeigen3-dev \
         libopenblas-dev \
-        software-properties-common
+        python3-graph-tool
 
 RUN git clone https://github.com/somme89/rapidNJ.git && \
     cd rapidNJ && \
@@ -15,23 +15,7 @@ RUN git clone https://github.com/somme89/rapidNJ.git && \
     mv ./bin/rapidnj /usr/bin &&\
     cd .. && rm -r rapidNJ
 
-# Special snowflake treatment for graph_tool because the maintainer
-# refuses to make the package pip-installable, meaning we can't depend
-# on it in a sane way. Least pain seems to be to install via apt, even
-# though that might result in slightly incorrect versions (built
-# around system python).
-#
-# The alternative would be to build it from source
-# https://git.skewed.de/count0/graph-tool/-/wikis/installation-instructions#manual-compilation
-# but it takes over an hour to compile apparently, so that sounds not
-# ideal.
-RUN add-apt-repository https://downloads.skewed.de/apt && \
-    apt-key adv --keyserver keyserver.ubuntu.com \
-                --recv-key 612DEFB798507F25 && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends python3-graph-tool && \
-    ln -s /usr/lib/python3/dist-packages/graph_tool \
-          /usr/local/lib/python3.10/site-packages
+
 
 RUN pip install pybind11[global]
 
diff --git a/docs/qc.rst b/docs/qc.rst
index 4f019378..ccb745c8 100644
--- a/docs/qc.rst
+++ b/docs/qc.rst
@@ -4,7 +4,7 @@
 PopPUNK now comes with some basic quality control options, which you should run
 on your sketch database made with ``--create-db`` by running ``--qc-db``
 as follows::
 
-    poppunk --qc-db --ref-db example_db --type-isolate 12754_4_79 --length-range 2000000 3000000
+    poppunk --qc-db --ref-db example_db --length-range 2000000 3000000
 
 For ``poppunk_assign``, instead add ``--run-qc``::
@@ -50,8 +50,7 @@ and ``--upper-n`` which gives the absolute maximum value.
 QC of pairwise distances
 ------------------------
 The second QC step uses the pairwise distances, to enable the removal of outlier samples
-that may not be part of the taxon being studied. This is with reference to a type
-isolate. The type isolate will be selected by PopPUNK, unless specified using ``--type-isolate``.
+that may not be part of the taxon being studied.
 By default, the maximum allowed accessory distance is 0.5 to ensure you check for contamination.
 However, many species do really have high accessory values above this range, in which case you
 should increase the value of ``--max-a-dist``.
@@ -59,9 +58,6 @@ should increase the value of ``--max-a-dist``.
 The maximum allowed core distance is also 0.5, by default. This can be altered with
 ``--max-pi-dist``.
 
-All sequences differing from the type isolate by distances greater than either threshold will be
-identified by the analysis.
-
 Each isolate may have a proportion of distances that are exactly zero as set by
 ``--max-zero-dist``.
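
The thresholds described in the updated docs map directly onto the numpy check
in qcDistMat above. A hedged illustration with a toy matrix (the column layout
of core then accessory distance follows the diff; the 0.5 defaults follow the
docs):

import numpy as np

distMat = np.array([[0.01, 0.10],   # passes both thresholds
                    [0.60, 0.20],   # fails the core (pi) threshold
                    [0.02, 0.70]])  # fails the accessory threshold
max_pi_dist = max_a_dist = 0.5      # documented defaults
bad_rows = np.where((distMat[:, 0] > max_pi_dist) | (distMat[:, 1] > max_a_dist))[0]
print(bad_rows.tolist())            # -> [1, 2]
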
diff --git a/test/run_test.py b/test/run_test.py
index 858dee87..248c612b 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -22,9 +22,12 @@
 
 # create database with different QC options
 sys.stderr.write("Running database QC test (--qc-db)\n")
-subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --type-isolate \"12754_4#79\" --overwrite", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --type-isolate \"12754_4#79\" --length-range 2000000 3000000 --overwrite", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --type-isolate \"12754_4#79\" --remove-samples remove.txt --overwrite", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --overwrite", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --length-range 2000000 3000000 --overwrite", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --remove-samples remove.txt --overwrite", shell=True, check=True)
+# test if QC gives the same output as the original distances when there are no samples to remove
+subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_qc --output example_qc_noRemove", shell=True, check=True)
+
 
 #fit GMM
 sys.stderr.write("Running GMM model fit (--fit-model gmm)\n")
@@ -60,7 +63,7 @@
 subprocess.run(python_cmd + " ../poppunk-runner.py --use-model --ref-db example_db --model-dir example_db --output example_use --overwrite", shell=True, check=True)
 
 #test pruning a database with a graph
-subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_network_qc --type-isolate \"12754_4#79\" --remove-samples remove.txt --overwrite", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_network_qc --remove-samples remove.txt --overwrite", shell=True, check=True)
 
 # tests of other command line programs
 sys.stderr.write("Testing C++ extension\n")
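
A hedged way to check the new no-removal invariant by hand (not part of the
test suite; file locations follow the f"{prefix}/{os.path.basename(prefix)}.dists"
pattern in qc.py, and readPickle is assumed importable from PopPUNK.utils with
the signature used in __main__.py):

import numpy as np
from PopPUNK.utils import readPickle

r1, q1, _, d1 = readPickle("example_qc/example_qc.dists", enforce_self=True)
r2, q2, _, d2 = readPickle("example_qc_noRemove/example_qc_noRemove.dists", enforce_self=True)
# QC with nothing to remove should be a pure pass-through of the distances
assert r1 == r2 and q1 == q2 and np.allclose(d1, d2)
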