Merged
34 commits
7f5564b
added an if statement to see if type isolate was user-defined
anleip Nov 17, 2025
ad6bb43
changed - to _ in type_isolate
anleip Nov 17, 2025
b5841c8
removed first condition in if statement since it is never true
anleip Nov 18, 2025
1cb5b0d
removed type isolate input from prune_edges()
anleip Nov 20, 2025
6750d2f
removed type isolate generation and usage from qcDistMat()
anleip Nov 20, 2025
d2e8d18
updated description for prune_edges()
anleip Nov 20, 2025
017ab98
Removed type isolate argument and error
anleip Nov 20, 2025
805452a
added the newDistMat variable and changed the input for plot from dis…
anleip Nov 20, 2025
be055df
make the update of newDistMat to global variable in prune_distance_ma…
anleip Nov 20, 2025
3e1778b
Deleted global statement before newDistMat since it is not needed
anleip Nov 21, 2025
c0ec62a
added comment where newDistMat gets updated
anleip Nov 21, 2025
d499017
get newDistMat from remove_qc_fail
anleip Nov 21, 2025
83c99c6
added a return for newDistMat in remove_qc_fail()
anleip Nov 21, 2025
c83a2bd
a maybe cleaner way to get newDistMat from prune_distance_matrix()
anleip Nov 21, 2025
01b8f4b
added comments on newDistMat for remove_qc_fail()
anleip Nov 21, 2025
10fa84b
make distance files despite empty failed list in prune_distance_matrix()
anleip Nov 21, 2025
e1d9815
allow h5 file to be written with empty failed list in remove_qc_fail()
anleip Nov 21, 2025
800c46f
allow writing output files with no sample to remove
anleip Nov 21, 2025
262a693
updated QC test scripts
anleip Nov 25, 2025
9431036
updated qc test script again
anleip Nov 25, 2025
fe55e14
removed type isolate argument from extractReferences()
anleip Nov 25, 2025
b787cf3
removed type isolate argument in clique pruning
anleip Nov 25, 2025
0ba9481
Removed the last type isolate argument
anleip Nov 25, 2025
34a74d3
Removed type isolate arguments
anleip Nov 25, 2025
81eff19
Removed type isolate argument
anleip Nov 25, 2025
33e48be
Added a comment
anleip Nov 25, 2025
df8a832
Uncommented a qc test
anleip Nov 28, 2025
2b05b3f
output whole tuple
anleip Nov 28, 2025
2d4a500
removed the use of newDistMat, instead update distMat
anleip Nov 28, 2025
19191a5
removed type isolate function
anleip Nov 28, 2025
2602f22
updated version number to 2.7.7
anleip Nov 28, 2025
6649056
Removed reference to type isolate
anleip Nov 28, 2025
2fd57f5
removed software-properties-common
anleip Nov 28, 2025
72ad364
Try graph tools apt package
johnlees Nov 28, 2025
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
@@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.7.6'
__version__ = '2.7.7'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
14 changes: 3 additions & 11 deletions PopPUNK/__main__.py
@@ -99,8 +99,6 @@ def get_options():
default = default_max_pi_dist, type = float)
qcGroup.add_argument('--max-zero-dist', help=f"Maximum proportion of zero distances to permit [default = {default_max_zero}]",
default = default_max_zero, type = float)
qcGroup.add_argument('--type-isolate', help='Isolate from which distances will be calculated for pruning [default = None]',
default = None, type = str)
qcGroup.add_argument('--length-sigma', help=f"Number of standard deviations of length distribution beyond "
f"which sequences will be excluded [default = {default_length_sigma}]", default = default_length_sigma, type = int)
qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded '
@@ -420,7 +418,6 @@ def main():
'max_pi_dist': args.max_pi_dist,
'max_a_dist': args.max_a_dist,
'prop_zero': args.max_zero_dist,
'type_isolate': args.type_isolate
}

refList, queryList, self, distMat = readPickle(distances, enforce_self=True)
@@ -454,14 +451,10 @@ def main():
pass_list = set(refList) - fail_unconditionally.keys() - fail_assembly_qc.keys() - fail_dist_qc.keys()
assert(pass_list == (set(refList) - fail_unconditionally.keys()).intersection(set(pass_assembly_qc)).intersection(set(pass_dist_qc)))
passed = [x for x in refList if x in pass_list]
if qc_dict['type_isolate'] is not None and qc_dict['type_isolate'] not in pass_list:
raise RuntimeError('Type isolate ' + qc_dict['type_isolate'] + \
' not found in isolates after QC; check '
'name of type isolate and QC options\n')

sys.stderr.write(f"{len(passed)} samples passed QC\n")
if len(passed) < len(refList):
remove_qc_fail(qc_dict, refList, passed,
# Update distMat and write output files
if len(passed) <= len(refList):
distMat = remove_qc_fail(qc_dict, refList, passed,
[fail_unconditionally, fail_assembly_qc, fail_dist_qc],
args.ref_db, distMat, output,
args.strand_preserved, args.threads,
@@ -747,7 +740,6 @@ def main():
refList,
output,
outSuffix = dist_string,
type_isolate = args.type_isolate,
threads = args.threads,
use_gpu = args.gpu_graph)
nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices)
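The `__main__.py` change above replaces a fire-and-forget call to `remove_qc_fail` with `distMat = remove_qc_fail(...)`, so the pruned distance matrix flows back to the caller explicitly instead of through a global. A minimal sketch of that pattern, using PopPUNK's condensed long-form distance layout (one row of core and accessory distances per unordered pair); `remove_failing` is a hypothetical simplification, not the real function:

```python
import numpy as np

def remove_failing(names, passed, dist_mat):
    """Return the kept names and a distance matrix restricted to `passed`.

    `dist_mat` has one row per unordered pair (i, j) with i < j, in the
    order (0,1), (0,2), ..., (0,n-1), (1,2), ... as in a condensed matrix.
    """
    keep = set(passed)
    new_names = [n for n in names if n in keep]
    rows = []
    row = 0
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            if names[i] in keep and names[j] in keep:
                rows.append(row)
            row += 1
    # Fancy indexing builds the pruned matrix; returning it (rather than
    # mutating in place) is the pattern this PR adopts.
    return new_names, dist_mat[rows, :]

names = ["a", "b", "c"]
# Rows correspond to pairs (a,b), (a,c), (b,c)
dist = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
new_names, new_dist = remove_failing(names, ["a", "c"], dist)
# new_names == ["a", "c"]; only the (a, c) row survives
```

Returning the matrix also makes the "nothing failed" case uniform: the caller always rebinds `distMat`, whether or not rows were dropped.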
6 changes: 1 addition & 5 deletions PopPUNK/assign.py
@@ -77,8 +77,6 @@ def get_options():
default = default_max_merge, type = int)
qcGroup.add_argument('--betweenness', default=False, action='store_true',
help='Report the betweenness of all the query nodes [default = False]')
qcGroup.add_argument('--type-isolate', help='Isolate from which distances can be calculated for pruning [default = None]',
default = None, type = str)
qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond '
'which sequences will be excluded [default = 5]', default = None, type = int)
qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded '
@@ -188,10 +186,9 @@ def main():
'prop_zero': args.max_zero_dist,
'max_merge': args.max_merge,
'betweenness': args.betweenness,
'type_isolate': args.type_isolate
}
else:
qc_dict = {'run_qc': False, 'type_isolate': None }
qc_dict = {'run_qc': False}

# Dict of DB access functions for assign_query (which is out of scope)
dbFuncs = setupDBFuncs(args)
@@ -785,7 +782,6 @@ def assign_query_hdf5(dbFuncs,
merged_queries,
outSuffix = file_extension_string,
existingRefs = existing_ref_list,
type_isolate = qc_dict['type_isolate'],
threads = threads,
use_gpu = gpu_graph,
fast_mode = update_db == "fast")
2 changes: 1 addition & 1 deletion PopPUNK/lineages.py
@@ -326,7 +326,7 @@ def query_db(args):
external_clustering = clustering_file

# Ignore QC at the moment
qc_dict = {'run_qc': False, 'type_isolate': None }
qc_dict = {'run_qc': False}

# Check output file
if args.output is None:
21 changes: 1 addition & 20 deletions PopPUNK/network.py
@@ -280,7 +280,7 @@ def translate_network_indices(G_ref_df, reference_indices):
G_ref = generate_cugraph(G_ref_df, len(reference_indices) - 1, renumber = True)
return(G_ref)

def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix = '', type_isolate = None,
def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix = '',
existingRefs = None, threads = 1, use_gpu = False, fast_mode = False):
"""Extract references for each cluster based on cliques

@@ -298,8 +298,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
(used for fast assign, to enrich refs with)
outSuffix (str)
Suffix for output file (.refs will be appended)
type_isolate (str)
Isolate to be included in set of references
existingRefs (list)
References that should be used for each clique
use_gpu (bool)
@@ -328,15 +326,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
else:
merged_query_idx = set()

# Add type isolate, if necessary
type_isolate_index = None
if type_isolate is not None:
if type_isolate in dbOrder:
type_isolate_index = dbOrder.index(type_isolate)
else:
sys.stderr.write('Type isolate ' + type_isolate + ' not found\n')
sys.exit(1)

if use_gpu:
if fast_mode:
raise RuntimeError("GPU graphs not yet supported with --update-db fast")
@@ -350,10 +339,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
reference_index_df = partition_assignments.groupby('partition').nth(0)
reference_indices = reference_index_df['vertex'].to_arrow().to_pylist()

# Add type isolate if necessary - before edges are added
if type_isolate_index is not None and type_isolate_index not in reference_indices:
reference_indices.append(type_isolate_index)

# Order found references as in sketchlib database
reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)]

@@ -439,10 +424,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
# Returns nested lists, which need to be flattened
reference_indices = set([entry for sublist in ref_lists for entry in sublist])

# Add type isolate if necessary - before edges are added
if type_isolate_index is not None and type_isolate_index not in reference_indices:
reference_indices.add(type_isolate_index)

sys.stderr.write("Reconstructing clusters with shortest paths\n")
if gt.openmp_enabled():
gt.openmp_set_num_threads(threads)
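After the removals above, `extractReferences` simply takes one representative per cluster (e.g. `groupby('partition').nth(0)` on the GPU path) without ever forcing a type isolate into the set. The core idea, reduced to a dependency-free union-find sketch (this is a simplification for illustration, not PopPUNK's clique-based algorithm):

```python
def pick_references(edges, n_nodes):
    """Pick one representative node per connected component.

    Mirrors the idea of choosing the first member of each partition,
    with no special-casing of any particular isolate.
    """
    parent = list(range(n_nodes))

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    for a, b in edges:
        parent[find(a)] = find(b)

    reps = {}
    for v in range(n_nodes):
        reps.setdefault(find(v), v)  # first (lowest-index) member wins
    return sorted(reps.values())

# Components {0,1,2}, {3,4} and the singleton {5}
refs = pick_references([(0, 1), (1, 2), (3, 4)], 6)
# refs == [0, 3, 5]
```

Dropping the `type_isolate_index` append means the reference set is now determined purely by graph structure, which is what makes the two deleted "Add type isolate if necessary" blocks safe to remove.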
80 changes: 14 additions & 66 deletions PopPUNK/qc.py
@@ -81,11 +81,12 @@ def prune_distance_matrix(refList, remove_seqs_in, distMat, output):
else:
raise RuntimeError("Row name mismatch. Old: " + ref1 + "," + ref2 + "\n"
"New: " + newRef1 + "," + newRef2 + "\n")

storePickle(newRefList, newRefList, True, newDistMat, output)

else:
newRefList = refList
newDistMat = distMat

storePickle(newRefList, newRefList, True, newDistMat, output)

# return new distance matrix and sequence lists
return newRefList, newDistMat
@@ -270,11 +271,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
names = refList + queryList
self = False

# Pick type isolate if not supplied
if qc_dict['type_isolate'] is None:
qc_dict['type_isolate'] = pickTypeIsolate(ref_db, refList)
sys.stderr.write('Selected type isolate for distance QC is ' + qc_dict['type_isolate'] + '\n')

# First check with numpy, which is quicker than iterating over everything
#long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist()
long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])],0,1)[0].tolist()
@@ -285,7 +281,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
int_offset = 0)

failed = prune_edges(long_edges,
type_isolate=names.index(qc_dict['type_isolate']),
query_start=len(refList),
allow_ref_ref=self)
# Convert the edge IDs back to sample names
@@ -302,7 +297,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
num_ref = len(refList),
int_offset = 0)
failed = prune_edges(zero_edges,
type_isolate=names.index(qc_dict['type_isolate']),
query_start=len(refList),
failed=failed,
min_count=zero_count,
@@ -366,18 +360,15 @@ def qcQueryAssignments(rList, qList, query_assignments, max_clusters,
retained_samples.append(query)
return retained_samples, failed_samples

def prune_edges(long_edges, type_isolate, query_start,
def prune_edges(long_edges, query_start,
failed=None, min_count=1, allow_ref_ref=True):
"""Gives a list of failed vertices from a list of failing
edges. Tries to prune by those nodes with highest degree of
bad nodes, preferentially removes queries, and doesn't remove
the type isolate
bad nodes, preferentially removes queries.

Args:
long_edges (list of tuples)
List of bad edges as node IDs
type_isolate (int)
The node ID of the type isolate
query_start (int)
The first node ID which corresponds to queries
failed (set or None)
@@ -405,7 +396,7 @@ def prune_edges(long_edges, type_isolate, query_start,
# Do not add any refs if querying
if r < query_start and q < query_start:
if allow_ref_ref:
if q == type_isolate or (counts[r] > counts[q] and counts[r] >= min_count):
if (counts[r] > counts[q] and counts[r] >= min_count):
failed.add(r)
elif counts[q] >= min_count:
failed.add(q)
@@ -423,7 +414,7 @@ def prune_edges(long_edges, type_isolate, query_start,
def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
strand_preserved=False, threads=1, use_gpu=False):
"""Removes samples failing QC from the database and distances. Also
recalculates random match chances.
recalculates random match chances. Return a new distance matrix.

Args:
qc_dict (dict)
Expand All @@ -449,6 +440,9 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
[default = 1].
use_gpu (bool)
Whether GPU libraries were used to generate the original network.
Return:
newDistMat (numpy.array)
Updated version of distMat
"""
from .sketchlib import removeFromDB, addRandom, readDBParams

Expand All @@ -467,7 +461,7 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
passed,
full_names = True)
# new database file if pruning
if failed and not qc_dict['no_remove']:
if not qc_dict['no_remove']:
# sys.stderr.write(f"Removing {len(failed)} samples from database and distances\n")
tmp_filtered_db_name = f"{prefix}/filtered.{os.path.basename(prefix)}.h5"
output_db_name = f"{prefix}/{os.path.basename(prefix)}.h5"
@@ -479,7 +473,7 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
os.rename(tmp_filtered_db_name, output_db_name)

# Remove from the distMat too
prune_distance_matrix(names,
(_, newDistMat) = prune_distance_matrix(names,
failed,
distMat,
f"{prefix}/{os.path.basename(prefix)}.dists")
@@ -501,6 +495,8 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
# write failing & reasons
write_qc_failure_report(failed, fail_dicts, prefix)

return newDistMat

def write_qc_failure_report(failed_samples, fail_dicts, output_prefix):
"""
Writes a report of failed samples and their reasons to a file.
Expand Down Expand Up @@ -535,51 +531,3 @@ def get_failure_reasons(sample, fail_dicts):
if sample in fail_dict
for reason in fail_dict[sample]
]

def pickTypeIsolate(prefix, refList):
"""Selects a type isolate as that with a minimal proportion
of missing data.

Args:
prefix (str)
Prefix for database
refList (list)
References to pick from

Returns:
type_isolate (str)
Name of isolate selected as reference
"""
# open databases
import h5py
db_name = prefix + '/' + os.path.basename(prefix) + '.h5'
hdf_in = h5py.File(db_name, 'r')

min_prop_n = 1.0
type_isolate = None

try:
# process data structures
read_grp = hdf_in['sketches']
# iterate through sketches
for sample in refList:
if sample in read_grp:
if 'reads' in hdf_in['sketches'][sample].attrs and \
hdf_in['sketches'][sample].attrs['reads']:
sample_prop_n = 1.0
else:
sample_prop_n = hdf_in['sketches'][sample].attrs['missing_bases']/hdf_in['sketches'][sample].attrs['length']
if sample_prop_n < min_prop_n:
min_prop_n = sample_prop_n
type_isolate = sample
if min_prop_n == 0.0:
break
# if failure still close files to avoid corruption
except:
hdf_in.close()
sys.stderr.write('Problem processing h5 databases during QC - aborting\n')
print("Unexpected error:", sys.exc_info()[0], file = sys.stderr)
raise

return type_isolate
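With the type-isolate special case gone, `prune_edges` falls back to a purely greedy rule: prefer removing queries, otherwise remove the endpoint touching more bad edges. A self-contained sketch of that logic (simplified relative to the real function, which also tracks an existing `failed` set):

```python
from collections import Counter

def prune_edges(long_edges, query_start, min_count=1):
    """Choose one endpoint of each failing edge to remove.

    Nodes with index >= query_start are queries and are removed
    preferentially; among references, the node incident to more bad
    edges is chosen, with no isolate exempt from removal.
    """
    counts = Counter()
    for r, q in long_edges:
        counts[r] += 1
        counts[q] += 1

    failed = set()
    for r, q in long_edges:
        if r in failed or q in failed:
            continue  # edge already resolved by an earlier removal
        if q >= query_start:
            failed.add(q)           # prefer removing queries
        elif r >= query_start:
            failed.add(r)
        elif counts[r] > counts[q] and counts[r] >= min_count:
            failed.add(r)           # higher-degree reference fails
        elif counts[q] >= min_count:
            failed.add(q)
    return failed

# Node 3 sits on all three bad edges, so it alone is removed
failed = prune_edges([(0, 3), (1, 3), (2, 3)], query_start=10)
# failed == {3}
```

This is exactly why the `q == type_isolate` guard could be deleted: without a protected node, the degree comparison alone decides which reference to drop.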

20 changes: 2 additions & 18 deletions docker/Dockerfile
@@ -7,31 +7,15 @@ RUN apt-get update && \
libarmadillo-dev \
libeigen3-dev \
libopenblas-dev \
software-properties-common
python3-graph-tool

RUN git clone https://github.com/somme89/rapidNJ.git && \
cd rapidNJ && \
make && \
mv ./bin/rapidnj /usr/bin &&\
cd .. && rm -r rapidNJ

# Special snowflake treatment for graph_tool because the maintainer
# refuses to make the package pip-installable, meaning we can't depend
# on it in a sane way. Least pain seems to be to install via apt, even
# though that might result in slightly incorrect versions (built
# around system python).
#
# The alternative would be to build it from source
# https://git.skewed.de/count0/graph-tool/-/wikis/installation-instructions#manual-compilation
# but it takes over an hour to compile apparently, so that sounds not
# ideal.
RUN add-apt-repository https://downloads.skewed.de/apt && \
apt-key adv --keyserver keyserver.ubuntu.com \
--recv-key 612DEFB798507F25 && \
apt-get update && \
apt-get install -y --no-install-recommends python3-graph-tool && \
ln -s /usr/lib/python3/dist-packages/graph_tool \
/usr/local/lib/python3.10/site-packages


RUN pip install pybind11[global]

8 changes: 2 additions & 6 deletions docs/qc.rst
@@ -4,7 +4,7 @@ PopPUNK now comes with some basic quality control options, which you should
run on your sketch database made with ``--create-db`` by running ``--qc-db``
as follows::

poppunk --qc-db --ref-db example_db --type-isolate 12754_4_79 --length-range 2000000 3000000
poppunk --qc-db --ref-db example_db --length-range 2000000 3000000

For ``poppunk_assign``, instead add ``--run-qc``::

@@ -50,18 +50,14 @@ and ``--upper-n`` which gives the absolute maximum value.
QC of pairwise distances
------------------------
The second QC step uses the pairwise distances, to enable the removal of outlier samples
that may not be part of the taxon being studied. This is with reference to a type
isolate. The type isolate will be selected by PopPUNK, unless specified using ``--type-isolate``.
that may not be part of the taxon being studied.

By default, the maximum allowed accessory distance is 0.5 to ensure you check for contamination.
However, many species do really have high accessory values above this range, in which case you
should increase the value of ``--max-a-dist``.

The maximum allowed core distance is also 0.5, by default. This can be altered with ``--max-pi-dist``.

All sequences differing from the type isolate by distances greater than either threshold will be
identified by the analysis.

Each isolate may have a proportion of distances that are exactly zero as set by
``--max-zero-dist``.

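The thresholds the updated documentation describes (``--max-pi-dist`` for core, ``--max-a-dist`` for accessory) correspond to a simple vectorised check over the distance matrix, as in ``qcDistMat``. A sketch of that flagging step with made-up example values (the real code additionally maps flagged rows back to edges and sample names):

```python
import numpy as np

# Each row holds (core distance, accessory distance) for one pair of samples
dist_mat = np.array([[0.02, 0.10],
                     [0.60, 0.20],   # core distance exceeds threshold
                     [0.03, 0.70]])  # accessory distance exceeds threshold

max_pi_dist = 0.5   # core threshold (--max-pi-dist default)
max_a_dist = 0.5    # accessory threshold (--max-a-dist default)

# Rows failing either threshold, found in one NumPy pass
bad_rows = np.where((dist_mat[:, 0] > max_pi_dist) |
                    (dist_mat[:, 1] > max_a_dist))[0]
# bad_rows == array([1, 2])
```

With the type isolate removed, flagged pairs are resolved by the degree-based pruning described above rather than by distance to one designated reference.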