Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
7f5564b
added an if statement to see if type isolate was user-defined
anleip Nov 17, 2025
ad6bb43
changed - to _ in type_isolate
anleip Nov 17, 2025
b5841c8
removed first condition in if statement since it is never true
anleip Nov 18, 2025
1cb5b0d
removed type isolate input from prune_edges()
anleip Nov 20, 2025
6750d2f
removed type isolate generation and usage from qcDistMat()
anleip Nov 20, 2025
d2e8d18
updated description for prune_edges()
anleip Nov 20, 2025
017ab98
Removed type isolate argument and error
anleip Nov 20, 2025
805452a
added the newDistMat variable and changed the input for plot from dis…
anleip Nov 20, 2025
be055df
make the update of newDistMat to global variable in prune_distance_ma…
anleip Nov 20, 2025
3e1778b
Deleted global statement before newDistMat since it is not needed
anleip Nov 21, 2025
c0ec62a
added comment where newDistMat gets updated
anleip Nov 21, 2025
d499017
get newDistMat from remove_qc_fail
anleip Nov 21, 2025
83c99c6
added a return for newDistMat in remove_qc_fail()
anleip Nov 21, 2025
c83a2bd
a maybe cleaner way to get newDistMat from prune_distance_matrix()
anleip Nov 21, 2025
01b8f4b
added comments on newDistMat for remove_qc_fail()
anleip Nov 21, 2025
10fa84b
make distance files despite empty failed list prune_distance_matrix()
anleip Nov 21, 2025
e1d9815
allow h5 file to be written with empty failed list remove_qc_fail()
anleip Nov 21, 2025
800c46f
allow writing output files with no sample to remove
anleip Nov 21, 2025
262a693
updated QC test scripts
anleip Nov 25, 2025
9431036
updated qc test script again
anleip Nov 25, 2025
fe55e14
removed type isolate argument from extractReferences()
anleip Nov 25, 2025
b787cf3
removed type isolate argument in clique pruning
anleip Nov 25, 2025
0ba9481
Removed the last type isolate argument
anleip Nov 25, 2025
34a74d3
Removed type isolate arguments
anleip Nov 25, 2025
81eff19
Removed type isolate argument
anleip Nov 25, 2025
33e48be
Added a comment
anleip Nov 25, 2025
df8a832
Uncommented a qc test
anleip Nov 28, 2025
2b05b3f
output whole tuple
anleip Nov 28, 2025
2d4a500
removed the use of newDistMat, instead update distMat
anleip Nov 28, 2025
19191a5
removed type isolate function
anleip Nov 28, 2025
2602f22
updated version number to 2.7.7
anleip Nov 28, 2025
6649056
Removed reference to type isolate
anleip Nov 28, 2025
2fd57f5
removed software-properties-common
anleip Nov 28, 2025
72ad364
Try graph tools apt package
johnlees Nov 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,6 @@ def get_options():
default = default_max_pi_dist, type = float)
qcGroup.add_argument('--max-zero-dist', help=f"Maximum proportion of zero distances to permit [default = {default_max_zero}]",
default = default_max_zero, type = float)
qcGroup.add_argument('--type-isolate', help='Isolate from which distances will be calculated for pruning [default = None]',
default = None, type = str)
qcGroup.add_argument('--length-sigma', help=f"Number of standard deviations of length distribution beyond "
f"which sequences will be excluded [default = {default_length_sigma}]", default = default_length_sigma, type = int)
qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded '
Expand Down Expand Up @@ -420,10 +418,10 @@ def main():
'max_pi_dist': args.max_pi_dist,
'max_a_dist': args.max_a_dist,
'prop_zero': args.max_zero_dist,
'type_isolate': args.type_isolate
}

refList, queryList, self, distMat = readPickle(distances, enforce_self=True)
newDistMat = distMat

fail_unconditionally = {}
# Unconditional removal
Expand Down Expand Up @@ -454,22 +452,18 @@ def main():
pass_list = set(refList) - fail_unconditionally.keys() - fail_assembly_qc.keys() - fail_dist_qc.keys()
assert(pass_list == (set(refList) - fail_unconditionally.keys()).intersection(set(pass_assembly_qc)).intersection(set(pass_dist_qc)))
passed = [x for x in refList if x in pass_list]
if qc_dict['type_isolate'] is not None and qc_dict['type_isolate'] not in pass_list:
raise RuntimeError('Type isolate ' + qc_dict['type_isolate'] + \
' not found in isolates after QC; check '
'name of type isolate and QC options\n')

sys.stderr.write(f"{len(passed)} samples passed QC\n")
if len(passed) < len(refList):
remove_qc_fail(qc_dict, refList, passed,
# Update newDistMat and write output files
if len(passed) <= len(refList):
newDistMat = remove_qc_fail(qc_dict, refList, passed,
[fail_unconditionally, fail_assembly_qc, fail_dist_qc],
args.ref_db, distMat, output,
args.strand_preserved, args.threads,
args.gpu_graph)

# Plot results
if not args.no_plot:
plot_scatter(distMat,
plot_scatter(newDistMat,
output,
output + " distances")
genome_lengths, ambiguous_bases = get_database_statistics(output)
Expand Down Expand Up @@ -747,7 +741,6 @@ def main():
refList,
output,
outSuffix = dist_string,
type_isolate = args.type_isolate,
threads = args.threads,
use_gpu = args.gpu_graph)
nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices)
Expand Down
6 changes: 1 addition & 5 deletions PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,6 @@ def get_options():
default = default_max_merge, type = int)
qcGroup.add_argument('--betweenness', default=False, action='store_true',
help='Report the betweenness of all the query nodes [default = False]')
qcGroup.add_argument('--type-isolate', help='Isolate from which distances can be calculated for pruning [default = None]',
default = None, type = str)
qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond '
'which sequences will be excluded [default = 5]', default = None, type = int)
qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded '
Expand Down Expand Up @@ -188,10 +186,9 @@ def main():
'prop_zero': args.max_zero_dist,
'max_merge': args.max_merge,
'betweenness': args.betweenness,
'type_isolate': args.type_isolate
}
else:
qc_dict = {'run_qc': False, 'type_isolate': None }
qc_dict = {'run_qc': False}

# Dict of DB access functions for assign_query (which is out of scope)
dbFuncs = setupDBFuncs(args)
Expand Down Expand Up @@ -785,7 +782,6 @@ def assign_query_hdf5(dbFuncs,
merged_queries,
outSuffix = file_extension_string,
existingRefs = existing_ref_list,
type_isolate = qc_dict['type_isolate'],
threads = threads,
use_gpu = gpu_graph,
fast_mode = update_db == "fast")
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/lineages.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def query_db(args):
external_clustering = clustering_file

# Ignore QC at the moment
qc_dict = {'run_qc': False, 'type_isolate': None }
qc_dict = {'run_qc': False}

# Check output file
if args.output is None:
Expand Down
21 changes: 1 addition & 20 deletions PopPUNK/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ def translate_network_indices(G_ref_df, reference_indices):
G_ref = generate_cugraph(G_ref_df, len(reference_indices) - 1, renumber = True)
return(G_ref)

def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix = '', type_isolate = None,
def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix = '',
existingRefs = None, threads = 1, use_gpu = False, fast_mode = False):
"""Extract references for each cluster based on cliques

Expand All @@ -298,8 +298,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
(used for fast assign, to enrich refs with)
outSuffix (str)
Suffix for output file (.refs will be appended)
type_isolate (str)
Isolate to be included in set of references
existingRefs (list)
References that should be used for each clique
use_gpu (bool)
Expand Down Expand Up @@ -328,15 +326,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
else:
merged_query_idx = set()

# Add type isolate, if necessary
type_isolate_index = None
if type_isolate is not None:
if type_isolate in dbOrder:
type_isolate_index = dbOrder.index(type_isolate)
else:
sys.stderr.write('Type isolate ' + type_isolate + ' not found\n')
sys.exit(1)

if use_gpu:
if fast_mode:
raise RuntimeError("GPU graphs not yet supported with --update-db fast")
Expand All @@ -350,10 +339,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
reference_index_df = partition_assignments.groupby('partition').nth(0)
reference_indices = reference_index_df['vertex'].to_arrow().to_pylist()

# Add type isolate if necessary - before edges are added
if type_isolate_index is not None and type_isolate_index not in reference_indices:
reference_indices.append(type_isolate_index)

# Order found references as in sketchlib database
reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)]

Expand Down Expand Up @@ -439,10 +424,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
# Returns nested lists, which need to be flattened
reference_indices = set([entry for sublist in ref_lists for entry in sublist])

# Add type isolate if necessary - before edges are added
if type_isolate_index is not None and type_isolate_index not in reference_indices:
reference_indices.add(type_isolate_index)

sys.stderr.write("Reconstructing clusters with shortest paths\n")
if gt.openmp_enabled():
gt.openmp_set_num_threads(threads)
Expand Down
37 changes: 17 additions & 20 deletions PopPUNK/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,12 @@ def prune_distance_matrix(refList, remove_seqs_in, distMat, output):
else:
raise RuntimeError("Row name mismatch. Old: " + ref1 + "," + ref2 + "\n"
"New: " + newRef1 + "," + newRef2 + "\n")

storePickle(newRefList, newRefList, True, newDistMat, output)

else:
newRefList = refList
newDistMat = distMat

storePickle(newRefList, newRefList, True, newDistMat, output)

# return new distance matrix and sequence lists
return newRefList, newDistMat
Expand Down Expand Up @@ -270,11 +271,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
names = refList + queryList
self = False

# Pick type isolate if not supplied
if qc_dict['type_isolate'] is None:
qc_dict['type_isolate'] = pickTypeIsolate(ref_db, refList)
sys.stderr.write('Selected type isolate for distance QC is ' + qc_dict['type_isolate'] + '\n')

# First check with numpy, which is quicker than iterating over everything
#long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist()
long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])],0,1)[0].tolist()
Expand All @@ -285,7 +281,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
int_offset = 0)

failed = prune_edges(long_edges,
type_isolate=names.index(qc_dict['type_isolate']),
query_start=len(refList),
allow_ref_ref=self)
# Convert the edge IDs back to sample names
Expand All @@ -302,7 +297,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
num_ref = len(refList),
int_offset = 0)
failed = prune_edges(zero_edges,
type_isolate=names.index(qc_dict['type_isolate']),
query_start=len(refList),
failed=failed,
min_count=zero_count,
Expand Down Expand Up @@ -366,18 +360,15 @@ def qcQueryAssignments(rList, qList, query_assignments, max_clusters,
retained_samples.append(query)
return retained_samples, failed_samples

def prune_edges(long_edges, type_isolate, query_start,
def prune_edges(long_edges, query_start,
failed=None, min_count=1, allow_ref_ref=True):
"""Gives a list of failed vertices from a list of failing
edges. Tries to prune by those nodes with highest degree of
bad nodes, preferentially removes queries, and doesn't remove
the type isolate
bad nodes, preferentially removes queries.

Args:
long_edges (list of tuples)
List of bad edges as node IDs
type_isolate (int)
The node ID of the type isolate
query_start (int)
The first node ID which corresponds to queries
failed (set or None)
Expand Down Expand Up @@ -405,7 +396,7 @@ def prune_edges(long_edges, type_isolate, query_start,
# Do not add any refs if querying
if r < query_start and q < query_start:
if allow_ref_ref:
if q == type_isolate or (counts[r] > counts[q] and counts[r] >= min_count):
if (counts[r] > counts[q] and counts[r] >= min_count):
failed.add(r)
elif counts[q] >= min_count:
failed.add(q)
Expand All @@ -423,7 +414,7 @@ def prune_edges(long_edges, type_isolate, query_start,
def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
strand_preserved=False, threads=1, use_gpu=False):
"""Removes samples failing QC from the database and distances. Also
recalculates random match chances.
recalculates random match chances. Return a new distance matrix.

Args:
qc_dict (dict)
Expand All @@ -449,6 +440,9 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
[default = 1].
use_gpu (bool)
Whether GPU libraries were used to generate the original network.
Return:
newDistMat (numpy.array)
Updated version of distMat
"""
from .sketchlib import removeFromDB, addRandom, readDBParams

Expand All @@ -467,7 +461,7 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
passed,
full_names = True)
# new database file if pruning
if failed and not qc_dict['no_remove']:
if not qc_dict['no_remove']:
# sys.stderr.write(f"Removing {len(failed)} samples from database and distances\n")
tmp_filtered_db_name = f"{prefix}/filtered.{os.path.basename(prefix)}.h5"
output_db_name = f"{prefix}/{os.path.basename(prefix)}.h5"
Expand All @@ -479,10 +473,10 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
os.rename(tmp_filtered_db_name, output_db_name)

# Remove from the distMat too
prune_distance_matrix(names,
newDistMat = prune_distance_matrix(names,
failed,
distMat,
f"{prefix}/{os.path.basename(prefix)}.dists")
f"{prefix}/{os.path.basename(prefix)}.dists")[1]

# Update the graph
prune_graph(ref_db,
Expand All @@ -501,6 +495,8 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
# write failing & reasons
write_qc_failure_report(failed, fail_dicts, prefix)

return newDistMat

def write_qc_failure_report(failed_samples, fail_dicts, output_prefix):
"""
Writes a report of failed samples and their reasons to a file.
Expand Down Expand Up @@ -535,7 +531,8 @@ def get_failure_reasons(sample, fail_dicts):
if sample in fail_dict
for reason in fail_dict[sample]
]


# Type isolate is no longer supported in PopPUNK
def pickTypeIsolate(prefix, refList):
"""Selects a type isolate as that with a minimal proportion
of missing data.
Expand Down
12 changes: 8 additions & 4 deletions test/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,13 @@

# create database with different QC options
sys.stderr.write("Running database QC test (--qc-db)\n")
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --type-isolate \"12754_4#79\" --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --type-isolate \"12754_4#79\" --length-range 2000000 3000000 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --type-isolate \"12754_4#79\" --remove-samples remove.txt --overwrite", shell=True, check=True)
# Type isolate can no longer be user-specified. Used to have --type-isolate \"12754_4#79\"
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --length-range 2000000 3000000 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --remove-samples remove.txt --overwrite", shell=True, check=True)
# test if QC gives the same output as the original distances when there are no samples to remove
#subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_qc --output example_qc_noRemove", shell=True, check=True)


#fit GMM
sys.stderr.write("Running GMM model fit (--fit-model gmm)\n")
Expand Down Expand Up @@ -60,7 +64,7 @@
subprocess.run(python_cmd + " ../poppunk-runner.py --use-model --ref-db example_db --model-dir example_db --output example_use --overwrite", shell=True, check=True)

#test pruning a database with a graph
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_network_qc --type-isolate \"12754_4#79\" --remove-samples remove.txt --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_network_qc --remove-samples remove.txt --overwrite", shell=True, check=True)

# tests of other command line programs
sys.stderr.write("Testing C++ extension\n")
Expand Down
Loading