Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
7f5564b
added an if statement to see if type isolate was user-defined
anleip Nov 17, 2025
ad6bb43
changed - to _ in type_isolate
anleip Nov 17, 2025
b5841c8
removed first condition in if statement since it is never true
anleip Nov 18, 2025
1cb5b0d
removed type isolate input from prune_edges()
anleip Nov 20, 2025
6750d2f
removed type isolate generation and usage from qcDistMat()
anleip Nov 20, 2025
d2e8d18
updated description for prune_edges()
anleip Nov 20, 2025
017ab98
Removed type isolate argument and error
anleip Nov 20, 2025
805452a
added the newDistMat variable and changed the input for plot from dis…
anleip Nov 20, 2025
be055df
make the update of newDistMat to global variable in prune_distance_ma…
anleip Nov 20, 2025
3e1778b
Deleted global statement before newDistMat since it is not needed
anleip Nov 21, 2025
c0ec62a
added comment where newDistMat gets updated
anleip Nov 21, 2025
d499017
get newDistMat from remove_qc_fail
anleip Nov 21, 2025
83c99c6
added a return for newDistMat in remove_qc_fail()
anleip Nov 21, 2025
c83a2bd
a maybe cleaner way to get newDistMat from prune_distance_matrix()
anleip Nov 21, 2025
01b8f4b
added comments on newDistMat for remove_qc_fail()
anleip Nov 21, 2025
10fa84b
make distance files despite empty failed list prune_distance_matrix()
anleip Nov 21, 2025
e1d9815
allow h5 file to be written with empty failed list remove_qc_fail()
anleip Nov 21, 2025
800c46f
allow writing output files with no sample to remove
anleip Nov 21, 2025
262a693
updated QC test scripts
anleip Nov 25, 2025
9431036
updated qc test script again
anleip Nov 25, 2025
fe55e14
removed type isolate argument from extractReferences()
anleip Nov 25, 2025
b787cf3
removed type isolate argument in clique pruning
anleip Nov 25, 2025
0ba9481
Removed the last type isolate argument
anleip Nov 25, 2025
34a74d3
Removed type isolate arguments
anleip Nov 25, 2025
81eff19
Removed type isolate argument
anleip Nov 25, 2025
33e48be
Added a comment
anleip Nov 25, 2025
df8a832
Uncommented a qc test
anleip Nov 28, 2025
2b05b3f
output whole tuple
anleip Nov 28, 2025
2d4a500
removed the use of newDistMat, instead update distMat
anleip Nov 28, 2025
19191a5
removed type isolate function
anleip Nov 28, 2025
2602f22
updated version number to 2.7.7
anleip Nov 28, 2025
6649056
Removed reference to type isolate
anleip Nov 28, 2025
2fd57f5
removed software-properties-common
anleip Nov 28, 2025
72ad364
Try graph tools apt package
johnlees Nov 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,6 @@ def get_options():
default = default_max_pi_dist, type = float)
qcGroup.add_argument('--max-zero-dist', help=f"Maximum proportion of zero distances to permit [default = {default_max_zero}]",
default = default_max_zero, type = float)
qcGroup.add_argument('--type-isolate', help='Isolate from which distances will be calculated for pruning [default = None]',
default = None, type = str)
qcGroup.add_argument('--length-sigma', help=f"Number of standard deviations of length distribution beyond "
f"which sequences will be excluded [default = {default_length_sigma}]", default = default_length_sigma, type = int)
qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded '
Expand Down Expand Up @@ -420,10 +418,10 @@ def main():
'max_pi_dist': args.max_pi_dist,
'max_a_dist': args.max_a_dist,
'prop_zero': args.max_zero_dist,
'type_isolate': args.type_isolate
}

refList, queryList, self, distMat = readPickle(distances, enforce_self=True)
newDistMat = distMat

fail_unconditionally = {}
# Unconditional removal
Expand Down Expand Up @@ -454,22 +452,18 @@ def main():
pass_list = set(refList) - fail_unconditionally.keys() - fail_assembly_qc.keys() - fail_dist_qc.keys()
assert(pass_list == (set(refList) - fail_unconditionally.keys()).intersection(set(pass_assembly_qc)).intersection(set(pass_dist_qc)))
passed = [x for x in refList if x in pass_list]
if qc_dict['type_isolate'] is not None and qc_dict['type_isolate'] not in pass_list:
raise RuntimeError('Type isolate ' + qc_dict['type_isolate'] + \
' not found in isolates after QC; check '
'name of type isolate and QC options\n')

sys.stderr.write(f"{len(passed)} samples passed QC\n")
if len(passed) < len(refList):
remove_qc_fail(qc_dict, refList, passed,
# Update newDistMat and write output files
if len(passed) <= len(refList):
newDistMat = remove_qc_fail(qc_dict, refList, passed,
[fail_unconditionally, fail_assembly_qc, fail_dist_qc],
args.ref_db, distMat, output,
args.strand_preserved, args.threads,
args.gpu_graph)

# Plot results
if not args.no_plot:
plot_scatter(distMat,
plot_scatter(newDistMat,
output,
output + " distances")
genome_lengths, ambiguous_bases = get_database_statistics(output)
Expand Down Expand Up @@ -747,7 +741,6 @@ def main():
refList,
output,
outSuffix = dist_string,
type_isolate = args.type_isolate,
threads = args.threads,
use_gpu = args.gpu_graph)
nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices)
Expand Down
6 changes: 1 addition & 5 deletions PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,6 @@ def get_options():
default = default_max_merge, type = int)
qcGroup.add_argument('--betweenness', default=False, action='store_true',
help='Report the betweenness of all the query nodes [default = False]')
qcGroup.add_argument('--type-isolate', help='Isolate from which distances can be calculated for pruning [default = None]',
default = None, type = str)
qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond '
'which sequences will be excluded [default = 5]', default = None, type = int)
qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded '
Expand Down Expand Up @@ -188,10 +186,9 @@ def main():
'prop_zero': args.max_zero_dist,
'max_merge': args.max_merge,
'betweenness': args.betweenness,
'type_isolate': args.type_isolate
}
else:
qc_dict = {'run_qc': False, 'type_isolate': None }
qc_dict = {'run_qc': False}

# Dict of DB access functions for assign_query (which is out of scope)
dbFuncs = setupDBFuncs(args)
Expand Down Expand Up @@ -785,7 +782,6 @@ def assign_query_hdf5(dbFuncs,
merged_queries,
outSuffix = file_extension_string,
existingRefs = existing_ref_list,
type_isolate = qc_dict['type_isolate'],
threads = threads,
use_gpu = gpu_graph,
fast_mode = update_db == "fast")
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/lineages.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def query_db(args):
external_clustering = clustering_file

# Ignore QC at the moment
qc_dict = {'run_qc': False, 'type_isolate': None }
qc_dict = {'run_qc': False}

# Check output file
if args.output is None:
Expand Down
21 changes: 1 addition & 20 deletions PopPUNK/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ def translate_network_indices(G_ref_df, reference_indices):
G_ref = generate_cugraph(G_ref_df, len(reference_indices) - 1, renumber = True)
return(G_ref)

def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix = '', type_isolate = None,
def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix = '',
existingRefs = None, threads = 1, use_gpu = False, fast_mode = False):
"""Extract references for each cluster based on cliques

Expand All @@ -298,8 +298,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
(used for fast assign, to enrich refs with)
outSuffix (str)
Suffix for output file (.refs will be appended)
type_isolate (str)
Isolate to be included in set of references
existingRefs (list)
References that should be used for each clique
use_gpu (bool)
Expand Down Expand Up @@ -328,15 +326,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
else:
merged_query_idx = set()

# Add type isolate, if necessary
type_isolate_index = None
if type_isolate is not None:
if type_isolate in dbOrder:
type_isolate_index = dbOrder.index(type_isolate)
else:
sys.stderr.write('Type isolate ' + type_isolate + ' not found\n')
sys.exit(1)

if use_gpu:
if fast_mode:
raise RuntimeError("GPU graphs not yet supported with --update-db fast")
Expand All @@ -350,10 +339,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
reference_index_df = partition_assignments.groupby('partition').nth(0)
reference_indices = reference_index_df['vertex'].to_arrow().to_pylist()

# Add type isolate if necessary - before edges are added
if type_isolate_index is not None and type_isolate_index not in reference_indices:
reference_indices.append(type_isolate_index)

# Order found references as in sketchlib database
reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)]

Expand Down Expand Up @@ -439,10 +424,6 @@ def extractReferences(G, dbOrder, outPrefix, merged_queries = list(), outSuffix
# Returns nested lists, which need to be flattened
reference_indices = set([entry for sublist in ref_lists for entry in sublist])

# Add type isolate if necessary - before edges are added
if type_isolate_index is not None and type_isolate_index not in reference_indices:
reference_indices.add(type_isolate_index)

sys.stderr.write("Reconstructing clusters with shortest paths\n")
if gt.openmp_enabled():
gt.openmp_set_num_threads(threads)
Expand Down
37 changes: 17 additions & 20 deletions PopPUNK/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,12 @@ def prune_distance_matrix(refList, remove_seqs_in, distMat, output):
else:
raise RuntimeError("Row name mismatch. Old: " + ref1 + "," + ref2 + "\n"
"New: " + newRef1 + "," + newRef2 + "\n")

storePickle(newRefList, newRefList, True, newDistMat, output)

else:
newRefList = refList
newDistMat = distMat

storePickle(newRefList, newRefList, True, newDistMat, output)

# return new distance matrix and sequence lists
return newRefList, newDistMat
Expand Down Expand Up @@ -270,11 +271,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
names = refList + queryList
self = False

# Pick type isolate if not supplied
if qc_dict['type_isolate'] is None:
qc_dict['type_isolate'] = pickTypeIsolate(ref_db, refList)
sys.stderr.write('Selected type isolate for distance QC is ' + qc_dict['type_isolate'] + '\n')

# First check with numpy, which is quicker than iterating over everything
#long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist()
long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])],0,1)[0].tolist()
Expand All @@ -285,7 +281,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
int_offset = 0)

failed = prune_edges(long_edges,
type_isolate=names.index(qc_dict['type_isolate']),
query_start=len(refList),
allow_ref_ref=self)
# Convert the edge IDs back to sample names
Expand All @@ -302,7 +297,6 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict):
num_ref = len(refList),
int_offset = 0)
failed = prune_edges(zero_edges,
type_isolate=names.index(qc_dict['type_isolate']),
query_start=len(refList),
failed=failed,
min_count=zero_count,
Expand Down Expand Up @@ -366,18 +360,15 @@ def qcQueryAssignments(rList, qList, query_assignments, max_clusters,
retained_samples.append(query)
return retained_samples, failed_samples

def prune_edges(long_edges, type_isolate, query_start,
def prune_edges(long_edges, query_start,
failed=None, min_count=1, allow_ref_ref=True):
"""Gives a list of failed vertices from a list of failing
edges. Tries to prune by those nodes with highest degree of
bad nodes, preferentially removes queries, and doesn't remove
the type isolate
bad nodes, preferentially removes queries.

Args:
long_edges (list of tuples)
List of bad edges as node IDs
type_isolate (int)
The node ID of the type isolate
query_start (int)
The first node ID which corresponds to queries
failed (set or None)
Expand Down Expand Up @@ -405,7 +396,7 @@ def prune_edges(long_edges, type_isolate, query_start,
# Do not add any refs if querying
if r < query_start and q < query_start:
if allow_ref_ref:
if q == type_isolate or (counts[r] > counts[q] and counts[r] >= min_count):
if (counts[r] > counts[q] and counts[r] >= min_count):
failed.add(r)
elif counts[q] >= min_count:
failed.add(q)
Expand All @@ -423,7 +414,7 @@ def prune_edges(long_edges, type_isolate, query_start,
def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
strand_preserved=False, threads=1, use_gpu=False):
"""Removes samples failing QC from the database and distances. Also
recalculates random match chances.
recalculates random match chances. Return a new distance matrix.

Args:
qc_dict (dict)
Expand All @@ -449,6 +440,9 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
[default = 1].
use_gpu (bool)
Whether GPU libraries were used to generate the original network.
Return:
newDistMat (numpy.array)
Updated version of distMat
"""
from .sketchlib import removeFromDB, addRandom, readDBParams

Expand All @@ -467,7 +461,7 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
passed,
full_names = True)
# new database file if pruning
if failed and not qc_dict['no_remove']:
if not qc_dict['no_remove']:
# sys.stderr.write(f"Removing {len(failed)} samples from database and distances\n")
tmp_filtered_db_name = f"{prefix}/filtered.{os.path.basename(prefix)}.h5"
output_db_name = f"{prefix}/{os.path.basename(prefix)}.h5"
Expand All @@ -479,10 +473,10 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
os.rename(tmp_filtered_db_name, output_db_name)

# Remove from the distMat too
prune_distance_matrix(names,
newDistMat = prune_distance_matrix(names,
failed,
distMat,
f"{prefix}/{os.path.basename(prefix)}.dists")
f"{prefix}/{os.path.basename(prefix)}.dists")[1]

# Update the graph
prune_graph(ref_db,
Expand All @@ -501,6 +495,8 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
# write failing & reasons
write_qc_failure_report(failed, fail_dicts, prefix)

return newDistMat

def write_qc_failure_report(failed_samples, fail_dicts, output_prefix):
"""
Writes a report of failed samples and their reasons to a file.
Expand Down Expand Up @@ -535,7 +531,8 @@ def get_failure_reasons(sample, fail_dicts):
if sample in fail_dict
for reason in fail_dict[sample]
]


# Type isolate is no longer supported in PopPUNK
def pickTypeIsolate(prefix, refList):
"""Selects a type isolate as that with a minimal proportion
of missing data.
Expand Down
12 changes: 8 additions & 4 deletions test/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,13 @@

# create database with different QC options
sys.stderr.write("Running database QC test (--qc-db)\n")
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --type-isolate \"12754_4#79\" --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --type-isolate \"12754_4#79\" --length-range 2000000 3000000 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --type-isolate \"12754_4#79\" --remove-samples remove.txt --overwrite", shell=True, check=True)
# Type isolate can no longer be user-specified. Used to have --type-isolate \"12754_4#79\"
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --length-range 2000000 3000000 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_qc --remove-samples remove.txt --overwrite", shell=True, check=True)
# test if QC gives the same output as the original distances when there are no samples to remove
#subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_qc --output example_qc_noRemove", shell=True, check=True)


#fit GMM
sys.stderr.write("Running GMM model fit (--fit-model gmm)\n")
Expand Down Expand Up @@ -60,7 +64,7 @@
subprocess.run(python_cmd + " ../poppunk-runner.py --use-model --ref-db example_db --model-dir example_db --output example_use --overwrite", shell=True, check=True)

#test pruning a database with a graph
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_network_qc --type-isolate \"12754_4#79\" --remove-samples remove.txt --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --qc-db --ref-db example_db --output example_network_qc --remove-samples remove.txt --overwrite", shell=True, check=True)

# tests of other command line programs
sys.stderr.write("Testing C++ extension\n")
Expand Down
Loading