Skip to content

Commit 4fd70fa

Browse files
committed
Various bugfixes
1 parent 2402f6e commit 4fd70fa

File tree

3 files changed

+15
-10
lines changed

3 files changed

+15
-10
lines changed

matutils_and_friends.wdl

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -819,7 +819,6 @@ task cluster_CDPH_method {
819819
echo "Contents of workdir:"
820820
tree
821821
# A_big.nwk big tree, nwk format
822-
# all_neighbors.tsv
823822
# LONELY-subtree-n.nwk (n as variable) subtrees (usually multiple) of unclustered samples
824823
# lonely-subtree-assignments.tsv which subtree each unclustered sample ended up in
825824
# cluster_annotation_workdirIDs.tsv can be used to annotate by nonpersistent cluster (but isn't, at least not yet)
@@ -864,7 +863,7 @@ task cluster_CDPH_method {
864863
File? change_report_json = "change_report" + today + ".json"
865864

866865
# brand new samples list, not fully finished processing but here ya go
867-
File new_samples = "new_samples" + today + ".tsv"
866+
File? new_samples = "new_samples" + today + ".tsv"
868867

869868
# trees, all in nwk format for now
870869
# A = not internally masked
@@ -883,19 +882,19 @@ task cluster_CDPH_method {
883882
Array[File]? bcluster_matrices = glob("b*_dmtrx.tsv") # !UnnecessaryQuantifier
884883
885884
# cluster information
886-
File unclustered_neighbors = "lonely_closest_relatives.txt"
885+
File? unclustered_neighbors = "unclustered_neighbors.txt"
887886
#File rosetta_stone_20 = "rosetta_stone_20.tsv"
888887
#File rosetta_stone_10 = "rosetta_stone_10.tsv"
889888
#File rosetta_stone_5 = "rosetta_stone_5.tsv"
890-
File nearest_and_furtherst_info = "all_neighbors.tsv"
889+
#File nearest_and_furtherst_info = "all_neighbors.tsv"
891890
Int n_big_clusters = read_int("n_big_clusters")
892891
Int n_samples_in_clusters = read_int("n_samples_in_clusters")
893892
Int n_samples_processed = read_int("n_samples_processed")
894893
Int n_unclustered = read_int("n_unclustered")
895894

896895
# old, maybe restore later?
897896
#Array[File] abig_subtrees = glob("abig-subtree-*.nwk")
898-
File samp_cluster = "samp_persiscluster" + today + ".tsv" # for nextstrain conversion
897+
File? samp_cluster = "samp_persiscluster" + today + ".tsv" # for nextstrain conversion
899898
#File? persistent_cluster_translator = "mapped_persistent_cluster_ids_to_new_cluster_ids.tsv"
900899
#Array[File] cluster_trees_json = glob("*.json")
901900
#Array[File] metadata_tsvs = glob("*.tsv") # for auspice.us, which supports nwk

process_clusters.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,10 @@ def main():
118118
# Are there any samples present in all_persistent_samples not present in all_latest_samples?
119119
# If no: Literally who cares, the perl script will handle it
120120
# If yes: Iterate the *persistent* clusters rowwise to make sure they aren't decimated... or just give up
121-
all_latest_samples_set = set(all_latest_samples["latest_cluster_id"].to_list())
122-
all_persistent_samples_set = set(all_persistent_samples["latest_cluster_id"].to_list())
121+
all_latest_samples_set = set(all_latest_samples["sample_id"].to_list())
122+
all_persistent_samples_set = set(all_persistent_samples["sample_id"].to_list())
123+
print(all_latest_samples_set)
124+
print(all_persistent_samples_set)
123125
if all_persistent_samples_set.issubset(all_latest_samples_set):
124126
logging.info("All persistent samples is a subset of all latest samples")
125127
else:
@@ -797,6 +799,7 @@ def main():
797799
change_report = []
798800

799801
for row in all_cluster_information.iter_rows(named=True):
802+
logging.debug("Checking %s", row['cluster_id'])
800803
try:
801804
what_is = set(row["sample_id"])
802805
except TypeError:
@@ -805,15 +808,18 @@ def main():
805808
what_was = set(row["sample_id_previously"])
806809
except TypeError:
807810
what_was = set()
811+
logging.debug("what is: %s", what_is)
812+
logging.debug("what was: %s", what_was)
808813
if len(what_is.intersection(what_was)) == len(what_was):
809-
change_report.append({"cluster": f"{row['cluster_id']}@{row['cluster_distance']}", "gained": None, "lost": None, "maintained": list(what_is.intersection(what_was))})
814+
change_report.append({"cluster": f"{row['cluster_id']}@{row['cluster_distance']}", "gained": [], "lost": [], "maintained": list(what_is.intersection(what_was))})
810815
else:
811816
change_report.append({"cluster": f"{row['cluster_id']}@{row['cluster_distance']}", "gained": list(what_is - what_was), "lost": list(what_was - what_is), "maintained": list(what_is.intersection(what_was))})
812817
change_report_df = pl.DataFrame(change_report).with_columns([
813818
pl.when(pl.col('gained').list.len() == 0).then(None).otherwise(pl.col('gained')).alias("gained"),
814819
pl.when(pl.col('lost').list.len() == 0).then(None).otherwise(pl.col('lost')).alias("lost"),
815820
pl.when(pl.col('maintained').list.len() == 0).then(None).otherwise(pl.col('maintained')).alias("maintained"),
816821
])
822+
logging.info("Finished. Here's how clusters have changed:")
817823
print(change_report_df)
818824
change_report_df.write_ndjson(f'change_report{today.isoformat()}.json')
819825

@@ -876,7 +882,7 @@ def get_nwk_and_matrix_plus_local_mask(big_ol_dataframe, combineddiff):
876882
logging.debug("[%s] matUtils mask returned 0 (atree.pb --> masked btree.pb)", this_cluster_id)
877883
subprocess.run(f"matUtils extract -i {btreepb} -t {btree}", shell=True, check=True)
878884
logging.debug("[%s] matUtils extract returned 0 (masked btree.pb --> masked btree.nwk)", this_cluster_id)
879-
subprocess.run(f"python3 /scripts/find_clusters.py {btreepb} {btree} --type BM --collection-name {this_cluster_id} -jmatsu", shell=True, check=True)
885+
subprocess.run(f"python3 /scripts/find_clusters.py {btreepb} --type BM --collection-name {this_cluster_id} -jmatsu", shell=True, check=True)
880886
logging.debug("[%s] ran find_clusters.py, looks like it returned 0", this_cluster_id)
881887
bmatrix = f"b{this_cluster_id}_dmtrx.tsv" if os.path.exists(f"b{this_cluster_id}_dmtrx.tsv") else None
882888
except subprocess.CalledProcessError as e:

tree_nine.wdl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -271,7 +271,7 @@ workflow Tree_Nine {
271271
# other cluster information
272272
File? new_persistent_ids = cluster.new_persistent_ids
273273
File? new_persistent_meta = cluster.new_persistent_meta
274-
File? nearest_and_furtherst_info = cluster.nearest_and_furtherst_info
274+
File? unclustered_neighbors = cluster.unclustered_neighbors
275275
File? final_cluster_information_json = cluster.final_cluster_information_json
276276
Int? nb_n_clusters = cluster.n_big_clusters
277277
Int? nb_n_samps_unclustered = cluster.n_unclustered

0 commit comments

Comments
 (0)