@@ -118,8 +118,10 @@ def main():
118
118
# Are there any samples present in all_persistent_samples that are not present in all_latest_samples?
119
119
# If no: Nothing to do here, the perl script will handle it
120
120
# If yes: Iterate the *persistent* clusters row-wise to make sure they aren't decimated... or just give up
121
- all_latest_samples_set = set (all_latest_samples ["latest_cluster_id" ].to_list ())
122
- all_persistent_samples_set = set (all_persistent_samples ["latest_cluster_id" ].to_list ())
121
+ all_latest_samples_set = set (all_latest_samples ["sample_id" ].to_list ())
122
+ all_persistent_samples_set = set (all_persistent_samples ["sample_id" ].to_list ())
123
+ print (all_latest_samples_set )
124
+ print (all_persistent_samples_set )
123
125
if all_persistent_samples_set .issubset (all_latest_samples_set ):
124
126
logging .info ("All persistent samples is a subset of all latest samples" )
125
127
else :
@@ -797,6 +799,7 @@ def main():
797
799
change_report = []
798
800
799
801
for row in all_cluster_information .iter_rows (named = True ):
802
+ logging .debug ("Checking %s" , row ['cluster_id' ])
800
803
try :
801
804
what_is = set (row ["sample_id" ])
802
805
except TypeError :
@@ -805,15 +808,18 @@ def main():
805
808
what_was = set (row ["sample_id_previously" ])
806
809
except TypeError :
807
810
what_was = set ()
811
+ logging .debug ("what is: %s" , what_is )
812
+ logging .debug ("what was: %s" , what_was )
808
813
if len (what_is .intersection (what_was )) == len (what_was ):
809
- change_report .append ({"cluster" : f"{ row ['cluster_id' ]} @{ row ['cluster_distance' ]} " , "gained" : None , "lost" : None , "maintained" : list (what_is .intersection (what_was ))})
814
+ change_report .append ({"cluster" : f"{ row ['cluster_id' ]} @{ row ['cluster_distance' ]} " , "gained" : [] , "lost" : [] , "maintained" : list (what_is .intersection (what_was ))})
810
815
else :
811
816
change_report .append ({"cluster" : f"{ row ['cluster_id' ]} @{ row ['cluster_distance' ]} " , "gained" : list (what_is - what_was ), "lost" : list (what_was - what_is ), "maintained" : list (what_is .intersection (what_was ))})
812
817
change_report_df = pl .DataFrame (change_report ).with_columns ([
813
818
pl .when (pl .col ('gained' ).list .len () == 0 ).then (None ).otherwise (pl .col ('gained' )).alias ("gained" ),
814
819
pl .when (pl .col ('lost' ).list .len () == 0 ).then (None ).otherwise (pl .col ('lost' )).alias ("lost" ),
815
820
pl .when (pl .col ('maintained' ).list .len () == 0 ).then (None ).otherwise (pl .col ('maintained' )).alias ("maintained" ),
816
821
])
822
+ logging .info ("Finished. Here's how clusters have changed:" )
817
823
print (change_report_df )
818
824
change_report_df .write_ndjson (f'change_report{ today .isoformat ()} .json' )
819
825
@@ -876,7 +882,7 @@ def get_nwk_and_matrix_plus_local_mask(big_ol_dataframe, combineddiff):
876
882
logging .debug ("[%s] matUtils mask returned 0 (atree.pb --> masked btree.pb)" , this_cluster_id )
877
883
subprocess .run (f"matUtils extract -i { btreepb } -t { btree } " , shell = True , check = True )
878
884
logging .debug ("[%s] matUtils extract returned 0 (masked btree.pb --> masked btree.nwk)" , this_cluster_id )
879
- subprocess .run (f"python3 /scripts/find_clusters.py { btreepb } { btree } --type BM --collection-name { this_cluster_id } -jmatsu" , shell = True , check = True )
885
+ subprocess .run (f"python3 /scripts/find_clusters.py { btreepb } --type BM --collection-name { this_cluster_id } -jmatsu" , shell = True , check = True )
880
886
logging .debug ("[%s] ran find_clusters.py, looks like it returned 0" , this_cluster_id )
881
887
bmatrix = f"b{ this_cluster_id } _dmtrx.tsv" if os .path .exists (f"b{ this_cluster_id } _dmtrx.tsv" ) else None
882
888
except subprocess .CalledProcessError as e :
0 commit comments