@@ -80,51 +80,45 @@ def initialize_vv_log(outdir):
     """Initialize or append to the VV_log.csv file."""
     vv_log_path = os.path.join(outdir, "VV_log.csv")
 
-    # Check if file exists
     if not os.path.exists(vv_log_path):
-        # Create new file with header
         with open(vv_log_path, 'w') as f:
-            f.write("component,sample_id,check_name,status,message,details\n")
+            f.write("component,sample_id,check_name,status,flag_code,message,details\n")
 
     return vv_log_path
 
 def log_check_result(log_path, component, sample_id, check_name, status, message="", details=""):
     """Log check result to the VV_log.csv file."""
-    # Properly escape and format fields for CSV
     def escape_field(field, is_details=False):
-        # Convert to string if not already
-        field = str(field)
-
-        # Replace newlines with semicolons to keep CSV valid
+        if not isinstance(field, str):
+            field = str(field)
         field = field.replace('\n', '; ')
-
-        # For details field, replace commas with semicolons to avoid CSV quoting
         if is_details and ',' in field:
             field = field.replace(', ', '; ')
-
-        # Also replace commas in message fields that contain percentages
-        if '%' in field and ',' in field:
-            field = field.replace(', ', '; ')
-
-        # If the field contains commas or quotes (but not just semicolons), wrap it in quotes
         if ',' in field or '"' in field:
-            # Double any quotes within the field
             field = field.replace('"', '""')
-            # Wrap in quotes
             field = f'"{field}"'
         return field
+
+    # Map status strings to flag codes
+    flag_codes = {
+        "GREEN": "20",
+        "YELLOW": "30",
+        "RED": "50",
+        "HALT": "80"
+    }
+
+    # Get flag code based on status
+    flag_code = flag_codes.get(status, "80")  # Default to HALT if unknown status
 
-    # Format all fields
     component = escape_field(component)
     sample_id = escape_field(sample_id)
     check_name = escape_field(check_name)
     status = escape_field(status)
     message = escape_field(message)
-    details = escape_field(details, True)  # Specify this is a details field
+    details = escape_field(details, True)
 
-    # Write the formatted line
     with open(log_path, 'a') as f:
-        f.write(f"{component},{sample_id},{check_name},{status},{message},{details}\n")
+        f.write(f"{component},{sample_id},{check_name},{status},{flag_code},{message},{details}\n")
 
 def check_gzip_integrity(outdir, samples, paired_end, log_path):
     """Check GZIP integrity for all FASTQ files using gzip -t."""
@@ -286,76 +280,88 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_GLbulkRNAseq"):
     if not os.path.exists(multiqc_zip):
         print(f"WARNING: MultiQC data not found at: {multiqc_zip}")
-        log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
+        log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "HALT",
                          "MultiQC data not found", multiqc_zip)
         return False
 
-    print(f"Checking samples in MultiQC data: {multiqc_zip}")
-
-    # Create a temporary directory to extract files
-    with tempfile.TemporaryDirectory() as temp_dir:
-        try:
-            # Extract the zip file
+    try:
+        # Extract and check the data
+        with tempfile.TemporaryDirectory() as temp_dir:
             with zipfile.ZipFile(multiqc_zip, 'r') as zip_ref:
                 zip_ref.extractall(temp_dir)
 
-            # Path to the MultiQC data JSON file (new structure)
             json_path = os.path.join(temp_dir, f"align_multiqc{assay_suffix}_data", "multiqc_data.json")
 
             if not os.path.exists(json_path):
-                print(f"WARNING: No multiqc_data.json file found in the expected location")
                 log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
                                  "multiqc_data.json not found in zip", "")
                 return False
 
-            # Parse the JSON file
             with open(json_path) as f:
                 multiqc_data = json.load(f)
 
-            # Check for samples in report_data_sources
-            if 'report_data_sources' not in multiqc_data:
-                print(f"WARNING: No report_data_sources found in MultiQC data")
+            # Look for Bowtie2 data in general stats
+            if not multiqc_data.get('report_general_stats_data'):
                 log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
-                                 "No report_data_sources in MultiQC data", "")
+                                 "No general stats data in MultiQC data", "")
                 return False
 
-            # Look for Bowtie2 section
-            if 'Bowtie 2 / HiSAT2' not in multiqc_data['report_data_sources']:
-                print(f"WARNING: No Bowtie2 data found in MultiQC data")
+            # Get samples from general stats
+            multiqc_samples = set()
+            for stats_section in multiqc_data['report_general_stats_data']:
+                for sample in stats_section.keys():
+                    # Clean sample name if needed
+                    base_sample = os.path.basename(sample)
+                    multiqc_samples.add(base_sample)
+
+            # Check for PE/SE mismatch
+            is_paired_in_runsheet = paired_end[0]
+            has_paired_data = False
+
+            # Check multiple indicators of paired data:
+            # 1. Check unmapped files
+            unmapped_dir = os.path.join(outdir, "02-Bowtie2_Alignment")
+            has_paired_files = any(os.path.exists(os.path.join(unmapped_dir, sample, f"{sample}.unmapped.fastq.2.gz"))
+                                   for sample in samples)
+
+            # 2. Check stats data for paired indicators
+            for stats_section in multiqc_data['report_general_stats_data']:
+                for sample_stats in stats_section.values():
+                    if any('paired' in key.lower() for key in sample_stats.keys()):
+                        has_paired_data = True
+                        break
+                if has_paired_data:
+                    break
+
+            has_paired_data = has_paired_data or has_paired_files
+
+            if is_paired_in_runsheet and not has_paired_data:
                 log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
-                                 "No Bowtie2 data in MultiQC data", "")
+                                 "Paired-end/single-end mismatch",
+                                 "Runsheet specifies paired-end but data appears to be single-end")
                 return False
 
-            # Get samples from the Bowtie2 section
-            bowtie_data = multiqc_data['report_data_sources']['Bowtie 2 / HiSAT2']['all_sections']
-            multiqc_samples = set(bowtie_data.keys())
-
             # Check for missing samples
             missing_samples = []
             for sample in samples:
                 if sample not in multiqc_samples:
                     missing_samples.append(sample)
 
             if missing_samples:
-                print(f"WARNING: The following samples are missing from the MultiQC data:")
-                for sample in missing_samples:
-                    print(f"  - {sample}")
                 log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
                                  f"Missing {len(missing_samples)} samples in MultiQC data",
                                  "; ".join(missing_samples))
                 return False
 
-            print(f"All {len(samples)} samples found in MultiQC data")
             log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "GREEN",
                              f"All {len(samples)} samples found in data",
                              f"Checked presence of {len(samples)} samples in Bowtie2 section of MultiQC data")
             return True
 
-        except Exception as e:
-            print(f"Error checking MultiQC data: {str(e)}")
-            log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
-                             f"Error checking MultiQC data: {str(e)}", "")
-            return False
+    except Exception as e:
+        log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
+                         f"Error checking MultiQC data: {str(e)}", "")
+        return False
 
 def get_bowtie2_multiqc_stats(outdir, samples, paired_end, log_path, assay_suffix="_GLbulkRNAseq"):
     """Get alignment statistics from MultiQC data."""
@@ -432,7 +438,7 @@ def get_bowtie2_multiqc_stats(outdir, samples, paired_end, log_path, assay_suffix="_GLbulkRNAseq"):
         return None
 
 def report_multiqc_outliers(outdir, multiqc_data, log_path):
-    """Report statistical outliers in MultiQC statistics using standard deviations."""
+    """Identify and report outliers in MultiQC statistics."""
     metrics = ['total_reads', 'overall_alignment_rate', 'aligned_none', 'aligned_one', 'aligned_multi']
     outlier_samples = set()
@@ -461,6 +467,11 @@ def report_multiqc_outliers(outdir, multiqc_data, log_path):
                 sample_values[metric][sample] = stats[metric]
 
     # Calculate statistics and check for outliers
+    thresholds = [
+        {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"},     # Check severe outliers first
+        {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}   # Then check minor outliers
+    ]
+
     for metric in metrics:
         if not metric_values[metric]:
             continue
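The hunk introduces the thresholds list, but the loop that applies it falls outside this diff. A minimal sketch of the usual interpretation, assuming a value is flagged when it sits more than `stdev_threshold` standard deviations from the metric's median (the helper name is hypothetical):

```python
import statistics

def classify_outlier(value, values, thresholds):
    """Return the flag code for the first (most severe) rule a value trips, else None.
    Sketch only: assumes rules are ordered most to least severe, as above."""
    middle = statistics.median(values)
    spread = statistics.stdev(values) if len(values) > 1 else 0.0
    if spread == 0.0:
        return None
    for rule in thresholds:
        if abs(value - middle) / spread > rule["stdev_threshold"]:
            return rule["code"]
    return None
```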
@@ -623,6 +634,21 @@ def add_bowtie2_group_stats(outdir, multiqc_data, log_path):
             detail_str
         )
 
+    for metric, stats in stats_summary.items():
+        if metric == "overall_alignment_rate":
+            if stats['median'] < 50:
+                status = "RED"
+                flag_code = "50"
+            elif stats['median'] < 70:
+                status = "YELLOW"
+                flag_code = "30"
+            else:
+                status = "GREEN"
+                flag_code = "20"
+        else:
+            status = "GREEN"
+            flag_code = "20"
+
     return True
 
 def check_bowtie2_existence(outdir, samples, paired_end, log_path):
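The new classification loop grades the dataset-level median overall alignment rate (below 50% is RED, below 70% is YELLOW, otherwise GREEN); how the resulting `status`/`flag_code` pair is recorded falls outside this hunk. A purely illustrative standalone equivalent:

```python
# Hypothetical helper mirroring the thresholds above, for illustration only.
def classify_alignment_rate(median_rate):
    if median_rate < 50:
        return "RED", "50"
    elif median_rate < 70:
        return "YELLOW", "30"
    return "GREEN", "20"

# e.g. classify_alignment_rate(65.4) -> ("YELLOW", "30")
```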
@@ -669,7 +695,7 @@ def check_bowtie2_existence(outdir, samples, paired_end, log_path):
         print(f"WARNING: Missing alignment files:")
         for file in missing_files:
             print(f"  - {file}")
-        log_check_result(log_path, "alignment", "all", "check_bowtie2_existence", "RED",
+        log_check_result(log_path, "alignment", "all", "check_bowtie2_existence", "HALT",
                          f"Missing {len(missing_files)} files", "; ".join(missing_files))
         return False
@@ -759,11 +785,33 @@ def validate_unmapped_fastq(outdir, samples, paired_end, log_path, max_lines=200
                          "; ".join(invalid_files))
         return False
 
+    if not all_files:
+        print("No unmapped FASTQ files found to validate")
+        log_check_result(log_path, "alignment", "all", "validate_unmapped_fastq", "HALT",
+                         "No unmapped FASTQ files found to validate", "")
+        return False
+
     print(f"All unmapped FASTQ files are valid")
     log_check_result(log_path, "alignment", "all", "validate_unmapped_fastq", "GREEN",
                      "All files valid", f"Validated {len(all_files)} unmapped FASTQ files; Checked for valid FASTQ format and complete records")
     return True
 
+def check_mapping_rates(outdir, star_data, log_path):
+    """Check if mapping rates meet expected thresholds."""
+    # ... [existing code] ...
+
+    # Define thresholds
+    thresholds = {
+        "total_mapped": [
+            {"code": "YELLOW", "type": "lower", "value": 70},
+            {"code": "RED", "type": "lower", "value": 50}
+        ],
+        "multi_mapped": [
+            {"code": "YELLOW", "type": "lower", "value": 30},
+            {"code": "RED", "type": "lower", "value": 15}
+        ]
+    }
+
 def main():
     """Main function to process runsheet and validate raw reads."""
     parser = argparse.ArgumentParser(description='Validate raw reads based on runsheet information.')
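The new `check_mapping_rates` stub only defines its thresholds here; its body is elided in the diff. A minimal sketch of how "lower" rules like these are commonly evaluated, where the most severe tripped code wins; this is an assumption about intent, not the script's actual logic, and the helper name is hypothetical:

```python
def evaluate_lower_thresholds(value, rules):
    """Return the most severe matching flag for a value, or 'GREEN' if no rule trips.
    Sketch only: a 'lower' rule trips when the value falls below rule['value']."""
    severity = {"GREEN": 0, "YELLOW": 1, "RED": 2, "HALT": 3}
    worst = "GREEN"
    for rule in rules:
        if rule["type"] == "lower" and value < rule["value"]:
            if severity[rule["code"]] > severity[worst]:
                worst = rule["code"]
    return worst

# e.g. a 60% total_mapped rate trips YELLOW (< 70) but not RED (< 50):
# evaluate_lower_thresholds(60, thresholds["total_mapped"]) -> "YELLOW"
```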