Skip to content

Commit 66fa468

Browse files
committed
update microbes vv scripts to include flag code
1 parent e16bae0 commit 66fa468

File tree

6 files changed

+349
-258
lines changed

6 files changed

+349
-258
lines changed

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_bowtie2_alignment.py

Lines changed: 103 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -80,51 +80,45 @@ def initialize_vv_log(outdir):
8080
"""Initialize or append to the VV_log.csv file."""
8181
vv_log_path = os.path.join(outdir, "VV_log.csv")
8282

83-
# Check if file exists
8483
if not os.path.exists(vv_log_path):
85-
# Create new file with header
8684
with open(vv_log_path, 'w') as f:
87-
f.write("component,sample_id,check_name,status,message,details\n")
85+
f.write("component,sample_id,check_name,status,flag_code,message,details\n")
8886

8987
return vv_log_path
9088

9189
def log_check_result(log_path, component, sample_id, check_name, status, message="", details=""):
9290
"""Log check result to the VV_log.csv file."""
93-
# Properly escape and format fields for CSV
9491
def escape_field(field, is_details=False):
95-
# Convert to string if not already
96-
field = str(field)
97-
98-
# Replace newlines with semicolons to keep CSV valid
92+
if not isinstance(field, str):
93+
field = str(field)
9994
field = field.replace('\n', '; ')
100-
101-
# For details field, replace commas with semicolons to avoid CSV quoting
10295
if is_details and ',' in field:
10396
field = field.replace(', ', '; ')
104-
105-
# Also replace commas in message fields that contain percentages
106-
if '%' in field and ',' in field:
107-
field = field.replace(', ', '; ')
108-
109-
# If the field contains commas or quotes (but not just semicolons), wrap it in quotes
11097
if ',' in field or '"' in field:
111-
# Double any quotes within the field
11298
field = field.replace('"', '""')
113-
# Wrap in quotes
11499
field = f'"{field}"'
115100
return field
101+
102+
# Map status strings to flag codes
103+
flag_codes = {
104+
"GREEN": "20",
105+
"YELLOW": "30",
106+
"RED": "50",
107+
"HALT": "80"
108+
}
109+
110+
# Get flag code based on status
111+
flag_code = flag_codes.get(status, "80") # Default to HALT if unknown status
116112

117-
# Format all fields
118113
component = escape_field(component)
119114
sample_id = escape_field(sample_id)
120115
check_name = escape_field(check_name)
121116
status = escape_field(status)
122117
message = escape_field(message)
123-
details = escape_field(details, True) # Specify this is a details field
118+
details = escape_field(details, True)
124119

125-
# Write the formatted line
126120
with open(log_path, 'a') as f:
127-
f.write(f"{component},{sample_id},{check_name},{status},{message},{details}\n")
121+
f.write(f"{component},{sample_id},{check_name},{status},{flag_code},{message},{details}\n")
128122

129123
def check_gzip_integrity(outdir, samples, paired_end, log_path):
130124
"""Check GZIP integrity for all FASTQ files using gzip -t."""
@@ -286,76 +280,88 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
286280

287281
if not os.path.exists(multiqc_zip):
288282
print(f"WARNING: MultiQC data not found at: {multiqc_zip}")
289-
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
283+
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "HALT",
290284
"MultiQC data not found", multiqc_zip)
291285
return False
292286

293-
print(f"Checking samples in MultiQC data: {multiqc_zip}")
294-
295-
# Create a temporary directory to extract files
296-
with tempfile.TemporaryDirectory() as temp_dir:
297-
try:
298-
# Extract the zip file
287+
try:
288+
# Extract and check the data
289+
with tempfile.TemporaryDirectory() as temp_dir:
299290
with zipfile.ZipFile(multiqc_zip, 'r') as zip_ref:
300291
zip_ref.extractall(temp_dir)
301292

302-
# Path to the MultiQC data JSON file (new structure)
303293
json_path = os.path.join(temp_dir, f"align_multiqc{assay_suffix}_data", "multiqc_data.json")
304294

305295
if not os.path.exists(json_path):
306-
print(f"WARNING: No multiqc_data.json file found in the expected location")
307296
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
308297
"multiqc_data.json not found in zip", "")
309298
return False
310299

311-
# Parse the JSON file
312300
with open(json_path) as f:
313301
multiqc_data = json.load(f)
314302

315-
# Check for samples in report_data_sources
316-
if 'report_data_sources' not in multiqc_data:
317-
print(f"WARNING: No report_data_sources found in MultiQC data")
303+
# Look for Bowtie2 data in general stats
304+
if not multiqc_data.get('report_general_stats_data'):
318305
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
319-
"No report_data_sources in MultiQC data", "")
306+
"No general stats data in MultiQC data", "")
320307
return False
321308

322-
# Look for Bowtie2 section
323-
if 'Bowtie 2 / HiSAT2' not in multiqc_data['report_data_sources']:
324-
print(f"WARNING: No Bowtie2 data found in MultiQC data")
309+
# Get samples from general stats
310+
multiqc_samples = set()
311+
for stats_section in multiqc_data['report_general_stats_data']:
312+
for sample in stats_section.keys():
313+
# Clean sample name if needed
314+
base_sample = os.path.basename(sample)
315+
multiqc_samples.add(base_sample)
316+
317+
# Check for PE/SE mismatch
318+
is_paired_in_runsheet = paired_end[0]
319+
has_paired_data = False
320+
321+
# Check multiple indicators of paired data:
322+
# 1. Check unmapped files
323+
unmapped_dir = os.path.join(outdir, "02-Bowtie2_Alignment")
324+
has_paired_files = any(os.path.exists(os.path.join(unmapped_dir, sample, f"{sample}.unmapped.fastq.2.gz"))
325+
for sample in samples)
326+
327+
# 2. Check stats data for paired indicators
328+
for stats_section in multiqc_data['report_general_stats_data']:
329+
for sample_stats in stats_section.values():
330+
if any('paired' in key.lower() for key in sample_stats.keys()):
331+
has_paired_data = True
332+
break
333+
if has_paired_data:
334+
break
335+
336+
has_paired_data = has_paired_data or has_paired_files
337+
338+
if is_paired_in_runsheet and not has_paired_data:
325339
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
326-
"No Bowtie2 data in MultiQC data", "")
340+
"Paired-end/single-end mismatch",
341+
"Runsheet specifies paired-end but data appears to be single-end")
327342
return False
328343

329-
# Get samples from the Bowtie2 section
330-
bowtie_data = multiqc_data['report_data_sources']['Bowtie 2 / HiSAT2']['all_sections']
331-
multiqc_samples = set(bowtie_data.keys())
332-
333344
# Check for missing samples
334345
missing_samples = []
335346
for sample in samples:
336347
if sample not in multiqc_samples:
337348
missing_samples.append(sample)
338349

339350
if missing_samples:
340-
print(f"WARNING: The following samples are missing from the MultiQC data:")
341-
for sample in missing_samples:
342-
print(f" - {sample}")
343351
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
344352
f"Missing {len(missing_samples)} samples in MultiQC data",
345353
"; ".join(missing_samples))
346354
return False
347355

348-
print(f"All {len(samples)} samples found in MultiQC data")
349356
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "GREEN",
350357
f"All {len(samples)} samples found in data",
351358
f"Checked presence of {len(samples)} samples in Bowtie2 section of MultiQC data")
352359
return True
353360

354-
except Exception as e:
355-
print(f"Error checking MultiQC data: {str(e)}")
356-
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
357-
f"Error checking MultiQC data: {str(e)}", "")
358-
return False
361+
except Exception as e:
362+
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
363+
f"Error checking MultiQC data: {str(e)}", "")
364+
return False
359365

360366
def get_bowtie2_multiqc_stats(outdir, samples, paired_end, log_path, assay_suffix="_GLbulkRNAseq"):
361367
"""Get alignment statistics from MultiQC data."""
@@ -432,7 +438,7 @@ def get_bowtie2_multiqc_stats(outdir, samples, paired_end, log_path, assay_suffi
432438
return None
433439

434440
def report_multiqc_outliers(outdir, multiqc_data, log_path):
435-
"""Report statistical outliers in MultiQC statistics using standard deviations."""
441+
"""Identify and report outliers in MultiQC statistics."""
436442
metrics = ['total_reads', 'overall_alignment_rate', 'aligned_none', 'aligned_one', 'aligned_multi']
437443
outlier_samples = set()
438444

@@ -461,6 +467,11 @@ def report_multiqc_outliers(outdir, multiqc_data, log_path):
461467
sample_values[metric][sample] = stats[metric]
462468

463469
# Calculate statistics and check for outliers
470+
thresholds = [
471+
{"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, # Check severe outliers first
472+
{"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"} # Then check minor outliers
473+
]
474+
464475
for metric in metrics:
465476
if not metric_values[metric]:
466477
continue
@@ -623,6 +634,21 @@ def add_bowtie2_group_stats(outdir, multiqc_data, log_path):
623634
detail_str
624635
)
625636

637+
for metric, stats in stats_summary.items():
638+
if metric == "overall_alignment_rate":
639+
if stats['median'] < 50:
640+
status = "RED"
641+
flag_code = "50"
642+
elif stats['median'] < 70:
643+
status = "YELLOW"
644+
flag_code = "30"
645+
else:
646+
status = "GREEN"
647+
flag_code = "20"
648+
else:
649+
status = "GREEN"
650+
flag_code = "20"
651+
626652
return True
627653

628654
def check_bowtie2_existence(outdir, samples, paired_end, log_path):
@@ -669,7 +695,7 @@ def check_bowtie2_existence(outdir, samples, paired_end, log_path):
669695
print(f"WARNING: Missing alignment files:")
670696
for file in missing_files:
671697
print(f" - {file}")
672-
log_check_result(log_path, "alignment", "all", "check_bowtie2_existence", "RED",
698+
log_check_result(log_path, "alignment", "all", "check_bowtie2_existence", "HALT",
673699
f"Missing {len(missing_files)} files", "; ".join(missing_files))
674700
return False
675701

@@ -759,11 +785,33 @@ def validate_unmapped_fastq(outdir, samples, paired_end, log_path, max_lines=200
759785
"; ".join(invalid_files))
760786
return False
761787

788+
if not all_files:
789+
print("No unmapped FASTQ files found to validate")
790+
log_check_result(log_path, "alignment", "all", "validate_unmapped_fastq", "HALT",
791+
"No unmapped FASTQ files found to validate", "")
792+
return False
793+
762794
print(f"All unmapped FASTQ files are valid")
763795
log_check_result(log_path, "alignment", "all", "validate_unmapped_fastq", "GREEN",
764796
"All files valid", f"Validated {len(all_files)} unmapped FASTQ files; Checked for valid FASTQ format and complete records")
765797
return True
766798

799+
def check_mapping_rates(outdir, star_data, log_path):
800+
"""Check if mapping rates meet expected thresholds."""
801+
# ... [existing code] ...
802+
803+
# Define thresholds
804+
thresholds = {
805+
"total_mapped": [
806+
{"code": "YELLOW", "type": "lower", "value": 70},
807+
{"code": "RED", "type": "lower", "value": 50}
808+
],
809+
"multi_mapped": [
810+
{"code": "YELLOW", "type": "lower", "value": 30},
811+
{"code": "RED", "type": "lower", "value": 15}
812+
]
813+
}
814+
767815
def main():
768816
"""Main function to process runsheet and validate raw reads."""
769817
parser = argparse.ArgumentParser(description='Validate raw reads based on runsheet information.')

0 commit comments

Comments
 (0)