Skip to content

Commit 5b38f64

Browse files
committed
add vv updates
1 parent 66fa468 commit 5b38f64

File tree

8 files changed

+142
-53
lines changed

8 files changed

+142
-53
lines changed
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#! /usr/bin/env python
"""Concatenate V&V logs.

Every ``VV_in.csv*`` file in the current working directory is merged, in
sorted filename order, into ``VV_log_final_GLbulkRNAseq.csv``. The header row
is taken from the first log only.
"""

from pathlib import Path

INPUT_GLOB = "VV_in.csv*"
OUTPUT_FN = Path("VV_log_final_GLbulkRNAseq.csv")


def concatenate_logs(texts):
    """Merge CSV texts that share a header row, keeping the header once.

    Args:
        texts: Iterable of complete CSV file contents (strings). The first
            line of every text after the first is treated as a repeated
            header and dropped.

    Returns:
        The first text in full followed by the data rows of each later text,
        or an empty string when ``texts`` is empty.
    """
    pieces = []
    for index, text in enumerate(texts):
        # Only the first log contributes its header line.
        body = text if index == 0 else text.partition("\n")[2]
        # A log that lacks a trailing newline would otherwise glue its last
        # row onto the first row of the next log.
        if body and not body.endswith("\n"):
            body += "\n"
        pieces.append(body)
    return "".join(pieces)


def main():
    """Read the logs from the working directory and write the merged file."""
    logs = sorted(Path.cwd().glob(INPUT_GLOB))
    if not logs:
        # Fail with a clear message instead of a NameError further down.
        raise SystemExit(f"No {INPUT_GLOB} files found in {Path.cwd()}")
    OUTPUT_FN.write_text(concatenate_logs(log.read_text() for log in logs))


if __name__ == "__main__":
    main()
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#! /usr/bin/env python
"""Write a copy of the combined V&V log that keeps only flagged issues.

Reads ``VV_log_final<assay_suffix>.csv`` and writes
``VV_log_final_only_issues<assay_suffix>.csv`` without the GREEN
(flag code 20) rows.
"""
import argparse

import pandas as pd

# Flag code that marks a passing (GREEN) check.
GREEN_FLAG_CODE = 20


def filter_issues(df):
    """Return the rows of ``df`` whose ``flag_code`` is not GREEN (20).

    Args:
        df: DataFrame with a ``flag_code`` column holding integers, numeric
            strings, or a mix of both.

    Returns:
        The subset of ``df`` whose flag code is anything other than 20.
        Rows whose flag code cannot be parsed as a number are kept.

    Raises:
        KeyError: If ``df`` has no ``flag_code`` column.
    """
    # Coerce to numbers so "20", 20 and "20.0" are all recognised as GREEN;
    # unparseable values become NaN, which never equals 20 and is kept.
    codes = pd.to_numeric(df["flag_code"], errors="coerce")
    return df.loc[codes != GREEN_FLAG_CODE]


def main():
    """Parse the command line, filter the log and write the result."""
    # Set up command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--assay_suffix', type=str, required=True,
                        help='Suffix for input/output files (e.g. "_GLbulkRNAseq")')
    args = parser.parse_args()

    input_fn = f"VV_log_final{args.assay_suffix}.csv"
    output_fn = f"VV_log_final_only_issues{args.assay_suffix}.csv"

    df = pd.read_csv(input_fn, sep=",")
    filter_issues(df).to_csv(output_fn, sep=",", index=False)


if __name__ == "__main__":
    main()

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rsem_counts.py

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -86,43 +86,57 @@ def initialize_vv_log(outdir):
8686

8787
# Check if file exists
8888
if not os.path.exists(vv_log_path):
89-
# Create new file with header
89+
# Create new file with header - both status (color) and flag_code (number)
9090
with open(vv_log_path, 'w') as f:
91-
f.write("component,sample_id,check_name,status,message,details\n")
91+
f.write("component,sample_id,check_name,status,flag_code,message,details\n")
9292

9393
return vv_log_path
9494

9595

9696
def log_check_result(log_path, component, sample_id, check_name, status, message="", details=""):
    """Log check result to the VV_log.csv file.

    Appends one row laid out as
    ``component,sample_id,check_name,status,flag_code,message,details``.

    Args:
        log_path: Path of the CSV log file to append to.
        component: Name of the component the check belongs to.
        sample_id: Sample identifier the check applies to.
        check_name: Name of the check that produced this result.
        status: Status color ("GREEN", "YELLOW", "RED" or "HALT"). Any other
            value is written as given but gets the HALT flag code.
        message: Short description of the result.
        details: Extra detail; ", " separators are rewritten as "; ".

    Non-string values are converted with ``str``. Line breaks inside a field
    are replaced by "; " so that every result stays on a single row.
    """
    # Local import so the function does not rely on the file's import block.
    import csv

    # Map status (color) to flag_code (number)
    flag_codes = {
        "GREEN": "20",  # Using strings for consistency in CSV
        "YELLOW": "30",
        "RED": "50",
        "HALT": "80",
    }

    def clean_field(field, is_details=False):
        # Convert to string if not already
        text = field if isinstance(field, str) else str(field)

        # Collapse every kind of line break (\r\n, \r, \n) into "; " so a
        # stray carriage return cannot split the row.
        text = text.replace('\r\n', '\n').replace('\r', '\n').replace('\n', '; ')

        # For the details field, turn ", " separators into "; " so that
        # simple lists do not need CSV quoting.
        if is_details:
            text = text.replace(', ', '; ')
        return text

    # Default to HALT if unknown status
    flag_code = flag_codes.get(status, "80")

    row = [
        clean_field(component),
        clean_field(sample_id),
        clean_field(check_name),
        clean_field(status),  # The color (GREEN/YELLOW/RED/HALT)
        flag_code,
        clean_field(message),
        clean_field(details, True),
    ]

    # csv.writer quotes fields that contain commas or quotes and doubles any
    # embedded quotes, replacing the previous hand-written escaping.
    with open(log_path, 'a') as f:
        csv.writer(f, lineterminator='\n').writerow(row)
126140

127141

128142
def check_rsem_output_existence(outdir, samples, log_path, assay_suffix="_GLbulkRNAseq"):
@@ -179,7 +193,7 @@ def check_rsem_output_existence(outdir, samples, log_path, assay_suffix="_GLbulk
179193
"RSEM_counts",
180194
sample,
181195
"check_rsem_output_existence",
182-
"RED",
196+
"HALT",
183197
f"Missing {len(missing_files)} expected RSEM output files",
184198
",".join(missing_files)
185199
)
@@ -191,7 +205,7 @@ def check_rsem_output_existence(outdir, samples, log_path, assay_suffix="_GLbulk
191205
"RSEM_counts",
192206
"all",
193207
"check_rsem_output_existence",
194-
"RED",
208+
"HALT",
195209
f"Missing {len(missing_dataset_files)} expected dataset-level RSEM output files",
196210
",".join(missing_dataset_files)
197211
)
@@ -266,7 +280,7 @@ def get_rsem_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
266280
"RSEM_counts",
267281
"all",
268282
"get_rsem_multiqc_stats",
269-
"RED",
283+
"HALT",
270284
"RSEM MultiQC data not found",
271285
f"Expected at {multiqc_zip}"
272286
)
@@ -291,7 +305,7 @@ def get_rsem_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
291305
"RSEM_counts",
292306
"all",
293307
"get_rsem_multiqc_stats",
294-
"RED",
308+
"HALT",
295309
"multiqc_data.json not found in MultiQC zip",
296310
f"Expected at {json_path}"
297311
)
@@ -500,7 +514,7 @@ def check_all_samples_in_multiqc(outdir, samples, log_path, assay_suffix="_GLbul
500514
"RSEM_counts",
501515
"all",
502516
"check_all_samples_in_multiqc",
503-
"RED",
517+
"HALT",
504518
"RSEM MultiQC data not found",
505519
f"Expected at {multiqc_zip}"
506520
)
@@ -526,7 +540,7 @@ def check_all_samples_in_multiqc(outdir, samples, log_path, assay_suffix="_GLbul
526540
"RSEM_counts",
527541
"all",
528542
"check_all_samples_in_multiqc",
529-
"RED",
543+
"HALT",
530544
"multiqc_sources.txt not found in MultiQC zip",
531545
f"Expected at {sources_file}"
532546
)

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_star_alignment.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def initialize_vv_log(outdir):
8888
if not os.path.exists(vv_log_path):
8989
# Create new file with header
9090
with open(vv_log_path, 'w') as f:
91-
f.write("component,sample_id,check_name,status,message,details\n")
91+
f.write("component,sample_id,check_name,status,flag_code,message,details\n")
9292

9393
return vv_log_path
9494

@@ -114,6 +114,17 @@ def escape_field(field, is_details=False):
114114

115115
return field
116116

117+
# Map status strings to flag codes
118+
flag_codes = {
119+
"GREEN": "20", # Using strings for consistency in CSV
120+
"YELLOW": "30",
121+
"RED": "50",
122+
"HALT": "80"
123+
}
124+
125+
# Get numeric flag code based on status color
126+
flag_code = flag_codes.get(status, "80") # Default to HALT if unknown status
127+
117128
with open(log_path, 'a') as f:
118129
component = escape_field(component)
119130
sample_id = escape_field(sample_id)
@@ -122,7 +133,7 @@ def escape_field(field, is_details=False):
122133
message = escape_field(message)
123134
details = escape_field(details, is_details=True)
124135

125-
f.write(f"{component},{sample_id},{check_name},{status},{message},{details}\n")
136+
f.write(f"{component},{sample_id},{check_name},{status},{flag_code},{message},{details}\n")
126137

127138

128139
def check_star_output_existence(outdir, samples, paired_end, log_path, assay_suffix="_GLbulkRNAseq"):
@@ -190,7 +201,7 @@ def check_star_output_existence(outdir, samples, paired_end, log_path, assay_suf
190201
"STAR_alignment",
191202
sample,
192203
"check_star_output_existence",
193-
"RED",
204+
"HALT",
194205
f"Missing {len(missing_files)} expected STAR output files",
195206
",".join(missing_files)
196207
)
@@ -202,7 +213,7 @@ def check_star_output_existence(outdir, samples, paired_end, log_path, assay_suf
202213
"STAR_alignment",
203214
"all",
204215
"check_star_output_existence",
205-
"RED",
216+
"HALT",
206217
f"Missing {len(missing_dataset_files)} expected dataset-level STAR output files",
207218
",".join(missing_dataset_files)
208219
)
@@ -272,7 +283,7 @@ def check_bam_file_integrity(outdir, samples, log_path):
272283
"STAR_alignment",
273284
sample,
274285
"check_bam_file_integrity",
275-
"RED",
286+
"HALT",
276287
f"{len(failed_bams)} BAM files failed integrity check",
277288
",".join(failed_bams)
278289
)
@@ -304,7 +315,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
304315
"STAR_alignment",
305316
"all",
306317
"get_star_multiqc_stats",
307-
"RED",
318+
"HALT",
308319
"STAR MultiQC data not found",
309320
f"Expected at {multiqc_zip}"
310321
)
@@ -329,7 +340,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
329340
"STAR_alignment",
330341
"all",
331342
"get_star_multiqc_stats",
332-
"RED",
343+
"HALT",
333344
"multiqc_data.json not found in MultiQC zip",
334345
f"Expected at {json_path}"
335346
)
@@ -350,7 +361,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
350361
"STAR_alignment",
351362
"all",
352363
"get_star_multiqc_stats",
353-
"RED",
364+
"HALT",
354365
"No general stats data in MultiQC data",
355366
""
356367
)
@@ -383,7 +394,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
383394
"STAR_alignment",
384395
"all",
385396
"get_star_multiqc_stats",
386-
"RED",
397+
"HALT",
387398
"No STAR metrics found in MultiQC data",
388399
""
389400
)
@@ -432,7 +443,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
432443
"STAR_alignment",
433444
"all",
434445
"get_star_multiqc_stats",
435-
"RED",
446+
"HALT",
436447
f"Error extracting STAR stats: {str(e)}",
437448
""
438449
)
@@ -448,7 +459,7 @@ def report_star_alignment_outliers(outdir, star_data, log_path):
448459
"STAR_alignment",
449460
"all",
450461
"report_star_alignment_outliers",
451-
"RED",
462+
"HALT",
452463
"No STAR data to analyze for outliers",
453464
""
454465
)
@@ -566,7 +577,7 @@ def check_mapping_rates(outdir, star_data, log_path):
566577
"STAR_alignment",
567578
"all",
568579
"check_mapping_rates",
569-
"RED",
580+
"HALT",
570581
"No STAR data to analyze mapping rates",
571582
""
572583
)

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_trimmed_reads.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,9 @@ def initialize_vv_log(outdir):
9393

9494
# Check if file exists
9595
if not os.path.exists(vv_log_path):
96-
# Create new file with header
96+
# Create new file with header - both status (color) and flag_code (number)
9797
with open(vv_log_path, 'w') as f:
98-
f.write("component,sample_id,check_name,status,message,details\n")
98+
f.write("component,sample_id,check_name,status,flag_code,message,details\n")
9999

100100
return vv_log_path
101101

@@ -104,7 +104,8 @@ def log_check_result(log_path, component, sample_id, check_name, status, message
104104
# Properly escape and format fields for CSV
105105
def escape_field(field, is_details=False):
106106
# Convert to string if not already
107-
field = str(field)
107+
if not isinstance(field, str):
108+
field = str(field)
108109

109110
# Replace newlines with semicolons to keep CSV valid
110111
field = field.replace('\n', '; ')
@@ -125,6 +126,17 @@ def escape_field(field, is_details=False):
125126
field = f'"{field}"'
126127
return field
127128

129+
# Map status (color) to flag_code (number)
130+
flag_codes = {
131+
"GREEN": "20", # Using strings for consistency in CSV
132+
"YELLOW": "30",
133+
"RED": "50",
134+
"HALT": "80"
135+
}
136+
137+
# Get numeric flag code based on status color
138+
flag_code = flag_codes.get(status, "80") # Default to HALT if unknown status
139+
128140
# Format all fields
129141
component = escape_field(component)
130142
sample_id = escape_field(sample_id)
@@ -135,7 +147,7 @@ def escape_field(field, is_details=False):
135147

136148
# Write the formatted line
137149
with open(log_path, 'a') as f:
138-
f.write(f"{component},{sample_id},{check_name},{status},{message},{details}\n")
150+
f.write(f"{component},{sample_id},{check_name},{status},{flag_code},{message},{details}\n")
139151

140152
def check_trimmed_fastq_existence(outdir, samples, paired_end, log_path):
141153
"""Check if the expected trimmed FASTQ files exist for each sample."""

0 commit comments

Comments
 (0)