nasa
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/concat_logs.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/concat_logs.py
Lines changed: 18 additions & 0 deletions
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/filter_to_only_issues.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/filter_to_only_issues.py
Lines changed: 19 additions & 0 deletions
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rsem_counts.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rsem_counts.py
Lines changed: 41 additions & 27 deletions
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_star_alignment.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_star_alignment.py
Lines changed: 23 additions & 12 deletions
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_trimmed_reads.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_trimmed_reads.py
Lines changed: 16 additions & 4 deletions
@@ -0,0 +1,18 @@
+#! /usr/bin/env python
+# Concatenate the VV_in.csv* logs in the working directory, keeping a single header
+from pathlib import Path
+
+OUTPUT_FN = Path("VV_log_final_GLbulkRNAseq.csv")
+
+logs = sorted(Path.cwd().glob("VV_in.csv*"))  # sorted for a reproducible row order
+if not logs:  # fail with a clear message instead of a NameError at write time
+    raise SystemExit("No VV_in.csv* logs found to concatenate")
+merged = []
+for i, log in enumerate(logs):
+    with open(log, "r") as in_f:
+        # Re-terminate an unterminated last row so it cannot fuse with the next log
+        lines = [ln if ln.endswith("\n") else ln + "\n" for ln in in_f]
+    merged.extend(lines if i == 0 else lines[1:])  # header from the first log only
+
+with open(OUTPUT_FN, "w") as out_f:
+    out_f.writelines(merged)
@@ -0,0 +1,19 @@
+#! /usr/bin/env python
+# Write a copy of the final V&V log without its GREEN (flag_code 20) rows
+import pandas as pd
+import argparse
+
+# The assay suffix selects both the input log and the output file name
+cli = argparse.ArgumentParser()
+cli.add_argument('--assay_suffix', type=str, required=True,
+                 help='Suffix for input/output files (e.g. "_GLbulkRNAseq")')
+args = cli.parse_args()
+
+INPUT_FN = f"VV_log_final{args.assay_suffix}.csv"
+OUTPUT_FN = f"VV_log_final_only_issues{args.assay_suffix}.csv"
+
+log_df = pd.read_csv(INPUT_FN, sep=",")
+# flag_code may have been parsed as text or as a number, so test both forms
+codes = log_df["flag_code"]
+is_green = (codes == "20") | (codes == 20)
+log_df.loc[~is_green].to_csv(OUTPUT_FN, sep=",", index=False)
@@ -86,43 +86,57 @@ def initialize_vv_log(outdir):
 
     # Check if file exists
     if not os.path.exists(vv_log_path):
-        # Create new file with header
+        # Create new file with header - both status (color) and flag_code (number)
         with open(vv_log_path, 'w') as f:
-            f.write("component,sample_id,check_name,status,message,details\n")
+            f.write("component,sample_id,check_name,status,flag_code,message,details\n")
 
     return vv_log_path
 
 
 def log_check_result(log_path, component, sample_id, check_name, status, message="", details=""):
     """Log check result to the VV_log.csv file."""
-    
     def escape_field(field, is_details=False):
         # Convert to string if not already
         if not isinstance(field, str):
             field = str(field)
 
-        # Replace commas with semicolons to avoid CSV quoting
-        field = field.replace(',', ';')
-        
-        # Remove any quotes to prevent issues with CSV format
-        field = field.replace('"', '')
-        field = field.replace("'", "")
+        # Replace newlines with semicolons to keep CSV valid
+        field = field.replace('\n', '; ')
 
-        # If it's the details field, truncate to 1000 chars if too long
-        if is_details and len(field) > 1000:
-            field = field[:997] + "..."
+        # For details field, replace ", " separators with "; " to reduce CSV quoting (any remaining bare commas are quoted below)
+        if is_details and ',' in field:
+            field = field.replace(', ', '; ')
 
+        # If the field contains commas or quotes (but not just semicolons), wrap it in quotes
+        if ',' in field or '"' in field:
+            # Double any quotes within the field
+            field = field.replace('"', '""')
+            # Wrap in quotes
+            field = f'"{field}"'
         return field
-    
+
+    # Map status (color) to flag_code (number)
+    flag_codes = {
+        "GREEN": "20",   # Using strings for consistency in CSV
+        "YELLOW": "30",
+        "RED": "50",
+        "HALT": "80"
+    }
+
+    # Get numeric flag code based on status color
+    flag_code = flag_codes.get(status, "80")  # Default to HALT if unknown status
+    
+    # Format all fields
+    component = escape_field(component)
+    sample_id = escape_field(sample_id)
+    check_name = escape_field(check_name)
+    status = escape_field(status)  # The color (GREEN/YELLOW/RED/HALT)
+    message = escape_field(message)
+    details = escape_field(details, True)
+    
+    # Write both status (color) and flag_code (number)
     with open(log_path, 'a') as f:
-        component = escape_field(component)
-        sample_id = escape_field(sample_id)
-        check_name = escape_field(check_name)
-        status = escape_field(status)
-        message = escape_field(message)
-        details = escape_field(details, is_details=True)
-        
-        f.write(f"{component},{sample_id},{check_name},{status},{message},{details}\n")
+        f.write(f"{component},{sample_id},{check_name},{status},{flag_code},{message},{details}\n")
 
 
 def check_rsem_output_existence(outdir, samples, log_path, assay_suffix="_GLbulkRNAseq"):
@@ -179,7 +193,7 @@ def check_rsem_output_existence(outdir, samples, log_path, assay_suffix="_GLbulk
                 "RSEM_counts", 
                 sample, 
                 "check_rsem_output_existence", 
-                "RED", 
+                "HALT", 
                 f"Missing {len(missing_files)} expected RSEM output files", 
                 ",".join(missing_files)
             )
@@ -191,7 +205,7 @@ def check_rsem_output_existence(outdir, samples, log_path, assay_suffix="_GLbulk
                 "RSEM_counts", 
                 "all", 
                 "check_rsem_output_existence", 
-                "RED", 
+                "HALT", 
                 f"Missing {len(missing_dataset_files)} expected dataset-level RSEM output files", 
                 ",".join(missing_dataset_files)
             )
@@ -266,7 +280,7 @@ def get_rsem_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
             "RSEM_counts", 
             "all", 
             "get_rsem_multiqc_stats", 
-            "RED", 
+            "HALT", 
             "RSEM MultiQC data not found", 
             f"Expected at {multiqc_zip}"
         )
@@ -291,7 +305,7 @@ def get_rsem_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
                     "RSEM_counts", 
                     "all", 
                     "get_rsem_multiqc_stats", 
-                    "RED", 
+                    "HALT", 
                     "multiqc_data.json not found in MultiQC zip", 
                     f"Expected at {json_path}"
                 )
@@ -500,7 +514,7 @@ def check_all_samples_in_multiqc(outdir, samples, log_path, assay_suffix="_GLbul
             "RSEM_counts", 
             "all", 
             "check_all_samples_in_multiqc", 
-            "RED", 
+            "HALT", 
             "RSEM MultiQC data not found", 
             f"Expected at {multiqc_zip}"
         )
@@ -526,7 +540,7 @@ def check_all_samples_in_multiqc(outdir, samples, log_path, assay_suffix="_GLbul
                     "RSEM_counts", 
                     "all", 
                     "check_all_samples_in_multiqc", 
-                    "RED", 
+                    "HALT", 
                     "multiqc_sources.txt not found in MultiQC zip", 
                     f"Expected at {sources_file}"
                 )
 
@@ -88,7 +88,7 @@ def initialize_vv_log(outdir):
     if not os.path.exists(vv_log_path):
         # Create new file with header
         with open(vv_log_path, 'w') as f:
-            f.write("component,sample_id,check_name,status,message,details\n")
+            f.write("component,sample_id,check_name,status,flag_code,message,details\n")
 
     return vv_log_path
 
@@ -114,6 +114,17 @@ def escape_field(field, is_details=False):
 
         return field
 
+    # Map status strings to flag codes
+    flag_codes = {
+        "GREEN": "20",   # Using strings for consistency in CSV
+        "YELLOW": "30",
+        "RED": "50",
+        "HALT": "80"
+    }
+
+    # Get numeric flag code based on status color
+    flag_code = flag_codes.get(status, "80")  # Default to HALT if unknown status
+    
     with open(log_path, 'a') as f:
         component = escape_field(component)
         sample_id = escape_field(sample_id)
@@ -122,7 +133,7 @@ def escape_field(field, is_details=False):
         message = escape_field(message)
         details = escape_field(details, is_details=True)
 
-        f.write(f"{component},{sample_id},{check_name},{status},{message},{details}\n")
+        f.write(f"{component},{sample_id},{check_name},{status},{flag_code},{message},{details}\n")
 
 
 def check_star_output_existence(outdir, samples, paired_end, log_path, assay_suffix="_GLbulkRNAseq"):
@@ -190,7 +201,7 @@ def check_star_output_existence(outdir, samples, paired_end, log_path, assay_suf
                 "STAR_alignment", 
                 sample, 
                 "check_star_output_existence", 
-                "RED", 
+                "HALT", 
                 f"Missing {len(missing_files)} expected STAR output files", 
                 ",".join(missing_files)
             )
@@ -202,7 +213,7 @@ def check_star_output_existence(outdir, samples, paired_end, log_path, assay_suf
                 "STAR_alignment", 
                 "all", 
                 "check_star_output_existence", 
-                "RED", 
+                "HALT", 
                 f"Missing {len(missing_dataset_files)} expected dataset-level STAR output files", 
                 ",".join(missing_dataset_files)
             )
@@ -272,7 +283,7 @@ def check_bam_file_integrity(outdir, samples, log_path):
                 "STAR_alignment", 
                 sample, 
                 "check_bam_file_integrity", 
-                "RED", 
+                "HALT", 
                 f"{len(failed_bams)} BAM files failed integrity check", 
                 ",".join(failed_bams)
             )
@@ -304,7 +315,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
             "STAR_alignment", 
             "all", 
             "get_star_multiqc_stats", 
-            "RED", 
+            "HALT", 
             "STAR MultiQC data not found", 
             f"Expected at {multiqc_zip}"
         )
@@ -329,7 +340,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
                     "STAR_alignment", 
                     "all", 
                     "get_star_multiqc_stats", 
-                    "RED", 
+                    "HALT", 
                     "multiqc_data.json not found in MultiQC zip", 
                     f"Expected at {json_path}"
                 )
@@ -350,7 +361,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
                     "STAR_alignment", 
                     "all", 
                     "get_star_multiqc_stats", 
-                    "RED", 
+                    "HALT", 
                     "No general stats data in MultiQC data", 
                     ""
                 )
@@ -383,7 +394,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
                     "STAR_alignment", 
                     "all", 
                     "get_star_multiqc_stats", 
-                    "RED", 
+                    "HALT", 
                     "No STAR metrics found in MultiQC data", 
                     ""
                 )
@@ -432,7 +443,7 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
                 "STAR_alignment", 
                 "all", 
                 "get_star_multiqc_stats", 
-                "RED", 
+                "HALT", 
                 f"Error extracting STAR stats: {str(e)}", 
                 ""
             )
@@ -448,7 +459,7 @@ def report_star_alignment_outliers(outdir, star_data, log_path):
             "STAR_alignment", 
             "all", 
             "report_star_alignment_outliers", 
-            "RED", 
+            "HALT", 
             "No STAR data to analyze for outliers", 
             ""
         )
@@ -566,7 +577,7 @@ def check_mapping_rates(outdir, star_data, log_path):
             "STAR_alignment", 
             "all", 
             "check_mapping_rates", 
-            "RED", 
+            "HALT", 
             "No STAR data to analyze mapping rates", 
             ""
         )
 
@@ -93,9 +93,9 @@ def initialize_vv_log(outdir):
 
     # Check if file exists
     if not os.path.exists(vv_log_path):
-        # Create new file with header
+        # Create new file with header - both status (color) and flag_code (number)
         with open(vv_log_path, 'w') as f:
-            f.write("component,sample_id,check_name,status,message,details\n")
+            f.write("component,sample_id,check_name,status,flag_code,message,details\n")
 
     return vv_log_path
 
@@ -104,7 +104,8 @@ def log_check_result(log_path, component, sample_id, check_name, status, message
     # Properly escape and format fields for CSV
     def escape_field(field, is_details=False):
         # Convert to string if not already
-        field = str(field)
+        if not isinstance(field, str):
+            field = str(field)
 
         # Replace newlines with semicolons to keep CSV valid
         field = field.replace('\n', '; ')
@@ -125,6 +126,17 @@ def escape_field(field, is_details=False):
             field = f'"{field}"'
         return field
 
+    # Map status (color) to flag_code (number)
+    flag_codes = {
+        "GREEN": "20",   # Using strings for consistency in CSV
+        "YELLOW": "30",
+        "RED": "50",
+        "HALT": "80"
+    }
+
+    # Get numeric flag code based on status color
+    flag_code = flag_codes.get(status, "80")  # Default to HALT if unknown status
+    
     # Format all fields
     component = escape_field(component)
     sample_id = escape_field(sample_id)
@@ -135,7 +147,7 @@ def escape_field(field, is_details=False):
 
     # Write the formatted line
     with open(log_path, 'a') as f:
-        f.write(f"{component},{sample_id},{check_name},{status},{message},{details}\n")
+        f.write(f"{component},{sample_id},{check_name},{status},{flag_code},{message},{details}\n")
 
 def check_trimmed_fastq_existence(outdir, samples, paired_end, log_path):
     """Check if the expected trimmed FASTQ files exist for each sample."""