Allow integer sample names - finished

torres-alexis · torres-alexis · commit d178627f7639 · 2025-05-16T16:41:35.000-07:00
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories.py
@@ -24,13 +24,14 @@
 # For a given directory, sort all files into {sample: str, [files: str]}
 files_by_sample = dict()
 for sample in samples:
+    sample = str(sample)  # Coerce to str so integer sample names work in the glob pattern and destination path
     pattern = f"{sample}{args.glob_suffix}"
     print(f"Looking for files matching: {pattern}")
     files_for_this_sample = list(Path(args.from_dir).glob(pattern))
     
     # Move files
     for file in files_for_this_sample:
-        dest = Path(args.to_dir) / sample / file.name
+        dest = Path(args.to_dir) / str(sample) / file.name
         print(f"Moving {file} to {dest}")
         dest.parent.mkdir( parents=True, exist_ok=True )
         shutil.move(file, dest)
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_bowtie2_alignment.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_bowtie2_alignment.py
@@ -41,6 +41,10 @@ def parse_runsheet(runsheet_path):
         # Try to read the runsheet using pandas
         df = pd.read_csv(runsheet_path)
         
+        # Ensure sample names are strings
+        if 'Sample Name' in df.columns:
+            df['Sample Name'] = df['Sample Name'].astype(str)
+        
         # Check for required columns
         required_columns = ['Sample Name', 'paired_end', 'has_ERCC', 'organism']
         missing_columns = [col for col in required_columns if col not in df.columns]
@@ -321,7 +325,7 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
             # Check multiple indicators of paired data:
             # 1. Check unmapped files
             unmapped_dir = os.path.join(outdir, "02-Bowtie2_Alignment")
-            has_paired_files = any(os.path.exists(os.path.join(unmapped_dir, sample, f"{sample}_R2_unmapped.fastq.gz")) 
+            has_paired_files = any(os.path.exists(os.path.join(unmapped_dir, str(sample), f"{sample}_R2_unmapped.fastq.gz")) 
                                  for sample in samples)
             
             # 2. Check stats data for paired indicators
@@ -344,8 +348,10 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
             # Check for missing samples
             missing_samples = []
             for sample in samples:
-                if sample not in multiqc_samples:
-                    missing_samples.append(sample)
+                # Convert sample to string for consistent comparison
+                sample_str = str(sample)
+                if sample_str not in multiqc_samples:
+                    missing_samples.append(sample_str)
             
             if missing_samples:
                 log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED", 
@@ -665,6 +671,9 @@ def check_bowtie2_existence(outdir, samples, paired_end, log_path):
         missing_files.append(multiqc_file)
 
     for sample in samples:
+        # Ensure sample is a string
+        sample = str(sample)
+        
         # Check sample directory
         sample_dir = os.path.join(align_dir, sample)
         if not os.path.exists(sample_dir):
@@ -743,8 +752,8 @@ def main():
     # Parse the runsheet
     runsheet_df = parse_runsheet(args.runsheet)
     
-    # Extract sample names
-    sample_names = runsheet_df['Sample Name'].tolist()
+    # Extract sample names and convert to strings
+    sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
     
     # Check consistency of paired_end, has_ERCC, and organism values
     paired_end_values = runsheet_df['paired_end'].unique()
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_dge_deseq2.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_dge_deseq2.py
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_featurecounts.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_featurecounts.py
@@ -41,6 +41,10 @@ def parse_runsheet(runsheet_path):
     
     try:
         df = pd.read_csv(runsheet_path)
+        
+        # Convert Sample Name to string to ensure compatibility with numeric sample names
+        df['Sample Name'] = df['Sample Name'].astype(str)
+        
         required_columns = ['Sample Name', 'paired_end', 'has_ERCC', 'organism']
         for col in required_columns:
             if col not in df.columns:
@@ -179,6 +183,9 @@ def get_featurecounts_multiqc_stats(outdir, samples, log_path, assay_suffix="_GL
     
     print(f"Extracting stats from MultiQC data: {multiqc_zip}")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Create a temporary directory to extract files
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
@@ -214,6 +221,9 @@ def get_featurecounts_multiqc_stats(outdir, samples, log_path, assay_suffix="_GL
                 # Clean sample name (remove path if present)
                 sample_name = os.path.basename(sample) if '/' in sample else sample
                 
+                # Convert to string to ensure compatibility with numeric sample names
+                sample_name = str(sample_name)
+                
                 # Extract key metrics
                 fc_data[sample_name] = {
                     'total_count': count_data['Total'],
@@ -282,6 +292,9 @@ def parse_featurecounts(multiqc_data_dir, assay_suffix="_GLbulkRNAseq"):
             if "/" in sample:
                 sample_name = sample.split("/")[-1]
             
+            # Convert to string to ensure compatibility with numeric sample names
+            sample_name = str(sample_name)
+            
             fc_data[sample_name] = {
                 'total_count': count_data['Total'],
                 'num_assigned': count_data['Assigned'],
@@ -607,8 +620,8 @@ def main():
     # Parse the runsheet
     runsheet_df = parse_runsheet(args.runsheet)
     
-    # Extract sample names
-    sample_names = runsheet_df['Sample Name'].tolist()
+    # Extract sample names and convert to strings to handle numeric sample names
+    sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
     
     # Check consistency of paired_end, has_ERCC, and organism values
     paired_end_values = runsheet_df['paired_end'].unique()
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_raw_reads.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_raw_reads.py
@@ -361,6 +361,9 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
     
     print(f"Found MultiQC data zip: {multiqc_data_zip}")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Create a temporary directory to extract files
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
@@ -391,6 +394,8 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
                 
                 fastqc_sections = multiqc_data['report_data_sources']['FastQC']['all_sections']
                 for mqc_sample in fastqc_sections.keys():
+                    # Convert to string to ensure compatibility with numeric sample names
+                    mqc_sample = str(mqc_sample)
                     # For paired-end data, remove _R1 and _R2 suffixes
                     base_sample = mqc_sample.replace("_raw_fastqc", "").replace("_fastqc", "")
                     if is_paired:
@@ -439,6 +444,9 @@ def get_raw_multiqc_stats(outdir, samples, paired_end, log_path, assay_suffix="_
     
     print(f"Extracting stats from MultiQC data: {multiqc_data_zip}")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Create a temporary directory to extract files
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
@@ -516,6 +524,9 @@ def parse_fastqc(prefix, assay_suffix):
     # Group the samples by base name for paired end data
     sample_groups = {}
     for sample in j['report_general_stats_data'][-1].keys():
+        # Convert sample to string to handle numeric sample names
+        sample = str(sample)
+        
         # Handle various naming patterns
         if ' Read 1' in sample:
             base_name = sample.replace(' Read 1', '')
@@ -569,7 +580,7 @@ def parse_fastqc(prefix, assay_suffix):
     ]:
         if section in j['report_plot_data']:
             for data_item in j['report_plot_data'][section]['datasets'][0]['lines']:
-                sample = data_item['name']
+                sample = str(data_item['name'])  # Convert to string to handle numeric sample names
                 
                 # Determine if it's forward or reverse read
                 read_suffix = '_f'  # Default to forward
@@ -962,8 +973,8 @@ def main():
     # Parse the runsheet
     runsheet_df = parse_runsheet(args.runsheet)
     
-    # Extract sample names
-    sample_names = runsheet_df['Sample Name'].tolist()
+    # Extract sample names and convert to strings to handle numeric sample names
+    sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
     
     # Check consistency of paired_end, has_ERCC, and organism values
     paired_end_values = runsheet_df['paired_end'].unique()
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rsem_counts.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rsem_counts.py
@@ -143,6 +143,9 @@ def check_rsem_output_existence(outdir, samples, log_path, assay_suffix="_GLbulk
     """Check if all expected RSEM output files exist for each sample."""
     rsem_dir = os.path.join(outdir, '03-RSEM_Counts')
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Expected file patterns for each sample in sample-specific subdirectories
     expected_patterns = [
         "{sample}/{sample}.genes.results",
@@ -271,9 +274,11 @@ def parse_rsem(multiqc_data_dir, assay_suffix="_GLbulkRNAseq"):
         
         for sample, count_data in j['report_saved_raw_data']['multiqc_rsem'].items():
             sample_name = sample
-            # Clean up sample name if needed
+            # Clean up sample name if needed and ensure it's a string
             if "/" in sample:
-                sample_name = sample.split("/")[-1]
+                sample_name = str(sample.split("/")[-1])
+            else:
+                sample_name = str(sample)
             
             total_reads = count_data['Unique'] + count_data['Multi'] + count_data['Filtered'] + count_data['Unalignable']
             
@@ -314,6 +319,9 @@ def get_rsem_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
     
     print(f"Extracting RSEM stats from MultiQC data: {multiqc_zip}")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Create a temporary directory to extract files
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
@@ -547,6 +555,9 @@ def check_all_samples_in_multiqc(outdir, samples, log_path, assay_suffix="_GLbul
         )
         return False
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Create a temporary directory to extract files
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
@@ -829,8 +840,8 @@ def main():
     # Parse the runsheet
     runsheet_df = parse_runsheet(args.runsheet)
     
-    # Extract sample names
-    sample_names = runsheet_df['Sample Name'].tolist()
+    # Extract sample names and convert to strings to handle numeric sample names
+    sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
     
     # Extract paired_end status
     paired_end = runsheet_df['paired_end'].iloc[0] if len(runsheet_df) > 0 else False
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rseqc.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rseqc.py
@@ -71,8 +71,8 @@ def parse_runsheet(runsheet_path):
             print(f"Error: Runsheet missing required columns: {', '.join(missing_columns)}")
             sys.exit(1)
         
-        # Extract sample names
-        sample_names = df['Sample Name'].tolist()
+        # Extract sample names and convert to strings to handle numeric sample names
+        sample_names = [str(sample) for sample in df['Sample Name'].tolist()]
         
         # Extract metadata - store raw values, handle type conversion when using
         metadata = {
@@ -172,6 +172,9 @@ def check_gene_body_coverage_existence(outdir, samples, log_path):
     check_name = "check_gene_body_coverage_existence"
     print(f"Checking for gene body coverage files...")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Get the expected directory path
     rseqc_dir = os.path.join(outdir, "RSeQC_Analyses", "02_geneBody_coverage")
     
@@ -240,6 +243,9 @@ def check_infer_experiment_existence(outdir, samples, log_path):
     check_name = "check_infer_experiment_existence"
     print(f"Checking for infer experiment files...")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Get the expected directory path
     rseqc_dir = os.path.join(outdir, "RSeQC_Analyses", "03_infer_experiment")
     
@@ -300,6 +306,9 @@ def check_read_distribution_existence(outdir, samples, log_path):
     check_name = "check_read_distribution_files"
     print(f"Checking for read distribution files...")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Get the expected directory path
     rseqc_dir = os.path.join(outdir, "RSeQC_Analyses", "05_read_distribution")
     
@@ -358,6 +367,9 @@ def check_inner_distance_existence(outdir, samples, log_path):
     check_name = "check_inner_distance_files"
     print(f"Checking for inner distance files...")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Get the expected directory path
     rseqc_dir = os.path.join(outdir, "RSeQC_Analyses", "04_inner_distance")
     
@@ -428,6 +440,9 @@ def get_genebody_coverage_multiqc_stats(outdir, samples, log_path, assay_suffix=
     
     print(f"Extracting stats from Gene body coverage MultiQC data: {multiqc_zip}")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Create a temporary directory to extract files
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
@@ -707,6 +722,9 @@ def get_infer_experiment_multiqc_stats(outdir, samples, log_path, assay_suffix="
     
     print(f"Extracting stats from infer experiment MultiQC data: {multiqc_zip}")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Create a temporary directory to extract files
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
@@ -1039,6 +1057,9 @@ def get_inner_distance_multiqc_stats(outdir, samples, log_path, assay_suffix="_G
     check_name = "inner_distance_multiqc_stats"
     print(f"Extracting inner distance stats from MultiQC...")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Get the RSeQC inner distance directory
     inner_dist_dir = os.path.join(outdir, "RSeQC_Analyses", "04_inner_distance")
     
@@ -1094,8 +1115,8 @@ def get_inner_distance_multiqc_stats(outdir, samples, log_path, assay_suffix="_G
                     for dist_data in plot_data:
                         sample_name = dist_data['name']
                         
-                        # Remove any file extension or path from the sample name
-                        sample_name = os.path.basename(sample_name)
+                        # Remove any path from the sample name (extension is stripped below) and force the result to str
+                        sample_name = str(os.path.basename(sample_name))
                         if '.' in sample_name:
                             sample_name = sample_name.split('.')[0]
                         
@@ -1371,6 +1392,9 @@ def get_read_distribution_multiqc_stats(outdir, samples, log_path, assay_suffix=
     check_name = "read_distribution_multiqc_stats"
     print(f"Extracting read distribution stats from MultiQC...")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Get the RSeQC read distribution directory
     read_dist_dir = os.path.join(outdir, "RSeQC_Analyses", "05_read_distribution")
     
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_star_alignment.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_star_alignment.py
@@ -140,6 +140,9 @@ def check_star_output_existence(outdir, samples, paired_end, log_path, assay_suf
     """Check if all expected STAR alignment output files exist for each sample."""
     alignment_dir = os.path.join(outdir, '02-STAR_Alignment')
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Expected file patterns for each sample in sample-specific subdirectories
     expected_patterns = [
         "{sample}/{sample}_Aligned.sortedByCoord.out.bam",
@@ -242,6 +245,9 @@ def check_bam_file_integrity(outdir, samples, log_path):
     """Verify BAM file integrity using samtools quickcheck."""
     alignment_dir = os.path.join(outdir, '02-STAR_Alignment')
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # BAM file patterns to check for each sample
     bam_patterns = [
         "{sample}/{sample}_Aligned.toTranscriptome.out.bam",
@@ -328,6 +334,9 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
     
     print(f"Extracting STAR stats from MultiQC data: {multiqc_zip}")
     
+    # Convert all samples to strings for consistent comparison
+    samples = [str(sample) for sample in samples]
+    
     # Create a temporary directory to extract files
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
@@ -380,8 +389,8 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
                         # This is STAR data
                         star_metrics_found = True
                         
-                        # Clean sample name (remove path prefix if present)
-                        sample_name = os.path.basename(sample)
+                        # Clean sample name (remove path prefix if present), then force the result to str
+                        sample_name = str(os.path.basename(sample))
                         
                         # Extract key STAR metrics
                         star_data[sample_name] = {
@@ -705,8 +714,8 @@ def main():
     # Parse the runsheet
     runsheet_df = parse_runsheet(args.runsheet)
     
-    # Extract sample names
-    sample_names = runsheet_df['Sample Name'].tolist()
+    # Extract sample names and convert to strings to handle numeric sample names
+    sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
     
     # Extract paired_end status
     paired_end = runsheet_df['paired_end'].iloc[0] if len(runsheet_df) > 0 else False
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_trimmed_reads.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_trimmed_reads.py