Skip to content

Commit d178627

Browse files
committed
Allow integer sample names - finished
1 parent 8a2a38a commit d178627

File tree

9 files changed: 290 additions and 70 deletions.

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,14 @@
2424
# For a given directory, sort all files into {sample: str, [files: str]}
2525
files_by_sample = dict()
2626
for sample in samples:
27+
sample = str(sample) # Coerce to str so integer sample names work in the glob pattern and in the destination path
2728
pattern = f"{sample}{args.glob_suffix}"
2829
print(f"Looking for files matching: {pattern}")
2930
files_for_this_sample = list(Path(args.from_dir).glob(pattern))
3031

3132
# Move files
3233
for file in files_for_this_sample:
33-
dest = Path(args.to_dir) / sample / file.name
34+
dest = Path(args.to_dir) / str(sample) / file.name
3435
print(f"Moving {file} to {dest}")
3536
dest.parent.mkdir( parents=True, exist_ok=True )
3637
shutil.move(file, dest)

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_bowtie2_alignment.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ def parse_runsheet(runsheet_path):
4141
# Try to read the runsheet using pandas
4242
df = pd.read_csv(runsheet_path)
4343

44+
# Ensure sample names are strings
45+
if 'Sample Name' in df.columns:
46+
df['Sample Name'] = df['Sample Name'].astype(str)
47+
4448
# Check for required columns
4549
required_columns = ['Sample Name', 'paired_end', 'has_ERCC', 'organism']
4650
missing_columns = [col for col in required_columns if col not in df.columns]
@@ -321,7 +325,7 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
321325
# Check multiple indicators of paired data:
322326
# 1. Check unmapped files
323327
unmapped_dir = os.path.join(outdir, "02-Bowtie2_Alignment")
324-
has_paired_files = any(os.path.exists(os.path.join(unmapped_dir, sample, f"{sample}_R2_unmapped.fastq.gz"))
328+
has_paired_files = any(os.path.exists(os.path.join(unmapped_dir, str(sample), f"{sample}_R2_unmapped.fastq.gz"))
325329
for sample in samples)
326330

327331
# 2. Check stats data for paired indicators
@@ -344,8 +348,10 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
344348
# Check for missing samples
345349
missing_samples = []
346350
for sample in samples:
347-
if sample not in multiqc_samples:
348-
missing_samples.append(sample)
351+
# Convert sample to string for consistent comparison
352+
sample_str = str(sample)
353+
if sample_str not in multiqc_samples:
354+
missing_samples.append(sample_str)
349355

350356
if missing_samples:
351357
log_check_result(log_path, "alignment", "all", "check_samples_multiqc", "RED",
@@ -665,6 +671,9 @@ def check_bowtie2_existence(outdir, samples, paired_end, log_path):
665671
missing_files.append(multiqc_file)
666672

667673
for sample in samples:
674+
# Ensure sample is a string
675+
sample = str(sample)
676+
668677
# Check sample directory
669678
sample_dir = os.path.join(align_dir, sample)
670679
if not os.path.exists(sample_dir):
@@ -743,8 +752,8 @@ def main():
743752
# Parse the runsheet
744753
runsheet_df = parse_runsheet(args.runsheet)
745754

746-
# Extract sample names
747-
sample_names = runsheet_df['Sample Name'].tolist()
755+
# Extract sample names and convert to strings
756+
sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
748757

749758
# Check consistency of paired_end, has_ERCC, and organism values
750759
paired_end_values = runsheet_df['paired_end'].unique()

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_dge_deseq2.py

Lines changed: 169 additions & 44 deletions
Large diffs are not rendered by default.

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_featurecounts.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ def parse_runsheet(runsheet_path):
4141

4242
try:
4343
df = pd.read_csv(runsheet_path)
44+
45+
# Convert Sample Name to string to ensure compatibility with numeric sample names
46+
df['Sample Name'] = df['Sample Name'].astype(str)
47+
4448
required_columns = ['Sample Name', 'paired_end', 'has_ERCC', 'organism']
4549
for col in required_columns:
4650
if col not in df.columns:
@@ -179,6 +183,9 @@ def get_featurecounts_multiqc_stats(outdir, samples, log_path, assay_suffix="_GL
179183

180184
print(f"Extracting stats from MultiQC data: {multiqc_zip}")
181185

186+
# Convert all samples to strings for consistent comparison
187+
samples = [str(sample) for sample in samples]
188+
182189
# Create a temporary directory to extract files
183190
with tempfile.TemporaryDirectory() as temp_dir:
184191
try:
@@ -214,6 +221,9 @@ def get_featurecounts_multiqc_stats(outdir, samples, log_path, assay_suffix="_GL
214221
# Clean sample name (remove path if present)
215222
sample_name = os.path.basename(sample) if '/' in sample else sample
216223

224+
# Convert to string to ensure compatibility with numeric sample names
225+
sample_name = str(sample_name)
226+
217227
# Extract key metrics
218228
fc_data[sample_name] = {
219229
'total_count': count_data['Total'],
@@ -282,6 +292,9 @@ def parse_featurecounts(multiqc_data_dir, assay_suffix="_GLbulkRNAseq"):
282292
if "/" in sample:
283293
sample_name = sample.split("/")[-1]
284294

295+
# Convert to string to ensure compatibility with numeric sample names
296+
sample_name = str(sample_name)
297+
285298
fc_data[sample_name] = {
286299
'total_count': count_data['Total'],
287300
'num_assigned': count_data['Assigned'],
@@ -607,8 +620,8 @@ def main():
607620
# Parse the runsheet
608621
runsheet_df = parse_runsheet(args.runsheet)
609622

610-
# Extract sample names
611-
sample_names = runsheet_df['Sample Name'].tolist()
623+
# Extract sample names and convert to strings to handle numeric sample names
624+
sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
612625

613626
# Check consistency of paired_end, has_ERCC, and organism values
614627
paired_end_values = runsheet_df['paired_end'].unique()

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_raw_reads.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,9 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
361361

362362
print(f"Found MultiQC data zip: {multiqc_data_zip}")
363363

364+
# Convert all samples to strings for consistent comparison
365+
samples = [str(sample) for sample in samples]
366+
364367
# Create a temporary directory to extract files
365368
with tempfile.TemporaryDirectory() as temp_dir:
366369
try:
@@ -391,6 +394,8 @@ def check_samples_multiqc(outdir, samples, paired_end, log_path, assay_suffix="_
391394

392395
fastqc_sections = multiqc_data['report_data_sources']['FastQC']['all_sections']
393396
for mqc_sample in fastqc_sections.keys():
397+
# Convert to string to ensure compatibility with numeric sample names
398+
mqc_sample = str(mqc_sample)
394399
# For paired-end data, remove _R1 and _R2 suffixes
395400
base_sample = mqc_sample.replace("_raw_fastqc", "").replace("_fastqc", "")
396401
if is_paired:
@@ -439,6 +444,9 @@ def get_raw_multiqc_stats(outdir, samples, paired_end, log_path, assay_suffix="_
439444

440445
print(f"Extracting stats from MultiQC data: {multiqc_data_zip}")
441446

447+
# Convert all samples to strings for consistent comparison
448+
samples = [str(sample) for sample in samples]
449+
442450
# Create a temporary directory to extract files
443451
with tempfile.TemporaryDirectory() as temp_dir:
444452
try:
@@ -516,6 +524,9 @@ def parse_fastqc(prefix, assay_suffix):
516524
# Group the samples by base name for paired end data
517525
sample_groups = {}
518526
for sample in j['report_general_stats_data'][-1].keys():
527+
# Convert sample to string to handle numeric sample names
528+
sample = str(sample)
529+
519530
# Handle various naming patterns
520531
if ' Read 1' in sample:
521532
base_name = sample.replace(' Read 1', '')
@@ -569,7 +580,7 @@ def parse_fastqc(prefix, assay_suffix):
569580
]:
570581
if section in j['report_plot_data']:
571582
for data_item in j['report_plot_data'][section]['datasets'][0]['lines']:
572-
sample = data_item['name']
583+
sample = str(data_item['name']) # Convert to string to handle numeric sample names
573584

574585
# Determine if it's forward or reverse read
575586
read_suffix = '_f' # Default to forward
@@ -962,8 +973,8 @@ def main():
962973
# Parse the runsheet
963974
runsheet_df = parse_runsheet(args.runsheet)
964975

965-
# Extract sample names
966-
sample_names = runsheet_df['Sample Name'].tolist()
976+
# Extract sample names and convert to strings to handle numeric sample names
977+
sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
967978

968979
# Check consistency of paired_end, has_ERCC, and organism values
969980
paired_end_values = runsheet_df['paired_end'].unique()

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rsem_counts.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ def check_rsem_output_existence(outdir, samples, log_path, assay_suffix="_GLbulk
143143
"""Check if all expected RSEM output files exist for each sample."""
144144
rsem_dir = os.path.join(outdir, '03-RSEM_Counts')
145145

146+
# Convert all samples to strings for consistent comparison
147+
samples = [str(sample) for sample in samples]
148+
146149
# Expected file patterns for each sample in sample-specific subdirectories
147150
expected_patterns = [
148151
"{sample}/{sample}.genes.results",
@@ -271,9 +274,11 @@ def parse_rsem(multiqc_data_dir, assay_suffix="_GLbulkRNAseq"):
271274

272275
for sample, count_data in j['report_saved_raw_data']['multiqc_rsem'].items():
273276
sample_name = sample
274-
# Clean up sample name if needed
277+
# Clean up sample name if needed and ensure it's a string
275278
if "/" in sample:
276-
sample_name = sample.split("/")[-1]
279+
sample_name = str(sample.split("/")[-1])
280+
else:
281+
sample_name = str(sample)
277282

278283
total_reads = count_data['Unique'] + count_data['Multi'] + count_data['Filtered'] + count_data['Unalignable']
279284

@@ -314,6 +319,9 @@ def get_rsem_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
314319

315320
print(f"Extracting RSEM stats from MultiQC data: {multiqc_zip}")
316321

322+
# Convert all samples to strings for consistent comparison
323+
samples = [str(sample) for sample in samples]
324+
317325
# Create a temporary directory to extract files
318326
with tempfile.TemporaryDirectory() as temp_dir:
319327
try:
@@ -547,6 +555,9 @@ def check_all_samples_in_multiqc(outdir, samples, log_path, assay_suffix="_GLbul
547555
)
548556
return False
549557

558+
# Convert all samples to strings for consistent comparison
559+
samples = [str(sample) for sample in samples]
560+
550561
# Create a temporary directory to extract files
551562
with tempfile.TemporaryDirectory() as temp_dir:
552563
try:
@@ -829,8 +840,8 @@ def main():
829840
# Parse the runsheet
830841
runsheet_df = parse_runsheet(args.runsheet)
831842

832-
# Extract sample names
833-
sample_names = runsheet_df['Sample Name'].tolist()
843+
# Extract sample names and convert to strings to handle numeric sample names
844+
sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
834845

835846
# Extract paired_end status
836847
paired_end = runsheet_df['paired_end'].iloc[0] if len(runsheet_df) > 0 else False

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_rseqc.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ def parse_runsheet(runsheet_path):
7171
print(f"Error: Runsheet missing required columns: {', '.join(missing_columns)}")
7272
sys.exit(1)
7373

74-
# Extract sample names
75-
sample_names = df['Sample Name'].tolist()
74+
# Extract sample names and convert to strings to handle numeric sample names
75+
sample_names = [str(sample) for sample in df['Sample Name'].tolist()]
7676

7777
# Extract metadata - store raw values, handle type conversion when using
7878
metadata = {
@@ -172,6 +172,9 @@ def check_gene_body_coverage_existence(outdir, samples, log_path):
172172
check_name = "check_gene_body_coverage_existence"
173173
print(f"Checking for gene body coverage files...")
174174

175+
# Convert all samples to strings for consistent comparison
176+
samples = [str(sample) for sample in samples]
177+
175178
# Get the expected directory path
176179
rseqc_dir = os.path.join(outdir, "RSeQC_Analyses", "02_geneBody_coverage")
177180

@@ -240,6 +243,9 @@ def check_infer_experiment_existence(outdir, samples, log_path):
240243
check_name = "check_infer_experiment_existence"
241244
print(f"Checking for infer experiment files...")
242245

246+
# Convert all samples to strings for consistent comparison
247+
samples = [str(sample) for sample in samples]
248+
243249
# Get the expected directory path
244250
rseqc_dir = os.path.join(outdir, "RSeQC_Analyses", "03_infer_experiment")
245251

@@ -300,6 +306,9 @@ def check_read_distribution_existence(outdir, samples, log_path):
300306
check_name = "check_read_distribution_files"
301307
print(f"Checking for read distribution files...")
302308

309+
# Convert all samples to strings for consistent comparison
310+
samples = [str(sample) for sample in samples]
311+
303312
# Get the expected directory path
304313
rseqc_dir = os.path.join(outdir, "RSeQC_Analyses", "05_read_distribution")
305314

@@ -358,6 +367,9 @@ def check_inner_distance_existence(outdir, samples, log_path):
358367
check_name = "check_inner_distance_files"
359368
print(f"Checking for inner distance files...")
360369

370+
# Convert all samples to strings for consistent comparison
371+
samples = [str(sample) for sample in samples]
372+
361373
# Get the expected directory path
362374
rseqc_dir = os.path.join(outdir, "RSeQC_Analyses", "04_inner_distance")
363375

@@ -428,6 +440,9 @@ def get_genebody_coverage_multiqc_stats(outdir, samples, log_path, assay_suffix=
428440

429441
print(f"Extracting stats from Gene body coverage MultiQC data: {multiqc_zip}")
430442

443+
# Convert all samples to strings for consistent comparison
444+
samples = [str(sample) for sample in samples]
445+
431446
# Create a temporary directory to extract files
432447
with tempfile.TemporaryDirectory() as temp_dir:
433448
try:
@@ -707,6 +722,9 @@ def get_infer_experiment_multiqc_stats(outdir, samples, log_path, assay_suffix="
707722

708723
print(f"Extracting stats from infer experiment MultiQC data: {multiqc_zip}")
709724

725+
# Convert all samples to strings for consistent comparison
726+
samples = [str(sample) for sample in samples]
727+
710728
# Create a temporary directory to extract files
711729
with tempfile.TemporaryDirectory() as temp_dir:
712730
try:
@@ -1039,6 +1057,9 @@ def get_inner_distance_multiqc_stats(outdir, samples, log_path, assay_suffix="_G
10391057
check_name = "inner_distance_multiqc_stats"
10401058
print(f"Extracting inner distance stats from MultiQC...")
10411059

1060+
# Convert all samples to strings for consistent comparison
1061+
samples = [str(sample) for sample in samples]
1062+
10421063
# Get the RSeQC inner distance directory
10431064
inner_dist_dir = os.path.join(outdir, "RSeQC_Analyses", "04_inner_distance")
10441065

@@ -1094,8 +1115,8 @@ def get_inner_distance_multiqc_stats(outdir, samples, log_path, assay_suffix="_G
10941115
for dist_data in plot_data:
10951116
sample_name = dist_data['name']
10961117

1097-
# Remove any file extension or path from the sample name
1098-
sample_name = os.path.basename(sample_name)
1118+
# Strip any leading path from the sample name and coerce the result to str (numeric sample names); the file extension is removed just below
1119+
sample_name = str(os.path.basename(sample_name))
10991120
if '.' in sample_name:
11001121
sample_name = sample_name.split('.')[0]
11011122

@@ -1371,6 +1392,9 @@ def get_read_distribution_multiqc_stats(outdir, samples, log_path, assay_suffix=
13711392
check_name = "read_distribution_multiqc_stats"
13721393
print(f"Extracting read distribution stats from MultiQC...")
13731394

1395+
# Convert all samples to strings for consistent comparison
1396+
samples = [str(sample) for sample in samples]
1397+
13741398
# Get the RSeQC read distribution directory
13751399
read_dist_dir = os.path.join(outdir, "RSeQC_Analyses", "05_read_distribution")
13761400

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/vv_star_alignment.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ def check_star_output_existence(outdir, samples, paired_end, log_path, assay_suf
140140
"""Check if all expected STAR alignment output files exist for each sample."""
141141
alignment_dir = os.path.join(outdir, '02-STAR_Alignment')
142142

143+
# Convert all samples to strings for consistent comparison
144+
samples = [str(sample) for sample in samples]
145+
143146
# Expected file patterns for each sample in sample-specific subdirectories
144147
expected_patterns = [
145148
"{sample}/{sample}_Aligned.sortedByCoord.out.bam",
@@ -242,6 +245,9 @@ def check_bam_file_integrity(outdir, samples, log_path):
242245
"""Verify BAM file integrity using samtools quickcheck."""
243246
alignment_dir = os.path.join(outdir, '02-STAR_Alignment')
244247

248+
# Convert all samples to strings for consistent comparison
249+
samples = [str(sample) for sample in samples]
250+
245251
# BAM file patterns to check for each sample
246252
bam_patterns = [
247253
"{sample}/{sample}_Aligned.toTranscriptome.out.bam",
@@ -328,6 +334,9 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
328334

329335
print(f"Extracting STAR stats from MultiQC data: {multiqc_zip}")
330336

337+
# Convert all samples to strings for consistent comparison
338+
samples = [str(sample) for sample in samples]
339+
331340
# Create a temporary directory to extract files
332341
with tempfile.TemporaryDirectory() as temp_dir:
333342
try:
@@ -380,8 +389,8 @@ def get_star_multiqc_stats(outdir, samples, log_path, assay_suffix="_GLbulkRNAse
380389
# This is STAR data
381390
star_metrics_found = True
382391

383-
# Clean sample name (remove path prefix if present)
384-
sample_name = os.path.basename(sample)
392+
# Clean sample name (remove path prefix if present) and coerce it to str for compatibility with numeric sample names
393+
sample_name = str(os.path.basename(sample))
385394

386395
# Extract key STAR metrics
387396
star_data[sample_name] = {
@@ -705,8 +714,8 @@ def main():
705714
# Parse the runsheet
706715
runsheet_df = parse_runsheet(args.runsheet)
707716

708-
# Extract sample names
709-
sample_names = runsheet_df['Sample Name'].tolist()
717+
# Extract sample names and convert to strings to handle numeric sample names
718+
sample_names = [str(sample) for sample in runsheet_df['Sample Name'].tolist()]
710719

711720
# Extract paired_end status
712721
paired_end = runsheet_df['paired_end'].iloc[0] if len(runsheet_df) > 0 else False

0 commit comments

Comments
 (0)