|
1 |
| -#!/usr/bin/env python3 |
| 1 | +#!/usr/bin/env python |
2 | 2 | """
|
3 | 3 | This script generates a protocol text file for GeneLab RNA-seq data processing.
|
4 | 4 | It reads software versions from a YAML file and incorporates other parameters.
|
|
9 | 9 | import os
|
10 | 10 | import sys
|
11 | 11 | from datetime import datetime
|
| 12 | +import pandas as pd |
| 13 | +import re |
12 | 14 |
|
13 | 15 | def parse_args():
|
14 | 16 | parser = argparse.ArgumentParser(description='Generate protocol file for GeneLab RNA-seq pipeline')
|
@@ -38,6 +40,8 @@ def parse_args():
|
38 | 40 | help='Path to the reference genome FASTA file')
|
39 | 41 | parser.add_argument('--reference_gtf', required=True,
|
40 | 42 | help='Path to the reference genome GTF file')
|
| 43 | + parser.add_argument('--runsheet', required=False, |
| 44 | + help='Path to the runsheet CSV file') |
41 | 45 | return parser.parse_args()
|
42 | 46 |
|
43 | 47 | def read_software_versions(yaml_file):
|
@@ -267,8 +271,54 @@ def generate_protocol_content(args, software_versions):
|
267 | 271 | # For standard workflow (include tximport)
|
268 | 272 | description += f"The runsheet was generated with dp_tools (version {dp_tools_version}) and the runsheet and quantification data were imported to R (version {r_version}) with tximport (version {tximport_version}) and normalized with DESeq2 (version {deseq2_version}) median of ratios method. "
|
269 | 273 |
|
| 274 | + # Parse runsheet for technical replicate handling |
| 275 | + tech_rep_sentence = "" |
| 276 | + if hasattr(args, 'runsheet') and args.runsheet and os.path.exists(args.runsheet): |
| 277 | + try: |
| 278 | + runsheet_df = pd.read_csv(args.runsheet) |
| 279 | + if 'Sample Name' in runsheet_df.columns: |
| 280 | + # Remove whitespace and NA |
| 281 | + sample_names = runsheet_df['Sample Name'].dropna().astype(str).tolist() |
| 282 | + # Remove trailing/leading whitespace |
| 283 | + sample_names = [s.strip() for s in sample_names] |
| 284 | + # Remove empty |
| 285 | + sample_names = [s for s in sample_names if s] |
| 286 | + # Find base names (remove _techrepN if present) |
| 287 | + base_names = [re.sub(r'_techrep\d+$', '', s) for s in sample_names] |
| 288 | + from collections import Counter |
| 289 | + base_counts = Counter(base_names) |
| 290 | + n_reps = list(base_counts.values()) |
| 291 | + unique_n = set(n_reps) |
| 292 | + if all(x == 1 for x in n_reps): |
| 293 | + # No technical replicates at all |
| 294 | + tech_rep_sentence = "" |
| 295 | + elif len(unique_n) == 1 and list(unique_n)[0] > 1: |
| 296 | + # All samples have the same number of tech reps |
| 297 | + tech_rep_sentence = ("Counts from all technical replicates for each sample were summed using DESeq2's collapseReplicates function. " |
| 298 | + "These collapsed counts were then used for count normalization and differential expression analysis. ") |
| 299 | + elif len(unique_n) > 1 and min(unique_n) > 1: |
| 300 | + # All samples have tech reps, but unequal number |
| 301 | + tech_rep_sentence = ("For each sample, counts from the first n technical replicates were summed using DESeq2's collapseReplicates function. " |
| 302 | + "These collapsed counts were then used for count normalization and differential expression analysis. ") |
| 303 | + else: |
| 304 | + # Some samples have tech reps, some don't |
| 305 | + tech_rep_sentence = ("For samples with technical replicates, only the first replicate was used for count normalization and differential expression analysis. ") |
| 306 | + except Exception as e: |
| 307 | + tech_rep_sentence = "" |
| 308 | + # If no runsheet, leave tech_rep_sentence as empty |
| 309 | + |
| 310 | + # Add ERCC normalization sentence if ERCC spike-ins were used |
| 311 | + if args.has_ercc.lower() == "true": |
| 312 | + description += ("The data were normalized twice, each time using a different size factor. " |
| 313 | + "The first used non-ERCC genes for size factor estimation, and the second used only ERCC group B genes to estimate the size factor. " |
| 314 | + "Both sets of normalized gene counts were subject to differential expression analysis. ") |
| 315 | + else: |
| 316 | + description += "Normalized gene counts were subject to differential expression analysis. " |
| 317 | + # Add tech rep sentence |
| 318 | + if tech_rep_sentence: |
| 319 | + description += tech_rep_sentence |
270 | 320 | # Add differential expression analysis sentence
|
271 |
| - description += f"Normalized gene counts were subject to differential expression analysis. Differential expression analysis was performed in R (version {r_version}) using DESeq2 (version {deseq2_version}); all groups were compared pairwise using the Wald test and the likelihood ratio test was used to generate the F statistic p-value. " |
| 321 | + description += f"Differential expression analysis was performed in R (version {r_version}) using DESeq2 (version {deseq2_version}); all groups were compared pairwise using the Wald test and the likelihood ratio test was used to generate the F statistic p-value. " |
272 | 322 |
|
273 | 323 | # Add gene annotations section
|
274 | 324 | # Define versions for annotation packages
|
@@ -321,15 +371,32 @@ def generate_protocol_content(args, software_versions):
|
321 | 371 | organism_formatted = args.organism.replace(' ', '_').replace('-', '_').lower()
|
322 | 372 |
|
323 | 373 | # Build gene annotations sentence
|
324 |
| - gene_annotations_text = f"Gene annotations were assigned using the custom annotation tables generated in-house as detailed in GL-DPPD-7110-A (https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md), with STRINGdb (version {stringdb_version}) and PANTHER.db (version {pantherdb_version})" |
325 |
| - |
326 |
| - # Add organism-specific annotation package if it's an officially supported organism |
| 374 | + annotation_sources = [] |
| 375 | + # Only include STRINGdb if not in organisms_without_string |
| 376 | + if organism_formatted not in organisms_without_string: |
| 377 | + annotation_sources.append(f"STRINGdb (version {stringdb_version})") |
| 378 | + # Only include PANTHER.db if not in organisms_without_panther |
| 379 | + if organism_formatted not in organisms_without_panther: |
| 380 | + annotation_sources.append(f"PANTHER.db (version {pantherdb_version})") |
| 381 | + # Custom annotation package |
| 382 | + custom_pkg = None |
| 383 | + if hasattr(args, 'organism') and args.organism and args.organism in organisms_with_custom_annotations: |
| 384 | + custom_pkg = "a custom annotation package generated in-house using AnnotationForge" |
| 385 | + annotation_sources.append(custom_pkg) |
| 386 | + # org.*.eg.db package |
327 | 387 | if organism_formatted and organism_formatted in organism_annotation_packages:
|
328 | 388 | package_name, package_version = organism_annotation_packages[organism_formatted]
|
329 |
| - gene_annotations_text += f", and {package_name} (version {package_version})" |
330 |
| - |
331 |
| - # Complete the gene annotations sentence |
332 |
| - gene_annotations_text += "." |
| 389 | + annotation_sources.append(f"{package_name} (version {package_version})") |
| 390 | + # Build the sentence |
| 391 | + base_text = ("Gene annotations were assigned using the custom annotation tables generated in-house as detailed in GL-DPPD-7110-A " |
| 392 | + "(https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md)") |
| 393 | + if annotation_sources: |
| 394 | + if len(annotation_sources) == 1: |
| 395 | + gene_annotations_text = f"{base_text}, with {annotation_sources[0]}." |
| 396 | + else: |
| 397 | + gene_annotations_text = f"{base_text}, with {', '.join(annotation_sources[:-1])}, and {annotation_sources[-1]}." |
| 398 | + else: |
| 399 | + gene_annotations_text = f"{base_text}." |
333 | 400 | description += gene_annotations_text
|
334 | 401 |
|
335 | 402 | # Add ERCC assessment sentence if ERCC spike-ins were used
|
|
0 commit comments