make all resource files input to rules

sreichl · sreichl · commit 043630eaede8 · 2025-05-27T17:17:55.000+02:00
diff --git a/workflow/rules/processing.smk b/workflow/rules/processing.smk
@@ -7,6 +7,7 @@ rule align:
         output_bai =  os.path.join(result_path,"results","{sample}","mapped", "{sample}.bam.bai"),
         filtered_bam = os.path.join(result_path,"results","{sample}","mapped", "{sample}.filtered.bam"),
         filtered_bai = os.path.join(result_path,"results","{sample}","mapped", "{sample}.filtered.bam.bai"),
+        bowtie2_index = config["bowtie2_index"],
         bowtie_log = os.path.join(result_path, 'results', "{sample}", 'mapped', '{sample}.txt'),
         bowtie_met = os.path.join(result_path, 'results', "{sample}", 'mapped', '{sample}.bowtie2.met'),
         fastp_html = os.path.join(result_path, 'results', "{sample}", 'mapped', '{sample}.fastp.html'),
@@ -16,6 +17,8 @@ rule align:
         samtools_log = os.path.join(result_path, 'results', "{sample}", 'mapped', '{sample}.samtools.log'),
         samtools_flagstat_log = os.path.join(result_path, 'results', "{sample}", 'mapped', '{sample}.samtools_flagstat.log'),
         stats = os.path.join(result_path, 'results', "{sample}", '{sample}.align.stats.tsv'),
+        adapter_fasta = config["adapter_fasta"] if config["adapter_fasta"]!="" else [],
+        whitelisted_regions = config["whitelisted_regions"],
     params:
         interleaved_in = lambda w: "--interleaved_in" if samples["{}".format(w.sample)]["read_type"] == "paired"  else " ",
         interleaved = lambda w: "--interleaved" if samples["{}".format(w.sample)]["read_type"] == "paired" else " ",
@@ -26,7 +29,6 @@ rule align:
         sequencing_platform = config["sequencing_platform"],
         sequencing_center = config["sequencing_center"],
         mitochondria_name = config["mitochondria_name"],
-        bowtie2_index = config["bowtie2_index"],
     resources:
         mem_mb=config.get("mem", "16000"),
     threads: 4*config.get("threads", 2)
@@ -43,7 +45,7 @@ rule align:
 
         for i in {input}; do samtools fastq $i 2>> "{output.samtools_log}" ; done | \
             fastp {params.adapter_sequence} {params.adapter_fasta} --stdin {params.interleaved_in} --stdout --html "{output.fastp_html}" --json "{output.fastp_json}" 2> "{output.fastp_log}" | \
-            bowtie2 $RG --very-sensitive --no-discordant -p {threads} --maxins 2000 -x {params.bowtie2_index} --met-file "{output.bowtie_met}" {params.interleaved} - 2> "{output.bowtie_log}" | \
+            bowtie2 $RG --very-sensitive --no-discordant -p {threads} --maxins 2000 -x {input.bowtie2_index} --met-file "{output.bowtie_met}" {params.interleaved} - 2> "{output.bowtie_log}" | \
             samblaster {params.add_mate_tags} 2> "{output.samblaster_log}" | \
             samtools sort -o "{output.bam}" - 2>> "{output.samtools_log}";
         
@@ -59,15 +61,15 @@ rule tss_coverage:
     input:
         bam = os.path.join(result_path,"results","{sample}","mapped","{sample}.filtered.bam"),
         bai = os.path.join(result_path,"results","{sample}","mapped","{sample}.filtered.bam.bai"),
+        chromosome_sizes = config["chromosome_sizes"],
+        unique_tss = config["unique_tss"],
     output:
         tss_hist = os.path.join(result_path,"results","{sample}","{sample}.tss_histogram.csv"),
     params:
         noise_upper = ( config["tss_slop"] * 2 ) - config["noise_lower"],
         double_slop = ( config["tss_slop"] * 2 ),
         genome_size = config["genome_size"],
         tss_slop = config["tss_slop"],
-        unique_tss = config["unique_tss"],
-        chromosome_sizes = config["chromosome_sizes"],
         noise_lower = config["noise_lower"],
     resources:
         mem_mb=config.get("mem", "16000"),
@@ -79,7 +81,7 @@ rule tss_coverage:
     shell:
         """
         echo "base,count" > {output.tss_hist};
-        bedtools slop -b {params.tss_slop} -i {params.unique_tss} -g {params.chromosome_sizes} | \
+        bedtools slop -b {params.tss_slop} -i {input.unique_tss} -g {input.chromosome_sizes} | \
             bedtools coverage -a - -b {input.bam} -d -sorted | \
             awk '{{if($6 == "+"){{ counts[$7] += $8;}} else counts[{params.double_slop} - $7 + 1] += $8;}} END {{ for(pos in counts) {{ if(pos < {params.noise_lower} || pos > {params.noise_upper}) {{ noise += counts[pos] }} }}; average_noise = noise /(2 * {params.noise_lower}); for(pos in counts) {{print pos-2000-1","(counts[pos]/average_noise) }} }}' | \
             sort -t "," -k1,1n >> {output.tss_hist} ;
@@ -91,6 +93,7 @@ rule peak_calling:
         bam = os.path.join(result_path,"results","{sample}","mapped", "{sample}.filtered.bam"),
         bai = os.path.join(result_path,"results","{sample}","mapped", "{sample}.filtered.bam.bai"),
         homer_script = os.path.join(HOMER_path,"configureHomer.pl"),
+        regulatory_regions = config["regulatory_regions"],
     output:
         peak_calls = os.path.join(result_path,"results","{sample}","peaks","{sample}_peaks.narrowPeak"),
         peak_annot = os.path.join(result_path,"results","{sample}","peaks","{sample}_peaks.narrowPeak.annotated.tsv"),
@@ -108,7 +111,6 @@ rule peak_calling:
         formating = lambda w: '--format BAMPE' if samples["{}".format(w.sample)]["read_type"] == "paired" else '--format BAM',
         genome_size = config["genome_size"],
         genome = config["genome"],
-        regulatory_regions = config["regulatory_regions"],
         keep_dup = config['macs2_keep_dup'],
     resources:
         mem_mb=config.get("mem", "16000"),
@@ -136,7 +138,7 @@ rule peak_calling:
         
         samtools view -c -L {output.peak_calls} {input.bam} | awk -v total=$TOTAL_READS '{{print "frip\t" $1/total}}' >> "{output.stats}";
 
-        samtools view -c -L {params.regulatory_regions} {input.bam} | awk -v total=$TOTAL_READS '{{print "regulatory_fraction\t" $1/total}}' >> "{output.stats}";
+        samtools view -c -L {input.regulatory_regions} {input.bam} | awk -v total=$TOTAL_READS '{{print "regulatory_fraction\t" $1/total}}' >> "{output.stats}";
         
         if [ ! -f {output.homer_knownResults} ]; then
             touch {output.homer_knownResults}
diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
@@ -19,10 +19,15 @@ rule sample_annotation:
 # generate promoter regions using (py)bedtools
 rule get_promoter_regions:
     input:
-        config["gencode_gtf"],
+        gencode_gtf = config["gencode_gtf"],
+        chromosome_sizes = config["chromosome_sizes"],
+        genome_fasta = config["genome_fasta"],
     output:
         promoter_regions = os.path.join(result_path,"counts","promoter_regions.bed"),
         promoter_annot = os.path.join(result_path,"counts","promoter_annotation.csv"),
+    params:
+        proximal_size_up = config["proximal_size_up"],
+        proximal_size_dn = config["proximal_size_dn"],
     resources:
         mem_mb = config.get("mem", "16000"),
     threads: config.get("threads", 2)
@@ -37,6 +42,10 @@ rule get_promoter_regions:
 rule get_consensus_regions:
     input:
         summits_bed = expand(os.path.join(result_path,"results","{sample}","peaks","{sample}_summits.bed"), sample=samples_quantify),
+        blacklisted_regions = config["blacklisted_regions"],
+        chromosome_sizes = config["chromosome_sizes"],
     output:
         consensus_regions = os.path.join(result_path,"counts","consensus_regions.bed"),
+    params:
+        slop_extension = config["slop_extension"],
     resources:
@@ -54,8 +63,11 @@ rule quantify_support_sample:
     input:
         consensus_regions = os.path.join(result_path,"counts","consensus_regions.bed"),
         peakfile = os.path.join(result_path,"results","{sample}","peaks", "{sample}_summits.bed"),
+        chromosome_sizes = config["chromosome_sizes"],
     output:
         quant_support = os.path.join(result_path,"results","{sample}","peaks", "{sample}_quantification_support_counts.csv"),
+    params:
+        slop_extension = config["slop_extension"],
     resources:
         mem_mb=config.get("mem", "16000"),
     threads: config.get("threads", 2)
@@ -71,6 +83,7 @@ rule quantify_counts_sample:
     input:
         regions = os.path.join(result_path,"counts","{kind}_regions.bed"),
         bamfile = os.path.join(result_path,"results","{sample}","mapped", "{sample}.filtered.bam"),
+        chromosome_sizes = config["chromosome_sizes"],
     output:
         quant_counts = os.path.join(result_path,"results","{sample}","mapped", "{sample}_quantification_{kind}_counts.csv"),
     resources:
diff --git a/workflow/rules/region_annotation.smk b/workflow/rules/region_annotation.smk
@@ -6,9 +6,16 @@ rule uropa_prepare:
         consensus_regions = os.path.join(result_path,"counts","consensus_regions.bed"),
         gencode_template = workflow.source_path(config["gencode_template"]),
         regulatory_template = workflow.source_path(config["regulatory_template"]),
+        gencode_gtf = config["gencode_gtf"],
+        regulatory_build_gtf = config["regulatory_build_gtf"],
     output:
         gencode_config = os.path.join(result_path,"tmp","consensus_regions_gencode.json"),
         reg_config = os.path.join(result_path,"tmp","consensus_regions_reg.json"),
+    params:
+        tss_size = config['tss_size'],
+        proximal_size_up = config["proximal_size_up"],
+        proximal_size_dn = config["proximal_size_dn"],
+        distal_size = config['distal_size'],
     resources:
         mem_mb=config.get("mem", "16000"),
     threads: config.get("threads", 2)
@@ -20,11 +27,11 @@ rule uropa_prepare:
             gencode_template=Template(f.read())
 
         gencode_config=gencode_template.substitute({
-                'TSS_flanking':config['tss_size'],
-                'TSS_proximal_upstream':config['proximal_size_up'],
-                'TSS_proximal_downstream':config['proximal_size_dn'],
-                'distal_distance':config['distal_size'],
-                'gtf_file':'"{}"'.format(config["gencode_gtf"]),
+                'TSS_flanking':params.tss_size,
+                'TSS_proximal_upstream':params.proximal_size_up,
+                'TSS_proximal_downstream':params.proximal_size_dn,
+                'distal_distance':params.distal_size,
+                'gtf_file':'"{}"'.format(input.gencode_gtf),
                 'bed_file':'"{}"'.format(input.consensus_regions)
             })
 
@@ -36,7 +43,7 @@ rule uropa_prepare:
             reg_template=Template(f.read())  
 
         reg_config=reg_template.substitute({
-            'gtf_file':'"{}"'.format(config["regulatory_build_gtf"]),
+            'gtf_file':'"{}"'.format(input.regulatory_build_gtf),
             'bed_file':'"{}"'.format(input.consensus_regions)
         })
 
@@ -116,10 +123,9 @@ rule homer_region_annotation:
 rule bedtools_annotation:
     input:
         consensus_regions = os.path.join(result_path,"counts","consensus_regions.bed"),
+        genome_fasta = config["genome_fasta"],
     output:
         bedtools_annotation = os.path.join(result_path, "tmp", "bedtools_annotation.bed"),
-    params:
-        genome_fasta = config["genome_fasta"],
     resources:
         mem_mb=config.get("mem", "16000"),
     threads: config.get("threads", 2)
@@ -129,7 +135,7 @@ rule bedtools_annotation:
         "logs/rules/bedtools_annotation.log"
     shell:
         """
-        bedtools nuc -fi {params.genome_fasta} -bed {input.consensus_regions} > {output.bedtools_annotation}
+        bedtools nuc -fi {input.genome_fasta} -bed {input.consensus_regions} > {output.bedtools_annotation}
         """
         
 # aggregate uropa and homer annotation results
diff --git a/workflow/scripts/get_consensus_regions.py b/workflow/scripts/get_consensus_regions.py
@@ -9,14 +9,14 @@
 
 # input
 peakfiles = snakemake.input["summits_bed"]
-blacklist_file = snakemake.config["blacklisted_regions"]
-chrom_file = snakemake.config["chromosome_sizes"]
+blacklist_file = snakemake.input["blacklisted_regions"]
+chrom_file = snakemake.input["chromosome_sizes"]
 
 # output
 consensus_regions_path = snakemake.output["consensus_regions"]
 
 # parameters
-slop_extension=snakemake.config["slop_extension"]
+slop_extension = snakemake.params["slop_extension"]
 
 # load summits and generate consensus regions using (py)bedtools
 output_bed = None
diff --git a/workflow/scripts/get_promoter_regions.py b/workflow/scripts/get_promoter_regions.py
@@ -37,17 +37,17 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
 #### configurations
 
 # input
-gtf_file = snakemake.config["gencode_gtf"]
-chrom_file = snakemake.config["chromosome_sizes"]
+gtf_file = snakemake.input["gencode_gtf"]
+chrom_file = snakemake.input["chromosome_sizes"]
+genome_fasta_path = snakemake.input["genome_fasta"]
 
 # output
 promoter_regions_path = snakemake.output["promoter_regions"]
 promoter_annot_path = snakemake.output["promoter_annot"]
 
 # parameters
-TSS_up = snakemake.config["proximal_size_up"]
-TSS_dn = snakemake.config["proximal_size_dn"]
-genome_fasta_path = snakemake.config["genome_fasta"]
+TSS_up = snakemake.params["proximal_size_up"]
+TSS_dn = snakemake.params["proximal_size_dn"]
 
 # load the genome annotation file using pybedtools
 gtf = bedtools.BedTool(gtf_file)
diff --git a/workflow/scripts/quantify_counts_sample.py b/workflow/scripts/quantify_counts_sample.py
@@ -9,7 +9,7 @@
 # input
 regions_path = snakemake.input["regions"]
 bamfile_path = snakemake.input["bamfile"]
-chrom_file = snakemake.config["chromosome_sizes"]
+chrom_file = snakemake.input["chromosome_sizes"]
 
 # output
 quant_count_path = snakemake.output["quant_counts"]
diff --git a/workflow/scripts/quantify_support_sample.py b/workflow/scripts/quantify_support_sample.py
@@ -10,7 +10,7 @@
 # input
 consensus_regions_path = snakemake.input["consensus_regions"]
 peakfile_path = snakemake.input["peakfile"]
-chrom_file = snakemake.config["chromosome_sizes"]
+chrom_file = snakemake.input["chromosome_sizes"]
 
 # output
 quant_support_path = snakemake.output["quant_support"]