snakemake-workflows · dlaehnemann · Jun 24, 2024 · Jul 3, 2024 · Jul 3, 2024 · Jul 4, 2024
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -9,13 +9,13 @@ ref:
   # Ensembl species name
   species: homo_sapiens
   # Ensembl release
-  release: 108
+  release: 112
   # Genome build
   build: GRCh38
   # Optionally, instead of downloading the whole reference from Ensembl via the 
   # parameters above, specify a specific chromosome below and uncomment the line.
   # This is usually only relevant for testing.
-  chromosome: 7
+  #chromosome: 7
 
 # These filters mostly correspond to the output columns of Circle-Map:
 # https://github.com/iprada/Circle-Map/wiki/Circle-Map-Realign-output-files

diff --git a/config/config.yaml b/config/config.yaml
@@ -9,7 +9,7 @@ ref:
   # Ensembl species name
   species: homo_sapiens
   # Ensembl release
-  release: 108
+  release: 112
   # Genome build
   build: GRCh38
   # Optionally, instead of downloading the whole reference from Ensembl via the 

diff --git a/workflow/envs/annotatr.yaml b/workflow/envs/annotatr.yaml
@@ -0,0 +1,7 @@
+channels:  # conda channels searched for the packages listed below
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:  # environment of the rule annotate_cleaned_circles (scripts/annotate_cleaned_circles.R)
+  - bioconductor-annotatr =1.28
+  - r-tidyverse =2.0
diff --git a/workflow/envs/biomart.yaml b/workflow/envs/biomart.yaml
@@ -0,0 +1,7 @@
+channels:  # conda channels searched for the packages listed below
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:  # environment of the rule create_transcripts_to_genes_mappings
+  - bioconductor-biomart =2.58
+  - r-tidyverse =2.0
diff --git a/workflow/envs/rtracklayer.yaml b/workflow/envs/rtracklayer.yaml
@@ -0,0 +1,7 @@
+channels:  # conda channels searched for the packages listed below
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:  # environment of the rule create_annotation_gff
+  - bioconductor-rtracklayer =1.62
+  - r-tidyverse =2.0
diff --git a/workflow/rules/circle_map.smk b/workflow/rules/circle_map.smk
@@ -76,3 +76,19 @@ rule clean_circle_map_realign_output:
         max_circle_length=config["circle_filtering"]["max_circle_length"],
     script:
         "../scripts/clean_circle_map_realign_output.py"
+
+
+rule annotate_cleaned_circles:  # annotate the cleaned circles of one sample with the harmonized annotations
+    input:
+        all_annotations="resources/all_annotations.harmonized.gff3.gz",
+        tsv="results/circle-map/{sample}.circles.cleaned.tsv",
+    output:
+        tsvs=directory("results/circle-map/{sample}.circles.cleaned.annotated/"),  # directory meant to hold one TSV per circle region
+    log:
+        "logs/circle-map/{sample}.circles.cleaned.annotated.log",  # ".log" suffix, consistent with the other rules' log files
+    conda:
+        "../envs/annotatr.yaml"
+    params:
+        build=config["ref"]["build"],  # genome build label handed to the script
+    script:
+        "../scripts/annotate_cleaned_circles.R"
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -83,6 +83,12 @@ def get_adapters(wildcards):
     ].get("adapters", ""),
 
 
+def get_bioc_species_name():
+    first_letter = config["ref"]["species"][0]
+    subspecies = config["ref"]["species"].split("_")[1]
+    return first_letter + subspecies
+
+
 def get_bwa_extra(wildcards):
     """
     Denote sample name and platform in read group.

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
@@ -10,6 +10,7 @@ rule get_genome:
         release=config["ref"]["release"],
         chromosome=config["ref"].get("chromosome"),
     cache: "omit-software"
+    localrule: True
     wrapper:
         "v1.21.2/bio/reference/ensembl-sequence"
 
@@ -69,6 +70,7 @@ rule get_known_variants:
         type="all",
         chromosome=config["ref"].get("chromosome"),
     cache: "omit-software"
+    localrule: True
     wrapper:
         "v1.21.2/bio/reference/ensembl-variation"
 
@@ -99,3 +101,67 @@ rule tabix_known_variants:
         "-p vcf",
     wrapper:
         "v1.21.2/bio/tabix/index"
+
+
+rule get_annotation:  # produce the gzipped GFF3 genomic annotation via the ensembl-annotation wrapper
+    output:
+        "resources/genomic_annotations.gff3.gz",
+    params:
+        species=config["ref"]["species"],  # Ensembl species name from the config
+        release=config["ref"]["release"],  # Ensembl release from the config
+        build=config["ref"]["build"],  # genome build from the config
+    log:
+        "logs/get-annotation.log",
+    cache: "omit-software"
+    localrule: True  # marked as a local rule, like the other reference download rules in this file
+    wrapper:
+        "v3.13.6/bio/reference/ensembl-annotation"
+
+
+rule get_regulatory_features_gff3_gz:  # produce the gzipped GFF3 regulatory annotation via the ensembl-regulation wrapper
+    output:
+        "resources/regulatory_annotations.gff3.gz",  # presence of .gz determines whether the download is kept compressed
+    params:
+        species=config["ref"]["species"],  # Ensembl species name from the config
+        release=config["ref"]["release"],  # Ensembl release from the config
+        build=config["ref"]["build"],  # genome build from the config
+    log:
+        "logs/get_regulatory_features.log",
+    cache: "omit-software"  # save space and time with between-workflow caching (see docs)
+    localrule: True  # marked as a local rule, like the other reference download rules in this file
+    wrapper:
+        "v3.13.6/bio/reference/ensembl-regulation"
+
+
+rule create_transcripts_to_genes_mappings:  # write the transcripts-to-genes mapping table using the biomart environment
+    output:
+        mapping="resources/transcripts_to_genes_mappings.tsv.gz",
+    params:
+        species=get_bioc_species_name(),  # abbreviated species name, e.g. "hsapiens" for "homo_sapiens"
+        release=config["ref"]["release"],
+        build=config["ref"]["build"],
+        chromosome=config["ref"].get("chromosome", ""),  # empty string when no chromosome is configured
+    log:
+        "logs/transcripts_to_genes_mappings.log",
+    conda:
+        "../envs/biomart.yaml"
+    cache: "omit-software"  # save space and time with between-workflow caching (see docs)
+    script:
+        "../scripts/create_transcripts_to_genes_mappings.R"
+
+
+rule create_annotation_gff:  # derive the harmonized GFF3 consumed by annotate_cleaned_circles from the three inputs below
+    input:
+        genomic_annotations="resources/genomic_annotations.gff3.gz",  # output of get_annotation
+        mapping="resources/transcripts_to_genes_mappings.tsv.gz",  # output of create_transcripts_to_genes_mappings
+        regulatory_annotations="resources/regulatory_annotations.gff3.gz",  # output of get_regulatory_features_gff3_gz
+    output:
+        all_annotations="resources/all_annotations.harmonized.gff3.gz",
+    log:
+        "logs/all_annotations.harmonized.gff3.log",
+    conda:
+        "../envs/rtracklayer.yaml"
+    params:
+        build=config["ref"]["build"],  # genome build label handed to the script
+    script:
+        "../scripts/create_annotation_gff3.R"
diff --git a/workflow/scripts/annotate_cleaned_circles.R b/workflow/scripts/annotate_cleaned_circles.R
@@ -0,0 +1,124 @@
+# Annotate cleaned Circle-Map circles with the overlapping entries of the
+# harmonized annotation GFF3 and write one TSV per circle region into the
+# directory output declared by the calling snakemake rule.
+
+# send both regular output and messages to the snakemake log file
+log <- file(snakemake@log[[1]], open="wt")
+sink(log)
+sink(log, type="message")
+
+library("tidyverse")
+rlang::global_entrace()
+
+library("annotatr")
+library("GenomicRanges")
+
+circles <- read_tsv(
+  snakemake@input[["tsv"]]
+)
+
+genome_build <- snakemake@params[["build"]]
+
+# Create the output directory up front: the per-circle files below are written
+# into it, and it then also exists when no circle has any annotation.
+output_dir <- snakemake@output[["tsvs"]]
+dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
+
+# NOTE(review): assumes the region column holds "chr:start-end" style strings
+# that GRanges() can parse -- confirm against the cleaning script.
+circles_gr <- GRanges(
+  seqnames = pull(circles, region),
+  ecDNA_status = "ecDNA"
+)
+
+genome(circles_gr) <- genome_build
+
+overlapping_annotations <- rtracklayer::import(
+  snakemake@input[["all_annotations"]],
+  which = circles_gr, #import only intervals overlapping circles
+  genome = genome_build
+)
+
+annotated_circles <- annotate_regions(
+  regions = circles_gr,
+  annotations = overlapping_annotations,
+  ignore.strand = TRUE,
+  quiet = FALSE
+) |>
+  as_tibble() |>
+  # drop the columns that are not part of the reported table
+  dplyr::select(
+    -c(
+      strand,
+      width,
+      ecDNA_status,
+      annot.width,
+      annot.source,
+      annot.score,
+      annot.phase
+    )
+  ) |>
+  arrange(
+    annot.seqnames,
+    annot.start,
+    annot.end
+  ) |>
+  # collapse circle and annotation coordinates into "seqname:start-end" strings
+  mutate(
+    circle_region = str_c(
+      seqnames,
+      ":",
+      start,
+      "-",
+      end
+    ),
+    region = str_c(
+      annot.seqnames,
+      ":",
+      annot.start,
+      "-",
+      annot.end
+    )
+  ) |>
+  dplyr::select(
+    -c(
+      seqnames,
+      start,
+      end,
+      annot.seqnames,
+      annot.start,
+      annot.end
+    )
+  ) |>
+  dplyr::rename(
+    exon_rank = annot.rank
+  ) |>
+  dplyr::rename_with(
+    ~ str_replace(.x, fixed("annot."), "")
+  ) |>
+  dplyr::select(
+    region,
+    type,
+    id,
+    name,
+    parent_type,
+    parent_id,
+    exon_rank,
+    circle_region,
+  ) |>
+  group_by(
+    circle_region
+  ) |>
+  # one file per circle, named after its region with ":" and "-" turned into "_"
+  group_walk(
+    ~ write_tsv(
+      .x,
+      file = file.path(
+        output_dir,
+        str_c(
+          str_replace_all(.y$circle_region, "[:-]", "_"),
+          ".tsv"
+        )
+      )
+    )
+  )
diff --git a/workflow/scripts/clean_circle_map_realign_output.py b/workflow/scripts/clean_circle_map_realign_output.py
@@ -33,7 +33,7 @@
 ]
 
 # turn int cols into int
-circles.loc[:, int_cols] = circles.loc[:, int_cols].round(0).applymap(lambda v: int(v) if not pd.isna(v) else pd.NA)
+circles[int_cols] = circles[int_cols].round(0).map(lambda v: int(v) if not pd.isna(v) else pd.NA)
 
-# turn int cols into int
-circles.loc[:, int_cols] = circles.loc[:, int_cols].round(0).applymap(lambda v: int(v) if not pd.isna(v) else pd.NA)
-circles[int_cols] = circles[int_cols].round(0).map(lambda v: int(v) if not pd.isna(v) else pd.NA)
+# turn int cols into int
+# convert each value to integer while keeping NA
+circles[int_cols] = (
+    circles[int_cols]
+        .round(0)
+        .applymap(lambda v: pd.NA if pd.isna(v) else int(v))
+)
-# turn int cols into int
-circles.loc[:, int_cols] = circles.loc[:, int_cols].round(0).applymap(lambda v: int(v) if not pd.isna(v) else pd.NA)
-circles[int_cols] = circles[int_cols].round(0).map(lambda v: int(v) if not pd.isna(v) else pd.NA)
+# turn int cols into int
+# convert each value to integer while keeping NA
+circles[int_cols] = (
+    circles[int_cols]
+        .round(0)
+        .applymap(lambda v: pd.NA if pd.isna(v) else int(v))
+)
 # filter out low-quality circles, according to:
 # https://github.com/iprada/Circle-Map/wiki/Circle-Map-Realign-output-files