diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9cb92df..0bb8c83 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,9 +11,13 @@ jobs: Formatting: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + with: + # super-linter needs the full git history to get the + # list of files that changed across commits + fetch-depth: 0 - name: Formatting - uses: github/super-linter@v4 + uses: github/super-linter@v7 env: VALIDATE_ALL_CODEBASE: false DEFAULT_BRANCH: main @@ -23,9 +27,9 @@ jobs: Linting: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Lint workflow - uses: snakemake/snakemake-github-action@v1.24.0 + uses: snakemake/snakemake-github-action@v2 with: directory: . snakefile: workflow/Snakefile @@ -37,17 +41,17 @@ jobs: - Linting - Formatting steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Test workflow - uses: snakemake/snakemake-github-action@v1.24.0 + uses: snakemake/snakemake-github-action@v2 with: directory: .test snakefile: .test/Snakefile args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache" - name: Test report - uses: snakemake/snakemake-github-action@v1.24.0 + uses: snakemake/snakemake-github-action@v2 with: directory: .test snakefile: .test/Snakefile diff --git a/.test/config/config.yaml b/.test/config/config.yaml index b469c8e..c1ebc86 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -9,13 +9,13 @@ ref: # Ensembl species name species: homo_sapiens # Ensembl release - release: 108 + release: 113 # Genome build build: GRCh38 # Optionally, instead of downloading the whole reference from Ensembl via the # parameters above, specify a specific chromosome below and uncomment the line. # This is usually only relevant for testing. - chromosome: 7 + #chromosome: 7 # These filters mostly correspond to the output columns of Circle-Map: # https://github.com/iprada/Circle-Map/wiki/Circle-Map-Realign-output-files diff --git a/config/config.yaml b/config/config.yaml index b36fded..f09b5c5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -9,7 +9,7 @@ ref: # Ensembl species name species: homo_sapiens # Ensembl release - release: 108 + release: 113 # Genome build build: GRCh38 # Optionally, instead of downloading the whole reference from Ensembl via the diff --git a/workflow/envs/annotatr.yaml b/workflow/envs/annotatr.yaml new file mode 100644 index 0000000..937fba1 --- /dev/null +++ b/workflow/envs/annotatr.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bioconductor-annotatr =1.28 + - r-tidyverse = 2.0 \ No newline at end of file diff --git a/workflow/envs/biomart.yaml b/workflow/envs/biomart.yaml new file mode 100644 index 0000000..b410e09 --- /dev/null +++ b/workflow/envs/biomart.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bioconductor-biomart =2.58 + - r-tidyverse = 2.0 \ No newline at end of file diff --git a/workflow/envs/rtracklayer.yaml b/workflow/envs/rtracklayer.yaml new file mode 100644 index 0000000..f440c42 --- /dev/null +++ b/workflow/envs/rtracklayer.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bioconductor-rtracklayer =1.62 + - r-tidyverse = 2.0 \ No newline at end of file diff --git a/workflow/resources/circles.datavzrd.yaml b/workflow/resources/circles.datavzrd.yaml index fb371fb..9be6e0a 100644 --- a/workflow/resources/circles.datavzrd.yaml +++ b/workflow/resources/circles.datavzrd.yaml @@ -1,15 +1,17 @@ +__use_yte__: true + name: ?f"Extrachromosomal circular DNAs (eccDNAs) found by Circle-Map in sample '{wildcards.sample}'." -default-view: circle_table +default-view: circles_overview datasets: - circles: - path: ?input.circles + circles_overview: + path: ?input.circles_overview separator: "\t" views: - circle_table: - dataset: circles + circles_overview: + dataset: circles_overview page-size: 18 desc: | ?""" diff --git a/workflow/rules/circle_map.smk b/workflow/rules/circle_map.smk index 057c23b..03c2709 100644 --- a/workflow/rules/circle_map.smk +++ b/workflow/rules/circle_map.smk @@ -76,3 +76,19 @@ rule clean_circle_map_realign_output: max_circle_length=config["circle_filtering"]["max_circle_length"], script: "../scripts/clean_circle_map_realign_output.py" + + +rule annotate_cleaned_circles: + input: + all_annotations="resources/ensembl_all_annotations.harmonized.gff3.gz", + tsv="results/circle-map/{sample}.circles.cleaned.tsv", + output: + tsvs=directory("results/circle-map/{sample}.circles.cleaned.annotated/"), + log: + "logs/circle-map/{sample}.circles.cleaned.annotated.logs", + conda: + "../envs/annotatr.yaml" + params: + build=config["ref"]["build"], + script: + "../scripts/annotate_cleaned_circles.R" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index ebf3165..4299360 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -83,6 +83,12 @@ def get_adapters(wildcards): ].get("adapters", ""), +def get_bioc_species_name(): + first_letter = config["ref"]["species"][0] + subspecies = config["ref"]["species"].split("_")[1] + return first_letter + subspecies + + def get_bwa_extra(wildcards): """ Denote sample name and platform in read group. diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index 76deed3..65ad089 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -1,19 +1,8 @@ -rule render_datavzrd_config: - input: - template=workflow.source_path("../resources/circles.datavzrd.yaml"), - circles="results/circle-map/{sample}.circles.cleaned.tsv", - output: - "resources/datavzrd/circle-map/{sample}.circles.yaml", - log: - "logs/datavzrd/circle-map/{sample}.circles.rendering.log", - template_engine: - "yte" - - rule datavzrd: input: - config="resources/datavzrd/circle-map/{sample}.circles.yaml", - circles="results/circle-map/{sample}.circles.cleaned.tsv", + config=workflow.source_path("../resources/circles.datavzrd.yaml"), + circles_overview="results/circle-map/{sample}.circles.cleaned.tsv", + circles_tables="results/circle-map/{sample}.circles.cleaned.annotated/", output: report( directory("results/datavzrd/circles/{sample}"), @@ -22,7 +11,8 @@ rule datavzrd: labels={"tool": "Circle-Map", "sample": "{sample}"}, caption="../report/circle_map.rst", ), + config="resources/datavzrd/circle-map/{sample}.circles.datavzrd.rendered.yaml", log: "logs/datavzrd/circles/{sample}.log", wrapper: - "v3.8.0/utils/datavzrd" + "v7.0.0/utils/datavzrd" diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 0958415..14e5454 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -10,6 +10,7 @@ rule get_genome: release=config["ref"]["release"], chromosome=config["ref"].get("chromosome"), cache: "omit-software" + localrule: True wrapper: "v1.21.2/bio/reference/ensembl-sequence" @@ -69,6 +70,7 @@ rule get_known_variants: type="all", chromosome=config["ref"].get("chromosome"), cache: "omit-software" + localrule: True wrapper: "v1.21.2/bio/reference/ensembl-variation" @@ -99,3 +101,105 @@ rule tabix_known_variants: "-p vcf", wrapper: "v1.21.2/bio/tabix/index" + + +rule get_annotation: + output: + "resources/ensembl_genomic_annotations.gff3.gz", + params: + species=config["ref"]["species"], + release=config["ref"]["release"], + build=config["ref"]["build"], + log: + "logs/get-annotation.log", + cache: "omit-software" + localrule: True + wrapper: + "v3.13.6/bio/reference/ensembl-annotation" + + +rule get_regulatory_features_gff3_gz: + output: + "resources/ensembl_regulatory_annotations.gff3.gz", # presence of .gz determines if downloaded is kept compressed + params: + species=config["ref"]["species"], + release=config["ref"]["release"], + build=config["ref"]["build"], + log: + "logs/get_regulatory_features.log", + cache: "omit-software" # save space and time with between workflow caching (see docs) + localrule: True + wrapper: + "v3.13.6/bio/reference/ensembl-regulation" + + +rule get_repeat_features_tsv_gz: + output: + table="resources/ensembl_repeat_annotations.tsv.gz", # .gz extension is optional, but recommended + params: + species=config["ref"]["species"], + release=config["ref"]["release"], + build=config["ref"]["build"], + main_tables={ + "repeat_feature": { + "database": "core", + }, + }, + # choose the main table to retrieve, specifying { table : database } + join_tables={ + "seq_region": { + "database": "core", + "join_column": "seq_region_id", + }, + "repeat_consensus": { + "database": "core", + "join_column": "repeat_consensus_id", + }, + }, + # optional: add tables to join in for further annotations, specifying { table : { "database": database, "join_column": join-column } } + log: + "logs/create_repeat_annotations.log", + cache: "omit-software" # save space and time with between workflow caching (see docs) + wrapper: + "v4.0.0/bio/reference/ensembl-mysql-table" + + +rule create_transcripts_to_genes_mapping: + output: + table="resources/ensembl_transcripts_to_genes_mapping.tsv.gz", # .gz extension is optional, but recommended + params: + biomart="genes", + species=config["ref"]["species"], + release=config["ref"]["release"], + build=config["ref"]["build"], + attributes=[ + "ensembl_transcript_id", + "ensembl_gene_id", + "external_gene_name", + "genecards", + ], + #filters={ "chromosome_name": ["22", "X"] }, # optional: restrict output by using filters + log: + "logs/create_transcripts_to_genes_mapping.log", + cache: "omit-software" # save space and time with between workflow caching (see docs) + wrapper: + "v4.0.0/bio/reference/ensembl-biomart-table" + + +rule create_annotation_gff: + input: + genomic_annotations="resources/ensembl_genomic_annotations.gff3.gz", + mapping="resources/ensembl_transcripts_to_genes_mapping.tsv.gz", + regulatory_annotations="resources/ensembl_regulatory_annotations.gff3.gz", + output: + all_annotations="resources/ensembl_all_annotations.harmonized.gff3.gz", + log: + "logs/all_annotations.harmonized.gff3.log", + conda: + "../envs/rtracklayer.yaml" + params: + build=config["ref"]["build"], + resources: + mem_mb=lambda wc, input: 100 * input.size_mb, + script: + "../scripts/create_annotation_gff3.R" diff --git a/workflow/scripts/annotate_cleaned_circles.R b/workflow/scripts/annotate_cleaned_circles.R new file mode 100644 index 0000000..225efb3 --- /dev/null +++ b/workflow/scripts/annotate_cleaned_circles.R @@ -0,0 +1,117 @@ +log <- file(snakemake@log[[1]], open="wt") +sink(log) +sink(log, type="message") + +library("tidyverse") +rlang::global_entrace() + +library("annotatr") +library("GenomicRanges") + +circles <- read_tsv( + snakemake@input[["tsv"]] +) + +genome_build <- snakemake@params[["build"]] + +circles_gr <- GRanges( + seqnames = pull(circles, region), + ecDNA_status = "ecDNA" +) + +genome(circles_gr) <- genome_build + +overlapping_annotations <- rtracklayer::import( + snakemake@input[["all_annotations"]], + which = circles_gr, #import only intervals overlapping circles + genome = genome_build +) + +dir.create( + snakemake@output[["tsvs"]], + recursive = TRUE +) + +annotated_circles <- annotate_regions( + regions = circles_gr, + annotations = overlapping_annotations, + ignore.strand = TRUE, + quiet = FALSE +) |> + as_tibble() |> + dplyr::select( + -c( + strand, + width, + ecDNA_status, + annot.width, + annot.source, + annot.score, + annot.phase + ) + ) |> + arrange( + annot.seqnames, + annot.start, + annot.end + ) |> + mutate( + circle_region = str_c( + seqnames, + ":", + start, + "-", + end + ), + region = str_c( + annot.seqnames, + ":", + annot.start, + "-", + annot.end + ) + ) |> + dplyr::select( + -c( + seqnames, + start, + end, + annot.seqnames, + annot.start, + annot.end + ) + ) |> + dplyr::rename( + exon_rank = annot.rank + ) |> + dplyr::rename_with( + ~ str_replace(.x, fixed("annot."), "") + ) |> + dplyr::select( + region, + strand, + type, + ensembl_id, + name, + ensembl_transcript_id, + ensembl_gene_id, + hgnc_symbol, + genecards_id, + exon_rank, + circle_region, + ) |> + group_by( + circle_region + ) |> + group_walk( + ~ write_tsv( + .x, + file = file.path( + snakemake@output[["tsvs"]], + str_c( + str_replace_all(.y$circle_region, "[:-]", "_"), + ".tsv" + ) + ) + ) + ) \ No newline at end of file diff --git a/workflow/scripts/clean_circle_map_realign_output.py b/workflow/scripts/clean_circle_map_realign_output.py index d76f1a1..9feb9c6 100644 --- a/workflow/scripts/clean_circle_map_realign_output.py +++ b/workflow/scripts/clean_circle_map_realign_output.py @@ -33,7 +33,7 @@ ] # turn int cols into int -circles.loc[:, int_cols] = circles.loc[:, int_cols].round(0).applymap(lambda v: int(v) if not pd.isna(v) else pd.NA) +circles[int_cols] = circles[int_cols].round(0).map(lambda v: int(v) if not pd.isna(v) else pd.NA) # filter out low-quality circles, according to: # https://github.com/iprada/Circle-Map/wiki/Circle-Map-Realign-output-files diff --git a/workflow/scripts/create_annotation_gff3.R b/workflow/scripts/create_annotation_gff3.R new file mode 100644 index 0000000..0ae6f3d --- /dev/null +++ b/workflow/scripts/create_annotation_gff3.R @@ -0,0 +1,196 @@ +log <- file(snakemake@log[[1]], open="wt") +sink(log) +sink(log, type="message") + +library("tidyverse") +rlang::global_entrace() + +library("GenomicRanges") +library("rtracklayer") + +transcripts_to_genes_mappings <- read_tsv( + snakemake@input[["mapping"]] +) + +genome_build <- snakemake@params[["build"]] + +genomic_annotations <- rtracklayer::import( + snakemake@input[["genomic_annotations"]], + genome = genome_build, + feature.type = c( + "five_prime_UTR", + "three_prime_UTR", + "exon", + "gene", + "lnc_RNA", + "miRNA", + "snRNA", + "snoRNA", + "scRNA", + "rRNA", + "tRNA", + "V_gene_segment", + "D_gene_segment", + "J_gene_segment", + "C_gene_segment" + ), + colnames = c( + "type", + "Name", + "ID", + "Parent", + "rank", + "phase" + ) +) |> + as_tibble() |> + separate_wider_delim( + ID, + delim = ":", + names = c("id_type", "id") + ) |> + dplyr::select(-id_type) |> + unnest( + Parent, + keep_empty = TRUE + ) |> + separate_wider_delim( + Parent, + delim = ":", + names = c("parent_type", "parent_id") + ) |> + dplyr::rename(name = Name) + +regulatory_annotations <- rtracklayer::import( + snakemake@input[["regulatory_annotations"]], + genome = genome_build, + # make sure to expand parent_type generation below, if you add + # further types right here + feature.type = c( + "promoter", + "enhancer", + "TF_binding_site", + "CTCF_binding_site" + ), + colnames = c( + "type", + "ID", + "gene_id", + "gene_name", + "phase" + ) +) |> + as_tibble() |> + unnest_wider( + c(gene_id, gene_name), + names_sep = "-" + ) |> + pivot_longer( + starts_with("gene_"), + names_to = c("id_type", "gene_nr"), + values_to = "val", + names_sep = "-" + ) |> + pivot_wider( + names_from = id_type, + values_from = val + ) |> + mutate( + gene_name = na_if(gene_name, "") + ) |> + filter( + !( gene_nr > 1 & is.na(gene_id) & is.na(gene_name) ) + ) |> + dplyr::select( + !gene_nr + ) |> + dplyr::rename( + name = gene_name, + id = ID, + parent_id = gene_id, + ) |> + mutate( + rank = NA + ) + + +transcript_id_to_gene_id_mapping <- transcripts_to_genes_mappings |> + dplyr::select( + c( + ensembl_transcript_id, + ensembl_gene_id + ) + ) |> + distinct() + +gene_id_annotations <- transcripts_to_genes_mappings |> + dplyr::select( + -ensembl_transcript_id + ) |> + distinct() |> + dplyr::rename( + hgnc_symbol = external_gene_name, + genecards_id = genecards + ) + +all_annotations <- bind_rows( + genomic_annotations, + regulatory_annotations + ) |> + mutate( + ensembl_transcript_id = case_when( + str_detect(parent_id, "^ENST") ~ parent_id, + str_detect(id, "^ENST") ~ id + ), + gene_id = case_when( + str_detect(parent_id, "^ENSG") ~ parent_id, + str_detect(id, "^ENSG") ~ id, + ) + ) |> + dplyr::select( + -c( + parent_type, + parent_id, + phase + ) + ) |> + left_join( + transcript_id_to_gene_id_mapping, + by = join_by( + ensembl_transcript_id + ) + ) |> + mutate( + ensembl_gene_id = if_else( + is.na(ensembl_gene_id), + gene_id, + ensembl_gene_id + ) + ) |> + dplyr::select( + -gene_id + ) |> + # bring in the gene symbols + left_join( + gene_id_annotations, + by = join_by( + ensembl_gene_id + ) + ) |> + # make sure, everything has an `id` + mutate( + id = case_when( + is.na(id) & str_detect(name, "^ENS") ~ name, + is.na(id) & str_detect(type, "_UTR$") ~ ensembl_transcript_id, + .default = id + ) + ) |> + dplyr::rename( + ensembl_id = id + ) |> + GRanges() + +seqlevels(all_annotations) <- sort(seqlevels(all_annotations)) +all_annotations <- sort(all_annotations, ignore.strand = TRUE) + +rtracklayer::export(all_annotations, snakemake@output[["all_annotations"]]) \ No newline at end of file