From 6b201470aec61f0c624c011c134efea592664dbe Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Wed, 14 May 2025 15:55:08 -0400 Subject: [PATCH 01/12] qc report --- .../Glimpse2QCReport.wdl | 221 ++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 GlimpseImputationPipeline/Glimpse2QCReport.wdl diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl new file mode 100644 index 00000000..4fe8f570 --- /dev/null +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -0,0 +1,221 @@ +version 1.0 + +workflow Glimpse2QCReport { + +} + +task Glimpse2QCReport_t { + input { + String cohort_name + File metrics + File ancestries + File predicted_sex + File info_score_qc + File info_mean_quantile + File info_score_sampling + Int mem_gb=4 + } + + command <<< + set -eo pipefail + + cat << EOF > ~{cohort_name}_qc_report.Rmd + --- + title: GLIMPSE Imputation QC Report + subtitle: ~{cohort_name} + author: "Broad Clinical Labs" + date: "`r Sys.Date()`" + output: pdf_document + --- + + \`\`\`{r setup, include=FALSE} + knitr::opts_chunk$set(echo = FALSE, message=FALSE, warning = FALSE) + library(dplyr) + library(readr) + library(ggplot2) + library(knitr) + library(scales) + \`\`\` + + \`\`\`{r load data, include=FALSE} + qc_metrics = read_tsv("~{metrics}") + ancestries = read_csv("~{ancestries}") + predicted_sex = read_tsv("~{predicted_sex}") + + qc_metrics = qc_metrics %>% inner_join(ancestries, by=c("sample_id"="s")) %>% inner_join(predicted_sex) + + ancestry_counts <- ancestries %>% group_by(ancestry) %>% count() %>% arrange(-n) + + coverage_metrics<-read_tsv(c("batch_1.coverage_metrics.txt","batch_0.coverage_metrics.txt")) + coverage_metrics_per_sample <- coverage_metrics %>% group_by(Sample) %>% summarise(mean_cov=mean(`Cov.`), + mean_frac_sites = mean(1-`No data pct`/100)) + + info_score_count <- read_tsv("~{info_score_qc}") %>% arrange(-total_sites) + + info_mean_quantile <- read_tsv("~{info_mean_quantile}") + + info_sample <- read_tsv("~{info_score_sampling}") + + \`\`\` + # Site QC + ## Site Coverage QC + During GLIMPSE imputation, the genome is split into 837 chunks, with the majority of the algorithm being applied to each chunk separately. All the chunks are then ligated back together at the end of the process. For each chunk, the mean coverage of sites being genotyped is calculated for each sample, along with the fractions of sites covered by at least one read. + + In the histograms below, we show the distributions of mean coverage and fractions of sites covered over all chunk/sample combinations. + + + \`\`\`{r mean_cov, message=FALSE, warning=FALSE} + ggplot(coverage_metrics, aes(x=`Cov.`)) + + geom_histogram() + theme_bw() + xlim(0,20) + + xlab("Mean Coverage (per sample x chunk)") + \`\`\` + + \`\`\`{r frac_covered, message=FALSE, warning=FALSE} + ggplot(coverage_metrics, aes(x=1-`No data pct`/100)) + + geom_histogram() + theme_bw() + xlim(0,1) + + xlab("Fraction of Sites Covered by >=1 Read (per sample x chunk)") + \`\`\` + + Below we show the same plots, but with the values for each sample averaged across all chunks. + + \`\`\`{r mean_cov_sample, message=FALSE, warning=FALSE} + ggplot(coverage_metrics_per_sample, aes(x=mean_cov)) + + geom_histogram() + theme_bw() + xlim(0,20) + + xlab("Mean Coverage (per sample)") + \`\`\` + + \`\`\`{r frac_covered_sample, message=FALSE, warning=FALSE} + ggplot(coverage_metrics_per_sample, aes(x=mean_frac_sites)) + + geom_histogram() + theme_bw() + xlim(0,1) + + xlab("Fraction of Sites Covered by >=1 Read (per sample)") + \`\`\` + + + ## Info Score QC + For each genotyped site, GLIMPSE calculates an INFO score, which is it`s estimate of how well it has imputed the site across all the samples in the cohort. It is standard practice during downstream analyses using imputed datasets to filter sites according the an INFO score threshold, often with the INFO score required to be greater than either 0.6 or 0.8. + + The total number and fraction of sites passing passing these thresholds is show in the table below. The rows in the table indicate various reference panel allele frequency thresholds, since more common sites are generally imputed much more accurately. + + \`\`\`{r info_score_table} + kable(info_score_count, format.args = list(big.mark = ','), digits = 2, col.names = c("","Total Sites", "INFO > 0.6", "INFO > 0.8", + "Fraction with INFO > 0.6", "Fraction with INFO > 0.8")) + \`\`\` + + Below we show the relationship between reference panel allele frequency and INFO score. The line and shaded regions represent the mean and 5%-95% quantiles for various reference panel allele frequency bins. The points are a random sampling of 10,000 sites. + + \`\`\`{r info_score_plot} + ggplot(info_sample, aes(x=RAF,y=INFO)) + + geom_point(alpha=0.2) + + geom_line(data=info_mean_quantile, aes(x=raf, y=mean), color='red') + + geom_ribbon(data=info_mean_quantile, aes(x=raf, y=mean, ymin=fifth_pctile, ymax=ninety_fifth_pctile), fill='red', alpha=0.3) + + scale_x_log10() + theme_bw() + xlab("Reference Panel Allele Frequency") + \`\`\` + + # Sample QC + + ## Genetic Ancestry Estimation + As part of sample QC, genetic ancestry was estimated for all samples, in order to confirm that any stratification of QC metrics are expected based on genetic ancestry. Genetic ancestry was estimated by PCA using resources provided by [gnomad](https://gnomad.broadinstitute.org/downloads#v3-ancestry-classification). Note that these ancestry estimates are used solely for QC, and are not intended to be used for any other purpose. + + \`\`\`{r pca} + ggplot(ancestries, aes(x=PC1, y=PC2)) + + geom_point(aes(color=ancestry)) + theme_bw() + \`\`\` + + \`\`\`{r ancestry_table} + kable(ancestry_counts) + \`\`\` + + ## QC Metrics + Sample QC Metrics distributions are shown below. Note that some metrics are stratified by reported sex in addition to by ancestry due to expected differences in the non-PAR region of Chromosome X. These metrics are calculated using [hail sample_qc](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.sample_qc). Note that for genotype counts, each site is counted at most once per sample. However, metrics like `Number of SNP Alt Alleles` count the number of alternate alleles observed in a sample, in which case a heterozygous genotype would count as one, and a homozygous alternate genotype as two. + + ### Number of Homozygous Reference Genotypes + \`\`\`{r n_hom_ref} + ggplot(qc_metrics, aes(x=n_hom_ref, fill=ancestry)) + + geom_histogram() + theme_bw() + + xlab("Number of Homozygous Reference Genotypes") + + scale_x_continuous(labels=comma) + \`\`\` + + ### Number of Heterozygous Genotypes + \`\`\`{r n_het} + ggplot(qc_metrics %>% filter(predicted_sex %in% c("F","M")), aes(x=n_het, fill=ancestry)) + + geom_histogram() + theme_bw() + facet_grid(~predicted_sex) + + xlab("Number of Heterozygous Genotypes") + + scale_x_continuous(labels=comma) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + \`\`\` + + + ### Number of Homozygous Variant Genotypes + \`\`\`{r n_hom_var} + ggplot(qc_metrics %>% filter(predicted_sex %in% c("F","M")), aes(x=n_hom_var, fill=ancestry)) + + geom_histogram() + theme_bw() + facet_grid(~predicted_sex) + + xlab("Number of Homozygous Variant Genotypes") + + scale_x_continuous(labels=comma) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + \`\`\` + + ### Ratio of Heterozygous to Homozygous Variant Genotypes + \`\`\`{r r_het_hom_var} + ggplot(qc_metrics %>% filter(predicted_sex %in% c("F","M")), aes(x=r_het_hom_var, fill=ancestry)) + + geom_histogram() + theme_bw() + facet_grid(~predicted_sex) + + xlab("Ratio of Heterozygous to Homozygous Variant Genotypes") + + scale_x_continuous(labels=comma) + \`\`\` + + ### Number of Non-Reference Genotypes + \`\`\`{r n_non_ref} + ggplot(qc_metrics, aes(x=n_non_ref, fill=ancestry)) + + geom_histogram() + theme_bw() + + xlab("Number of Non-Reference Genotypes") + + scale_x_continuous(labels=comma) + \`\`\` + + ### Number of SNP Alt Alleles + \`\`\`{r n_snps} + ggplot(qc_metrics, aes(x=n_snp, fill=ancestry)) + + geom_histogram() + theme_bw() + + xlab("Number of SNP Alt Alleles") + + scale_x_continuous(labels=comma) + \`\`\` + + ### Number of Insertion Alt Alleles + \`\`\`{r n_ins} + ggplot(qc_metrics, aes(x=n_insertion, fill=ancestry)) + + geom_histogram() + theme_bw() + + xlab("Number of Insertion Alt Alleles") + + scale_x_continuous(labels=comma) + \`\`\` + + ### Number of Deletion Alt Alleles + \`\`\`{r n_del} + ggplot(qc_metrics, aes(x=n_deletion, fill=ancestry)) + + geom_histogram() + theme_bw() + + xlab("Number of Deletion Alt Alleles") + + scale_x_continuous(labels=comma) + \`\`\` + + ### ti/tv Ratio + \`\`\`{r r_ti_tv} + ggplot(qc_metrics, aes(x=r_ti_tv, fill=ancestry)) + + geom_histogram() + theme_bw() + + xlab("ti/tv Ratio") + + scale_x_continuous(labels=comma) + \`\`\` + + EOF + + Rscript -e "library(rmarkdown); rmarkdown::render('~{cohort_name}_qc_report.Rmd', 'pdf_document')" + + <<< + + runtime { + docker: "us.gcr.io/broad-dsde-methods/r-gcnv-viz@sha256:c92a9a26cba4ab10b579776cd4dd70d5fca485d4a7216d0ab9b477662e3d9228" + disks: "local-disk 100 HDD" + memory: mem_gb + " GB" + } + + output { + File qc_report = "~{cohort_name}_qc_report.pdf" + } +} \ No newline at end of file From e82e6fcd32b140e81777d4f94007b48972c0b799 Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 15 May 2025 09:11:26 -0400 Subject: [PATCH 02/12] dockstore --- .dockstore.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.dockstore.yml b/.dockstore.yml index b374ae98..993d67b8 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -150,3 +150,6 @@ workflows: - name: CNVControlEventsQC subclass: WDL primaryDescriptorPath: /gCNV/CNVControlEventsQC.wdl + - name: Glimpse2QCReport + subclass: WDL + primaryDescriptorPath: /GlimpseImputationPipeline/Glimpse2QCReport.wdl From 5ae0f34c777dc9fa5858e15572517843bfa3e6e9 Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 15 May 2025 09:25:33 -0400 Subject: [PATCH 03/12] . --- GlimpseImputationPipeline/Glimpse2QCReport.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index 4fe8f570..2ae00cfa 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -207,7 +207,7 @@ task Glimpse2QCReport_t { Rscript -e "library(rmarkdown); rmarkdown::render('~{cohort_name}_qc_report.Rmd', 'pdf_document')" - <<< + >>> runtime { docker: "us.gcr.io/broad-dsde-methods/r-gcnv-viz@sha256:c92a9a26cba4ab10b579776cd4dd70d5fca485d4a7216d0ab9b477662e3d9228" From 506f6bb36ad1c36c7fe6e8cef8ba48ae0274c8fa Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 15 May 2025 09:30:08 -0400 Subject: [PATCH 04/12] . --- .../Glimpse2QCReport.wdl | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index 2ae00cfa..108187d5 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -1,7 +1,30 @@ version 1.0 workflow Glimpse2QCReport { + input { + String cohort_name + File metrics + File ancestries + File predicted_sex + File info_score_qc + File info_mean_quantile + File info_score_sampling + } + + call Glimpse2QCReport_t { + input: + cohort_name = cohort_name, + metrics = metrics, + ancestries = ancestries, + predicted_sex = predicted_sex, + info_score_qc = info_score_qc, + info_mean_quantile = info_mean_quantile, + info_score_sampling = info_score_sampling + } + output { + File qc_report = Glimpse2QCReport_t.qc_report + } } task Glimpse2QCReport_t { From a412e5884091f822fd407026ec543ad7473fc7d9 Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 15 May 2025 13:38:38 -0400 Subject: [PATCH 05/12] . --- .../Glimpse2QCReport.wdl | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index 108187d5..fe294887 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -52,7 +52,6 @@ task Glimpse2QCReport_t { --- \`\`\`{r setup, include=FALSE} - knitr::opts_chunk$set(echo = FALSE, message=FALSE, warning = FALSE) library(dplyr) library(readr) library(ggplot2) @@ -60,7 +59,7 @@ task Glimpse2QCReport_t { library(scales) \`\`\` - \`\`\`{r load data, include=FALSE} + \`\`\`{r load data, include=FALSE,echo=FALSE, message=FALSE, warning=FALSE} qc_metrics = read_tsv("~{metrics}") ancestries = read_csv("~{ancestries}") predicted_sex = read_tsv("~{predicted_sex}") @@ -87,13 +86,13 @@ task Glimpse2QCReport_t { In the histograms below, we show the distributions of mean coverage and fractions of sites covered over all chunk/sample combinations. - \`\`\`{r mean_cov, message=FALSE, warning=FALSE} + \`\`\`{r mean_cov, echo=FALSE, message=FALSE, warning=FALSE} ggplot(coverage_metrics, aes(x=`Cov.`)) + geom_histogram() + theme_bw() + xlim(0,20) + xlab("Mean Coverage (per sample x chunk)") \`\`\` - \`\`\`{r frac_covered, message=FALSE, warning=FALSE} + \`\`\`{r frac_covered, echo=FALSE, message=FALSE, warning=FALSE} ggplot(coverage_metrics, aes(x=1-`No data pct`/100)) + geom_histogram() + theme_bw() + xlim(0,1) + xlab("Fraction of Sites Covered by >=1 Read (per sample x chunk)") @@ -101,13 +100,13 @@ task Glimpse2QCReport_t { Below we show the same plots, but with the values for each sample averaged across all chunks. - \`\`\`{r mean_cov_sample, message=FALSE, warning=FALSE} + \`\`\`{r mean_cov_sample, echo=FALSE, message=FALSE, warning=FALSE} ggplot(coverage_metrics_per_sample, aes(x=mean_cov)) + geom_histogram() + theme_bw() + xlim(0,20) + xlab("Mean Coverage (per sample)") \`\`\` - \`\`\`{r frac_covered_sample, message=FALSE, warning=FALSE} + \`\`\`{r frac_covered_sample, echo=FALSE, message=FALSE, warning=FALSE} ggplot(coverage_metrics_per_sample, aes(x=mean_frac_sites)) + geom_histogram() + theme_bw() + xlim(0,1) + xlab("Fraction of Sites Covered by >=1 Read (per sample)") @@ -115,18 +114,18 @@ task Glimpse2QCReport_t { ## Info Score QC - For each genotyped site, GLIMPSE calculates an INFO score, which is it`s estimate of how well it has imputed the site across all the samples in the cohort. It is standard practice during downstream analyses using imputed datasets to filter sites according the an INFO score threshold, often with the INFO score required to be greater than either 0.6 or 0.8. + For each genotyped site, GLIMPSE calculates an INFO score, which is it\`s estimate of how well it has imputed the site across all the samples in the cohort. It is standard practice during downstream analyses using imputed datasets to filter sites according the an INFO score threshold, often with the INFO score required to be greater than either 0.6 or 0.8. The total number and fraction of sites passing passing these thresholds is show in the table below. The rows in the table indicate various reference panel allele frequency thresholds, since more common sites are generally imputed much more accurately. - \`\`\`{r info_score_table} + \`\`\`{r info_score_table, echo=FALSE, message=FALSE, warning=FALSE} kable(info_score_count, format.args = list(big.mark = ','), digits = 2, col.names = c("","Total Sites", "INFO > 0.6", "INFO > 0.8", "Fraction with INFO > 0.6", "Fraction with INFO > 0.8")) \`\`\` Below we show the relationship between reference panel allele frequency and INFO score. The line and shaded regions represent the mean and 5%-95% quantiles for various reference panel allele frequency bins. The points are a random sampling of 10,000 sites. - \`\`\`{r info_score_plot} + \`\`\`{r info_score_plot, echo=FALSE, message=FALSE, warning=FALSE} ggplot(info_sample, aes(x=RAF,y=INFO)) + geom_point(alpha=0.2) + geom_line(data=info_mean_quantile, aes(x=raf, y=mean), color='red') + @@ -139,20 +138,20 @@ task Glimpse2QCReport_t { ## Genetic Ancestry Estimation As part of sample QC, genetic ancestry was estimated for all samples, in order to confirm that any stratification of QC metrics are expected based on genetic ancestry. Genetic ancestry was estimated by PCA using resources provided by [gnomad](https://gnomad.broadinstitute.org/downloads#v3-ancestry-classification). Note that these ancestry estimates are used solely for QC, and are not intended to be used for any other purpose. - \`\`\`{r pca} + \`\`\`{r pca, echo=FALSE, message=FALSE, warning=FALSE} ggplot(ancestries, aes(x=PC1, y=PC2)) + geom_point(aes(color=ancestry)) + theme_bw() \`\`\` - \`\`\`{r ancestry_table} + \`\`\`{r ancestry_table, echo=FALSE, message=FALSE, warning=FALSE} kable(ancestry_counts) \`\`\` ## QC Metrics - Sample QC Metrics distributions are shown below. Note that some metrics are stratified by reported sex in addition to by ancestry due to expected differences in the non-PAR region of Chromosome X. These metrics are calculated using [hail sample_qc](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.sample_qc). Note that for genotype counts, each site is counted at most once per sample. However, metrics like `Number of SNP Alt Alleles` count the number of alternate alleles observed in a sample, in which case a heterozygous genotype would count as one, and a homozygous alternate genotype as two. + Sample QC Metrics distributions are shown below. Note that some metrics are stratified by reported sex in addition to by ancestry due to expected differences in the non-PAR region of Chromosome X. These metrics are calculated using [hail sample_qc](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.sample_qc). Note that for genotype counts, each site is counted at most once per sample. However, metrics like \`Number of SNP Alt Alleles\` count the number of alternate alleles observed in a sample, in which case a heterozygous genotype would count as one, and a homozygous alternate genotype as two. ### Number of Homozygous Reference Genotypes - \`\`\`{r n_hom_ref} + \`\`\`{r n_hom_ref, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics, aes(x=n_hom_ref, fill=ancestry)) + geom_histogram() + theme_bw() + xlab("Number of Homozygous Reference Genotypes") + @@ -160,7 +159,7 @@ task Glimpse2QCReport_t { \`\`\` ### Number of Heterozygous Genotypes - \`\`\`{r n_het} + \`\`\`{r n_het, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics %>% filter(predicted_sex %in% c("F","M")), aes(x=n_het, fill=ancestry)) + geom_histogram() + theme_bw() + facet_grid(~predicted_sex) + xlab("Number of Heterozygous Genotypes") + @@ -170,7 +169,7 @@ task Glimpse2QCReport_t { ### Number of Homozygous Variant Genotypes - \`\`\`{r n_hom_var} + \`\`\`{r n_hom_var, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics %>% filter(predicted_sex %in% c("F","M")), aes(x=n_hom_var, fill=ancestry)) + geom_histogram() + theme_bw() + facet_grid(~predicted_sex) + xlab("Number of Homozygous Variant Genotypes") + @@ -179,7 +178,7 @@ task Glimpse2QCReport_t { \`\`\` ### Ratio of Heterozygous to Homozygous Variant Genotypes - \`\`\`{r r_het_hom_var} + \`\`\`{r r_het_hom_var, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics %>% filter(predicted_sex %in% c("F","M")), aes(x=r_het_hom_var, fill=ancestry)) + geom_histogram() + theme_bw() + facet_grid(~predicted_sex) + xlab("Ratio of Heterozygous to Homozygous Variant Genotypes") + @@ -187,7 +186,7 @@ task Glimpse2QCReport_t { \`\`\` ### Number of Non-Reference Genotypes - \`\`\`{r n_non_ref} + \`\`\`{r n_non_ref, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics, aes(x=n_non_ref, fill=ancestry)) + geom_histogram() + theme_bw() + xlab("Number of Non-Reference Genotypes") + @@ -195,7 +194,7 @@ task Glimpse2QCReport_t { \`\`\` ### Number of SNP Alt Alleles - \`\`\`{r n_snps} + \`\`\`{r n_snps, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics, aes(x=n_snp, fill=ancestry)) + geom_histogram() + theme_bw() + xlab("Number of SNP Alt Alleles") + @@ -203,7 +202,7 @@ task Glimpse2QCReport_t { \`\`\` ### Number of Insertion Alt Alleles - \`\`\`{r n_ins} + \`\`\`{r n_ins, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics, aes(x=n_insertion, fill=ancestry)) + geom_histogram() + theme_bw() + xlab("Number of Insertion Alt Alleles") + @@ -211,7 +210,7 @@ task Glimpse2QCReport_t { \`\`\` ### Number of Deletion Alt Alleles - \`\`\`{r n_del} + \`\`\`{r n_del, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics, aes(x=n_deletion, fill=ancestry)) + geom_histogram() + theme_bw() + xlab("Number of Deletion Alt Alleles") + @@ -219,7 +218,7 @@ task Glimpse2QCReport_t { \`\`\` ### ti/tv Ratio - \`\`\`{r r_ti_tv} + \`\`\`{r r_ti_tv, echo=FALSE, message=FALSE, warning=FALSE} ggplot(qc_metrics, aes(x=r_ti_tv, fill=ancestry)) + geom_histogram() + theme_bw() + xlab("ti/tv Ratio") + From fabcf0221d1e431ed3a4b9ad12debe106d9f4d95 Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 15 May 2025 13:53:55 -0400 Subject: [PATCH 06/12] . --- GlimpseImputationPipeline/Glimpse2QCReport.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index fe294887..47d0affc 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -47,7 +47,7 @@ task Glimpse2QCReport_t { title: GLIMPSE Imputation QC Report subtitle: ~{cohort_name} author: "Broad Clinical Labs" - date: "`r Sys.Date()`" + date: "\`r Sys.Date()\`" output: pdf_document --- @@ -69,8 +69,8 @@ task Glimpse2QCReport_t { ancestry_counts <- ancestries %>% group_by(ancestry) %>% count() %>% arrange(-n) coverage_metrics<-read_tsv(c("batch_1.coverage_metrics.txt","batch_0.coverage_metrics.txt")) - coverage_metrics_per_sample <- coverage_metrics %>% group_by(Sample) %>% summarise(mean_cov=mean(`Cov.`), - mean_frac_sites = mean(1-`No data pct`/100)) + coverage_metrics_per_sample <- coverage_metrics %>% group_by(Sample) %>% summarise(mean_cov=mean(\`Cov.\`), + mean_frac_sites = mean(1-\`No data pct\`/100)) info_score_count <- read_tsv("~{info_score_qc}") %>% arrange(-total_sites) @@ -87,13 +87,13 @@ task Glimpse2QCReport_t { \`\`\`{r mean_cov, echo=FALSE, message=FALSE, warning=FALSE} - ggplot(coverage_metrics, aes(x=`Cov.`)) + + ggplot(coverage_metrics, aes(x=\`Cov.\`)) + geom_histogram() + theme_bw() + xlim(0,20) + xlab("Mean Coverage (per sample x chunk)") \`\`\` \`\`\`{r frac_covered, echo=FALSE, message=FALSE, warning=FALSE} - ggplot(coverage_metrics, aes(x=1-`No data pct`/100)) + + ggplot(coverage_metrics, aes(x=1-\`No data pct\`/100)) + geom_histogram() + theme_bw() + xlim(0,1) + xlab("Fraction of Sites Covered by >=1 Read (per sample x chunk)") \`\`\` From a44207868784299b070fc413ebf1d6b4d2ac7579 Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 15 May 2025 15:58:12 -0400 Subject: [PATCH 07/12] . --- GlimpseImputationPipeline/Glimpse2QCReport.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index 47d0affc..2bb1491d 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -31,6 +31,7 @@ task Glimpse2QCReport_t { input { String cohort_name File metrics + File coverage_metrics File ancestries File predicted_sex File info_score_qc From e2f8d8ee33daa2657d61a728cb976f965525f64e Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 22 May 2025 15:37:47 -0400 Subject: [PATCH 08/12] coverage metrics --- GlimpseImputationPipeline/Glimpse2QCReport.wdl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index 2bb1491d..485d379f 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -4,17 +4,18 @@ workflow Glimpse2QCReport { input { String cohort_name File metrics + Array[File] coverage_metrics File ancestries File predicted_sex File info_score_qc File info_mean_quantile File info_score_sampling } - call Glimpse2QCReport_t { input: cohort_name = cohort_name, metrics = metrics, + coverage_metrics = coverage_metrics, ancestries = ancestries, predicted_sex = predicted_sex, info_score_qc = info_score_qc, @@ -22,6 +23,7 @@ workflow Glimpse2QCReport { info_score_sampling = info_score_sampling } + output { File qc_report = Glimpse2QCReport_t.qc_report } @@ -31,7 +33,7 @@ task Glimpse2QCReport_t { input { String cohort_name File metrics - File coverage_metrics + Array[File] coverage_metrics File ancestries File predicted_sex File info_score_qc @@ -69,7 +71,7 @@ task Glimpse2QCReport_t { ancestry_counts <- ancestries %>% group_by(ancestry) %>% count() %>% arrange(-n) - coverage_metrics<-read_tsv(c("batch_1.coverage_metrics.txt","batch_0.coverage_metrics.txt")) + coverage_metrics<-read_tsv(c("~{sep='","' coverage_metrics}")) coverage_metrics_per_sample <- coverage_metrics %>% group_by(Sample) %>% summarise(mean_cov=mean(\`Cov.\`), mean_frac_sites = mean(1-\`No data pct\`/100)) From c72f16ee1de15b583a9278ff94ebd2e74066543d Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 22 May 2025 16:26:42 -0400 Subject: [PATCH 09/12] rocker/verse --- GlimpseImputationPipeline/Glimpse2QCReport.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index 485d379f..ddabeed8 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -235,7 +235,7 @@ task Glimpse2QCReport_t { >>> runtime { - docker: "us.gcr.io/broad-dsde-methods/r-gcnv-viz@sha256:c92a9a26cba4ab10b579776cd4dd70d5fca485d4a7216d0ab9b477662e3d9228" + docker: "rocker/verse:4.4" disks: "local-disk 100 HDD" memory: mem_gb + " GB" } From 1cfaf150caaf9eb9faafcbb78cb77356aa80348a Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Thu, 22 May 2025 16:43:05 -0400 Subject: [PATCH 10/12] tlmgr --- GlimpseImputationPipeline/Glimpse2QCReport.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index ddabeed8..3e9bb02b 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -44,7 +44,7 @@ task Glimpse2QCReport_t { command <<< set -eo pipefail - + tlmgr install booktabs multirow wrapfig float colortbl pdflscape tabu threeparttable threeparttablex ulem makecell xcolor bookmark cat << EOF > ~{cohort_name}_qc_report.Rmd --- title: GLIMPSE Imputation QC Report From 84d1845262b7a248568e0d22277a4ced97d71c52 Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Fri, 23 May 2025 09:17:23 -0400 Subject: [PATCH 11/12] docker --- Utilities/Dockers/R-Markdown-PDF/Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Utilities/Dockers/R-Markdown-PDF/Dockerfile diff --git a/Utilities/Dockers/R-Markdown-PDF/Dockerfile b/Utilities/Dockers/R-Markdown-PDF/Dockerfile new file mode 100644 index 00000000..becfb0ef --- /dev/null +++ b/Utilities/Dockers/R-Markdown-PDF/Dockerfile @@ -0,0 +1,4 @@ +FROM rocker/verse:4.4 + +# Install required LaTeX packages for PDF rendering +RUN tlmgr install booktabs multirow wrapfig float colortbl pdflscape tabu threeparttable threeparttablex ulem makecell xcolor bookmark From 2ab8b5055dc422e4d87151c985616b1a639e844b Mon Sep 17 00:00:00 2001 From: Chris Kachulis Date: Fri, 23 May 2025 09:35:29 -0400 Subject: [PATCH 12/12] docker --- GlimpseImputationPipeline/Glimpse2QCReport.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GlimpseImputationPipeline/Glimpse2QCReport.wdl b/GlimpseImputationPipeline/Glimpse2QCReport.wdl index 3e9bb02b..75c67e19 100644 --- a/GlimpseImputationPipeline/Glimpse2QCReport.wdl +++ b/GlimpseImputationPipeline/Glimpse2QCReport.wdl @@ -44,7 +44,7 @@ task Glimpse2QCReport_t { command <<< set -eo pipefail - tlmgr install booktabs multirow wrapfig float colortbl pdflscape tabu threeparttable threeparttablex ulem makecell xcolor bookmark + cat << EOF > ~{cohort_name}_qc_report.Rmd --- title: GLIMPSE Imputation QC Report @@ -235,7 +235,7 @@ task Glimpse2QCReport_t { >>> runtime { - docker: "rocker/verse:4.4" + docker: "us.gcr.io/broad-dsde-methods/r-markdown-pdf:0.1" disks: "local-disk 100 HDD" memory: mem_gb + " GB" }