Skip to content

Commit 802c563

Browse files
committed
updates
1 parent d25992d commit 802c563

File tree

2 files changed

+64
-17
lines changed

2 files changed

+64
-17
lines changed

allele_specific_analysis/step1a_gatk_rna_genotype.sh

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
# -v g/reference/gatk:$HOME/gatk_bundle \
3535
# -v $SCRATCH1:$SCRATCH1 \
3636
# -e SCRATCH1="/g/scratch" \
37-
# -it p4rkerw/gatk:4.1.8.1
37+
# --rm -it p4rkerw/gatk:4.1.8.1
3838

3939
# make sure that miniconda is in the path
4040
export PATH=$PATH:/gatk:/opt/miniconda/envs/gatk/bin:/gatk:/opt/miniconda/envs/gatk/bin:/opt/miniconda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
@@ -62,7 +62,7 @@ gatk MarkDuplicates \
6262
--CREATE_INDEX true
6363

6464
# limit to specified intervals
65-
# this will speed up analysis be eliminating alt contigs
65+
# this will speed up analysis by eliminating alt contigs
6666
CONTIGS=(chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY)
6767
printf '%s\n' "${CONTIGS[@]}" > /tmp/interval.list
6868

@@ -93,18 +93,37 @@ gatk ApplyBQSR \
9393

9494
# single sample gvcf variant calling
9595
# annotate the sites with dbsnp
96-
gatk HaplotypeCaller \
97-
-R ref/genome.fa \
98-
-I $outdir/bqsr.bam \
99-
-O $outdir/output.g.vcf.gz \
100-
-ERC GVCF \
101-
-D gatk_bundle/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf
96+
PROCESSES=4 # Number of threads you want
97+
for INTERVAL in ${CONTIGS[*]}; do
98+
echo $INTERVAL
99+
gatk HaplotypeCaller \
100+
-R ref/genome.fa \
101+
-I $outdir/bqsr.bam \
102+
-O $outdir/output.g.$INTERVAL.vcf.gz \
103+
-ERC GVCF \
104+
-D gatk_bundle/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf \
105+
-L $INTERVAL \
106+
--verbosity ERROR &
107+
while (( $(jobs |wc -l) >= (( ${PROCESSES} + 1 )) )); do
108+
sleep 0.1
109+
done
110+
done
111+
112+
# merge vcfs from parallel haplotypecaller
113+
(ls $outdir/output.g.chr*.vcf.gz) > /tmp/vcf.list
114+
gatk GatherVcfs \
115+
-I /tmp/vcf.list \
116+
-O $outdir/output.g.vcf.gz
117+
118+
# index merged vcf
119+
gatk IndexFeatureFile \
120+
-I $outdir/output.g.vcf.gz
102121

103122
# single sample genotyping
104123
gatk GenotypeGVCFs \
105124
-R ref/genome.fa \
106125
-V $outdir/output.g.vcf.gz \
107-
-O $outdir/output.vcf.gz
126+
-O $outdir/output.vcf.gz
108127

109128
# perform hard filtering using the qual-by-depth QD score and window for snp clustering
110129
gatk VariantFiltration \
@@ -116,6 +135,6 @@ gatk VariantFiltration \
116135
--filter "FS > 30.0" \
117136
--filter-name "QD" \
118137
--filter "QD < 2.0" \
119-
-O $outdir/filter.output.vcf.gz
138+
-O $outdir/$SAMPLE_NAME.filter.output.vcf.gz
120139

121140

analysis/corr_chromVar_TF_exp.R

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,16 @@ df.final <- dplyr::distinct(df.final, celltype, gene_motif, .keep_all = TRUE)
134134
df.sig <- dplyr::filter(df.final, pval < 0.05) %>%
135135
arrange(-num_celltypes, -corr)
136136

137+
df.final$celltype <- as.factor(df.final$celltype)
138+
levels(df.final$celltype) <- c("PT","PTVCAM1","PEC","TAL","DCT1","DCT2","CNT","PC","ICA","ICB",
139+
"MES","FIB","ENDO","PODO","LEUK")
140+
141+
142+
df.sig$celltype <- as.factor(df.sig$celltype)
143+
levels(df.sig$celltype) <- c("PT","PTVCAM1","PEC","TAL","DCT1","DCT2","CNT","PC","ICA","ICB",
144+
"MES","FIB","ENDO","PODO","LEUK")
145+
146+
137147
# calculate pearson r2 for all tf-gene combos
138148
pearson <- cor.test(df.sig$chromvar, df.sig$rna, method="pearson", conf.level=0.95)
139149
max_chrom <- max(abs(df.sig$max_chrom)) + 1
@@ -164,7 +174,7 @@ pearson <- cor.test(df.pos$chromvar, df.pos$rna, method="pearson", conf.level=0.
164174
max_chrom <- max(abs(df.pos$max_chrom)) + 1
165175
max_exp <- max(abs(df.pos$max_exp)) + 1
166176

167-
p1 <- ggplot(df.pos, aes(x=chromvar, y=rna)) +
177+
p2 <- ggplot(df.pos, aes(x=chromvar, y=rna)) +
168178
geom_smooth(method="lm", color = "darkgray") +
169179
geom_point(aes(color=celltype)) +
170180
xlab("chromVAR activity (avg_logFC)") +
@@ -176,7 +186,7 @@ p1 <- ggplot(df.pos, aes(x=chromvar, y=rna)) +
176186
theme_minimal() +
177187
geom_hline(yintercept=0) +
178188
geom_vline(xintercept=0)
179-
p1
189+
p2
180190

181191

182192

@@ -186,7 +196,7 @@ pearson <- cor.test(df.neg$chromvar, df.neg$rna, method="pearson", conf.level=0.
186196
max_chrom <- max(abs(df.neg$max_chrom)) + 1
187197
max_exp <- max(abs(df.neg$max_exp)) + 1
188198

189-
p2 <- ggplot(df.neg, aes(x=chromvar, y=rna)) +
199+
p3 <- ggplot(df.neg, aes(x=chromvar, y=rna)) +
190200
geom_smooth(method="lm", color = "darkgray") +
191201
geom_point(aes(color=celltype)) +
192202
xlab("chromVAR activity (avg_logFC)") +
@@ -198,13 +208,31 @@ p2 <- ggplot(df.neg, aes(x=chromvar, y=rna)) +
198208
theme_minimal() +
199209
geom_hline(yintercept=0) +
200210
geom_vline(xintercept=0)
201-
p2
211+
p3
202212

213+
gene_motif <- df.final$gene_motif[grepl("NR3C1", df.final$gene_motif)]
214+
toplot <- dplyr::filter(df.final, motif == "MA0113.3")
215+
max_chrom <- unique(toplot$max_chrom)
216+
max_exp <- unique(toplot$max_exp)
217+
p4 <- ggplot(toplot, aes(x=chromvar, y=rna)) +
218+
geom_smooth(method="lm", color = "darkgray") +
219+
geom_point(aes(color=celltype)) +
220+
geom_text_repel(aes(label=celltype)) +
221+
xlab("chromVAR activity (avg_logFC)") +
222+
ylab("Gene expression (avg_logFC)") +
223+
ggtitle(unique(toplot$gene), subtitle=paste0("Pearson r^2=",unique(toplot$cor)," pval=",unique(toplot$pval))) +
224+
xlim(c(-max_chrom,max_chrom)) +
225+
ylim(c(-max_exp,max_exp)) +
226+
theme_minimal() +
227+
geom_hline(yintercept=0) +
228+
geom_vline(xintercept=0)
229+
p4
203230

204-
toplot <- dplyr::filter(df.final, gene_motif == "ZEB1_MA0103.3")
231+
gene_motif <- df.final$gene_motif[grepl("NR3C2", df.final$gene_motif)]
232+
toplot <- dplyr::filter(df.final, motif == "MA0727.1")
205233
max_chrom <- unique(toplot$max_chrom)
206234
max_exp <- unique(toplot$max_exp)
207-
p1 <- ggplot(toplot, aes(x=chromvar, y=rna)) +
235+
p5 <- ggplot(toplot, aes(x=chromvar, y=rna)) +
208236
geom_smooth(method="lm", color = "darkgray") +
209237
geom_point(aes(color=celltype)) +
210238
geom_text_repel(aes(label=celltype)) +
@@ -216,5 +244,5 @@ p1 <- ggplot(toplot, aes(x=chromvar, y=rna)) +
216244
theme_minimal() +
217245
geom_hline(yintercept=0) +
218246
geom_vline(xintercept=0)
219-
p1
247+
p5
220248

0 commit comments

Comments
 (0)