-
Notifications
You must be signed in to change notification settings - Fork 1
Home
Franz AKE PLASS lab, Barcelona
The analysis can be achieved using the isoform counts matrix generated by SCALPEL, or by loading the Seurat objects generated by the tool. To perform the single-cell analysis, we use the popular library Seurat:
#libraries loading
library(Seurat)
library(dplyr)
library(data.table)
library(tibble)
library(stringr)
library(dplyr)
source("./SCALPEL/src/scalpel_library.R")
Following SCALPEL execution, the iDGE counts files or iDGE based Seurat object can be found in ./results/final_results/
#reading SCALPEL seurat objects
myObj = readRDS("./results/final_results/iDGE_seurat.RDS")
If you provided a [--cluster file], the iDGE based Seurat Object generated by SCALPEL will include the corresponding cell annotation You can eventually reannotate the cells in the current analysis with a new annotation:
#Current annotation
myObj[[]] %>% head()
#Reannotate the cells (Optional)
new_annotation = fread("./Lukassen_annotation.txt")
head(myObj[[]])
orig.ident nCount_RNA nFeature_RNA Barcodes Cell_Cluster
AAACCTGAGAGGGCTT-2 SRR612905X 18001.89 5484 AAACCTGAGAGGGCTT-2 RS
AAACCTGAGCTTATCG-1 SRR612905X 20004.00 5994 AAACCTGAGCTTATCG-1 RS
AAACCTGCATACGCCG-2 SRR612905X 53003.99 8733 AAACCTGCATACGCCG-2 ES
AAACCTGGTTGAGTTC-1 SRR612905X 16225.00 5384 AAACCTGGTTGAGTTC-1 RS
AAACCTGTCAACGAAA-1 SRR612905X 19945.00 5597 AAACCTGTCAACGAAA-1 ES
AAACCTGTCGCGGATC-2 SRR612905X 7640.50 3523 AAACCTGTCGCGGATC-2 ES
new_annotation
Barcodes Cell_Cluster2
1: AAACCTGAGAGGGCTT-2 RS5
2: AAACCTGAGCTTATCG-1 RS1/2
3: AAACCTGCATACGCCG-2 ES
4: AAACCTGGTTGAGTTC-1 RS1/2
5: AAACCTGTCAACGAAA-1 ES
---
2038: TTTGTCACAAGTTAAG-1 RS5
2039: TTTGTCACATCGGTTA-2 ES
2040: TTTGTCAGTAATAGCA-1 RS3
2041: TTTGTCAGTCTCGTTC-1 RS1/2
2042: TTTGTCATCTGTCCGT-2 SC3/4
#Add the annotation to the Seurat object
myObj$Cell_Cluster2 = left_join(dplyr::select(myObj[[]], Barcodes, Cell_Cluster), new_annotation, by = "Barcodes")$Cell_Cluster2
head(myObj[[]])
orig.ident nCount_RNA nFeature_RNA Barcodes Cell_Cluster Cell_Cluster2
AAACCTGAGAGGGCTT-2 SRR612905X 18001.89 5484 AAACCTGAGAGGGCTT-2 RS RS5
AAACCTGAGCTTATCG-1 SRR612905X 20004.00 5994 AAACCTGAGCTTATCG-1 RS RS1/2
AAACCTGCATACGCCG-2 SRR612905X 53003.99 8733 AAACCTGCATACGCCG-2 ES ES
AAACCTGGTTGAGTTC-1 SRR612905X 16225.00 5384 AAACCTGGTTGAGTTC-1 RS RS1/2
AAACCTGTCAACGAAA-1 SRR612905X 19945.00 5597 AAACCTGTCAACGAAA-1 ES ES
AAACCTGTCGCGGATC-2 SRR612905X 7640.50 3523 AAACCTGTCGCGGATC-2 ES CS1
#Dimensionality reduction analysis (1)
#-------------------------------------
myObj = NormalizeData(myObj)
myObj = FindVariableFeatures(myObj, nfeatures = 2000)
VariableFeaturePlot(myObj)
myObj = ScaleData(myObj)
myObj = RunPCA(myObj)
ElbowPlot(myObj, ndims = 40)
npcs.comp = 11
myObj = RunUMAP(myObj, dims = 1:npcs.comp)
# colors params
#--------------
cols.umap = c("red","dodgerblue2","orange1")
cols.umap.raw = scales::hue_pal()(12)
cols.umap.scalpel = c(cols.umap.raw[c(2,7,4,5,11,3,6,9,6,10)],"red",cols.umap.raw[c(12)])
cols.umap.scalpel[7] = "#00f3b0"
P1=DimPlot(myObj, group.by = "Cell_Cluster", cols = cols.umap, pt.size = 1.2, label = T, label.size = 5) +
xlab("UMAP1") +
ylab("UMAP2") +
ggtitle("iDGE UMAP - Annot1")
P2=DimPlot(myObj, group.by = "Cell_Cluster2", label=T,pt.size = 1.2, label.size = 5,cols = cols.umap.scalpel) +
ggtitle("iDGE UMAP - Annot2")
P1+P2
f you provided a [--cluster file], SCALPEL performs a differential isoform usage analysis on the clusters. the generated table can be found in the generated results folder (./results/final_results/DIU_table.txt). You can perform a differential isoforms usage analysis using any cell annotation defined:
#Differential isoform usage
DTU = Find_isoforms(myObj, condition = "Cell_Cluster", threshold_tr_abundance = 0.1, pval_adjusted = 0.05)
#ex: Arl2bp
dplyr::filter(DTU, gene_name=="Arl2bp")
ES | RS | SC | gene | p_value | p_value.adjusted | gene_tr | transcript |
---|---|---|---|---|---|---|---|
922.839 | 7978.85 | 3641.979 | Arl2bp | 0 | 0 | Arl2bp*Arl2bp-201 | Arl2bp-201 |
8622.610 | 2175.07 | 623.466 | Arl2bp | 0 | 0 | Arl2bp*Arl2bp-201 | Arl2bp-201 |
#Feature expression in cells
#ex: Arl2bp
FeaturePlot(scalpel.seurat, features = c("Arl2bp***Arl2bp-201","Arl2bp***Arl2bp-204"),label = T, pt.size = 1.2, label.size = 5)
The read coverage on the isoforms can be visualized accordingly to the defined clusters for each gene with a differential isoform usage:
#Coverage track
#1 - first let's split the filtered BAM file by the defined clusters
filtered.BAM = "~/SRR612905X_results/SRR612905X/SRR612905X.filtered.bam"
filtering
bc.tags = dplyr::select(scalpel.seurat@meta.data, Barcodes, Cell_Cluster2)
lapply(c("ES","RS","SC"), function(x){
#current cluster
print(x)
#filter tab and get barcodes
filter(bc.tags, Cell_Cluster2==x) %>% dplyr::select(Barcodes) %>% fwrite(file = paste0(as.character(x),".txt"),sep = "\t")
#samtools filtering
filt.exp = paste0("/usr/local/Caskroom/mambaforge/base/envs/test/bin/samtools view -b -D CB:",
as.character(x), ".txt ", filtered.BAM, " > ", as.character(x), ".bam")
index.exp = paste0("/usr/local/Caskroom/mambaforge/base/envs/test/bin/samtools index ", as.character(x), ".bam")
system(filt.exp)
system(index.exp)
})
#The reads coverage plots requires to merge the sample BAM files and split by each clusters
#We can rely on samtools in this purpose:
samtools.bin = "/usr/local/Caskroom/mambaforge/base/bin/samtools"
SRR6129050.BAM = "datas/SRR612905X_exp2/SRR6129050/SRR6129050.filtered.bam"
SRR6129051.BAM = "datas/SRR612905X_exp2/SRR6129051/SRR6129051.filtered.bam"
BC.tag = "CB" (depending of the sequencing protocol)
#a. merging of samples BAM files
merge.expression = paste0(samtools.bin, " merge -o SRR612905X.bam ", SRR6129050.BAM, " ", SRR6129051.BAM)
print(merge.expression)
# system(merge.expression)
#b. Splitting of the merged BAM file according to each clusters cell barcodes
lapply(unique(my.obj1$Cell_Clusters), function(x){
print(x)
#a.write cluster current barcodes
dplyr::filter(my.obj1@meta.data, Cell_Clusters==x) %>% dplyr::select(Barcodes) %>%
fwrite(paste0(x,".barcodes"), col.names = F, row.names = F)
#b. subset merged bam file
bam_subset.expression = paste0(
samtools.bin, " view -b -D ", BC.tag, ":", paste0(x,".barcodes"), " SRR612905X.bam > ", x, ".bam")
#c. indexing spitted bam file
index.expression = paste0(samtools.bin, " index ", x, ".bam")
#execution
print(bam_subset.expression)
system(bam_subset.expression)
print(index.expression)
system(index.expression)
})
#c. Coverage Track visualization
GTF.path = "datas/gencode.vM21.annotation.gtf"
filtered.bamfiles = c(
"ES" = "ES.bam",
"RS" = "RS.bam",
"SC" = "SC.bam"
)
#1. Create genomeRange object
gtf.gr = rtracklayer::import(GTF.path)
#2.plot
CoverPlot(genome_gr = gtf.gr, gene_in = gene.in, genome_sp = "mm10",
transcripts_in = diff.tab$transcript, filter_trs = T,
bamfiles = filtered.bamfiles, samtools.bin = samtools.bin)
Using to the isoform quantification, we can weight the quantified isoform 3'UTR length by their expression and analyze the average 3'UTR size dynamic in the cell types:
#A. At single-cell resolution
#1. Get Isoform expression in cells
#**********************************
cell.exp = my.obj1[["RNA"]]$counts %>% data.frame()
gene_trs = str_split_fixed(rownames(cell.exp), pattern = "\\*\\*\\*", 2)
cell.exp = cell.exp %>%
mutate(gene_transcript=rownames(cell.exp), gene_name=gene_trs[,1], transcript_name=gene_trs[,2]) %>%
melt() %>%
dplyr::filter(value>0)
#annotate with cell types
cell.exp = cell.exp %>%
mutate(variable = str_replace(variable,"\\.","-") %>% str_extract(pattern = "(A|T|G|C)*\\-[0-9]")) %>%
left_join(my.obj1@meta.data[,c("Barcodes","Cell_Clusters")], by = join_by(variable==Barcodes),
relationship = "many-to-many") %>%
data.table()
#2. Get 3'UTR size
#*****************
utrTab = data.frame(gtf.gr) %>%
filter(transcript_name %in% cell.exp$transcript_name) %>%
dplyr::filter(type %in% c("UTR","CDS")) %>%
mutate(utr_type=NA) %>%
group_by(gene_name, transcript_name, type) %>%
#get coding region (CDS) coordinates
mutate(cds_start = ifelse(type=="CDS", min(start),NA), cds_end = ifelse(type=="CDS",max(end),NA)) %>%
group_by(gene_name,transcript_name) %>%
mutate(cds_start = cds_start[1], cds_end = cds_end[1]) %>%
filter(type=="UTR") %>%
#tag 3'UTR/5'UTR regions
mutate(utr_type = ifelse(strand=="+" & end < cds_start, "5UTR","3UTR"),
utr_type = ifelse(strand=="-" & end < cds_start, "3UTR","5UTR")) %>%
#subset 3'UTR regions
dplyr::filter(utr_type=="3UTR") %>%
mutate(UTRlength = sum(width)) %>%
ungroup() %>%
distinct(transcript_name, UTRlength)
#3. Annotate Isoform with 3'UTR length
#*************************************
cell.exp2 = left_join(cell.exp, utrTab) %>%
na.omit()
#4. Weighting
#************
#*by gene
cell.exp3 = cell.exp2 %>%
group_by(Cell_Clusters, variable, gene_name) %>%
mutate(value.norm = value / sum(value), weighted.UTRlength.iso = value.norm * UTRlength,
weighted.UTRlength.gene = mean(weighted.UTRlength.iso)) %>%
data.table()
#by Cell
cell.exp4 = distinct(cell.exp3, Cell_Clusters, variable, gene_name, weighted.UTRlength.gene) %>%
group_by(Cell_Clusters, variable) %>%
summarise(weighted.UTRlength.cell = mean(weighted.UTRlength.gene)) %>%
data.table()
#Plot
ggplot(cell.exp4, aes(Cell_Clusters, weighted.UTRlength.cell, fill=Cell_Clusters)) +
geom_boxplot(width=0.3) +
theme_few(base_size = 13) +
scale_fill_canva() +
ggtitle("average weigthed gene UTR length in cells for each cluster")