Home

SCALPEL Downstream analysis on scRNAseq SPERMATOGENESIS dataset

Franz AKE PLASS lab, Barcelona

The analysis can be achieved using the isoform counts matrix generated by SCALPEL, or by loading the Seurat objects generated by the tool. To perform the single-cell analysis, we use the popular library Seurat:

import libraries

#libraries loading
library(Seurat)
library(dplyr)
library(data.table)
library(tibble)
library(stringr)
library(dplyr)
source("./SCALPEL/src/scalpel_library.R")

A. Samples reading

Following SCALPEL execution, the iDGE counts files or iDGE based Seurat object can be found in ./results/final_results/

#reading SCALPEL seurat objects
myObj = readRDS("./results/final_results/iDGE_seurat.RDS")

If you provided a [--cluster file], the iDGE based Seurat Object generated by SCALPEL will include the corresponding cell annotation You can eventually reannotate the cells in the current analysis with a new annotation:

#Current annotation
myObj[[]] %>% head()

#Reannotate the cells (Optional)
new_annotation = fread("./Lukassen_annotation.txt")

head(myObj[[]])
orig.ident                    nCount_RNA nFeature_RNA           Barcodes Cell_Cluster
AAACCTGAGAGGGCTT-2 SRR612905X   18001.89         5484 AAACCTGAGAGGGCTT-2           RS
AAACCTGAGCTTATCG-1 SRR612905X   20004.00         5994 AAACCTGAGCTTATCG-1           RS
AAACCTGCATACGCCG-2 SRR612905X   53003.99         8733 AAACCTGCATACGCCG-2           ES
AAACCTGGTTGAGTTC-1 SRR612905X   16225.00         5384 AAACCTGGTTGAGTTC-1           RS
AAACCTGTCAACGAAA-1 SRR612905X   19945.00         5597 AAACCTGTCAACGAAA-1           ES
AAACCTGTCGCGGATC-2 SRR612905X    7640.50         3523 AAACCTGTCGCGGATC-2           ES

new_annotation
      Barcodes              Cell_Cluster2
   1: AAACCTGAGAGGGCTT-2           RS5
   2: AAACCTGAGCTTATCG-1         RS1/2
   3: AAACCTGCATACGCCG-2            ES
   4: AAACCTGGTTGAGTTC-1         RS1/2
   5: AAACCTGTCAACGAAA-1            ES
  ---                                 
2038: TTTGTCACAAGTTAAG-1           RS5
2039: TTTGTCACATCGGTTA-2            ES
2040: TTTGTCAGTAATAGCA-1           RS3
2041: TTTGTCAGTCTCGTTC-1         RS1/2
2042: TTTGTCATCTGTCCGT-2         SC3/4

#Add the annotation to the Seurat object
myObj$Cell_Cluster2 = left_join(dplyr::select(myObj[[]], Barcodes, Cell_Cluster), new_annotation, by = "Barcodes")$Cell_Cluster2

head(myObj[[]])
orig.ident                    nCount_RNA nFeature_RNA           Barcodes Cell_Cluster Cell_Cluster2
AAACCTGAGAGGGCTT-2 SRR612905X   18001.89         5484 AAACCTGAGAGGGCTT-2           RS           RS5
AAACCTGAGCTTATCG-1 SRR612905X   20004.00         5994 AAACCTGAGCTTATCG-1           RS         RS1/2
AAACCTGCATACGCCG-2 SRR612905X   53003.99         8733 AAACCTGCATACGCCG-2           ES            ES
AAACCTGGTTGAGTTC-1 SRR612905X   16225.00         5384 AAACCTGGTTGAGTTC-1           RS         RS1/2
AAACCTGTCAACGAAA-1 SRR612905X   19945.00         5597 AAACCTGTCAACGAAA-1           ES            ES
AAACCTGTCGCGGATC-2 SRR612905X    7640.50         3523 AAACCTGTCGCGGATC-2           ES           CS1

B. Dimensionality reduction

#Dimensionality reduction analysis (1)
#-------------------------------------
myObj = NormalizeData(myObj)
myObj = FindVariableFeatures(myObj, nfeatures = 2000)
VariableFeaturePlot(myObj)
myObj = ScaleData(myObj)
myObj = RunPCA(myObj)
ElbowPlot(myObj, ndims = 40)
npcs.comp = 11
myObj = RunUMAP(myObj, dims = 1:npcs.comp)

# colors params
#--------------
cols.umap = c("red","dodgerblue2","orange1")
cols.umap.raw = scales::hue_pal()(12)
cols.umap.scalpel = c(cols.umap.raw[c(2,7,4,5,11,3,6,9,6,10)],"red",cols.umap.raw[c(12)])
cols.umap.scalpel[7] = "#00f3b0"

P1=DimPlot(myObj, group.by = "Cell_Cluster", cols = cols.umap, pt.size = 1.2, label = T, label.size = 5) +
  xlab("UMAP1") +
  ylab("UMAP2") +
  ggtitle("iDGE UMAP - Annot1")

P2=DimPlot(myObj, group.by = "Cell_Cluster2", label=T,pt.size = 1.2, label.size = 5,cols = cols.umap.scalpel) + 
  ggtitle("iDGE UMAP - Annot2") 

P1+P2

Isoform quantification based UMAP plot

C. Differential isoform analysis

f you provided a [--cluster file], SCALPEL performs a differential isoform usage analysis on the clusters. the generated table can be found in the generated results folder (./results/final_results/DIU_table.txt). You can perform a differential isoforms usage analysis using any cell annotation defined:

#Differential isoform usage
DTU = Find_isoforms(myObj, condition = "Cell_Cluster", threshold_tr_abundance = 0.1, pval_adjusted = 0.05)

#ex: Arl2bp
dplyr::filter(DTU, gene_name=="Arl2bp")

ES	RS	SC	gene	p_value	p_value.adjusted	gene_tr	transcript
922.839	7978.85	3641.979	Arl2bp	0	0	Arl2bp*Arl2bp-201	Arl2bp-201
8622.610	2175.07	623.466	Arl2bp	0	0	Arl2bp*Arl2bp-201	Arl2bp-201

D. Visualizations (Isoform differential usage)

Isoform expression in cells (FeaturePlot)

#Feature expression in cells
#ex: Arl2bp
FeaturePlot(scalpel.seurat, features = c("Arl2bp***Arl2bp-201","Arl2bp***Arl2bp-204"),label = T, pt.size = 1.2, label.size = 5)

Feature Plot

3. Isoform reads coverage track

The read coverage on the isoforms can be visualized accordingly to the defined clusters for each gene with a differential isoform usage:

#Coverage track
#1 - first let's split the filtered BAM file by the defined clusters
filtered.BAM = "~/SRR612905X_results/SRR612905X/SRR612905X.filtered.bam"

filtering
bc.tags = dplyr::select(scalpel.seurat@meta.data, Barcodes, Cell_Cluster2)
lapply(c("ES","RS","SC"), function(x){
  #current cluster
  print(x)
  #filter tab and get barcodes
  filter(bc.tags, Cell_Cluster2==x) %>% dplyr::select(Barcodes) %>% fwrite(file = paste0(as.character(x),".txt"),sep = "\t")
  #samtools filtering
  filt.exp = paste0("/usr/local/Caskroom/mambaforge/base/envs/test/bin/samtools view -b -D CB:",
                    as.character(x), ".txt ", filtered.BAM, " > ", as.character(x), ".bam")
  index.exp = paste0("/usr/local/Caskroom/mambaforge/base/envs/test/bin/samtools index ", as.character(x), ".bam")
  system(filt.exp)
  system(index.exp)
})

#The reads coverage plots requires to merge the sample BAM files and split by each clusters
#We can rely on samtools in this purpose:

samtools.bin = "/usr/local/Caskroom/mambaforge/base/bin/samtools"
SRR6129050.BAM = "datas/SRR612905X_exp2/SRR6129050/SRR6129050.filtered.bam"
SRR6129051.BAM = "datas/SRR612905X_exp2/SRR6129051/SRR6129051.filtered.bam"
BC.tag = "CB" (depending of the sequencing protocol)

#a. merging of samples BAM files
merge.expression = paste0(samtools.bin, " merge -o SRR612905X.bam ", SRR6129050.BAM, " ", SRR6129051.BAM)
print(merge.expression)
# system(merge.expression)

#b. Splitting of the merged BAM file according to each clusters cell barcodes
lapply(unique(my.obj1$Cell_Clusters), function(x){
  print(x)
  
  #a.write cluster current barcodes
  dplyr::filter(my.obj1@meta.data, Cell_Clusters==x) %>% dplyr::select(Barcodes) %>%
    fwrite(paste0(x,".barcodes"), col.names = F, row.names = F)
  
  #b. subset merged bam file
  bam_subset.expression = paste0(
    samtools.bin, " view -b -D ", BC.tag, ":", paste0(x,".barcodes"), " SRR612905X.bam > ", x, ".bam")
  
  #c. indexing spitted bam file
  index.expression = paste0(samtools.bin, " index ", x, ".bam")
  
  #execution
  print(bam_subset.expression)
  system(bam_subset.expression)
  
  print(index.expression)
  system(index.expression)
})

#c. Coverage Track visualization
GTF.path = "datas/gencode.vM21.annotation.gtf"
filtered.bamfiles = c(
  "ES" = "ES.bam",
  "RS" = "RS.bam",
  "SC" = "SC.bam"
)

#1. Create genomeRange object
gtf.gr = rtracklayer::import(GTF.path)

#2.plot
CoverPlot(genome_gr = gtf.gr, gene_in = gene.in, genome_sp = "mm10",
          transcripts_in = diff.tab$transcript, filter_trs = T,
          bamfiles = filtered.bamfiles, samtools.bin = samtools.bin)

enter image description here

F. 3'UTR size dynamic in cell types

Using to the isoform quantification, we can weight the quantified isoform 3'UTR length by their expression and analyze the average 3'UTR size dynamic in the cell types:

#A. At single-cell resolution

#1. Get Isoform expression in cells
#**********************************
cell.exp = my.obj1[["RNA"]]$counts %>% data.frame()
gene_trs = str_split_fixed(rownames(cell.exp), pattern = "\\*\\*\\*", 2)
cell.exp = cell.exp %>%
  mutate(gene_transcript=rownames(cell.exp), gene_name=gene_trs[,1], transcript_name=gene_trs[,2]) %>%
  melt() %>%
  dplyr::filter(value>0)
#annotate with cell types
cell.exp = cell.exp %>%
  mutate(variable = str_replace(variable,"\\.","-") %>% str_extract(pattern = "(A|T|G|C)*\\-[0-9]")) %>% 
  left_join(my.obj1@meta.data[,c("Barcodes","Cell_Clusters")], by = join_by(variable==Barcodes),
            relationship = "many-to-many") %>%
  data.table()

#2. Get 3'UTR size
#*****************
utrTab = data.frame(gtf.gr) %>%
  filter(transcript_name %in% cell.exp$transcript_name) %>%
  dplyr::filter(type %in% c("UTR","CDS")) %>%
  mutate(utr_type=NA) %>%
  group_by(gene_name, transcript_name, type) %>%
  #get coding region (CDS) coordinates
  mutate(cds_start = ifelse(type=="CDS", min(start),NA), cds_end = ifelse(type=="CDS",max(end),NA)) %>%
  group_by(gene_name,transcript_name) %>%
  mutate(cds_start = cds_start[1], cds_end = cds_end[1]) %>%
  filter(type=="UTR") %>%
  #tag 3'UTR/5'UTR regions
  mutate(utr_type = ifelse(strand=="+" & end < cds_start, "5UTR","3UTR"),
         utr_type = ifelse(strand=="-" & end < cds_start, "3UTR","5UTR")) %>%
  #subset 3'UTR regions
  dplyr::filter(utr_type=="3UTR") %>%
  mutate(UTRlength = sum(width)) %>%
  ungroup() %>%
  distinct(transcript_name, UTRlength)

#3. Annotate Isoform with 3'UTR length
#*************************************
cell.exp2 = left_join(cell.exp, utrTab) %>%
  na.omit()

#4. Weighting
#************
#*by gene
cell.exp3 = cell.exp2 %>%
  group_by(Cell_Clusters, variable, gene_name) %>%
  mutate(value.norm = value / sum(value), weighted.UTRlength.iso = value.norm * UTRlength,
         weighted.UTRlength.gene = mean(weighted.UTRlength.iso)) %>%
  data.table()

#by Cell
cell.exp4 = distinct(cell.exp3, Cell_Clusters, variable, gene_name, weighted.UTRlength.gene) %>%
  group_by(Cell_Clusters, variable) %>%
  summarise(weighted.UTRlength.cell = mean(weighted.UTRlength.gene)) %>%
  data.table()
  
#Plot  
ggplot(cell.exp4, aes(Cell_Clusters, weighted.UTRlength.cell, fill=Cell_Clusters)) +
  geom_boxplot(width=0.3) +
  theme_few(base_size = 13) +
  scale_fill_canva() +
  ggtitle("average weigthed gene UTR length in cells for each cluster")

enter image description here

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Home

SCALPEL Downstream analysis on scRNAseq SPERMATOGENESIS dataset

import libraries

A. Samples reading

B. Dimensionality reduction

C. Differential isoform analysis

D. Visualizations (Isoform differential usage)

Isoform expression in cells (FeaturePlot)

3. Isoform reads coverage track

F. 3'UTR size dynamic in cell types

Uh oh!

Uh oh!

Clone this wiki locally