Single cell RNA-seq (ES mouse) using Seurat

In this post I will analyze mouse ES-cell single cell RNA-seq data using Seurat.

Data was downlaoded from https://github.com/debsin/dropClust

library(RCurl)
library(Seurat)
library(cellrangerRkit)
library(data.table)
library(tidyverse)

This post also follows examples from https://davetang.org/muse/2017/08/01/getting-started-seurat/ First we remove all genes that are expressed in less than 3 cells.

esMus <- Read10X(c('~/github/jespermaag.github.io/es_mouse/'))

dim(esMus)
## [1] 24175  2717
keep <- apply(esMus, 1, function(x) sum(x>0))
table(keep>=3)
## 
## FALSE  TRUE 
##   153 24022
keep <- keep >= 3

esMus <- esMus[keep,]

summary(colSums(esMus))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1647    6263   17373   20033   31086   85686
genes <- apply(esMus, 2, function(x) sum(x>0))
genes %>% 
    melt() %>%
    ggplot(aes(x=value)) +
        geom_histogram(colour='black', fill="steelblue") +
        theme_classic()+
        xlab('Number genes (1>= counts)') +
        ggtitle('Number of genes detected per cell')+
	    theme( axis.title.x = element_text(size=14), axis.text.x = element_text(size=12,colour='black'),  
        axis.text.y = element_text(size=12,colour='black'), axis.title.y = element_text(size=14))

plot of chunk explorePlots

colSums(esMus) %>%
    melt() %>%
    ggplot(aes(x=value)) +
        geom_histogram(colour='black', fill="steelblue") +
        theme_classic()+
        xlab('Counts per cell)') +
        ggtitle('Total counts per cell')+
        theme( axis.title.x = element_text(size=14), axis.text.x = element_text(size=12,colour='black'),  
        axis.text.y = element_text(size=12,colour='black'), axis.title.y = element_text(size=14))

plot of chunk explorePlots

As seen in the number of gene plots, there is a binomial distribution of number of genes per cell. I will keep this in mind for further analysis. The binomial distribtion is missing from the total counts per cell.

To use Seurat, I first have to create a Seurat object

esMusSeur <- CreateSeuratObject(raw.data = esMus,
                           min.cells = 3,
                           min.genes = 100,
                           project = "ES_mouse")

esMusSeur
## An object of class seurat in project ES_mouse 
##  24022 genes across 2717 samples.
slotNames(esMusSeur)
##  [1] "raw.data"     "data"         "scale.data"   "var.genes"   
##  [5] "is.expr"      "ident"        "meta.data"    "project.name"
##  [9] "dr"           "assay"        "hvg.info"     "imputed"     
## [13] "cell.names"   "cluster.tree" "snn"          "calc.params" 
## [17] "kmeans"       "spatial"      "misc"         "version"
esMusSeur <- NormalizeData(object = esMusSeur,
                      normalization.method = "LogNormalize",
                      scale.factor = 1e4)



 

colSums(esMusSeur@data) %>%
    melt() %>%
    ggplot(aes(x=value)) +
        geom_histogram(colour='black', fill="steelblue") +
        theme_classic()+
        xlab('Normalised counts per cell') +
        ggtitle('Total counts per cell')+
        theme( axis.title.x = element_text(size=14), axis.text.x = element_text(size=12,colour='black'),  
        axis.text.y = element_text(size=12,colour='black'), axis.title.y = element_text(size=14))

plot of chunk createSeurat As seen in the previous figure, after normalised the binomial distribution appears for the total counts.

Below I want to find the variable genes between the cells and plot the PCA. Since this is my first single cell analysis, I will use the parameters from https://davetang.org/muse/2017/08/01/getting-started-seurat/

esMusSeur <- FindVariableGenes(object = esMusSeur,
                          mean.function = ExpMean,
                          dispersion.function = LogVMR,
                          x.low.cutoff = 0.0125,
                          x.high.cutoff = 3,
                          y.cutoff = 0.5)

plot of chunk variability

length(esMusSeur@var.genes)
## [1] 5707
head(esMusSeur@hvg.info)
##        gene.mean gene.dispersion gene.dispersion.scaled
## Gm7102 0.5832131        5.417292              16.151102
## Dcdc2c 0.4870505        5.006380              18.885776
## Rn28s1 2.9695050        4.793843               2.929660
## Rn4.5s 2.7443443        4.607135               3.618150
## Rn45s  3.6240987        4.024695               2.001075
## Ccdc36 3.3616208        3.766651               1.659102
esMusSeur@hvg.info %>%
    ggplot(aes( x= gene.mean, y = gene.dispersion.scaled)) +
        geom_point(pch=21, fill='steelblue') +
        theme_classic()+
        xlab('Mean gene expression') +
        ylab('scaled dispersion') +
        ggtitle('Mean expression vs dispersion')+
        theme( axis.title.x = element_text(size=14), axis.text.x = element_text(size=12,colour='black'),  
        axis.text.y = element_text(size=12,colour='black'), axis.title.y = element_text(size=14))+
        geom_smooth()

plot of chunk variability

## Regressing out: nUMI
## Scaling data matrix
esMusSeur <- RunPCA(object = esMusSeur,
               pc.genes = esMusSeur@var.genes,
               do.print = TRUE,
               pcs.print = 1:5,
               genes.print = 5)
## [1] "PC1"
## [1] "Krt8"   "Krt18"  "S100a6" "Gsn"    "Lgals1"
## [1] ""
## [1] "Npm1"   "Dppa5a" "Ptma"   "Nlrp1a" "Rpl21" 
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "Lrrc58" "Tpm1"   "Tagln"  "Eno1b"  "Eno1"  
## [1] ""
## [1] "Tdh"     "Mir690"  "Esrrb"   "Zfp42"   "Slc29a1"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "Gm15772" "Mt2"     "Esrrb"   "Tdh"     "Hmces"  
## [1] ""
## [1] "Acsf2"   "Ppm1k"   "Gm10941" "Agl"     "Mtfmt"  
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "Klf5"   "Malat1" "Klf6"   "Slc2a3" "Sparc" 
## [1] ""
## [1] "Il12a"    "Gm20740"  "Olfr954"  "Fam229b"  "Tmem126b"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "Gapdh"  "Gpx4"   "Gm8709" "Dusp9"  "Gsn"   
## [1] ""
## [1] "Erdr1"     "Hist1h2ao" "Apoe"      "Hist1h2ap" "Flnb"     
## [1] ""
## [1] ""
PrintPCAParams(esMusSeur)
## Parameters used in latest PCA calculation run on: 2018-08-11 14:46:38
## =============================================================================
## PCs computed    Genes used in calculation    PCs Scaled by Variance Explained
##     20                  5707                               TRUE
## -----------------------------------------------------------------------------
## rev.pca 
##  FALSE
## -----------------------------------------------------------------------------
## Full gene list can be accessed using 
##  GetCalcParam(object = object, calculation = "RunPCA", parameter = "pc.genes")
VizPCA(object = esMusSeur, pcs.use = 1:2)

plot of chunk PCA

PCAPlot(object = esMusSeur, dim.1 = 1, dim.2 = 2)

plot of chunk PCA

esMusSeur <- ProjectPCA(object = esMusSeur, do.print = FALSE)

By plotting the standard deviation for each principal compartment, I can see that the deviation stop at around the 12th compartment.

K-nearest neighbour finds the number of groups in the data set, which is used to colour the data in the tSNE-plot

PCElbowPlot(esMusSeur)

plot of chunk subsetPCA

esMusSeur <- FindClusters(object = esMusSeur,
                     reduction.type = "pca",
                     dims.use = 1:10,
                     resolution = 0.6,
                     print.output = 0,
                     save.SNN = TRUE)
 
esMusSeur <- FindClusters(object = esMusSeur,
                     reduction.type = "pca",
                     dims.use = 1:12,
                     resolution = 0.6,
                     print.output = 0,
                     save.SNN = TRUE)


esMusSeur <- RunTSNE(object = esMusSeur,
                dims.use = 1:12,
                do.fast = TRUE)

TSNEPlot(object = esMusSeur, do.label = TRUE)

plot of chunk subsetPCA

I then want to identify the top genes per cluster.

The top genes can then be plotted as violin plots, feature plots, which projects the expression on the tSNE-plot, or as heatmaps for the top 5 highest expressed genes per cluster.

topMarkers <- esMusSeur.markers %>% 
    group_by(cluster) %>% 
    top_n(1, avg_logFC)
    
    
VlnPlot(object = esMusSeur, features.plot = topMarkers$gene, point.size.use = 0)

plot of chunk topGenes

FeaturePlot(object = esMusSeur,
            features.plot = topMarkers$gene,
            cols.use = c("grey", "blue"),
            reduction.use = "tsne")

plot of chunk topGenes

topMarkers5 <- esMusSeur.markers %>% 
    group_by(cluster) %>% 
    top_n(5, avg_logFC)
    
DoHeatmap(object = esMusSeur,
          genes.use = topMarkers5$gene,
          remove.key = TRUE,
          slim.col.label = TRUE)

plot of chunk topGenes

There you go. My first single cell analysis using Seurat, and following the examples from Dave Tang