Single cell RNA-seq (ES mouse) using Seurat
In this post I will analyze mouse ES-cell single cell RNA-seq data using Seurat.
Data was downlaoded from https://github.com/debsin/dropClust
library(RCurl)
library(Seurat)
library(cellrangerRkit)
library(data.table)
library(tidyverse)
This post also follows examples from https://davetang.org/muse/2017/08/01/getting-started-seurat/ First we remove all genes that are expressed in less than 3 cells.
esMus <- Read10X(c('~/github/jespermaag.github.io/es_mouse/'))
dim(esMus)
## [1] 24175 2717
keep <- apply(esMus, 1, function(x) sum(x>0))
table(keep>=3)
##
## FALSE TRUE
## 153 24022
keep <- keep >= 3
esMus <- esMus[keep,]
summary(colSums(esMus))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1647 6263 17373 20033 31086 85686
genes <- apply(esMus, 2, function(x) sum(x>0))
genes %>%
melt() %>%
ggplot(aes(x=value)) +
geom_histogram(colour='black', fill="steelblue") +
theme_classic()+
xlab('Number genes (1>= counts)') +
ggtitle('Number of genes detected per cell')+
theme( axis.title.x = element_text(size=14), axis.text.x = element_text(size=12,colour='black'),
axis.text.y = element_text(size=12,colour='black'), axis.title.y = element_text(size=14))
colSums(esMus) %>%
melt() %>%
ggplot(aes(x=value)) +
geom_histogram(colour='black', fill="steelblue") +
theme_classic()+
xlab('Counts per cell)') +
ggtitle('Total counts per cell')+
theme( axis.title.x = element_text(size=14), axis.text.x = element_text(size=12,colour='black'),
axis.text.y = element_text(size=12,colour='black'), axis.title.y = element_text(size=14))
As seen in the number of gene plots, there is a binomial distribution of number of genes per cell. I will keep this in mind for further analysis. The binomial distribtion is missing from the total counts per cell.
To use Seurat, I first have to create a Seurat object
esMusSeur <- CreateSeuratObject(raw.data = esMus,
min.cells = 3,
min.genes = 100,
project = "ES_mouse")
esMusSeur
## An object of class seurat in project ES_mouse
## 24022 genes across 2717 samples.
slotNames(esMusSeur)
## [1] "raw.data" "data" "scale.data" "var.genes"
## [5] "is.expr" "ident" "meta.data" "project.name"
## [9] "dr" "assay" "hvg.info" "imputed"
## [13] "cell.names" "cluster.tree" "snn" "calc.params"
## [17] "kmeans" "spatial" "misc" "version"
esMusSeur <- NormalizeData(object = esMusSeur,
normalization.method = "LogNormalize",
scale.factor = 1e4)
colSums(esMusSeur@data) %>%
melt() %>%
ggplot(aes(x=value)) +
geom_histogram(colour='black', fill="steelblue") +
theme_classic()+
xlab('Normalised counts per cell') +
ggtitle('Total counts per cell')+
theme( axis.title.x = element_text(size=14), axis.text.x = element_text(size=12,colour='black'),
axis.text.y = element_text(size=12,colour='black'), axis.title.y = element_text(size=14))
As seen in the previous figure, after normalised the binomial distribution appears for the total counts.
Below I want to find the variable genes between the cells and plot the PCA. Since this is my first single cell analysis, I will use the parameters from https://davetang.org/muse/2017/08/01/getting-started-seurat/
esMusSeur <- FindVariableGenes(object = esMusSeur,
mean.function = ExpMean,
dispersion.function = LogVMR,
x.low.cutoff = 0.0125,
x.high.cutoff = 3,
y.cutoff = 0.5)
length(esMusSeur@var.genes)
## [1] 5707
head(esMusSeur@hvg.info)
## gene.mean gene.dispersion gene.dispersion.scaled
## Gm7102 0.5832131 5.417292 16.151102
## Dcdc2c 0.4870505 5.006380 18.885776
## Rn28s1 2.9695050 4.793843 2.929660
## Rn4.5s 2.7443443 4.607135 3.618150
## Rn45s 3.6240987 4.024695 2.001075
## Ccdc36 3.3616208 3.766651 1.659102
esMusSeur@hvg.info %>%
ggplot(aes( x= gene.mean, y = gene.dispersion.scaled)) +
geom_point(pch=21, fill='steelblue') +
theme_classic()+
xlab('Mean gene expression') +
ylab('scaled dispersion') +
ggtitle('Mean expression vs dispersion')+
theme( axis.title.x = element_text(size=14), axis.text.x = element_text(size=12,colour='black'),
axis.text.y = element_text(size=12,colour='black'), axis.title.y = element_text(size=14))+
geom_smooth()
## Regressing out: nUMI
## Scaling data matrix
esMusSeur <- RunPCA(object = esMusSeur,
pc.genes = esMusSeur@var.genes,
do.print = TRUE,
pcs.print = 1:5,
genes.print = 5)
## [1] "PC1"
## [1] "Krt8" "Krt18" "S100a6" "Gsn" "Lgals1"
## [1] ""
## [1] "Npm1" "Dppa5a" "Ptma" "Nlrp1a" "Rpl21"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "Lrrc58" "Tpm1" "Tagln" "Eno1b" "Eno1"
## [1] ""
## [1] "Tdh" "Mir690" "Esrrb" "Zfp42" "Slc29a1"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "Gm15772" "Mt2" "Esrrb" "Tdh" "Hmces"
## [1] ""
## [1] "Acsf2" "Ppm1k" "Gm10941" "Agl" "Mtfmt"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "Klf5" "Malat1" "Klf6" "Slc2a3" "Sparc"
## [1] ""
## [1] "Il12a" "Gm20740" "Olfr954" "Fam229b" "Tmem126b"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "Gapdh" "Gpx4" "Gm8709" "Dusp9" "Gsn"
## [1] ""
## [1] "Erdr1" "Hist1h2ao" "Apoe" "Hist1h2ap" "Flnb"
## [1] ""
## [1] ""
PrintPCAParams(esMusSeur)
## Parameters used in latest PCA calculation run on: 2018-08-11 14:46:38
## =============================================================================
## PCs computed Genes used in calculation PCs Scaled by Variance Explained
## 20 5707 TRUE
## -----------------------------------------------------------------------------
## rev.pca
## FALSE
## -----------------------------------------------------------------------------
## Full gene list can be accessed using
## GetCalcParam(object = object, calculation = "RunPCA", parameter = "pc.genes")
VizPCA(object = esMusSeur, pcs.use = 1:2)
PCAPlot(object = esMusSeur, dim.1 = 1, dim.2 = 2)
esMusSeur <- ProjectPCA(object = esMusSeur, do.print = FALSE)
By plotting the standard deviation for each principal compartment, I can see that the deviation stop at around the 12th compartment.
K-nearest neighbour finds the number of groups in the data set, which is used to colour the data in the tSNE-plot
PCElbowPlot(esMusSeur)
esMusSeur <- FindClusters(object = esMusSeur,
reduction.type = "pca",
dims.use = 1:10,
resolution = 0.6,
print.output = 0,
save.SNN = TRUE)
esMusSeur <- FindClusters(object = esMusSeur,
reduction.type = "pca",
dims.use = 1:12,
resolution = 0.6,
print.output = 0,
save.SNN = TRUE)
esMusSeur <- RunTSNE(object = esMusSeur,
dims.use = 1:12,
do.fast = TRUE)
TSNEPlot(object = esMusSeur, do.label = TRUE)
I then want to identify the top genes per cluster.
The top genes can then be plotted as violin plots, feature plots, which projects the expression on the tSNE-plot, or as heatmaps for the top 5 highest expressed genes per cluster.
topMarkers <- esMusSeur.markers %>%
group_by(cluster) %>%
top_n(1, avg_logFC)
VlnPlot(object = esMusSeur, features.plot = topMarkers$gene, point.size.use = 0)
FeaturePlot(object = esMusSeur,
features.plot = topMarkers$gene,
cols.use = c("grey", "blue"),
reduction.use = "tsne")
topMarkers5 <- esMusSeur.markers %>%
group_by(cluster) %>%
top_n(5, avg_logFC)
DoHeatmap(object = esMusSeur,
genes.use = topMarkers5$gene,
remove.key = TRUE,
slim.col.label = TRUE)