In this post I will analyze mouse ES-cell single cell RNA-seq data using Seurat.
Data was downlaoded from https://github.com/debsin/dropClust
library ( RCurl)
library ( Seurat)
library ( cellrangerRkit)
library ( data.table)
library ( tidyverse)
This post also follows examples from https://davetang.org/muse/2017/08/01/getting-started-seurat/
First we remove all genes that are expressed in less than 3 cells.
esMus <- Read10X( c ( '~/github/jespermaag.github.io/es_mouse/' ))
dim ( esMus)
## [1] 24175 2717
keep <- apply ( esMus, 1 , function ( x) sum ( x> 0 ))
table ( keep>= 3 )
##
## FALSE TRUE
## 153 24022
keep <- keep >= 3
esMus <- esMus[ keep,]
summary ( colSums ( esMus))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1647 6263 17373 20033 31086 85686
genes <- apply ( esMus, 2 , function ( x) sum ( x> 0 ))
genes %>%
melt() %>%
ggplot( aes( x= value)) +
geom_histogram( colour= 'black' , fill= "steelblue" ) +
theme_classic() +
xlab( 'Number genes (1>= counts)' ) +
ggtitle( 'Number of genes detected per cell' ) +
theme( axis.title.x = element_text( size= 14 ), axis.text.x = element_text( size= 12 , colour= 'black' ),
axis.text.y = element_text( size= 12 , colour= 'black' ), axis.title.y = element_text( size= 14 ))
colSums ( esMus) %>%
melt() %>%
ggplot( aes( x= value)) +
geom_histogram( colour= 'black' , fill= "steelblue" ) +
theme_classic() +
xlab( 'Counts per cell)' ) +
ggtitle( 'Total counts per cell' ) +
theme( axis.title.x = element_text( size= 14 ), axis.text.x = element_text( size= 12 , colour= 'black' ),
axis.text.y = element_text( size= 12 , colour= 'black' ), axis.title.y = element_text( size= 14 ))
As seen in the number of gene plots, there is a binomial distribution of number of genes per cell.
I will keep this in mind for further analysis.
The binomial distribtion is missing from the total counts per cell.
To use Seurat, I first have to create a Seurat object
esMusSeur <- CreateSeuratObject( raw.data = esMus,
min.cells = 3 ,
min.genes = 100 ,
project = "ES_mouse" )
esMusSeur
## An object of class seurat in project ES_mouse
## 24022 genes across 2717 samples.
slotNames( esMusSeur)
## [1] "raw.data" "data" "scale.data" "var.genes"
## [5] "is.expr" "ident" "meta.data" "project.name"
## [9] "dr" "assay" "hvg.info" "imputed"
## [13] "cell.names" "cluster.tree" "snn" "calc.params"
## [17] "kmeans" "spatial" "misc" "version"
esMusSeur <- NormalizeData( object = esMusSeur,
normalization.method = "LogNormalize" ,
scale.factor = 1e4 )
colSums ( esMusSeur@ data) %>%
melt() %>%
ggplot( aes( x= value)) +
geom_histogram( colour= 'black' , fill= "steelblue" ) +
theme_classic() +
xlab( 'Normalised counts per cell' ) +
ggtitle( 'Total counts per cell' ) +
theme( axis.title.x = element_text( size= 14 ), axis.text.x = element_text( size= 12 , colour= 'black' ),
axis.text.y = element_text( size= 12 , colour= 'black' ), axis.title.y = element_text( size= 14 ))
As seen in the previous figure, after normalised the binomial distribution appears for the total counts.
Below I want to find the variable genes between the cells and plot the PCA.
Since this is my first single cell analysis, I will use the parameters from https://davetang.org/muse/2017/08/01/getting-started-seurat/
esMusSeur <- FindVariableGenes( object = esMusSeur,
mean.function = ExpMean,
dispersion.function = LogVMR,
x.low.cutoff = 0.0125 ,
x.high.cutoff = 3 ,
y.cutoff = 0.5 )
length ( esMusSeur@ var.genes)
## [1] 5707
head ( esMusSeur@ hvg.info)
## gene.mean gene.dispersion gene.dispersion.scaled
## Gm7102 0.5832131 5.417292 16.151102
## Dcdc2c 0.4870505 5.006380 18.885776
## Rn28s1 2.9695050 4.793843 2.929660
## Rn4.5s 2.7443443 4.607135 3.618150
## Rn45s 3.6240987 4.024695 2.001075
## Ccdc36 3.3616208 3.766651 1.659102
esMusSeur@ hvg.info %>%
ggplot( aes( x= gene.mean, y = gene.dispersion.scaled)) +
geom_point( pch= 21 , fill= 'steelblue' ) +
theme_classic() +
xlab( 'Mean gene expression' ) +
ylab( 'scaled dispersion' ) +
ggtitle( 'Mean expression vs dispersion' ) +
theme( axis.title.x = element_text( size= 14 ), axis.text.x = element_text( size= 12 , colour= 'black' ),
axis.text.y = element_text( size= 12 , colour= 'black' ), axis.title.y = element_text( size= 14 )) +
geom_smooth()
## Regressing out: nUMI
## Scaling data matrix
esMusSeur <- RunPCA( object = esMusSeur,
pc.genes = esMusSeur@ var.genes,
do.print = TRUE ,
pcs.print = 1 : 5 ,
genes.print = 5 )
## [1] "PC1"
## [1] "Krt8" "Krt18" "S100a6" "Gsn" "Lgals1"
## [1] ""
## [1] "Npm1" "Dppa5a" "Ptma" "Nlrp1a" "Rpl21"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "Lrrc58" "Tpm1" "Tagln" "Eno1b" "Eno1"
## [1] ""
## [1] "Tdh" "Mir690" "Esrrb" "Zfp42" "Slc29a1"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "Gm15772" "Mt2" "Esrrb" "Tdh" "Hmces"
## [1] ""
## [1] "Acsf2" "Ppm1k" "Gm10941" "Agl" "Mtfmt"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "Klf5" "Malat1" "Klf6" "Slc2a3" "Sparc"
## [1] ""
## [1] "Il12a" "Gm20740" "Olfr954" "Fam229b" "Tmem126b"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "Gapdh" "Gpx4" "Gm8709" "Dusp9" "Gsn"
## [1] ""
## [1] "Erdr1" "Hist1h2ao" "Apoe" "Hist1h2ap" "Flnb"
## [1] ""
## [1] ""
PrintPCAParams( esMusSeur)
## Parameters used in latest PCA calculation run on: 2018-08-11 14:46:38
## =============================================================================
## PCs computed Genes used in calculation PCs Scaled by Variance Explained
## 20 5707 TRUE
## -----------------------------------------------------------------------------
## rev.pca
## FALSE
## -----------------------------------------------------------------------------
## Full gene list can be accessed using
## GetCalcParam(object = object, calculation = "RunPCA", parameter = "pc.genes")
VizPCA( object = esMusSeur, pcs.use = 1 : 2 )
PCAPlot( object = esMusSeur, dim.1 = 1 , dim.2 = 2 )
esMusSeur <- ProjectPCA( object = esMusSeur, do.print = FALSE )
By plotting the standard deviation for each principal compartment, I can see that the deviation stop at around the 12th compartment.
K-nearest neighbour finds the number of groups in the data set, which is used to colour the data in the tSNE-plot
PCElbowPlot( esMusSeur)
esMusSeur <- FindClusters( object = esMusSeur,
reduction.type = "pca" ,
dims.use = 1 : 10 ,
resolution = 0.6 ,
print.output = 0 ,
save.SNN = TRUE )
esMusSeur <- FindClusters( object = esMusSeur,
reduction.type = "pca" ,
dims.use = 1 : 12 ,
resolution = 0.6 ,
print.output = 0 ,
save.SNN = TRUE )
esMusSeur <- RunTSNE( object = esMusSeur,
dims.use = 1 : 12 ,
do.fast = TRUE )
TSNEPlot( object = esMusSeur, do.label = TRUE )
I then want to identify the top genes per cluster.
The top genes can then be plotted as violin plots, feature plots, which projects the expression on the tSNE-plot, or as heatmaps for the top 5 highest expressed genes per cluster.
topMarkers <- esMusSeur.markers %>%
group_by( cluster) %>%
top_n( 1 , avg_logFC)
VlnPlot( object = esMusSeur, features.plot = topMarkers$ gene, point.size.use = 0 )
FeaturePlot( object = esMusSeur,
features.plot = topMarkers$ gene,
cols.use = c ( "grey" , "blue" ),
reduction.use = "tsne" )
topMarkers5 <- esMusSeur.markers %>%
group_by( cluster) %>%
top_n( 5 , avg_logFC)
DoHeatmap( object = esMusSeur,
genes.use = topMarkers5$ gene,
remove.key = TRUE ,
slim.col.label = TRUE )
There you go. My first single cell analysis using Seurat, and following the examples from Dave Tang