1.数据下载
首先下载PBMC数据(10X genomics数据),主要包括三个文件:barcodes.tsv、genes.tsv、matrix.mtx。它们的含义如下:
barcodes.tsv:存储用于标记单个细胞的唯一条形码(barcode),实际分析中需要通过质控过滤低质量的细胞(比如空液滴或者双细胞)。
genes.tsv:存储基因(或特征)的 标识符 和 名称。每行对应一个基因或其他特征(如抗体标签、CRISPR引导RNA)。
matrix.mtx:以 稀疏矩阵格式 存储基因表达数据(UMI计数矩阵)
2. 代码测试
library(dplyr)
library(Seurat)
library(patchwork)
library(ggplot2)
# ---- Data loading and quality control ----------------------------------------
# Read the 10X Genomics output directory for the PBMC dataset.
pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/")

# Build the Seurat object. Genes detected in fewer than 3 cells and cells with
# fewer than 200 detected genes are dropped at construction time.
pbmc <- CreateSeuratObject(
  counts = pbmc.data,
  project = "pbmc3k",
  min.cells = 3,
  min.features = 200
)
pbmc

# Record, per cell, the percentage of counts that come from features whose
# names start with "MT-" (mitochondrial genes). This line only computes the
# metric; cells are actually filtered by the subset() call further down.
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

# Violin plots of the three QC metrics: detected genes, total UMI counts and
# mitochondrial percentage.
VlnPlot(
  pbmc,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)

# Scatter plots relating total UMI counts to mitochondrial percentage (plot1)
# and to the number of detected genes (plot2), shown side by side.
plot1 <- FeatureScatter(
  pbmc,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt"
)
plot2 <- FeatureScatter(
  pbmc,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)
plot1 + plot2

# Keep cells with more than 200 and fewer than 2500 detected genes (intended to
# remove empty droplets and doublets / abnormally large cells) and with less
# than 5% mitochondrial counts (intended to remove dying or damaged cells).
pbmc <- subset(
  pbmc,
  subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5
)
# ---- Normalization and variable-feature selection ----------------------------
# Remove the effect of sequencing depth: each gene count is divided by the
# cell's total count, multiplied by scale.factor, and then log-transformed.
pbmc <- NormalizeData(
  pbmc,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)

# Select the 2000 most variable genes with the "vst" method; these are the
# features used for PCA later in the script.
pbmc <- FindVariableFeatures(
  pbmc,
  selection.method = "vst",
  nfeatures = 2000
)

# First ten entries of the variable-feature list, used as plot labels.
top10 <- head(VariableFeatures(pbmc), n = 10)

# Variable-feature plot without (plot1) and with (plot2) gene labels, combined
# into a single figure and written to a PDF.
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
p <- plot1 + plot2
ggsave(filename = "combined_plot.pdf", plot = p, width = 12, height = 6)
# ---- Scaling, PCA, clustering and UMAP ----------------------------------------
# Scale every gene in the object (not only the variable features).
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)

# PCA is computed on the variable features only.
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

# Inspect the PCA result in several ways: top loading genes for PCs 1-5,
# loading plots for PCs 1-2, the cell embedding, and heatmaps for PC 1 and
# for PCs 1-15 (each drawn from 500 cells).
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca") + NoLegend()
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

# Elbow plot to help choose how many principal components to keep downstream.
ElbowPlot(pbmc)

# Build the nearest-neighbour graph on the first 10 PCs, then cluster it.
# `resolution` controls granularity: larger values give more clusters.
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.5)
head(Idents(pbmc), n = 5)

# UMAP embedding on the same 10 PCs. Pass `label = TRUE` to DimPlot (or use
# LabelClusters) to annotate individual clusters on the plot.
pbmc <- RunUMAP(pbmc, dims = 1:10)
DimPlot(pbmc, reduction = "umap")

# Persist the processed object so later steps can reload it without recomputing.
saveRDS(pbmc, file = "./pbmc_tutorial.rds")
# ---- Differential expression analysis -----------------------------------------
# Markers of cluster 2 versus all other cells.
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2)
head(cluster2.markers, n = 5)

# Markers distinguishing cluster 5 from clusters 0 and 3.
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3))
head(cluster5.markers, n = 5)

# Markers for every cluster compared to all remaining cells, positive ones
# only; the per-cluster lists are later used to annotate cell types.
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)

# Print, grouped by cluster, the markers whose avg_log2FC exceeds 1.
pbmc.markers %>%
  group_by(cluster) %>%
  dplyr::filter(avg_log2FC > 1)

# Positive markers of cluster 0 using the ROC test.
cluster0.markers <- FindMarkers(
  pbmc,
  ident.1 = 0,
  logfc.threshold = 0.25,
  test.use = "roc",
  only.pos = TRUE
)

# Violin plots of expression across clusters.
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
# Raw counts can be plotted as well.
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

# Gene expression overlaid on the UMAP embedding.
FeaturePlot(
  pbmc,
  features = c(
    "MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
    "CD8A"
  )
)

# For each cluster keep the first 10 rows with avg_log2FC > 1, then draw a
# heatmap of those genes. Note: this reuses (overwrites) the name `top10`.
top10 <- pbmc.markers %>%
  group_by(cluster) %>%
  dplyr::filter(avg_log2FC > 1) %>%
  slice_head(n = 10) %>%
  ungroup()
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
# ---- Assign cell-type labels to clusters and export the UMAP ------------------
# Labels are positional: the i-th label is applied to the i-th identity level
# of `pbmc`, so the order here must match the cluster numbering.
new.cluster.ids <- c(
  "Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono",
  "NK", "DC", "Platelet"
)

# The number of clusters depends on the data, the chosen resolution and the
# package version. Fail early with a clear message instead of letting
# `names<-` raise a cryptic error or silently produce NA names.
if (length(new.cluster.ids) != length(levels(pbmc))) {
  stop(
    "new.cluster.ids has ", length(new.cluster.ids), " labels but pbmc has ",
    length(levels(pbmc)), " clusters; update the labels before renaming.",
    call. = FALSE
  )
}
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)

# Labelled UMAP shown on screen (legend hidden because labels are on the plot).
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

# Publication-style version with larger axis titles and legend keys, saved as
# a JPEG. ggplot2 is already attached at the top of the script.
plot <- DimPlot(pbmc, reduction = "umap", label = TRUE, label.size = 4.5) +
  xlab("UMAP 1") +
  ylab("UMAP 2") +
  theme(
    axis.title = element_text(size = 18),
    legend.text = element_text(size = 18)
  ) +
  guides(colour = guide_legend(override.aes = list(size = 10)))
ggsave(
  filename = "./pbmc3k_umap.jpg",
  height = 7,
  width = 12,
  plot = plot,
  quality = 50
)
3. UMAP图中簇的分类
3.1 差异表达分析:寻找簇特异性标记基因
通过Seurat的FindMarkers()或FindAllMarkers()函数,识别每个簇与其他簇相比显著高表达的基因。
3.2 标记基因与已知细胞类型关联
将差异基因与文献或者数据库中的标记基因比对,例如:
- 免疫细胞:CD3D(T细胞)、CD19(B细胞)、CD14(单核细胞)、NKG7(NK细胞)。
- 肿瘤微环境:EPCAM(上皮细胞)、PECAM1(内皮细胞)、COL1A1(成纤维细胞)。
- 神经细胞:SYT1(神经元)、GFAP(星形胶质细胞)。
常用数据库与工具:
1. CellMarker:涵盖人类和小鼠的细胞标记基因。
2. PanglaoDB:单细胞转录组标记基因数据库。
3. SingleR:自动化注释工具(R包)
3.3 可视化验证标记基因
1. 基因表达UMAP图:FeaturePlot()
2. 小提琴图:VlnPlot()
3. 热图:DoHeatmap()
3.4 自动化注释工具:SingleR
library(SingleR)
# HumanPrimaryCellAtlasData() is provided by the celldex package in current
# Bioconductor releases (the reference datasets were moved out of SingleR),
# so celldex must be attached for the call below to resolve.
library(celldex)

# Load the reference dataset (Human Primary Cell Atlas).
ref <- HumanPrimaryCellAtlasData()

# Extract the log-normalized expression matrix from the Seurat object.
# NOTE(review): newer Seurat versions prefer `layer = "data"` over `slot`;
# `slot` is kept here for compatibility with older installs - confirm.
pbmc_data <- GetAssayData(pbmc, slot = "data")

# Annotate every cell against the reference using its main cell-type labels.
annotations <- SingleR(test = pbmc_data, ref = ref, labels = ref$label.main)

# Store the predicted labels as a metadata column; the labels are in the same
# order as the columns (cells) of `pbmc_data`.
pbmc$SingleR_labels <- annotations$labels

# Visualize the predicted labels on the default dimensional reduction.
DimPlot(pbmc, group.by = "SingleR_labels", label = TRUE)