Seurat v5测试第一部分- 聚类_seurat v5 操作文档-CSDN博客

本文链接：https://blog.csdn.net/interestingaha/article/details/147094393

1.数据下载

首先下载PBMC数据（10X genomics数据），主要包括三个文件：barcodes.tsv、genes.tsv、matrix.mtx。它们的含义如下：

barcodes.tsv：存储用于标记单个细胞的唯一条形码（barcode），实际分析中需要通过质控过滤低质量的细胞（比如空液滴或者双细胞）。

genes.tsv：存储基因（或特征）的 标识符 和 名称。每行对应一个基因或其他特征（如抗体标签、CRISPR引导RNA）。

matrix.mtx：以 稀疏矩阵格式 存储基因表达数据（UMI计数矩阵）

2. 代码测试

library(dplyr)
library(Seurat)
library(patchwork)
library(ggplot2)

# ---- Data quality control ----

# Load the 10x Genomics PBMC matrix (barcodes / genes / matrix files).
pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/")

# Build the Seurat object. `min.cells = 3` keeps only genes detected in at
# least 3 cells; `min.features = 200` keeps only cells with at least 200
# detected genes.
pbmc <- CreateSeuratObject(
  counts = pbmc.data,
  project = "pbmc3k",
  min.cells = 3,
  min.features = 200
)
pbmc

# Compute, per cell, the percentage of counts that come from features whose
# name starts with "MT-" (mitochondrial genes) and store it as metadata.
# This only records the value; the actual filtering happens in subset() below.
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

# Violin plots of detected genes, total UMI counts and mitochondrial
# percentage per cell, used to choose the QC thresholds.
VlnPlot(
  pbmc,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)

# Total UMI count against mitochondrial percentage.
plot1 <- FeatureScatter(
  pbmc,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt"
)
# Total UMI count against number of detected genes.
plot2 <- FeatureScatter(
  pbmc,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)
plot1 + plot2

# Keep cells with more than 200 and fewer than 2500 detected genes (drops
# likely empty droplets and likely doublets) and less than 5% mitochondrial
# counts (drops likely dying or damaged cells).
pbmc <- subset(
  pbmc,
  subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5
)


# ---- Data normalization ----

# Remove the effect of sequencing depth: each gene count is divided by the
# cell's total count, multiplied by `scale.factor`, then log-transformed.
pbmc <- NormalizeData(
  pbmc,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)

# Select the 2000 most variable genes with the "vst" method; these are the
# features passed to PCA later in the script.
pbmc <- FindVariableFeatures(
  pbmc,
  selection.method = "vst",
  nfeatures = 2000
)

# The ten most highly variable genes.
top10 <- head(VariableFeatures(pbmc), n = 10)

# Variable-feature plot without labels (plot1) and with the top-10 genes
# labelled (plot2), saved side by side as a single PDF.
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
p <- plot1 + plot2
ggsave(filename = "combined_plot.pdf", plot = p, width = 12, height = 6)

# ---- Scaling, PCA, clustering and UMAP ----

# Scale every gene in the object, then run PCA on the variable features only.
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
pbmc <- RunPCA(pbmc, features = VariableFeatures(pbmc))

# Inspect the PCA result in several ways: top loadings as text, loading
# plots, a cell scatter plot and heatmaps of the leading components.
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca") + NoLegend()
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

# Elbow plot to help choose how many principal components to keep.
ElbowPlot(pbmc)

# Build the nearest-neighbour graph from the first 10 principal components.
pbmc <- FindNeighbors(pbmc, dims = 1:10)

# Unsupervised graph-based clustering. `resolution` controls granularity:
# larger values produce more clusters.
pbmc <- FindClusters(pbmc, resolution = 0.5)
head(Idents(pbmc), n = 5)

# Non-linear embedding on the same 10 components used for clustering.
pbmc <- RunUMAP(pbmc, dims = 1:10)

# Tip: pass `label = TRUE`, or use LabelClusters(), to label the clusters.
DimPlot(pbmc, reduction = "umap")

# Persist the processed object so it can be reloaded without recomputing.
saveRDS(pbmc, file = "./pbmc_tutorial.rds")

# ---- Differential expression analysis ----

# Markers of cluster 2 versus all remaining cells.
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2)
head(cluster2.markers, n = 5)

# Markers distinguishing cluster 5 from clusters 0 and 3.
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3))
head(cluster5.markers, n = 5)

# Markers for every cluster compared with all remaining cells, reporting
# only the positive (up-regulated) ones. Used to annotate cell types.
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)

# Show, per cluster, the markers whose average log2 fold change exceeds 1.
pbmc.markers %>%
  group_by(cluster) %>%
  dplyr::filter(avg_log2FC > 1)

# Same comparison for cluster 0 using the ROC test.
cluster0.markers <- FindMarkers(
  pbmc,
  ident.1 = 0,
  logfc.threshold = 0.25,
  test.use = "roc",
  only.pos = TRUE
)

# Violin plots of marker expression across clusters.
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))

# Raw counts can be plotted as well. Seurat v5 deprecates the `slot`
# argument of VlnPlot() in favour of `layer`, so `layer` is used here.
VlnPlot(pbmc, features = c("NKG7", "PF4"), layer = "counts", log = TRUE)

# Overlay marker expression on the UMAP embedding.
FeaturePlot(
  pbmc,
  features = c(
    "MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
    "CD8A"
  )
)

# Up to 10 markers per cluster with average log2 fold change above 1,
# taken in the order they appear in `pbmc.markers`.
top10 <- pbmc.markers %>%
  group_by(cluster) %>%
  dplyr::filter(avg_log2FC > 1) %>%
  slice_head(n = 10) %>%
  ungroup()
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

# Manual cell-type labels, one per cluster, in the order of levels(pbmc).
new.cluster.ids <- c(
  "Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono",
  "NK", "DC", "Platelet"
)

# The labels are hard-coded, so fail with a clear message when the
# clustering did not produce exactly as many clusters as there are labels
# (otherwise `names<-` errors cryptically or pads the names with NA).
if (length(new.cluster.ids) != length(levels(pbmc))) {
  stop(
    "new.cluster.ids has ", length(new.cluster.ids),
    " labels but the object has ", length(levels(pbmc)),
    " clusters; update the labels to match the clustering result.",
    call. = FALSE
  )
}
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

# Publication-style UMAP with larger axis titles and legend keys, saved
# as a JPEG.
library(ggplot2)
plot <- DimPlot(pbmc, reduction = "umap", label = TRUE, label.size = 4.5) +
  xlab("UMAP 1") +
  ylab("UMAP 2") +
  theme(
    axis.title = element_text(size = 18),
    legend.text = element_text(size = 18)
  ) +
  guides(colour = guide_legend(override.aes = list(size = 10)))
ggsave(
  filename = "./pbmc3k_umap.jpg",
  height = 7,
  width = 12,
  plot = plot,
  quality = 50
)

3. UMAP图中簇的分类

3.1 差异表达分析：寻找簇特异性标记基因

通过Seurat的FindMarkers()或FindAllMarkers()函数，识别每个簇与其他簇相比显著高表达的基因。

3.2 标记基因与已知细胞类型关联

将差异基因与文献或者数据库中的标记基因比对，例如：

免疫细胞：CD3D（T细胞）、CD19（B细胞）、CD14（单核细胞）、NKG7（NK细胞）。
肿瘤微环境：EPCAM（上皮细胞）、PECAM1（内皮细胞）、COL1A1（成纤维细胞）。
神经细胞：SYT1（神经元）、GFAP（星形胶质细胞）。

常用数据库与工具：

1. CellMarker：涵盖人类和小鼠的细胞标记基因。

2. PanglaoDB：单细胞转录组标记基因数据库。

3. SingleR：自动化注释工具（R包）

3.3 可视化验证标记基因

1. 基因表达UMAP图：FeaturePlot（）

2. 小提琴图：VlnPlot（）

3. 热图：DoHeatmap（）

3.4 自动化注释工具：SingleR

library(SingleR)
# HumanPrimaryCellAtlasData() is provided by the celldex package, not by
# SingleR itself, so celldex must be attached for the call below to work.
library(celldex)

# Load the reference dataset (Human Primary Cell Atlas).
ref <- HumanPrimaryCellAtlasData()

# Extract the log-normalized expression matrix. SeuratObject v5 deprecates
# the `slot` argument of GetAssayData() in favour of `layer`.
pbmc_data <- GetAssayData(pbmc, layer = "data")

# Run SingleR, labelling each cell with the reference's main cell types.
annotations <- SingleR(test = pbmc_data, ref = ref, labels = ref$label.main)

# Store the predicted labels as a metadata column on the Seurat object.
pbmc$SingleR_labels <- annotations$labels

# Visualize the predicted labels on the default embedding.
DimPlot(pbmc, group.by = "SingleR_labels", label = TRUE)