感谢关注,一起来学习生信干货。
前言
单细胞分析做多了,会发现翻来覆去就那些分析需求:降维-聚类-注释-差异表达DEG-富集分析GO-KEGG-GSEA-GSVA-细胞通讯分析-转录因子分析-拟时序分析-可变剪切分析-突变分析。。。然后就是无穷无尽的修图、改图。
所以很多代码修过来修过去,就变成了“精华”代码,所谓的精华代码,就是可以“一键出图”的代码。
这些代码,是老赵从别人那里借鉴来的 + 自己精修过的结合体。
从这个意义上来讲,个性化分析难吗?
不难,因为包是现成的,代码也都是现成的,似乎谁跑都能出图。
但另一方面,为啥没听说过有哪个大公司是专门做个性化生信分析的?为啥不是所有的测序公司都有自己的生信分析团队?即使有,客户的满意率又有多高?
老赵和朋友做自己的生信分析公司已经有半年了,直接/间接接触了20多位客户,以scRNAseq和GEO数据挖掘需求居多,对个性化生信分析是深有体会:
个性化生信分析最大的难点不在于跑代码和修改代码,而在于“费人”!它需要一个综合能力(生物学背景+代码能力+沟通能力)很强的人,来对接客户的分析需求,沟通需求,然后代码出图,再根据客户的反馈,修改代码,修图;如果客户有疑问,需要跟客户解释;如果客户对于分析方向犹豫,需要帮客户做决断;如果客户对文章构思有疑惑,需要帮客户设计思路等等等等。这样的一个人,从综合能力来讲,大概率是个博士水平了。所以老赵目前的感悟是:
1. 对个人而言,个性化生信分析可以做,但很难当主业去做大(因为忙不过来,分身乏术),只能当副业;
2.对测序公司的生信团队而言,虽然客户源广,跑常规分析的pipeline可以,但是做个性化分析大概率会缺“能和客户高效沟通的analyzer”。毕竟有这个能力的,不会甘心只做个analyzer。
好了,废话不多说,直接分享老赵的代码吧
1. 读取数据
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
setwd("/Users/Desktop/wechat/两组细胞-DEG-GO-KEGG-GSEA")
library(Seurat)
library(qs)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(SeuratData)
## 读取数据
data("pbmc3k.final")
pbmc3k<- UpdateSeuratObject(pbmc3k.final)
table(pbmc3k$seurat_annotations)
Idents(pbmc3k)<- pbmc3k$seurat_annotations
DimPlot(pbmc3k)
2. DEG一键分析及出图
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
#########DEGs
###1. 指定两种细胞,比如CD14+ Mono vs NK
# logFC是正的放前面
group1 <- 'CD14+ Mono'
group2 <- 'NK'
## 这里一定得是F,否则只有ident.1的正的数据
tmp.marker <- FindMarkers(pbmc3k,
logfc.threshold = 0.25, min.pct = 0.1,
only.pos = F, test.use = "wilcox",
ident.1= group1, ident.2= group2)
#谁放前面,谁的logFC就是正的
tmp.marker$gene=rownames(tmp.marker)
tmp.marker$condition=ifelse(tmp.marker$avg_log2FC > 0, group1, group2)
tmp.marker=tmp.marker%>%filter(p_val_adj < 0.05)
tmp.marker=as.data.frame(tmp.marker)
tmp.marker=tmp.marker%>%arrange(desc(avg_log2FC))
rownames(tmp.marker) <- tmp.marker$gene
marker_condition= tmp.marker
# 定义 VolcanoPlot 函数(此处省略函数定义,保持不变)
marker_condition$log2FoldChange = marker_condition$avg_log2FC
marker_condition$padj = marker_condition$p_val_adj
## 去除男性特异性基因
marker_condition <- marker_condition[!marker_condition$gene %in% marker_condition$gene[grep('^XIST',marker_condition$gene)],]
VolcanoPlot <- function(dif, log2FC = 1, padj = 0.05,
label.symbols = NULL, label.max = 30,
cols = c("#497aa2", "#ae3137"), title = ""){
# (1) define up and down
dif$threshold <- "ns" # 初始化所有基因的threshold为"ns"(不显著)
up_indices <- which(dif$log2FoldChange > log2FC & dif$padj < padj)
dif$threshold[up_indices] <- "up"
down_indices <- which(dif$log2FoldChange < -log2FC & dif$padj < padj)
dif$threshold[down_indices] <- "down"
dif$threshold <- factor(dif$threshold, levels = c('down', 'ns', 'up'))
tb2 = table(dif$threshold)
# (2) plot
color_manual <- c('down' = cols[1], 'ns' = "grey", 'up' = cols[2])
labels_manual <- c('down' = paste0("down (", tb2[1], ")"), 'ns' = 'ns', 'up' = paste0("up (", tb2[3], ")"))
g1 = ggplot(data = dif, aes(x = log2FoldChange, y = -log10(padj), color = threshold)) +
geom_point(alpha = 0.8, size = 0.8) +
geom_vline(xintercept = c(-log2FC, log2FC), linetype = 2, color = "grey") +
geom_hline(yintercept = -log10(padj), linetype = 2, color = "grey") +
labs(title = ifelse("" == title, "", paste("DEG:", title))) +
xlab(bquote(Log[2] * FoldChange)) +
ylab(bquote(-Log[10] * italic(P.adj))) +
theme_classic(base_size = 14) +
theme(legend.box = "horizontal",
legend.position = "top",
legend.spacing.x = unit(0, 'pt'),
legend.text = element_text(margin = margin(r = 20)),
legend.margin = margin(b = -10, unit = "pt"),
plot.title = element_text(hjust = 0.5, size = 10)) +
scale_color_manual('', labels = labels_manual, values = color_manual) +
guides(color = guide_legend(override.aes = list(size = 3, alpha = 1)))
g1
# (3) label genes
if (label.max > 0) {
if (is.null(label.symbols)) {
dif.sig = dif[which(dif$threshold != "ns"), ]
len = nrow(dif.sig)
if (len < label.max) {
label.symbols = rownames(dif.sig)
} else {
dif.sig = dif.sig[order(dif.sig$log2FoldChange), ]
dif.sig = rbind(dif.sig[1:(label.max / 2), ], dif.sig[(len - label.max / 2):len, ])
label.symbols = rownames(dif.sig)
}}
dd_text = dif[label.symbols, ]
# add text
g1 + geom_text_repel(data = dd_text,
aes(x = log2FoldChange, y = -log10(padj), label = row.names(dd_text)),
color = "black", alpha = 1)
} else {
return(g1)
}
}
dir.create('DEGs')
VolcanoPlot(marker_condition, log2FC = 0.585,padj = 0.05, label.max = 10)
ggsave(paste0('DEGs/','P_0.05_FC_1.5_volcano_plot_1.pdf'),width = 8, height = 6 )
VolcanoPlot(marker_condition, log2FC = 0.585,padj = 0.05, label.max = 0)
ggsave(paste0('DEGs/','P_0.05_FC_1.5_volcano_plot_no_label_1.pdf'),width = 8, height = 6 )
VolcanoPlot(marker_condition,log2FC = 0.585, padj=0.01, label.max = 50, cols=c("blue", "red"))
ggsave(paste0('DEGs/','P_0.01_logFC_1_volcano_plot_2.pdf'),width = 8, height = 6 )
VolcanoPlot(marker_condition,log2FC = 0.585, padj=0.01, label.max = 0, cols=c("blue", "red"))
ggsave(paste0('DEGs/','P_0.01_logFC_1_volcano_plot_no_label_2.pdf'),width = 8, height = 6 )
write.csv(marker_condition[marker_condition$p_val_adj < 0.05,],
paste0('DEGs/','P_0.05_DEGs_genes.csv'))
write.csv(marker_condition[marker_condition$p_val_adj < 0.01,],
paste0('DEGs/','P_0.01_DEGs_genes.csv'))
上述代码一共在DEG的文件夹里出了四张图和两个csv文件:
3. GO一键富集分析及出图
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
###GO-analysis
# OrgDb的可选类型:'org.Dr.eg.db','org.Mm.eg.db','org.Hs.eg.db'
library(org.Hs.eg.db)
library(enrichplot)
library(clusterProfiler)
library(ggupset)
dir.create('GO')
# 初始化或清空错误信息文件
#这行代码的作用是打开(或创建)一个名为 "error_messages.txt" 的文件,以追加模式进行写入。
#这意味着任何写入这个文件的数据都会被添加到文件的末尾,而不会覆盖原有内容。
#如果 "error_messages.txt" 文件不存在,这行代码将会创建这个文件。
error_file <- file("GO/error_messages.txt", "a")
writeLines(c("Error log for GO enrichment analysis:",
"------------------------------------------------",
'starting time at:', format(Sys.time(), "%Y-%m-%d %H:%M:%S"),
"------------------------------------------------"),
error_file, useBytes = FALSE)
for (i in unique(marker_condition$condition)) {
# 尝试转换基因ID
gene <- tryCatch({
bitr(geneID = marker_condition$gene[marker_condition$condition == i],
fromType = 'SYMBOL',
toType = 'ENTREZID',
OrgDb = 'org.Hs.eg.db')},
error = function(e) {
message_text <- paste("Error in gene ID conversion for :", i, ": ", e$message)
writeLines(message_text, error_file, useBytes = FALSE)
return(NULL)})
# 如果基因ID转换失败,跳过当前循环的剩余部分
if (is.null(gene)) {
next
}
# 进行GO富集分析
ego <- tryCatch({
enrichGO(gene = gene$ENTREZID,
OrgDb = 'org.Hs.eg.db',
ont = "BP",
pvalueCutoff = 0.05,
qvalueCutoff = 1,
readable = TRUE)
}, error = function(e) {
message_text <- paste("Error in GO enrichment analysis for :", i, ": ", e$message)
writeLines(message_text, error_file, useBytes = FALSE)
return(NULL)
})
# 如果没有富集出任何通路,跳过画图代码
if (is.null(ego) || nrow(as.data.frame(ego)) == 0) {
message_text <- paste("No enriched GO terms found for :", i)
writeLines(message_text, error_file, useBytes = FALSE)
next}
# 保存富集分析结果
file_ego <- paste0("GO/",i, "_Enriched_GO_terms.csv")
write.csv(as.data.frame(ego), file_ego)
# 绘制图形并设置标题
file_name <- paste0("GO/Barplot_GO_terms_enriched_in_", i, ".pdf")
p <- barplot(ego, title = paste0("GO terms enriched in ", i))
ggsave(file_name, plot = p, width = 6, height = 5)
file_name <- paste0("GO/Dotplot_GO_terms_enriched_in_", i, ".pdf")
p<- dotplot(ego, x = ~-log(p.adjust), title = paste0("GO terms enriched in cluster ", i))
ggsave(file_name, plot = p, width = 8, height = 7)
file_name <- paste0("GO/Treeplot_GO_terms_enriched_in_", i, ".pdf")
edox2 <- pairwise_termsim(ego)
p<- treeplot(edox2, hclust_method = "average")
ggsave(file_name, plot = p, width = 14, height = 8)
file_name <- paste0("GO/Upsetplot_GO_terms_enriched_in_", i, ".pdf")
p<- upsetplot(ego)
ggsave(file_name, plot = p, width = 14, height = 8)
}
close(error_file)
最后在GO的文件夹里出了8张图和2个GO.csv富集文件:
4. KEGG一键富集分析及出图
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
#KEGG analysis
dir.create('KEGG')
error_file <- file("KEGG/error_messages.txt", "a")
writeLines(c("Error log for GO enrichment analysis:",
"------------------------------------------------",
'starting time at:', format(Sys.time(), "%Y-%m-%d %H:%M:%S"),
"------------------------------------------------"),
error_file, useBytes = FALSE)
for (i in unique(marker_condition$condition)) {
# 尝试转换基因ID
gene <- tryCatch({
bitr(geneID = marker_condition$gene[marker_condition$condition == i],
fromType = 'SYMBOL',
toType = 'ENTREZID',
OrgDb = 'org.Hs.eg.db')
}, error = function(e) {
message_text <- paste("Error in gene ID conversion for condition:", i, ": ", e$message)
writeLines(message_text, error_file, useBytes = FALSE)
return(NULL)
})
# 如果基因ID转换失败,跳过当前循环的剩余部分
if (is.null(gene)) {
next
}
# 进行KEGG富集分析
kegg <- tryCatch({
enrichKEGG(gene = gene$ENTREZID,
organism = "hsa",
keyType = "kegg",
pvalueCutoff=0.05,
pAdjustMethod="BH",
qvalueCutoff=1)
},
error = function(e) {
message_text <- paste("Error in GO enrichment analysis for condition:", i, ": ", e$message)
writeLines(message_text, error_file, useBytes = FALSE)
return(NULL)
}
)
# 如果没有富集出任何通路,跳过画图代码
if (is.null(kegg) || nrow(kegg) == 0) {
message_text <- paste("No enriched KEGG terms found for:", i)
writeLines(message_text, error_file, useBytes = FALSE)
next
}
kegg_readable <- setReadable(kegg, 'org.Hs.eg.db', 'ENTREZID')
kegg<- data.frame(kegg_readable)
#保存KEGG结果
file_kegg <- paste0("KEGG/KEGG_enriched_in_", i,".csv")
write.csv(kegg,file_kegg)
#file_name <- paste0("KEGG/Cnetplot_KEGG_enriched_in_", i,".pdf")
#cnetplot(kegg_readable,circular = T, colorEdge = T,color_gene = "#B3B",
#edge.width = 10)
#ggsave(file_name,width = 20, height = 20)
file_name <- paste0("KEGG/Heatplot_KEGG_terms_enriched_in_", i,".pdf")
p<- heatplot(kegg_readable, showCategory=5)+
scale_fill_gradient(low = "blue", high = "red")
ggsave(file_name,width = 25, height = 12)
file_name <- paste0("KEGG/Dotplot_KEGG_terms_enriched_in_", i,".pdf")
dotplot(kegg_readable,x = ~-log(p.adjust), showCategory=10)
ggsave(file_name, width = 8, height = 7)
file_name <- paste0("KEGG/Barplot_KEGG_terms_enriched_in_", i,".pdf")
barplot(kegg_readable, showCategory=10)
ggsave(file_name, width = 8, height = 7)
file_name <- paste0("KEGG/Treeplot_KEGG_terms_enriched_in_", i, ".pdf")
edox2 <- pairwise_termsim(kegg_readable)
p<- treeplot(edox2, hclust_method = "average")
ggsave(file_name, plot = p, width = 14, height = 8)
file_name <- paste0("KEGG/Upsetplot_KEGG_terms_enriched_in_", i, ".pdf")
p<- upsetplot(kegg_readable)
ggsave(file_name, plot = p, width = 14, height = 8)
}
close(error_file)
KEGG这套代码一共出了10张图,2个KEGG.csv文件:
5. GSEA一键富集分析及出图
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
###########GSEA
#GSEA analysis
dir.create('GSEA/GO/GSEA_plot',recursive = T)
dir.create('GSEA/KEGG/GSEA_plot',recursive = T)
marker_condition$new <- 'all'
for (i in unique(marker_condition$new)) {
genes <- marker_condition[marker_condition$new == i,]
# 使用 bitr 获取 ENTREZID
# 如果没有找到ENTREZID,则跳过当前循环
gene_ids <- bitr(genes$gene, fromType = 'SYMBOL', toType = 'ENTREZID', OrgDb = org.Hs.eg.db)
colnames(gene_ids)[1] <- 'gene'
# 合并数据并排序
merge_genes <- merge(genes, gene_ids, by = "gene", all.y = TRUE)
merge_genes <- merge_genes[order(merge_genes$avg_log2FC, decreasing = TRUE), ]
input_LFC <- merge_genes$avg_log2FC
names(input_LFC) <- merge_genes$ENTREZID
# 运行 gseKEGG
res <- gseKEGG(geneList = input_LFC,
organism = "hsa",
pvalueCutoff = 0.1,
pAdjustMethod = "BH")
# 保存结果和绘图
filename <- paste0("GSEA/KEGG/GSEA_analysis.csv")
write.csv(res@result, filename)
#### Treeplot
file_name <- paste0("GSEA/KEGG/Treeplot_GSEA_KEGG_terms_enriched_in_", i, ".pdf")
edox2 <- pairwise_termsim(res)
p<- treeplot(edox2, hclust_method = "average")
p
ggsave(file_name, plot = p, width = 14, height = 8)
#### Upsetplot;fold change distributions of different categories
file_name <- paste0("GSEA/KEGG/Upsetplot_GSEA_KEGG_terms_enriched_in_", i, ".pdf")
p<- upsetplot(res)+ylab('FoldChange')
p
ggsave(file_name, plot = p, width = 14, height = 8)
#### Ridgeplot
file_name <- paste0("GSEA/KEGG/Ridgeplot_GSEA_KEGG_terms_enriched_in_", i, ".pdf")
p<- ridgeplot(res)+xlab('Expression of core enriched genes')
p
ggsave(file_name, plot = p, width = 10, height = 8)
for (ci in seq_along(res$Description)) {
p <- gseaplot2(res, title = res$Description[ci], geneSetID = ci, pvalue_table = TRUE)
filename <- paste0("GSEA/KEGG/GSEA_plot/GSEA_plot_", ci, '_','.pdf')
ggsave(filename, plot = p, width = 12, height = 8)
}
#多个gnenset合并展示
#gseaplot2(res, geneSetID = 1:5,pvalue_table = T)
library(clusterProfiler)
library(org.Hs.eg.db)
library(devtools)
#devtools::install_github("davidsjoberg/ggsankey")
#devtools::install_github("junjunlab/GseaVis")
library(GseaVis)
# enrichment
ego3 <- gseGO(geneList = input_LFC,
OrgDb = org.Hs.eg.db,
ont = "BP",
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
# default plot
pdf(file = paste0("GSEA/GO/dotplot1_GSEA_GO_terms_enriched_in_", i, ".pdf"),width = 12, height = 12)
print(dotplotGsea(data = ego3,topn = 20))
dev.off()
# use NES
pdf(file = paste0("GSEA/GO/dotplot2_GSEA_GO_terms_enriched_in_", i, ".pdf"),width = 12, height = 12)
print(dotplotGsea(data = ego3,topn = 10,
order.by = 'NES'))
dev.off()
# add segment line
pdf(file = paste0("GSEA/GO/dotplot3_GSEA_GO_terms_enriched_in_", i, ".pdf"),width = 12, height = 12)
print(dotplotGsea(data = ego3,topn = 10,
order.by = 'NES',
add.seg = T))
dev.off()
# change line style
pdf(file = paste0("GSEA/GO/dotplot4_GSEA_GO_terms_enriched_in_", i, ".pdf"),width = 12, height = 12)
print(dotplotGsea(data = ego3,topn = 10,
add.seg = T,
line.col = 'orange',
line.type = 'dashed'))
dev.off()
# free scale
pdf(file = paste0("GSEA/GO/dotplot5_GSEA_GO_terms_enriched_in_", i, ".pdf"),width = 12, height = 12)
print(dotplotGsea(data = ego3,topn = 10,
order.by = 'NES',
add.seg = T,
line.col = 'orange',
line.type = 'dashed',
scales = 'free'))
dev.off()
#### Treeplot
file_name <- paste0("GSEA/GO/Treeplot_GSEA_GO_terms_enriched_in_", i, ".pdf")
edox2 <- pairwise_termsim(ego3)
p<- treeplot(edox2, hclust_method = "average")
p
ggsave(file_name, plot = p, width = 14, height = 8)
#### Upsetplot;fold change distributions of different categories
file_name <- paste0("GSEA/GO/Upsetplot_GSEA_GO_terms_enriched_in_", i, ".pdf")
p<- upsetplot(ego3)+ylab('FoldChange')
p
ggsave(file_name, plot = p, width = 14, height = 8)
file_name <- paste0("GSEA/GO/Ridgeplot_GSEA_GO_terms_enriched_in_", i, ".pdf")
p<- ridgeplot(ego3)+xlab('Expression of core enriched genes')
p
ggsave(file_name, plot = p, width = 15, height = 15)
# all plot
for (term in 1:length(ego3@result[["ID"]])) {
pdf(file = paste0("GSEA/GO/GSEA_plot/GSEA_plot", term, ".pdf"),width = 12, height = 8)
print(gseaNb(object = ego3,
geneSetID = ego3@result[["ID"]][term],
newGsea = F,
addPval = T,
pvalY = 0.6,
markTopgene = T,
pvalX = 0.9,
pHjust = 1,
subPlot = 3))
dev.off()
}}
GSEA的结果分成了两个subfolder:
GSEA/GO 和 GSEA/KEGG
点击数:0