← 返回分析流程中心创建时间 2026/6/3 分析难度 入门 推荐场景 转录组分析 预计耗时 0.5-1 天
Pipeline Detail
RNA-Seq转录组基础与表达变化
RNA-seq Salmon + tximport + DESeq2
基于 Salmon 转录本准比对定量、tximport 基因层汇总和 DESeq2 差异表达分析的快速 RNA-seq 流程。
Metadata
流程元数据
先看应用场景、输入输出和工具依赖,再进入正文命令细节。
Difficulty
入门
Scenario
转录组分析
Estimated Time
0.5-1 天
Tools
DESeq2Salmontximport
Inputs
GTFcount matrixTPM
Outputs
heatmapvolcano plot
Workflow DAG
流程图
用步骤节点快速理解这个分析从原始数据到结果报告的流转关系。
STEP 1
→下载参考转录组
STEP 2
→Salmon 构建索引
STEP 3
→Salmon paired-end 定量
STEP 4
→GTF 生成 tx2gene
STEP 5
→tximport 基因层导入
STEP 6
→DESeq2 差异分析
STEP 7
火山图与热图
Protocol
流程文档
正文保留 Markdown 排版、代码语言标识和表格样式,适合边学边复现。
RNA-seq Salmon + tximport + DESeq2
适用场景
适用于 bulk RNA-seq 项目中需要快速、轻量地完成转录本准比对定量和基因层差异表达分析的场景。Salmon 负责 transcript-level quantification,tximport 将转录本结果按 tx2gene 汇总到 gene-level,DESeq2 负责统计建模与差异分析。
本流程样本包括 adc1-1、adc1-2、adc1-3、adc2-1、adc2-2、adc2-3、WT-1、WT-2、WT-3,比较组包括 adc1_vs_WT、adc2_vs_WT、adc2_vs_adc1。
1. 下载参考转录组并构建 Salmon 索引
curl ftp://ftp.ensemblgenomes.org/pub/plants/release-28/fasta/arabidopsis_thaliana/cdna/Arabidopsis_thaliana.TAIR10.28.cdna.all.fa.gz -o athal.fa.gz
REF_DIR="ref_genomic"
INDEX_DIR="ref_genomic_salmon"
THREADS=8
salmon index -t ${REF_DIR}/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa -i ${INDEX_DIR} --gencode -p ${THREADS}
bsub -J salmon_index -n 8 -R "span[hosts=1]" -o %J.out -e %J.err -q normal "bash salmon_index.sh"
2. Salmon paired-end 定量
INDEX_DIR="ref_genomic_salmon"
RAW_DATA_DIR="02_clean_data"
QUANT_DIR="05_salmon"
THREADS=12
LOG_DIR="logs"
SAMPLES=("adc1-1" "adc1-2" "adc1-3" "adc2-1" "adc2-2" "adc2-3" "WT-1" "WT-2" "WT-3")
for sample in "${SAMPLES[@]}"
do
SAMPLE_OUTDIR="${QUANT_DIR}/${sample}"
mkdir -p ${SAMPLE_OUTDIR}
salmon quant -i ${INDEX_DIR} -l A -1 ${RAW_DATA_DIR}/${sample}_R1_paired.fq.gz -2 ${RAW_DATA_DIR}/${sample}_R2_paired.fq.gz -p ${THREADS} --validateMappings --gcBias --seqBias -o ${SAMPLE_OUTDIR} > ${LOG_DIR}/salmon_${sample}.log 2>&1
done
bsub -J salmon -n 12 -R "span[hosts=1]" -o %J.out -e %J.err -q normal "bash salmon_quant_loop.sh"
3. 创建样本信息表
sample_names <- c(
"adc1-1", "adc1-2", "adc1-3",
"adc2-1", "adc2-2", "adc2-3",
"WT-1", "WT-2", "WT-3"
)
sample_info <- data.frame(
sample_id = sample_names,
condition = factor(c(rep("adc1", 3), rep("adc2", 3), rep("WT", 3))),
stringsAsFactors = FALSE
)
sample_info$quant_file <- file.path("05_salmon_quant", sample_info$sample_id, "quant.sf")
all(file.exists(sample_info$quant_file))
4. 从 GTF 生成 tx2gene 并导入 Salmon 结果
library(tximport)
library(DESeq2)
library(GenomicFeatures)
library(org.At.tair.db)
txdb <- makeTxDbFromGFF("Arabidopsis_thaliana.TAIR10.57.gtf", format = "gtf")
tx2gene <- select(
txdb,
keys = keys(txdb, "TXNAME"),
keytype = "TXNAME",
columns = "GENEID"
)
txi <- tximport(
sample_info$quant_file,
type = "salmon",
tx2gene = tx2gene,
countsFromAbundance = "lengthScaledTPM"
)
colnames(txi$counts) <- sample_names
colnames(txi$abundance) <- sample_names
colnames(txi$length) <- sample_names
tpm_data <- txi$abundance
write.csv(tpm_data, "05_salmon_quant/tables/tpm_data_all_genes.csv")
5. DESeq2 差异分析
dds <- DESeqDataSetFromTximport(
txi,
colData = sample_info,
design = ~ condition
)
keep <- rowSums(counts(dds) >= 10) >= 3
dds <- dds[keep, ]
dds <- DESeq(dds)
comparisons <- list(
adc1_vs_WT = c("condition", "adc1", "WT"),
adc2_vs_WT = c("condition", "adc2", "WT"),
adc2_vs_adc1 = c("condition", "adc2", "adc1")
)
add_gene_annotation <- function(gene_df) {
gene_ids <- gene_df$gene
gene_df$symbol <- mapIds(org.At.tair.db, keys = gene_ids, column = "SYMBOL", keytype = "TAIR", multiVals = "first")[gene_ids]
gene_df$description <- mapIds(org.At.tair.db, keys = gene_ids, column = "GENENAME", keytype = "TAIR", multiVals = "first")[gene_ids]
gene_df
}
normalized_counts <- counts(dds, normalized = TRUE)
tpm_filtered <- tpm_data[rownames(dds), ]
for (contrast_name in names(comparisons)) {
contrast_dir <- file.path("05_salmon_quant/tables", contrast_name)
dir.create(contrast_dir, showWarnings = FALSE, recursive = TRUE)
res <- results(dds, contrast = comparisons[[contrast_name]], alpha = 0.05, lfcThreshold = 1)
resLFC <- lfcShrink(dds, contrast = comparisons[[contrast_name]], type = "ashr", res = res)
res_df <- as.data.frame(resLFC)
res_df$gene <- rownames(res_df)
res_df <- add_gene_annotation(res_df)
norm_counts_df <- as.data.frame(normalized_counts)
norm_counts_df$gene <- rownames(norm_counts_df)
tpm_df <- as.data.frame(tpm_filtered)
tpm_df$gene <- rownames(tpm_df)
res_combined <- merge(res_df, norm_counts_df, by = "gene", all.x = TRUE)
res_combined <- merge(res_combined, tpm_df, by = "gene", all.x = TRUE)
sig_genes <- res_combined |>
dplyr::filter(padj < 0.05 & abs(log2FoldChange) > 1) |>
dplyr::arrange(padj, dplyr::desc(abs(log2FoldChange)))
write.csv(res_combined, file.path(contrast_dir, paste0(contrast_name, "_full_results.csv")), row.names = FALSE, na = "")
write.csv(sig_genes, file.path(contrast_dir, paste0(contrast_name, "_significant_genes.csv")), row.names = FALSE, na = "")
}
6. 火山图和热图
library(ggplot2)
library(ggrepel)
library(pheatmap)
library(dplyr)
create_ggplot_volcano <- function(res_df, contrast_name, top_n = 20) {
volcano_data <- res_df |>
mutate(
significance = case_when(
padj < 0.05 & log2FoldChange > 1 ~ "Up",
padj < 0.05 & log2FoldChange < -1 ~ "Down",
TRUE ~ "Not significant"
),
log10_padj = -log10(ifelse(is.na(padj), 1, padj))
)
top_genes <- volcano_data |>
filter(significance %in% c("Up", "Down")) |>
arrange(padj) |>
head(top_n)
ggplot(volcano_data, aes(x = log2FoldChange, y = log10_padj, color = significance)) +
geom_point(alpha = 0.6, size = 1.5) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "red") +
geom_text_repel(data = top_genes, aes(label = ifelse(!is.na(symbol) & symbol != "", symbol, gene))) +
theme_minimal() +
labs(title = paste("Volcano Plot -", contrast_name))
}
主要输出
- Salmon
quant.sf - gene-level count matrix
- TPM matrix
- DESeq2 full result tables
- significant gene tables
- volcano plots
- normalized counts and TPM heatmaps