← 返回分析流程中心创建时间 2026/6/3 分析难度 入门 推荐场景 转录组分析 预计耗时 0.5-1 天
Pipeline Detail
RNA-Seq转录组基础与表达变化
RNA-seq STAR + featureCounts + DESeq2 + TPM
基于 STAR 基因组比对、featureCounts 基因计数、DESeq2 差异表达和 TPM/标准化计数整理的 RNA-seq 流程。
Metadata
流程元数据
先看应用场景、输入输出和工具依赖,再进入正文命令细节。
Difficulty
入门
Scenario
转录组分析
Estimated Time
0.5-1 天
Tools
DESeq2STARfeatureCounts
Inputs
FASTQBAMGTFcount matrixTPM
Outputs
heatmapvolcano plot
Workflow DAG
流程图
用步骤节点快速理解这个分析从原始数据到结果报告的流转关系。
STEP 1
→FastQC/MultiQC 质控
STEP 2
→Trimmomatic 裁剪
STEP 3
→STAR 构建基因组索引
STEP 4
→STAR 比对
STEP 5
→比对率汇总
STEP 6
→featureCounts 定量
STEP 7
→DESeq2 差异分析
STEP 8
PCA/火山图/热图
Protocol
流程文档
正文保留 Markdown 排版、代码语言标识和表格样式,适合边学边复现。
RNA-seq STAR + featureCounts + DESeq2 + TPM
适用场景
适用于需要基因组坐标级比对结果、后续可能查看 BAM、比对率、基因计数矩阵和标准 DESeq2 差异表达分析的 bulk RNA-seq 项目。该流程对剪接 reads、基因组定位、featureCounts 计数和样本层级 QC 更友好。
样本包括 adc1-1、adc1-2、adc1-3、adc2-1、adc2-2、adc2-3、WT-1、WT-2、WT-3。
1. FastQC 质控与 Trimmomatic 裁剪
bsub -J fastqc -n 1 -R "span[hosts=1]" -o %J.out -e %J.err -q normal "fastqc *.fq.gz"
bsub -J fastqc_wt -n 1 -R "span[hosts=1]" -o %J.out -e %J.err -q normal "fastqc WT-*"
multiqc .
for sample in WT-1 WT-2 WT-3 adc1-1 adc1-2 adc1-3 adc2-1 adc2-2 adc2-3
do
java -jar $EBROOTTRIMMOMATIC/trimmomatic-0.32.jar PE -threads 10 -phred33 01_raw_data/${sample}_R1.fq.gz 01_raw_data/${sample}_R2.fq.gz 02_clean_data/${sample}_R1_paired.fq.gz 02_clean_data/${sample}_R1_unpaired.fq.gz 02_clean_data/${sample}_R2_paired.fq.gz 02_clean_data/${sample}_R2_unpaired.fq.gz HEADCROP:15
done
2. STAR 构建基因组索引
bsub -J gunzipgtf -n 1 -R "span[hosts=1]" -o %J.out -e %J.err -q normal "gunzip Arabidopsis_thaliana.TAIR10.57.gtf.gz"
STAR --runThreadN 8 --runMode genomeGenerate --genomeDir ./ref_genomic --genomeFastaFiles ./ref_genomic/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz --sjdbGTFfile ./ref_genomic/Arabidopsis_thaliana.TAIR10.57.gtf.gz --sjdbOverhang 134 --genomeSAindexNbases 12 --genomeChrBinNbits 16 > logs/02_star_index.log 2>&1
3. STAR 比对与 GeneCounts
for sample in adc1-1 adc1-2 adc1-3 adc2-1 adc2-2 adc2-3 WT-1 WT-2 WT-3
do
STAR --runThreadN 12 --genomeDir ref_genomic --readFilesIn 02_clean_data/${sample}_R1_paired.fq.gz 02_clean_data/${sample}_R2_paired.fq.gz --readFilesCommand zcat --outFileNamePrefix 03_aligment_star/${sample}_ --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax 10 --outFilterMismatchNoverLmax 0.03 --alignIntronMax 5000 --alignMatesGapMax 5000 --quantMode GeneCounts
done
bsub -J star -n 12 -R "span[hosts=1]" -o %J.out -e %J.err -q normal "bash Star_alignment.sh"
4. 比对结果汇总
echo "样本,总reads,reads平均长度,唯一比对率,多比对率" > alignment_summary.csv
for sample in adc1-1 adc1-2 adc1-3 adc2-1 adc2-2 adc2-3 WT-1 WT-2 WT-3
do
total=$(grep "Number of input reads" ${sample}_Log.final.out | awk '{print $6}')
avg_length=$(grep "Average input read length" ${sample}_Log.final.out | awk '{print $6}')
unique=$(grep "Uniquely mapped reads %" ${sample}_Log.final.out | awk '{print $6}' | sed 's/%//')
multi=$(grep "% of reads mapped to multiple loci" ${sample}_Log.final.out | awk '{print $9}' | sed 's/%//')
echo "${sample},${total},${avg_length},${unique},${multi}" >> alignment_summary.csv
done
5. featureCounts 基因定量
GTF_FILE="./ref_genomic/Arabidopsis_thaliana.TAIR10.57.gtf"
THREADS=12
featureCounts -T ${THREADS} -p -B -C -t exon -g gene_id -a ${GTF_FILE} -o 04_featurecounts/featurecounts_gene_counts.txt 03_aligment_star/*.bam
bsub -J featureCounts -n 12 -R "span[hosts=1]" -o %J.out -e %J.err -q normal "bash featureCounts.sh"
cut -f1,7- 04_featurecounts/featurecounts_gene_counts.txt > 04_featurecounts/count_matrix.txt
6. DESeq2 差异表达分析
library(DESeq2)
library(ggplot2)
library(pheatmap)
library(dplyr)
library(RColorBrewer)
library(ggrepel)
library(org.At.tair.db)
count_data <- read.table("count_matrix.txt", header = FALSE, row.names = 1, sep = " ")
sample_names <- c("adc1-1", "adc1-2", "adc1-3", "adc2-1", "adc2-2", "adc2-3", "WT-1", "WT-2", "WT-3")
colnames(count_data) <- sample_names
sample_info <- data.frame(
sample = sample_names,
condition = factor(c(rep("adc1", 3), rep("adc2", 3), rep("WT", 3))),
replicate = factor(rep(1:3, 3)),
row.names = sample_names
)
dds <- DESeqDataSetFromMatrix(
countData = as.matrix(count_data),
colData = sample_info,
design = ~ condition
)
keep <- rowSums(counts(dds) >= 10) >= 3
dds <- dds[keep, ]
dds <- DESeq(dds)
normalized_counts <- counts(dds, normalized = TRUE)
7. 比较组、注释与结果表
contrasts <- list(
adc1_vs_WT = c("condition", "adc1", "WT"),
adc2_vs_WT = c("condition", "adc2", "WT"),
adc2_vs_adc1 = c("condition", "adc2", "adc1")
)
add_gene_annotation <- function(gene_df) {
gene_ids <- gene_df$gene
gene_df$symbol <- mapIds(org.At.tair.db, keys = gene_ids, column = "SYMBOL", keytype = "TAIR", multiVals = "first")[gene_ids]
gene_df$description <- mapIds(org.At.tair.db, keys = gene_ids, column = "GENENAME", keytype = "TAIR", multiVals = "first")[gene_ids]
gene_df
}
dir.create("deseq2_results/tables", showWarnings = FALSE, recursive = TRUE)
results_list <- list()
for (contrast_name in names(contrasts)) {
contrast_dir <- file.path("deseq2_results/tables", contrast_name)
dir.create(contrast_dir, showWarnings = FALSE)
res <- results(dds, contrast = contrasts[[contrast_name]], alpha = 0.05, lfcThreshold = 1)
resLFC <- lfcShrink(dds, contrast = contrasts[[contrast_name]], type = "ashr", res = res)
res_df <- as.data.frame(resLFC)
res_df$gene <- rownames(res_df)
res_df <- add_gene_annotation(res_df)
norm_counts_df <- as.data.frame(normalized_counts)
norm_counts_df$gene <- rownames(norm_counts_df)
res_final <- merge(res_df, norm_counts_df, by = "gene", all.x = TRUE)
sig_genes <- res_final |>
filter(padj < 0.05 & abs(log2FoldChange) > 1) |>
arrange(padj, desc(abs(log2FoldChange)))
write.csv(res_final, file.path(contrast_dir, paste0(contrast_name, "_full_results.csv")), row.names = FALSE, na = "")
write.csv(sig_genes, file.path(contrast_dir, paste0(contrast_name, "_significant_genes.csv")), row.names = FALSE, na = "")
results_list[[contrast_name]] <- list(full_results = res_final, significant = sig_genes)
}
8. PCA、火山图和样本距离热图
dir.create("deseq2_results/plots", showWarnings = FALSE, recursive = TRUE)
vsd <- vst(dds, blind = FALSE)
pcaData <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
pca_plot <- ggplot(pcaData, aes(PC1, PC2, color = condition, label = name)) +
geom_point(size = 4) +
geom_text_repel(size = 3, max.overlaps = 20) +
scale_color_brewer(palette = "Set1") +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
ggtitle("Principal Component Analysis") +
theme_minimal()
ggsave("deseq2_results/plots/PCA_plot.png", pca_plot, width = 10, height = 8, dpi = 300)
sampleDists <- dist(t(assay(vsd)))
pheatmap(
as.matrix(sampleDists),
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
annotation_col = data.frame(Condition = sample_info$condition, row.names = rownames(sample_info)),
main = "Sample Distance Matrix"
)
可选:GO 富集分析
library(clusterProfiler)
library(org.At.tair.db)
sig_genes <- results_list$adc1_vs_WT$significant$gene
ego <- enrichGO(
gene = sig_genes,
OrgDb = org.At.tair.db,
keyType = "TAIR",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05
)
主要输出
- STAR sorted BAM
- STAR
Log.final.out比对率汇总 - featureCounts gene count matrix
- DESeq2 full result tables
- significant gene tables
- normalized counts
- PCA plot
- volcano plots
- sample distance heatmap