一文打通从 SRA 原始数据到差异表达及功能富集的全流程,具体概括如下:SRA下载(prefetch) → 格式转换(fasterq-dump) → 质控(fastqc multiqc) → 去接头/低质量(trim_galore) → 基因组比对(hisat2) → BAM处理(samtools) → 定量计数(featureCounts) → 差异分析(limma) → 可视化(ggplot2) → 功能富集(clusterProfiler)
# 下载 Miniconda 安装脚本wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh# 执行安装(按提示操作,建议安装到 ~/miniconda3)bash Miniconda3-latest-Linux-x86_64.sh# 初始化 shell(安装后重启终端或执行)source ~/.bashrc# 验证安装conda --version# 创建新环境(指定 Python 3.10,兼容性较好)conda create -n rna python=3.10 -y# 激活环境conda activate rna# 配置国内镜像(加速下载)conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/conda config --set show_channel_urls yes# 安装 SRA 工具包(下载 SRA 数据)conda install -c bioconda sra-tools -y# 安装 FastQC(质控)conda install -c bioconda fastqc -y# 安装 MultiQC(质控报告汇总)conda install -c bioconda multiqc -y# 安装 Trim Galore(去接头和低质量,会自动安装 cutadapt 和 fastqc)conda install -c bioconda trim-galore -y# 安装 Hisat2(比对)conda install -c bioconda hisat2 -y# 安装 Samtools(BAM 处理)conda install -c bioconda samtools -y# 安装 Subread(包含 featureCounts)conda install -c bioconda subread -y# 安装 pigz(多线程压缩,可选但推荐)conda install -c conda-forge pigz -y# 安装 parallel(并行处理,可选)conda install -c conda-forge parallel -y# 检查所有工具版本fastqc --versionmultiqc --versiontrim_galore --versionhisat2 --versionsamtools --versionfeatureCounts -vfasterq-dump --version# 激活 conda 环境(需提前安装好所需软件)conda activate rna# 设置项目名并创建目录结构projectName='GSE50177'mkdir -p ${projectName}/{sra,fastq,reports,clean,aligned,reference}cd ${projectName}
cd sra# 从 NCBI SRA Run Selector 下载 SRR_Acc_List.txt# 链接:https://www.ncbi.nlm.nih.gov/Traces/study/?acc=PRJNA217298# 批量下载(单线程,稳定但慢)cat SRR_Acc_List.txt | while read id; do prefetch ${id}done# 整理文件:将分散在子文件夹的 .sra 文件移出cat SRR_Acc_List.txt | while read id; do mv -f ${id}/${id}.sra ./ rm -rf ${id}done
cd ../fastq# 双端数据拆分(--split-3 会自动识别单端/双端)for sra_file in ../sra/*.sra; do fasterq-dump --split-3 ${sra_file} -O ./done# 压缩节省空间(可选)pigz -p 4 *.fastq
# FastQC 单样本质控ls *.fastq.gz | xargs fastqc -t 4 -O ../reports/# MultiQC 汇总报告cd ../reportsmultiqc ./ -o ./# 查看 multiqc_report.html,重点关注:# - Per base sequence quality(Q30 > 80%)# - Adapter Content(是否有接头残留)
# 双端数据处理cd ../clean# 生成配置文件(两列:R1路径 R2路径)ls ../fastq/*_R1_*.fastq.gz > r1.txtls ../fastq/*_R2_*.fastq.gz > r2.txtpaste r1.txt r2.txt > config.txt# 批量运行 trim_galorecat config.txt | while read r1 r2; do trim_galore \ -q 25 \ # Phred quality cutoff --phred33 \ # 质量值编码格式 --length 36 \ # 丢弃短于36bp的reads --stringency 3 \ # 接头匹配严格度 --paired \ # 双端模式 --cores 4 \ # 线程数(trim_galore v0.6.0+) -o ./ \ ${r1} ${r2}done
# 单端数据处理ls ../fastq/*.fastq.gz | while read fq; do trim_galore -q 25 --phred33 --length 36 -o ./ ${fq}done
cd ../reference# 人类基因组 GRCh38 + GENCODE v46wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/GRCh38.p14.genome.fa.gzgunzip GRCh38.p14.genome.fa.gzwget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/gencode.v46.annotation.gtf.gzgunzip gencode.v46.annotation.gtf.gz# 构建 Hisat2 索引(约30分钟,4线程)hisat2-build -p 4 GRCh38.p14.genome.fa GRCh38_index
cd ../clean# 双端比对ls *_val_1.fq.gz | sed 's/_val_1.fq.gz//' | while read prefix; do hisat2 -p 10 \ -x ../reference/GRCh38_index \ -1 ${prefix}_val_1.fq.gz \ -2 ${prefix}_val_2.fq.gz \ -S ../aligned/${prefix}.sam \ 2> ../aligned/${prefix}.align.logdone
cd ../aligned# 排序并转BAM(按坐标排序,去除SAM节省空间)ls *.sam | while read sam; do base=$(basename ${sam} .sam) samtools sort -O bam -@ 5 -o ${base}.bam ${sam} rm ${sam} # 删除SAM释放空间done# 建立索引(IGV等可视化工具需要)ls *.bam | xargs -P 4 -i samtools index {}# 比对统计ls *.bam | while read bam; do samtools flagstat -@ 2 ${bam} > $(basename ${bam} .bam).flagstatdone# 检查比对率:Unique mapped reads > 70% 为合格grep "mapped (" *.flagstat | head
# 使用 featureCounts 计算 reads 数gtf="../reference/gencode.v46.annotation.gtf"nohup featureCounts \ -T 5 \ # 线程数 -p \ # 双端模式(单端去掉) -t exon \ # 特征类型 -g gene_id \ # 分组依据 -a ${gtf} \ -o all.id.txt \ *.bam \ 1>counts.log 2>&1 &# 输出文件说明:# all.id.txt # 主结果(含注释信息和计数矩阵)# all.id.txt.summary # 汇总统计
# 设置国内镜像加速下载options(repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))options(Bioc_mirror = "https://mirrors.ustc.edu.cn/bioc/")# 安装 BiocManager(Bioconductor 官方安装器)if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")# 定义所需包列表biocPackages <- c( "GEOquery", "limma", # 数据获取与差异分析 "ggplot2", "pheatmap", # 可视化 "clusterProfiler", "org.Hs.eg.db", "enrichplot", # 富集分析 "FactoMineR", "factoextra" # PCA分析)# 批量安装(跳过已安装的包)BiocManager::install(biocPackages, ask = FALSE, update = FALSE)
rm(list = ls())options(stringsAsFactors = FALSE)# 读取 featureCounts 输出(跳过前1行注释)gset <- read.table("all.id.txt", header = TRUE, skip = 1)# 提取 counts 矩阵(第7列后为样本数据,根据实际列数调整)counts_cols <- grep(".bam$", colnames(gset))gene.datExpr <- log2(gset[, counts_cols] + 1)# ENSEMBL ID 去版本号(如 ENSG00000139629.15 → ENSG00000139629)library(stringr)gene.datExpr$ENSEMBL <- str_split_fixed(gset$Geneid, "\\.", 2)[, 1]# 过滤低表达基因(中位数 > 0)gene.datExpr$median <- apply(gene.datExpr[, 1:(ncol(gene.datExpr)-1)], 1, median)gene.datExpr <- gene.datExpr[gene.datExpr$median > 0, ]# 去重复:保留每个 ENSEMBL ID 中表达量最高的记录gene.datExpr <- gene.datExpr[order(gene.datExpr$ENSEMBL, gene.datExpr$median, decreasing = TRUE), ]gene.datExpr <- gene.datExpr[!duplicated(gene.datExpr$ENSEMBL), ]# 设置行名并清理rownames(gene.datExpr) <- gene.datExpr$ENSEMBLgene.datExpr <- gene.datExpr[, 1:(ncol(gene.datExpr)-2)] # 去除辅助列# 定义分组(根据实际实验设计修改)group_list <- c("control", "control", "siSUZ", "siSUZ")save(gene.datExpr, group_list, file = "ready.data.Rdata")
load("ready.data.Rdata")dir.create("export", showWarnings = FALSE)# 层次聚类library(stringr)colnames(gene.datExpr) <- paste(group_list, str_split_fixed(colnames(gene.datExpr), "\\.", 3)[, 1], sep = "_")hc <- hclust(dist(t(gene.datExpr)))pdf("export/hclust.pdf", width = 8, height = 6)plot(as.dendrogram(hc), horiz = TRUE)dev.off()# PCA 分析library(FactoMineR)library(factoextra)dat.pca <- PCA(t(gene.datExpr), graph = FALSE)pdf("export/sample_PCA.pdf", width = 8, height = 6)fviz_pca_ind(dat.pca, geom.ind = "point", col.ind = group_list, addEllipses = TRUE, legend.title = "Groups")dev.off()
library(limma)# 构建设计矩阵(无截距模型)design <- model.matrix(~ 0 + factor(group_list))colnames(design) <- levels(factor(group_list))rownames(design) <- colnames(gene.datExpr)# 构建对比矩阵contrast.matrix <- makeContrasts(siSUZ - control, levels = design)# 线性模型拟合fit <- lmFit(gene.datExpr, design)fit2 <- contrasts.fit(fit, contrast.matrix)fit2 <- eBayes(fit2)# 提取结果nrDEG <- topTable(fit2, coef = 1, number = Inf, sort.by = "P")head(nrDEG)# 保存结果save(nrDEG, file = "data/nrDEG.Rdata")write.table(nrDEG, file = "data/nrDEG.txt", sep = "\t", quote = FALSE)
# 火山图library(ggplot2)# 设定阈值(教学建议固定 |logFC| > 1,而非动态计算)logFC_cutoff <- 1pval_cutoff <- 0.05nrDEG$change <- ifelse( nrDEG$P.Value < pval_cutoff & abs(nrDEG$logFC) > logFC_cutoff, ifelse(nrDEG$logFC > logFC_cutoff, "UP", "DOWN"), "NOT")# 统计差异基因数table(nrDEG$change)this_title <- paste0( "Cutoff: |log2FC| > ", logFC_cutoff, ", P < ", pval_cutoff, "\nUP: ", sum(nrDEG$change == "UP"), " | DOWN: ", sum(nrDEG$change == "DOWN"))ggplot(nrDEG, aes(x = logFC, y = -log10(P.Value), color = change)) + geom_point(alpha = 0.4, size = 1.5) + theme_bw() + scale_color_manual(values = c("DOWN" = "#2E86AB", "NOT" = "grey50", "UP" = "#F24236")) + labs(x = "log2 Fold Change", y = "-log10 P-value", title = this_title) + theme(plot.title = element_text(hjust = 0.5))ggsave("export/volcano.pdf", width = 8, height = 7)
# 热图library(pheatmap)# 筛选差异基因deg_genes <- rownames(nrDEG[nrDEG$change %in% c("UP", "DOWN"), ])choose_matrix <- gene.datExpr[deg_genes, ]# Z-score 标准化并截断极值mat_scaled <- t(scale(t(choose_matrix)))mat_scaled[mat_scaled > 2] <- 2mat_scaled[mat_scaled < -2] <- -2# 样本注释annotation_col <- data.frame(Group = group_list)rownames(annotation_col) <- colnames(gene.datExpr)# 绘制热图pheatmap(mat_scaled, annotation_col = annotation_col, cluster_cols = TRUE, show_rownames = FALSE, show_colnames = FALSE, color = colorRampPalette(c("#2166AC", "white", "#B2182B"))(100), filename = "export/heatmap_DEGs.pdf", width = 8, height = 10)
library(clusterProfiler)library(org.Hs.eg.db)# ID 转换:ENSEMBL → SYMBOL → ENTREZIDdf <- bitr(rownames(nrDEG), fromType = "ENSEMBL", toType = c("SYMBOL", "ENTREZID"), OrgDb = org.Hs.eg.db)nrDEG$ENSEMBL <- rownames(nrDEG)nrDEG <- merge(nrDEG, df, by = "ENSEMBL", all.x = TRUE)# 提取上下调基因(过滤NA)gene_up <- na.omit(nrDEG[nrDEG$change == "UP", "ENTREZID"])gene_down <- na.omit(nrDEG[nrDEG$change == "DOWN", "ENTREZID"])# GO 富集分析(上调基因示例)if (length(gene_up) >= 10) { ego_up <- enrichGO( gene = gene_up, OrgDb = org.Hs.eg.db, ont = "ALL", # BP/MF/CC 三合一 pAdjustMethod = "BH", pvalueCutoff = 0.05, qvalueCutoff = 0.2, readable = TRUE ) # 条形图 pdf("export/GO_UP_barplot.pdf", width = 10, height = 8) print(barplot(ego_up, split = "ONTOLOGY") + facet_grid(ONTOLOGY ~ ., scales = "free")) dev.off()}# KEGG 富集(需联网)if (length(gene_up) >= 10) { kk_up <- enrichKEGG( gene = gene_up, organism = "hsa", # hsa = human pvalueCutoff = 0.05 ) dotplot(kk_up) ggsave("export/KEGG_UP_dotplot.pdf")}