一文打通从 SRA 原始数据到Peak注释与Motif分析的全流程,具体概括如下:BWA 比对 → Samtools 过滤 → Sambamba 去重 → Genrich Call Peak → ChIPseeker 注释 → HOMER Motif
# 创建环境(指定 Python 3.10,避免依赖冲突)conda create -n atac python=3.10 -y# 激活环境(后续所有操作都在此环境下)conda activate atac# 添加国内镜像加速conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/conda config --set show_channel_urls yes
# 下载工具conda install -c bioconda sra-tools -y# 质控工具conda install -c bioconda fastqc multiqc -y# 数据清洗(ATAC-seq 推荐 fastp,比 trim_galore 更快)conda install -c bioconda fastp -y# 比对与处理conda install -c bioconda bwa samtools sambamba -y# Peak calling 与可视化conda install -c bioconda genrich deeptools BAMscale -y# 基因组操作conda install -c bioconda bedtools -y# Motif 分析conda install -c bioconda homer -y# Peak 交集统计conda install -c bioconda intervene -y
# 设置项目根目录变量(避免硬编码,提升可移植性)PROJECT_DIR=~/atac_aramkdir -p ${PROJECT_DIR}/{sra,raw,fastqc,clean,genome,bwa_sam,bwa_bam,call_peak,motif,deeptools}cd ${PROJECT_DIR}
# 准备SRA编号列表cd ${PROJECT_DIR}# 创建样本列表cat > sra_list.txt << 'EOF'SRR5874657SRR5874658SRR5874659EOF
cd ${PROJECT_DIR}/sra# 批量下载prefetch --option-file ../sra_list.txt# 整理:将分散在子文件夹的 .sra 移到一起cat ../sra_list.txt | while read id; do if [ -f ${id}/${id}.sra ]; then mv ${id}/${id}.sra ./ rm -rf ${id} fidone
cd ${PROJECT_DIR}/raw# 使用 fasterq-dump(新版 sra-tools,速度提升 5-10 倍)for sra in ../sra/*.sra; do fasterq-dump --split-3 ${sra} -O ./done# 压缩节省空间(pigz 多线程压缩)pigz -p 4 *.fastqls -lh
# FastQC原始数据质控cd ${PROJECT_DIR}/rawmkdir -p ../fastqc/raw# ✅ 修正后的循环:正确闭合,使用相对路径cat ../sra_list.txt | while read id; do fastqc -t 2 -o ../fastqc/raw ${id}_1.fastq.gz ${id}_2.fastq.gzdone# 汇总报告cd ../fastqc/rawmultiqc ./
# Fastp清洗cd ${PROJECT_DIR}/rawmkdir -p ../clean# 批量质控:去除低质量碱基和接头# -f 16: 5'端切除16bp(Tn5接头区域常见低质量)# -t 2: 3'端切除2bp# -L: 禁用长度过滤(ATAC-seq 片段本身较短)cat ../sra_list.txt | while read id; do fastp \ -i ${id}_1.fastq.gz -I ${id}_2.fastq.gz \ -o ../clean/${id}_1.clean.fq.gz -O ../clean/${id}_2.clean.fq.gz \ -f 16 -t 2 -L \ -j ../clean/${id}_fastp.json \ -h ../clean/${id}_fastp.html \ --thread 4done
# 清洗后质控验证cd ${PROJECT_DIR}/cleanmkdir -p ../fastqc/cleancat ../sra_list.txt | while read id; do fastqc -t 2 -o ../fastqc/clean ${id}_1.clean.fq.gz ${id}_2.clean.fq.gzdonecd ../fastqc/cleanmultiqc ./
cd ${PROJECT_DIR}/genome# 下载拟南芥基因组wget http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-51/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz# 解压gunzip Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz# 下载注释文件(后续 ChIPseeker 需要)wget http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-51/plants/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.51.gff3.gzgunzip Arabidopsis_thaliana.TAIR10.51.gff3.gzcd ../clean# 双端比对ls *_val_1.fq.gz | sed 's/_val_1.fq.gz//' | while read prefix; do hisat2 -p 10 \ -x ../reference/GRCh38_index \ -1 ${prefix}_val_1.fq.gz \ -2 ${prefix}_val_2.fq.gz \ -S ../aligned/${prefix}.sam \ 2> ../aligned/${prefix}.align.logdone
cd ${PROJECT_DIR}/genome# 构建索引(约10分钟)bwa index -a bwtsw Arabidopsis_thaliana.TAIR10.dna.toplevel.fa# 验证索引文件ls -lh *.amb *.ann *.bwt *.pac *.sa
cd ${PROJECT_DIR}/clean# 比对并直接输出 SAMcat ../sra_list.txt | while read id; do bwa mem \ -t 4 \ -v 2 \ ../genome/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa \ ${id}_1.clean.fq.gz ${id}_2.clean.fq.gz \ > ../bwa_sam/${id}.sam 2> ../bwa_sam/${id}.bwa.logdone
cd ${PROJECT_DIR}/bwa_sammkdir -p ../bwa_bam/step1_filter# -F 4: 去除未比对 reads# -F 8: 去除 mate 未比对的 reads# -q 10: 比对质量阈值# -b: 输出 BAM 格式cat ../sra_list.txt | while read id; do samtools view -@ 4 -b -F 4 -F 8 -q 10 \ -o ../bwa_bam/step1_filter/${id}.bam \ ${id}.sam # 删除 SAM 释放空间 rm ${id}.samdone
cd ${PROJECT_DIR}/bwa_bammkdir -p step2_sort step3_rmdupcat ../sra_list.txt | while read id; do # Step 1: 按坐标排序(去重前置条件!) samtools sort -@ 4 -O bam \ -o step2_sort/${id}.sort.bam \ step1_filter/${id}.bam # Step 2: 去除 PCR 重复(-r 表示删除重复) sambamba markdup -r -t 4 \ step2_sort/${id}.sort.bam \ step3_rmdup/${id}.rmdup.bam # 建立索引 samtools index step3_rmdup/${id}.rmdup.bamdone
cd ${PROJECT_DIR}/bwa_bam/step3_rmdup# 生成 normalized bigwig(RPM 标准化)cat ../../sra_list.txt | while read id; do BAMscale scale \ --bam ${id}.rmdup.bam \ --operation strand \ -t 4 \ -o ../../deeptools/done
# 划分基因组窗口cd ${PROJECT_DIR}/genome# 统计染色体长度samtools faidx Arabidopsis_thaliana.TAIR10.dna.toplevel.fa# 提取染色体和长度(拟南芥 1-5 号染色体)awk '{print $1 "\t" $2}' Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai > Arabidopsis.genome# 按 2000bp 划分窗口(ATAC-seq 常用 100-500bp,教学演示用 2000bp)bedtools makewindows -g Arabidopsis.genome -w 2000 > Arabidopsis.windows
# 统计各区reads数cd ${PROJECT_DIR}/bwa_bam/step3_rmdupmultiBamSummary BED-file \ --BED ../../genome/Arabidopsis.windows \ --bamfiles *.rmdup.bam \ --numberOfProcessors 4 \ -out ../../deeptools/readCounts.npz \ --outRawCounts ../../deeptools/readCounts.tab
# 设置工作目录setwd("~/atac_ara/deeptools")# 读取 counts 矩阵atac_counts <- read.table("readCounts.tab", header = TRUE, sep = "\t", stringsAsFactors = FALSE)# 去除前三列(chr, start, end),保留样本列counts_matrix <- atac_counts[, -c(1:3)]colnames(counts_matrix) <- c("SRR5874657", "SRR5874658", "SRR5874659")# 计算相关性cor_matrix <- cor(counts_matrix)print(cor_matrix)# 可视化install.packages("corrplot") # 如未安装library(corrplot)pdf("sample_correlation.pdf", width = 6, height = 6)corrplot(cor_matrix, method = "color", type = "upper", addCoef.col = "black", tl.col = "black", title = "Sample Correlation (ATAC-seq)", mar = c(0,0,2,0))dev.off()
# 去除细胞器DNAcd ${PROJECT_DIR}/bwa_bam/step3_rmdupmkdir -p ../../call_peak# 只保留 1-5 号染色体(拟南芥核基因组)cat ../../sra_list.txt | while read id; do samtools view -@ 4 -b ${id}.rmdup.bam 1 2 3 4 5 \ > ../../call_peak/${id}.nuc.bamdone
# 按Read Name排序cd ${PROJECT_DIR}/call_peakcat ../sra_list.txt | while read id; do samtools sort -n -@ 4 -O bam \ -o ${id}.nuc.sortn.bam \ -T ${id}.tmp \ ${id}.nuc.bamdone
# 单样本Call Peakcd ${PROJECT_DIR}/call_peakcat ../sra_list.txt | while read id; do Genrich \ -t ${id}.nuc.sortn.bam \ -o ${id}.narrowPeak \ -j \ # ATAC-seq 模式(Tn5 偏移校正) -y \ # 保留重复 reads(已在 sambamba 去除) -v \ # 详细输出 -q 0.01 # FDR 阈值done
# 联合Call Peakcd ${PROJECT_DIR}/call_peak# 合并所有样本输入,提高 Peak 检出灵敏度BAMS=$(cat ../sra_list.txt | sed 's/$/.nuc.sortn.bam/' | paste -sd "," -)Genrich \ -t ${BAMS} \ -o all.narrowPeak \ -j -v -q 0.01# 统计 Peak 数目wc -l *.narrowPeak
# 设置国内镜像options(repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))options(Bioc_mirror = "https://mirrors.ustc.edu.cn/bioc/")# 安装 BiocManagerif(!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")# 安装 ATAC-seq 分析包BiocManager::install(c("ChIPseeker", "GenomicFeatures", "clusterProfiler"))
library(ChIPseeker)library(GenomicFeatures)# 从 GFF 构建 TxDb 对象ara_TxDb <- makeTxDbFromGFF("~/atac_ara/genome/Arabidopsis_thaliana.TAIR10.51.gff3")# 设置工作目录setwd("~/atac_ara/call_peak")# 读取所有 narrowPeak 文件files <- list.files(pattern = "*.narrowPeak")files_list <- as.list(files)names(files_list) <- gsub(".narrowPeak", "", files)# 批量注释 PeakpeakAnnoList <- lapply(files_list, annotatePeak, TxDb = ara_TxDb, tssRegion = c(-1000, 1000), verbose = FALSE)# 查看注释结果peakAnnoList$all
# 1. Peak 在 TSS 周围的分布plotDistToTSS(peakAnnoList, ylab = "Opening sites (%) (5'->3')", title = "Distribution of ATAC peaks relative to TSS")# 2. Peak 基因组分布饼图plotAnnoPie(peakAnnoList$all)# 3. 多样本 TSS 热图(1kb 窗口)peakHeatmap(files_list, weightCol = "V5", TxDb = ara_TxDb, upstream = 1000, downstream = 1000)# 4. 平均分布谱图promoter <- getPromoters(TxDb = ara_TxDb, upstream = 1000, downstream = 1000)tagMatrixList <- lapply(files_list, getTagMatrix, windows = promoter)plotAvgProf(tagMatrixList, xlim = c(-1000, 1000), conf = 0.95, facet = "row")
# Peak交集统计cd ${PROJECT_DIR}/call_peak# intervene 绘制 Venn 图(需在当前 conda 环境安装)intervene venn -i *.narrowPeak --save-overlaps# 结果在 ./Intervene_results/ 中
# HOMER Motif富集cd ${PROJECT_DIR}/call_peak# 将 narrowPeak 转为 HOMER 兼容的 bed 格式# 格式:ID chr start end strandawk '{print "Peak_"NR"\t"$1"\t"$2"\t"$3"\t+"}' all.narrowPeak > all.homer.bed# 运行 HOMER(-p 4 线程)findMotifsGenome.pl \ all.homer.bed \ ~/atac_ara/genome/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa \ ../motif/all_motif_enrichment \ -p 4 \ -len 8,10,12 \ -size 200