当前位置：首页>Linux>ATAC-Seq Linux上游分析 + R语言下游分析完整流程

ATAC-Seq Linux上游分析 + R语言下游分析完整流程

2026-06-30 22:11:36

一文打通从 SRA 原始数据到Peak注释与Motif分析的全流程，具体概括如下：

BWA 比对 → Samtools 过滤 → Sambamba 去重 → Genrich Call Peak → ChIPseeker 注释 → HOMER Motif

# 创建环境（指定 Python 3.10，避免依赖冲突）conda create -n atac python=3.10 -y# 激活环境（后续所有操作都在此环境下）conda activate atac# 添加国内镜像加速conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/conda config --set show_channel_urls yes

# 下载工具conda install -c bioconda sra-tools -y# 质控工具conda install -c bioconda fastqc multiqc -y# 数据清洗（ATAC-seq 推荐 fastp，比 trim_galore 更快）conda install -c bioconda fastp -y# 比对与处理conda install -c bioconda bwa samtools sambamba -y# Peak  calling 与可视化conda install -c bioconda genrich deeptools BAMscale -y# 基因组操作conda install -c bioconda bedtools -y# Motif 分析conda install -c bioconda homer -y# Peak 交集统计conda install -c bioconda intervene -y

# 设置项目根目录变量（避免硬编码，提升可移植性）PROJECT_DIR=~/atac_aramkdir -p ${PROJECT_DIR}/{sra,raw,fastqc,clean,genome,bwa_sam,bwa_bam,call_peak,motif,deeptools}cd ${PROJECT_DIR}

2.2 SRA数据下载

# 准备SRA编号列表cd ${PROJECT_DIR}# 创建样本列表cat > sra_list.txt << 'EOF'SRR5874657SRR5874658SRR5874659EOF

cd ${PROJECT_DIR}/sra# 批量下载prefetch --option-file ../sra_list.txt# 整理：将分散在子文件夹的 .sra 移到一起cat ../sra_list.txt | while read id; do    if [ -f ${id}/${id}.sra ]; then        mv ${id}/${id}.sra ./        rm -rf ${id}    fidone

2.3 SRA转FASTQ

cd ${PROJECT_DIR}/raw# 使用 fasterq-dump（新版 sra-tools，速度提升 5-10 倍）for sra in ../sra/*.sra; do    fasterq-dump --split-3 ${sra} -O ./done# 压缩节省空间（pigz 多线程压缩）pigz -p 4 *.fastqls -lh

2.4 质控与数据清洗

# FastQC原始数据质控cd ${PROJECT_DIR}/rawmkdir -p ../fastqc/raw# ✅ 修正后的循环：正确闭合，使用相对路径cat ../sra_list.txt | while read id; do    fastqc -t 2 -o ../fastqc/raw ${id}_1.fastq.gz ${id}_2.fastq.gzdone# 汇总报告cd ../fastqc/rawmultiqc ./

# Fastp清洗cd ${PROJECT_DIR}/rawmkdir -p ../clean# 批量质控：去除低质量碱基和接头# -f 16: 5'端切除16bp（Tn5接头区域常见低质量）# -t 2:  3'端切除2bp# -L:    禁用长度过滤（ATAC-seq 片段本身较短）cat ../sra_list.txt | while read id; do    fastp \        -i ${id}_1.fastq.gz -I ${id}_2.fastq.gz \        -o ../clean/${id}_1.clean.fq.gz -O ../clean/${id}_2.clean.fq.gz \        -f 16 -t 2 -L \        -j ../clean/${id}_fastp.json \        -h ../clean/${id}_fastp.html \        --thread 4done

# 清洗后质控验证cd ${PROJECT_DIR}/cleanmkdir -p ../fastqc/cleancat ../sra_list.txt | while read id; do    fastqc -t 2 -o ../fastqc/clean ${id}_1.clean.fq.gz ${id}_2.clean.fq.gzdonecd ../fastqc/cleanmultiqc ./

2.6 基因组比对

cd ${PROJECT_DIR}/genome# 下载拟南芥基因组wget http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-51/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz# 解压gunzip Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz# 下载注释文件（后续 ChIPseeker 需要）wget http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-51/plants/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.51.gff3.gzgunzip Arabidopsis_thaliana.TAIR10.51.gff3.gzcd ../clean# 双端比对ls *_val_1.fq.gz | sed 's/_val_1.fq.gz//' | while read prefix; do    hisat2 -p 10 \        -x ../reference/GRCh38_index \        -1 ${prefix}_val_1.fq.gz \        -2 ${prefix}_val_2.fq.gz \        -S ../aligned/${prefix}.sam \        2> ../aligned/${prefix}.align.logdone

cd ${PROJECT_DIR}/genome# 构建索引（约10分钟）bwa index -a bwtsw Arabidopsis_thaliana.TAIR10.dna.toplevel.fa# 验证索引文件ls -lh *.amb *.ann *.bwt *.pac *.sa

cd ${PROJECT_DIR}/clean# 比对并直接输出 SAMcat ../sra_list.txt | while read id; do    bwa mem \        -t 4 \        -v 2 \        ../genome/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa \        ${id}_1.clean.fq.gz ${id}_2.clean.fq.gz \        > ../bwa_sam/${id}.sam 2> ../bwa_sam/${id}.bwa.logdone

2.7 SAM转BAM并建立索引

cd ${PROJECT_DIR}/bwa_sammkdir -p ../bwa_bam/step1_filter# -F 4:  去除未比对 reads# -F 8:  去除 mate 未比对的 reads# -q 10: 比对质量阈值# -b:    输出 BAM 格式cat ../sra_list.txt | while read id; do    samtools view -@ 4 -b -F 4 -F 8 -q 10 \        -o ../bwa_bam/step1_filter/${id}.bam \        ${id}.sam    # 删除 SAM 释放空间    rm ${id}.samdone

2.8 排序与去PCR重复

cd ${PROJECT_DIR}/bwa_bammkdir -p step2_sort step3_rmdupcat ../sra_list.txt | while read id; do    # Step 1: 按坐标排序（去重前置条件！）    samtools sort -@ 4 -O bam \        -o step2_sort/${id}.sort.bam \        step1_filter/${id}.bam    # Step 2: 去除 PCR 重复（-r 表示删除重复）    sambamba markdup -r -t 4 \        step2_sort/${id}.sort.bam \        step3_rmdup/${id}.rmdup.bam    # 建立索引    samtools index step3_rmdup/${id}.rmdup.bamdone

2.9 生成BigWig文件（可视化）

cd ${PROJECT_DIR}/bwa_bam/step3_rmdup# 生成 normalized bigwig（RPM 标准化）cat ../../sra_list.txt | while read id; do    BAMscale scale \        --bam ${id}.rmdup.bam \        --operation strand \        -t 4 \        -o ../../deeptools/done

2.10 样本相关性分析

# 划分基因组窗口cd ${PROJECT_DIR}/genome# 统计染色体长度samtools faidx Arabidopsis_thaliana.TAIR10.dna.toplevel.fa# 提取染色体和长度（拟南芥 1-5 号染色体）awk '{print $1 "\t" $2}' Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai > Arabidopsis.genome# 按 2000bp 划分窗口（ATAC-seq 常用 100-500bp，教学演示用 2000bp）bedtools makewindows -g Arabidopsis.genome -w 2000 > Arabidopsis.windows

# 统计各区reads数cd ${PROJECT_DIR}/bwa_bam/step3_rmdupmultiBamSummary BED-file \    --BED ../../genome/Arabidopsis.windows \    --bamfiles *.rmdup.bam \    --numberOfProcessors 4 \    -out ../../deeptools/readCounts.npz \    --outRawCounts ../../deeptools/readCounts.tab

三、R语言下游分析

3.1 R语言相关性可视化

# 设置工作目录setwd("~/atac_ara/deeptools")# 读取 counts 矩阵atac_counts <- read.table("readCounts.tab", header = TRUE, sep = "\t", stringsAsFactors = FALSE)# 去除前三列（chr, start, end），保留样本列counts_matrix <- atac_counts[, -c(1:3)]colnames(counts_matrix) <- c("SRR5874657", "SRR5874658", "SRR5874659")# 计算相关性cor_matrix <- cor(counts_matrix)print(cor_matrix)# 可视化install.packages("corrplot")  # 如未安装library(corrplot)pdf("sample_correlation.pdf", width = 6, height = 6)corrplot(cor_matrix, method = "color", type = "upper",          addCoef.col = "black", tl.col = "black",         title = "Sample Correlation (ATAC-seq)", mar = c(0,0,2,0))dev.off()

3.2 Call Peak

# 去除细胞器DNAcd ${PROJECT_DIR}/bwa_bam/step3_rmdupmkdir -p ../../call_peak# 只保留 1-5 号染色体（拟南芥核基因组）cat ../../sra_list.txt | while read id; do    samtools view -@ 4 -b ${id}.rmdup.bam 1 2 3 4 5 \        > ../../call_peak/${id}.nuc.bamdone

# 按Read Name排序cd ${PROJECT_DIR}/call_peakcat ../sra_list.txt | while read id; do    samtools sort -n -@ 4 -O bam \        -o ${id}.nuc.sortn.bam \        -T ${id}.tmp \        ${id}.nuc.bamdone

# 单样本Call Peakcd ${PROJECT_DIR}/call_peakcat ../sra_list.txt | while read id; do    Genrich \        -t ${id}.nuc.sortn.bam \        -o ${id}.narrowPeak \        -j \              # ATAC-seq 模式（Tn5 偏移校正）        -y \              # 保留重复 reads（已在 sambamba 去除）        -v \              # 详细输出        -q 0.01           # FDR 阈值done

# 联合Call Peakcd ${PROJECT_DIR}/call_peak# 合并所有样本输入，提高 Peak 检出灵敏度BAMS=$(cat ../sra_list.txt | sed 's/$/.nuc.sortn.bam/' | paste -sd "," -)Genrich \    -t ${BAMS} \    -o all.narrowPeak \    -j -v -q 0.01# 统计 Peak 数目wc -l *.narrowPeak

3.3 Peak注释与可视化

# 设置国内镜像options(repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))options(Bioc_mirror = "https://mirrors.ustc.edu.cn/bioc/")# 安装 BiocManagerif(!require("BiocManager", quietly = TRUE))    install.packages("BiocManager")# 安装 ATAC-seq 分析包BiocManager::install(c("ChIPseeker", "GenomicFeatures", "clusterProfiler"))

library(ChIPseeker)library(GenomicFeatures)# 从 GFF 构建 TxDb 对象ara_TxDb <- makeTxDbFromGFF("~/atac_ara/genome/Arabidopsis_thaliana.TAIR10.51.gff3")# 设置工作目录setwd("~/atac_ara/call_peak")# 读取所有 narrowPeak 文件files <- list.files(pattern = "*.narrowPeak")files_list <- as.list(files)names(files_list) <- gsub(".narrowPeak", "", files)# 批量注释 PeakpeakAnnoList <- lapply(files_list, annotatePeak,                        TxDb = ara_TxDb,                       tssRegion = c(-1000, 1000),                       verbose = FALSE)# 查看注释结果peakAnnoList$all

# 1. Peak 在 TSS 周围的分布plotDistToTSS(peakAnnoList,              ylab = "Opening sites (%) (5'->3')",              title = "Distribution of ATAC peaks relative to TSS")# 2. Peak 基因组分布饼图plotAnnoPie(peakAnnoList$all)# 3. 多样本 TSS 热图（1kb 窗口）peakHeatmap(files_list,             weightCol = "V5",             TxDb = ara_TxDb,            upstream = 1000,             downstream = 1000)# 4. 平均分布谱图promoter <- getPromoters(TxDb = ara_TxDb, upstream = 1000, downstream = 1000)tagMatrixList <- lapply(files_list, getTagMatrix, windows = promoter)plotAvgProf(tagMatrixList,             xlim = c(-1000, 1000),            conf = 0.95,            facet = "row")

3.4 Peak交集与Motif分析

# Peak交集统计cd ${PROJECT_DIR}/call_peak# intervene 绘制 Venn 图（需在当前 conda 环境安装）intervene venn -i *.narrowPeak --save-overlaps# 结果在 ./Intervene_results/ 中

# HOMER Motif富集cd ${PROJECT_DIR}/call_peak# 将 narrowPeak 转为 HOMER 兼容的 bed 格式# 格式：ID chr start end strandawk '{print "Peak_"NR"\t"$1"\t"$2"\t"$3"\t+"}' all.narrowPeak > all.homer.bed# 运行 HOMER（-p 4 线程）findMotifsGenome.pl \    all.homer.bed \    ~/atac_ara/genome/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa \    ../motif/all_motif_enrichment \    -p 4 \    -len 8,10,12 \    -size 200

本文来自网友投稿或网络内容，如有侵犯您的权益请联系我们删除，联系邮箱：wyl860211@qq.com 。

ATAC-Seq Linux上游分析 + R语言下游分析完整流程

最新文章

热门文章

随机文章

ATAC-Seq Linux上游分析 + R语言下游分析完整流程

驱动之路#23:Linux中断(面试必考题)

易于利用的“Pack2TheRoot”Linux漏洞获得Root访问权限

最新文章

热门文章

随机文章