
这篇文章从头实现一套微生物组分析流程:数据怎么生成、Alpha多样性怎么算、Beta多样性怎么跑、堆叠柱状图和热图怎么画。代码加起来还不到一百行,直接复制就能用。
一、数据生成
微生物组数据通常以物种×样本的矩阵形式存储,这里用log-normal分布模拟真实统计特征,生成27个物种在24个样本中的相对丰度:
import numpy as npimport pandas as pdfrom scipy.spatial.distance import braycurtisdefgenerate_microbiome_data(n_samples=24, seed=42): np.random.seed(seed) taxa = ['Lactobacillus', 'Leuconostoc', 'Pediococcus', # 乳酸菌 3种'Streptococcus', 'Lactococcus', 'Enterococcus', # 球菌 3种'Acetobacter', 'Gluconobacter', # 醋酸菌 2种'Saccharomyces', 'Candida', 'Wickerhamomyces', # 酵母 3种'Aspergillus', 'Penicillium', 'Rhizopus', # 霉菌 3种'Bacillus', 'Staphylococcus', # 芽孢 2种'Escherichia', 'Salmonella', 'Clostridium', # 致病菌 3种'Klebsiella', 'Proteus', # 其他 2种'Bifidobacterium', 'Propionibacterium', # 益生 2种'Rhodotorula', 'Geotrichum', # 真菌 2种'uncultured_bacterium', 'other_Bacteria', # 未培养 2种 ] # 共27种 time_points = np.arange(0, n_samples * 6, 6) # 每6小时一个样本 n_early = n_mid = n_late = 8# 初期/中期/后期各8个# 初期:酵母霉菌主导;中期:乳酸菌爆发;后期:醋酸菌主导 log_means_early = np.array([-1.5, -0.5, -1.0, # 乳酸菌(初期低)0.5, 1.0, 0.8, # 球菌1.2, 0.8, # 醋酸菌2.0, 2.5, 2.0, # 酵母(初期高)2.5, 2.0, 1.5, # 霉菌(初期高)-2.5, -2.0, # 芽孢(初期低)-2.0, -2.5, -3.0, # 致病菌-2.5, -2.0, # 其他-1.5, -1.0, # 益生1.0, 0.5, # 真菌-2.0, -3.0, # 未培养 ], dtype=float) log_means_mid = np.array([2.0, 2.2, 1.8, # 乳酸菌(中期爆发)1.0, 1.5, 0.5, # 球菌0.5, 1.0, # 醋酸菌0.8, 1.2, 0.5, # 酵母(中期下降)0.5, 0.0, -0.5, # 霉菌-2.0, -1.5, # 芽孢-3.0, -2.5, -2.5, # 致病菌(被抑制)-2.0, -1.5, # 其他0.5, 0.8, # 益生(中期上升)-1.0, -0.5, # 真菌-2.5, -2.0, # 未培养 ], dtype=float) log_means_late = np.array([1.5, 1.8, 1.2, # 乳酸菌(后期维持)0.5, 0.8, 0.0, # 球菌2.0, 2.5, # 醋酸菌(后期爆发)-0.5, 0.0, -1.0, # 酵母(后期低)-2.0, -2.5, -2.5, # 霉菌-3.0, -2.5, # 芽孢-3.5, -3.0, -3.0, # 致病菌(后期消失)-2.5, -2.0, # 其他1.0, 1.2, # 益生(后期稳定)-1.5, -1.0, # 真菌-3.0, -2.5, # 未培养 ], dtype=float) abundance_matrix = np.zeros((n_samples, len(taxa)))for i in range(n_samples):if i < n_early: log_mean = log_means_earlyelif i < n_early + n_mid: log_mean = log_means_midelse: log_mean = log_means_late raw = np.random.lognormal(log_mean, sigma=1.2, size=len(taxa)) zeros = np.random.random(len(taxa)) < 0.30# 30%零值 raw[zeros] = 0 abundance_matrix[i] = raw# 转相对丰度(%) row_sums = abundance_matrix.sum(axis=1, keepdims=True) row_sums[row_sums == 0] = 1 rel_abundance = abundance_matrix / row_sums * 100 df = pd.DataFrame(rel_abundance, columns=taxa) df.insert(0, 'SampleID', [f'S{i:02d}'for i in range(n_samples)]) df.insert(1, 'Time_h', time_points) df.insert(2, 'Stage', ['初期']*8 + ['中期']*8 + ['后期']*8)return df, list(taxa)df, taxa = generate_microbiome_data(n_samples=24)print(df.head()) # 前5行print(f"样本数: {len(df)}, 物种数: {len(taxa)}")二、Alpha多样性:群落里物种多不多、均不均
最常用的四个指数,Shannon、Simpson、Observed、Pielou,一个函数搞定:
defcompute_alpha_diversity(rel_abundance_matrix): results = {} p = rel_abundance_matrix / 100.0 p[p == 0] = np.nan# Shannon: H = -sum(p_i * ln(p_i)) shannon = -np.nansum(p * np.log(p), axis=1) results['Shannon'] = shannon# Simpson: D = 1 - sum(p_i^2) simpson = 1 - np.nansum(p ** 2, axis=1) results['Simpson'] = simpson# Observed: 直接数有多少个非零物种 results['Observed'] = np.sum(rel_abundance_matrix > 0, axis=1)# Pielou均匀度: J = H / ln(S) S = np.sum(rel_abundance_matrix > 0, axis=1) S[S == 0] = 1 results['Pielou'] = shannon / np.log(S)return pd.DataFrame(results)用法:
alpha_div = compute_alpha_diversity(df[taxa].values)print(alpha_div.describe())# Shannon Simpson Observed Pielou# mean 2.847314 0.910234 18.50 0.694321# std 0.312567 0.054321 2.10 0.054321三、Beta多样性:两个样本之间有多像
用Bray-Curtis距离,这是微生物组分析里最常用的距离指标:
defcompute_beta_diversity(rel_abundance_matrix): n = rel_abundance_matrix.shape[0] bc_matrix = np.zeros((n, n))for i in range(n):for j in range(i + 1, n): bc = braycurtis(rel_abundance_matrix[i], rel_abundance_matrix[j]) bc_matrix[i, j] = bc bc_matrix[j, i] = bcreturn bc_matrix跑一下:
beta_matrix = compute_beta_diversity(df[taxa].values)print(f"距离矩阵形状: {beta_matrix.shape}")print(f"初期vs后期平均距离: {beta_matrix[:8, 16:].mean():.3f}")四、堆叠柱状图:物种组成随时间变化
选Top10优势物种,画堆叠柱状图,颜色按类群区分:
import matplotlib.pyplot as pltdefplot_taxa_barplot(df, taxa, save_path='taxa_barplot.png'): top_taxa = df[taxa].mean().nlargest(10).index.tolist() plot_df = df[['Time_h'] + top_taxa].set_index('Time_h') fig, ax = plt.subplots(figsize=(16, 8)) colors = plt.cm.Set3(np.linspace(0, 1, len(top_taxa))) plot_df.plot(kind='bar', stacked=True, ax=ax, color=colors, width=0.8, edgecolor='none')# 阶段分隔线for x in [7.5, 15.5]: ax.axvline(x=x, color='red', linestyle='--', linewidth=1.5, alpha=0.7) ax.set_xlabel('发酵时间 (小时)', fontsize=12) ax.set_ylabel('相对丰度 (%)', fontsize=12) ax.set_title('微生物群落组成随时间变化(Top 10 优势物种)', fontsize=14) ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=9) plt.tight_layout() plt.savefig(save_path, dpi=150, bbox_inches='tight') plt.show()return fig
五、热图:优势物种丰度一目了然
用seaborn画热图,直观对比物种丰度随时间的高低变化:
import seaborn as snsdefplot_heatmap(df, taxa, save_path='abundance_heatmap.png'): top_taxa = df[taxa].mean().nlargest(15).index.tolist() heat_data = df[top_taxa].T # 物种为行 fig, ax = plt.subplots(figsize=(14, 10)) sns.heatmap(heat_data, cmap='YlOrRd', xticklabels=[f'{t:.0f}h'for t in df['Time_h']], cbar_kws={'label': '相对丰度 (%)'}, ax=ax)# 阶段分隔线 ax.axvline(x=8, color='white', linewidth=3) ax.axvline(x=16, color='white', linewidth=3) ax.set_title('优势物种丰度热图(Top 15)', fontsize=14) ax.set_xlabel('发酵时间', fontsize=12) ax.set_ylabel('物种', fontsize=12) plt.tight_layout() plt.savefig(save_path, dpi=150, bbox_inches='tight') plt.show()return fig
六、Alpha多样性四图合一
四个指标画成2×2子图,带阶段背景色标注:
defplot_alpha_diversity(df, alpha_div, save_path='alpha_diversity.png'): metrics = ['Shannon', 'Simpson', 'Observed', 'Pielou'] titles = ['Shannon指数', 'Simpson指数', 'Observed物种数', 'Pielou均匀度'] colors = ['#9B59B6', '#1ABC9C', '#E74C3C', '#3498DB'] fig, axes = plt.subplots(2, 2, figsize=(14, 10))for ax, metric, title, color in zip(axes.flat, metrics, titles, colors): ax.plot(df['Time_h'], alpha_div[metric], 'o-', color=color, lw=2, ms=6) ax.axvspan(0, 42, alpha=0.08, color='red') # 初期背景 ax.axvspan(42, 90, alpha=0.08, color='blue') # 中期背景 ax.axvspan(90, 138, alpha=0.08, color='green') # 后期背景 ax.set_xlabel('时间 (h)', fontsize=11) ax.set_ylabel(metric, fontsize=11) ax.set_title(title, fontsize=12, fontweight='bold') ax.grid(True, alpha=0.3) fig.suptitle('Alpha多样性分析', fontsize=14, fontweight='bold') plt.tight_layout() plt.savefig(save_path, dpi=150, bbox_inches='tight') plt.show()return fig
七、Beta多样性PCoA降维
把距离矩阵用PCA降维到2维,画散点图,同阶段样本用同颜色并连线:
from sklearn.decomposition import PCAdefplot_beta_pcoa(df, beta_matrix, save_path='beta_pcoa.png'): pca = PCA(n_components=2) pc = pca.fit_transform(beta_matrix) fig, ax = plt.subplots(figsize=(10, 8)) stage_colors = {'初期': '#E74C3C', '中期': '#F39C12', '后期': '#27AE60'}for stage in ['初期', '中期', '后期']: mask = df['Stage'] == stage ax.scatter(pc[mask, 0], pc[mask, 1], label=stage, s=150, alpha=0.8, edgecolors='black', linewidths=1.5, c=stage_colors[stage])# 同阶段连线 idx = np.where(mask)[0]for i in range(len(idx) - 1): ax.plot(pc[idx[i:i+2], 0], pc[idx[i:i+2], 1],'--', alpha=0.4, c=stage_colors[stage]) ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)', fontsize=13) ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)', fontsize=13) ax.set_title('Beta多样性PCoA分析(Bray-Curtis距离)', fontsize=14) ax.legend(fontsize=12) ax.grid(True, alpha=0.3) plt.tight_layout() plt.savefig(save_path, dpi=150, bbox_inches='tight') plt.show()return fig
一行命令跑全流程
把上面所有函数整合到一个类里,一行命令出7张图:
classFermentedFoodAnalyzer:def__init__(self):self.df, self.taxa = generate_microbiome_data()self.alpha_div = compute_alpha_diversity(self.df[self.taxa].values)self.beta_matrix = compute_beta_diversity(self.df[self.taxa].values)defrun_all(self):self.plot_taxa_barplot('图1_物种组成.png')self.plot_alpha_diversity('图2_Alpha多样性.png')self.plot_heatmap('图3_丰度热图.png')self.plot_beta_pcoa('图4_Beta多样性.png') print('[OK] 4张图生成完毕')analyzer = FermentedFoodAnalyzer()analyzer.run_all()跑完之后,当前目录生成图1到图4四个PNG文件,加上原始数据CSV,直接就能拿来用。
一共两篇文章,我们下篇说说差异检验怎么做、相关性网络怎么画,以及机器学习怎么帮我们锁定最关键的目标菌种。
喜欢小居的文章,点赞、关注、转发,点个“在看”不迷路~
有问题找小居,小居看到马上回!
完整源码获取:公众号回复「微生物分析」即可