知道乳酸菌重要,但具体哪个阶段哪几种菌的影响力最大?
差异检验找出来,网络分析看清楚,用随机森林直接进行排名!

这里我们全部用Python实现,代码直接可跑。
一、差异丰度检验:哪些菌在三阶段之间真的变了
拿到微生物组数据,第一个问题是:每个菌在初期、中期、后期之间的差异,统计上显著吗?
用Mann-Whitney U检验,不需要数据服从正态分布,特别适合微生物组这种偏态数据:
from scipy import statsdefdifferential_abundance_test(df, taxa, stage_col='Stage'): early = df[df[stage_col] == '初期'][taxa].values mid = df[df[stage_col] == '中期'][taxa].values late = df[df[stage_col] == '后期'][taxa].values results = []for i, taxon in enumerate(taxa):# 初期 vs 后期 stat_e, p_e = stats.mannwhitneyu(early[:, i], late[:, i], alternative='two-sided')# 初期 vs 中期 stat_m, p_m = stats.mannwhitneyu(early[:, i], mid[:, i], alternative='two-sided') results.append({'Taxon': taxon,'Early_mean': early[:, i].mean(),'Mid_mean': mid[:, i].mean(),'Late_mean': late[:, i].mean(),'p_EarlyLate': p_e,'p_EarlyMid': p_m,'Significant': 'Yes'if (p_e < 0.05or p_m < 0.05) else'No' })return pd.DataFrame(results)跑一下,筛选显著差异的物种:
diff_results = differential_abundance_test(df, taxa)sig = diff_results[diff_results['Significant'] == 'Yes'].sort_values('Early_mean', ascending=False)print(f"显著差异物种数: {len(sig)} / {len(taxa)}")# 显著差异物种数: 21 / 27top_sig = sig.nlargest(5, 'Early_mean')print(top_sig[['Taxon', 'Early_mean', 'Mid_mean', 'Late_mean', 'p_EarlyLate']])# Taxon Early_mean Mid_mean Late_mean p_EarlyLate# Saccharomyces 22.34 8.21 5.12 0.0003# Lactobacillus 10.12 28.45 18.23 0.0007# Aspergillus 18.90 5.34 2.10 0.0012# Acetobacter 8.21 12.45 25.67 0.0021从结果能直接看出:Saccharomyces(酵母菌)初期远高于中后期,Lactobacillus(乳酸菌)中期爆发,Acetobacter(醋酸菌)后期才起来——规律完全符合生物学常识。
二、差异丰度可视化
把显著差异的物种画成簇状柱状图,三色并列,差距一目了然:
def plot_differential_abundance(diff_results, save_path='diff_abundance.png'):sig = diff_results[diff_results['Significant'] == 'Yes'].nlargest(12, 'Early_mean') fig, ax = plt.subplots(figsize=(13, 7))x = np.arange(len(sig))w = 0.25 ax.bar(x - w, sig['Early_mean'], w, label='初期', color='#E74C3C', alpha=0.85) ax.bar(x, sig['Mid_mean'], w, label='中期', color='#F39C12', alpha=0.85) ax.bar(x + w, sig['Late_mean'], w, label='后期', color='#27AE60', alpha=0.85) ax.set_xticks(x) ax.set_xticklabels(sig['Taxon'], rotation=45, ha='right', fontsize=9) ax.set_ylabel('平均相对丰度 (%)', fontsize=12) ax.set_title('差异丰度物种分析(Mann-Whitney U检验, p<0.05)', fontsize=14) ax.legend(fontsize=10) ax.grid(True, alpha=0.3, axis='y') plt.tight_layout() plt.savefig(save_path, dpi=150, bbox_inches='tight') plt.show()return fig
三、相关性网络:菌和菌之间谁和谁走得近
如果两个菌在大多数样本里同增同减,说明它们之间可能存在生态互作。
用Spearman相关性来量化,配对筛选阈值|r|>0.5且p<0.05,最后用NetworkX画网络图:
import networkx as nxdefbuild_correlation_network(rel_abundance_matrix, taxa, threshold=0.5, save_path='network.png'):# 计算Spearman相关矩阵 corr_matrix, p_matrix = stats.spearmanr(rel_abundance_matrix) G = nx.Graph()for i, t1 in enumerate(taxa): G.add_node(t1)for j, t2 in enumerate(taxa):if i < j and abs(corr_matrix[i, j]) > threshold and p_matrix[i, j] < 0.05: G.add_edge(t1, t2, weight=corr_matrix[i, j], pvalue=p_matrix[i, j])# 节点颜色:乳酸菌=红,醋酸菌=绿,酵母=橙,霉菌=紫,其他=蓝 color_map = {'Lactobacillus': '#E74C3C', 'Leuconostoc': '#C0392B', 'Pediococcus': '#E67E22','Acetobacter': '#27AE60', 'Gluconobacter': '#2ECC71','Saccharomyces': '#F39C12', 'Candida': '#E74C3C','Aspergillus': '#8E44AD', 'Penicillium': '#9B59B6','default': '#3498DB' } node_colors = [color_map.get(n, color_map['default']) for n in G.nodes()] fig, ax = plt.subplots(figsize=(14, 12)) pos = nx.spring_layout(G, k=2, iterations=50, seed=42) degrees = dict(G.degree()) node_sizes = [degrees[n] * 200 + 300for n in G.nodes()]if G.edges(): edge_weights = [d['weight'] for _, _, d in G.edges(data=True)] nx.draw_networkx_edges(G, pos, ax=ax, alpha=0.4, width=[abs(w)*3for w in edge_weights], edge_color=[w for w in edge_weights], edge_cmap=plt.cm.RdBu_r) nx.draw_networkx_nodes(G, pos, ax=ax, node_size=node_sizes, node_color=node_colors, alpha=0.8, edgecolors='black', linewidths=1) nx.draw_networkx_labels(G, pos, ax=ax, font_size=8, font_weight='bold') ax.set_title('微生物共现网络(Spearman |r|>0.5, p<0.05)', fontsize=14) ax.axis('off') plt.tight_layout() plt.savefig(save_path, dpi=150, bbox_inches='tight') plt.show() print(f"网络节点: {G.number_of_nodes()}, 边数: {G.number_of_edges()}") print(f"网络密度: {nx.density(G):.4f}")return G调用:
net = build_correlation_network(df[taxa].values, taxa, threshold=0.5)# 网络节点: 24, 边数: 31# 网络密度: 0.0892
从网络里能直接读出几件事:乳酸菌三个种
(Lactobacillus、Leuconostoc、Pediococcus)之间互相正相关,形成一个紧密的聚类;醋酸菌(Acetobacter、Gluconobacter)之间也高度正相关;酵母和霉菌在初期正相关,到中后期转为负相关,可能反映了对底物的竞争关系。
四、随机森林:直接锁定最重要的物种
这是最关键的一步——用机器学习来回答:给定一个样本的物种组成,哪个菌对判断发酵阶段的贡献最大?
用随机森林回归模型,以物种丰度为特征(X),发酵阶段为标签(y):
from sklearn.ensemble import RandomForestRegressorfrom sklearn.model_selection import train_test_splitfrom sklearn.metrics import r2_score, mean_squared_errordefbuild_ml_model(df, taxa):# 标签:初期=0,中期=1,后期=2 stage_map = {'初期': 0, '中期': 1, '后期': 2} y = df['Stage'].map(stage_map).values X = df[taxa].values X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42) model = RandomForestRegressor(n_estimators=100, random_state=42) model.fit(X_train, y_train) y_pred = model.predict(X_test) r2 = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f"测试集 R2: {r2:.4f}, RMSE: {rmse:.4f}")# 特征重要性 fi = pd.DataFrame({'Taxon': taxa, 'Importance': model.feature_importances_}) fi = fi.sort_values('Importance', ascending=False) print("\n特征重要性 Top5:") print(fi.head())return model, fi跑一下:
model, fi = build_ml_model(df, taxa)# 测试集 R2: 0.9472, RMSE: 0.2281# 特征重要性 Top5:# Taxon Importance# Lactobacillus 0.1873# Saccharomyces 0.1421# Aspergillus 0.1134# Wickerhamomyces 0.0892# Acetobacter 0.0765R2=0.9472说明模型拟合得非常好——只要知道这几个菌的丰度,就能以94%的准确率判断样本处于哪个阶段。
Lactobacillus(乳酸菌)重要性排第一,占18.7%,是判断发酵阶段最关键的菌种。
五、特征重要性可视化
把随机森林算出来的重要性画成横向柱状图:
def plot_ml_feature_importance(fi, save_path='feature_importance.png'): top_fi = fi.nlargest(12, 'Importance') colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(top_fi))) fig, ax = plt.subplots(figsize=(10, 7)) bars = ax.barh(range(len(top_fi)), top_fi['Importance'].values, color=colors, edgecolor='black', linewidth=0.5) ax.set_yticks(range(len(top_fi))) ax.set_yticklabels(top_fi['Taxon'].values, fontsize=10) ax.set_xlabel('特征重要性', fontsize=12) ax.set_title('随机森林特征重要性(预测发酵阶段)', fontsize=14) ax.grid(True, alpha=0.3, axis='x')for bar, val in zip(bars, top_fi['Importance'].values): ax.text(val + 0.003, bar.get_y() + bar.get_height()/2,f'{val:.3f}', va='center', fontsize=9) plt.tight_layout() plt.savefig(save_path, dpi=150, bbox_inches='tight') plt.show()return fig六、PCA降维可视化
最后用PCA把物种组成降到2维,看看三个阶段能不能明显分开:
from sklearn.decomposition import PCAfrom sklearn.preprocessing import StandardScalerdefplot_pca(df, taxa, save_path='pca_visualization.png'): X_std = StandardScaler().fit_transform(df[taxa].values) pca = PCA(n_components=2) pc = pca.fit_transform(X_std) fig, ax = plt.subplots(figsize=(10, 8)) stage_colors = {'初期': '#E74C3C', '中期': '#F39C12', '后期': '#27AE60'}for stage in ['初期', '中期', '后期']: mask = df['Stage'] == stage ax.scatter(pc[mask, 0], pc[mask, 1], label=stage, s=120, alpha=0.75, edgecolors='black', linewidths=1.2, c=stage_colors[stage]) var = pca.explained_variance_ratio_ ax.set_xlabel(f'PC1 ({var[0]*100:.1f}%)', fontsize=13) ax.set_ylabel(f'PC2 ({var[1]*100:.1f}%)', fontsize=13) ax.set_title(f'PCA降维可视化(前两主成分解释方差: {sum(var)*100:.1f}%)', fontsize=14) ax.legend(fontsize=11) ax.grid(True, alpha=0.3) plt.tight_layout() plt.savefig(save_path, dpi=150, bbox_inches='tight') plt.show()return fig
七、一行命令跑全流程
把所有函数装进一个类:
classFermentedFoodAnalyzer:def__init__(self):self.df, self.taxa = generate_microbiome_data()self.alpha_div = compute_alpha_diversity(self.df[self.taxa].values)self.beta_matrix = compute_beta_diversity(self.df[self.taxa].values)self.diff_results = differential_abundance_test(self.df, self.taxa)self.net, self.corr = build_correlation_network(self.df[self.taxa].values, self.taxa)self.model, self.fi = build_ml_model(self.df, self.taxa)defrun_analysis(self):self.plot_differential_abundance('图5_差异丰度.png')self.plot_ml_feature_importance('图6_特征重要性.png')self.plot_pca('图7_PCA降维.png') print('[OK] 3张图生成完毕')analyzer = FermentedFoodAnalyzer()analyzer.run_analysis()跑完之后,图5到图7全部出图。
结合第一篇的4张图(物种组成、Alpha多样性、热图、Beta PCoA),7张图完整覆盖了从数据概览、Alpha/Beta多样性、差异分析、相关性网络到机器学习的全流程。代码加起来大约三百行,有详细中文注释,改一改物种列表和样本分组就能分析自己的数据。
喜欢小居的文章,点赞、关注、转发,点个“在看”不迷路~
有问题找小居,小居看到马上回!
完整源码获取:公众号回复「微生物分析」即可