"本文提供可直接复制粘贴的代码模板,覆盖数据预处理、模型训练、评估调优全流程。"
核心亮点:提供可直接复制的代码模板,覆盖 sklearn 与 statsmodels 双框架,从逻辑回归建模到正则化、超参数调优、模型保存全流程,是业务落地的速查手册。
一、环境准备与数据加载
1.1 安装依赖
pip install numpy pandas matplotlib seaborn scikit-learn statsmodels
1.2 导入库
# Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Statistical modelling
import statsmodels.api as sm

# Display options: do not truncate columns or wrap wide frames when printing.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)
plt.rcParams.update({'font.size': 10})
1.3 加载示例数据
使用经典的鸢尾花数据集(二分类版本):
from sklearn.datasets import load_iris

# Load iris and keep only classes 0 and 1 so the task is binary.
iris = load_iris()
mask = iris.target < 2
X = iris.data[mask]
y = iris.target[mask]

# Wrap the features in a DataFrame with readable column names.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df = pd.DataFrame(X, columns=feature_columns)
df['target'] = y

print("数据集形状:", df.shape)
print("\n前5行数据:")
print(df.head())
print("\n类别分布:")
print(df['target'].value_counts())
二、探索性数据分析(EDA)
2.1 数据概览
# Print three bannered sections: frame info, descriptive statistics and
# per-column missing-value counts. Every banner after the first is preceded
# by a blank line.
rule = "=" * 50
overview_sections = [
    ("数据基本信息", df.info),  # df.info() prints itself and returns None
    ("描述性统计", df.describe),
    ("缺失值检查", lambda: df.isnull().sum()),
]
for position, (title, compute) in enumerate(overview_sections):
    print(rule if position == 0 else "\n" + rule)
    print(title)
    print(rule)
    print(compute())
2.2 特征分布可视化
# One histogram panel per feature, with both classes overlaid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()
hist_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

for ax, col in zip(axes, hist_columns):
    for class_id in (0, 1):
        class_rows = df[df['target'] == class_id]
        ax.hist(class_rows[col], alpha=0.5, label=f'Class {class_id}', bins=20)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.set_title(f'Distribution of {col}')

plt.tight_layout()
plt.show()
2.3 特征相关性分析
# Pairwise correlations between every column, target included.
correlation_matrix = df.corr()

# Annotated heatmap, colour scale centred on zero.
plt.figure(figsize=(10, 8))
heatmap_style = dict(annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f')
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Feature Correlation Matrix')
plt.show()

# Rank the columns by their correlation with the target.
target_correlation = correlation_matrix['target'].sort_values(ascending=False)
print("特征与目标变量的相关性:")
print(target_correlation)
2.4 散点图矩阵
# Scatter-plot matrix of the four features, coloured by class,
# with KDE curves on the diagonal.
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
sns.pairplot(
    data=df,
    vars=features,
    hue='target',
    diag_kind='kde',
    height=2.5,
)
plt.suptitle('Pairplot of Features', y=1.02)
plt.show()
三、数据预处理
3.1 划分训练集和测试集
# Separate the target column from the feature columns.
y = df['target']
X = df.drop(columns='target')

# 80/20 split, stratified on the target, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")
print(f"训练集类别分布:\n{pd.Series(y_train).value_counts()}")
print(f"测试集类别分布:\n{pd.Series(y_test).value_counts()}")
3.2 特征缩放
重要:sklearn 的 LogisticRegression 默认带 L2 正则化,惩罚项和求解器的收敛都受特征尺度影响,因此建模前应先对特征进行标准化。
# Fit the scaler on the training set only, then apply the same
# transformation to both sets so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# DataFrame copies (original column names) for easier inspection.
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled, columns=X.columns)
    for scaled in (X_train_scaled, X_test_scaled)
)

print("标准化后的训练集统计:")
print(X_train_scaled_df.describe().round(3))
3.3 处理类别不平衡(如需要)
# Share of each class in the training labels.
class_ratio = pd.Series(y_train).value_counts(normalize=True)
print("类别比例:")
print(class_ratio)

# Flag the data as imbalanced when the rarest class is below 40%.
use_balanced = bool(class_ratio.min() < 0.4)
print(
    "\n检测到类别不平衡,建议使用class_weight='balanced'"
    if use_balanced
    else "\n类别分布较平衡"
)
四、模型训练(sklearn版)
4.1 基础模型训练
# Baseline logistic regression on the standardized training features.
model = LogisticRegression(
    random_state=42,
    max_iter=1000,   # raised iteration cap so the solver can converge
    solver='lbfgs',
)
model.fit(X_train_scaled, y_train)

print("模型训练完成!")
print(f"迭代次数: {model.n_iter_}")
# n_iter_ is an array, so reduce the comparison to one boolean
# instead of printing an array such as [ True].
converged = bool((model.n_iter_ < model.max_iter).all())
print(f"是否收敛: {converged}")
4.2 查看模型参数
# Tabulate the fitted weights, largest absolute value first.
weights = model.coef_[0]
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': weights,
    'Abs_Coefficient': np.abs(weights),
})
coefficients = coefficients.sort_values('Abs_Coefficient', ascending=False)

print("模型系数(特征重要性):")
print(coefficients)
print(f"\n截距(Intercept): {model.intercept_[0]:.4f}")

# Horizontal bar chart: green bars for positive weights, red otherwise.
colors = [
    'green' if is_positive else 'red'
    for is_positive in coefficients['Coefficient'] > 0
]
plt.figure(figsize=(10, 6))
plt.barh(coefficients['Feature'], coefficients['Coefficient'], color=colors, alpha=0.7)
plt.axvline(x=0, color='black', linestyle='--', alpha=0.3)
plt.xlabel('Coefficient Value')
plt.title('Logistic Regression Coefficients\n(Green: Positive, Red: Negative)')
plt.tight_layout()
plt.show()
4.3 预测与概率
# Hard class predictions and per-class probabilities for the test set.
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

print("预测类别示例(前10个):")
print(y_pred[:10])

print("\n预测概率示例(前10个):")
print("类别0概率 | 类别1概率")
# Iterate over the slice rather than range(10) so a test set with fewer
# than ten rows does not raise IndexError.
for proba_class0, proba_class1 in y_pred_proba[:10]:
    print(f"{proba_class0:.4f} | {proba_class1:.4f}")

# Probability of the positive class only (column 1).
y_pred_proba_positive = y_pred_proba[:, 1]
五、模型评估
5.1 基础指标
# Threshold-based metrics use the hard predictions; AUC uses the
# positive-class probabilities.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba_positive)

banner = "=" * 50
print(banner)
print("模型评估指标")
print(banner)
metric_table = (
    ("准确率 (Accuracy)", accuracy),
    ("精确率 (Precision)", precision),
    ("召回率 (Recall)", recall),
    ("F1分数", f1),
    ("AUC-ROC", auc),
)
for metric_name, metric_value in metric_table:
    print(f"{metric_name}: {metric_value:.4f}")
5.2 混淆矩阵
# Rows of cm are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
predicted_labels = ['Predicted 0', 'Predicted 1']
actual_labels = ['Actual 0', 'Actual 1']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Unpack the 2x2 matrix row by row: [[TN, FP], [FN, TP]].
(tn, fp), (fn, tp) = cm
print("混淆矩阵解读:")
print(f"真正例(TP): {tp} - 实际是1,预测也是1")
print(f"真负例(TN): {tn} - 实际是0,预测也是0")
print(f"假正例(FP): {fp} - 实际是0,预测为1(误报)")
print(f"假负例(FN): {fn} - 实际是1,预测为0(漏报)")
5.3 分类报告
# Per-class precision / recall / F1 table with readable class names.
class_names = ['Class 0', 'Class 1']
report_text = classification_report(y_test, y_pred, target_names=class_names)
print("详细分类报告:")
print(report_text)
5.4 ROC曲线
# ROC points computed from the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_positive)

plt.figure(figsize=(10, 8))
roc_label = f'ROC Curve (AUC = {auc:.3f})'
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
# Diagonal reference line: a classifier that guesses at random.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()
5.5 精确率-召回率曲线
from sklearn.metrics import average_precision_score, precision_recall_curve

# Precision/recall pairs and the average-precision summary score.
pr_points = precision_recall_curve(y_test, y_pred_proba_positive)
precision_curve, recall_curve, _ = pr_points
avg_precision = average_precision_score(y_test, y_pred_proba_positive)

plt.figure(figsize=(10, 8))
pr_label = f'PR Curve (AP = {avg_precision:.3f})'
plt.plot(recall_curve, precision_curve, color='blue', lw=2, label=pr_label)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
六、交叉验证
6.1 K折交叉验证
# 5-fold cross-validated accuracy on the training set.
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
mean_score = cv_scores.mean()

print("5折交叉验证结果:")
print(f"各折得分: {cv_scores}")
print(f"平均得分: {mean_score:.4f}")
print(f"标准差: {cv_scores.std():.4f}")

# One bar per fold, with the mean drawn as a dashed reference line.
fold_numbers = range(1, 6)
plt.figure(figsize=(8, 5))
plt.bar(fold_numbers, cv_scores, alpha=0.7, color='skyblue', edgecolor='navy')
plt.axhline(y=mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.4f}')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('5-Fold Cross-Validation Scores')
plt.ylim(0.8, 1.0)
plt.legend()
plt.show()
七、超参数调优
7.1 网格搜索(Grid Search)
# Search space: C is the inverse of the regularization strength;
# the liblinear solver is used because it handles both l1 and l2.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# 5-fold grid search scored by ROC AUC, using every available core.
base_estimator = LogisticRegression(random_state=42, max_iter=1000)
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

print("开始网格搜索...")
grid_search.fit(X_train_scaled, y_train)

print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")
7.2 评估最佳模型
# Evaluate the estimator selected by the grid search on the test set.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
y_pred_proba_best = best_model.predict_proba(X_test_scaled)[:, 1]

separator = "=" * 50
print(separator)
print("最佳模型在测试集上的表现")
print(separator)
test_scores = (
    ("准确率", accuracy_score(y_test, y_pred_best)),
    ("AUC-ROC", roc_auc_score(y_test, y_pred_proba_best)),
    ("F1分数", f1_score(y_test, y_pred_best)),
)
for score_name, score_value in test_scores:
    print(f"{score_name}: {score_value:.4f}")
八、statsmodels版本(统计细节)
8.1 使用statsmodels获取详细统计信息
from statsmodels.tools.sm_exceptions import PerfectSeparationError

# Rebuild the scaled matrices as DataFrames that carry the feature names and
# the same index as the labels, so the summary lists real feature names
# instead of x1..x4 and the rows line up with y_train / y_test.
X_train_named = pd.DataFrame(X_train_scaled, columns=X.columns, index=y_train.index)
X_test_named = pd.DataFrame(X_test_scaled, columns=X.columns, index=y_test.index)

# statsmodels does not add an intercept on its own.
X_train_const = sm.add_constant(X_train_named)
X_test_const = sm.add_constant(X_test_named)

# NOTE(review): iris classes 0 and 1 are linearly separable, so an
# unpenalized maximum-likelihood fit is ill-defined on this demo data.
# Depending on the statsmodels version this raises PerfectSeparationError
# or only warns and returns unreliable estimates - confirm on your version.
logit_model = sm.Logit(y_train, X_train_const)
try:
    result = logit_model.fit()
except PerfectSeparationError:
    print("数据完全可分,极大似然估计不存在:请换用类别有重叠的数据,或改用 logit_model.fit_regularized()。")
    raise

# Full table: coefficients, standard errors, z-scores, p-values, intervals.
print(result.summary())
8.2 解释统计结果
# Odds ratios with their 95% confidence intervals.
# Coefficients and interval bounds are all on the log-odds scale, so collect
# them first and exponentiate the whole table exactly once. Storing
# np.exp(params) and then calling np.exp(conf) would exponentiate the OR
# column twice.
params = result.params
conf = result.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']

print("\n优势比(Odds Ratios)及其95%置信区间:")
print(np.exp(conf))

print("\n解读:")
print("OR > 1: 该特征增加时,发生概率增加")
print("OR < 1: 该特征增加时,发生概率减少")
print("OR = 1: 该特征对结果无影响")
九、完整代码模板
"""逻辑回归完整流程模板复制此代码,替换你的数据即可运行"""import numpy as npimport pandas as pdfrom sklearn.model_selection import train_test_split, cross_val_score, GridSearchCVfrom sklearn.preprocessing import StandardScalerfrom sklearn.linear_model import LogisticRegressionfrom sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, roc_auc_score)# ========== 1. 数据准备 ==========# 替换为你的数据加载方式# df = pd.read_csv('your_data.csv')# X = df.drop('target_column', axis=1)# y = df['target_column']# ========== 2. 数据划分 ==========X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)# ========== 3. 特征缩放 ==========scaler = StandardScaler()X_train_scaled = scaler.fit_transform(X_train)X_test_scaled = scaler.transform(X_test)# ========== 4. 模型训练 ==========model = LogisticRegression( C=1.0, # 正则化强度(调参重点) penalty='l2', # 正则化类型 class_weight=None, # 不平衡数据可设为'balanced' random_state=42, max_iter=1000)model.fit(X_train_scaled, y_train)# ========== 5. 预测与评估 ==========y_pred = model.predict(X_test_scaled)y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")print(f"AUC-ROC: {roc_auc_score(y_test, y_pred_proba):.4f}")print("\n分类报告:")print(classification_report(y_test, y_pred))# ========== 6. 保存模型 ==========import joblibjoblib.dump(model, 'logistic_regression_model.pkl')joblib.dump(scaler, 'scaler.pkl')print("\n模型已保存!")
十、常见问题与解决方案
| 问题 | 解决方案 |
|---|---|
| ConvergenceWarning | 增大 max_iter;训练前先对特征做标准化 |
| 类别不平衡 | 使用class_weight='balanced';调整阈值 |
| | |
| | |
| | |
下篇预告
下一篇我们将进入真实业务场景:
敬请期待:《逻辑回归实战案例:从数据到上线的完整项目》
附:推荐阅读
- sklearn官方文档:https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
- 《Python机器学习》Sebastian Raschka