在数据驱动的时代,Python凭借其强大的数据处理生态系统(Pandas、NumPy、Scikit-learn等)成为数据分析领域的首选语言。
本文将深入剖析10个真实业务场景中的数据分析案例,涵盖数据清洗、特征工程、时间序列分析、机器学习应用等核心技术栈。每个案例都基于实际项目经验提炼,注重代码的工程化和可复用性。
案例1:缺失值智能填补策略
业务背景
处理用户行为日志时,常因网络波动、埋点失效导致数据缺失。简单删除会损失大量样本,直接填充均值则忽略了数据分布特性。
技术方案
采用多重插补(MICE)结合KNN插补的混合策略,根据数据类型和缺失模式选择最优方法。
```python
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401(启用IterativeImputer)
from sklearn.impute import IterativeImputer

# 模拟包含缺失值的用户行为数据
np.random.seed(42)
data = pd.DataFrame({
    'user_id': range(1000),
    'age': np.random.randint(18, 65, 1000),
    'session_duration': np.random.exponential(300, 1000),
    'page_views': np.random.poisson(10, 1000),
    'purchase_amount': np.random.gamma(2, 50, 1000)
})

# 随机制造15%的缺失值
mask = np.random.rand(*data.shape) < 0.15
data = data.mask(mask)

# 策略1:MICE迭代插补(estimator=None时默认使用BayesianRidge)
mice_imputer = IterativeImputer(
    estimator=None,
    max_iter=10,
    random_state=42
)

# 策略2:KNN插补(适合局部相似性强的数据)
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')

# 根据特征相关性选择插补方法(先排除对角线上的自相关)
corr_matrix = data.corr().abs()
np.fill_diagonal(corr_matrix.values, 0)
high_corr_features = corr_matrix.columns[(corr_matrix > 0.6).any()].tolist()
other_features = [col for col in data.columns if col not in high_corr_features]

# 对高相关特征使用KNN,其余特征使用MICE(任一组为空时跳过)
imputed_parts = []
if high_corr_features:
    imputed_parts.append(pd.DataFrame(
        knn_imputer.fit_transform(data[high_corr_features]),
        columns=high_corr_features
    ))
if other_features:
    imputed_parts.append(pd.DataFrame(
        mice_imputer.fit_transform(data[other_features]),
        columns=other_features
    ))

# 合并结果并恢复原始列顺序
data_imputed = pd.concat(imputed_parts, axis=1)[data.columns]

# 验证插补质量(对比原始数据分布)
print("原始数据统计:\n", data.describe())
print("\n插补后数据统计:\n", data_imputed.describe())
```
关键要点
- 分布保持性:MICE通过迭代回归保持多变量联合分布
- 验证机制:使用Kolmogorov-Smirnov检验对比插补前后的分布差异(示例见下)
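下面给出一个KS检验的最小示意(可独立运行;其中 original、with_na、imputed 等变量均为演示用的模拟数据,实际使用时应替换为上文的插补结果):

```python
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

# 构造一列带缺失的演示数据(实际场景中为真实业务列)
rng = np.random.default_rng(42)
original = pd.Series(rng.exponential(300, 1000))
with_na = original.copy()
with_na[rng.random(1000) < 0.15] = np.nan

# 这里用均值填补作为被检验对象(实际中替换为MICE/KNN的插补结果)
imputed = with_na.fillna(with_na.mean())

# KS检验:比较"未缺失的观测值"与"插补后完整列"的分布
stat, p_value = ks_2samp(with_na.dropna(), imputed)
print(f"KS统计量={stat:.4f}, p值={p_value:.4f}")
if p_value > 0.05:
    print("两个分布无显著差异,插补基本保持了原始分布")
else:
    print("插补显著改变了分布,建议更换插补策略")
```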
案例2:时间序列异常检测与平滑处理
业务背景
电商平台监控每小时GMV(成交总额),需识别促销、故障等引起的异常波动,同时平滑噪声以提取趋势。
技术方案
结合STL分解(Seasonal-Trend decomposition using Loess)和孤立森林(Isolation Forest)实现异常检测。
```python
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# 生成模拟的GMV时间序列(包含趋势、周期、异常)
dates = pd.date_range('2024-01-01', periods=720, freq='H')
trend = np.linspace(100000, 150000, 720)
seasonal = 20000 * np.sin(np.arange(720) * 2 * np.pi / 24)  # 日周期
noise = np.random.normal(0, 5000, 720)

# 人工注入异常值(模拟促销和系统故障)
anomalies = np.zeros(720)
anomalies[[100, 200, 300, 500]] = [50000, -30000, 80000, -40000]

gmv = trend + seasonal + noise + anomalies
df = pd.DataFrame({'gmv': gmv}, index=dates)

# Step1: STL分解提取趋势、季节性、残差
stl = STL(df['gmv'], seasonal=25, trend=51)  # seasonal需为奇数
result = stl.fit()

# Step2: 对残差应用孤立森林检测异常
residuals = result.resid.values.reshape(-1, 1)
iso_forest = IsolationForest(contamination=0.01, random_state=42)
anomaly_labels = iso_forest.fit_predict(residuals)

# 标记异常点
df['is_anomaly'] = anomaly_labels == -1
df['trend'] = result.trend
df['seasonal'] = result.seasonal

# 平滑处理:用趋势+季节性替换异常值
df['gmv_smoothed'] = df['gmv'].copy()
df.loc[df['is_anomaly'], 'gmv_smoothed'] = (
    df.loc[df['is_anomaly'], 'trend'] + df.loc[df['is_anomaly'], 'seasonal']
)

# 可视化结果
fig, axes = plt.subplots(3, 1, figsize=(15, 10))
axes[0].plot(df.index, df['gmv'], label='原始GMV', alpha=0.7)
axes[0].scatter(df[df['is_anomaly']].index, df[df['is_anomaly']]['gmv'],
                color='red', label='异常点', s=50)
axes[0].legend()
axes[0].set_title('GMV时间序列异常检测')
axes[1].plot(df.index, df['trend'], label='趋势', color='green')
axes[1].plot(df.index, df['seasonal'], label='季节性', color='orange')
axes[1].legend()
axes[1].set_title('STL分解结果')
axes[2].plot(df.index, df['gmv_smoothed'], label='平滑后GMV', color='purple')
axes[2].legend()
axes[2].set_title('平滑处理后')
plt.tight_layout()
# plt.savefig('time_series_anomaly_detection.png', dpi=300)

print(f"检测到{df['is_anomaly'].sum()}个异常点")
```
关键要点
- 业务解释性:分解后的趋势可用于长期预测,季节性指导运营节奏
案例3:用户RFM模型构建与聚类分析
业务背景
电商平台需要对百万级用户进行精细化运营,根据消费行为分层,实现个性化营销。
技术方案
构建RFM模型(Recency-Frequency-Monetary),结合K-Means聚类和轮廓系数优化分群。
```python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# 模拟交易数据
np.random.seed(42)
n_users = 10000
orders_per_user = np.random.randint(1, 20, n_users)  # 每个用户的订单数(只抽样一次,保证各列长度一致)
user_ids = np.repeat(range(n_users), orders_per_user)
n_orders = len(user_ids)

transaction_data = pd.DataFrame({
    'user_id': user_ids,
    'order_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 365, n_orders), unit='D'
    ),
    'order_amount': np.random.gamma(2, 100, n_orders)
})

# 计算RFM指标
reference_date = pd.to_datetime('2024-12-31')
rfm = transaction_data.groupby('user_id').agg(
    recency=('order_date', lambda x: (reference_date - x.max()).days),  # Recency
    frequency=('order_date', 'count'),                                  # Frequency
    monetary=('order_amount', 'sum')                                    # Monetary
)

# 数据标准化(消除量纲影响)
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm)

# 确定最优K值(肘部法则+轮廓系数)
inertia = []
silhouette_scores = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(rfm_scaled)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(rfm_scaled, kmeans.labels_))

# 此处以K=5为例(实际应结合两条曲线选择)
optimal_k = 5
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
rfm['cluster'] = kmeans_final.fit_predict(rfm_scaled)

# 聚类结果分析
cluster_analysis = rfm.groupby('cluster').agg({
    'recency': 'mean',
    'frequency': 'mean',
    'monetary': 'mean'
}).round(2)
cluster_analysis['user_count'] = rfm['cluster'].value_counts().sort_index()
cluster_analysis['percentage'] = (cluster_analysis['user_count'] / len(rfm) * 100).round(2)
print("RFM聚类结果:\n", cluster_analysis)

# 为每个聚类命名(基于业务理解)
cluster_names = {
    0: '流失高价值用户',
    1: '新用户',
    2: '忠诚用户',
    3: '沉睡用户',
    4: '潜力用户'
}
rfm['segment'] = rfm['cluster'].map(cluster_names)

# 3D可视化
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
for cluster in range(optimal_k):
    cluster_data = rfm[rfm['cluster'] == cluster]
    ax.scatter(
        cluster_data['recency'],
        cluster_data['frequency'],
        cluster_data['monetary'],
        label=cluster_names[cluster],
        alpha=0.6,
        s=50
    )
ax.set_xlabel('Recency (天)')
ax.set_ylabel('Frequency (次)')
ax.set_zlabel('Monetary (元)')
ax.legend()
plt.title('RFM用户分群3D可视化')
# plt.savefig('rfm_clustering.png', dpi=300)
```
关键要点
- 标准化必要性:RFM三个维度量纲不同,直接聚类会被monetary主导
- K值选择:结合肘部法则(惯性下降速率)和轮廓系数(簇内紧密度),联合可视化示例见下
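若想把两种准则放在一起看,可以沿用上面循环中得到的 K_range、inertia、silhouette_scores 画一张双轴图(以下仅为可视化示意,需在上文代码之后运行):

```python
import matplotlib.pyplot as plt

# 左轴:惯性(肘部法则);右轴:轮廓系数
fig, ax1 = plt.subplots(figsize=(10, 5))
ax1.plot(list(K_range), inertia, 'o-', color='steelblue', label='Inertia')
ax1.set_xlabel('K')
ax1.set_ylabel('Inertia')

ax2 = ax1.twinx()
ax2.plot(list(K_range), silhouette_scores, 's--', color='darkorange', label='Silhouette')
ax2.set_ylabel('Silhouette Score')

fig.legend(loc='upper right')
plt.title('肘部法则与轮廓系数联合选K')
# 经验做法:在惯性下降明显变缓、且轮廓系数相对较高的K处取值
```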
案例4:A/B测试统计显著性检验
业务背景
产品改版后,需严格验证新版本是否真正提升了用户转化率,避免"假阳性"决策。
技术方案
使用双样本t检验和Bootstrap重采样进行假设检验,计算置信区间和统计功效。
```python
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns

# 模拟A/B测试数据
np.random.seed(42)
n_users = 5000
# A组(对照组):转化率12%
group_a = np.random.binomial(1, 0.12, n_users)
# B组(实验组):转化率14%(相对提升16.7%)
group_b = np.random.binomial(1, 0.14, n_users)

df_ab = pd.DataFrame({
    'group': ['A'] * n_users + ['B'] * n_users,
    'converted': np.concatenate([group_a, group_b])
})

# 基础统计
conversion_rate = df_ab.groupby('group')['converted'].agg(['sum', 'count', 'mean'])
conversion_rate['conversion_rate'] = (conversion_rate['mean'] * 100).round(2)
print("转化率统计:\n", conversion_rate)

# Step1: 双样本t检验
t_stat, p_value = ttest_ind(group_b, group_a)
print(f"\nt检验结果:t统计量={t_stat:.4f}, p值={p_value:.4f}")

# Step2: 置信区间计算(使用正态近似)
p_a = group_a.mean()
p_b = group_b.mean()
se = np.sqrt(p_a * (1 - p_a) / n_users + p_b * (1 - p_b) / n_users)
ci_lower = (p_b - p_a) - 1.96 * se
ci_upper = (p_b - p_a) + 1.96 * se
print(f"转化率提升的95%置信区间:[{ci_lower*100:.2f}%, {ci_upper*100:.2f}%]")

# Step3: Bootstrap重采样验证(10000次)
n_bootstrap = 10000
bootstrap_diffs = []
for _ in range(n_bootstrap):
    sample_a = np.random.choice(group_a, size=n_users, replace=True)
    sample_b = np.random.choice(group_b, size=n_users, replace=True)
    bootstrap_diffs.append(sample_b.mean() - sample_a.mean())

bootstrap_diffs = np.array(bootstrap_diffs)
bootstrap_ci = np.percentile(bootstrap_diffs, [2.5, 97.5])
print(f"Bootstrap 95%置信区间:[{bootstrap_ci[0]*100:.2f}%, {bootstrap_ci[1]*100:.2f}%]")

# Step4: 功效分析(检验能检测到真实差异的概率)
from statsmodels.stats.power import zt_ind_solve_power
effect_size = (p_b - p_a) / np.sqrt((p_a * (1 - p_a) + p_b * (1 - p_b)) / 2)
power = zt_ind_solve_power(
    effect_size=effect_size,
    nobs1=n_users,
    alpha=0.05,
    ratio=1.0,
    alternative='two-sided'
)
print(f"\n统计功效(Power):{power:.2%}")

# 可视化Bootstrap分布
plt.figure(figsize=(10, 6))
plt.hist(bootstrap_diffs * 100, bins=50, alpha=0.7, edgecolor='black')
plt.axvline(0, color='red', linestyle='--', label='零假设(无差异)')
plt.axvline(bootstrap_ci[0] * 100, color='green', linestyle='--', label='95% CI')
plt.axvline(bootstrap_ci[1] * 100, color='green', linestyle='--')
plt.xlabel('转化率差异 (%)')
plt.ylabel('频数')
plt.title('Bootstrap重采样分布')
plt.legend()
# plt.savefig('ab_test_bootstrap.png', dpi=300)

# 决策建议
if p_value < 0.05 and ci_lower > 0:
    print("\n✅ 结论:B版本显著优于A版本,建议全量上线")
else:
    print("\n❌ 结论:差异不显著或置信区间包含0,建议继续观察")
```
关键要点
- 多重验证:t检验提供p值,Bootstrap提供稳健的置信区间
- 功效分析:样本量不足可能导致"假阴性"(II类错误)
- 业务阈值:即使统计显著,也需考虑最小可检测效应(MDE),估算示例见下
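MDE可以借助功效分析反推:在给定样本量、显著性水平和目标功效下,找出能被可靠检测的最小提升。下面是一个示意(基线转化率12%、每组5000人沿用上文设定,候选提升幅度与80%功效目标为示例假设):

```python
import numpy as np
from statsmodels.stats.power import zt_ind_solve_power
from statsmodels.stats.proportion import proportion_effectsize

p_baseline = 0.12   # 基线转化率(沿用上文A组设定)
n_per_group = 5000  # 每组样本量(沿用上文设定)

# 在一系列候选绝对提升中,找到功效首次达到80%的最小值,即近似MDE
for lift in np.arange(0.005, 0.05, 0.005):
    effect_size = proportion_effectsize(p_baseline + lift, p_baseline)
    power = zt_ind_solve_power(effect_size=effect_size, nobs1=n_per_group,
                               alpha=0.05, ratio=1.0, alternative='two-sided')
    if power >= 0.8:
        print(f"每组{n_per_group}人时,最小可检测绝对提升约为{lift:.1%}(功效={power:.2f})")
        break
```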
案例5:多重共线性诊断与特征工程
业务背景
构建房价预测模型时,面积、房间数、楼层等特征高度相关,导致回归系数不稳定。
技术方案
使用VIF(方差膨胀因子)诊断共线性,通过PCA降维和正则化回归解决。
```python
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt

# 模拟房价数据(故意制造共线性)
np.random.seed(42)
n_samples = 500
area = np.random.uniform(50, 200, n_samples)                 # 面积
rooms = 1 + area / 50 + np.random.normal(0, 0.5, n_samples)  # 房间数与面积强相关
floor = np.random.randint(1, 30, n_samples)
age = np.random.uniform(0, 30, n_samples)
distance = np.random.uniform(1, 20, n_samples)

# 目标变量:房价(受多因素影响)
price = (5000 * area + 20000 * rooms + 500 * floor
         - 1000 * age - 2000 * distance
         + np.random.normal(0, 50000, n_samples))

df_house = pd.DataFrame({
    'area': area,
    'rooms': rooms,
    'floor': floor,
    'age': age,
    'distance': distance,
    'price': price
})

# Step1: 计算VIF(方差膨胀因子)
X = df_house.drop('price', axis=1)
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print("VIF诊断结果:\n", vif_data.sort_values('VIF', ascending=False))
# VIF>10表示严重共线性

# Step2: 相关性热力图
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('特征相关性矩阵')
# plt.savefig('correlation_heatmap.png', dpi=300)

# 解决方案1: PCA降维
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=0.95)  # 保留95%的方差
X_pca = pca.fit_transform(X_scaled)
print(f"\nPCA降维:{X.shape[1]}维 → {X_pca.shape[1]}维")
print(f"解释方差比:{pca.explained_variance_ratio_}")

# 解决方案2: 岭回归(L2正则化)
ridge = Ridge(alpha=100)
ridge.fit(X_scaled, df_house['price'])

# 解决方案3: Lasso回归(L1正则化,可实现特征选择)
lasso = Lasso(alpha=1000)
lasso.fit(X_scaled, df_house['price'])

# 对比系数
coef_comparison = pd.DataFrame({
    'feature': X.columns,
    'OLS': LinearRegression().fit(X_scaled, df_house['price']).coef_,
    'Ridge': ridge.coef_,
    'Lasso': lasso.coef_
}).round(2)
print("\n回归系数对比:\n", coef_comparison)

# 可视化系数差异
coef_comparison.set_index('feature').plot(kind='bar', figsize=(12, 6))
plt.ylabel('系数值')
plt.title('不同回归方法的系数对比')
plt.xticks(rotation=45)
plt.legend()
# plt.savefig('coefficient_comparison.png', dpi=300)
```
关键要点
- VIF阈值:VIF>10需警惕,VIF>100严重共线性
- 正则化选择:Ridge保留所有特征,Lasso可将不重要特征系数压缩至0
案例6:时间窗口特征构造与滞后分析
业务背景
预测股票价格或销量时,历史数据的时序依赖性是关键,需构造滑动窗口特征。
技术方案
基于Pandas的rolling和shift函数构造多尺度时间特征,结合ACF/PACF分析最优滞后阶数。
```python
import pandas as pd
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

# 模拟销量时间序列
np.random.seed(42)
dates = pd.date_range('2023-01-01', periods=365, freq='D')
trend = np.linspace(1000, 1500, 365)
seasonal = 200 * np.sin(np.arange(365) * 2 * np.pi / 7)  # 周周期
noise = np.random.normal(0, 50, 365)
sales = trend + seasonal + noise

df_sales = pd.DataFrame({'sales': sales}, index=dates)

# Step1: 构造滚动窗口特征
df_sales['ma_7'] = df_sales['sales'].rolling(window=7).mean()    # 7日均值
df_sales['ma_30'] = df_sales['sales'].rolling(window=30).mean()  # 30日均值
df_sales['std_7'] = df_sales['sales'].rolling(window=7).std()    # 7日标准差
df_sales['max_7'] = df_sales['sales'].rolling(window=7).max()    # 7日最大值
df_sales['min_7'] = df_sales['sales'].rolling(window=7).min()    # 7日最小值

# Step2: 构造滞后特征
for lag in [1, 7, 14, 30]:
    df_sales[f'lag_{lag}'] = df_sales['sales'].shift(lag)

# Step3: 构造差分特征(平稳性转换)
df_sales['diff_1'] = df_sales['sales'].diff(1)  # 一阶差分
df_sales['diff_7'] = df_sales['sales'].diff(7)  # 周差分

# Step4: 构造变化率特征
df_sales['pct_change_1'] = df_sales['sales'].pct_change(1) * 100
df_sales['pct_change_7'] = df_sales['sales'].pct_change(7) * 100

# 删除NaN行(窗口和滞后导致)
df_features = df_sales.dropna()
print("构造的时间序列特征:\n", df_features.head(10))

# Step5: ACF/PACF分析确定最优滞后阶数
fig, axes = plt.subplots(2, 1, figsize=(12, 8))
plot_acf(df_sales['sales'].dropna(), lags=40, ax=axes[0])
plot_pacf(df_sales['sales'].dropna(), lags=40, ax=axes[1])
axes[0].set_title('自相关函数 (ACF)')
axes[1].set_title('偏自相关函数 (PACF)')
plt.tight_layout()
# plt.savefig('acf_pacf_analysis.png', dpi=300)

# Step6: ADF平稳性检验
adf_result = adfuller(df_sales['sales'].dropna())
print(f"\nADF检验统计量: {adf_result[0]:.4f}")
print(f"p值: {adf_result[1]:.4f}")
if adf_result[1] < 0.05:
    print("结论:序列平稳")
else:
    print("结论:序列非平稳,需进行差分")

# 可视化原始序列与移动平均
plt.figure(figsize=(14, 6))
plt.plot(df_sales.index, df_sales['sales'], label='原始销量', alpha=0.7)
plt.plot(df_sales.index, df_sales['ma_7'], label='7日均线', linewidth=2)
plt.plot(df_sales.index, df_sales['ma_30'], label='30日均线', linewidth=2)
plt.legend()
plt.title('销量时间序列与移动平均')
plt.xlabel('日期')
plt.ylabel('销量')
# plt.savefig('time_series_features.png', dpi=300)
```
关键要点
- 滞后特征:反映时序依赖性,阶数选择参考ACF/PACF
- 差分转换:将非平稳序列转化为平稳序列,满足模型假设
案例7:类别不平衡处理与SMOTE过采样
业务背景
信用卡欺诈检测中,欺诈样本仅占0.1%,直接建模会导致模型偏向预测"正常"。
技术方案
使用SMOTE(Synthetic Minority Over-sampling Technique)生成合成样本,结合类权重调整和评估指标优化。
```python
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns

# 模拟极度不平衡数据集(欺诈率0.1%)
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.999, 0.001],  # 99.9% vs 0.1%
    flip_y=0,
    random_state=42
)
print(f"原始样本分布:正常={np.sum(y==0)}, 欺诈={np.sum(y==1)}")

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# 方案1: 不处理不平衡(基线)
clf_baseline = RandomForestClassifier(random_state=42)
clf_baseline.fit(X_train, y_train)
y_pred_baseline = clf_baseline.predict(X_test)
print("\n【基线模型】未处理不平衡:")
print(classification_report(y_test, y_pred_baseline, target_names=['正常', '欺诈']))

# 方案2: SMOTE过采样
smote = SMOTE(sampling_strategy=0.5, random_state=42)  # 将少数类提升至多数类的50%
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

clf_smote = RandomForestClassifier(random_state=42)
clf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = clf_smote.predict(X_test)
print("\n【SMOTE过采样】:")
print(f"过采样后样本分布:正常={np.sum(y_train_smote==0)}, 欺诈={np.sum(y_train_smote==1)}")
print(classification_report(y_test, y_pred_smote, target_names=['正常', '欺诈']))

# 方案3: 类权重调整
clf_weighted = RandomForestClassifier(
    class_weight='balanced',  # 自动计算权重
    random_state=42
)
clf_weighted.fit(X_train, y_train)
y_pred_weighted = clf_weighted.predict(X_test)
print("\n【类权重调整】:")
print(classification_report(y_test, y_pred_weighted, target_names=['正常', '欺诈']))

# 方案4: SMOTE + 随机欠采样组合
pipeline = ImbPipeline([
    ('over', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=0.8, random_state=42)),
    ('clf', RandomForestClassifier(random_state=42))
])
pipeline.fit(X_train, y_train)
y_pred_pipeline = pipeline.predict(X_test)
print("\n【SMOTE + 欠采样组合】:")
print(classification_report(y_test, y_pred_pipeline, target_names=['正常', '欺诈']))

# 对比ROC-AUC
models = {
    '基线': clf_baseline,
    'SMOTE': clf_smote,
    '类权重': clf_weighted,
    '组合策略': pipeline
}
auc_scores = {}
for name, model in models.items():
    y_proba = model.predict_proba(X_test)[:, 1]
    auc_scores[name] = roc_auc_score(y_test, y_proba)

print("\n各模型AUC对比:")
for name, auc in auc_scores.items():
    print(f"{name}: {auc:.4f}")

# 可视化混淆矩阵(以SMOTE为例)
cm = confusion_matrix(y_test, y_pred_smote)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['正常', '欺诈'], yticklabels=['正常', '欺诈'])
plt.ylabel('真实标签')
plt.xlabel('预测标签')
plt.title('SMOTE模型混淆矩阵')
# plt.savefig('smote_confusion_matrix.png', dpi=300)
```
关键要点
- SMOTE原理:在少数类样本间插值生成新样本,而非简单复制
- 评估指标:关注Recall(召回率)和F1-Score,而非Accuracy;PR曲线示例见下
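在极度不平衡场景下,PR曲线和平均精度(AP)往往比整体准确率更能反映模型在少数类上的表现。下面的示意沿用上文训练好的 clf_smote 与测试集(需在上文代码之后运行):

```python
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, average_precision_score

# 取欺诈类的预测概率
y_proba = clf_smote.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
ap = average_precision_score(y_test, y_proba)

plt.figure(figsize=(8, 5))
plt.plot(recall, precision, label=f'AP = {ap:.3f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('SMOTE模型的PR曲线')
plt.legend()
# 实践中可沿曲线选择满足最低召回要求的概率阈值,而非固定使用0.5
```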
案例8:分层抽样与样本代表性检验
业务背景
从百万用户中抽取1万样本进行问卷调查,需确保样本在年龄、地域、消费层级上的分布与总体一致。
技术方案
使用分层随机抽样保证子群体比例,通过卡方检验验证抽样代表性。
```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import chisquare, chi2_contingency
import matplotlib.pyplot as plt

# 模拟用户总体数据(100万用户)
np.random.seed(42)
n_population = 1000000
population = pd.DataFrame({
    'user_id': range(n_population),
    'age_group': np.random.choice(['18-25', '26-35', '36-45', '46+'],
                                  n_population, p=[0.2, 0.35, 0.3, 0.15]),
    'region': np.random.choice(['一线', '新一线', '二线', '三线及以下'],
                               n_population, p=[0.15, 0.25, 0.35, 0.25]),
    'consumption_level': np.random.choice(['高', '中', '低'],
                                          n_population, p=[0.2, 0.5, 0.3])
})

print("总体分布:")
print(population[['age_group', 'region', 'consumption_level']].describe())

# Step1: 简单随机抽样(对比基线)
sample_simple = population.sample(n=10000, random_state=42)

# Step2: 分层随机抽样
# 按年龄分组+地域进行分层
population['strata'] = population['age_group'] + '_' + population['region']

# 计算每层的抽样比例
sample_size = 10000
strata_proportions = population['strata'].value_counts(normalize=True)

# 分层抽样
sample_stratified = population.groupby('strata', group_keys=False).apply(
    lambda x: x.sample(n=max(1, int(len(x) / len(population) * sample_size)), random_state=42)
).reset_index(drop=True)

print(f"\n分层抽样样本量:{len(sample_stratified)}")

# Step3: 代表性检验 - 卡方检验
def chi_square_test(population_dist, sample_dist, feature_name):
    """对单个特征进行卡方检验"""
    # 期望频数(总体比例 × 样本总量)
    expected = population_dist.values * sample_dist.sum()
    # 观测频数(样本实际分布)
    observed = sample_dist.values
    # 卡方检验
    chi2, p_value = chisquare(f_obs=observed, f_exp=expected)
    print(f"\n{feature_name} 卡方检验:")
    print(f"卡方统计量: {chi2:.4f}")
    print(f"p值: {p_value:.4f}")
    if p_value > 0.05:
        print("结论:样本与总体分布无显著差异(代表性良好)")
    else:
        print("结论:样本与总体分布存在显著差异")
    return chi2, p_value

# 对年龄分组进行检验
pop_age_dist = population['age_group'].value_counts(normalize=True).sort_index()
sample_age_dist_simple = sample_simple['age_group'].value_counts().sort_index()
sample_age_dist_stratified = sample_stratified['age_group'].value_counts().sort_index()

print("\n========== 简单随机抽样 ==========")
chi_square_test(pop_age_dist, sample_age_dist_simple, "年龄分组")
print("\n========== 分层随机抽样 ==========")
chi_square_test(pop_age_dist, sample_age_dist_stratified, "年龄分组")

# Step4: 可视化对比
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# 总体分布
pop_age_dist.plot(kind='bar', ax=axes[0], color='skyblue', alpha=0.8)
axes[0].set_title('总体年龄分布')
axes[0].set_ylabel('比例')
# 简单随机抽样
(sample_age_dist_simple / len(sample_simple)).plot(kind='bar', ax=axes[1], color='orange', alpha=0.8)
axes[1].set_title('简单随机抽样分布')
axes[1].set_ylim(axes[0].get_ylim())
# 分层抽样
(sample_age_dist_stratified / len(sample_stratified)).plot(kind='bar', ax=axes[2], color='green', alpha=0.8)
axes[2].set_title('分层随机抽样分布')
axes[2].set_ylim(axes[0].get_ylim())
plt.tight_layout()
# plt.savefig('sampling_comparison.png', dpi=300)

# Step5: 多维度交叉验证
contingency_table_pop = pd.crosstab(population['age_group'], population['region'])
contingency_table_sample = pd.crosstab(sample_stratified['age_group'], sample_stratified['region'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table_sample)
print("\n年龄×地域交叉表卡方检验:")
print(f"卡方统计量: {chi2:.4f}, p值: {p_value:.4f}")
```
关键要点
- 分层变量选择:选择对研究目标影响大的变量(如调研用户满意度,按消费层级分层)
- 样本量分配:比例分配(按层大小)vs 最优分配(按层方差)
- 卡方检验假设:期望频数≥5,否则需合并类别或改用Fisher精确检验(示例见下)
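scipy的 fisher_exact 只支持2×2列联表,下面用一组虚构的小样本频数演示用法(频数纯属示意,实际应填入真实观测):

```python
from scipy.stats import fisher_exact

# 虚构示例:某个小众层中"高消费/非高消费"的观测频数 vs 参照频数
table = [[3, 17],
         [8, 12]]
odds_ratio, p_value = fisher_exact(table)
print(f"Fisher精确检验:odds ratio={odds_ratio:.2f}, p值={p_value:.4f}")
# p > 0.05 时可认为两组分布无显著差异
```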
案例9:相关性分析与因果推断初探
业务背景
发现"购买会员的用户留存率高30%",需判断是会员权益提升留存,还是本身活跃用户更愿意购买会员(选择偏差)。
技术方案
使用倾向得分匹配(PSM)消除混淆因素,结合双重差分(DID)验证因果效应。
```python
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

# 模拟用户数据(包含混淆变量)
np.random.seed(42)
n_users = 5000

# 混淆变量:活跃度、使用时长、历史消费
activity_score = np.random.uniform(0, 100, n_users)
usage_days = np.random.poisson(20, n_users)
historical_purchase = np.random.uniform(0, 500, n_users)

# 购买会员的倾向(活跃用户更可能购买)
propensity = 1 / (1 + np.exp(-(
    0.05 * activity_score + 0.1 * usage_days + 0.002 * historical_purchase - 5
)))
is_member = np.random.binomial(1, propensity)

# 留存率(受会员和活跃度影响)
retention_prob = 1 / (1 + np.exp(-(
    0.03 * activity_score + 0.05 * usage_days + 15 * is_member - 3  # 会员真实效应
)))
is_retained = np.random.binomial(1, retention_prob)

df_users = pd.DataFrame({
    'activity_score': activity_score,
    'usage_days': usage_days,
    'historical_purchase': historical_purchase,
    'is_member': is_member,
    'is_retained': is_retained
})

print("原始数据统计:")
print(df_users.groupby('is_member')['is_retained'].agg(['mean', 'count']))

# Step1: 倾向得分计算
X = df_users[['activity_score', 'usage_days', 'historical_purchase']]
y = df_users['is_member']
ps_model = LogisticRegression()
ps_model.fit(X, y)
df_users['propensity_score'] = ps_model.predict_proba(X)[:, 1]

# Step2: 1:1最近邻匹配
treatment = df_users[df_users['is_member'] == 1]
control = df_users[df_users['is_member'] == 0]

# 使用KNN找到最相似的对照用户
knn = NearestNeighbors(n_neighbors=1)
knn.fit(control[['propensity_score']])
distances, indices = knn.kneighbors(treatment[['propensity_score']])

# 构建匹配后的数据集
matched_control = control.iloc[indices.flatten()].reset_index(drop=True)
matched_treatment = treatment.reset_index(drop=True)
matched_data = pd.concat([
    matched_treatment.assign(group='treatment'),
    matched_control.assign(group='control')
], ignore_index=True)

print("\n匹配后样本量:", len(matched_data))

# Step3: 匹配质量检验(协变量平衡性)
print("\n匹配前协变量差异:")
for col in ['activity_score', 'usage_days', 'historical_purchase']:
    diff_before = df_users[df_users['is_member'] == 1][col].mean() - df_users[df_users['is_member'] == 0][col].mean()
    print(f"{col}: {diff_before:.2f}")

print("\n匹配后协变量差异:")
for col in ['activity_score', 'usage_days', 'historical_purchase']:
    diff_after = matched_treatment[col].mean() - matched_control[col].mean()
    print(f"{col}: {diff_after:.2f}")

# Step4: 计算平均处理效应(ATT)
att = matched_treatment['is_retained'].mean() - matched_control['is_retained'].mean()
print(f"\n平均处理效应(ATT): {att*100:.2f}%")
print("解释:在消除混淆因素后,会员提升留存的净效应")

# Step5: 可视化倾向得分分布
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# 匹配前
axes[0].hist(df_users[df_users['is_member'] == 1]['propensity_score'],
             bins=30, alpha=0.5, label='会员', color='blue')
axes[0].hist(df_users[df_users['is_member'] == 0]['propensity_score'],
             bins=30, alpha=0.5, label='非会员', color='red')
axes[0].set_title('匹配前倾向得分分布')
axes[0].legend()
# 匹配后
axes[1].hist(matched_treatment['propensity_score'], bins=30, alpha=0.5, label='会员', color='blue')
axes[1].hist(matched_control['propensity_score'], bins=30, alpha=0.5, label='匹配的对照组', color='red')
axes[1].set_title('匹配后倾向得分分布')
axes[1].legend()
plt.tight_layout()
# plt.savefig('psm_propensity_distribution.png', dpi=300)

# Step6: 敏感性分析(检验隐藏偏差的影响)
from scipy.stats import binom
n_pairs = len(matched_treatment)
n_concordant = np.sum((matched_treatment['is_retained'] == 1) & (matched_control['is_retained'] == 0))

# Rosenbaum边界检验(简化版)
Gamma = 2.0  # 假设未观测混淆使几率比扩大2倍
p_upper = binom.cdf(n_concordant, n_pairs, Gamma / (1 + Gamma))
print(f"\n敏感性分析(Γ={Gamma}): p值上界 = {p_upper:.4f}")
if p_upper < 0.05:
    print("结论:即使存在适度隐藏偏差,因果效应仍显著")
```
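技术方案中还提到了双重差分(DID)。下面是一个极简的DID示意:假设另外观测到会员开通前后两期的留存指标(pre、post 两列为模拟数据,系数设定仅为演示),用"(处理组后-前)-(对照组后-前)"估计净效应:

```python
import numpy as np
import pandas as pd

# 模拟面板数据:treated表示是否开通会员,pre/post为两期留存指标(示例假设)
rng = np.random.default_rng(42)
n = 2000
treated = rng.binomial(1, 0.3, n)
pre = 0.40 + 0.10 * treated + rng.normal(0, 0.05, n)         # 处理组基线更高(选择偏差)
post = pre + 0.02 + 0.05 * treated + rng.normal(0, 0.05, n)  # 共同时间趋势 + 约5%的会员净效应

panel = pd.DataFrame({'treated': treated, 'pre': pre, 'post': post})

# 双重差分:同时消除组间基线差异与共同时间趋势
diff_treated = (panel.loc[panel.treated == 1, 'post'] - panel.loc[panel.treated == 1, 'pre']).mean()
diff_control = (panel.loc[panel.treated == 0, 'post'] - panel.loc[panel.treated == 0, 'pre']).mean()
print(f"DID估计的会员净效应:{diff_treated - diff_control:.3f}(模拟真值约0.05)")
```

DID成立依赖"平行趋势"假设,实际项目中应先用处理前多期数据验证两组走势是否平行。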
关键要点
- 倾向得分本质:将多维混淆变量压缩为一维"被处理概率"
- 因果推断局限:PSM只能控制观测到的混淆,无法处理未观测混淆
案例10:文本数据结构化与主题建模
业务背景
分析10万条用户评论,提取核心主题和情感倾向,辅助产品迭代决策。
技术方案
使用TF-IDF提取关键词,LDA(潜在狄利克雷分配)进行主题建模,结合情感分析量化用户态度。
```python
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import jieba
import jieba.analyse
import matplotlib.pyplot as plt
import seaborn as sns

# 模拟用户评论数据
np.random.seed(42)
comments = [
    "这款手机拍照效果非常好,夜景模式很强大",
    "电池续航一般,充电速度还行",
    "屏幕显示效果出色,色彩鲜艳",
    "系统流畅度不错,但有时会卡顿",
    "外观设计很漂亮,手感也很好",
    "价格有点贵,性价比一般",
    "拍照功能强大,支持多种模式",
    "电池不耐用,一天要充两次电",
    "屏幕分辨率高,看视频很舒服",
    "系统优化还需改进,偶尔闪退"
] * 1000  # 模拟10000条评论

df_comments = pd.DataFrame({'comment': comments})

# Step1: 中文分词与停用词过滤
stopwords = set(['的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都',
                 '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会',
                 '着', '没有', '看', '好'])

def tokenize_chinese(text):
    """中文分词并去除停用词"""
    words = jieba.cut(text)
    return ' '.join([w for w in words if w not in stopwords and len(w) > 1])

df_comments['tokens'] = df_comments['comment'].apply(tokenize_chinese)

# Step2: TF-IDF特征提取
tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(df_comments['tokens'])

# 提取高频关键词
feature_names = tfidf.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1
top_keywords = sorted(zip(feature_names, tfidf_scores), key=lambda x: x[1], reverse=True)[:20]
print("TF-IDF Top 20关键词:")
for word, score in top_keywords:
    print(f"{word}: {score:.2f}")

# Step3: LDA主题建模
n_topics = 5
count_vectorizer = CountVectorizer(max_features=100)
count_matrix = count_vectorizer.fit_transform(df_comments['tokens'])

lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    max_iter=20,
    learning_method='batch'
)
lda.fit(count_matrix)

# 展示每个主题的top词
def display_topics(model, feature_names, no_top_words=10):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        top_words_idx = topic.argsort()[-no_top_words:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics[f"主题{topic_idx+1}"] = top_words
    return topics

feature_names_count = count_vectorizer.get_feature_names_out()
topics = display_topics(lda, feature_names_count, 10)
print("\n\nLDA主题建模结果:")
for topic_name, words in topics.items():
    print(f"{topic_name}: {', '.join(words)}")

# Step4: 文档-主题分布
doc_topic_dist = lda.transform(count_matrix)
df_comments['dominant_topic'] = doc_topic_dist.argmax(axis=1)
print("\n各主题文档数量:")
print(df_comments['dominant_topic'].value_counts().sort_index())

# Step5: 简易情感分析(基于关键词)
positive_words = ['好', '强大', '出色', '鲜艳', '流畅', '漂亮', '舒服', '不错']
negative_words = ['一般', '卡顿', '贵', '不耐用', '闪退', '改进']

def sentiment_score(text):
    """简单的情感打分:正面词+1,负面词-1"""
    score = 0
    for word in positive_words:
        score += text.count(word)
    for word in negative_words:
        score -= text.count(word)
    return score

df_comments['sentiment'] = df_comments['comment'].apply(sentiment_score)
df_comments['sentiment_label'] = df_comments['sentiment'].apply(
    lambda x: '正面' if x > 0 else ('负面' if x < 0 else '中性')
)
print("\n情感分布:")
print(df_comments['sentiment_label'].value_counts())

# Step6: 主题-情感交叉分析
topic_sentiment = pd.crosstab(
    df_comments['dominant_topic'],
    df_comments['sentiment_label'],
    normalize='index'
) * 100
print("\n主题-情感交叉分析(百分比):")
print(topic_sentiment.round(2))

# 可视化主题-情感热力图
plt.figure(figsize=(10, 6))
sns.heatmap(topic_sentiment, annot=True, fmt='.1f', cmap='RdYlGn', center=50)
plt.title('各主题的情感分布')
plt.xlabel('情感倾向')
plt.ylabel('主题编号')
# plt.savefig('topic_sentiment_heatmap.png', dpi=300)

# Step7: 关键词云图(需wordcloud库)
# from wordcloud import WordCloud
# text_corpus = ' '.join(df_comments['tokens'])
# wordcloud = WordCloud(
#     font_path='simhei.ttf',  # 中文字体路径
#     width=800,
#     height=400,
#     background_color='white'
# ).generate(text_corpus)
# plt.figure(figsize=(12, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.savefig('wordcloud.png', dpi=300)
```
关键要点
- LDA参数调优:主题数k通过困惑度(Perplexity)和一致性(Coherence)确定,困惑度计算示例见下
- 情感分析升级:可使用预训练模型(BERT、RoBERTa)提升准确率
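困惑度可以直接用scikit-learn的 perplexity 方法计算;下面的示意沿用上文向量化得到的 count_matrix,在几个候选主题数之间比较(一致性指标需借助gensim等库,此处从略):

```python
from sklearn.decomposition import LatentDirichletAllocation

# 沿用上文的 count_matrix;困惑度越低通常越好
for k in [3, 5, 8, 10]:
    lda_k = LatentDirichletAllocation(n_components=k, random_state=42,
                                      max_iter=20, learning_method='batch')
    lda_k.fit(count_matrix)
    print(f"主题数={k}, 困惑度={lda_k.perplexity(count_matrix):.1f}")
# 实践中结合困惑度拐点与主题词的可解释性共同确定k
```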
总结与进阶路线
本文深入剖析了Python数据分析的10个高频场景,覆盖了从数据预处理到机器学习应用的完整链路。每个案例都融合了统计学原理、工程实践和业务思维,体现了数据分析的三大核心能力:
技术能力矩阵
进阶学习建议
- 可视化工具:Plotly交互式图表、Tableau BI看板
- 模型部署:Flask API、Docker容器化、模型监控
数据分析是一门兼具科学严谨性和艺术创造性的学科。真正的专家不仅精通工具和算法,更能从数据中洞察业务本质,用数据驱动决策。希望本文能为大家的数据分析之路提供实战指引,持续精进,终成大器。