# pip install numpy pandas matplotlib seaborn scipy scikit-learn openpyxl xlsxwriter python-docx fastdtw tslearn joblib python-dateutil# pip install scikit-learn-extra# 先更新pip# python -m pip install --upgrade pip# 安装基本依赖(已安装的会跳过)# pip install numpy pandas matplotlib seaborn scipy scikit-learn openpyxl xlsxwriter python-docx# 安装fastdtw和tslearn(替代原始的dtw包)# pip install fastdtw tslearn# 安装并行计算所需的包# pip install joblib# 安装其他可能需要的包# pip install python-dateutil# 注意:如果sklearn_extra无法安装,代码会自动使用替代方法"""21. K均值纵向聚类分析(基于DTW距离)- Python版============================================"""import osimport sysimport warningswarnings.filterwarnings('ignore')import numpy as npimport pandas as pdfrom datetime import datetimeimport matplotlib.pyplot as pltimport matplotlib as mplimport seaborn as snsfrom scipy import statsfrom scipy.spatial.distance import pdist, squareformfrom scipy.cluster.hierarchy import dendrogram, linkagefrom scipy.spatial.distance import cdistfrom sklearn.preprocessing import StandardScalerfrom sklearn.cluster import KMeansfrom sklearn.metrics import silhouette_score, silhouette_samplesfrom sklearn.manifold import MDSfrom sklearn.decomposition import PCA# 尝试导入KMedoids,如果失败则使用替代方法try: from sklearn_extra.cluster import KMedoids KMEDOIDS_AVAILABLE = Trueprint("sklearn_extra.KMedoids 可用")except ImportError: KMEDOIDS_AVAILABLE = Falseprint("sklearn_extra.KMedoids 不可用,将使用替代方法")import fastdtwfrom scipy.spatial.distance import euclideanimport multiprocessing as mpfrom concurrent.futures import ProcessPoolExecutorfrom joblib import Parallel, delayedimport plotly.graph_objects as goimport plotly.express as pxfrom plotly.subplots import make_subplotsfrom openpyxl import Workbookimport xlsxwriterimport docxfrom docx.shared import Inches, Pt, RGBColorfrom docx.enum.text import WD_ALIGN_PARAGRAPHfrom docx.enum.table import WD_TABLE_ALIGNMENT# 设置中文字体plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']plt.rcParams['axes.unicode_minus'] = False# 设置绘图风格sns.set_style("whitegrid")plt.style.use('seaborn-v0_8-whitegrid')# ============================================# 1. 初始化设置# ============================================def setup_environment():"""设置工作环境和结果文件夹"""# 获取桌面路径if os.name == 'nt': # Windows desktop_path = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')else: # Mac/Linux desktop_path = os.path.join(os.path.expanduser('~'), 'Desktop')# 设置工作目录 os.chdir(desktop_path)print(f"工作目录设置为: {desktop_path}")# 创建结果文件夹 results_dir = "21-DTW-Kmeans结果"if not os.path.exists(results_dir): os.makedirs(results_dir)print(f"创建结果文件夹: {results_dir}")else:print(f"结果文件夹已存在: {results_dir}")# 创建子文件夹用于保存不同类型的输出 subfolders = ['data', 'plots', 'tables', 'reports']for folder in subfolders: folder_path = os.path.join(results_dir, folder)if not os.path.exists(folder_path): os.makedirs(folder_path)return results_dir# ============================================# 2. 数据加载与预处理# ============================================def load_and_preprocess_data(results_dir):"""加载和预处理数据"""print("步骤1: 加载和预处理数据...")# 读取数据 data_file = "longitudinal_data.xlsx" try: simple_data = pd.read_excel(data_file, sheet_name='Simple_Dataset')print(f"成功读取数据,维度: {simple_data.shape}")print(f"数据列: {list(simple_data.columns)}") except FileNotFoundError:print(f"错误: 找不到文件 {data_file}")print("创建示例数据以供测试...")# 创建示例数据 np.random.seed(123) n_subjects = 100 n_timepoints = 10 ids = [f"ID_{i:03d}"for i in range(1, n_subjects + 1)]times = list(range(1, n_timepoints + 1)) data = []for i, subject_id in enumerate(ids):# 创建不同的时间序列模式if i < 30:# 模式1: 上升趋势 base = np.linspace(0, 5, n_timepoints)elif i < 60:# 模式2: 下降趋势 base = np.linspace(5, 0, n_timepoints)elif i < 80:# 模式3: 凸形 base = np.array([0, 1, 3, 5, 7, 8, 7, 5, 3, 1])else:# 模式4: 凹形 base = np.array([8, 6, 4, 2, 0, 1, 2, 3, 4, 5])# 添加随机噪声 noise = np.random.normal(0, 0.5, n_timepoints) outcomes = base + noisefor j, time in enumerate(times): data.append({'ID': subject_id,'Time': time,'Outcome': outcomes[j],'Age': np.random.randint(20, 60),'Sex': np.random.choice(['M', 'F']),'Treatment': np.random.choice(['A', 'B', 'C']),'Baseline_Score': np.random.uniform(10, 90),'School_ID': f"SCH_{np.random.randint(1, 10):03d}",'Measurement_Date': datetime(2023, 1, 1) + pd.Timedelta(days=np.random.randint(0, 365)) }) simple_data = pd.DataFrame(data)# 保存示例数据 simple_data.to_excel(data_file, index=False, sheet_name='Simple_Dataset')print(f"已创建示例数据并保存到 {data_file}")# 转换数据类型 simple_data = simple_data.copy() simple_data['ID'] = simple_data['ID'].astype(str) simple_data['Time'] = pd.to_numeric(simple_data['Time']) simple_data['Outcome'] = pd.to_numeric(simple_data['Outcome']) simple_data['Age'] = pd.to_numeric(simple_data['Age']) simple_data['Sex'] = simple_data['Sex'].astype('category') simple_data['Treatment'] = simple_data['Treatment'].astype('category') simple_data['Baseline_Score'] = pd.to_numeric(simple_data['Baseline_Score']) simple_data['School_ID'] = simple_data['School_ID'].astype(str)if'Measurement_Date'in simple_data.columns: simple_data['Measurement_Date'] = pd.to_datetime(simple_data['Measurement_Date'])# 处理缺失值 - 使用线性插值print(f"缺失值处理前: {simple_data['Outcome'].isnull().sum()} 个缺失值")# 对每个ID的时间序列进行插值 simple_data['Outcome_interp'] = simple_data.groupby('ID')['Outcome'].transform( lambda x: x.interpolate(method='linear', limit_direction='both') )# 如果还有缺失值,使用组均值填充if simple_data['Outcome_interp'].isnull().sum() > 0: group_means = simple_data.groupby('ID')['Outcome_interp'].transform('mean') simple_data['Outcome_interp'] = simple_data['Outcome_interp'].fillna(group_means)print(f"缺失值处理后: {simple_data['Outcome_interp'].isnull().sum()} 个缺失值")# 保存预处理后的数据 preprocessed_file = os.path.join(results_dir, 'data', '01_预处理数据.csv') simple_data.to_csv(preprocessed_file, index=False, encoding='utf-8-sig')print(f"预处理数据已保存到: {preprocessed_file}")return simple_data# ============================================# 3. 创建时间序列矩阵# ============================================def create_time_series_matrix(simple_data, results_dir):"""创建时间序列矩阵"""print("步骤2: 创建时间序列矩阵...")# 创建宽格式的时间序列矩阵 ts_matrix = simple_data.pivot( index='ID', columns='Time', values='Outcome_interp' )# 添加列名前缀 ts_matrix.columns = [f'Time_{int(col)}'for col in ts_matrix.columns]print(f"时间序列矩阵维度: {ts_matrix.shape}")print(f"矩阵中的缺失值: {ts_matrix.isnull().sum().sum()}")# 处理缺失值 - 使用列均值填充if ts_matrix.isnull().sum().sum() > 0: ts_matrix = ts_matrix.fillna(ts_matrix.mean())# 标准化时间序列(按行) ts_matrix_array = ts_matrix.values# 对每一行进行标准化 ts_matrix_scaled = np.zeros_like(ts_matrix_array)for i in range(ts_matrix_array.shape[0]): row = ts_matrix_array[i, :]if np.std(row) > 0: ts_matrix_scaled[i, :] = (row - np.mean(row)) / np.std(row)# 转换为DataFrame ts_matrix_scaled_df = pd.DataFrame( ts_matrix_scaled, index=ts_matrix.index, columns=ts_matrix.columns )# 保存时间序列矩阵 ts_matrix_file = os.path.join(results_dir, 'data', '02_时间序列矩阵.csv') ts_matrix_scaled_df.to_csv(ts_matrix_file, encoding='utf-8-sig')print(f"时间序列矩阵已保存到: {ts_matrix_file}")return ts_matrix_scaled_df, ts_matrix_scaled# ============================================# 4. 动态时间规整(DTW)距离计算# ============================================def dtw_distance(ts1, ts2):"""计算两个时间序列的DTW距离""" try:# 使用fastdtw计算距离 distance, _ = fastdtw.fastdtw(ts1, ts2, dist=euclidean)return distance except Exception as e:# 如果DTW失败,使用欧氏距离print(f"DTW计算失败,使用欧氏距离: {e}")return np.sqrt(np.sum((ts1 - ts2) ** 2))

print("正在计算DTW距离矩阵...")# 使用joblib并行计算 try: from joblib import Parallel, delayed import multiprocessing# 计算上三角矩阵 def compute_row(i): row = np.zeros(n)for j in range(i + 1, n): row[j] = dtw_distance(ts_sample[i], ts_sample[j])return row# 使用并行计算 n_jobs = min(multiprocessing.cpu_count(), 8) # 最多使用8个核心 results = Parallel(n_jobs=n_jobs)(delayed(compute_row)(i) for i in range(n))# 构建完整的距离矩阵for i in range(n): dist_matrix[i, :] = results[i] dist_matrix[:, i] = results[i] except Exception as e:print(f"并行计算失败,使用顺序计算: {e}")# 顺序计算for i in range(n):for j in range(i + 1, n): dist = dtw_distance(ts_sample[i], ts_sample[j]) dist_matrix[i, j] = dist dist_matrix[j, i] = distif (i + 1) % 10 == 0:print(f"已完成 {i + 1}/{n} 个样本...")# 确保对角线为0 np.fill_diagonal(dist_matrix, 0)print(f"距离矩阵维度: {dist_matrix.shape}")print(f"距离矩阵中的NaN值: {np.isnan(dist_matrix).sum()}")print(f"距离矩阵中的Inf值: {np.isinf(dist_matrix).sum()}")# 处理异常值if np.isnan(dist_matrix).sum() > 0: finite_vals = dist_matrix[np.isfinite(dist_matrix)]if len(finite_vals) > 0: max_val = np.max(finite_vals) dist_matrix[np.isnan(dist_matrix)] = max_val * 1.1else: dist_matrix[np.isnan(dist_matrix)] = 1000if np.isinf(dist_matrix).sum() > 0: finite_vals = dist_matrix[np.isfinite(dist_matrix)]if len(finite_vals) > 0: max_val = np.max(finite_vals) dist_matrix[np.isinf(dist_matrix)] = max_val * 1.1else: dist_matrix[np.isinf(dist_matrix)] = 1000# 保存距离矩阵 dist_matrix_file = os.path.join(results_dir, 'data', '03_DTW距离矩阵.csv') pd.DataFrame(dist_matrix).to_csv(dist_matrix_file, index=False, encoding='utf-8-sig')print(f"DTW距离矩阵已保存到: {dist_matrix_file}")return dist_matrix, sample_indices, ts_sample# ============================================# 5. 确定最佳聚类数# ============================================def determine_optimal_clusters(dist_matrix, results_dir):"""确定最佳聚类数"""print("步骤4: 确定最佳聚类数...") n_samples = dist_matrix.shape[0] k_range = list(range(2, min(9, n_samples))) # 尝试2-8个聚类 wss = [] # 簇内平方和 sil_scores = [] # 轮廓系数for k in k_range: try:if KMEDOIDS_AVAILABLE:# 使用KMedoids算法(PAM) kmedoids = KMedoids(n_clusters=k, metric='precomputed', random_state=123, init='k-medoids++') clusters = kmedoids.fit_predict(dist_matrix)# 计算簇内距离和(替代WSS) wss_value = 0for i in range(k): cluster_points = np.where(clusters == i)[0]if len(cluster_points) > 0:# 计算簇内所有点到质心的平均距离if len(cluster_points) > 1:# 获取质心索引 medoid_idx = kmedoids.medoid_indices_[i]# 计算所有点到质心的距离for point in cluster_points:if point != medoid_idx: wss_value += dist_matrix[point, medoid_idx] wss.append(wss_value)# 计算轮廓系数if len(np.unique(clusters)) > 1: sil_score = silhouette_score(dist_matrix, clusters, metric='precomputed') sil_scores.append(sil_score)else: sil_scores.append(0)else:# 使用MDS降维后KMeansprint(f"KMedoids不可用,使用MDS+KMeans进行K={k}的聚类...")# 使用MDS将距离矩阵转换为特征向量 mds = MDS(n_components=min(10, n_samples - 1), dissimilarity='precomputed', random_state=123, normalized_stress='auto') mds_features = mds.fit_transform(dist_matrix)# 使用KMeans聚类 kmeans = KMeans(n_clusters=k, random_state=123, n_init=20) clusters = kmeans.fit_predict(mds_features)# 计算轮廓系数(基于MDS特征)if len(np.unique(clusters)) > 1: sil_score = silhouette_score(mds_features, clusters, metric='euclidean') sil_scores.append(sil_score)else: sil_scores.append(0)# 计算簇内距离和(基于MDS特征) wss_value = kmeans.inertia_ if hasattr(kmeans, 'inertia_') else 0 wss.append(wss_value)print(f"K = {k} 完成 - 轮廓系数: {sil_scores[-1]:.3f}") except Exception as e:print(f"K = {k} 失败: {e}") wss.append(np.nan) sil_scores.append(np.nan)# 找到最佳K值 valid_indices = ~np.isnan(sil_scores)if any(valid_indices): optimal_k_sil = k_range[np.nanargmax(sil_scores)]else: optimal_k_sil = 2# 肘部法则(找到拐点)if any(~np.isnan(wss)): wss_valid = [w for w in wss if not np.isnan(w)]if len(wss_valid) > 1:# 计算相对变化 wss_diff = np.diff(wss_valid) wss_rel_diff = np.abs(wss_diff / wss_valid[:-1])# 找到第一个相对变化小于10%的点 elbow_points = np.where(wss_rel_diff < 0.1)[0]if len(elbow_points) > 0: optimal_k_elbow = k_range[elbow_points[0] + 1]else: optimal_k_elbow = k_range[-1]else: optimal_k_elbow = 3else: optimal_k_elbow = 3print(f"基于轮廓系数的最佳K值: {optimal_k_sil}")print(f"基于肘部法则的最佳K值: {optimal_k_elbow}")# 选择最佳K值 optimal_k = optimal_k_sil if optimal_k_sil >= 2 else (optimal_k_elbow if optimal_k_elbow >= 2 else 3)# 创建肘部法则图和轮廓系数图 create_cluster_selection_plots(k_range, wss, sil_scores, optimal_k, results_dir)return optimal_k, k_range, wss, sil_scoresdef create_cluster_selection_plots(k_range, wss, sil_scores, optimal_k, results_dir):"""创建聚类选择可视化图"""# 肘部法则图 fig1, ax1 = plt.subplots(figsize=(10, 6)) ax1.plot(k_range, wss, 'bo-', linewidth=2, markersize=8) ax1.axvline(x=optimal_k, color='r', linestyle='--', linewidth=2) ax1.set_xlabel('聚类数 (k)', fontsize=12) ax1.set_ylabel('簇内距离和', fontsize=12) ax1.set_title('肘部法则图 (Elbow Method)', fontsize=14, fontweight='bold') ax1.grid(True, alpha=0.3) ax1.text(optimal_k + 0.1, max([w for w in wss if not np.isnan(w)]) * 0.9, f'建议聚类数: {optimal_k}', fontsize=11, color='red', fontweight='bold') elbow_plot_file = os.path.join(results_dir, 'plots', '01_肘部法则图.png') plt.tight_layout() plt.savefig(elbow_plot_file, dpi=300, bbox_inches='tight') plt.close()# 轮廓系数图 fig2, ax2 = plt.subplots(figsize=(10, 6)) ax2.plot(k_range, sil_scores, 'go-', linewidth=2, markersize=8) ax2.axvline(x=optimal_k, color='r', linestyle='--', linewidth=2) ax2.set_xlabel('聚类数 (k)', fontsize=12) ax2.set_ylabel('平均轮廓系数', fontsize=12) ax2.set_title('轮廓系数图 (Silhouette Method)', fontsize=14, fontweight='bold') ax2.grid(True, alpha=0.3) ax2.text(optimal_k + 0.1, max([s for s in sil_scores if not np.isnan(s)]) * 0.9, f'最佳聚类数: {optimal_k}', fontsize=11, color='red', fontweight='bold') silhouette_plot_file = os.path.join(results_dir, 'plots', '02_轮廓系数图.png') plt.tight_layout() plt.savefig(silhouette_plot_file, dpi=300, bbox_inches='tight') plt.close()print("聚类选择图已保存")# ============================================# 6. K均值聚类(基于DTW)# ============================================def perform_clustering(dist_matrix, optimal_k, ts_sample, sample_indices, ts_matrix_scaled_df, simple_data, results_dir):"""执行K均值聚类"""print("步骤5: 执行K均值聚类...") n_samples = dist_matrix.shape[0] try:if KMEDOIDS_AVAILABLE:# 使用KMedoids算法(PAM) kmedoids = KMedoids(n_clusters=optimal_k, metric='precomputed', random_state=123, init='k-medoids++') clusters = kmedoids.fit_predict(dist_matrix) medoid_indices = kmedoids.medoid_indices_# 计算轮廓系数 silhouette_vals = silhouette_samples(dist_matrix, clusters, metric='precomputed')print(f"KMedoids聚类完成,使用K={optimal_k}")else:print("KMedoids不可用,使用MDS+KMeans聚类...")# 使用MDS将距离矩阵转换为特征向量,然后使用KMeans mds = MDS(n_components=min(10, n_samples - 1), dissimilarity='precomputed', random_state=123, normalized_stress='auto') mds_features = mds.fit_transform(dist_matrix)# 使用KMeans聚类 kmeans = KMeans(n_clusters=optimal_k, random_state=123, n_init=20) clusters = kmeans.fit_predict(mds_features)# 计算轮廓系数(基于MDS特征) silhouette_vals = silhouette_samples(mds_features, clusters, metric='euclidean') medoid_indices = Noneprint(f"MDS+KMeans聚类完成,使用K={optimal_k}") except Exception as e:print(f"聚类失败: {e}")print("使用简单随机分配...")# 如果都失败,分配随机聚类 clusters = np.random.randint(0, optimal_k, n_samples) silhouette_vals = np.full(n_samples, np.nan) medoid_indices = None# 创建聚类分配结果if len(sample_indices) == ts_matrix_scaled_df.shape[0]:# 使用了全部样本 sample_ids = ts_matrix_scaled_df.index.tolist()else:# 使用了子样本 sample_ids = [f"Sample_{i}"for i in range(n_samples)] cluster_assignments = pd.DataFrame({'ID': sample_ids,'Cluster': clusters + 1, # 从1开始编号'Silhouette_Width': silhouette_vals })# 如果有原始ID,使用原始IDif len(sample_indices) == ts_matrix_scaled_df.shape[0]: cluster_assignments['ID'] = ts_matrix_scaled_df.index.tolist()# 保存聚类分配结果 assignments_file = os.path.join(results_dir, 'data', '04_聚类分配结果.csv') cluster_assignments.to_csv(assignments_file, index=False, encoding='utf-8-sig')print(f"聚类分配结果已保存到: {assignments_file}")# 合并原始数据if len(sample_indices) == ts_matrix_scaled_df.shape[0]:# 使用了全部样本 cluster_data = simple_data.copy()# 添加聚类信息 cluster_mapping = dict(zip(cluster_assignments['ID'], cluster_assignments['Cluster'])) cluster_data['Cluster'] = cluster_data['ID'].map(cluster_mapping)else:# 只保存样本数据 cluster_data = pd.DataFrame({'Sample_ID': sample_ids,'Cluster': clusters + 1 })# 保存完整聚类数据 cluster_data_file = os.path.join(results_dir, 'data', '05_完整聚类数据.csv') cluster_data.to_csv(cluster_data_file, index=False, encoding='utf-8-sig')return cluster_assignments, clusters, silhouette_vals# ============================================# 7. 聚类质量评估# ============================================def evaluate_clustering_quality(cluster_assignments, results_dir):"""评估聚类质量"""print("步骤6: 评估聚类质量...")# 计算聚类统计量 cluster_stats = cluster_assignments.groupby('Cluster').agg({'ID': 'count','Silhouette_Width': ['mean', 'std'] }).round(3)# 重命名列 cluster_stats.columns = ['n', 'Avg_Silhouette', 'SD_Silhouette'] cluster_stats = cluster_stats.reset_index()# 计算比例 total_samples = len(cluster_assignments) cluster_stats['Proportion'] = (cluster_stats['n'] / total_samples).round(3)# 总体统计 overall_stats = pd.DataFrame({'Metric': ['总样本数', '聚类数', '平均轮廓系数', '最小轮廓系数'],'Value': [ total_samples, len(cluster_stats), cluster_assignments['Silhouette_Width'].mean() if not cluster_assignments['Silhouette_Width'].isnull().all() else 0, cluster_assignments['Silhouette_Width'].min() if not cluster_assignments['Silhouette_Width'].isnull().all() else 0 ] })# 保存统计结果 cluster_stats_file = os.path.join(results_dir, 'tables', '01_聚类统计.csv') cluster_stats.to_csv(cluster_stats_file, index=False, encoding='utf-8-sig') overall_stats_file = os.path.join(results_dir, 'tables', '02_总体统计.csv') overall_stats.to_csv(overall_stats_file, index=False, encoding='utf-8-sig')print("聚类质量评估完成")if not cluster_assignments['Silhouette_Width'].isnull().all():print(f"平均轮廓系数: {overall_stats.loc[overall_stats['Metric'] == '平均轮廓系数', 'Value'].values[0]:.3f}")return cluster_stats, overall_stats# ============================================# 8. 可视化分析# ============================================def create_visualizations(dist_matrix, ts_sample, clusters, cluster_assignments, simple_data, optimal_k, results_dir):"""创建可视化图表"""print("步骤7: 创建可视化图表...") n_clusters = optimal_k# 8.1 轮廓图 try: create_silhouette_plot(dist_matrix, clusters, cluster_assignments, n_clusters, results_dir) except Exception as e:print(f"轮廓图创建失败: {e}")# 8.2 中心序列图 try: create_centroid_plots(ts_sample, clusters, n_clusters, results_dir) except Exception as e:print(f"中心序列图创建失败: {e}")# 8.3 分面序列图 try: create_facet_plots(ts_sample, clusters, n_clusters, results_dir) except Exception as e:print(f"分面序列图创建失败: {e}")# 8.4 MDS散点图 try: create_mds_plot(dist_matrix, clusters, n_clusters, results_dir) except Exception as e:print(f"MDS散点图创建失败: {e}")# 8.5 人口学特征热图 try: create_demographic_heatmap(simple_data, cluster_assignments, results_dir) except Exception as e:print(f"人口学热图创建失败: {e}")# 8.6 特征雷达图 try: create_radar_plot(ts_sample, clusters, n_clusters, results_dir) except Exception as e:print(f"雷达图创建失败: {e}")# 8.7 综合面板图 try: create_combined_panel(results_dir) except Exception as e:print(f"综合面板图创建失败: {e}")print("所有可视化图表已创建")def create_silhouette_plot(dist_matrix, clusters, cluster_assignments, n_clusters, results_dir):"""创建轮廓图""" try:# 获取轮廓系数 silhouette_vals = silhouette_samples(dist_matrix, clusters, metric='precomputed') fig, ax1 = plt.subplots(figsize=(10, 8))# 设置y轴起点 y_lower = 10# 对每个聚类绘制轮廓for i in range(n_clusters):# 获取第i个聚类的轮廓系数 cluster_silhouette_vals = silhouette_vals[clusters == i]if len(cluster_silhouette_vals) > 0: cluster_silhouette_vals.sort() size_cluster_i = cluster_silhouette_vals.shape[0] y_upper = y_lower + size_cluster_i color = plt.cm.tab10(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_vals, facecolor=color, edgecolor=color, alpha=0.7)# 标记聚类编号 ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i + 1)) y_lower = y_upper + 10 ax1.set_xlabel('轮廓系数值', fontsize=12) ax1.set_ylabel('聚类', fontsize=12) ax1.set_title(f'聚类轮廓图 (K = {n_clusters})', fontsize=14, fontweight='bold')# 平均轮廓系数线 avg_silhouette = np.mean(silhouette_vals) ax1.axvline(x=avg_silhouette, color='red', linestyle='--') ax1.text(avg_silhouette + 0.01, y_lower - 50, f'平均轮廓系数: {avg_silhouette:.3f}', color='red') ax1.set_yticks([]) ax1.grid(True, alpha=0.3) silhouette_plot_file = os.path.join(results_dir, 'plots', '03_轮廓图.png') plt.tight_layout() plt.savefig(silhouette_plot_file, dpi=300, bbox_inches='tight') plt.close() except Exception as e:print(f"创建轮廓图时出错: {e}")# 创建简单的替代图 fig, ax = plt.subplots(figsize=(10, 6)) ax.text(0.5, 0.5, '轮廓图无法生成\n轮廓系数数据不可用', ha='center', va='center', fontsize=14) ax.axis('off') silhouette_plot_file = os.path.join(results_dir, 'plots', '03_轮廓图.png') plt.savefig(silhouette_plot_file, dpi=300, bbox_inches='tight') plt.close()def create_centroid_plots(ts_sample, clusters, n_clusters, results_dir):"""创建中心序列图""" fig, ax = plt.subplots(figsize=(12, 8))# 定义颜色 colors = plt.cm.tab10(np.linspace(0, 1, n_clusters))# 计算每个聚类的中心序列(均值) time_points = np.arange(ts_sample.shape[1])for i in range(n_clusters): cluster_indices = np.where(clusters == i)[0]if len(cluster_indices) > 0: cluster_data = ts_sample[cluster_indices, :] centroid = np.mean(cluster_data, axis=0)# 绘制中心序列 ax.plot(time_points, centroid, color=colors[i], linewidth=3, label=f'簇 {i + 1} (n={len(cluster_indices)})')# 添加阴影表示标准差if len(cluster_indices) > 1: std = np.std(cluster_data, axis=0) ax.fill_between(time_points, centroid - std, centroid + std, color=colors[i], alpha=0.2) ax.set_xlabel('时间点', fontsize=12) ax.set_ylabel('标准化值', fontsize=12) ax.set_title('各簇中心序列(均值±标准差)', fontsize=14, fontweight='bold') ax.legend(loc='best') ax.grid(True, alpha=0.3) centroid_plot_file = os.path.join(results_dir, 'plots', '04_中心序列图.png') plt.tight_layout() plt.savefig(centroid_plot_file, dpi=300, bbox_inches='tight') plt.close()def create_facet_plots(ts_sample, clusters, n_clusters, results_dir):"""创建分面序列图"""# 确定网格布局 n_cols = 2 n_rows = (n_clusters + 1) // 2 fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4 * n_rows))# 如果只有一行,确保axes是二维数组if n_rows == 1: axes = axes.reshape(1, -1) time_points = np.arange(ts_sample.shape[1])for i in range(n_clusters): row = i // n_cols col = i % n_cols ax = axes[row, col] cluster_indices = np.where(clusters == i)[0]if len(cluster_indices) > 0: cluster_data = ts_sample[cluster_indices, :] centroid = np.mean(cluster_data, axis=0)# 绘制所有个体序列for j in range(min(20, len(cluster_indices))): # 最多显示20条线 ax.plot(time_points, cluster_data[j, :], color='gray', alpha=0.3, linewidth=0.5)# 绘制中心序列 ax.plot(time_points, centroid, color='red', linewidth=2.5) ax.set_title(f'簇 {i + 1} (n={len(cluster_indices)})', fontsize=12, fontweight='bold') ax.set_xlabel('时间点') ax.set_ylabel('标准化值') ax.grid(True, alpha=0.3)# 隐藏多余的子图for i in range(n_clusters, n_rows * n_cols): row = i // n_cols col = i % n_cols axes[row, col].axis('off') fig.suptitle('各簇内序列与中心对比', fontsize=16, fontweight='bold') plt.tight_layout() facet_plot_file = os.path.join(results_dir, 'plots', '05_分面序列图.png') plt.savefig(facet_plot_file, dpi=300, bbox_inches='tight') plt.close()def create_mds_plot(dist_matrix, clusters, n_clusters, results_dir):"""创建MDS散点图""" try:# 执行MDS mds = MDS(n_components=2, dissimilarity='precomputed', random_state=123, normalized_stress='auto') mds_coords = mds.fit_transform(dist_matrix) fig, ax = plt.subplots(figsize=(10, 8))# 定义颜色 colors = plt.cm.tab10(np.linspace(0, 1, n_clusters))# 绘制每个聚类的点for i in range(n_clusters): cluster_indices = np.where(clusters == i)[0]if len(cluster_indices) > 0: ax.scatter(mds_coords[cluster_indices, 0], mds_coords[cluster_indices, 1], color=colors[i], s=50, alpha=0.7, label=f'簇 {i + 1}')# 计算并标记聚类中心 cluster_center = np.mean(mds_coords[cluster_indices, :], axis=0) ax.scatter(cluster_center[0], cluster_center[1], color=colors[i], s=200, marker='*', edgecolor='black') ax.set_xlabel('MDS维度 1', fontsize=12) ax.set_ylabel('MDS维度 2', fontsize=12) ax.set_title('基于DTW距离的多维尺度分析(MDS)', fontsize=14, fontweight='bold') ax.legend(loc='best') ax.grid(True, alpha=0.3) mds_plot_file = os.path.join(results_dir, 'plots', '06_MDS散点图.png') plt.tight_layout() plt.savefig(mds_plot_file, dpi=300, bbox_inches='tight') plt.close() except Exception as e:print(f"MDS分析失败: {e}")def create_demographic_heatmap(simple_data, cluster_assignments, results_dir):"""创建人口学特征热图""" try:# 合并人口学信息if'ID'in simple_data.columns and 'ID'in cluster_assignments.columns:# 获取每个ID的第一条记录(基线信息) demographic_data = simple_data.groupby('ID').first().reset_index()# 确保聚类分配中有相同的ID merged_data = pd.merge(demographic_data, cluster_assignments, on='ID', how='inner')# 计算各簇的人口学特征 demo_summary = merged_data.groupby('Cluster').agg({'Age': 'mean','Baseline_Score': 'mean' }).round(2)# 如果有性别信息,计算女性比例if'Sex'in merged_data.columns:# 转换为数值:女性=1,男性=0 merged_data['Sex_numeric'] = (merged_data['Sex'] == 'F').astype(int) sex_summary = merged_data.groupby('Cluster')['Sex_numeric'].mean() * 100 demo_summary['Female_Percent'] = sex_summary.round(1)# 创建热图 fig, ax = plt.subplots(figsize=(10, 6))# 标准化数据用于热图 heatmap_data = demo_summary.T # 转置以使聚类作为列 sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='coolwarm', center=0, ax=ax, cbar_kws={'label': '值'}) ax.set_title('各聚类人口学特征热图', fontsize=14, fontweight='bold') ax.set_xlabel('聚类') ax.set_ylabel('特征') heatmap_file = os.path.join(results_dir, 'plots', '07_人口学热图.png') plt.tight_layout() plt.savefig(heatmap_file, dpi=300, bbox_inches='tight') plt.close() except Exception as e:print(f"人口学热图创建失败: {e}")def create_radar_plot(ts_sample, clusters, n_clusters, results_dir):"""创建特征雷达图""" try:# 计算每个序列的特征 features = []for i in range(ts_sample.shape[0]): ts = ts_sample[i, :] features.append({'Mean': np.mean(ts),'Std': np.std(ts) if np.std(ts) > 0 else 0.001,'Slope': np.polyfit(range(len(ts)), ts, 1)[0] if len(ts) > 1 else 0 }) features_df = pd.DataFrame(features) features_df['Cluster'] = clusters# 计算各聚类的特征平均值 cluster_features = features_df.groupby('Cluster').mean().reset_index()# 标准化特征for col in ['Mean', 'Std', 'Slope']:if cluster_features[col].std() > 0: cluster_features[col] = (cluster_features[col] - cluster_features[col].mean()) / cluster_features[ col].std()# 创建雷达图 fig = plt.figure(figsize=(10, 8)) ax = fig.add_subplot(111, projection='polar')# 特征标签 categories = ['平均水平', '波动程度', '变化趋势'] N = len(categories)# 角度 angles = [n / float(N) * 2 * np.pi for n in range(N)] angles += angles[:1] # 闭合图形# 定义颜色 colors = plt.cm.tab10(np.linspace(0, 1, n_clusters))for i in range(n_clusters): values = cluster_features.iloc[i][['Mean', 'Std', 'Slope']].tolist() values += values[:1] # 闭合图形 ax.plot(angles, values, 'o-', linewidth=2, color=colors[i], label=f'簇 {i + 1}') ax.fill(angles, values, alpha=0.25, color=colors[i]) ax.set_xticks(angles[:-1]) ax.set_xticklabels(categories) ax.set_title('各聚类时间序列特征雷达图', fontsize=14, fontweight='bold') ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0)) radar_file = os.path.join(results_dir, 'plots', '08_特征雷达图.png') plt.tight_layout() plt.savefig(radar_file, dpi=300, bbox_inches='tight') plt.close() except Exception as e:print(f"雷达图创建失败: {e}")def create_combined_panel(results_dir):"""创建综合面板图""" try:# 加载四个主要图表 fig, axes = plt.subplots(2, 2, figsize=(16, 12))# 图表1: 肘部法则图 try: img1 = plt.imread(os.path.join(results_dir, 'plots', '01_肘部法则图.png')) axes[0, 0].imshow(img1) axes[0, 0].axis('off') axes[0, 0].set_title('A. 肘部法则图', fontsize=12, fontweight='bold') except: axes[0, 0].text(0.5, 0.5, '肘部法则图\n无法加载', ha='center', va='center', fontsize=12) axes[0, 0].axis('off')# 图表2: 轮廓系数图 try: img2 = plt.imread(os.path.join(results_dir, 'plots', '02_轮廓系数图.png')) axes[0, 1].imshow(img2) axes[0, 1].axis('off') axes[0, 1].set_title('B. 轮廓系数图', fontsize=12, fontweight='bold') except: axes[0, 1].text(0.5, 0.5, '轮廓系数图\n无法加载', ha='center', va='center', fontsize=12) axes[0, 1].axis('off')# 图表3: 中心序列图 try: img3 = plt.imread(os.path.join(results_dir, 'plots', '04_中心序列图.png')) axes[1, 0].imshow(img3) axes[1, 0].axis('off') axes[1, 0].set_title('C. 中心序列图', fontsize=12, fontweight='bold') except: axes[1, 0].text(0.5, 0.5, '中心序列图\n无法加载', ha='center', va='center', fontsize=12) axes[1, 0].axis('off')# 图表4: MDS散点图 try: img4 = plt.imread(os.path.join(results_dir, 'plots', '06_MDS散点图.png')) axes[1, 1].imshow(img4) axes[1, 1].axis('off') axes[1, 1].set_title('D. MDS散点图', fontsize=12, fontweight='bold') except: axes[1, 1].text(0.5, 0.5, 'MDS散点图\n无法加载', ha='center', va='center', fontsize=12) axes[1, 1].axis('off') fig.suptitle('K均值纵向聚类分析综合可视化', fontsize=16, fontweight='bold') plt.tight_layout() combined_file = os.path.join(results_dir, 'plots', '09_综合可视化面板.png') plt.savefig(combined_file, dpi=300, bbox_inches='tight') plt.close()print(f"综合面板图已保存: {combined_file}") except Exception as e:print(f"综合面板图创建失败: {e}")# ============================================# 9. 创建三线表# ============================================def create_tables(cluster_stats, simple_data, cluster_assignments, ts_sample, clusters, results_dir):"""创建三线表"""print("步骤8: 创建三线表...")# 9.1 聚类统计三线表 cluster_stats_table = cluster_stats.copy() cluster_stats_table.columns = ['聚类', '样本数', '平均轮廓宽度', '轮廓宽度标准差', '比例']# 格式化数字 cluster_stats_table['平均轮廓宽度'] = cluster_stats_table['平均轮廓宽度'].apply( lambda x: f'{x:.3f}'if pd.notnull(x) else'N/A') cluster_stats_table['轮廓宽度标准差'] = cluster_stats_table['轮廓宽度标准差'].apply( lambda x: f'{x:.3f}'if pd.notnull(x) else'N/A') cluster_stats_table['比例'] = cluster_stats_table['比例'].apply(lambda x: f'{x:.3f}')# 保存三线表 stats_table_file = os.path.join(results_dir, 'tables', '03_聚类统计三线表.csv') cluster_stats_table.to_csv(stats_table_file, index=False, encoding='utf-8-sig')# 9.2 聚类人口学特征三线表 try:if'ID'in simple_data.columns and 'ID'in cluster_assignments.columns:# 获取基线数据 baseline_data = simple_data.groupby('ID').first().reset_index()# 合并聚类信息 merged_data = pd.merge(baseline_data, cluster_assignments, on='ID', how='inner')# 创建人口学特征表 demo_table = merged_data.groupby('Cluster').agg({'Age': lambda x: f'{np.mean(x):.1f} ± {np.std(x):.1f}'if len(x) > 0 else'N/A','Baseline_Score': lambda x: f'{np.mean(x):.1f} ± {np.std(x):.1f}'if len(x) > 0 else'N/A' }).reset_index()# 如果有性别信息if'Sex'in merged_data.columns: def female_percent(x):if len(x) > 0:return f'{100 * (x == "F").mean():.1f}%'return'N/A' sex_summary = merged_data.groupby('Cluster')['Sex'].apply(female_percent).reset_index() sex_summary.columns = ['Cluster', '女性比例'] demo_table = pd.merge(demo_table, sex_summary, on='Cluster') demo_table.columns = ['聚类', '平均年龄', '平均基线得分', '女性比例']# 保存三线表 demo_table_file = os.path.join(results_dir, 'tables', '04_人口学特征三线表.csv') demo_table.to_csv(demo_table_file, index=False, encoding='utf-8-sig') except Exception as e:print(f"人口学特征表创建失败: {e}")# 9.3 时间序列特征三线表 try:# 计算每个序列的特征 features = []for i in range(ts_sample.shape[0]): ts = ts_sample[i, :] features.append({'Cluster': clusters[i] + 1,'Mean': np.mean(ts),'Std': np.std(ts) if np.std(ts) > 0 else 0.001,'Slope': np.polyfit(range(len(ts)), ts, 1)[0] if len(ts) > 1 else 0 }) features_df = pd.DataFrame(features)# 创建特征表 ts_feature_table = features_df.groupby('Cluster').agg({'Mean': lambda x: f'{np.mean(x):.3f} ± {np.std(x):.3f}'if len(x) > 0 else'N/A','Std': lambda x: f'{np.mean(x):.3f} ± {np.std(x):.3f}'if len(x) > 0 else'N/A','Slope': lambda x: f'{np.mean(x):.3f} ± {np.std(x):.3f}'if len(x) > 0 else'N/A' }).reset_index() ts_feature_table.columns = ['聚类', '平均均值', '平均标准差', '平均斜率']# 保存三线表 ts_table_file = os.path.join(results_dir, 'tables', '05_时间序列特征三线表.csv') ts_feature_table.to_csv(ts_table_file, index=False, encoding='utf-8-sig') except Exception as e:print(f"时间序列特征表创建失败: {e}")print("三线表已创建")# ============================================# 10. 创建Word分析报告# ============================================def create_word_report(results_dir, n_samples, optimal_k, cluster_stats, overall_stats, ts_matrix_scaled, dist_matrix):"""创建Word分析报告"""print("步骤9: 生成Word分析报告...") try:# 创建Word文档 doc = docx.Document()# 添加标题 title = doc.add_heading('K均值纵向聚类分析报告', 0) title.alignment = WD_ALIGN_PARAGRAPH.CENTER# 添加生成时间 doc.add_paragraph(f'生成时间: {datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}') doc.add_paragraph()# 一、分析摘要 doc.add_heading('一、分析摘要', level=1) doc.add_paragraph(f'本次分析对{n_samples}个样本的时间序列数据进行了K均值纵向聚类分析。') doc.add_paragraph(f'使用动态时间规整(DTW)作为距离度量,通过轮廓系数确定最佳聚类数为{optimal_k}。') avg_silhouette = overall_stats.loc[overall_stats['Metric'] == '平均轮廓系数', 'Value'].values[0] doc.add_paragraph(f'平均轮廓宽度为{avg_silhouette:.3f},表明聚类结构较为合理。') doc.add_paragraph()# 二、分析方法 doc.add_heading('二、分析方法', level=1) doc.add_paragraph('1. 数据预处理:对时间序列进行线性插值处理缺失值,并进行标准化') doc.add_paragraph('2. 距离计算:使用动态时间规整(DTW)算法计算序列间距离') doc.add_paragraph('3. 聚类数确定:结合肘部法则和轮廓系数确定最佳聚类数')if KMEDOIDS_AVAILABLE: doc.add_paragraph('4. 聚类算法:使用KMedoids算法(PAM)进行聚类')else: doc.add_paragraph('4. 聚类算法:使用MDS降维后KMeans进行聚类(KMedoids不可用)') doc.add_paragraph('5. 可视化:采用多维尺度分析、序列图、热图等多种可视化方法') doc.add_paragraph()# 三、主要结果 doc.add_heading('三、主要结果', level=1)# 表1: 聚类分布统计 doc.add_heading('表1. 聚类分布统计', level=2)# 创建表格 table1 = doc.add_table(rows=len(cluster_stats) + 1, cols=5) table1.style = 'Light Grid Accent 1'# 添加表头 headers = ['聚类', '样本数', '比例', '平均轮廓宽度', '轮廓宽度标准差']for i, header in enumerate(headers): table1.cell(0, i).text = header table1.cell(0, i).paragraphs[0].runs[0].font.bold = True# 添加数据for i, row in cluster_stats.iterrows(): table1.cell(i + 1, 0).text = str(int(row['Cluster'])) table1.cell(i + 1, 1).text = str(int(row['n'])) table1.cell(i + 1, 2).text = f"{row['Proportion']:.3f}" table1.cell(i + 1, 3).text = f"{row['Avg_Silhouette']:.3f}"if pd.notnull(row['Avg_Silhouette']) else'N/A' table1.cell(i + 1, 4).text = f"{row['SD_Silhouette']:.3f}"if pd.notnull(row['SD_Silhouette']) else'N/A' doc.add_paragraph()# 四、可视化分析 doc.add_heading('四、可视化分析', level=1) doc.add_paragraph('1. 聚类数确定:肘部法则图和轮廓系数图显示最佳聚类数') doc.add_paragraph('2. 聚类质量:轮廓图显示各聚类内部一致性') doc.add_paragraph('3. 序列模式:中心序列图揭示了不同聚类的时间序列特征差异') doc.add_paragraph('4. 关系可视化:MDS散点图直观展示了序列间的相似性关系') doc.add_paragraph()# 添加图表 image_files = ['01_肘部法则图.png','02_轮廓系数图.png','03_轮廓图.png','04_中心序列图.png','06_MDS散点图.png','07_人口学热图.png' ]for img_file in image_files: img_path = os.path.join(results_dir, 'plots', img_file)if os.path.exists(img_path):# 添加图表标题 chart_title = img_file.replace('.png', '').replace('_', ' ') doc.add_heading(chart_title, level=2)# 添加图片 doc.add_picture(img_path, width=Inches(6)) doc.add_paragraph()# 五、讨论与结论 doc.add_heading('五、讨论与结论', level=1) doc.add_paragraph('1. 公共卫生意义:', style='List Bullet') doc.add_paragraph(' • 基于DTW的K均值聚类能够有效识别具有相似时间序列模式的个体亚组') doc.add_paragraph(' • 在公共卫生领域,可用于识别不同健康行为模式(如运动模式、用药依从性模式)') doc.add_paragraph(' • 为个性化干预和精准公共卫生提供数据支持') doc.add_paragraph() doc.add_paragraph('2. 方法学优势:', style='List Bullet') doc.add_paragraph(' • DTW距离能够处理不等长、不同步的时间序列') doc.add_paragraph(' • 对局部形态的拉伸/压缩具有鲁棒性') doc.add_paragraph(' • 无需预设参数化轨迹形式,灵活性高') doc.add_paragraph() doc.add_paragraph('3. 局限性与展望:', style='List Bullet') doc.add_paragraph(' • 计算复杂度较高,大规模数据需要优化算法') doc.add_paragraph(' • 未来可结合其他特征(如变点检测、频域特征)进行多维度聚类') doc.add_paragraph(' • 建议进行稳定性分析和外部验证') doc.add_paragraph()# 六、技术细节 doc.add_heading('六、技术细节', level=1) doc.add_paragraph(f'Python版本: {sys.version}') doc.add_paragraph('主要Python包: numpy, pandas, scikit-learn, fastdtw, matplotlib, seaborn') doc.add_paragraph(f'样本数量: {n_samples}') doc.add_paragraph(f'时间点数量: {ts_matrix_scaled.shape[1]}') doc.add_paragraph('距离度量: 动态时间规整(DTW)')if KMEDOIDS_AVAILABLE: doc.add_paragraph('聚类算法: KMedoids (PAM)')else: doc.add_paragraph('聚类算法: MDS降维后KMeans (KMedoids不可用)')# 保存Word文档 report_file = os.path.join(results_dir, 'reports', 'K均值纵向聚类分析报告.docx') doc.save(report_file)print(f"Word报告已生成: {report_file}") except Exception as e:print(f"Word报告生成失败: {e}")# ============================================# 11. 创建详细的分析总结文件# ============================================def create_summary_file(results_dir, n_samples, optimal_k, cluster_stats, overall_stats, ts_matrix_scaled):"""创建分析总结文件"""print("步骤10: 创建分析总结文件...") summary_text = f"""K均值纵向聚类分析总结报告========================================分析时间: {datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}样本数量: {n_samples}时间点数: {ts_matrix_scaled.shape[1]}最佳聚类数: {optimal_k}平均轮廓宽度: {overall_stats.loc[overall_stats['Metric'] == '平均轮廓系数', 'Value'].values[0]:.3f}聚类方法: {'KMedoids (PAM)' if KMEDOIDS_AVAILABLE else 'MDS降维后KMeans (KMedoids不可用)'}各聚类分布:{cluster_stats.to_string(index=False)}主要发现:1. 成功识别出{optimal_k}个具有不同时间序列模式的亚组2. 聚类质量: {"良好" if overall_stats.loc[overall_stats['Metric'] == '平均轮廓系数', 'Value'].values[0] > 0.5 else "一般"} (平均轮廓宽度={overall_stats.loc[overall_stats['Metric'] == '平均轮廓系数', 'Value'].values[0]:.3f})3. 各聚类在人口学特征上存在差异4. 可视化结果清晰展示了不同聚类的时间序列模式特征文件清单:1. 预处理数据 (data/01_预处理数据.csv)2. 时间序列矩阵 (data/02_时间序列矩阵.csv)3. DTW距离矩阵 (data/03_DTW距离矩阵.csv)4. 聚类分配结果 (data/04_聚类分配结果.csv)5. 完整聚类数据 (data/05_完整聚类数据.csv)6. 聚类统计 (tables/01_聚类统计.csv)7. 总体统计 (tables/02_总体统计.csv)8-15. 可视化图表 (plots/*.png)16-18. 三线表 (tables/*.csv)19. Word分析报告 (reports/K均值纵向聚类分析报告.docx)"""# 保存总结文件 summary_file = os.path.join(results_dir, '分析总结.txt') with open(summary_file, 'w', encoding='utf-8') as f: f.write(summary_text)print(f"分析总结文件已保存: {summary_file}")# ============================================# 12. 主函数# ============================================def main():"""主函数"""print("=" * 50)print("K均值纵向聚类分析(基于DTW距离)- Python版")print("=" * 50)# 1. 设置环境 results_dir = setup_environment()# 2. 加载和预处理数据 simple_data = load_and_preprocess_data(results_dir)# 3. 创建时间序列矩阵 ts_matrix_scaled_df, ts_matrix_scaled = create_time_series_matrix(simple_data, results_dir)# 4. 计算DTW距离矩阵 dist_matrix, sample_indices, ts_sample = calculate_dtw_distance(ts_matrix_scaled, results_dir, n_samples=50)# 5. 确定最佳聚类数 optimal_k, k_range, wss, sil_scores = determine_optimal_clusters(dist_matrix, results_dir)# 6. 执行聚类 cluster_assignments, clusters, silhouette_vals = perform_clustering( dist_matrix, optimal_k, ts_sample, sample_indices, ts_matrix_scaled_df, simple_data, results_dir )# 7. 聚类质量评估 cluster_stats, overall_stats = evaluate_clustering_quality(cluster_assignments, results_dir)# 8. 可视化分析 create_visualizations(dist_matrix, ts_sample, clusters, cluster_assignments, simple_data, optimal_k, results_dir)# 9. 创建三线表 create_tables(cluster_stats, simple_data, cluster_assignments, ts_sample, clusters, results_dir)# 10. 创建Word分析报告 create_word_report(results_dir, len(sample_indices), optimal_k, cluster_stats, overall_stats, ts_matrix_scaled, dist_matrix)# 11. 创建分析总结文件 create_summary_file(results_dir, len(sample_indices), optimal_k, cluster_stats, overall_stats, ts_matrix_scaled)# 12. 打印完成信息print("\n" + "=" * 50)print("K均值纵向聚类分析完成!")print(f"结果已保存到文件夹: {results_dir}")print("\n关键结果摘要:")print(f"最佳聚类数: {optimal_k}")print("各聚类样本数:")print(cluster_stats[['Cluster', 'n']].to_string(index=False))if not cluster_assignments['Silhouette_Width'].isnull().all():print(f"平均轮廓宽度: {overall_stats.loc[overall_stats['Metric'] == '平均轮廓系数', 'Value'].values[0]:.3f}")print(f"聚类方法: {'KMedoids (PAM)' if KMEDOIDS_AVAILABLE else 'MDS降维后KMeans (KMedoids不可用)'}")print("\n主要输出文件:")print("1. Word分析报告: reports/K均值纵向聚类分析报告.docx")print("2. 综合可视化面板: plots/09_综合可视化面板.png")print("3. 所有数据文件: 共20+个文件")print("=" * 50)# 尝试打开结果文件夹 try:if os.name == 'nt': # Windows os.startfile(results_dir)elif os.name == 'posix': # macOS/Linuxif sys.platform == 'darwin': os.system(f'open "{results_dir}"')else: os.system(f'xdg-open "{results_dir}"') except:print(f"无法自动打开文件夹,请手动访问: {os.path.abspath(results_dir)}")# ============================================# 运行主程序# ============================================if __name__ == "__main__": main()