一、NumPy简介与生态系统
"""NumPy生态体系:1. NumPy - 多维数组和矩阵运算(今日重点)2. SciPy - 科学计算算法库3. pandas - 数据分析库(基于NumPy)4. Matplotlib - 可视化库5. scikit-learn - 机器学习库NumPy核心优势:- 高效的ndarray多维数组- 广播功能- 丰富的数学函数库- 线性代数、傅里叶变换、随机数生成- C/C++/Fortran代码集成"""import numpy as npimport sysimport timeimport matplotlib.pyplot as plt# 检查NumPy版本print(f"NumPy版本: {np.__version__}")print(f"Python版本: {sys.version}")# NumPy配置信息print(f"\nNumPy配置:")print(f"BLAS库: {np.__config__.show()}")print(f"最大线程数: {np.get_num_threads()}")
二、NumPy数组基础
1. 数组创建与初始化
def demonstrate_array_creation(): """演示多种数组创建方法""" print("=" * 80) print("NumPy数组创建方法") print("=" * 80) # 1. 从Python列表创建 print("\n1. 从Python列表创建:") list_data = [1, 2, 3, 4, 5] arr_from_list = np.array(list_data) print(f"列表: {list_data}") print(f"NumPy数组: {arr_from_list}") print(f"数组形状: {arr_from_list.shape}") print(f"数组维度: {arr_from_list.ndim}") print(f"数组数据类型: {arr_from_list.dtype}") # 2. 从嵌套列表创建多维数组 print("\n2. 多维数组创建:") matrix_data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] matrix = np.array(matrix_data) print(f"矩阵:\n{matrix}") print(f"形状: {matrix.shape}") print(f"维度: {matrix.ndim}") print(f"元素总数: {matrix.size}") # 3. 特殊数组创建函数 print("\n3. 特殊数组创建函数:") # 零数组 zeros_arr = np.zeros((3, 4)) print(f"零数组 (3x4):\n{zeros_arr}") # 一数组 ones_arr = np.ones((2, 3, 2), dtype=np.float32) print(f"\n一数组 (2x3x2, float32):\n{ones_arr}") # 单位矩阵 identity = np.eye(4) print(f"\n单位矩阵 (4x4):\n{identity}") # 对角矩阵 diagonal = np.diag([1, 2, 3, 4]) print(f"\n对角矩阵:\n{diagonal}") # 空数组(值未初始化) empty_arr = np.empty((2, 3)) print(f"\n空数组 (值未初始化):\n{empty_arr}") # 4. 数值范围数组 print("\n4. 数值范围数组:") # arange: 类似Python的range arange_arr = np.arange(0, 20, 2) print(f"arange(0, 20, 2): {arange_arr}") # linspace: 等间隔数值 linspace_arr = np.linspace(0, 1, 11) print(f"linspace(0, 1, 11): {linspace_arr}") # logspace: 对数间隔数值 logspace_arr = np.logspace(0, 3, 4) print(f"logspace(0, 3, 4): {logspace_arr}") # 5. 网格坐标 print("\n5. 网格坐标:") # mgrid: 返回密集网格 x, y = np.mgrid[0:3, 0:3] print(f"mgrid x:\n{x}") print(f"mgrid y:\n{y}") # ogrid: 返回开放网格(节省内存) x_ogrid, y_ogrid = np.ogrid[0:3, 0:3] print(f"\nogrid x:\n{x_ogrid}") print(f"ogrid y:\n{y_ogrid}") # meshgrid: 创建坐标矩阵 x_coords = np.array([0, 1, 2]) y_coords = np.array([0, 1, 2]) X, Y = np.meshgrid(x_coords, y_coords) print(f"\nmeshgrid X:\n{X}") print(f"meshgrid Y:\n{Y}") # 6. 随机数组 print("\n6. 随机数组:") # 固定随机种子 np.random.seed(42) # 均匀分布 uniform_arr = np.random.rand(3, 3) print(f"均匀分布 (0-1):\n{uniform_arr}") # 标准正态分布 normal_arr = np.random.randn(3, 3) print(f"\n标准正态分布:\n{normal_arr}") # 指定范围的随机整数 randint_arr = np.random.randint(0, 100, (3, 3)) print(f"\n随机整数 (0-100):\n{randint_arr}") # 7. 复杂数据类型 print("\n7. 复杂数据类型:") # 复数数组 complex_arr = np.array([1+2j, 3+4j, 5+6j]) print(f"复数数组: {complex_arr}") print(f"实部: {complex_arr.real}") print(f"虚部: {complex_arr.imag}") print(f"共轭: {complex_arr.conjugate()}") # 结构化数组 structured_dtype = np.dtype([ ('name', 'U10'), # 最大10个字符的Unicode字符串 ('age', 'i4'), # 32位整数 ('height', 'f4'), # 32位浮点数 ('weight', 'f4') ]) people = np.array([ ('Alice', 25, 1.65, 55.5), ('Bob', 30, 1.80, 75.2), ('Charlie', 35, 1.75, 68.3) ], dtype=structured_dtype) print(f"\n结构化数组:") print(f"数据类型: {people.dtype}") print(f"数组:\n{people}") print(f"名字字段: {people['name']}") print(f"平均年龄: {people['age'].mean():.1f}") return { 'arr_from_list': arr_from_list, 'matrix': matrix, 'zeros_arr': zeros_arr, 'identity': identity, 'uniform_arr': uniform_arr, 'complex_arr': complex_arr, 'people': people }# 执行数组创建演示arrays = demonstrate_array_creation()
2. 数组属性与数据类型
def demonstrate_array_properties(): """演示数组属性和数据类型""" print("=" * 80) print("NumPy数组属性与数据类型") print("=" * 80) # 创建示例数组 arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) print(f"示例数组:\n{arr}") print(f"\n数组属性:") print(f"形状 (shape): {arr.shape}") # (4, 3) print(f"维度 (ndim): {arr.ndim}") # 2 print(f"元素总数 (size): {arr.size}") # 12 print(f"数据类型 (dtype): {arr.dtype}") # int64 print(f"元素字节大小 (itemsize): {arr.itemsize}") # 8 print(f"总字节大小 (nbytes): {arr.nbytes}") # 96 print(f"步幅 (strides): {arr.strides}") # (24, 8) - 内存布局 # 数据类型转换 print("\n数据类型转换:") arr_float = arr.astype(np.float32) print(f"转换为float32:\n{arr_float}") print(f"新数据类型: {arr_float.dtype}") # 查看数据类型信息 print("\nNumPy数据类型体系:") print(f"整数类型: {np.sctypes['int']}") print(f"浮点类型: {np.sctypes['float']}") print(f"复数类型: {np.sctypes['complex']}") print(f"其他类型: {np.sctypes['others']}") # 数据类型精度 print("\n数据类型精度:") print(f"int8 范围: [{np.iinfo(np.int8).min}, {np.iinfo(np.int8).max}]") print(f"int32 范围: [{np.iinfo(np.int32).min}, {np.iinfo(np.int32).max}]") print(f"float32 精度: {np.finfo(np.float32).eps}") print(f"float64 精度: {np.finfo(np.float64).eps}") # 数组内存布局 print("\n数组内存布局:") # C风格(行优先) arr_c = np.array([[1, 2, 3], [4, 5, 6]], order='C') print(f"C风格数组 (行优先):") print(f" 数组: {arr_c}") print(f" 是否C连续: {arr_c.flags['C_CONTIGUOUS']}") # F风格(列优先) arr_f = np.array([[1, 2, 3], [4, 5, 6]], order='F') print(f"\nF风格数组 (列优先):") print(f" 数组: {arr_f}") print(f" 是否F连续: {arr_f.flags['F_CONTIGUOUS']}") # 查看所有标志 print("\n数组标志:") for flag in arr.flags: print(f" {flag}: {arr.flags[flag]}") # 数据类型转换的性能考虑 print("\n数据类型转换性能示例:") # 创建大型数组 large_arr = np.random.randn(10000, 10000).astype(np.float32) start = time.time() converted = large_arr.astype(np.float64) end = time.time() print(f"转换 10000x10000 float32 -> float64 耗时: {end-start:.3f}秒") print(f"内存占用:") print(f" float32: {large_arr.nbytes / 1024**2:.2f} MB") print(f" float64: {converted.nbytes / 1024**2:.2f} MB") # 特殊值 print("\n特殊值:") print(f"正无穷: {np.inf}") print(f"负无穷: {-np.inf}") print(f"非数字: {np.nan}") print(f"π: {np.pi}") print(f"自然常数e: {np.e}") # 创建包含特殊值的数组 special_arr = np.array([1.0, np.inf, -np.inf, np.nan, 0.0]) print(f"\n包含特殊值的数组: {special_arr}") print(f"是否为无穷大: {np.isinf(special_arr)}") print(f"是否为NaN: {np.isnan(special_arr)}") print(f"是否为有限值: {np.isfinite(special_arr)}") return arr# 执行数组属性演示example_arr = demonstrate_array_properties()
三、数组索引与切片
def demonstrate_indexing_slicing(): """演示数组索引和切片操作""" print("=" * 80) print("NumPy数组索引与切片") print("=" * 80) # 创建示例数组 arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]) print(f"原始数组 (4x4):\n{arr}") # 1. 基本索引 print("\n1. 基本索引:") print(f"arr[0, 0] = {arr[0, 0]}") # 第一行第一列 print(f"arr[2, 3] = {arr[2, 3]}") # 第三行第四列 print(f"arr[-1, -1] = {arr[-1, -1]}") # 最后一行最后一列 # 2. 切片操作 print("\n2. 切片操作:") print(f"arr[1:3, 1:3] (行1-2, 列1-2):\n{arr[1:3, 1:3]}") print(f"\narr[:, 2] (所有行的第3列): {arr[:, 2]}") print(f"\narr[1, :] (第2行的所有列): {arr[1, :]}") print(f"\narr[::2, ::2] (每隔一行的每隔一列):\n{arr[::2, ::2]}") print(f"\narr[::-1, ::-1] (行列都反转):\n{arr[::-1, ::-1]}") # 3. 高级索引:整数数组索引 print("\n3. 整数数组索引:") rows = np.array([0, 2, 3]) cols = np.array([1, 2, 0]) print(f"行索引: {rows}") print(f"列索引: {cols}") print(f"arr[rows, cols] = {arr[rows, cols]}") # 使用ix_函数进行网格索引 row_idx = np.array([0, 2]) col_idx = np.array([1, 3]) print(f"\n使用ix_进行网格索引:") print(f"行索引: {row_idx}") print(f"列索引: {col_idx}") print(f"arr[np.ix_(row_idx, col_idx)]:\n{arr[np.ix_(row_idx, col_idx)]}") # 4. 布尔索引 print("\n4. 布尔索引:") mask = arr > 10 print(f"布尔掩码 (arr > 10):\n{mask}") print(f"arr[mask] = {arr[mask]}") # 复合条件 mask_complex = (arr > 5) & (arr < 12) print(f"\n复合条件 (5 < arr < 12):\n{arr[mask_complex]}") # 5. 花式索引 (Fancy Indexing) print("\n5. 花式索引:") # 创建新数组 arr_large = np.arange(24).reshape(4, 6) print(f"新数组 (4x6):\n{arr_large}") # 使用列表进行索引 print(f"\narr_large[[0, 2, 3], [2, 4, 5]] = {arr_large[[0, 2, 3], [2, 4, 5]]}") # 使用数组进行索引 row_indices = np.array([[0, 0], [2, 2]]) col_indices = np.array([[0, 1], [4, 5]]) print(f"\n行索引数组:\n{row_indices}") print(f"列索引数组:\n{col_indices}") print(f"arr_large[row_indices, col_indices]:\n{arr_large[row_indices, col_indices]}") # 6. 视图 vs 副本 print("\n6. 视图(View) vs 副本(Copy):") # 视图 - 共享数据 arr_view = arr[1:3, 1:3] print(f"原始数组:\n{arr}") print(f"切片视图:\n{arr_view}") # 修改视图会影响原始数组 arr_view[0, 0] = 99 print(f"\n修改视图后原始数组:\n{arr}") # 恢复原始值 arr[1, 1] = 6 # 副本 - 独立数据 arr_copy = arr[1:3, 1:3].copy() arr_copy[0, 0] = 88 print(f"\n副本修改后原始数组:\n{arr}") print(f"副本:\n{arr_copy}") # 检查是否为视图 print(f"\n检查是否为视图:") print(f"arr_view.base is arr: {arr_view.base is arr}") print(f"arr_copy.base is arr: {arr_copy.base is arr}") # 7. 索引技巧 print("\n7. 索引技巧:") # 获取非零元素索引 nonzero_indices = np.nonzero(arr > 10) print(f"大于10的元素索引: {nonzero_indices}") print(f"大于10的元素值: {arr[nonzero_indices]}") # 使用where函数 print(f"\nnp.where(arr > 10): {np.where(arr > 10)}") # 条件替换 arr_modified = np.where(arr > 10, 100, arr) print(f"将大于10的元素替换为100:\n{arr_modified}") # 8. 多维数组扁平化 print("\n8. 数组扁平化:") # flatten - 返回副本 flattened = arr.flatten() print(f"flatten() (副本): {flattened}") # ravel - 返回视图(如果可能) raveled = arr.ravel() print(f"ravel() (视图): {raveled}") # reshape - 改变形状(视图) reshaped = arr.reshape(2, 8) print(f"reshape(2, 8):\n{reshaped}") # 9. 转置和轴交换 print("\n9. 转置和轴交换:") print(f"原始数组:\n{arr}") print(f"转置:\n{arr.T}") # 3D数组轴交换 arr_3d = np.arange(24).reshape(2, 3, 4) print(f"\n3D数组 (形状: {arr_3d.shape}):") print(f"交换轴0和1:\n{arr_3d.transpose(1, 0, 2).shape}") print(f"交换所有轴:\n{arr_3d.transpose(2, 1, 0).shape}") return arr# 执行索引和切片演示indexed_arr = demonstrate_indexing_slicing()
四、数组操作与数学运算
def demonstrate_array_operations(): """演示数组操作和数学运算""" print("=" * 80) print("NumPy数组操作与数学运算") print("=" * 80) # 创建示例数组 A = np.array([[1, 2], [3, 4]]) B = np.array([[5, 6], [7, 8]]) print(f"数组A:\n{A}") print(f"数组B:\n{B}") # 1. 算术运算 print("\n1. 算术运算:") print(f"A + B (加法):\n{A + B}") print(f"A - B (减法):\n{A - B}") print(f"A * B (元素乘法):\n{A * B}") print(f"A / B (除法):\n{A / B}") print(f"A ** 2 (平方):\n{A ** 2}") print(f"np.sqrt(A) (平方根):\n{np.sqrt(A)}") # 2. 矩阵乘法 print("\n2. 矩阵乘法:") print(f"A @ B (矩阵乘法):\n{A @ B}") print(f"np.dot(A, B) (点积):\n{np.dot(A, B)}") print(f"np.matmul(A, B) (矩阵乘法):\n{np.matmul(A, B)}") # 3. 广播机制 print("\n3. 广播机制:") scalar = 10 print(f"A + {scalar} (标量广播):\n{A + scalar}") vector = np.array([10, 20]) print(f"\n向量: {vector}") print(f"A + vector (行广播):\n{A + vector}") # 更复杂的广播 arr_3d = np.arange(12).reshape(3, 4) arr_1d = np.array([100, 200, 300, 400]) print(f"\n3D数组 (3x4):\n{arr_3d}") print(f"1D数组: {arr_1d}") print(f"广播相加:\n{arr_3d + arr_1d}") # 4. 聚合运算 print("\n4. 聚合运算:") data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) print(f"数据数组:\n{data}") print(f"\n总和: {data.sum()}") print(f"每列总和: {data.sum(axis=0)}") print(f"每行总和: {data.sum(axis=1)}") print(f"\n平均值: {data.mean():.2f}") print(f"每列平均值: {data.mean(axis=0)}") print(f"每行平均值: {data.mean(axis=1)}") print(f"\n标准差: {data.std():.2f}") print(f"方差: {data.var():.2f}") print(f"\n最小值: {data.min()}") print(f"最大值: {data.max()}") print(f"最小值位置: {data.argmin()}") print(f"最大值位置: {data.argmax()}") # 5. 累积运算 print("\n5. 累积运算:") print(f"累积和: {data.cumsum()}") print(f"累积积: {data.cumprod()}") # 6. 逻辑运算 print("\n6. 逻辑运算:") C = np.array([1, 2, 3, 4, 5]) D = np.array([5, 4, 3, 2, 1]) print(f"数组C: {C}") print(f"数组D: {D}") print(f"C > D: {C > D}") print(f"C == D: {C == D}") print(f"np.logical_and(C > 2, C < 5): {np.logical_and(C > 2, C < 5)}") print(f"np.logical_or(C < 2, C > 4): {np.logical_or(C < 2, C > 4)}") # 7. 统计函数 print("\n7. 统计函数:") random_data = np.random.normal(0, 1, 1000) print(f"随机数据 (正态分布):") print(f" 中位数: {np.median(random_data):.4f}") print(f" 百分位数 (25%): {np.percentile(random_data, 25):.4f}") print(f" 百分位数 (50%): {np.percentile(random_data, 50):.4f}") print(f" 百分位数 (75%): {np.percentile(random_data, 75):.4f}") print(f" 峰度: {stats.kurtosis(random_data):.4f}") print(f" 偏度: {stats.skew(random_data):.4f}") # 8. 三角函数 print("\n8. 三角函数:") angles = np.array([0, np.pi/6, np.pi/4, np.pi/3, np.pi/2]) print(f"角度 (弧度): {angles}") print(f"sin: {np.sin(angles)}") print(f"cos: {np.cos(angles)}") print(f"tan: {np.tan(angles)}") # 9. 指数和对数函数 print("\n9. 指数和对数函数:") x = np.array([1, 2, 3, 4, 5]) print(f"x: {x}") print(f"exp(x): {np.exp(x)}") print(f"log(x): {np.log(x)}") print(f"log10(x): {np.log10(x)}") print(f"log2(x): {np.log2(x)}") # 10. 比较性能 print("\n10. 向量化运算性能比较:") # 创建大型数组 large_arr = np.random.randn(10000, 10000) # 向量化运算 start = time.time() result_vectorized = np.sin(large_arr) ** 2 + np.cos(large_arr) ** 2 vectorized_time = time.time() - start # 循环运算(仅演示小规模) small_arr = np.random.randn(100, 100) start = time.time() result_loop = np.zeros_like(small_arr) for i in range(small_arr.shape[0]): for j in range(small_arr.shape[1]): result_loop[i, j] = np.sin(small_arr[i, j]) ** 2 + np.cos(small_arr[i, j]) ** 2 loop_time = time.time() - start print(f"向量化运算 (10000x10000): {vectorized_time:.3f}秒") print(f"循环运算 (100x100): {loop_time:.3f}秒") print(f"速度提升倍数: {loop_time/(vectorized_time*0.01):.0f}x (估算)") return A, B, data# 执行数组操作演示A, B, data = demonstrate_array_operations()
五、线性代数运算
def demonstrate_linear_algebra(): """演示线性代数运算""" print("=" * 80) print("NumPy线性代数运算") print("=" * 80) # 创建示例矩阵 A = np.array([[1, 2], [3, 4]]) B = np.array([[5, 6], [7, 8]]) C = np.array([[2, 1, -1], [-3, -1, 2], [-2, 1, 2]]) print(f"矩阵A:\n{A}") print(f"矩阵B:\n{B}") print(f"矩阵C (3x3):\n{C}") # 1. 基本线性代数运算 print("\n1. 基本线性代数运算:") # 矩阵乘法 print(f"A @ B:\n{A @ B}") # 点积 v1 = np.array([1, 2, 3]) v2 = np.array([4, 5, 6]) print(f"\n向量v1: {v1}") print(f"向量v2: {v2}") print(f"点积: {np.dot(v1, v2)}") print(f"点积 (v1@v2): {v1 @ v2}") # 外积 print(f"\n外积:\n{np.outer(v1, v2)}") # 2. 矩阵属性 print("\n2. 矩阵属性:") print(f"矩阵A的迹: {np.trace(A)}") print(f"矩阵A的行列式: {np.linalg.det(A):.2f}") # 3. 矩阵分解 print("\n3. 矩阵分解:") # 特征值和特征向量 eigenvalues, eigenvectors = np.linalg.eig(A) print(f"矩阵A的特征值: {eigenvalues}") print(f"矩阵A的特征向量:\n{eigenvectors}") # 验证特征值分解 print(f"\n验证: A * v = λ * v") for i in range(len(eigenvalues)): v = eigenvectors[:, i] λ = eigenvalues[i] Av = A @ v λv = λ * v print(f"λ{i+1} = {λ:.4f}, 误差: {np.linalg.norm(Av - λv):.6f}") # 4. 线性方程组求解 print("\n4. 线性方程组求解:") # 求解 Ax = b b = np.array([1, 2]) print(f"系数矩阵A:\n{A}") print(f"常数向量b: {b}") # 方法1: 直接求解 x = np.linalg.solve(A, b) print(f"解x: {x}") print(f"验证 A @ x = b: {A @ x}") # 方法2: 使用逆矩阵 A_inv = np.linalg.inv(A) x_inv = A_inv @ b print(f"\n使用逆矩阵求解:") print(f"A的逆矩阵:\n{A_inv}") print(f"解x: {x_inv}") # 5. 最小二乘法 print("\n5. 最小二乘法:") # 创建过定方程组 X = np.array([[1, 1], [1, 2], [1, 3], [1, 4]]) y = np.array([1.5, 2.5, 3.5, 4.5]) print(f"设计矩阵X:\n{X}") print(f"响应变量y: {y}") # 正规方程: (X^T X)^(-1) X^T y XTX = X.T @ X XTX_inv = np.linalg.inv(XTX) w = XTX_inv @ X.T @ y print(f"\n正规方程解:") print(f"参数w: {w}") print(f"预测值: {X @ w}") # 使用np.linalg.lstsq w_lstsq, residuals, rank, s = np.linalg.lstsq(X, y, rcond=None) print(f"\nnp.linalg.lstsq解:") print(f"参数w: {w_lstsq}") print(f"残差平方和: {residuals}") # 6. 矩阵范数 print("\n6. 矩阵范数:") print(f"Frobenius范数 (‖A‖_F): {np.linalg.norm(A, 'fro'):.4f}") print(f"1-范数 (最大列和): {np.linalg.norm(A, 1):.4f}") print(f"∞-范数 (最大行和): {np.linalg.norm(A, np.inf):.4f}") print(f"2-范数 (最大奇异值): {np.linalg.norm(A, 2):.4f}") # 7. 奇异值分解 (SVD) print("\n7. 奇异值分解 (SVD):") U, S, Vt = np.linalg.svd(A) print(f"U矩阵:\n{U}") print(f"奇异值: {S}") print(f"V^T矩阵:\n{Vt}") # 验证SVD Σ = np.zeros(A.shape) np.fill_diagonal(Σ, S) A_reconstructed = U @ Σ @ Vt print(f"\n重构矩阵:\n{A_reconstructed}") print(f"重构误差: {np.linalg.norm(A - A_reconstructed):.6f}") # 8. QR分解 print("\n8. QR分解:") Q, R = np.linalg.qr(A) print(f"Q矩阵 (正交):\n{Q}") print(f"R矩阵 (上三角):\n{R}") print(f"验证 Q^T Q = I:\n{Q.T @ Q}") # 9. Cholesky分解 print("\n9. Cholesky分解:") # 创建对称正定矩阵 P = np.array([[4, 12, -16], [12, 37, -43], [-16, -43, 98]]) print(f"对称正定矩阵P:\n{P}") try: L = np.linalg.cholesky(P) print(f"Cholesky分解 L:\n{L}") print(f"验证 L L^T = P:") print(f"L @ L.T:\n{L @ L.T}") except np.linalg.LinAlgError: print("矩阵不是正定的") # 10. 矩阵秩和条件数 print("\n10. 矩阵秩和条件数:") print(f"矩阵A的秩: {np.linalg.matrix_rank(A)}") print(f"矩阵A的条件数: {np.linalg.cond(A):.4f}") # 病态矩阵示例 ill_conditioned = np.array([[1, 1], [1, 1.0001]]) print(f"\n病态矩阵:\n{ill_conditioned}") print(f"条件数: {np.linalg.cond(ill_conditioned):.4f}") return A, B, C# 执行线性代数演示lin_alg_A, lin_alg_B, lin_alg_C = demonstrate_linear_algebra()
六、随机数生成与概率分布
def demonstrate_random_generation(): """演示随机数生成和概率分布""" print("=" * 80) print("NumPy随机数生成与概率分布") print("=" * 80) # 设置随机种子 np.random.seed(42) # 1. 随机数生成器 print("1. 随机数生成器:") # 创建随机数生成器实例 rng = np.random.default_rng(seed=42) # 2. 均匀分布 print("\n2. 均匀分布:") uniform_samples = rng.uniform(0, 1, 10) print(f"均匀分布 U(0,1) 10个样本: {uniform_samples}") # 3. 正态分布 print("\n3. 正态分布:") normal_samples = rng.normal(0, 1, 10) print(f"标准正态分布 N(0,1) 10个样本: {normal_samples}") # 4. 多种概率分布 print("\n4. 多种概率分布:") distributions = { '指数分布': rng.exponential(1.0, 1000), '伽马分布': rng.gamma(2, 2, 1000), '贝塔分布': rng.beta(2, 5, 1000), '二项分布': rng.binomial(10, 0.5, 1000), '泊松分布': rng.poisson(5, 1000), '卡方分布': rng.chisquare(3, 1000), '学生t分布': rng.standard_t(10, 1000), 'F分布': rng.f(5, 10, 1000) } # 5. 随机抽样 print("\n5. 随机抽样:") population = np.arange(100) print(f"总体: {population[:10]}...") # 简单随机抽样 sample = rng.choice(population, size=10, replace=False) print(f"无放回抽样 (10个): {sample}") # 加权随机抽样 weights = np.exp(np.linspace(0, 1, 100)) weights = weights / weights.sum() weighted_sample = rng.choice(population, size=10, replace=False, p=weights) print(f"加权抽样 (10个): {weighted_sample}") # 6. 排列和洗牌 print("\n6. 排列和洗牌:") arr = np.arange(10) print(f"原始数组: {arr}") # 洗牌(原地) shuffled = arr.copy() rng.shuffle(shuffled) print(f"洗牌后: {shuffled}") # 排列(返回新数组) permutation = rng.permutation(arr) print(f"排列: {permutation}") # 7. 随机数生成性能 print("\n7. 随机数生成性能:") sizes = [1000, 10000, 100000, 1000000] print(f"{'样本量':<10}{'生成时间(ms)':<15}{'速度(百万/秒)':<15}") print("-" * 40) for size in sizes: start = time.time() samples = rng.normal(0, 1, size) elapsed = (time.time() - start) * 1000 # 毫秒 speed = size / (elapsed * 1000) if elapsed > 0 else 0 print(f"{size:<10}{elapsed:<15.2f}{speed:<15.2f}") # 8. 可视化概率分布 print("\n8. 可视化概率分布:") fig, axes = plt.subplots(3, 3, figsize=(15, 12)) axes = axes.flatten() for idx, (name, samples) in enumerate(list(distributions.items())[:9]): ax = axes[idx] # 直方图 ax.hist(samples, bins=50, density=True, alpha=0.7, edgecolor='black') # 添加标题 ax.set_title(name) ax.set_xlabel('值') ax.set_ylabel('密度') ax.grid(True, alpha=0.3) # 添加统计信息 stats_text = f'均值: {samples.mean():.2f}\n标准差: {samples.std():.2f}' ax.text(0.05, 0.95, stats_text, transform=ax.transAxes, verticalalignment='top', fontsize=10, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5)) # 隐藏多余的子图 for idx in range(len(distributions), len(axes)): axes[idx].set_visible(False) plt.suptitle('概率分布可视化', fontsize=16, y=1.02) plt.tight_layout() plt.show() # 9. 多元正态分布 print("\n9. 多元正态分布:") # 定义均值和协方差矩阵 mean = [0, 0] cov = [[1, 0.8], [0.8, 1]] # 生成样本 multivariate_samples = rng.multivariate_normal(mean, cov, 1000) print(f"均值向量: {mean}") print(f"协方差矩阵:\n{np.array(cov)}") print(f"生成样本形状: {multivariate_samples.shape}") # 计算样本统计量 sample_mean = multivariate_samples.mean(axis=0) sample_cov = np.cov(multivariate_samples.T) print(f"\n样本均值: {sample_mean}") print(f"样本协方差矩阵:\n{sample_cov}") # 10. 蒙特卡洛模拟 print("\n10. 蒙特卡洛模拟 - 估计π:") n_simulations = [100, 1000, 10000, 100000, 1000000] print(f"{'模拟次数':<10}{'估计π':<15}{'误差':<15}{'时间(ms)':<15}") print("-" * 55) for n in n_simulations: start = time.time() # 生成随机点 x = rng.uniform(-1, 1, n) y = rng.uniform(-1, 1, n) # 计算在单位圆内的点数 inside_circle = (x**2 + y**2) <= 1 pi_estimate = 4 * inside_circle.sum() / n elapsed = (time.time() - start) * 1000 error = abs(pi_estimate - np.pi) print(f"{n:<10}{pi_estimate:<15.6f}{error:<15.6f}{elapsed:<15.2f}") return distributions, multivariate_samples# 执行随机数生成演示distributions, multivariate_samples = demonstrate_random_generation()
七、性能优化与向量化
def demonstrate_performance_optimization(): """演示NumPy性能优化技巧""" print("=" * 80) print("NumPy性能优化与向量化") print("=" * 80) # 1. 向量化 vs 循环 print("1. 向量化运算 vs Python循环:") # 创建测试数据 n = 1000000 arr = np.random.randn(n) # Python循环 start = time.time() result_loop = np.zeros(n) for i in range(n): result_loop[i] = arr[i] ** 2 + np.sin(arr[i]) loop_time = time.time() - start # NumPy向量化 start = time.time() result_vectorized = arr ** 2 + np.sin(arr) vectorized_time = time.time() - start print(f"数据规模: {n:,}") print(f"Python循环时间: {loop_time:.4f}秒") print(f"NumPy向量化时间: {vectorized_time:.4f}秒") print(f"加速比: {loop_time/vectorized_time:.1f}x") # 2. 内存布局优化 print("\n2. 内存布局优化:") # 创建大型矩阵 size = 2000 matrix_c = np.ones((size, size), order='C') # 行优先 matrix_f = np.ones((size, size), order='F') # 列优先 # 行优先访问 start = time.time() for i in range(size): for j in range(size): matrix_c[i, j] = i + j time_c_row_major = time.time() - start start = time.time() for i in range(size): for j in range(size): matrix_f[i, j] = i + j time_f_row_major = time.time() - start # 列优先访问 start = time.time() for j in range(size): for i in range(size): matrix_c[i, j] = i + j time_c_col_major = time.time() - start start = time.time() for j in range(size): for i in range(size): matrix_f[i, j] = i + j time_f_col_major = time.time() - start print(f"矩阵大小: {size}x{size}") print(f"\n{'访问方式':<15} {'C风格':<15} {'F风格':<15}") print("-" * 45) print(f"{'行优先':<15} {time_c_row_major:<15.4f} {time_f_row_major:<15.4f}") print(f"{'列优先':<15} {time_c_col_major:<15.4f} {time_f_col_major:<15.4f}") # 3. 广播优化 print("\n3. 广播优化技巧:") # 不优化的版本 arr1 = np.random.randn(1000, 1000) arr2 = np.random.randn(1000, 1) start = time.time() result_slow = np.zeros_like(arr1) for i in range(arr1.shape[0]): result_slow[i, :] = arr1[i, :] + arr2[i, 0] slow_time = time.time() - start # 广播优化版本 start = time.time() result_fast = arr1 + arr2 fast_time = time.time() - start print(f"数据形状: arr1={arr1.shape}, arr2={arr2.shape}") print(f"循环版本时间: {slow_time:.4f}秒") print(f"广播版本时间: {fast_time:.4f}秒") print(f"加速比: {slow_time/fast_time:.1f}x") # 4. 就地操作 print("\n4. 就地操作 (in-place):") large_arr = np.random.randn(5000, 5000) # 非就地操作 start = time.time() result = large_arr * 2 + 1 non_inplace_time = time.time() - start # 就地操作 start = time.time() large_arr *= 2 large_arr += 1 inplace_time = time.time() - start print(f"数组大小: {large_arr.shape}") print(f"非就地操作时间: {non_inplace_time:.4f}秒") print(f"就地操作时间: {inplace_time:.4f}秒") print(f"内存节省: 50% (无需创建中间数组)") # 5. 使用NumPy内置函数 print("\n5. 使用NumPy内置函数:") arr = np.random.randn(10000) # 手动计算 start = time.time() mean_manual = sum(arr) / len(arr) std_manual = np.sqrt(sum((arr - mean_manual) ** 2) / len(arr)) manual_time = time.time() - start # NumPy内置函数 start = time.time() mean_np = np.mean(arr) std_np = np.std(arr) numpy_time = time.time() - start print(f"数组大小: {len(arr):,}") print(f"手动计算时间: {manual_time:.6f}秒") print(f"NumPy函数时间: {numpy_time:.6f}秒") print(f"加速比: {manual_time/numpy_time:.1f}x") # 6. 避免不必要的复制 print("\n6. 避免不必要的复制:") original = np.arange(1000000) # 不必要的复制 start = time.time() copied = original.copy() copied = copied * 2 copy_time = time.time() - start # 原地修改 start = time.time() original *= 2 inplace_time_2 = time.time() - start print(f"数组大小: {len(original):,}") print(f"复制后操作时间: {copy_time:.4f}秒") print(f"原地操作时间: {inplace_time_2:.4f}秒") # 7. 使用einsum进行复杂运算 print("\n7. 使用einsum进行复杂运算:") A = np.random.randn(100, 100) B = np.random.randn(100, 100) # 传统方法 start = time.time() result_traditional = np.zeros((100, 100)) for i in range(100): for j in range(100): for k in range(100): result_traditional[i, j] += A[i, k] * B[k, j] traditional_time = time.time() - start # einsum方法 start = time.time() result_einsum = np.einsum('ik,kj->ij', A, B) einsum_time = time.time() - start # 矩阵乘法 start = time.time() result_matmul = A @ B matmul_time = time.time() - start print(f"矩阵大小: 100x100") print(f"三重循环时间: {traditional_time:.4f}秒") print(f"einsum时间: {einsum_time:.4f}秒") print(f"矩阵乘法时间: {matmul_time:.6f}秒") print(f"einsum vs 循环加速比: {traditional_time/einsum_time:.0f}x") # 8. 使用numexpr加速复杂表达式 print("\n8. 使用numexpr加速复杂表达式:") try: import numexpr as ne arr = np.random.randn(1000000) # NumPy版本 start = time.time() result_numpy = np.sin(arr) ** 2 + np.cos(arr) ** 2 numpy_expr_time = time.time() - start # numexpr版本 start = time.time() result_numexpr = ne.evaluate('sin(arr) ** 2 + cos(arr) ** 2') numexpr_time = time.time() - start print(f"数组大小: {len(arr):,}") print(f"NumPy表达式时间: {numpy_expr_time:.4f}秒") print(f"numexpr时间: {numexpr_time:.4f}秒") print(f"加速比: {numpy_expr_time/numexpr_time:.1f}x") except ImportError: print("安装numexpr以加速复杂表达式: pip install numexpr") # 9. 性能优化总结 print("\n" + "=" * 80) print("NumPy性能优化最佳实践总结:") print("=" * 80) best_practices = [ "1. 始终使用向量化操作,避免Python循环", "2. 合理选择内存布局(C风格或F风格)", "3. 利用广播机制减少内存使用", "4. 尽可能使用就地操作", "5. 使用NumPy内置函数而非手动实现", "6. 避免不必要的数组复制", "7. 对于复杂运算,使用einsum或专用函数", "8. 考虑使用numexpr加速复杂表达式", "9. 使用适当的数据类型(float32 vs float64)", "10. 利用多核并行计算(如通过BLAS库)" ] for practice in best_practices: print(practice) return arr1, arr2# 执行性能优化演示optimized_arr1, optimized_arr2 = demonstrate_performance_optimization()
八、实际应用案例
def practical_applications(): """NumPy实际应用案例""" print("=" * 80) print("NumPy实际应用案例") print("=" * 80) # 1. 图像处理 print("1. 图像处理:") # 创建模拟图像 (RGB) height, width = 400, 600 channels = 3 # 随机生成图像 image = np.random.randint(0, 256, (height, width, channels), dtype=np.uint8) print(f"图像形状: {image.shape}") print(f"图像数据类型: {image.dtype}") print(f"像素值范围: [{image.min()}, {image.max()}]") # 转换为灰度图 gray_image = np.dot(image[..., :3], [0.2989, 0.5870, 0.1140]).astype(np.uint8) print(f"灰度图像状: {gray_image.shape}") # 图像增强:对比度拉伸 def contrast_stretch(img, low_percent=2, high_percent=98): """对比度拉伸""" low_val = np.percentile(img, low_percent) high_val = np.percentile(img, high_percent) stretched = np.clip((img - low_val) * 255.0 / (high_val - low_val), 0, 255) return stretched.astype(np.uint8) enhanced_image = contrast_stretch(gray_image) # 2. 信号处理 print("\n2. 信号处理:") # 生成信号 fs = 1000 # 采样频率 1000 Hz t = np.linspace(0, 1, fs, endpoint=False) # 1秒信号 # 创建多频率信号 signal = (np.sin(2 * np.pi * 50 * t) + # 50Hz 0.5 * np.sin(2 * np.pi * 120 * t) + # 120Hz 0.3 * np.sin(2 * np.pi * 300 * t)) # 300Hz # 添加噪声 noise = 0.2 * np.random.randn(len(t)) noisy_signal = signal + noise # 快速傅里叶变换 fft_result = np.fft.fft(noisy_signal) frequencies = np.fft.fftfreq(len(t), 1/fs) # 找到主要频率成分 magnitude = np.abs(fft_result) dominant_freqs = frequencies[np.argsort(magnitude)[-3:]] print(f"信号长度: {len(signal)}") print(f"采样频率: {fs} Hz") print(f"主要频率成分: {np.sort(dominant_freqs[:3])} Hz") # 3. 物理模拟 print("\n3. 物理模拟 - 抛体运动:") # 模拟参数 g = 9.81 # 重力加速度 m/s² v0 = 50 # 初速度 m/s theta = np.radians(45) # 发射角度 45度 # 时间向量 t_max = 2 * v0 * np.sin(theta) / g t_sim = np.linspace(0, t_max, 100) # 运动方程 x = v0 * np.cos(theta) * t_sim y = v0 * np.sin(theta) * t_sim - 0.5 * g * t_sim**2 # 计算结果 max_height = np.max(y) range_distance = np.max(x) print(f"发射速度: {v0} m/s") print(f"发射角度: {np.degrees(theta):.1f}°") print(f"最大高度: {max_height:.2f} m") print(f"射程: {range_distance:.2f} m") print(f"飞行时间: {t_max:.2f} s") # 4. 金融计算 print("\n4. 金融计算 - 期权定价:") # 蒙特卡洛模拟欧式看涨期权 def monte_carlo_option_price(S0, K, T, r, sigma, n_simulations=100000): """蒙特卡洛模拟期权定价""" # 生成随机路径 np.random.seed(42) z = np.random.randn(n_simulations) # 使用几何布朗运动模型 ST = S0 * np.exp((r - 0.5 * sigma**2) * T + sigma * np.sqrt(T) * z) # 计算期权收益 payoff = np.maximum(ST - K, 0) # 贴现得到期权价格 option_price = np.exp(-r * T) * np.mean(payoff) return option_price, payoff # 参数设置 S0 = 100 # 当前股价 K = 105 # 行权价 T = 1.0 # 到期时间(年) r = 0.05 # 无风险利率 sigma = 0.2 # 波动率 price, payoffs = monte_carlo_option_price(S0, K, T, r, sigma, 100000) print(f"当前股价: ${S0}") print(f"行权价: ${K}") print(f"到期时间: {T} 年") print(f"无风险利率: {r*100:.1f}%") print(f"波动率: {sigma*100:.1f}%") print(f"蒙特卡洛期权价格: ${price:.4f}") # 计算置信区间 std_error = np.std(payoffs) / np.sqrt(len(payoffs)) confidence_interval = (price - 1.96 * std_error, price + 1.96 * std_error) print(f"95%置信区间: (${confidence_interval[0]:.4f}, ${confidence_interval[1]:.4f})") # 5. 机器学习特征工程 print("\n5. 机器学习特征工程:") # 创建示例数据集 n_samples = 1000 n_features = 5 X = np.random.randn(n_samples, n_features) y = 2 * X[:, 0] + 3 * X[:, 1] - 1.5 * X[:, 2] + np.random.randn(n_samples) * 0.5 print(f"数据集形状: X={X.shape}, y={y.shape}") # 特征标准化 X_mean = np.mean(X, axis=0) X_std = np.std(X, axis=0) X_normalized = (X - X_mean) / X_std print(f"\n特征标准化:") print(f"标准化后均值: {X_normalized.mean(axis=0).round(2)}") print(f"标准化后标准差: {X_normalized.std(axis=0).round(2)}") # 特征交互 interactions = np.column_stack([ X[:, 0] * X[:, 1], # 特征0和1的交互 X[:, 0] ** 2, # 特征0的平方 np.log(np.abs(X[:, 2]) + 1), # 特征2的对数变换 np.sin(X[:, 3]), # 特征3的正弦变换 np.exp(X[:, 4]) # 特征4的指数变换 ]) print(f"\n特征交互形状: {interactions.shape}") # 主成分分析 (PCA) from sklearn.decomposition import PCA pca = PCA(n_components=2) X_pca = pca.fit_transform(X_normalized) print(f"\nPCA结果:") print(f"主成分形状: {X_pca.shape}") print(f"解释方差比例: {pca.explained_variance_ratio_}") print(f"累计解释方差: {np.cumsum(pca.explained_variance_ratio_)}") # 6. 科学计算 - 求解微分方程 print("\n6. 科学计算 - 求解微分方程:") # 求解一阶微分方程 dy/dt = -ky def solve_ode_euler(k, y0, t_start, t_end, n_steps): """使用欧拉法求解微分方程""" t = np.linspace(t_start, t_end, n_steps) y = np.zeros(n_steps) y[0] = y0 dt = (t_end - t_start) / n_steps for i in range(1, n_steps): y[i] = y[i-1] + dt * (-k * y[i-1]) return t, y # 解析解 def analytical_solution(k, y0, t): return y0 * np.exp(-k * t) # 数值解 k = 0.5 y0 = 1.0 t_start, t_end = 0, 10 n_steps = 1000 t_numerical, y_numerical = solve_ode_euler(k, y0, t_start, t_end, n_steps) y_analytical = analytical_solution(k, y0, t_numerical) # 计算误差 error = np.abs(y_numerical - y_analytical) max_error = np.max(error) mean_error = np.mean(error) print(f"微分方程: dy/dt = -{k}y, y(0) = {y0}") print(f"数值方法: 欧拉法") print(f"时间区间: [{t_start}, {t_end}]") print(f"步数: {n_steps}") print(f"最大误差: {max_error:.6f}") print(f"平均误差: {mean_error:.6f}") # 可视化结果 fig, axes = plt.subplots(2, 3, figsize=(15, 10)) # 1. 图像处理 axes[0, 0].imshow(image) axes[0, 0].set_title('原始彩色图像') axes[0, 0].axis('off') axes[0, 1].imshow(gray_image, cmap='gray') axes[0, 1].set_title('灰度图像') axes[0, 1].axis('off') axes[0, 2].imshow(enhanced_image, cmap='gray') axes[0, 2].set_title('对比度增强') axes[0, 2].axis('off') # 2. 信号处理 axes[1, 0].plot(t[:200], noisy_signal[:200]) axes[1, 0].set_title('含噪信号 (前200个点)') axes[1, 0].set_xlabel('时间 (s)') axes[1, 0].set_ylabel('幅度') axes[1, 0].grid(True, alpha=0.3) # 3. 物理模拟 axes[1, 1].plot(x, y) axes[1, 1].set_title('抛体运动轨迹') axes[1, 1].set_xlabel('水平距离 (m)') axes[1, 1].set_ylabel('垂直高度 (m)') axes[1, 1].grid(True, alpha=0.3) axes[1, 1].axhline(y=0, color='k', linestyle='--', alpha=0.5) # 4. 微分方程求解 axes[1, 2].plot(t_numerical, y_numerical, 'b-', label='数值解') axes[1, 2].plot(t_numerical, y_analytical, 'r--', label='解析解') axes[1, 2].set_title('微分方程求解') axes[1, 2].set_xlabel('时间') axes[1, 2].set_ylabel('y(t)') axes[1, 2].legend() axes[1, 2].grid(True, alpha=0.3) plt.suptitle('NumPy实际应用案例', fontsize=16, y=1.02) plt.tight_layout() plt.show() return { 'image': image, 'gray_image': gray_image, 'signal': noisy_signal, 'projectile': (x, y), 'option_price': price, 'features': X_normalized }# 执行实际应用案例applications = practical_applications()
九、今日练习与挑战
"""今日练习任务:基础练习:1. 创建各种形状的数组并练习索引切片2. 实现基本的数组运算和广播3. 练习使用随机数生成函数中级挑战:1. 实现矩阵运算和线性代数函数2. 使用NumPy进行简单的图像处理3. 实现蒙特卡洛模拟高级项目:1. 实现完整的线性回归模型(从零开始)2. 使用NumPy实现简单的神经网络3. 构建一个物理模拟系统综合项目:股票数据分析系统要求:1. 生成模拟股票价格数据2. 计算技术指标(移动平均、波动率等)3. 实现简单的交易策略4. 进行回测分析"""def numpy_practice_exercises(): """NumPy练习题目""" print("=" * 80) print("NumPy练习题目") print("=" * 80) # 练习1:数组创建与操作 print("\n练习1:数组创建与操作") print("创建一个10x10的数组,其中元素为:") print(" - 主对角线为1") print(" - 副对角线为2") print(" - 其他元素为0") # 参考答案 n = 10 arr = np.zeros((n, n)) np.fill_diagonal(arr, 1) np.fill_diagonal(np.fliplr(arr), 2) print(f"\n参考答案:\n{arr}") # 练习2:广播应用 print("\n练习2:广播应用") print("创建一个函数,计算矩阵每行的归一化值") print("(每行减去该行均值,除以该行标准差)") def normalize_rows(matrix): """行归一化""" row_means = matrix.mean(axis=1, keepdims=True) row_stds = matrix.std(axis=1, keepdims=True) return (matrix - row_means) / row_stds test_matrix = np.random.randn(5, 3) normalized = normalize_rows(test_matrix) print(f"\n测试矩阵:\n{test_matrix}") print(f"归一化后每行均值: {normalized.mean(axis=1)}") print(f"归一化后每行标准差: {normalized.std(axis=1)}") # 练习3:线性代数 print("\n练习3:线性代数") print("实现线性回归的正规方程解法") def linear_regression_normal_equation(X, y): """正规方程线性回归""" # 添加偏置项 X_b = np.c_[np.ones((X.shape[0], 1)), X] # 正规方程: θ = (X^T X)^(-1) X^T y theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y return theta # 测试数据 X_test = np.random.randn(100, 2) y_test = 3 + 2 * X_test[:, 0] - 1.5 * X_test[:, 1] + np.random.randn(100) * 0.1 theta = linear_regression_normal_equation(X_test, y_test) print(f"\n真实参数: [3.0, 2.0, -1.5]") print(f"估计参数: {theta}") # 练习4:性能优化 print("\n练习4:性能优化") print("比较不同方法计算欧几里得距离矩阵的性能") def distance_matrix_slow(X): """慢速版本:双重循环""" n = X.shape[0] D = np.zeros((n, n)) for i in range(n): for j in range(n): D[i, j] = np.sqrt(np.sum((X[i] - X[j]) ** 2)) return D def distance_matrix_fast(X): """快速版本:向量化""" sum_X = np.sum(X ** 2, axis=1) D = np.sqrt(np.maximum(sum_X[:, np.newaxis] + sum_X[np.newaxis, :] - 2 * X @ X.T, 0)) return D # 测试小规模数据 X_small = np.random.randn(100, 3) start = time.time() D_slow = distance_matrix_slow(X_small) slow_time = time.time() - start start = time.time() D_fast = distance_matrix_fast(X_small) fast_time = time.time() - start print(f"数据形状: {X_small.shape}") print(f"慢速版本时间: {slow_time:.4f}秒") print(f"快速版本时间: {fast_time:.4f}秒") print(f"加速比: {slow_time/fast_time:.1f}x") print(f"结果一致性: {np.allclose(D_slow, D_fast)}") # 练习5:综合应用 print("\n练习5:综合应用 - K-means聚类") def kmeans(X, k, max_iters=100): """K-means聚类算法""" # 随机初始化中心点 n_samples = X.shape[0] indices = np.random.choice(n_samples, k, replace=False) centers = X[indices] for _ in range(max_iters): # 分配标签 distances = np.sqrt(((X[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)) labels = np.argmin(distances, axis=1) # 更新中心点 new_centers = np.array([X[labels == i].mean(axis=0) for i in range(k)]) # 检查收敛 if np.allclose(centers, new_centers): break centers = new_centers return centers, labels # 生成测试数据 np.random.seed(42) n_samples = 300 X1 = np.random.randn(n_samples, 2) + np.array([5, 5]) X2 = np.random.randn(n_samples, 2) + np.array([-5, -5]) X3 = np.random.randn(n_samples, 2) + np.array([5, -5]) X_kmeans = np.vstack([X1, X2, X3]) centers, labels = kmeans(X_kmeans, k=3) print(f"数据形状: {X_kmeans.shape}") print(f"聚类中心:\n{centers}") print(f"标签分布: {np.bincount(labels)}") return { 'normalized_matrix': normalized, 'regression_theta': theta, 'distance_matrix': D_fast, 'kmeans_results': (centers, labels, X_kmeans) }# 执行练习practice_results = numpy_practice_exercises()
NumPy不仅是Python的科学计算库,更是整个数据科学生态系统的基石。掌握NumPy是成为Python数据科学家的第一步。