I. What Is Deep Learning?
1. Defining Deep Learning
Deep learning is a subfield of machine learning. Loosely inspired by the layered structure of biological neural networks, it uses many layers of nonlinear transformations to learn hierarchical representations of data. Its core idea is to let the machine learn feature representations automatically instead of relying on hand-engineered features.
```python
# Comparing deep learning with traditional machine learning
import matplotlib.pyplot as plt

# Side-by-side visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Traditional machine learning
axes[0].text(0.5, 0.5,
             'Feature engineering → model training\n\nHand-crafted features\n+ classical algorithms\n= limited expressive power',
             ha='center', va='center', fontsize=12, fontweight='bold')
axes[0].set_title('Traditional machine learning', fontsize=14, fontweight='bold')
axes[0].axis('off')

# Deep learning
axes[1].text(0.5, 0.5,
             'Raw data → deep learning\n\nAutomatic feature learning\n+ multi-layer neural networks\n= strong expressive power',
             ha='center', va='center', fontsize=12, fontweight='bold')
axes[1].set_title('Deep learning', fontsize=14, fontweight='bold')
axes[1].axis('off')

plt.suptitle('Traditional machine learning vs. deep learning', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
```
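To make "multiple layers of nonlinear transformations" concrete, here is a minimal NumPy sketch. It is not a trainable model, just three randomly initialized layers composed end to end: each layer re-represents the output of the previous one, which is what "hierarchical representation" means mechanically.

```python
import numpy as np

rng = np.random.default_rng(0)

def layer(x, in_dim, out_dim):
    """One affine transformation followed by a ReLU nonlinearity."""
    W = rng.normal(size=(in_dim, out_dim)) * 0.1
    b = np.zeros(out_dim)
    return np.maximum(0, x @ W + b)

x = rng.normal(size=(1, 8))          # one sample with 8 raw features
h1 = layer(x, 8, 16)                 # first-level representation
h2 = layer(h1, 16, 16)               # second-level representation
y = layer(h2, 16, 4)                 # final 4-dimensional representation
print(h1.shape, h2.shape, y.shape)   # (1, 16) (1, 16) (1, 4)
```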
2. Historical Milestones of Deep Learning
```python
# Deep learning timeline
timeline_data = {
    '1943': 'McCulloch & Pitts propose the artificial neuron model',
    '1958': 'Frank Rosenblatt invents the perceptron',
    '1969': 'Minsky & Papert expose the limitations of the perceptron',
    '1986': 'Backpropagation is rediscovered and popularized (Rumelhart, Hinton & Williams)',
    '1998': 'Yann LeCun presents LeNet-5 (a convolutional neural network)',
    '2006': 'Geoffrey Hinton introduces deep belief networks',
    '2012': 'AlexNet wins the ImageNet competition decisively',
    '2014': 'Generative adversarial networks (GANs) are proposed',
    '2015': 'ResNet makes very deep networks trainable',
    '2017': 'The Transformer architecture is introduced',
    '2020': 'GPT-3 demonstrates strong language capabilities',
    '2022': 'Diffusion models drive an image-generation revolution'
}

print("Deep learning milestones:")
for year, event in timeline_data.items():
    print(f"{year}: {event}")
```
II. Neural Network Fundamentals
1. The Artificial Neuron Model
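An artificial neuron computes a weighted sum of its inputs plus a bias, then passes the result through a nonlinear activation function. Using the same names as the class below,

$$z = \mathbf{w}^{\top}\mathbf{x} + b, \qquad a = \sigma(z), \qquad \sigma(z) = \frac{1}{1+e^{-z}}$$

The code implements exactly this computation for a few common choices of activation.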
```python
import numpy as np

class ArtificialNeuron:
    """A basic artificial neuron."""

    def __init__(self, input_size, activation='sigmoid'):
        """
        Initialize the neuron.

        Args:
            input_size: number of input features
            activation: type of activation function
        """
        # Initialize weights and bias
        self.weights = np.random.randn(input_size) * 0.1
        self.bias = np.random.randn() * 0.1
        self.activation_type = activation

    def activate(self, x):
        """Forward pass."""
        # Linear combination
        z = np.dot(x, self.weights) + self.bias
        # Apply the activation function
        if self.activation_type == 'sigmoid':
            return self._sigmoid(z)
        elif self.activation_type == 'relu':
            return self._relu(z)
        elif self.activation_type == 'tanh':
            return self._tanh(z)
        else:
            return z  # linear activation

    def _sigmoid(self, x):
        """Sigmoid activation."""
        return 1 / (1 + np.exp(-x))

    def _relu(self, x):
        """ReLU activation."""
        return np.maximum(0, x)

    def _tanh(self, x):
        """Tanh activation."""
        return np.tanh(x)

    def __call__(self, x):
        """Make the neuron callable."""
        return self.activate(x)

# Usage example
neuron = ArtificialNeuron(3, activation='sigmoid')
inputs = np.array([0.5, -0.2, 0.8])
output = neuron(inputs)
print(f"Neuron output: {output:.4f}")
```
2. Common Activation Functions and Their Properties
```python
import numpy as np
import matplotlib.pyplot as plt

def plot_activation_functions():
    """Plot common activation functions."""
    x = np.linspace(-5, 5, 100)

    # Define the activation functions
    functions = {
        'Sigmoid': lambda x: 1 / (1 + np.exp(-x)),
        'ReLU': lambda x: np.maximum(0, x),
        'Leaky ReLU': lambda x: np.where(x > 0, x, 0.01 * x),
        'Tanh': lambda x: np.tanh(x),
        'Swish': lambda x: x * (1 / (1 + np.exp(-x))),
        'ELU': lambda x: np.where(x > 0, x, np.exp(x) - 1),
        'Softplus': lambda x: np.log(1 + np.exp(x))
    }

    # Create the subplots
    fig, axes = plt.subplots(2, 4, figsize=(15, 8))
    axes = axes.ravel()

    for idx, (name, func) in enumerate(functions.items()):
        ax = axes[idx]
        y = func(x)
        ax.plot(x, y, 'b-', linewidth=2)
        ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
        ax.axvline(x=0, color='k', linestyle='-', alpha=0.3)
        ax.grid(True, alpha=0.3)
        ax.set_title(name, fontsize=12, fontweight='bold')

        # Annotate key properties
        if name == 'Sigmoid':
            ax.text(0, 0.5, 'range: (0, 1)\nsmooth, differentiable\nprone to vanishing gradients',
                    fontsize=9, ha='center')
        elif name == 'ReLU':
            ax.text(0, 2.5, 'sparse activations\ncheap to compute\nprone to "dead neurons"',
                    fontsize=9, ha='center')

    # Hide the unused eighth subplot
    axes[-1].axis('off')

    plt.tight_layout()
    plt.suptitle('Common activation functions', fontsize=16, fontweight='bold', y=1.02)
    plt.show()

# Plot the activation functions
plot_activation_functions()
```
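The "vanishing gradient" caveat in the sigmoid panel is easy to verify numerically. A minimal sketch: the sigmoid's derivative is $\sigma(z)(1-\sigma(z))$, which peaks at 0.25 and decays toward zero for large $|z|$, so gradients flowing through many saturated sigmoid layers shrink geometrically.

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_grad(z):
    # Derivative of the sigmoid: sigma(z) * (1 - sigma(z))
    s = sigmoid(z)
    return s * (1 - s)

for z in [0.0, 2.0, 5.0, 10.0]:
    print(f"z = {z:5.1f}  ->  sigmoid'(z) = {sigmoid_grad(z):.6f}")
# z =   0.0  ->  sigmoid'(z) = 0.250000
# z =  10.0  ->  sigmoid'(z) ~ 0.000045
```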
III. Types of Deep Learning Architectures
1. Feedforward Neural Networks (FNN)
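In a feedforward network, each layer $l$ applies an affine map followed by an activation to the previous layer's output:

$$\mathbf{a}^{(0)} = \mathbf{x}, \qquad \mathbf{z}^{(l)} = \mathbf{a}^{(l-1)} W^{(l)} + \mathbf{b}^{(l)}, \qquad \mathbf{a}^{(l)} = f^{(l)}\!\left(\mathbf{z}^{(l)}\right)$$

This is the recurrence that the `forward` method below implements, with ReLU in the hidden layers and a sigmoid output by default.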
```python
class FeedForwardNeuralNetwork:
    """A simple feedforward neural network."""

    def __init__(self, layer_sizes, activations=None):
        """
        Initialize the network.

        Args:
            layer_sizes: neurons per layer, e.g. [10, 20, 5, 1]
            activations: list of activation names, one per weight layer
        """
        self.layer_sizes = layer_sizes
        self.num_layers = len(layer_sizes) - 1

        # Initialize weights and biases
        self.weights = []
        self.biases = []
        for i in range(self.num_layers):
            # Xavier/Glorot initialization
            scale = np.sqrt(2.0 / (layer_sizes[i] + layer_sizes[i + 1]))
            w = np.random.randn(layer_sizes[i], layer_sizes[i + 1]) * scale
            b = np.zeros(layer_sizes[i + 1])
            self.weights.append(w)
            self.biases.append(b)

        # Default: ReLU for hidden layers, sigmoid for the output layer
        if activations is None:
            activations = ['relu'] * (self.num_layers - 1) + ['sigmoid']
        self.activations = activations

    def forward(self, X):
        """Forward pass."""
        self.activations_history = [X]
        self.z_history = []

        a = X
        for i in range(self.num_layers):
            z = np.dot(a, self.weights[i]) + self.biases[i]
            self.z_history.append(z)

            if self.activations[i] == 'sigmoid':
                a = 1 / (1 + np.exp(-z))
            elif self.activations[i] == 'relu':
                a = np.maximum(0, z)
            elif self.activations[i] == 'tanh':
                a = np.tanh(z)
            else:
                a = z  # linear activation

            self.activations_history.append(a)

        return a

    def predict(self, X):
        """Predict."""
        return self.forward(X)

    def summary(self):
        """Print a summary of the network structure."""
        print("=" * 50)
        print("Network summary")
        print("=" * 50)
        print(f"Layers: {self.num_layers}")
        print(f"Neurons per layer: {self.layer_sizes}")
        print(f"Activations: {self.activations}")

        total_params = 0
        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            params = w.size + b.size
            total_params += params
            print(f"Layer {i + 1}: {w.shape[0]} → {w.shape[1]} "
                  f"(weights: {w.shape}, bias: {b.shape}) "
                  f"parameters: {params:,}")

        print(f"Total parameters: {total_params:,}")
        print("=" * 50)

# Build and test the network
nn = FeedForwardNeuralNetwork(
    layer_sizes=[10, 20, 15, 1],
    activations=['relu', 'relu', 'sigmoid']
)
nn.summary()

# Test the forward pass
X_test = np.random.randn(5, 10)  # 5 samples, 10 features
output = nn.predict(X_test)
print(f"\nInput shape: {X_test.shape}")
print(f"Output shape: {output.shape}")
```
2. Convolutional Neural Networks (CNN)
```python
class SimpleCNN:
    """A simplified convolutional neural network."""

    def __init__(self, input_shape=(28, 28, 1)):
        """Initialize the CNN."""
        self.input_shape = input_shape
        self.layers = []

    def add_conv_layer(self, filters=32, kernel_size=3, activation='relu'):
        """Add a convolutional layer."""
        self.layers.append({
            'type': 'conv',
            'filters': filters,
            'kernel_size': kernel_size,
            'activation': activation
        })
        return self

    def add_pooling_layer(self, pool_size=2, stride=2):
        """Add a pooling layer."""
        self.layers.append({
            'type': 'pool',
            'pool_size': pool_size,
            'stride': stride
        })
        return self

    def add_dense_layer(self, units, activation='relu'):
        """Add a fully connected layer."""
        self.layers.append({
            'type': 'dense',
            'units': units,
            'activation': activation
        })
        return self

    def add_flatten_layer(self):
        """Add a flatten layer."""
        self.layers.append({'type': 'flatten'})
        return self

    def forward(self, X):
        """Forward pass (simplified)."""
        output = X
        for layer in self.layers:
            if layer['type'] == 'conv':
                output = self._conv_forward(output, layer)
            elif layer['type'] == 'pool':
                output = self._pool_forward(output, layer)
            elif layer['type'] == 'flatten':
                output = output.reshape(output.shape[0], -1)
            elif layer['type'] == 'dense':
                output = self._dense_forward(output, layer)
        return output

    def _conv_forward(self, X, layer_params):
        """Simplified convolution: only the output shape is modeled.
        A real implementation would compute the actual convolution;
        here the values are random placeholders."""
        batch_size, height, width, channels = X.shape
        filters = layer_params['filters']
        kernel_size = layer_params['kernel_size']
        # 'Valid' convolution with stride 1
        output_height = height - kernel_size + 1
        output_width = width - kernel_size + 1
        return np.random.randn(batch_size, output_height, output_width, filters)

    def _pool_forward(self, X, layer_params):
        """Simplified pooling: shape calculation only."""
        batch_size, height, width, channels = X.shape
        pool_size = layer_params['pool_size']
        output_height = height // pool_size
        output_width = width // pool_size
        return np.random.randn(batch_size, output_height, output_width, channels)

    def _dense_forward(self, X, layer_params):
        """Simplified fully connected layer: placeholder values."""
        units = layer_params['units']
        activation = layer_params['activation']
        # Stand-in for the linear transformation
        output = np.random.randn(X.shape[0], units)
        # Activation function
        if activation == 'relu':
            output = np.maximum(0, output)
        elif activation == 'sigmoid':
            output = 1 / (1 + np.exp(-output))
        return output

    def summary(self):
        """Print a summary of the network structure."""
        print("=" * 60)
        print("CNN summary")
        print("=" * 60)
        print(f"Input shape: {self.input_shape}")

        current_shape = self.input_shape
        for i, layer in enumerate(self.layers):
            layer_type = layer['type'].upper()

            if layer_type == 'CONV':
                filters = layer['filters']
                kernel_size = layer['kernel_size']
                print(f"Layer {i + 1}: conv "
                      f"({current_shape[0]}x{current_shape[1]}x{current_shape[2]}) → "
                      f"filters: {filters}, kernel: {kernel_size}x{kernel_size}")
                # Update the shape ('valid' convolution, stride 1)
                current_shape = (
                    current_shape[0] - kernel_size + 1,
                    current_shape[1] - kernel_size + 1,
                    filters
                )
            elif layer_type == 'POOL':
                pool_size = layer['pool_size']
                print(f"Layer {i + 1}: pool "
                      f"({current_shape[0]}x{current_shape[1]}x{current_shape[2]}) → "
                      f"pool size: {pool_size}x{pool_size}")
                current_shape = (
                    current_shape[0] // pool_size,
                    current_shape[1] // pool_size,
                    current_shape[2]
                )
            elif layer_type == 'FLATTEN':
                flattened_size = int(np.prod(current_shape))
                print(f"Layer {i + 1}: flatten "
                      f"({current_shape[0]}x{current_shape[1]}x{current_shape[2]}) → "
                      f"({flattened_size},)")
                current_shape = (flattened_size,)
            elif layer_type == 'DENSE':
                units = layer['units']
                in_size = current_shape[0] if isinstance(current_shape, tuple) else current_shape
                print(f"Layer {i + 1}: dense ({in_size}) → ({units},)")
                current_shape = (units,)

        print("=" * 60)

# Build and test the CNN
cnn = SimpleCNN(input_shape=(28, 28, 1))
cnn.add_conv_layer(filters=32, kernel_size=3, activation='relu')
cnn.add_pooling_layer(pool_size=2)
cnn.add_conv_layer(filters=64, kernel_size=3, activation='relu')
cnn.add_pooling_layer(pool_size=2)
cnn.add_flatten_layer()
cnn.add_dense_layer(units=128, activation='relu')
cnn.add_dense_layer(units=10, activation='softmax')
cnn.summary()
```
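The summary above assumes a "valid" convolution with stride 1, where the output side length is $H - K + 1$. For reference, the general output-size formula with padding $P$ and stride $S$, which the simplified class does not model, is $\lfloor (H - K + 2P)/S \rfloor + 1$. A quick sketch:

```python
def conv_output_size(h, k, padding=0, stride=1):
    """General convolution output size: floor((H - K + 2P) / S) + 1."""
    return (h - k + 2 * padding) // stride + 1

print(conv_output_size(28, 3))                       # 26: valid conv, as in the summary above
print(conv_output_size(28, 3, padding=1))            # 28: 'same' padding preserves the size
print(conv_output_size(28, 3, padding=1, stride=2))  # 14: stride 2 roughly halves it
```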
3. Recurrent Neural Networks (RNN)
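An RNN processes a sequence one step at a time, carrying a hidden state $h_t$ that summarizes everything seen so far. Using the weight names from the code below:

$$h_t = \tanh\!\left(x_t W_{xh} + h_{t-1} W_{hh} + b_h\right), \qquad y_t = h_t W_{hy} + b_y$$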
```python
class SimpleRNN:
    """A simplified recurrent neural network."""

    def __init__(self, input_size, hidden_size, output_size):
        """Initialize the RNN."""
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Input-to-hidden weights
        self.W_xh = np.random.randn(input_size, hidden_size) * 0.01
        # Hidden-to-hidden (recurrent) weights
        self.W_hh = np.random.randn(hidden_size, hidden_size) * 0.01
        # Hidden-to-output weights
        self.W_hy = np.random.randn(hidden_size, output_size) * 0.01

        # Biases
        self.b_h = np.zeros(hidden_size)
        self.b_y = np.zeros(output_size)

        # Cache for intermediate values
        self.history = {}

    def forward(self, X):
        """
        Forward pass.

        Args:
            X: input sequence of shape (seq_length, batch_size, input_size)

        Returns:
            The output sequence, shape (seq_length, batch_size, output_size).
        """
        seq_length, batch_size, _ = X.shape

        # Initialize the hidden state
        h = np.zeros((batch_size, self.hidden_size))

        # Store intermediate values
        self.history['h_states'] = [h]
        self.history['inputs'] = X

        outputs = []
        for t in range(seq_length):
            # Input at the current time step
            x_t = X[t]

            # Update the hidden state
            h = np.tanh(np.dot(x_t, self.W_xh) + np.dot(h, self.W_hh) + self.b_h)
            self.history['h_states'].append(h)

            # Compute the output
            y_t = np.dot(h, self.W_hy) + self.b_y
            outputs.append(y_t)

        # Stack the outputs of all time steps
        return np.stack(outputs)

    def backward(self, d_outputs):
        """Backward pass (placeholder). A real implementation would run
        backpropagation through time; here we return random stand-in gradients."""
        gradients = {
            'dW_xh': np.random.randn(*self.W_xh.shape) * 0.01,
            'dW_hh': np.random.randn(*self.W_hh.shape) * 0.01,
            'dW_hy': np.random.randn(*self.W_hy.shape) * 0.01,
            'db_h': np.random.randn(*self.b_h.shape) * 0.01,
            'db_y': np.random.randn(*self.b_y.shape) * 0.01
        }
        return gradients

    def summary(self):
        """Print a summary of the network structure."""
        print("=" * 50)
        print("RNN summary")
        print("=" * 50)
        print(f"Input size: {self.input_size}")
        print(f"Hidden size: {self.hidden_size}")
        print(f"Output size: {self.output_size}")
        total = (self.W_xh.size + self.W_hh.size + self.W_hy.size
                 + self.b_h.size + self.b_y.size)
        print(f"Total parameters: {total:,}")
        print("=" * 50)

    def generate_sequence(self, seed, length=20):
        """Generate a sequence (simplified). Note: feeding outputs back in
        only makes sense when output_size == input_size."""
        generated = [seed]
        h = np.zeros((1, self.hidden_size))

        for _ in range(length):
            x = generated[-1]
            h = np.tanh(np.dot(x, self.W_xh) + np.dot(h, self.W_hh) + self.b_h)
            y = np.dot(h, self.W_hy) + self.b_y
            # Add a little randomness
            next_item = y + np.random.randn(*y.shape) * 0.1
            generated.append(next_item)

        return np.stack(generated)

# Build and test the RNN
rnn = SimpleRNN(input_size=10, hidden_size=20, output_size=5)
rnn.summary()

# Test the forward pass
seq_length = 15
batch_size = 8
X_test = np.random.randn(seq_length, batch_size, 10)
output = rnn.forward(X_test)
print(f"\nInput shape: {X_test.shape}")
print(f"Output shape: {output.shape}")
```
IV. Comparing Deep Learning Frameworks
1. The Major Frameworks
```python
import pandas as pd

# Framework comparison table (subjective 1-5 ratings)
frameworks_data = {
    'Framework': ['TensorFlow', 'PyTorch', 'Keras', 'MXNet', 'JAX', 'PaddlePaddle'],
    'Released': [2015, 2016, 2015, 2015, 2018, 2016],
    'Developer': ['Google', 'Facebook', 'François Chollet', 'DMLC/Amazon', 'Google', 'Baidu'],
    'Main languages': ['Python/C++', 'Python/C++', 'Python', 'Python/C++', 'Python', 'Python'],
    'Ease of use': [3, 5, 5, 3, 4, 4],
    'Flexibility': [5, 5, 3, 5, 5, 4],
    'Deployment': [5, 4, 3, 5, 4, 5],
    'Community': [5, 5, 4, 3, 3, 3],
    'Key strengths': [
        'production deployment, mature ecosystem',
        'dynamic graphs, research friendly',
        'concise API, rapid prototyping',
        'distributed training, multi-language',
        'functional style, automatic differentiation',
        'Chinese documentation, strong domestic adoption'
    ]
}
frameworks_df = pd.DataFrame(frameworks_data)

print("Deep learning framework comparison")
print("=" * 100)
print(frameworks_df.to_string(index=False))
print("\n" + "=" * 100)

# Selection guide
print("\nSelection guide:")
print("1. TensorFlow: production deployment, enterprise applications, large ecosystem")
print("2. PyTorch: academic research, rapid prototyping, dynamic computation graphs")
print("3. Keras: beginners, quick start, high-level API")
print("4. MXNet: distributed training, multi-language support")
print("5. JAX: functional programming, numerical computing, automatic differentiation")
print("6. PaddlePaddle: Baidu's framework, rich Chinese documentation, industrial use")
```
2. Installing the Frameworks, with Minimal Examples
```python
def get_framework_setup_guide():
    """Installation and quick-start notes for the major frameworks."""
    guides = {
        'TensorFlow': {
            'Install': 'pip install tensorflow',
            'Import': 'import tensorflow as tf',
            'Minimal example': '''
# Build a simple neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
''',
            'Latest version': '2.x (as of 2023)'
        },
        'PyTorch': {
            'Install': 'pip install torch torchvision',
            'Import': 'import torch',
            'Minimal example': '''
# Build a simple neural network
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(784, 64)
        self.fc2 = torch.nn.Linear(64, 10)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
''',
            'Latest version': '2.0+ (as of 2023)'
        },
        'Keras': {
            'Install': 'pip install keras',
            'Import': 'from keras import layers, models',
            'Minimal example': '''
# Build a simple neural network
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(784,)))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
''',
            'Latest version': '3.0 (as of 2023)'
        }
    }
    return guides

# Print the guides
guides = get_framework_setup_guide()
for framework, info in guides.items():
    print(f"\n{'=' * 60}")
    print(f"{framework} quick guide")
    print(f"{'=' * 60}")
    for key, value in info.items():
        print(f"{key}: {value}")
```
V. Key Concepts in Deep Learning
1. Loss Functions
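Two of the most common losses, in the notation of the code below: mean squared error for regression and binary cross-entropy for classification,

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2, \qquad \mathrm{BCE} = -\frac{1}{n}\sum_{i=1}^{n}\left[y_i \log \hat{y}_i + (1 - y_i)\log(1 - \hat{y}_i)\right]$$

where $y_i$ is the true value and $\hat{y}_i$ the prediction. The class below implements these and several others.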
```python
import numpy as np
import matplotlib.pyplot as plt

class LossFunctions:
    """Common loss functions."""

    @staticmethod
    def mse(y_true, y_pred):
        """Mean squared error."""
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def mae(y_true, y_pred):
        """Mean absolute error."""
        return np.mean(np.abs(y_true - y_pred))

    @staticmethod
    def binary_crossentropy(y_true, y_pred, epsilon=1e-7):
        """Binary cross-entropy."""
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    @staticmethod
    def categorical_crossentropy(y_true, y_pred, epsilon=1e-7):
        """Categorical cross-entropy."""
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(np.sum(y_true * np.log(y_pred), axis=-1))

    @staticmethod
    def huber_loss(y_true, y_pred, delta=1.0):
        """Huber loss (combines the strengths of MSE and MAE)."""
        error = y_true - y_pred
        abs_error = np.abs(error)
        quadratic = np.minimum(abs_error, delta)
        linear = abs_error - quadratic
        return np.mean(0.5 * quadratic ** 2 + delta * linear)

    @staticmethod
    def contrastive_loss(y_true, y_pred, margin=1.0):
        """Contrastive loss (used in metric learning)."""
        positive_distance = y_true * y_pred ** 2
        negative_distance = (1 - y_true) * np.maximum(margin - y_pred, 0) ** 2
        return np.mean(positive_distance + negative_distance)

    @staticmethod
    def visualize_loss_functions():
        """Plot the loss functions."""
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.ravel()

        x = np.linspace(-3, 3, 100)
        y_true = 0  # assume the true value is 0

        # MSE
        y_pred = x
        loss = (y_true - y_pred) ** 2
        axes[0].plot(x, loss, 'b-', linewidth=2)
        axes[0].set_title('MSE (mean squared error)', fontweight='bold')
        axes[0].set_xlabel('prediction')
        axes[0].set_ylabel('loss')
        axes[0].grid(True, alpha=0.3)

        # MAE
        loss = np.abs(y_true - y_pred)
        axes[1].plot(x, loss, 'r-', linewidth=2)
        axes[1].set_title('MAE (mean absolute error)', fontweight='bold')
        axes[1].set_xlabel('prediction')
        axes[1].grid(True, alpha=0.3)

        # Huber loss
        delta = 1.0
        loss = np.where(np.abs(x) <= delta,
                        0.5 * x ** 2,
                        delta * (np.abs(x) - 0.5 * delta))
        axes[2].plot(x, loss, 'g-', linewidth=2)
        axes[2].set_title('Huber loss', fontweight='bold')
        axes[2].set_xlabel('prediction')
        axes[2].grid(True, alpha=0.3)

        # Binary cross-entropy (assume the true label is 1)
        y_true_binary = 1
        y_pred_binary = 1 / (1 + np.exp(-x))  # sigmoid of the logit
        loss = -(y_true_binary * np.log(y_pred_binary)
                 + (1 - y_true_binary) * np.log(1 - y_pred_binary))
        axes[3].plot(x, loss, 'm-', linewidth=2)
        axes[3].set_title('Binary cross-entropy (y_true=1)', fontweight='bold')
        axes[3].set_xlabel('logit (z)')
        axes[3].grid(True, alpha=0.3)

        # Contrastive loss
        distance = np.abs(x)
        margin = 1.0
        loss = distance ** 2  # positive pairs
        axes[4].plot(x, loss, 'c-', linewidth=2, label='positive pair')
        loss = np.maximum(margin - distance, 0) ** 2  # negative pairs
        axes[4].plot(x, loss, 'y-', linewidth=2, label='negative pair')
        axes[4].set_title('Contrastive loss', fontweight='bold')
        axes[4].set_xlabel('distance')
        axes[4].legend()
        axes[4].grid(True, alpha=0.3)

        # Loss selection guide
        axes[5].axis('off')
        axes[5].text(0.5, 0.5,
                     'Choosing a loss function:\n\n'
                     '• MSE: regression, sensitive to outliers\n'
                     '• MAE: regression, robust to outliers\n'
                     '• Huber: combines MSE and MAE\n'
                     '• Binary CE: binary classification\n'
                     '• Categorical CE: multi-class classification\n'
                     '• Contrastive: metric learning, similarity',
                     ha='center', va='center', fontsize=11,
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        plt.suptitle('Common loss functions in deep learning',
                     fontsize=16, fontweight='bold', y=1.02)
        plt.tight_layout()
        plt.show()

# Test the loss functions
loss_funcs = LossFunctions()

y_true = np.array([1, 0, 1, 0])
y_pred = np.array([0.9, 0.2, 0.8, 0.3])

print("Loss values:")
print(f"MSE: {loss_funcs.mse(y_true, y_pred):.4f}")
print(f"MAE: {loss_funcs.mae(y_true, y_pred):.4f}")
print(f"Binary crossentropy: {loss_funcs.binary_crossentropy(y_true, y_pred):.4f}")

# Plot the loss functions
LossFunctions.visualize_loss_functions()
```
2. Optimizers
```python
class OptimizerComparison:
    """Compare optimizers and visualize their behavior."""

    @staticmethod
    def visualize_optimization_path():
        """Visualize the paths different optimizers take."""
        # Test function (Rosenbrock, global minimum at (1, 1))
        def rosenbrock(x, y):
            return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2

        # Build the grid
        x = np.linspace(-2, 2, 100)
        y = np.linspace(-1, 3, 100)
        X, Y = np.meshgrid(x, y)
        Z = rosenbrock(X, Y)

        # Optimizer hyperparameters
        optimizers = {
            'SGD': {'lr': 0.01, 'momentum': 0.0},
            'SGD with Momentum': {'lr': 0.01, 'momentum': 0.9},
            'Adam': {'lr': 0.01, 'beta1': 0.9, 'beta2': 0.999},
            'RMSprop': {'lr': 0.01, 'rho': 0.9},
            'Adagrad': {'lr': 0.1}
        }

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.ravel()

        for idx, (name, params) in enumerate(optimizers.items()):
            ax = axes[idx]
            # Contour plot of the objective
            ax.contour(X, Y, Z, levels=np.logspace(-1, 3, 20), alpha=0.5)

            # Starting point and optimizer state
            x_pos, y_pos = -1.5, 2.5
            vx = vy = 0.0                  # momentum buffers
            m1x = m1y = m2x = m2y = 0.0    # Adam moment estimates
            sx = sy = 0.0                  # RMSprop / Adagrad accumulators
            trajectory = [(x_pos, y_pos)]

            # Simulate the optimization
            for step in range(100):
                # Gradient of the Rosenbrock function
                grad_x = -2 * (1 - x_pos) - 400 * x_pos * (y_pos - x_pos ** 2)
                grad_y = 200 * (y_pos - x_pos ** 2)

                # Apply each optimizer's update rule
                if name == 'SGD':
                    x_pos -= params['lr'] * grad_x
                    y_pos -= params['lr'] * grad_y
                elif name == 'SGD with Momentum':
                    vx = params['momentum'] * vx + params['lr'] * grad_x
                    vy = params['momentum'] * vy + params['lr'] * grad_y
                    x_pos -= vx
                    y_pos -= vy
                elif name == 'Adam':
                    m1x = params['beta1'] * m1x + (1 - params['beta1']) * grad_x
                    m1y = params['beta1'] * m1y + (1 - params['beta1']) * grad_y
                    m2x = params['beta2'] * m2x + (1 - params['beta2']) * grad_x ** 2
                    m2y = params['beta2'] * m2y + (1 - params['beta2']) * grad_y ** 2
                    # Bias correction
                    m1x_hat = m1x / (1 - params['beta1'] ** (step + 1))
                    m1y_hat = m1y / (1 - params['beta1'] ** (step + 1))
                    m2x_hat = m2x / (1 - params['beta2'] ** (step + 1))
                    m2y_hat = m2y / (1 - params['beta2'] ** (step + 1))
                    x_pos -= params['lr'] * m1x_hat / (np.sqrt(m2x_hat) + 1e-8)
                    y_pos -= params['lr'] * m1y_hat / (np.sqrt(m2y_hat) + 1e-8)
                elif name == 'RMSprop':
                    sx = params['rho'] * sx + (1 - params['rho']) * grad_x ** 2
                    sy = params['rho'] * sy + (1 - params['rho']) * grad_y ** 2
                    x_pos -= params['lr'] * grad_x / (np.sqrt(sx) + 1e-8)
                    y_pos -= params['lr'] * grad_y / (np.sqrt(sy) + 1e-8)
                elif name == 'Adagrad':
                    sx += grad_x ** 2
                    sy += grad_y ** 2
                    x_pos -= params['lr'] * grad_x / (np.sqrt(sx) + 1e-8)
                    y_pos -= params['lr'] * grad_y / (np.sqrt(sy) + 1e-8)

                trajectory.append((x_pos, y_pos))

            # Plot the trajectory
            trajectory = np.array(trajectory)
            ax.plot(trajectory[:, 0], trajectory[:, 1], 'ro-', linewidth=2, markersize=3)
            ax.plot(trajectory[0, 0], trajectory[0, 1], 'go', markersize=8, label='start')
            ax.plot(trajectory[-1, 0], trajectory[-1, 1], 'bo', markersize=8, label='end')
            ax.set_title(name, fontweight='bold')
            ax.set_xlabel('x')
            ax.set_ylabel('y')
            ax.legend()
            ax.grid(True, alpha=0.3)

        # Optimizer selection guide
        axes[5].axis('off')
        axes[5].text(0.5, 0.5,
                     'Choosing an optimizer:\n\n'
                     '• SGD: simple; slow convergence, may oscillate\n'
                     '• SGD+Momentum: damps oscillation, speeds convergence\n'
                     '• Adam: adaptive learning rates, usually strong\n'
                     '• RMSprop: suits non-stationary objectives\n'
                     '• Adagrad: suits sparse data\n\n'
                     'General recommendation: Adam (good default)',
                     ha='center', va='center', fontsize=11,
                     bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5))

        plt.suptitle('Optimizer comparison: paths on the Rosenbrock function',
                     fontsize=16, fontweight='bold', y=1.02)
        plt.tight_layout()
        plt.show()

    @staticmethod
    def optimizer_summary():
        """Summarize optimizer properties."""
        optimizers_info = {
            'SGD': {
                'Update rule': 'θ = θ - η·∇J(θ)',
                'Pros': 'simple, strong theoretical footing',
                'Cons': 'slow convergence, prone to oscillation',
                'Use case': 'convex optimization problems'
            },
            'Momentum': {
                'Update rule': 'v = βv + η·∇J(θ); θ = θ - v',
                'Pros': 'faster convergence, less oscillation',
                'Cons': 'β must be tuned',
                'Use case': 'general deep learning training'
            },
            'Adam': {
                'Update rule': 'θ = θ - η·m̂/(√v̂ + ε), with m̂, v̂ bias-corrected gradient moments',
                'Pros': 'adaptive learning rates, usually performs well',
                'Cons': 'slightly higher memory use',
                'Use case': 'deep learning (default recommendation)'
            },
            'RMSprop': {
                'Update rule': 'E[g²] = ρE[g²] + (1-ρ)g²; θ = θ - η·g/√(E[g²]+ε)',
                'Pros': 'adaptive learning rates, suits non-stationary objectives',
                'Cons': 'ρ must be tuned',
                'Use case': 'RNN training'
            },
            'Adagrad': {
                'Update rule': 'G = G + g⊙g; θ = θ - η·g/√(G+ε)',
                'Pros': 'adaptive learning rates, suits sparse data',
                'Cons': 'learning rate decays monotonically',
                'Use case': 'learning from sparse features'
            }
        }

        print("=" * 80)
        print("Deep learning optimizer summary")
        print("=" * 80)
        for name, info in optimizers_info.items():
            print(f"\n{name}:")
            for key, value in info.items():
                print(f"  {key}: {value}")
        print("\n" + "=" * 80)

# Show the optimizer information
OptimizerComparison.optimizer_summary()
OptimizerComparison.visualize_optimization_path()
```
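For reference, the full Adam update that the sketch above applies coordinate-wise. With gradient $g_t$, decay rates $\beta_1, \beta_2$, learning rate $\eta$, and a small $\epsilon$:

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2$$

$$\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1-\beta_2^t}, \qquad \theta_t = \theta_{t-1} - \eta\,\frac{\hat{m}_t}{\sqrt{\hat{v}_t}+\epsilon}$$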
VI. Deep Learning Training Techniques
1. Regularization Techniques
```python
class RegularizationTechniques:
    """Regularization techniques for deep learning."""

    @staticmethod
    def l1_regularization(weights, lambda_l1):
        """L1 penalty (lasso)."""
        return lambda_l1 * np.sum(np.abs(weights))

    @staticmethod
    def l2_regularization(weights, lambda_l2):
        """L2 penalty (ridge)."""
        return lambda_l2 * np.sum(weights ** 2)

    @staticmethod
    def elastic_net(weights, lambda_l1, lambda_l2):
        """Elastic net (combines L1 and L2)."""
        return (lambda_l1 * np.sum(np.abs(weights))
                + lambda_l2 * np.sum(weights ** 2))

    @staticmethod
    def dropout(activations, dropout_rate, training=True):
        """Inverted dropout."""
        if not training:
            return activations
        # Sample the dropout mask
        mask = np.random.binomial(1, 1 - dropout_rate, size=activations.shape)
        # Apply the mask and rescale so the expected activation is unchanged
        return activations * mask / (1 - dropout_rate)

    @staticmethod
    def batch_normalization(x, gamma=1, beta=0, epsilon=1e-5):
        """Batch normalization (simplified)."""
        # Per-feature batch statistics
        mean = np.mean(x, axis=0)
        variance = np.var(x, axis=0)
        # Normalize
        x_norm = (x - mean) / np.sqrt(variance + epsilon)
        # Scale and shift
        return gamma * x_norm + beta

    @staticmethod
    def data_augmentation_examples():
        """Examples of data augmentation techniques."""
        techniques = {
            'Image augmentation': [
                'random rotation (±30 degrees)',
                'random scaling (0.8-1.2x)',
                'random cropping',
                'random horizontal flips',
                'color jitter (brightness, contrast, saturation)',
                'random noise'
            ],
            'Text augmentation': [
                'synonym replacement',
                'random insertion',
                'random swap',
                'random deletion',
                'back-translation (translate out and back)',
                'EDA (Easy Data Augmentation)'
            ],
            'Time-series augmentation': [
                'time warping',
                'window slicing',
                'random scaling',
                'added noise',
                'channel shuffling (for multivariate series)'
            ]
        }

        print("Data augmentation techniques:")
        print("=" * 60)
        for category, methods in techniques.items():
            print(f"\n{category}:")
            for method in methods:
                print(f"  • {method}")
        print("\n" + "=" * 60)

    @staticmethod
    def early_stopping_callback(patience=10, min_delta=0.001):
        """Build an early-stopping callback."""
        class EarlyStopping:
            def __init__(self, patience=patience, min_delta=min_delta):
                self.patience = patience
                self.min_delta = min_delta
                self.best_loss = float('inf')
                self.counter = 0
                self.should_stop = False

            def __call__(self, current_loss):
                if current_loss < self.best_loss - self.min_delta:
                    self.best_loss = current_loss
                    self.counter = 0
                    print(f"Loss improved: {current_loss:.4f}")
                    return False
                else:
                    self.counter += 1
                    print(f"Early-stopping counter: {self.counter}/{self.patience}")
                    if self.counter >= self.patience:
                        self.should_stop = True
                        print("Patience exhausted, stopping training")
                    return self.should_stop

        return EarlyStopping()

# Test the regularization techniques
reg = RegularizationTechniques()

# Test dropout
activations = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
dropout_rate = 0.5
print("Dropout example:")
print(f"Original activations:\n{activations}")
print(f"After dropout (training):\n{reg.dropout(activations, dropout_rate, training=True)}")
print(f"After dropout (inference):\n{reg.dropout(activations, dropout_rate, training=False)}")

# Show the data augmentation techniques
reg.data_augmentation_examples()
```
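The `l1_regularization` and `l2_regularization` helpers above only compute penalty values; during training the penalty acts through the gradient. A minimal sketch (assuming plain gradient descent) of how an L2 penalty $\lambda \lVert w \rVert^2$ becomes "weight decay" in the parameter update:

```python
import numpy as np

def sgd_step_with_l2(w, grad_loss, lr=0.01, lambda_l2=1e-4):
    """One gradient step on (data loss + lambda * ||w||^2).
    The penalty contributes 2 * lambda * w to the gradient,
    which pulls the weights toward zero each step."""
    grad_total = grad_loss + 2 * lambda_l2 * w
    return w - lr * grad_total

w = np.array([0.5, -1.2, 3.0])
grad_loss = np.array([0.1, -0.2, 0.05])  # gradient of the data loss alone
print(sgd_step_with_l2(w, grad_loss))
```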
2. Hyperparameter Tuning
```python
class HyperparameterTuning:
    """Hyperparameter tuning for deep learning."""

    @staticmethod
    def learning_rate_scheduler():
        """Learning-rate scheduling strategies."""
        schedulers = {
            'Constant': {
                'Description': 'a fixed learning rate for the whole run',
                'Use case': 'simple tasks, small datasets',
                'Example': 'lr = 0.001'
            },
            'Step decay': {
                'Description': 'drop the learning rate at fixed epochs',
                'Use case': 'most deep learning tasks',
                'Example': 'if epoch % 30 == 0: lr *= 0.1'
            },
            'Cosine annealing': {
                'Description': 'the learning rate follows a cosine from high to low',
                'Use case': 'tasks that need to escape local optima',
                'Formula': 'lr = lr_min + 0.5*(lr_max-lr_min)*(1+cos(epoch/T_max*π))'
            },
            'Cyclical': {
                'Description': 'the learning rate cycles between a minimum and a maximum',
                'Use case': 'improving generalization',
                'Example': ('cycle = epoch % cycle_length; '
                            'lr = lr_min + 0.5*(lr_max-lr_min)*(1+cos(cycle/cycle_length*π))')
            },
            'Warm restarts': {
                'Description': 'restart the schedule periodically with a decaying peak',
                'Use case': 'complex tasks needing fine-grained tuning',
                'Advantage': 'combines broad exploration with fine convergence'
            }
        }

        print("Learning-rate scheduling strategies:")
        print("=" * 80)
        for name, info in schedulers.items():
            print(f"\n{name}:")
            for key, value in info.items():
                print(f"  {key}: {value}")
        print("\n" + "=" * 80)

    @staticmethod
    def visualize_learning_rates():
        """Plot different learning-rate schedules."""
        epochs = 100

        strategies = {
            'Constant': [0.001] * epochs,
            'Step decay': [],
            'Exponential decay': [],
            'Cosine annealing': [],
            'Cyclical': []
        }

        for epoch in range(epochs):
            # Step decay (divide by 10 at epochs 30 and 60)
            lr = 0.001
            if epoch >= 30:
                lr *= 0.1
            if epoch >= 60:
                lr *= 0.1
            strategies['Step decay'].append(lr)

            # Exponential decay
            strategies['Exponential decay'].append(0.001 * np.exp(-0.05 * epoch))

            # Cosine annealing
            T_max = 50
            lr_min, lr_max = 0.0001, 0.01
            strategies['Cosine annealing'].append(
                lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch / T_max * np.pi))
            )

            # Cyclical learning rate
            cycle_length = 20
            cycle = epoch % cycle_length
            strategies['Cyclical'].append(
                0.0001 + 0.5 * (0.01 - 0.0001) * (1 + np.cos(cycle / cycle_length * np.pi))
            )

        plt.figure(figsize=(12, 8))
        for name, lr_sequence in strategies.items():
            plt.plot(lr_sequence, linewidth=2, label=name)

        plt.xlabel('Epoch', fontsize=12)
        plt.ylabel('Learning rate', fontsize=12)
        plt.title('Learning-rate schedules compared', fontsize=16, fontweight='bold')
        plt.legend(fontsize=10)
        plt.grid(True, alpha=0.3)
        plt.yscale('log')  # log scale
        plt.tight_layout()
        plt.show()

    @staticmethod
    def hyperparameter_search_space():
        """A typical hyperparameter search space."""
        search_space = {
            'Learning rate': {
                'Range': [1e-5, 1e-1],
                'Suggested values': [1e-3, 3e-4, 1e-4],
                'Search strategy': 'log-uniform sampling',
                'Note': 'the single most important hyperparameter'
            },
            'Batch size': {
                'Range': [16, 256],
                'Suggested values': [32, 64, 128],
                'Search strategy': 'uniform over powers of 2',
                'Note': 'as large as GPU memory allows'
            },
            'Network depth': {
                'Range': [2, 20],
                'Suggested values': [3, 5, 8, 12],
                'Search strategy': 'uniform sampling',
                'Note': 'scale with task complexity'
            },
            'Dropout rate': {
                'Range': [0.0, 0.5],
                'Suggested values': [0.2, 0.3, 0.5],
                'Search strategy': 'uniform sampling',
                'Note': 'regularization strength against overfitting'
            },
            'Weight decay': {
                'Range': [0.0, 0.1],
                'Suggested values': [1e-4, 1e-5, 0.0],
                'Search strategy': 'log-uniform sampling',
                'Note': 'L2 regularization strength'
            },
            'Optimizer': {
                'Options': ['Adam', 'SGD', 'RMSprop', 'Adagrad'],
                'Suggested value': 'Adam',
                'Search strategy': 'categorical sampling',
                'Note': 'Adam is the usual default'
            }
        }

        print("Deep learning hyperparameter search space:")
        print("=" * 100)
        for param, info in search_space.items():
            print(f"\n{param}:")
            for key, value in info.items():
                print(f"  {key}: {value}")
        print("\n" + "=" * 100)

        print("\nTuning strategy:")
        print("1. Tune the learning rate first, holding everything else fixed")
        print("2. Then tune batch size and network architecture")
        print("3. Tune regularization parameters last")
        print("4. Use Bayesian optimization or random search")
        print("5. Use early stopping to prevent overfitting")

# Show the tuning information
tuning = HyperparameterTuning()
tuning.hyperparameter_search_space()
tuning.learning_rate_scheduler()
tuning.visualize_learning_rates()
```
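To make step 4 of the tuning strategy concrete, here is a minimal random-search sketch. The `train_and_evaluate` function is a hypothetical placeholder (not defined anywhere in this article); replace it with your real training loop. The log-uniform sampling of the learning rate follows the search space above.

```python
import numpy as np

rng = np.random.default_rng(42)

def train_and_evaluate(config):
    """Placeholder scoring function: swap in a real training run.
    This dummy version just favors mid-range learning rates so the demo runs."""
    return 1.0 - abs(np.log10(config['learning_rate']) + 3) / 5

def sample_config():
    """Draw one configuration from the search space above."""
    return {
        'learning_rate': 10 ** rng.uniform(-5, -1),   # log-uniform over [1e-5, 1e-1]
        'batch_size': int(rng.choice([32, 64, 128])),
        'dropout_rate': float(rng.uniform(0.0, 0.5)),
        'optimizer': str(rng.choice(['Adam', 'SGD', 'RMSprop']))
    }

best_config, best_score = None, -np.inf
for trial in range(20):
    config = sample_config()
    score = train_and_evaluate(config)
    if score > best_score:
        best_config, best_score = config, score

print(f"Best validation score: {best_score:.4f}")
print(f"Best configuration: {best_config}")
```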
VII. Hands-On Project: Handwritten Digit Recognition
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

class MNISTDigitRecognition:
    """Hands-on project: handwritten digit recognition."""

    def __init__(self):
        """Initialize."""
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.model = None

    def load_data(self):
        """Load the MNIST dataset."""
        print("Loading MNIST...")
        # as_frame=False returns NumPy arrays instead of a DataFrame,
        # which the reshaping code below relies on
        mnist = fetch_openml('mnist_784', version=1, parser='auto', as_frame=False)
        X = mnist.data.astype('float32') / 255.0  # normalize to [0, 1]
        y = mnist.target.astype('int')

        # Train/test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print(f"Training set: {self.X_train.shape}")
        print(f"Test set: {self.X_test.shape}")
        print(f"Class counts: {np.bincount(self.y_train)}")
        return self

    def visualize_samples(self, n_samples=10):
        """Visualize a few samples."""
        plt.figure(figsize=(15, 6))
        for i in range(n_samples):
            plt.subplot(2, n_samples // 2, i + 1)
            image = self.X_train[i].reshape(28, 28)
            plt.imshow(image, cmap='gray')
            plt.title(f"Label: {self.y_train[i]}")
            plt.axis('off')
        plt.suptitle('MNIST handwritten digit samples', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()

    def create_simple_model(self):
        """Create a simple neural network model."""

        class SimpleNN:
            """A simple fully connected network with one hidden layer."""

            def __init__(self, input_size=784, hidden_size=128, output_size=10):
                self.input_size = input_size
                self.hidden_size = hidden_size
                self.output_size = output_size

                # He initialization
                self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
                self.b1 = np.zeros(hidden_size)
                self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
                self.b2 = np.zeros(output_size)

                # Cache for the backward pass
                self.cache = {}

            def relu(self, x):
                """ReLU activation."""
                return np.maximum(0, x)

            def softmax(self, x):
                """Numerically stable softmax."""
                exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
                return exp_x / np.sum(exp_x, axis=1, keepdims=True)

            def forward(self, X):
                """Forward pass."""
                # Hidden layer
                z1 = np.dot(X, self.W1) + self.b1
                a1 = self.relu(z1)
                # Output layer
                z2 = np.dot(a1, self.W2) + self.b2
                a2 = self.softmax(z2)
                # Cache intermediate results
                self.cache = {'X': X, 'z1': z1, 'a1': a1, 'z2': z2, 'a2': a2}
                return a2

            def backward(self, X, y, learning_rate=0.01):
                """Backward pass and parameter update."""
                m = X.shape[0]
                z1 = self.cache['z1']
                a1 = self.cache['a1']
                a2 = self.cache['a2']

                # One-hot encode the labels
                y_onehot = np.zeros((m, self.output_size))
                y_onehot[np.arange(m), y] = 1

                # Output-layer gradients (softmax + cross-entropy)
                dz2 = a2 - y_onehot
                dW2 = np.dot(a1.T, dz2) / m
                db2 = np.sum(dz2, axis=0) / m

                # Hidden-layer gradients
                da1 = np.dot(dz2, self.W2.T)
                dz1 = da1 * (z1 > 0)  # gradient of ReLU
                dW1 = np.dot(X.T, dz1) / m
                db1 = np.sum(dz1, axis=0) / m

                # Gradient descent update
                self.W2 -= learning_rate * dW2
                self.b2 -= learning_rate * db2
                self.W1 -= learning_rate * dW1
                self.b1 -= learning_rate * db1

            def predict(self, X):
                """Predict class labels."""
                probas = self.forward(X)
                return np.argmax(probas, axis=1)

            def evaluate(self, X, y):
                """Compute accuracy."""
                y_pred = self.predict(X)
                return np.mean(y_pred == y)

            def summary(self):
                """Print model information."""
                print("=" * 50)
                print("Simple neural network")
                print("=" * 50)
                print(f"Input size: {self.input_size}")
                print(f"Hidden size: {self.hidden_size}")
                print(f"Output size: {self.output_size}")
                total_params = (self.W1.size + self.b1.size
                                + self.W2.size + self.b2.size)
                print(f"Total parameters: {total_params:,}")
                print("=" * 50)

        self.model = SimpleNN()
        return self.model

    def train_model(self, epochs=10, batch_size=64, learning_rate=0.01):
        """Train the model with mini-batch gradient descent."""
        n_samples = self.X_train.shape[0]
        n_batches = n_samples // batch_size

        print("Starting training...")
        print(f"Training samples: {n_samples}")
        print(f"Batch size: {batch_size}")
        print(f"Batches per epoch: {n_batches}")
        print(f"Epochs: {epochs}")

        train_losses = []
        train_accuracies = []
        test_accuracies = []

        for epoch in range(epochs):
            epoch_loss = 0
            epoch_accuracy = 0

            # Shuffle the data each epoch
            indices = np.random.permutation(n_samples)
            X_shuffled = self.X_train[indices]
            y_shuffled = self.y_train[indices]

            for batch in range(n_batches):
                # Slice the current mini-batch
                start = batch * batch_size
                end = start + batch_size
                X_batch = X_shuffled[start:end]
                y_batch = y_shuffled[start:end]

                # Forward pass
                y_pred = self.model.forward(X_batch)

                # Cross-entropy loss
                m = X_batch.shape[0]
                y_onehot = np.zeros((m, 10))
                y_onehot[np.arange(m), y_batch] = 1
                loss = -np.mean(np.sum(y_onehot * np.log(y_pred + 1e-8), axis=1))
                epoch_loss += loss

                # Batch accuracy
                batch_pred = np.argmax(y_pred, axis=1)
                epoch_accuracy += np.mean(batch_pred == y_batch)

                # Backward pass and parameter update
                self.model.backward(X_batch, y_batch, learning_rate)

            # Epoch averages
            avg_loss = epoch_loss / n_batches
            avg_accuracy = epoch_accuracy / n_batches

            # Test-set accuracy
            test_acc = self.model.evaluate(self.X_test, self.y_test)

            train_losses.append(avg_loss)
            train_accuracies.append(avg_accuracy)
            test_accuracies.append(test_acc)

            print(f"Epoch {epoch + 1}/{epochs}: "
                  f"train loss={avg_loss:.4f}, "
                  f"train acc={avg_accuracy:.4f}, "
                  f"test acc={test_acc:.4f}")

        # Plot the training history
        self.plot_training_history(train_losses, train_accuracies, test_accuracies)
        return train_losses, train_accuracies, test_accuracies

    def plot_training_history(self, train_losses, train_accuracies, test_accuracies):
        """Plot the training history."""
        fig, axes = plt.subplots(1, 2, figsize=(12, 4))

        # Loss curve
        axes[0].plot(train_losses, 'b-', linewidth=2, label='train loss')
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Loss')
        axes[0].set_title('Training loss')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Accuracy curves
        axes[1].plot(train_accuracies, 'g-', linewidth=2, label='train accuracy')
        axes[1].plot(test_accuracies, 'r-', linewidth=2, label='test accuracy')
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('Accuracy')
        axes[1].set_title('Accuracy')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.suptitle('MNIST training history', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()

    def show_predictions(self, n_samples=15):
        """Show predictions on random test samples."""
        indices = np.random.choice(len(self.X_test), n_samples, replace=False)
        X_sample = self.X_test[indices]
        y_sample = self.y_test[indices]

        # Predict
        y_pred = self.model.predict(X_sample)

        plt.figure(figsize=(15, 10))
        n_cols = 5
        n_rows = int(np.ceil(n_samples / n_cols))

        for i in range(n_samples):
            plt.subplot(n_rows, n_cols, i + 1)
            image = X_sample[i].reshape(28, 28)
            plt.imshow(image, cmap='gray')

            # Green for correct predictions, red for mistakes
            color = 'green' if y_pred[i] == y_sample[i] else 'red'
            plt.title(f"True: {y_sample[i]}\nPredicted: {y_pred[i]}", color=color)
            plt.axis('off')

        accuracy = np.mean(y_pred == y_sample)
        plt.suptitle(f'Predictions (accuracy: {accuracy:.2%})',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()

        # Summary statistics
        print("\nPrediction statistics:")
        print(f"Samples: {n_samples}")
        print(f"Correct: {np.sum(y_pred == y_sample)}")
        print(f"Wrong: {np.sum(y_pred != y_sample)}")
        print(f"Accuracy: {accuracy:.2%}")

# Run the MNIST digit recognition project
mnist_project = MNISTDigitRecognition()

# Load the data
mnist_project.load_data()

# Visualize a few samples
mnist_project.visualize_samples(10)

# Build the model
model = mnist_project.create_simple_model()
model.summary()

# Train the model
train_losses, train_accuracies, test_accuracies = mnist_project.train_model(
    epochs=20, batch_size=128, learning_rate=0.01
)

# Show predictions
mnist_project.show_predictions(15)
```
Deep learning is changing the world. From image recognition to natural language processing, from autonomous driving to medical diagnosis, its applications are everywhere. The technology can look daunting, but with systematic study and hands-on practice, you too can master it.