告别传统指标,让AI成为你的投资大脑
你是否曾梦想有一个AI助手,能帮你分析海量数据、识别复杂模式,甚至预测明天的股价走势?这听起来像是科幻电影的情节,但在今天的量化投资领域,这已经成为现实。今天,我们将一起探索如何用机器学习技术,让计算机学会"思考"市场,构建智能化的投资决策系统。
一、机器学习与量化投资的完美结合
1. 为什么机器学习适合量化投资?
金融市场本质上是一个充满噪声的复杂系统,传统技术指标往往只能捕捉线性关系。而机器学习算法能够:
处理高维数据:同时分析数百个因子
识别非线性关系:发现人眼难以察觉的复杂模式
自适应学习:随着市场变化自动调整模型
实时预测:快速处理新数据并给出预测
让我们从一个真实的故事开始。2018年,一位名叫李明的量化研究员发现,传统的双均线策略在震荡市中频繁失效。他开始尝试使用机器学习方法,将过去10年的市场数据、基本面指标、技术指标等上百个特征输入到随机森林模型中。经过训练,这个模型不仅能够预测股价方向,还能估算出上涨的概率。在2019-2020年的回测中,这个机器学习策略的夏普比率达到了2.1,远超传统策略的1.3。
今天我们要深入探讨的,就是如何构建这样的智能预测系统。我们将从最基础的线性回归开始,逐步深入到更复杂的集成学习方法。
二、环境准备与数据获取
首先,我们需要准备机器学习所需的环境和数据:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings

warnings.filterwarnings('ignore')

# Configure matplotlib so Chinese axis labels and titles render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Market-data client (requires a valid tushare token).
import tushare as ts

ts.set_token('你的token')
pro = ts.pro_api()


def _add_features(df):
    """Compute price/volume/technical features plus the classification target.

    Expects a daily OHLCV frame with columns ``open``/``high``/``low``/
    ``close``/``vol``, indexed by trade date in ascending order.  Appends the
    feature columns, ``future_return`` and the binary ``target``, then drops
    every row made NaN by rolling warm-up windows or the forward shift
    (including the final ``future_days`` rows, whose label is unknown).
    """
    # --- price features ---
    df['returns'] = df['close'].pct_change()
    df['log_returns'] = np.log(df['close'] / df['close'].shift(1))

    # Moving averages and close-to-MA ratios over several horizons.
    for window in [5, 10, 20, 60]:
        df[f'MA{window}'] = df['close'].rolling(window).mean()
        df[f'MA{window}_ratio'] = df['close'] / df[f'MA{window}']

    # Rolling volatility of daily returns.
    df['volatility_5'] = df['returns'].rolling(5).std()
    df['volatility_10'] = df['returns'].rolling(10).std()
    df['volatility_20'] = df['returns'].rolling(20).std()

    # Intraday range and open-to-close move, normalized by price.
    df['high_low_ratio'] = (df['high'] - df['low']) / df['close']
    df['close_open_ratio'] = (df['close'] - df['open']) / df['open']

    # --- volume features ---
    df['volume_ratio'] = df['vol'] / df['vol'].rolling(20).mean()
    df['volume_price_corr'] = df['vol'].rolling(10).corr(df['close'])

    # --- technical indicators ---
    # RSI(14) using simple rolling means of gains and losses.
    delta = df['close'].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(14).mean()
    avg_loss = loss.rolling(14).mean()
    rs = avg_gain / avg_loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # MACD(12, 26) with a 9-period signal line.
    df['EMA12'] = df['close'].ewm(span=12, adjust=False).mean()
    df['EMA26'] = df['close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = df['EMA12'] - df['EMA26']
    df['MACD_signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

    # Bollinger bands (20, 2) and the close's position inside the band.
    df['BB_middle'] = df['close'].rolling(20).mean()
    bb_std = df['close'].rolling(20).std()
    df['BB_upper'] = df['BB_middle'] + 2 * bb_std
    df['BB_lower'] = df['BB_middle'] - 2 * bb_std
    df['BB_position'] = (df['close'] - df['BB_lower']) / (df['BB_upper'] - df['BB_lower'])

    # Target: direction of the close ``future_days`` sessions ahead
    # (1 = up, 0 = down or flat) — a binary classification label.
    future_days = 5
    df['future_return'] = df['close'].shift(-future_days) / df['close'] - 1
    df['target'] = (df['future_return'] > 0).astype(int)

    # Drop rows with NaNs from warm-up windows / the forward shift.
    return df.dropna()


def get_stock_features(ts_code, start_date='20180101', end_date='20231231'):
    """Fetch daily bars for ``ts_code`` from tushare and build the feature set.

    Returns the feature DataFrame produced by ``_add_features``, indexed by
    trade date in ascending chronological order.
    """
    df = pro.daily(ts_code=ts_code, start_date=start_date, end_date=end_date)
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    # tushare returns newest-first; the rolling features need ascending order.
    df = df.sort_values('trade_date')
    df.set_index('trade_date', inplace=True)
    return _add_features(df)


# Example: build the feature set for Kweichow Moutai.
stock_code = '600519.SH'
df = get_stock_features(stock_code)

print(f"数据形状: {df.shape}")
print(f"特征数量: {len(df.columns) - 2}")  # excludes 'target' and 'future_return'
print(f"上涨天数: {df['target'].sum()}, 下跌/平盘天数: {len(df) - df['target'].sum()}")
三、特征工程与数据预处理
1. 特征选择与重要性分析
def _rf_importances(X, y, feature_names):
    """Fit a reference random forest and return per-feature importances,
    sorted descending (single source of truth for importance ranking)."""
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    return pd.DataFrame({
        'feature': list(feature_names),
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)


def select_features(df, target_col='target', method='correlation', top_n=30):
    """Select the ``top_n`` most informative feature columns.

    ``method='correlation'`` ranks features by absolute Pearson correlation
    with the target; ``method='random_forest'`` ranks them by random-forest
    feature importance.  Returns the chosen column names as a list.

    Raises:
        ValueError: for an unknown ``method`` (the previous version fell
            through and crashed with an UnboundLocalError instead).
    """
    # Every column except the label columns is a candidate feature.
    features = df.drop([target_col, 'future_return'], axis=1, errors='ignore')

    if method == 'correlation':
        correlations = features.apply(lambda x: x.corr(df[target_col]))
        return (correlations.abs()
                .sort_values(ascending=False)
                .head(top_n).index.tolist())

    if method == 'random_forest':
        importances = _rf_importances(features.values,
                                      df[target_col].values,
                                      features.columns)
        return importances.head(top_n)['feature'].tolist()

    raise ValueError(f"unknown feature-selection method: {method!r}")


# Pick the 25 strongest features by forest importance.
selected_features = select_features(df, method='random_forest', top_n=25)
print(f"选中的特征 ({len(selected_features)}个):")
for i, feat in enumerate(selected_features, 1):
    print(f"{i:2d}. {feat}")


def plot_feature_importance(df, selected_features, target_col='target'):
    """Draw a horizontal bar chart of random-forest importances for the
    already-selected feature subset."""
    importances = _rf_importances(df[selected_features].values,
                                  df[target_col].values,
                                  selected_features)
    # barh draws bottom-up, so flip to ascending for a descending chart.
    importances = importances.sort_values('importance', ascending=True)

    plt.figure(figsize=(10, 12))
    plt.barh(range(len(importances)), importances['importance'])
    plt.yticks(range(len(importances)), importances['feature'])
    plt.xlabel('特征重要性')
    plt.title('随机森林特征重要性排序', fontsize=15, pad=20)
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()


plot_feature_importance(df, selected_features)
2. 数据标准化与时间序列分割
def prepare_ml_data(df, selected_features, target_col='target', test_size=0.2):
    """Split features/target chronologically and standardize them.

    Fix vs. the previous version: the scaler is fit on the TRAINING window
    only and then applied to the test window.  Fitting on the full series
    before splitting leaks test-period statistics (mean/std) into the
    training data — look-ahead bias that inflates backtest metrics.

    Returns ``(X_train, X_test, y_train, y_test, scaler)``; the fitted
    scaler is returned so new/live data can be transformed consistently.
    """
    X = df[selected_features].values
    y = df[target_col].values

    # Chronological split — never shuffle time-series data.
    split_idx = int(len(X) * (1 - test_size))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # Standardize with training-set statistics only (no leakage).
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print(f"训练集大小: {X_train.shape}, 测试集大小: {X_test.shape}")
    print(f"训练集正负样本比例: {y_train.mean():.2%} 正样本")
    print(f"测试集正负样本比例: {y_test.mean():.2%} 正样本")

    return X_train, X_test, y_train, y_test, scaler


X_train, X_test, y_train, y_test, scaler = prepare_ml_data(df, selected_features)
四、机器学习模型构建与比较
1. 多种分类模型对比
def compare_classification_models(X_train, X_test, y_train, y_test):
    """Train and evaluate several classifiers on the same chronological split.

    Fits logistic regression, random forest, gradient boosting and an RBF
    SVM; prints accuracy/precision/recall/F1 for each, draws one confusion
    matrix per model, and returns the metrics as a DataFrame.

    Cleanup vs. the previous version: the per-model ``predict_proba`` call
    (and its ``hasattr`` guard) was computed but never used — removed.
    """
    models = {
        '逻辑回归': LogisticRegression(max_iter=1000, random_state=42),
        '随机森林': RandomForestClassifier(n_estimators=100, random_state=42),
        '梯度提升': GradientBoostingClassifier(n_estimators=100, random_state=42),
        '支持向量机': SVC(probability=True, random_state=42)
    }

    results = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Headline classification metrics on the held-out window.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results.append({
            '模型': name,
            '准确率': accuracy,
            '精确率': precision,
            '召回率': recall,
            'F1分数': f1
        })

        print(f"\n{name}模型表现:")
        print(f"准确率: {accuracy:.4f}")
        print(f"精确率: {precision:.4f}")
        print(f"召回率: {recall:.4f}")
        print(f"F1分数: {f1:.4f}")

        # Confusion matrix heatmap for this model.
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'{name} - 混淆矩阵', fontsize=14)
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        plt.tight_layout()
        plt.show()

    return pd.DataFrame(results)


results_df = compare_classification_models(X_train, X_test, y_train, y_test)
2. 模型集成与优化
def create_ensemble_model(X_train, X_test, y_train, y_test):
    """Build a soft-voting ensemble (RF + GBDT + SVC), print its test-set
    metrics, draw its ROC curve, and return the fitted model together with
    the positive-class probabilities on the test set."""
    from sklearn.ensemble import VotingClassifier
    from sklearn.metrics import roc_curve, auc

    # Three heterogeneous base learners, combined by averaging their
    # predicted class probabilities (soft voting).
    voting_clf = VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
            ('svc', SVC(probability=True, random_state=42)),
        ],
        voting='soft',
    )
    voting_clf.fit(X_train, y_train)

    predictions = voting_clf.predict(X_test)
    up_probabilities = voting_clf.predict_proba(X_test)[:, 1]

    # Same four headline metrics reported for the individual models.
    metric_rows = [
        ('准确率', accuracy_score(y_test, predictions)),
        ('精确率', precision_score(y_test, predictions)),
        ('召回率', recall_score(y_test, predictions)),
        ('F1分数', f1_score(y_test, predictions)),
    ]
    print("集成模型表现:")
    for label, value in metric_rows:
        print(f"{label}: {value:.4f}")

    # ROC curve over the ensemble's probability output.
    fpr, tpr, _ = roc_curve(y_test, up_probabilities)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC曲线 (AUC = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('假正率')
    plt.ylabel('真正率')
    plt.title('集成模型ROC曲线', fontsize=15, pad=20)
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return voting_clf, up_probabilities


ensemble_model, y_pred_proba = create_ensemble_model(X_train, X_test, y_train, y_test)