I. What Is Feature Engineering?
Feature engineering is the process of transforming raw data into features that better represent the underlying problem. It is one of the most creative and impactful parts of machine learning: well-engineered features can make a simple model perform remarkably well, while poorly engineered features make it hard to get good results even with a complex model.
Core idea: "Data and features determine the upper bound of machine learning; models and algorithms merely approximate that bound."
II. Why Is Feature Engineering So Important?
Data quality determines model quality - garbage in, garbage out
Better model performance - good features can significantly improve accuracy
Lower cost, better generalization - fewer, better features reduce overfitting and speed up training
More interpretable models - decisions become easier to explain
III. Core Feature Engineering Techniques
1. Numerical Feature Processing
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Create sample data
data = pd.DataFrame({
    'age': [25, 30, 35, 40, 45, 200],  # contains an outlier
    'salary': [50000, 60000, 70000, 80000, 90000, 100000],
    'experience': [1, 3, 5, 7, 9, 11]
})

# Standardization (Z-score)
scaler = StandardScaler()
data_standardized = scaler.fit_transform(data[['age', 'salary']])

# Normalization (Min-Max scaling)
minmax_scaler = MinMaxScaler()
data_normalized = minmax_scaler.fit_transform(data[['age', 'salary']])

# Robust scaling (insensitive to outliers)
robust_scaler = RobustScaler()
data_robust = robust_scaler.fit_transform(data[['age', 'salary']])

# Log transform (for skewed distributions)
data['salary_log'] = np.log1p(data['salary'])

# Binning (discretize a continuous feature)
data['age_bin'] = pd.cut(data['age'], bins=[0, 30, 40, 50, 200],
                         labels=['young', 'young adult', 'middle-aged', 'outlier'])
2. Categorical Feature Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# Sample data
categories = pd.DataFrame({
    'city': ['北京', '上海', '广州', '北京', '深圳'],
    'education': ['本科', '硕士', '博士', '本科', '硕士']
})

# Label encoding (for ordinal categories; note that LabelEncoder is designed for
# target labels - prefer OrdinalEncoder for ordinal feature columns)
label_encoder = LabelEncoder()
categories['city_label'] = label_encoder.fit_transform(categories['city'])

# One-hot encoding (for nominal categories)
onehot = OneHotEncoder(sparse_output=False)
city_onehot = onehot.fit_transform(categories[['city']])
city_onehot_df = pd.DataFrame(city_onehot, columns=onehot.get_feature_names_out(['city']))

# Frequency encoding (replace each category with its relative frequency)
freq_encoding = categories['city'].value_counts(normalize=True)
categories['city_freq'] = categories['city'].map(freq_encoding)

# Target encoding (replace each category with the mean of the target variable)
# Note: take care to avoid data leakage
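Target encoding is only mentioned above, so here is a minimal sketch of an out-of-fold variant that limits leakage. The DataFrame, its column names (city, y), and the fold count are illustrative assumptions, not part of the original example.

import pandas as pd
from sklearn.model_selection import KFold

# Hypothetical data: a nominal feature plus a binary target column 'y'
df = pd.DataFrame({
    'city': ['北京', '上海', '北京', '深圳', '上海', '北京'],
    'y':    [1, 0, 1, 0, 1, 0]
})

# Out-of-fold target encoding: each row is encoded with the target mean
# computed on the other folds only, which limits leakage
kf = KFold(n_splits=3, shuffle=True, random_state=42)
global_mean = df['y'].mean()
df['city_target_enc'] = global_mean  # fallback for categories unseen in a fold
for train_idx, val_idx in kf.split(df):
    fold_means = df.iloc[train_idx].groupby('city')['y'].mean()
    encoded = df.iloc[val_idx]['city'].map(fold_means).fillna(global_mean)
    df.loc[df.index[val_idx], 'city_target_enc'] = encoded

print(df)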
3. Date and Time Feature Extraction
# Create date features
dates = pd.DataFrame({
    'timestamp': pd.date_range('2024-01-01', periods=10, freq='D')
})

# Extract a variety of calendar features
dates['year'] = dates['timestamp'].dt.year
dates['month'] = dates['timestamp'].dt.month
dates['day'] = dates['timestamp'].dt.day
dates['dayofweek'] = dates['timestamp'].dt.dayofweek
dates['is_weekend'] = dates['dayofweek'].isin([5, 6]).astype(int)
dates['quarter'] = dates['timestamp'].dt.quarter
dates['hour'] = dates['timestamp'].dt.hour  # useful if the timestamps include a time component

# Cyclical encoding (captures periodicity, e.g. December is adjacent to January)
dates['month_sin'] = np.sin(2 * np.pi * dates['month'] / 12)
dates['month_cos'] = np.cos(2 * np.pi * dates['month'] / 12)
4. Text Feature Processing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample text data
documents = [
    "Python is a great programming language",
    "Machine learning with Python is fun",
    "Feature engineering is important for ML"
]

# Bag-of-words model
count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(documents)
print("Vocabulary:", count_vectorizer.get_feature_names_out())

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10)
tfidf_features = tfidf_vectorizer.fit_transform(documents)

# N-gram features (unigrams and bigrams)
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_features = bigram_vectorizer.fit_transform(documents)
5. Feature Combination and Interaction
# Interaction features
data['age_salary_interaction'] = data['age'] * data['salary']
data['experience_per_year'] = data['experience'] / (data['age'] - 22)  # assumes age > 22

# Polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(data[['age', 'salary']])

# Combine categorical features
categories['city_education'] = categories['city'] + '_' + categories['education']
6. Feature Selection Methods
from sklearn.feature_selection import (
    SelectKBest, f_classif, RFE, SelectFromModel
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso

# X is the feature matrix and y the labels (assumed to be defined already)

# Filter method: statistical tests
selector = SelectKBest(score_func=f_classif, k=5)
selected_features = selector.fit_transform(X, y)

# Wrapper method: recursive feature elimination
estimator = RandomForestClassifier()
rfe = RFE(estimator, n_features_to_select=5)
rfe_features = rfe.fit_transform(X, y)

# Embedded method: model-based feature selection
lasso = Lasso(alpha=0.01)
lasso.fit(X, y)
# Features whose coefficients shrink to zero can be dropped
sfm = SelectFromModel(lasso, prefit=True)
embedded_features = sfm.transform(X)

# Variance threshold: remove low-variance features
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.1)
low_variance_features = selector.fit_transform(X)
7. Dimensionality Reduction
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE

# X is the raw feature matrix, X_standardized its scaled version,
# X_sample a (sub)sample used for visualization

# Principal component analysis (PCA)
pca = PCA(n_components=0.95)  # keep 95% of the variance
X_pca = pca.fit_transform(X_standardized)
print(f"Original dimensions: {X.shape[1]}, dimensions after PCA: {pca.n_components_}")

# t-SNE (for visualizing high-dimensional data)
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_sample)

# Linear discriminant analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)
IV. Feature Engineering Best Practices
1. Handling Missing Values
# Several strategies for handling missing values
from sklearn.impute import SimpleImputer, KNNImputer

# Mean / median / mode imputation
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# KNN imputation (uses the most similar samples)
knn_imputer = KNNImputer(n_neighbors=5)

# Missing-value indicator feature
data['age_missing'] = data['age'].isnull().astype(int)
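The imputers above are only instantiated; the sketch below shows how they would be applied, using a small hypothetical DataFrame with NaNs (the data and column names are assumptions for illustration).

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer

# Hypothetical data containing missing values
df = pd.DataFrame({
    'age':    [25, np.nan, 35, 40, np.nan],
    'salary': [50000, 60000, np.nan, 80000, 90000]
})

# Median imputation for all numeric columns
median_imputer = SimpleImputer(strategy='median')
df_median = pd.DataFrame(median_imputer.fit_transform(df), columns=df.columns)

# KNN imputation: each missing value is estimated from the 2 most similar rows
knn_imputer = KNNImputer(n_neighbors=2)
df_knn = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)

print(df_median)
print(df_knn)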
2. Outlier Detection and Handling
# Several outlier-detection methods
from scipy import stats

# Z-score method
z_scores = np.abs(stats.zscore(data['age']))
outliers_z = data[z_scores > 3]

# IQR method
Q1 = data['salary'].quantile(0.25)
Q3 = data['salary'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = data[(data['salary'] < lower_bound) | (data['salary'] > upper_bound)]

# Strategies for handling outliers
# 1. Capping
data['age_capped'] = data['age'].clip(upper=100)
# 2. Winsorizing (replace extreme values with quantiles)
data['salary_winsorized'] = data['salary'].clip(
    lower=data['salary'].quantile(0.01),
    upper=data['salary'].quantile(0.99)
)
3. Automated Feature Engineering
# Automated feature engineering with Featuretools
import featuretools as ft

# Build an entity set ('data' is assumed to have 'id' and 'timestamp' columns here)
es = ft.EntitySet(id='transactions')
es = es.add_dataframe(
    dataframe_name='transactions',
    dataframe=data,
    index='id',
    time_index='timestamp'
)

# Automatically generate features (deep feature synthesis)
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='transactions',
    max_depth=2,
    verbose=True
)

# Extract time-series features with tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute

# 'timeseries_data' is assumed to have 'id' and 'time' columns
ts_features = extract_features(timeseries_data, column_id='id', column_sort='time')
ts_features_imputed = impute(ts_features)
V. Worked Example: A Complete Feature Engineering Pipeline
def create_features_pipeline(data):
    """A complete feature-engineering pipeline."""
    # 1. Preprocessing: handle missing values
    data['age'] = data['age'].fillna(data['age'].median())
    data['income'] = data['income'].fillna(data['income'].mean())

    # 2. Basic features: standardize numeric columns
    scaler = StandardScaler()
    data[['age', 'income']] = scaler.fit_transform(data[['age', 'income']])

    # 3. Derived features
    data['age_income_ratio'] = data['age'] / (data['income'] + 1)
    data['age_squared'] = data['age'] ** 2

    # 4. Categorical features
    data = pd.get_dummies(data, columns=['city', 'education'], prefix=['city', 'edu'])

    # 5. Time features
    if 'timestamp' in data.columns:
        data['hour'] = pd.to_datetime(data['timestamp']).dt.hour
        data['is_business_hour'] = ((data['hour'] >= 9) & (data['hour'] <= 17)).astype(int)
        data = data.drop(columns=['timestamp'])  # keep only numeric columns for selection

    # 6. Feature selection: remove low-variance features
    from sklearn.feature_selection import VarianceThreshold
    selector = VarianceThreshold(threshold=0.01)
    features = selector.fit_transform(data)

    # 7. Keep the names of the selected features
    feature_names = data.columns[selector.get_support()]

    return features, feature_names
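A minimal usage sketch, assuming the imports from the earlier snippets (pandas, StandardScaler) are in scope and using a small hypothetical DataFrame with the columns the pipeline expects (age, income, city, education):

raw = pd.DataFrame({
    'age': [25, 32, None, 41],
    'income': [50000, None, 72000, 88000],
    'city': ['北京', '上海', '北京', '深圳'],
    'education': ['本科', '硕士', '博士', '本科']
})

features, feature_names = create_features_pipeline(raw)
print(list(feature_names))
print(features.shape)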
VI. Feature Engineering Caveats
VII. Recommended Tools
VIII. Today's Challenge
Feature engineering is where the art and the science of machine learning meet. By creatively transforming and combining data, you let the model see patterns hidden in the data. Remember: time invested in feature engineering usually pays off more than time spent tuning model hyperparameters!