1.导入必要的库
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
2.加载数据
# Load the dataset into a DataFrame.
# Assumes the data file is named data.csv and sits in the working directory.
df = pd.read_csv('data.csv')
3.数据预览
# Show the first 5 rows of the data.
print(df.head())

# Show basic information: column names, dtypes and non-null counts.
# DataFrame.info() prints its report itself, so no print() wrapper is needed.
df.info()
4.数据清洗
# Remove duplicated rows.
df.drop_duplicates(inplace=True)

# Fill missing values by propagating the last valid observation forward.
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
df.ffill(inplace=True)

# Depending on the dataset, some columns may need to be dropped:
# df.drop(['不需要的列名'], axis=1, inplace=True)
5.数据探索与分析
# Basic summary statistics.
print(df.describe())

# Grouped analysis: mean of every numeric column per group.
# numeric_only=True keeps text columns from raising in pandas >= 2.0.
grouped = df.groupby('某列名')
print(grouped.mean(numeric_only=True))

# Correlation analysis, restricted to numeric columns for the same reason.
print(df.corr(numeric_only=True))
6.数据可视化
# Histogram of a single column.
df['某列名'].hist(bins=50)
plt.show()

# Scatter plot of two columns.
df.plot(kind='scatter', x='列名X', y='列名Y')
plt.show()

# Heatmap of the correlation matrix; non-numeric columns are excluded
# because DataFrame.corr cannot correlate them.
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.show()
7.特征工程
# One-hot encode the categorical column.
# get_dummies returns one column per category, so the result cannot be
# assigned to a single column; join the prefixed dummy columns instead.
encoded = pd.get_dummies(df['类别列名'], prefix='编码后的列名')
df = df.join(encoded)

# Derive a new feature as the product of two existing columns.
df['新特征'] = df['某列名'] * df['另一列名']

# Standardize the selected column with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['需要标准化的列']] = scaler.fit_transform(df[['需要标准化的列']])
8.模型训练与评估
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Define the feature matrix and the target vector.
X = df[['特征列1', '特征列2', '特征列3']]
y = df['目标列']

# Split into training and test sets.
# NOTE(review): the original test_size value was truncated ("0."); 0.2 is an
# assumed conventional hold-out fraction -- confirm against the intended value.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize the model and fit it on the training data.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out test set.
predictions = model.predict(X_test)

# Evaluate with mean squared error.
print("均方差(MSE):", mean_squared_error(y_test, predictions))