1. Pandas简介
什么是Pandas?
Pandas是一个强大的Python数据分析库,建立在NumPy之上,提供了高级数据结构和数据分析工具。它的名字来源于"Panel Data"(面板数据)和"Python Data Analysis"(Python数据分析)。
Pandas的核心特性
# pandas_features.py
"""Print an overview of the core features that pandas provides."""
import pandas as pd

# Feature name -> short Chinese description, in presentation order.
pandas_features = {
    "DataFrame": "二维表格数据结构(类似Excel表格)",
    "Series": "一维带标签数组(类似字典)",
    "数据导入导出": "支持CSV、Excel、SQL、JSON等多种格式",
    "数据清洗": "处理缺失值、重复值、异常值",
    "数据转换": "数据重塑、合并、分组、聚合",
    "时间序列": "专门的时间序列处理功能",
    "可视化": "集成的Matplotlib绘图接口",
    "高性能": "基于NumPy,性能优异",
}

print("Pandas核心特性:")
for feature, description in pandas_features.items():
    print(f" • {feature}: {description}")
Pandas安装
# Basic installation
pip install pandas

# Optional extras — quoted so the square brackets are not treated as
# shell glob patterns (unquoted, zsh errors out and bash may misbehave
# if a matching file exists).
pip install 'pandas[excel]'        # Excel support
pip install 'pandas[parquet]'      # Parquet format support
pip install 'pandas[performance]'  # performance optimizations

# Using conda
conda install pandas

# Development version straight from the repository
pip install git+https://github.com/pandas-dev/pandas.git
2. Pandas核心数据结构
Series数据结构
# series_basics.py
"""Walk through pandas Series basics: creation, attributes, indexing,
arithmetic alignment, common methods, and function application.

Fix: positional access via ``s[1]`` on a label-indexed Series is
deprecated (removed in pandas 3.0, where ``[]`` always means label
lookup) — use ``s.iloc[1]`` for positional access instead.
"""
import pandas as pd
import numpy as np

print("=== Pandas Series基础 ===\n")

print("1. 创建Series:")
# From a plain list (NaN allowed)
s1 = pd.Series([1, 3, 5, np.nan, 6, 8])
print(f" 从列表创建: \n{s1}")
# From a NumPy array
arr = np.array([10, 20, 30, 40])
s2 = pd.Series(arr)
print(f"\n 从NumPy数组创建: \n{s2}")
# From a dict (keys become the index)
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3})
print(f"\n 从字典创建: \n{s3}")
# With an explicit index
s4 = pd.Series([100, 200, 300], index=['x', 'y', 'z'])
print(f"\n 指定索引: \n{s4}")

print("\n2. Series属性:")
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(f" Series:\n{s}")
print(f"\n 索引: {s.index}")
print(f" 值: {s.values}")
print(f" 数据类型: {s.dtype}")
print(f" 形状: {s.shape}")
print(f" 大小: {s.size}")
print(f" 是否为空: {s.empty}")

print("\n3. Series索引和切片:")
s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])
print(f" Series:\n{s}")
print(f"\n 标签索引 s['b']: {s['b']}")
# Positional access must go through .iloc on a labeled Series.
print(f" 位置索引 s.iloc[1]: {s.iloc[1]}")
# Label slices are inclusive of both endpoints.
print(f" 切片 s['b':'d']: \n{s['b':'d']}")
print(f" 布尔索引 s[s > 25]: \n{s[s > 25]}")
print(f" 花式索引 s[['a', 'c', 'e']]: \n{s[['a', 'c', 'e']]}")

print("\n4. Series运算:")
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'e', 'f'])
print(f" s1:\n{s1}")
print(f"\n s2:\n{s2}")
# Arithmetic aligns on the index; non-overlapping labels produce NaN.
print(f"\n s1 + s2 (自动对齐索引):\n{s1 + s2}")
print(f" s1 * 2:\n{s1 * 2}")
print(f" s1 + 10:\n{s1 + 10}")

print("\n5. Series方法:")
s = pd.Series([1, 2, 3, 4, 5, np.nan, 7, 8])
print(f" Series:\n{s}")
print(f"\n 统计方法:")
print(f" sum(): {s.sum()}")
print(f" mean(): {s.mean()}")
print(f" std(): {s.std()}")
print(f" min(): {s.min()}")
print(f" max(): {s.max()}")
print(f" count(): {s.count()} (非NaN数量)")
print(f" isnull().sum(): {s.isnull().sum()} (NaN数量)")
print(f"\n 数据处理方法:")
print(f" fillna(0):\n{s.fillna(0)}")
print(f" dropna():\n{s.dropna()}")
print(f" cumsum():\n{s.cumsum()}")

print("\n6. Series应用函数:")
s = pd.Series([1, 4, 9, 16, 25])
# NumPy ufuncs operate elementwise on a Series directly.
print(f" sqrt(s):\n{np.sqrt(s)}")
# apply() runs a Python callable per element (slower than vectorized forms).
print(f" s.apply(lambda x: x**0.5):\n{s.apply(lambda x: x**0.5)}")
# Vectorized arithmetic is the idiomatic, fastest option.
print(f" s ** 0.5:\n{s ** 0.5}")
DataFrame数据结构
# dataframe_basics.py
"""Walk through pandas DataFrame basics: creation, attributes, viewing,
column/row operations, filtering, sorting, and index resetting.

Fix: ``DataFrame.append`` was removed in pandas 2.0 — rows are now added
with ``pd.concat``.
"""
import pandas as pd
import numpy as np

print("=== Pandas DataFrame基础 ===\n")

print("1. 创建DataFrame:")
# From a dict of equal-length columns
data = {
    '姓名': ['张三', '李四', '王五', '赵六'],
    '年龄': [25, 30, 35, 28],
    '城市': ['北京', '上海', '广州', '深圳'],
    '工资': [50000, 60000, 55000, 65000]
}
df = pd.DataFrame(data)
print(f" 从字典创建:\n{df}")
# From a list of row lists with explicit column names
data_list = [
    ['苹果', '水果', 5.0, 100],
    ['香蕉', '水果', 3.0, 200],
    ['胡萝卜', '蔬菜', 2.5, 150],
    ['牛肉', '肉类', 30.0, 50]
]
df2 = pd.DataFrame(data_list, columns=['商品', '类别', '价格', '库存'])
print(f"\n 从列表创建:\n{df2}")
# From a NumPy array
arr = np.random.rand(4, 3)
df3 = pd.DataFrame(arr, columns=['A', 'B', 'C'])
print(f"\n 从NumPy数组创建:\n{df3}")

print("\n2. DataFrame属性:")
print(f" 形状: {df.shape}")
print(f" 维度: {df.ndim}")
print(f" 大小: {df.size}")
print(f" 列名: {df.columns}")
print(f" 索引: {df.index}")
print(f" 数据类型:\n{df.dtypes}")
print(f"\n 信息概览:")
print(df.info())  # info() prints and returns None
print(f"\n 统计描述:\n{df.describe()}")

print("\n3. 数据查看:")
print(f" 前2行:\n{df.head(2)}")
print(f"\n 后2行:\n{df.tail(2)}")
print(f"\n 随机3行:\n{df.sample(3, random_state=42)}")

print("\n4. 列操作:")
print(f" 选择单列 df['姓名']:\n{df['姓名']}")
print(f"\n 选择多列 df[['姓名', '年龄']]:\n{df[['姓名', '年龄']]}")
# Add a derived column
df['年薪'] = df['工资'] * 12
print(f"\n 添加年薪列:\n{df}")
# Modify a column in place
df['城市'] = df['城市'] + '市'
print(f"\n 修改城市列:\n{df}")
# drop() returns a new frame; df itself is untouched here
df_dropped = df.drop('年薪', axis=1)
print(f"\n 删除年薪列:\n{df_dropped}")

print("\n5. 行操作:")
print(f" 通过索引选择 df.iloc[1]:\n{df.iloc[1]}")
print(f"\n 通过标签选择 df.loc[0]:\n{df.loc[0]}")
# DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
new_row = {'姓名': '钱七', '年龄': 32, '城市': '成都', '工资': 58000, '年薪': 696000}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
print(f"\n 添加新行:\n{df}")
# Modify cells by label
df.loc[0, '工资'] = 52000
df.loc[0, '年薪'] = 624000
print(f"\n 修改第一行工资:\n{df}")
# Drop the row whose index label is 2 (leaves a gap in the index)
df = df.drop(2)
print(f"\n 删除索引为2的行:\n{df}")

print("\n6. 条件筛选:")
print(f" 年龄大于28:\n{df[df['年龄'] > 28]}")
print(f"\n 城市包含'北':\n{df[df['城市'].str.contains('北')]}")
print(f"\n 年龄大于28且工资大于55000:\n{df[(df['年龄'] > 28) & (df['工资'] > 55000)]}")
print(f"\n 城市为北京或上海:\n{df[df['城市'].isin(['北京市', '上海市'])]}")

print("\n7. 排序:")
print(f" 按年龄升序:\n{df.sort_values('年龄')}")
print(f"\n 按工资降序:\n{df.sort_values('工资', ascending=False)}")
print(f"\n 先按城市升序,再按年龄降序:\n{df.sort_values(['城市', '年龄'], ascending=[True, False])}")

print("\n8. 重置索引:")
df_reset = df.reset_index(drop=True)
print(f" 重置索引:\n{df_reset}")
3. 数据导入导出
数据读取和保存
# data_io.py
"""Demonstrate pandas I/O: CSV, Excel, JSON, SQL, binary formats, and
reading from the network, then clean up all generated files.

Fixes: the network read used a bare ``except:`` (now ``except
Exception:`` so KeyboardInterrupt/SystemExit still propagate), and the
Parquet/HDF5 sections are guarded so a missing optional engine
(pyarrow / tables) no longer crashes the whole script.
"""
import pandas as pd
import numpy as np
import os

print("=== Pandas数据导入导出 ===\n")

# Sample data shared by all the examples below.
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 28, 32],
    'City': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
    'Salary': [50000, 60000, 55000, 70000, 65000]
}
df = pd.DataFrame(data)

print("1. CSV文件操作:")
df.to_csv('employees.csv', index=False)
print(f" 保存到 employees.csv")
df_csv = pd.read_csv('employees.csv')
print(f"\n 读取CSV文件:\n{df_csv}")
# Restrict to selected columns and the first three rows at read time.
df_csv_params = pd.read_csv('employees.csv', usecols=['ID', 'Name', 'Age'], nrows=3)
print(f"\n 读取CSV(指定列和前3行):\n{df_csv_params}")

print("\n2. Excel文件操作:")
df.to_excel('employees.xlsx', sheet_name='员工信息', index=False)
print(f" 保存到 employees.xlsx")
df_excel = pd.read_excel('employees.xlsx', sheet_name='员工信息')
print(f"\n 读取Excel文件:\n{df_excel}")
# ExcelWriter lets several sheets go into one workbook.
with pd.ExcelWriter('multiple_sheets.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    df.head(3).to_excel(writer, sheet_name='Sheet2', index=False)
print(f" 创建多sheet的Excel文件")

print("\n3. JSON文件操作:")
df.to_json('employees.json', orient='records')
print(f" 保存到 employees.json")
df_json = pd.read_json('employees.json')
print(f"\n 读取JSON文件:\n{df_json}")

print("\n4. SQL数据库操作:")
# 需要安装sqlalchemy和数据库驱动
# pip install sqlalchemy
# pip install pymysql     # MySQL
# pip install psycopg2    # PostgreSQL
try:
    from sqlalchemy import create_engine
    # In-memory SQLite keeps the example self-contained.
    engine = create_engine('sqlite:///:memory:')
    df.to_sql('employees', engine, if_exists='replace', index=False)
    print(f" 保存到SQLite数据库")
    df_sql = pd.read_sql('SELECT * FROM employees', engine)
    print(f"\n 从数据库读取:\n{df_sql}")
    df_query = pd.read_sql_query('SELECT Name, Age FROM employees WHERE Age > 30', engine)
    print(f"\n 执行查询:\n{df_query}")
except ImportError:
    print(" 需要安装sqlalchemy才能运行SQL示例")

print("\n5. 其他格式:")
# Parquet (efficient columnar storage) — requires pyarrow or fastparquet.
try:
    df.to_parquet('employees.parquet')
    print(f" 保存为Parquet格式")
    df_parquet = pd.read_parquet('employees.parquet')
except ImportError:
    print(" 需要安装pyarrow才能运行Parquet示例")
# HDF5 (common in scientific computing) — requires PyTables.
try:
    df.to_hdf('employees.h5', key='df', mode='w')
    print(f" 保存为HDF5格式")
    df_hdf = pd.read_hdf('employees.h5')
except ImportError:
    print(" 需要安装tables才能运行HDF5示例")
# Pickle (Python object serialization) — stdlib only, always available.
df.to_pickle('employees.pkl')
print(f" 保存为Pickle格式")
df_pickle = pd.read_pickle('employees.pkl')

print("\n6. 网络数据:")
try:
    url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
    df_titanic = pd.read_csv(url)
    print(f" 从网络读取Titanic数据集,形状: {df_titanic.shape}")
except Exception:
    # Narrowed from a bare except: network/parse errors only, not SystemExit.
    print(" 网络读取示例需要网络连接")

print("\n7. 清理生成的文件:")
files_to_remove = [
    'employees.csv', 'employees.xlsx', 'multiple_sheets.xlsx',
    'employees.json', 'employees.parquet', 'employees.h5', 'employees.pkl'
]
for file in files_to_remove:
    if os.path.exists(file):
        os.remove(file)
        print(f" 已删除: {file}")
处理大型数据集
# large_data.py
"""Techniques for working with large CSV files: chunked reads, column
subsetting, explicit dtypes, row filtering, and memory optimization.

Fix: ``Series.to_sparse()`` was removed in pandas 1.0 — a sparse Series
is now created via ``astype(pd.SparseDtype(...))``. The stray bottom
``import os`` was also hoisted to the top of the file.
"""
import os

import pandas as pd
import numpy as np

print("=== 处理大型数据集 ===\n")

print("1. 分块读取:")
# For large CSV files, read in chunks instead of all at once.
chunk_size = 1000
chunks = []
# Build a demo CSV with 10k rows.
large_data = pd.DataFrame({
    'id': range(10000),
    'value': np.random.randn(10000),
    'category': np.random.choice(['A', 'B', 'C', 'D'], 10000)
})
large_data.to_csv('large_data.csv', index=False)
print(f" 分块读取large_data.csv (每块{chunk_size}行)")
for chunk in pd.read_csv('large_data.csv', chunksize=chunk_size):
    chunks.append(chunk)
    print(f" 读取块 {len(chunks)},形状: {chunk.shape}")
df_combined = pd.concat(chunks, ignore_index=True)
print(f"\n 合并后形状: {df_combined.shape}")

print("\n2. 选择列子集:")
# Only parse the columns you actually need.
df_subset = pd.read_csv('large_data.csv', usecols=['id', 'category'])
print(f" 只读取id和category列,形状: {df_subset.shape}")

print("\n3. 指定数据类型:")
# Explicit dtypes (narrow ints/floats, 'category') reduce memory use.
dtypes = {
    'id': 'int32',
    'value': 'float32',
    'category': 'category'
}
df_typed = pd.read_csv('large_data.csv', dtype=dtypes)
print(f" 指定数据类型后:")
print(f" 内存使用: {df_typed.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
print(f" 数据类型:\n{df_typed.dtypes}")

print("\n4. 过滤行:")
# Option 1: skiprows (range starts at 1 so the header row is kept).
print(f" 跳过前1000行:")
df_skip = pd.read_csv('large_data.csv', skiprows=range(1, 1001))
print(f" 形状: {df_skip.shape}")
# Option 2: nrows caps how many data rows are read.
print(f"\n 只读取前1000行:")
df_nrows = pd.read_csv('large_data.csv', nrows=1000)
print(f" 形状: {df_nrows.shape}")

print("\n5. 内存优化技巧:")
# Categorical dtype for low-cardinality string columns.
df = pd.read_csv('large_data.csv')
df['category'] = df['category'].astype('category')
# Sparse storage suits data that is mostly a single fill value (here 0).
# Series.to_sparse() was removed in pandas 1.0; use SparseDtype instead.
sparse_series = pd.Series([0, 0, 1, 0, 0, 0, 0, 0, 0, 1]).astype(pd.SparseDtype("int64", 0))
print(f" 稀疏Series:\n{sparse_series}")
print(f" 稀疏Series内存使用: {sparse_series.memory_usage(deep=True)} 字节")
# Dropping unneeded columns releases their memory.
df_reduced = df.drop('value', axis=1)
print(f"\n 删除value列后内存: {df_reduced.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

print("\n6. 清理文件:")
if os.path.exists('large_data.csv'):
    os.remove('large_data.csv')
    print(f" 已删除large_data.csv")
4. 数据清洗和预处理
处理缺失值
# missing_values.py
"""Detect, drop, fill, and interpolate missing values, then show
imputation and missingness-pattern analysis.

Fixes for pandas >= 2.0: ``df.fillna(df.mean())`` raised TypeError on
the object column 'D' (now ``df.mean(numeric_only=True)``);
``fillna(method='ffill'/'bfill')`` is deprecated (now ``ffill()`` /
``bfill()``); the scikit-learn example is guarded so a missing sklearn
no longer crashes the whole script.
"""
import pandas as pd
import numpy as np

print("=== 处理缺失值 ===\n")

# Mixed numeric/object frame with NaN in columns A, B, and D.
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': [1, 2, 3, 4, 5],
    'D': ['a', 'b', np.nan, 'd', 'e']
}
df = pd.DataFrame(data)
print("原始数据:")
print(df)
print(f"\n缺失值统计:")
print(df.isnull().sum())

print("\n1. 检测缺失值:")
print(f" df.isnull():\n{df.isnull()}")
print(f"\n df.notnull():\n{df.notnull()}")
print(f"\n 每列缺失值数量:\n{df.isnull().sum()}")
print(f"\n 总缺失值数量: {df.isnull().sum().sum()}")
print(f" 是否有缺失值: {df.isnull().values.any()}")

print("\n2. 删除缺失值:")
print(" 删除包含缺失值的行:")
print(f" df.dropna():\n{df.dropna()}")
print("\n 删除包含缺失值的列:")
print(f" df.dropna(axis=1):\n{df.dropna(axis=1)}")
print("\n 只删除所有值都为NaN的行:")
print(f" df.dropna(how='all'):\n{df.dropna(how='all')}")
print("\n 只删除在特定列有缺失值的行:")
print(f" df.dropna(subset=['A', 'B']):\n{df.dropna(subset=['A', 'B'])}")

print("\n3. 填充缺失值:")
print(" 用固定值填充:")
print(f" df.fillna(0):\n{df.fillna(0)}")
print("\n 用每列的平均值填充:")
# numeric_only=True: object column 'D' has no mean and would raise.
print(f" df.fillna(df.mean(numeric_only=True)):\n{df.fillna(df.mean(numeric_only=True))}")
print("\n 前向填充:")
# fillna(method=...) is deprecated; ffill()/bfill() are the replacements.
print(f" df.ffill():\n{df.ffill()}")
print("\n 后向填充:")
print(f" df.bfill():\n{df.bfill()}")
print("\n 不同列用不同值填充:")
fill_values = {'A': df['A'].mean(), 'B': 0, 'D': 'missing'}
print(f" df.fillna(fill_values):\n{df.fillna(fill_values)}")

print("\n4. 插值:")
# A daily time series with gaps, for interpolation demos.
dates = pd.date_range('2023-01-01', periods=10, freq='D')
ts_data = pd.Series([1, np.nan, np.nan, np.nan, 5, np.nan, 7, 8, np.nan, 10], index=dates)
print(" 时间序列数据:")
print(ts_data)
print("\n 线性插值:")
print(f" ts_data.interpolate():\n{ts_data.interpolate()}")
print("\n 时间插值:")
# method='time' weights by the actual time deltas in the index.
print(f" ts_data.interpolate(method='time'):\n{ts_data.interpolate(method='time')}")

print("\n5. 高级缺失值处理:")
# Machine-learning based imputation (optional dependency).
try:
    from sklearn.impute import SimpleImputer
    num_data = df[['A', 'B', 'C']].values
    imputer = SimpleImputer(strategy='mean')
    imputed_data = imputer.fit_transform(num_data)
    df_imputed = pd.DataFrame(imputed_data, columns=['A', 'B', 'C'])
    print(f" 使用Scikit-learn的SimpleImputer:\n{df_imputed}")
except ImportError:
    print(" 需要安装scikit-learn才能运行SimpleImputer示例")

print("\n6. 缺失值模式分析:")
# 1 marks a missing cell; correlations reveal co-missing columns.
missing_pattern = df.isnull().astype(int)
print(f" 缺失值模式(1表示缺失):\n{missing_pattern}")
missing_corr = df.isnull().corr()
print(f"\n 缺失值相关性:\n{missing_corr}")
处理重复值和异常值
# duplicates_outliers.py
"""Handle duplicate rows, detect and treat outliers, transform skewed
data, and normalize a numeric column."""
import pandas as pd
import numpy as np

print("=== 处理重复值和异常值 ===\n")

print("1. 处理重复值:")
# Rows 0/4 and 1/5 are exact duplicates on purpose.
data = {
    'id': [1, 2, 3, 4, 1, 2, 7, 8],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Alice', 'Bob', 'Eve', 'Frank'],
    'score': [85, 90, 78, 92, 85, 90, 88, 95]
}
df = pd.DataFrame(data)
print(" 原始数据:")
print(df)
print(f"\n 检测重复行:")
print(f" df.duplicated():\n{df.duplicated()}")
print(f"\n 基于特定列的重复检测:")
print(f" df.duplicated(subset=['id', 'name']):\n{df.duplicated(subset=['id', 'name'])}")
print(f"\n 删除重复行:")
print(f" df.drop_duplicates():\n{df.drop_duplicates()}")
print(f"\n 基于特定列删除重复行:")
print(f" df.drop_duplicates(subset=['id', 'name']):\n{df.drop_duplicates(subset=['id', 'name'])}")
print(f"\n 保留第一次出现的重复行(默认):")
print(f" df.drop_duplicates(keep='first'):\n{df.drop_duplicates(keep='first')}")
print(f"\n 保留最后一次出现的重复行:")
print(f" df.drop_duplicates(keep='last'):\n{df.drop_duplicates(keep='last')}")
print(f"\n 删除所有重复行:")
print(f" df.drop_duplicates(keep=False):\n{df.drop_duplicates(keep=False)}")

print("\n2. 处理异常值:")
# 95 points from N(50, 10) plus 5 planted extremes.
np.random.seed(42)
normal_data = np.random.normal(50, 10, 95)
outliers = np.array([120, 130, 140, -20, -30])
all_data = np.concatenate([normal_data, outliers])
df_outliers = pd.DataFrame({'value': all_data})
print(" 包含异常值的数据:")
print(f" 形状: {df_outliers.shape}")
print(f" 统计描述:\n{df_outliers.describe()}")

print("\n 可视化数据分布:")
# Histogram + box plot side by side.
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.subplot(1, 2, 1)
plt.hist(df_outliers['value'], bins=30, edgecolor='black')
plt.title('直方图')
plt.xlabel('值')
plt.ylabel('频数')
plt.subplot(1, 2, 2)
plt.boxplot(df_outliers['value'])
plt.title('箱线图')
plt.ylabel('值')
plt.tight_layout()
plt.show()

print("\n 检测异常值方法:")
# Detection method 1: absolute Z-score above a threshold.
from scipy import stats
z_scores = np.abs(stats.zscore(df_outliers['value']))
threshold = 3
outliers_z = df_outliers[z_scores > threshold]
print(f"\n Z-score方法(阈值={threshold}):")
print(f" 检测到 {len(outliers_z)} 个异常值")
print(f" 异常值:\n{outliers_z}")
# Detection method 2: 1.5*IQR fences around the quartiles.
Q1 = df_outliers['value'].quantile(0.25)
Q3 = df_outliers['value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = df_outliers[(df_outliers['value'] < lower_bound) | (df_outliers['value'] > upper_bound)]
print(f"\n IQR方法:")
print(f" Q1={Q1:.2f}, Q3={Q3:.2f}, IQR={IQR:.2f}")
print(f" 边界: [{lower_bound:.2f}, {upper_bound:.2f}]")
print(f" 检测到 {len(outliers_iqr)} 个异常值")
print(f" 异常值:\n{outliers_iqr}")

print("\n 处理异常值:")
# Treatment 1: drop the flagged rows.
df_no_outliers = df_outliers[(z_scores <= threshold)]
print(f" 删除异常值后形状: {df_no_outliers.shape}")
# Treatment 2: winsorize (clamp the tails at the 5% quantiles).
from scipy.stats.mstats import winsorize
winsorized = winsorize(df_outliers['value'], limits=[0.05, 0.05])
df_winsorized = pd.DataFrame({'value': winsorized})
print(f"\n 截断处理(限制5%):")
print(f" 统计描述:\n{df_winsorized.describe()}")
# Treatment 3: replace flagged values with the median.
median_value = df_outliers['value'].median()
df_median_replaced = df_outliers.copy()
df_median_replaced['value'] = np.where(z_scores > threshold, median_value, df_outliers['value'])
print(f"\n 用中位数替换异常值:")
print(f" 统计描述:\n{df_median_replaced.describe()}")

print("\n3. 数据转换:")
# Exponential data is right-skewed; try log and Box-Cox transforms.
skewed_data = np.random.exponential(scale=2, size=1000)
df_skewed = pd.DataFrame({'value': skewed_data})
print(" 偏态数据:")
print(f" 偏度: {df_skewed['value'].skew():.2f}")
print(f" 峰度: {df_skewed['value'].kurtosis():.2f}")
df_skewed['log_value'] = np.log1p(df_skewed['value'])
print(f"\n 对数变换后偏度: {df_skewed['log_value'].skew():.2f}")
from scipy import stats
transformed, lambda_value = stats.boxcox(df_skewed['value'] + 1)  # +1 keeps inputs positive
df_skewed['boxcox_value'] = transformed
print(f" Box-Cox变换后偏度: {df_skewed['boxcox_value'].skew():.2f}")
print(f" λ值: {lambda_value:.2f}")

print("\n4. 数据规范化:")
df_norm = df_outliers.copy()
# Min-max scaling to [0, 1].
df_norm['minmax'] = (df_norm['value'] - df_norm['value'].min()) / (df_norm['value'].max() - df_norm['value'].min())
# Z-score standardization.
df_norm['zscore'] = (df_norm['value'] - df_norm['value'].mean()) / df_norm['value'].std()
print(" 规范化后的数据(前5行):")
print(df_norm.head())
5. 数据转换和操作
数据重塑
# data_transformation.py
"""Reshape and combine data: pivot tables, melt, stack/unstack,
groupby, merge/join/concat, sorting and ranking."""
import pandas as pd
import numpy as np

print("=== 数据转换和重塑 ===\n")

print("1. 数据透视表(Pivot Table):")
# Monthly sales: product/region repeat in lockstep every 3 rows.
sales_data = {
    '日期': pd.date_range('2023-01-01', periods=12, freq='M'),
    '产品': ['A', 'B', 'C'] * 4,
    '地区': ['北京', '上海', '广州'] * 4,
    '销售额': np.random.randint(1000, 5000, 12),
    '数量': np.random.randint(10, 100, 12)
}
df = pd.DataFrame(sales_data)
print(" 原始销售数据:")
print(df)
print("\n 创建数据透视表:")
pivot = df.pivot_table(
    values='销售额', index='产品', columns='地区', aggfunc='sum', fill_value=0)
print(f" 按产品和地区汇总销售额:\n{pivot}")
print("\n 多级数据透视表:")
# Different aggregation per value column.
pivot_multi = df.pivot_table(
    values=['销售额', '数量'], index=['产品', '地区'],
    aggfunc={'销售额': 'sum', '数量': 'mean'})
print(f" 按产品和地区的多指标汇总:\n{pivot_multi}")
print("\n 带边缘总计的数据透视表:")
pivot_margins = df.pivot_table(
    values='销售额', index='产品', columns='地区', aggfunc='sum',
    margins=True, margins_name='总计')
print(f" 带总计的数据透视表:\n{pivot_margins}")

print("\n2. 数据融合(Melt):")
# Wide format: one column per subject.
wide_data = {
    '姓名': ['张三', '李四', '王五'],
    '语文': [85, 90, 78],
    '数学': [92, 88, 95],
    '英语': [88, 85, 92]
}
df_wide = pd.DataFrame(wide_data)
print(" 宽格式数据:")
print(df_wide)
print("\n 转换为长格式:")
df_long = df_wide.melt(
    id_vars='姓名', value_vars=['语文', '数学', '英语'],
    var_name='科目', value_name='成绩')
print(f" 长格式数据:\n{df_long}")

print("\n3. 数据堆叠(Stack)和反堆叠(Unstack):")
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
index = pd.MultiIndex.from_arrays(arrays, names=['字母', '数字'])
df_multi = pd.DataFrame({'值': [10, 20, 30, 40]}, index=index)
print(" 多层索引数据:")
print(df_multi)
print("\n 堆叠操作:")
stacked = df_multi.stack()
print(f" 堆叠后:\n{stacked}")
print("\n 反堆叠操作:")
unstacked = df_multi.unstack()
print(f" 反堆叠后:\n{unstacked}")

print("\n4. 数据分组(GroupBy):")
print(" 原始销售数据:")
print(df)
print("\n 按产品分组:")
grouped = df.groupby('产品')
for name, group in grouped:
    print(f"\n 产品 {name}:")
    print(group)
print("\n 分组聚合:")
agg_result = df.groupby('产品').agg({
    '销售额': ['sum', 'mean', 'std'],
    '数量': ['sum', 'mean']
})
print(f" 按产品的聚合统计:\n{agg_result}")
print("\n 多级分组:")
multi_group = df.groupby(['产品', '地区']).agg({'销售额': 'sum'})
print(f" 按产品和地区分组汇总:\n{multi_group}")
print("\n 分组转换:")
# transform keeps the original row count: each row gets its group share.
df['销售额占比'] = df.groupby('产品')['销售额'].transform(lambda x: x / x.sum())
print(f" 添加销售额占比列:\n{df}")
print("\n 分组筛选:")
# filter keeps whole groups whose mean sales exceed 3000.
high_sales = df.groupby('产品').filter(lambda x: x['销售额'].mean() > 3000)
print(f" 筛选高销售额产品记录:\n{high_sales}")

print("\n5. 数据合并(Merge/Join/Concat):")
df1 = pd.DataFrame({
    '员工ID': [1, 2, 3, 4],
    '姓名': ['张三', '李四', '王五', '赵六'],
    '部门': ['技术部', '销售部', '技术部', '市场部']
})
df2 = pd.DataFrame({
    '员工ID': [1, 2, 3, 5],
    '工资': [50000, 60000, 55000, 65000],
    '入职日期': ['2020-01-15', '2019-03-20', '2021-05-10', '2018-08-01']
})
print(" 数据框1:")
print(df1)
print("\n 数据框2:")
print(df2)
print("\n 内连接(Inner Join):")
inner_merge = pd.merge(df1, df2, on='员工ID', how='inner')
print(f" 内连接结果:\n{inner_merge}")
print("\n 左连接(Left Join):")
left_merge = pd.merge(df1, df2, on='员工ID', how='left')
print(f" 左连接结果:\n{left_merge}")
print("\n 右连接(Right Join):")
right_merge = pd.merge(df1, df2, on='员工ID', how='right')
print(f" 右连接结果:\n{right_merge}")
print("\n 外连接(Full Outer Join):")
outer_merge = pd.merge(df1, df2, on='员工ID', how='outer')
print(f" 外连接结果:\n{outer_merge}")
print("\n 连接多个列:")
df3 = pd.DataFrame({
    '部门': ['技术部', '销售部', '市场部'],
    '经理': ['王经理', '李经理', '张经理'],
    '预算': [1000000, 800000, 600000]
})
multi_merge = pd.merge(df1, df3, on='部门', how='left')
print(f" 按部门合并:\n{multi_merge}")
print("\n 连接(Join):")
# join() aligns on the index, so set 员工ID as index first.
df1_indexed = df1.set_index('员工ID')
df2_indexed = df2.set_index('员工ID')
join_result = df1_indexed.join(df2_indexed, how='left')
print(f" Join结果:\n{join_result}")
print("\n 连接(Concat):")
# Vertical concatenation (append one extra employee).
concat_vertical = pd.concat([df1, pd.DataFrame({
    '员工ID': [5],
    '姓名': ['钱七'],
    '部门': ['人事部']
})], ignore_index=True)
print(f" 垂直连接:\n{concat_vertical}")
# Horizontal concatenation (side by side on the default index).
concat_horizontal = pd.concat([df1, df2.drop('员工ID', axis=1)], axis=1)
print(f"\n 水平连接:\n{concat_horizontal}")

print("\n6. 数据排序和排名:")
df = pd.DataFrame({
    '姓名': ['张三', '李四', '王五', '赵六', '钱七'],
    '销售额': [50000, 60000, 45000, 70000, 55000],
    '订单数': [25, 30, 20, 35, 28]
})
print(" 原始数据:")
print(df)
print("\n 按销售额降序排序:")
sorted_df = df.sort_values('销售额', ascending=False)
print(sorted_df)
print("\n 排名:")
# method='min' gives ties the lowest shared rank.
df['销售额排名'] = df['销售额'].rank(ascending=False, method='min')
df['订单数排名'] = df['订单数'].rank(ascending=False, method='min')
print(f" 添加排名列:\n{df}")
6. 时间序列处理
# time_series.py
"""Pandas time-series tour: creation, indexing, resampling, rolling
windows, seasonal decomposition, feature engineering, and time zones.

Fix: the statsmodels import was unguarded at the top of section 5, so a
missing statsmodels crashed the whole script — it is now wrapped in a
try/except ImportError like the other optional-dependency examples.
"""
import pandas as pd
import numpy as np

print("=== Pandas时间序列处理 ===\n")

print("1. 创建时间序列:")
# Parse strings into a DatetimeIndex.
dates = pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03'])
print(f" 从字符串创建: {dates}")
# Generate a daily range.
date_range = pd.date_range('2023-01-01', periods=10, freq='D')
print(f"\n 生成日期范围: {date_range}")
# Hourly frequency ('H' alias; newer pandas prefers 'h' but accepts both).
datetime_range = pd.date_range('2023-01-01', periods=5, freq='H')
print(f"\n 生成小时频率日期范围: {datetime_range}")
ts = pd.Series(np.random.randn(10), index=date_range)
print(f"\n 时间序列数据:\n{ts}")

print("\n2. 时间序列索引和切片:")
dates = pd.date_range('2023-01-01', periods=100, freq='D')
ts = pd.Series(np.random.randn(100), index=dates)
print(f" 时间序列数据(前5行):\n{ts.head()}")
print(f"\n 按日期索引: ts['2023-01-15'] = {ts['2023-01-15']}")
# Partial string indexing selects a whole month.
print(f" 按月份切片: ts['2023-01']:\n{ts['2023-01'].head()}")
print(f" 按日期范围切片: ts['2023-01-15':'2023-01-20']:\n{ts['2023-01-15':'2023-01-20']}")

print("\n3. 时间序列重采样:")
dates = pd.date_range('2023-01-01', periods=90, freq='D')
data = np.random.randn(90).cumsum()  # cumulative sum fakes a trend
ts = pd.Series(data, index=dates)
print(f" 原始日度数据(前5行):\n{ts.head()}")
print(f" 形状: {ts.shape}")
print("\n 重采样为月度数据(取平均值):")
monthly = ts.resample('M').mean()
print(f" 月度数据:\n{monthly}")
print(f" 形状: {monthly.shape}")
print("\n 重采样为周数据(取第一个值):")
weekly = ts.resample('W').first()
print(f" 周数据(前5行):\n{weekly.head()}")
print(f" 形状: {weekly.shape}")
print("\n 重采样为周数据(不同聚合方法):")
weekly_agg = ts.resample('W').agg(['mean', 'min', 'max', 'std'])
print(f" 周数据(多种统计):\n{weekly_agg.head()}")

print("\n4. 移动窗口操作:")
dates = pd.date_range('2023-01-01', periods=30, freq='D')
ts = pd.Series(np.random.randn(30), index=dates)
print(f" 原始数据:\n{ts.head()}")
print("\n 移动平均值(窗口大小为3):")
moving_avg = ts.rolling(window=3).mean()
print(f" 移动平均值:\n{moving_avg.head()}")
print("\n 移动标准差:")
moving_std = ts.rolling(window=3).std()
print(f" 移动标准差:\n{moving_std.head()}")
print("\n 扩展窗口(累计平均):")
expanding_mean = ts.expanding().mean()
print(f" 扩展平均:\n{expanding_mean.head()}")
print("\n 指数加权移动平均:")
ewm = ts.ewm(span=3).mean()
print(f" 指数加权移动平均:\n{ewm.head()}")

print("\n5. 时间序列分解:")
# Synthetic series = linear trend + 12-period sinusoid + noise.
np.random.seed(42)
n = 100
t = np.arange(n)
trend = 0.1 * t
seasonal = 5 * np.sin(2 * np.pi * t / 12)
noise = np.random.randn(n) * 2
data = trend + seasonal + noise
dates = pd.date_range('2023-01-01', periods=n, freq='M')
ts = pd.Series(data, index=dates)
print(f" 创建的时间序列(前5行):\n{ts.head()}")
# statsmodels is optional — guard it so the rest of the script still runs.
try:
    from statsmodels.tsa.seasonal import seasonal_decompose
    result = seasonal_decompose(ts, model='additive', period=12)
    print(f"\n 分解结果:")
    print(f" 趋势成分(前5行):\n{result.trend.head()}")
    print(f" 季节性成分(前5行):\n{result.seasonal.head()}")
    print(f" 残差成分(前5行):\n{result.resid.head()}")
except ImportError:
    print(" 需要安装statsmodels才能运行时间序列分解示例")

print("\n6. 时间序列特征工程:")
dates = pd.date_range('2023-01-01', periods=100, freq='D')
ts = pd.Series(np.random.randn(100).cumsum(), index=dates)
df = pd.DataFrame({'value': ts})
print(" 原始时间序列(前5行):")
print(df.head())
print("\n 提取时间特征:")
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['dayofweek'] = df.index.dayofweek  # 0 = Monday, 6 = Sunday
df['quarter'] = df.index.quarter
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
print(f" 添加时间特征后(前5行):\n{df.head()}")
print("\n 滞后特征:")
df['value_lag1'] = df['value'].shift(1)        # lag by 1 day
df['value_lag7'] = df['value'].shift(7)        # lag by 1 week
df['value_diff1'] = df['value'].diff(1)        # first difference
df['value_pct_change'] = df['value'].pct_change()  # percent change
print(f" 添加滞后特征后(前10行):\n{df.head(10)}")
print("\n 滚动统计特征:")
df['value_roll_mean7'] = df['value'].rolling(window=7).mean()
df['value_roll_std7'] = df['value'].rolling(window=7).std()
df['value_roll_min7'] = df['value'].rolling(window=7).min()
df['value_roll_max7'] = df['value'].rolling(window=7).max()
print(f" 添加滚动特征后(前10行):\n{df.head(10)}")

print("\n7. 时区处理:")
ts_utc = pd.Timestamp('2023-01-01 12:00:00', tz='UTC')
print(f" UTC时间: {ts_utc}")
# tz_convert: change zone of an aware timestamp.
ts_est = ts_utc.tz_convert('US/Eastern')
print(f" 转换为美国东部时间: {ts_est}")
# tz_localize: attach a zone to a naive timestamp.
ts_naive = pd.Timestamp('2023-01-01 12:00:00')
ts_localized = ts_naive.tz_localize('Asia/Shanghai')
print(f" 本地化为上海时间: {ts_localized}")
7. 数据可视化
# data_visualization.py
"""Plot pandas data with matplotlib: line, bar, pie, scatter,
correlation heatmap, box/hist/density, decomposition, and dual-axis."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

print("=== Pandas数据可视化 ===\n")

# Reproducible synthetic sales data.
np.random.seed(42)
dates = pd.date_range('2023-01-01', periods=100, freq='D')
sales_data = pd.DataFrame({
    '日期': dates,
    '销售额': np.random.randn(100).cumsum() + 100,
    '订单数': np.random.randint(10, 100, 100),
    '产品类别': np.random.choice(['A', 'B', 'C'], 100),
    '地区': np.random.choice(['北京', '上海', '广州', '深圳'], 100)
})

print("1. 线图:")
plt.figure(figsize=(12, 8))
# Subplot 1: raw sales trend.
plt.subplot(2, 2, 1)
sales_data.set_index('日期')['销售额'].plot(title='销售额趋势', color='blue')
plt.xlabel('日期')
plt.ylabel('销售额')
plt.grid(True, alpha=0.3)
print(" 绘制销售额趋势线图")
# Subplot 2: 7-day rolling mean.
plt.subplot(2, 2, 2)
sales_data.set_index('日期')['销售额'].rolling(window=7).mean().plot(
    title='7日移动平均', color='red')
plt.xlabel('日期')
plt.ylabel('滚动平均')
plt.grid(True, alpha=0.3)
print(" 绘制7日移动平均线图")

print("\n2. 柱状图:")
category_sales = sales_data.groupby('产品类别')['销售额'].sum()
# Subplot 3: bar chart per product category.
plt.subplot(2, 2, 3)
category_sales.plot(kind='bar', title='按产品类别销售额', color=['red', 'green', 'blue'])
plt.xlabel('产品类别')
plt.ylabel('销售额')
plt.xticks(rotation=0)
plt.grid(True, alpha=0.3, axis='y')
print(" 绘制按产品类别的销售额柱状图")

print("\n3. 饼图:")
region_orders = sales_data.groupby('地区')['订单数'].sum()
# Subplot 4: pie chart of orders per region.
plt.subplot(2, 2, 4)
region_orders.plot(
    kind='pie', title='各地区订单数占比', autopct='%1.1f%%',
    startangle=90, explode=(0.05, 0.05, 0.05, 0.05))
plt.ylabel('')  # suppress the default ylabel
print(" 绘制各地区订单数占比饼图")
plt.tight_layout()
plt.show()

print("\n4. 散点图和相关性:")
product_data = pd.DataFrame({
    '价格': np.random.uniform(10, 100, 50),
    '销量': np.random.uniform(100, 1000, 50),
    '广告投入': np.random.uniform(1000, 10000, 50),
    '产品类型': np.random.choice(['高', '中', '低'], 50)
})
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Plain scatter.
axes[0].scatter(product_data['价格'], product_data['销量'], alpha=0.6)
axes[0].set_xlabel('价格')
axes[0].set_ylabel('销量')
axes[0].set_title('价格 vs 销量')
axes[0].grid(True, alpha=0.3)
# Scatter colored by product type.
colors = {'高': 'red', '中': 'green', '低': 'blue'}
for level, color in colors.items():
    subset = product_data[product_data['产品类型'] == level]
    axes[1].scatter(subset['广告投入'], subset['销量'], color=color, label=level, alpha=0.6)
axes[1].set_xlabel('广告投入')
axes[1].set_ylabel('销量')
axes[1].set_title('广告投入 vs 销量(按产品类型)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
# Correlation heatmap with numeric annotations.
correlation = product_data[['价格', '销量', '广告投入']].corr()
im = axes[2].imshow(correlation, cmap='coolwarm', vmin=-1, vmax=1)
axes[2].set_xticks(range(len(correlation.columns)))
axes[2].set_yticks(range(len(correlation.columns)))
axes[2].set_xticklabels(correlation.columns)
axes[2].set_yticklabels(correlation.columns)
axes[2].set_title('相关矩阵热力图')
for i in range(len(correlation.columns)):
    for j in range(len(correlation.columns)):
        text = axes[2].text(j, i, f'{correlation.iloc[i, j]:.2f}',
                            ha="center", va="center", color="black")
plt.colorbar(im, ax=axes[2])
plt.tight_layout()
plt.show()

print("\n5. 箱线图和直方图:")
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Box plot grouped by category.
sales_data.boxplot(column='销售额', by='产品类别', ax=axes[0])
axes[0].set_title('各产品类别销售额箱线图')
axes[0].set_xlabel('产品类别')
axes[0].set_ylabel('销售额')
axes[0].grid(True, alpha=0.3)
# Histogram.
axes[1].hist(sales_data['销售额'], bins=30, edgecolor='black', alpha=0.7)
axes[1].set_xlabel('销售额')
axes[1].set_ylabel('频数')
axes[1].set_title('销售额分布直方图')
axes[1].grid(True, alpha=0.3)
# Kernel density estimate.
sales_data['销售额'].plot(kind='density', ax=axes[2])
axes[2].set_xlabel('销售额')
axes[2].set_ylabel('密度')
axes[2].set_title('销售额密度图')
axes[2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n6. 时间序列分解图:")
# Synthetic series: trend + 12-period sinusoid + noise.
np.random.seed(42)
n = 100
t = np.arange(n)
trend = 0.1 * t
seasonal = 5 * np.sin(2 * np.pi * t / 12)
noise = np.random.randn(n) * 2
data = trend + seasonal + noise
dates = pd.date_range('2023-01-01', periods=n, freq='M')
ts = pd.Series(data, index=dates)
# Simple decomposition via a centered 12-month rolling mean.
rolling_mean = ts.rolling(window=12, center=True).mean()
trend_component = rolling_mean
seasonal_component = ts - trend_component
fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
axes[0].plot(ts, label='原始数据', color='blue')
axes[0].set_ylabel('原始数据')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[1].plot(trend_component, label='趋势成分', color='red')
axes[1].set_ylabel('趋势')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[2].plot(seasonal_component, label='季节性成分', color='green')
axes[2].set_ylabel('季节性')
axes[2].legend()
axes[2].grid(True, alpha=0.3)
residual = ts - trend_component - seasonal_component.mean()
axes[3].plot(residual, label='残差成分', color='purple')
axes[3].set_ylabel('残差')
axes[3].set_xlabel('日期')
axes[3].legend()
axes[3].grid(True, alpha=0.3)
plt.suptitle('时间序列分解')
plt.tight_layout()
plt.show()

print("\n7. 多轴图:")
dates = pd.date_range('2023-01-01', periods=30, freq='D')
dual_data = pd.DataFrame({
    '日期': dates,
    '销售额': np.random.randn(30).cumsum() + 100,
    '订单数': np.random.randint(10, 100, 30)
}).set_index('日期')
fig, ax1 = plt.subplots(figsize=(10, 6))
# Left y-axis: sales.
color1 = 'tab:red'
ax1.set_xlabel('日期')
ax1.set_ylabel('销售额', color=color1)
ax1.plot(dual_data.index, dual_data['销售额'], color=color1)
ax1.tick_params(axis='y', labelcolor=color1)
# Right y-axis: order count, sharing the x-axis.
ax2 = ax1.twinx()
color2 = 'tab:blue'
ax2.set_ylabel('订单数', color=color2)
ax2.plot(dual_data.index, dual_data['订单数'], color=color2)
ax2.tick_params(axis='y', labelcolor=color2)
plt.title('销售额和订单数趋势(双轴图)')
fig.tight_layout()
plt.show()
总结
今天学习了Pandas的基础知识,掌握了以下核心内容:
关键收获
Pandas核心优势
最佳实践
了解数据:使用head()、info()、describe()了解数据概况
处理缺失值:根据业务需求选择填充或删除缺失值
使用向量化操作:避免Python循环,使用Pandas内置函数
内存优化:对于大型数据集,注意数据类型和内存使用
保持数据整洁:遵循整洁数据(Tidy Data)原则
常见错误和解决方案
# common_mistakes.py
"""Common pandas pitfalls and their fixes: chained assignment, mixed
dtypes, memory usage, loop-vs-vectorized operations, and time zones."""
import pandas as pd
import numpy as np

print("=== Pandas常见错误和解决方案 ===\n")

print("1. SettingWithCopyWarning:")
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(f" 正确做法 - 使用.loc:")
# Single .loc assignment avoids the chained-indexing copy ambiguity.
df.loc[df['A'] > 1, 'B'] = 0
print(df)

print("\n2. 数据类型错误:")
# A column mixing numeric strings and words ends up as object dtype.
df = pd.DataFrame({'value': ['1', '2', 'three', '4']})
print(f" 混合数据类型: {df['value'].dtype}")
print(f" 转换为数值(错误处理):")
# errors='coerce' turns unparseable entries into NaN instead of raising.
df['value_numeric'] = pd.to_numeric(df['value'], errors='coerce')
print(df)

print("\n3. 内存使用问题:")
large_df = pd.DataFrame(np.random.randn(1000000, 4), columns=['A', 'B', 'C', 'D'])
print(f" 原始数据类型: {large_df['A'].dtype}")
print(f" 内存使用: {large_df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
# Downcasting to float32 halves that column's footprint.
large_df['A'] = large_df['A'].astype('float32')
print(f" 优化后数据类型: {large_df['A'].dtype}")
print(f" 优化后内存使用: {large_df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

print("\n4. 性能问题:")
df = pd.DataFrame({'A': range(10000), 'B': range(10000)})
# Anti-pattern — a Python loop over rows:
# for i in range(len(df)):
#     df.loc[i, 'C'] = df.loc[i, 'A'] + df.loc[i, 'B']
# Correct approach — a single vectorized expression:
df['C'] = df['A'] + df['B']
print(f" 向量化操作示例: df['A'] + df['B']")

print("\n5. 时区问题:")
ts = pd.Timestamp('2023-01-01 12:00:00')
print(f" 无时区时间: {ts}")
# tz_localize attaches a zone; tz_convert then moves between zones.
ts_utc = ts.tz_localize('UTC')
print(f" 本地化为UTC: {ts_utc}")
print(f" 转换为其他时区: {ts_utc.tz_convert('Asia/Shanghai')}")
明天我们将学习数据清洗技术,这是数据分析中至关重要的一步,确保数据质量和准确性。