1. Pandas简介
什么是Pandas?
Pandas是一个强大的Python数据分析库,建立在NumPy之上,提供了高级数据结构和数据分析工具。它的名字来源于"Panel Data"(面板数据)和"Python Data Analysis"(Python数据分析)。
Pandas的核心特性
# pandas_features.py
"""Print an overview of the core features that pandas provides."""
import pandas as pd

# Feature name -> short Chinese description, in presentation order.
pandas_features = {
    "DataFrame": "二维表格数据结构(类似Excel表格)",
    "Series": "一维带标签数组(类似字典)",
    "数据导入导出": "支持CSV、Excel、SQL、JSON等多种格式",
    "数据清洗": "处理缺失值、重复值、异常值",
    "数据转换": "数据重塑、合并、分组、聚合",
    "时间序列": "专门的时间序列处理功能",
    "可视化": "集成的Matplotlib绘图接口",
    "高性能": "基于NumPy,性能优异",
}

print("Pandas核心特性:")
for feature, description in pandas_features.items():
    print(f" • {feature}: {description}")
Pandas安装
# Basic installation
pip install pandas

# Optional extras — quoted so the square brackets are not treated as
# shell glob patterns (unquoted, zsh errors out and bash may misbehave
# if a matching file exists).
pip install 'pandas[excel]'        # Excel support
pip install 'pandas[parquet]'      # Parquet format support
pip install 'pandas[performance]'  # performance optimizations

# Using conda
conda install pandas

# Development version straight from the repository
pip install git+https://github.com/pandas-dev/pandas.git
2. Pandas核心数据结构
Series数据结构
# series_basics.py
"""Walk through pandas Series basics: creation, attributes, indexing,
arithmetic alignment, common methods, and function application.

Fix: positional access via ``s[1]`` on a label-indexed Series is
deprecated (removed in pandas 3.0, where ``[]`` always means label
lookup) — use ``s.iloc[1]`` for positional access instead.
"""
import pandas as pd
import numpy as np

print("=== Pandas Series基础 ===\n")

print("1. 创建Series:")
# From a plain list (NaN allowed)
s1 = pd.Series([1, 3, 5, np.nan, 6, 8])
print(f" 从列表创建: \n{s1}")
# From a NumPy array
arr = np.array([10, 20, 30, 40])
s2 = pd.Series(arr)
print(f"\n 从NumPy数组创建: \n{s2}")
# From a dict (keys become the index)
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3})
print(f"\n 从字典创建: \n{s3}")
# With an explicit index
s4 = pd.Series([100, 200, 300], index=['x', 'y', 'z'])
print(f"\n 指定索引: \n{s4}")

print("\n2. Series属性:")
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(f" Series:\n{s}")
print(f"\n 索引: {s.index}")
print(f" 值: {s.values}")
print(f" 数据类型: {s.dtype}")
print(f" 形状: {s.shape}")
print(f" 大小: {s.size}")
print(f" 是否为空: {s.empty}")

print("\n3. Series索引和切片:")
s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])
print(f" Series:\n{s}")
print(f"\n 标签索引 s['b']: {s['b']}")
# Positional access must go through .iloc on a labeled Series.
print(f" 位置索引 s.iloc[1]: {s.iloc[1]}")
# Label slices are inclusive of both endpoints.
print(f" 切片 s['b':'d']: \n{s['b':'d']}")
print(f" 布尔索引 s[s > 25]: \n{s[s > 25]}")
print(f" 花式索引 s[['a', 'c', 'e']]: \n{s[['a', 'c', 'e']]}")

print("\n4. Series运算:")
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'e', 'f'])
print(f" s1:\n{s1}")
print(f"\n s2:\n{s2}")
# Arithmetic aligns on the index; non-overlapping labels produce NaN.
print(f"\n s1 + s2 (自动对齐索引):\n{s1 + s2}")
print(f" s1 * 2:\n{s1 * 2}")
print(f" s1 + 10:\n{s1 + 10}")

print("\n5. Series方法:")
s = pd.Series([1, 2, 3, 4, 5, np.nan, 7, 8])
print(f" Series:\n{s}")
print(f"\n 统计方法:")
print(f" sum(): {s.sum()}")
print(f" mean(): {s.mean()}")
print(f" std(): {s.std()}")
print(f" min(): {s.min()}")
print(f" max(): {s.max()}")
print(f" count(): {s.count()} (非NaN数量)")
print(f" isnull().sum(): {s.isnull().sum()} (NaN数量)")
print(f"\n 数据处理方法:")
print(f" fillna(0):\n{s.fillna(0)}")
print(f" dropna():\n{s.dropna()}")
print(f" cumsum():\n{s.cumsum()}")

print("\n6. Series应用函数:")
s = pd.Series([1, 4, 9, 16, 25])
# NumPy ufuncs operate elementwise on a Series directly.
print(f" sqrt(s):\n{np.sqrt(s)}")
# apply() runs a Python callable per element (slower than vectorized forms).
print(f" s.apply(lambda x: x**0.5):\n{s.apply(lambda x: x**0.5)}")
# Vectorized arithmetic is the idiomatic, fastest option.
print(f" s ** 0.5:\n{s ** 0.5}")
DataFrame数据结构
# dataframe_basics.py
"""Walk through pandas DataFrame basics: creation, attributes, viewing,
column/row operations, filtering, sorting, and index resetting.

Fix: ``DataFrame.append`` was removed in pandas 2.0 — rows are now added
with ``pd.concat``.
"""
import pandas as pd
import numpy as np

print("=== Pandas DataFrame基础 ===\n")

print("1. 创建DataFrame:")
# From a dict of equal-length columns
data = {
    '姓名': ['张三', '李四', '王五', '赵六'],
    '年龄': [25, 30, 35, 28],
    '城市': ['北京', '上海', '广州', '深圳'],
    '工资': [50000, 60000, 55000, 65000]
}
df = pd.DataFrame(data)
print(f" 从字典创建:\n{df}")
# From a list of row lists with explicit column names
data_list = [
    ['苹果', '水果', 5.0, 100],
    ['香蕉', '水果', 3.0, 200],
    ['胡萝卜', '蔬菜', 2.5, 150],
    ['牛肉', '肉类', 30.0, 50]
]
df2 = pd.DataFrame(data_list, columns=['商品', '类别', '价格', '库存'])
print(f"\n 从列表创建:\n{df2}")
# From a NumPy array
arr = np.random.rand(4, 3)
df3 = pd.DataFrame(arr, columns=['A', 'B', 'C'])
print(f"\n 从NumPy数组创建:\n{df3}")

print("\n2. DataFrame属性:")
print(f" 形状: {df.shape}")
print(f" 维度: {df.ndim}")
print(f" 大小: {df.size}")
print(f" 列名: {df.columns}")
print(f" 索引: {df.index}")
print(f" 数据类型:\n{df.dtypes}")
print(f"\n 信息概览:")
print(df.info())  # info() prints and returns None
print(f"\n 统计描述:\n{df.describe()}")

print("\n3. 数据查看:")
print(f" 前2行:\n{df.head(2)}")
print(f"\n 后2行:\n{df.tail(2)}")
print(f"\n 随机3行:\n{df.sample(3, random_state=42)}")

print("\n4. 列操作:")
print(f" 选择单列 df['姓名']:\n{df['姓名']}")
print(f"\n 选择多列 df[['姓名', '年龄']]:\n{df[['姓名', '年龄']]}")
# Add a derived column
df['年薪'] = df['工资'] * 12
print(f"\n 添加年薪列:\n{df}")
# Modify a column in place
df['城市'] = df['城市'] + '市'
print(f"\n 修改城市列:\n{df}")
# drop() returns a new frame; df itself is untouched here
df_dropped = df.drop('年薪', axis=1)
print(f"\n 删除年薪列:\n{df_dropped}")

print("\n5. 行操作:")
print(f" 通过索引选择 df.iloc[1]:\n{df.iloc[1]}")
print(f"\n 通过标签选择 df.loc[0]:\n{df.loc[0]}")
# DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
new_row = {'姓名': '钱七', '年龄': 32, '城市': '成都', '工资': 58000, '年薪': 696000}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
print(f"\n 添加新行:\n{df}")
# Modify cells by label
df.loc[0, '工资'] = 52000
df.loc[0, '年薪'] = 624000
print(f"\n 修改第一行工资:\n{df}")
# Drop the row whose index label is 2 (leaves a gap in the index)
df = df.drop(2)
print(f"\n 删除索引为2的行:\n{df}")

print("\n6. 条件筛选:")
print(f" 年龄大于28:\n{df[df['年龄'] > 28]}")
print(f"\n 城市包含'北':\n{df[df['城市'].str.contains('北')]}")
print(f"\n 年龄大于28且工资大于55000:\n{df[(df['年龄'] > 28) & (df['工资'] > 55000)]}")
print(f"\n 城市为北京或上海:\n{df[df['城市'].isin(['北京市', '上海市'])]}")

print("\n7. 排序:")
print(f" 按年龄升序:\n{df.sort_values('年龄')}")
print(f"\n 按工资降序:\n{df.sort_values('工资', ascending=False)}")
print(f"\n 先按城市升序,再按年龄降序:\n{df.sort_values(['城市', '年龄'], ascending=[True, False])}")

print("\n8. 重置索引:")
df_reset = df.reset_index(drop=True)
print(f" 重置索引:\n{df_reset}")
3. 数据导入导出
数据读取和保存
# data_io.py
"""Demonstrate pandas I/O: CSV, Excel, JSON, SQL, binary formats, and
reading from the network, then clean up all generated files.

Fixes: the network read used a bare ``except:`` (now ``except
Exception:`` so KeyboardInterrupt/SystemExit still propagate), and the
Parquet/HDF5 sections are guarded so a missing optional engine
(pyarrow / tables) no longer crashes the whole script.
"""
import pandas as pd
import numpy as np
import os

print("=== Pandas数据导入导出 ===\n")

# Sample data shared by all the examples below.
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 28, 32],
    'City': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
    'Salary': [50000, 60000, 55000, 70000, 65000]
}
df = pd.DataFrame(data)

print("1. CSV文件操作:")
df.to_csv('employees.csv', index=False)
print(f" 保存到 employees.csv")
df_csv = pd.read_csv('employees.csv')
print(f"\n 读取CSV文件:\n{df_csv}")
# Restrict to selected columns and the first three rows at read time.
df_csv_params = pd.read_csv('employees.csv', usecols=['ID', 'Name', 'Age'], nrows=3)
print(f"\n 读取CSV(指定列和前3行):\n{df_csv_params}")

print("\n2. Excel文件操作:")
df.to_excel('employees.xlsx', sheet_name='员工信息', index=False)
print(f" 保存到 employees.xlsx")
df_excel = pd.read_excel('employees.xlsx', sheet_name='员工信息')
print(f"\n 读取Excel文件:\n{df_excel}")
# ExcelWriter lets several sheets go into one workbook.
with pd.ExcelWriter('multiple_sheets.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    df.head(3).to_excel(writer, sheet_name='Sheet2', index=False)
print(f" 创建多sheet的Excel文件")

print("\n3. JSON文件操作:")
df.to_json('employees.json', orient='records')
print(f" 保存到 employees.json")
df_json = pd.read_json('employees.json')
print(f"\n 读取JSON文件:\n{df_json}")

print("\n4. SQL数据库操作:")
# 需要安装sqlalchemy和数据库驱动
# pip install sqlalchemy
# pip install pymysql     # MySQL
# pip install psycopg2    # PostgreSQL
try:
    from sqlalchemy import create_engine
    # In-memory SQLite keeps the example self-contained.
    engine = create_engine('sqlite:///:memory:')
    df.to_sql('employees', engine, if_exists='replace', index=False)
    print(f" 保存到SQLite数据库")
    df_sql = pd.read_sql('SELECT * FROM employees', engine)
    print(f"\n 从数据库读取:\n{df_sql}")
    df_query = pd.read_sql_query('SELECT Name, Age FROM employees WHERE Age > 30', engine)
    print(f"\n 执行查询:\n{df_query}")
except ImportError:
    print(" 需要安装sqlalchemy才能运行SQL示例")

print("\n5. 其他格式:")
# Parquet (efficient columnar storage) — requires pyarrow or fastparquet.
try:
    df.to_parquet('employees.parquet')
    print(f" 保存为Parquet格式")
    df_parquet = pd.read_parquet('employees.parquet')
except ImportError:
    print(" 需要安装pyarrow才能运行Parquet示例")
# HDF5 (common in scientific computing) — requires PyTables.
try:
    df.to_hdf('employees.h5', key='df', mode='w')
    print(f" 保存为HDF5格式")
    df_hdf = pd.read_hdf('employees.h5')
except ImportError:
    print(" 需要安装tables才能运行HDF5示例")
# Pickle (Python object serialization) — stdlib only, always available.
df.to_pickle('employees.pkl')
print(f" 保存为Pickle格式")
df_pickle = pd.read_pickle('employees.pkl')

print("\n6. 网络数据:")
try:
    url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
    df_titanic = pd.read_csv(url)
    print(f" 从网络读取Titanic数据集,形状: {df_titanic.shape}")
except Exception:
    # Narrowed from a bare except: network/parse errors only, not SystemExit.
    print(" 网络读取示例需要网络连接")

print("\n7. 清理生成的文件:")
files_to_remove = [
    'employees.csv', 'employees.xlsx', 'multiple_sheets.xlsx',
    'employees.json', 'employees.parquet', 'employees.h5', 'employees.pkl'
]
for file in files_to_remove:
    if os.path.exists(file):
        os.remove(file)
        print(f" 已删除: {file}")
处理大型数据集
# large_data.py
"""Techniques for working with large CSV files: chunked reads, column
subsetting, explicit dtypes, row filtering, and memory optimization.

Fix: ``Series.to_sparse()`` was removed in pandas 1.0 — a sparse Series
is now created via ``astype(pd.SparseDtype(...))``. The stray bottom
``import os`` was also hoisted to the top of the file.
"""
import os

import pandas as pd
import numpy as np

print("=== 处理大型数据集 ===\n")

print("1. 分块读取:")
# For large CSV files, read in chunks instead of all at once.
chunk_size = 1000
chunks = []
# Build a demo CSV with 10k rows.
large_data = pd.DataFrame({
    'id': range(10000),
    'value': np.random.randn(10000),
    'category': np.random.choice(['A', 'B', 'C', 'D'], 10000)
})
large_data.to_csv('large_data.csv', index=False)
print(f" 分块读取large_data.csv (每块{chunk_size}行)")
for chunk in pd.read_csv('large_data.csv', chunksize=chunk_size):
    chunks.append(chunk)
    print(f" 读取块 {len(chunks)},形状: {chunk.shape}")
df_combined = pd.concat(chunks, ignore_index=True)
print(f"\n 合并后形状: {df_combined.shape}")

print("\n2. 选择列子集:")
# Only parse the columns you actually need.
df_subset = pd.read_csv('large_data.csv', usecols=['id', 'category'])
print(f" 只读取id和category列,形状: {df_subset.shape}")

print("\n3. 指定数据类型:")
# Explicit dtypes (narrow ints/floats, 'category') reduce memory use.
dtypes = {
    'id': 'int32',
    'value': 'float32',
    'category': 'category'
}
df_typed = pd.read_csv('large_data.csv', dtype=dtypes)
print(f" 指定数据类型后:")
print(f" 内存使用: {df_typed.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
print(f" 数据类型:\n{df_typed.dtypes}")

print("\n4. 过滤行:")
# Option 1: skiprows (range starts at 1 so the header row is kept).
print(f" 跳过前1000行:")
df_skip = pd.read_csv('large_data.csv', skiprows=range(1, 1001))
print(f" 形状: {df_skip.shape}")
# Option 2: nrows caps how many data rows are read.
print(f"\n 只读取前1000行:")
df_nrows = pd.read_csv('large_data.csv', nrows=1000)
print(f" 形状: {df_nrows.shape}")

print("\n5. 内存优化技巧:")
# Categorical dtype for low-cardinality string columns.
df = pd.read_csv('large_data.csv')
df['category'] = df['category'].astype('category')
# Sparse storage suits data that is mostly a single fill value (here 0).
# Series.to_sparse() was removed in pandas 1.0; use SparseDtype instead.
sparse_series = pd.Series([0, 0, 1, 0, 0, 0, 0, 0, 0, 1]).astype(pd.SparseDtype("int64", 0))
print(f" 稀疏Series:\n{sparse_series}")
print(f" 稀疏Series内存使用: {sparse_series.memory_usage(deep=True)} 字节")
# Dropping unneeded columns releases their memory.
df_reduced = df.drop('value', axis=1)
print(f"\n 删除value列后内存: {df_reduced.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

print("\n6. 清理文件:")
if os.path.exists('large_data.csv'):
    os.remove('large_data.csv')
    print(f" 已删除large_data.csv")
4. 数据清洗和预处理
处理缺失值
# missing_values.py
"""Detect, drop, fill, and interpolate missing values, then show
imputation and missingness-pattern analysis.

Fixes for pandas >= 2.0: ``df.fillna(df.mean())`` raised TypeError on
the object column 'D' (now ``df.mean(numeric_only=True)``);
``fillna(method='ffill'/'bfill')`` is deprecated (now ``ffill()`` /
``bfill()``); the scikit-learn example is guarded so a missing sklearn
no longer crashes the whole script.
"""
import pandas as pd
import numpy as np

print("=== 处理缺失值 ===\n")

# Mixed numeric/object frame with NaN in columns A, B, and D.
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': [1, 2, 3, 4, 5],
    'D': ['a', 'b', np.nan, 'd', 'e']
}
df = pd.DataFrame(data)
print("原始数据:")
print(df)
print(f"\n缺失值统计:")
print(df.isnull().sum())

print("\n1. 检测缺失值:")
print(f" df.isnull():\n{df.isnull()}")
print(f"\n df.notnull():\n{df.notnull()}")
print(f"\n 每列缺失值数量:\n{df.isnull().sum()}")
print(f"\n 总缺失值数量: {df.isnull().sum().sum()}")
print(f" 是否有缺失值: {df.isnull().values.any()}")

print("\n2. 删除缺失值:")
print(" 删除包含缺失值的行:")
print(f" df.dropna():\n{df.dropna()}")
print("\n 删除包含缺失值的列:")
print(f" df.dropna(axis=1):\n{df.dropna(axis=1)}")
print("\n 只删除所有值都为NaN的行:")
print(f" df.dropna(how='all'):\n{df.dropna(how='all')}")
print("\n 只删除在特定列有缺失值的行:")
print(f" df.dropna(subset=['A', 'B']):\n{df.dropna(subset=['A', 'B'])}")

print("\n3. 填充缺失值:")
print(" 用固定值填充:")
print(f" df.fillna(0):\n{df.fillna(0)}")
print("\n 用每列的平均值填充:")
# numeric_only=True: object column 'D' has no mean and would raise.
print(f" df.fillna(df.mean(numeric_only=True)):\n{df.fillna(df.mean(numeric_only=True))}")
print("\n 前向填充:")
# fillna(method=...) is deprecated; ffill()/bfill() are the replacements.
print(f" df.ffill():\n{df.ffill()}")
print("\n 后向填充:")
print(f" df.bfill():\n{df.bfill()}")
print("\n 不同列用不同值填充:")
fill_values = {'A': df['A'].mean(), 'B': 0, 'D': 'missing'}
print(f" df.fillna(fill_values):\n{df.fillna(fill_values)}")

print("\n4. 插值:")
# A daily time series with gaps, for interpolation demos.
dates = pd.date_range('2023-01-01', periods=10, freq='D')
ts_data = pd.Series([1, np.nan, np.nan, np.nan, 5, np.nan, 7, 8, np.nan, 10], index=dates)
print(" 时间序列数据:")
print(ts_data)
print("\n 线性插值:")
print(f" ts_data.interpolate():\n{ts_data.interpolate()}")
print("\n 时间插值:")
# method='time' weights by the actual time deltas in the index.
print(f" ts_data.interpolate(method='time'):\n{ts_data.interpolate(method='time')}")

print("\n5. 高级缺失值处理:")
# Machine-learning based imputation (optional dependency).
try:
    from sklearn.impute import SimpleImputer
    num_data = df[['A', 'B', 'C']].values
    imputer = SimpleImputer(strategy='mean')
    imputed_data = imputer.fit_transform(num_data)
    df_imputed = pd.DataFrame(imputed_data, columns=['A', 'B', 'C'])
    print(f" 使用Scikit-learn的SimpleImputer:\n{df_imputed}")
except ImportError:
    print(" 需要安装scikit-learn才能运行SimpleImputer示例")

print("\n6. 缺失值模式分析:")
# 1 marks a missing cell; correlations reveal co-missing columns.
missing_pattern = df.isnull().astype(int)
print(f" 缺失值模式(1表示缺失):\n{missing_pattern}")
missing_corr = df.isnull().corr()
print(f"\n 缺失值相关性:\n{missing_corr}")
处理重复值和异常值
# duplicates_outliers.py
"""Handle duplicate rows, detect and treat outliers, transform skewed
data, and normalize a numeric column."""
import pandas as pd
import numpy as np

print("=== 处理重复值和异常值 ===\n")

print("1. 处理重复值:")
# Rows 0/4 and 1/5 are exact duplicates on purpose.
data = {
    'id': [1, 2, 3, 4, 1, 2, 7, 8],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Alice', 'Bob', 'Eve', 'Frank'],
    'score': [85, 90, 78, 92, 85, 90, 88, 95]
}
df = pd.DataFrame(data)
print(" 原始数据:")
print(df)
print(f"\n 检测重复行:")
print(f" df.duplicated():\n{df.duplicated()}")
print(f"\n 基于特定列的重复检测:")
print(f" df.duplicated(subset=['id', 'name']):\n{df.duplicated(subset=['id', 'name'])}")
print(f"\n 删除重复行:")
print(f" df.drop_duplicates():\n{df.drop_duplicates()}")
print(f"\n 基于特定列删除重复行:")
print(f" df.drop_duplicates(subset=['id', 'name']):\n{df.drop_duplicates(subset=['id', 'name'])}")
print(f"\n 保留第一次出现的重复行(默认):")
print(f" df.drop_duplicates(keep='first'):\n{df.drop_duplicates(keep='first')}")
print(f"\n 保留最后一次出现的重复行:")
print(f" df.drop_duplicates(keep='last'):\n{df.drop_duplicates(keep='last')}")
print(f"\n 删除所有重复行:")
print(f" df.drop_duplicates(keep=False):\n{df.drop_duplicates(keep=False)}")

print("\n2. 处理异常值:")
# 95 points from N(50, 10) plus 5 planted extremes.
np.random.seed(42)
normal_data = np.random.normal(50, 10, 95)
outliers = np.array([120, 130, 140, -20, -30])
all_data = np.concatenate([normal_data, outliers])
df_outliers = pd.DataFrame({'value': all_data})
print(" 包含异常值的数据:")
print(f" 形状: {df_outliers.shape}")
print(f" 统计描述:\n{df_outliers.describe()}")

print("\n 可视化数据分布:")
# Histogram + box plot side by side.
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.subplot(1, 2, 1)
plt.hist(df_outliers['value'], bins=30, edgecolor='black')
plt.title('直方图')
plt.xlabel('值')
plt.ylabel('频数')
plt.subplot(1, 2, 2)
plt.boxplot(df_outliers['value'])
plt.title('箱线图')
plt.ylabel('值')
plt.tight_layout()
plt.show()

print("\n 检测异常值方法:")
# Detection method 1: absolute Z-score above a threshold.
from scipy import stats
z_scores = np.abs(stats.zscore(df_outliers['value']))
threshold = 3
outliers_z = df_outliers[z_scores > threshold]
print(f"\n Z-score方法(阈值={threshold}):")
print(f" 检测到 {len(outliers_z)} 个异常值")
print(f" 异常值:\n{outliers_z}")
# Detection method 2: 1.5*IQR fences around the quartiles.
Q1 = df_outliers['value'].quantile(0.25)
Q3 = df_outliers['value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = df_outliers[(df_outliers['value'] < lower_bound) | (df_outliers['value'] > upper_bound)]
print(f"\n IQR方法:")
print(f" Q1={Q1:.2f}, Q3={Q3:.2f}, IQR={IQR:.2f}")
print(f" 边界: [{lower_bound:.2f}, {upper_bound:.2f}]")
print(f" 检测到 {len(outliers_iqr)} 个异常值")
print(f" 异常值:\n{outliers_iqr}")

print("\n 处理异常值:")
# Treatment 1: drop the flagged rows.
df_no_outliers = df_outliers[(z_scores <= threshold)]
print(f" 删除异常值后形状: {df_no_outliers.shape}")
# Treatment 2: winsorize (clamp the tails at the 5% quantiles).
from scipy.stats.mstats import winsorize
winsorized = winsorize(df_outliers['value'], limits=[0.05, 0.05])
df_winsorized = pd.DataFrame({'value': winsorized})
print(f"\n 截断处理(限制5%):")
print(f" 统计描述:\n{df_winsorized.describe()}")
# Treatment 3: replace flagged values with the median.
median_value = df_outliers['value'].median()
df_median_replaced = df_outliers.copy()
df_median_replaced['value'] = np.where(z_scores > threshold, median_value, df_outliers['value'])
print(f"\n 用中位数替换异常值:")
print(f" 统计描述:\n{df_median_replaced.describe()}")

print("\n3. 数据转换:")
# Exponential data is right-skewed; try log and Box-Cox transforms.
skewed_data = np.random.exponential(scale=2, size=1000)
df_skewed = pd.DataFrame({'value': skewed_data})
print(" 偏态数据:")
print(f" 偏度: {df_skewed['value'].skew():.2f}")
print(f" 峰度: {df_skewed['value'].kurtosis():.2f}")
df_skewed['log_value'] = np.log1p(df_skewed['value'])
print(f"\n 对数变换后偏度: {df_skewed['log_value'].skew():.2f}")
from scipy import stats
transformed, lambda_value = stats.boxcox(df_skewed['value'] + 1)  # +1 keeps inputs positive
df_skewed['boxcox_value'] = transformed
print(f" Box-Cox变换后偏度: {df_skewed['boxcox_value'].skew():.2f}")
print(f" λ值: {lambda_value:.2f}")

print("\n4. 数据规范化:")
df_norm = df_outliers.copy()
# Min-max scaling to [0, 1].
df_norm['minmax'] = (df_norm['value'] - df_norm['value'].min()) / (df_norm['value'].max() - df_norm['value'].min())
# Z-score standardization.
df_norm['zscore'] = (df_norm['value'] - df_norm['value'].mean()) / df_norm['value'].std()
print(" 规范化后的数据(前5行):")
print(df_norm.head())
5. 数据转换和操作
数据重塑
# data_transformation.py
"""Reshape and combine data: pivot tables, melt, stack/unstack,
groupby, merge/join/concat, sorting and ranking."""
import pandas as pd
import numpy as np

print("=== 数据转换和重塑 ===\n")

print("1. 数据透视表(Pivot Table):")
# Monthly sales: product/region repeat in lockstep every 3 rows.
sales_data = {
    '日期': pd.date_range('2023-01-01', periods=12, freq='M'),
    '产品': ['A', 'B', 'C'] * 4,
    '地区': ['北京', '上海', '广州'] * 4,
    '销售额': np.random.randint(1000, 5000, 12),
    '数量': np.random.randint(10, 100, 12)
}
df = pd.DataFrame(sales_data)
print(" 原始销售数据:")
print(df)
print("\n 创建数据透视表:")
pivot = df.pivot_table(
    values='销售额', index='产品', columns='地区', aggfunc='sum', fill_value=0)
print(f" 按产品和地区汇总销售额:\n{pivot}")
print("\n 多级数据透视表:")
# Different aggregation per value column.
pivot_multi = df.pivot_table(
    values=['销售额', '数量'], index=['产品', '地区'],
    aggfunc={'销售额': 'sum', '数量': 'mean'})
print(f" 按产品和地区的多指标汇总:\n{pivot_multi}")
print("\n 带边缘总计的数据透视表:")
pivot_margins = df.pivot_table(
    values='销售额', index='产品', columns='地区', aggfunc='sum',
    margins=True, margins_name='总计')
print(f" 带总计的数据透视表:\n{pivot_margins}")

print("\n2. 数据融合(Melt):")
# Wide format: one column per subject.
wide_data = {
    '姓名': ['张三', '李四', '王五'],
    '语文': [85, 90, 78],
    '数学': [92, 88, 95],
    '英语': [88, 85, 92]
}
df_wide = pd.DataFrame(wide_data)
print(" 宽格式数据:")
print(df_wide)
print("\n 转换为长格式:")
df_long = df_wide.melt(
    id_vars='姓名', value_vars=['语文', '数学', '英语'],
    var_name='科目', value_name='成绩')
print(f" 长格式数据:\n{df_long}")

print("\n3. 数据堆叠(Stack)和反堆叠(Unstack):")
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
index = pd.MultiIndex.from_arrays(arrays, names=['字母', '数字'])
df_multi = pd.DataFrame({'值': [10, 20, 30, 40]}, index=index)
print(" 多层索引数据:")
print(df_multi)
print("\n 堆叠操作:")
stacked = df_multi.stack()
print(f" 堆叠后:\n{stacked}")
print("\n 反堆叠操作:")
unstacked = df_multi.unstack()
print(f" 反堆叠后:\n{unstacked}")

print("\n4. 数据分组(GroupBy):")
print(" 原始销售数据:")
print(df)
print("\n 按产品分组:")
grouped = df.groupby('产品')
for name, group in grouped:
    print(f"\n 产品 {name}:")
    print(group)
print("\n 分组聚合:")
agg_result = df.groupby('产品').agg({
    '销售额': ['sum', 'mean', 'std'],
    '数量': ['sum', 'mean']
})
print(f" 按产品的聚合统计:\n{agg_result}")
print("\n 多级分组:")
multi_group = df.groupby(['产品', '地区']).agg({'销售额': 'sum'})
print(f" 按产品和地区分组汇总:\n{multi_group}")
print("\n 分组转换:")
# transform keeps the original row count: each row gets its group share.
df['销售额占比'] = df.groupby('产品')['销售额'].transform(lambda x: x / x.sum())
print(f" 添加销售额占比列:\n{df}")
print("\n 分组筛选:")
# filter keeps whole groups whose mean sales exceed 3000.
high_sales = df.groupby('产品').filter(lambda x: x['销售额'].mean() > 3000)
print(f" 筛选高销售额产品记录:\n{high_sales}")

print("\n5. 数据合并(Merge/Join/Concat):")
df1 = pd.DataFrame({
    '员工ID': [1, 2, 3, 4],
    '姓名': ['张三', '李四', '王五', '赵六'],
    '部门': ['技术部', '销售部', '技术部', '市场部']
})
df2 = pd.DataFrame({
    '员工ID': [1, 2, 3, 5],
    '工资': [50000, 60000, 55000, 65000],
    '入职日期': ['2020-01-15', '2019-03-20', '2021-05-10', '2018-08-01']
})
print(" 数据框1:")
print(df1)
print("\n 数据框2:")
print(df2)
print("\n 内连接(Inner Join):")
inner_merge = pd.merge(df1, df2, on='员工ID', how='inner')
print(f" 内连接结果:\n{inner_merge}")
print("\n 左连接(Left Join):")
left_merge = pd.merge(df1, df2, on='员工ID', how='left')
print(f" 左连接结果:\n{left_merge}")
print("\n 右连接(Right Join):")
right_merge = pd.merge(df1, df2, on='员工ID', how='right')
print(f" 右连接结果:\n{right_merge}")
print("\n 外连接(Full Outer Join):")
outer_merge = pd.merge(df1, df2, on='员工ID', how='outer')
print(f" 外连接结果:\n{outer_merge}")
print("\n 连接多个列:")
df3 = pd.DataFrame({
    '部门': ['技术部', '销售部', '市场部'],
    '经理': ['王经理', '李经理', '张经理'],
    '预算': [1000000, 800000, 600000]
})
multi_merge = pd.merge(df1, df3, on='部门', how='left')
print(f" 按部门合并:\n{multi_merge}")
print("\n 连接(Join):")
# join() aligns on the index, so set 员工ID as index first.
df1_indexed = df1.set_index('员工ID')
df2_indexed = df2.set_index('员工ID')
join_result = df1_indexed.join(df2_indexed, how='left')
print(f" Join结果:\n{join_result}")
print("\n 连接(Concat):")
# Vertical concatenation (append one extra employee).
concat_vertical = pd.concat([df1, pd.DataFrame({
    '员工ID': [5],
    '姓名': ['钱七'],
    '部门': ['人事部']
})], ignore_index=True)
print(f" 垂直连接:\n{concat_vertical}")
# Horizontal concatenation (side by side on the default index).
concat_horizontal = pd.concat([df1, df2.drop('员工ID', axis=1)], axis=1)
print(f"\n 水平连接:\n{concat_horizontal}")

print("\n6. 数据排序和排名:")
df = pd.DataFrame({
    '姓名': ['张三', '李四', '王五', '赵六', '钱七'],
    '销售额': [50000, 60000, 45000, 70000, 55000],
    '订单数': [25, 30, 20, 35, 28]
})
print(" 原始数据:")
print(df)
print("\n 按销售额降序排序:")
sorted_df = df.sort_values('销售额', ascending=False)
print(sorted_df)
print("\n 排名:")
# method='min' gives ties the lowest shared rank.
df['销售额排名'] = df['销售额'].rank(ascending=False, method='min')
df['订单数排名'] = df['订单数'].rank(ascending=False, method='min')
print(f" 添加排名列:\n{df}")
6. 时间序列处理
# time_series.py
"""Pandas time-series tour: creation, indexing, resampling, rolling
windows, seasonal decomposition, feature engineering, and time zones.

Fix: the statsmodels import was unguarded at the top of section 5, so a
missing statsmodels crashed the whole script — it is now wrapped in a
try/except ImportError like the other optional-dependency examples.
"""
import pandas as pd
import numpy as np

print("=== Pandas时间序列处理 ===\n")

print("1. 创建时间序列:")
# Parse strings into a DatetimeIndex.
dates = pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03'])
print(f" 从字符串创建: {dates}")
# Generate a daily range.
date_range = pd.date_range('2023-01-01', periods=10, freq='D')
print(f"\n 生成日期范围: {date_range}")
# Hourly frequency ('H' alias; newer pandas prefers 'h' but accepts both).
datetime_range = pd.date_range('2023-01-01', periods=5, freq='H')
print(f"\n 生成小时频率日期范围: {datetime_range}")
ts = pd.Series(np.random.randn(10), index=date_range)
print(f"\n 时间序列数据:\n{ts}")

print("\n2. 时间序列索引和切片:")
dates = pd.date_range('2023-01-01', periods=100, freq='D')
ts = pd.Series(np.random.randn(100), index=dates)
print(f" 时间序列数据(前5行):\n{ts.head()}")
print(f"\n 按日期索引: ts['2023-01-15'] = {ts['2023-01-15']}")
# Partial string indexing selects a whole month.
print(f" 按月份切片: ts['2023-01']:\n{ts['2023-01'].head()}")
print(f" 按日期范围切片: ts['2023-01-15':'2023-01-20']:\n{ts['2023-01-15':'2023-01-20']}")

print("\n3. 时间序列重采样:")
dates = pd.date_range('2023-01-01', periods=90, freq='D')
data = np.random.randn(90).cumsum()  # cumulative sum fakes a trend
ts = pd.Series(data, index=dates)
print(f" 原始日度数据(前5行):\n{ts.head()}")
print(f" 形状: {ts.shape}")
print("\n 重采样为月度数据(取平均值):")
monthly = ts.resample('M').mean()
print(f" 月度数据:\n{monthly}")
print(f" 形状: {monthly.shape}")
print("\n 重采样为周数据(取第一个值):")
weekly = ts.resample('W').first()
print(f" 周数据(前5行):\n{weekly.head()}")
print(f" 形状: {weekly.shape}")
print("\n 重采样为周数据(不同聚合方法):")
weekly_agg = ts.resample('W').agg(['mean', 'min', 'max', 'std'])
print(f" 周数据(多种统计):\n{weekly_agg.head()}")

print("\n4. 移动窗口操作:")
dates = pd.date_range('2023-01-01', periods=30, freq='D')
ts = pd.Series(np.random.randn(30), index=dates)
print(f" 原始数据:\n{ts.head()}")
print("\n 移动平均值(窗口大小为3):")
moving_avg = ts.rolling(window=3).mean()
print(f" 移动平均值:\n{moving_avg.head()}")
print("\n 移动标准差:")
moving_std = ts.rolling(window=3).std()
print(f" 移动标准差:\n{moving_std.head()}")
print("\n 扩展窗口(累计平均):")
expanding_mean = ts.expanding().mean()
print(f" 扩展平均:\n{expanding_mean.head()}")
print("\n 指数加权移动平均:")
ewm = ts.ewm(span=3).mean()
print(f" 指数加权移动平均:\n{ewm.head()}")

print("\n5. 时间序列分解:")
# Synthetic series = linear trend + 12-period sinusoid + noise.
np.random.seed(42)
n = 100
t = np.arange(n)
trend = 0.1 * t
seasonal = 5 * np.sin(2 * np.pi * t / 12)
noise = np.random.randn(n) * 2
data = trend + seasonal + noise
dates = pd.date_range('2023-01-01', periods=n, freq='M')
ts = pd.Series(data, index=dates)
print(f" 创建的时间序列(前5行):\n{ts.head()}")
# statsmodels is optional — guard it so the rest of the script still runs.
try:
    from statsmodels.tsa.seasonal import seasonal_decompose
    result = seasonal_decompose(ts, model='additive', period=12)
    print(f"\n 分解结果:")
    print(f" 趋势成分(前5行):\n{result.trend.head()}")
    print(f" 季节性成分(前5行):\n{result.seasonal.head()}")
    print(f" 残差成分(前5行):\n{result.resid.head()}")
except ImportError:
    print(" 需要安装statsmodels才能运行时间序列分解示例")

print("\n6. 时间序列特征工程:")
dates = pd.date_range('2023-01-01', periods=100, freq='D')
ts = pd.Series(np.random.randn(100).cumsum(), index=dates)
df = pd.DataFrame({'value': ts})
print(" 原始时间序列(前5行):")
print(df.head())
print("\n 提取时间特征:")
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['dayofweek'] = df.index.dayofweek  # 0 = Monday, 6 = Sunday
df['quarter'] = df.index.quarter
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
print(f" 添加时间特征后(前5行):\n{df.head()}")
print("\n 滞后特征:")
df['value_lag1'] = df['value'].shift(1)        # lag by 1 day
df['value_lag7'] = df['value'].shift(7)        # lag by 1 week
df['value_diff1'] = df['value'].diff(1)        # first difference
df['value_pct_change'] = df['value'].pct_change()  # percent change
print(f" 添加滞后特征后(前10行):\n{df.head(10)}")
print("\n 滚动统计特征:")
df['value_roll_mean7'] = df['value'].rolling(window=7).mean()
df['value_roll_std7'] = df['value'].rolling(window=7).std()
df['value_roll_min7'] = df['value'].rolling(window=7).min()
df['value_roll_max7'] = df['value'].rolling(window=7).max()
print(f" 添加滚动特征后(前10行):\n{df.head(10)}")

print("\n7. 时区处理:")
ts_utc = pd.Timestamp('2023-01-01 12:00:00', tz='UTC')
print(f" UTC时间: {ts_utc}")
# tz_convert: change zone of an aware timestamp.
ts_est = ts_utc.tz_convert('US/Eastern')
print(f" 转换为美国东部时间: {ts_est}")
# tz_localize: attach a zone to a naive timestamp.
ts_naive = pd.Timestamp('2023-01-01 12:00:00')
ts_localized = ts_naive.tz_localize('Asia/Shanghai')
print(f" 本地化为上海时间: {ts_localized}")
7. 数据可视化
# data_visualization.py
"""Plot pandas data with matplotlib: line, bar, pie, scatter,
correlation heatmap, box/hist/density, decomposition, and dual-axis."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

print("=== Pandas数据可视化 ===\n")

# Reproducible synthetic sales data.
np.random.seed(42)
dates = pd.date_range('2023-01-01', periods=100, freq='D')
sales_data = pd.DataFrame({
    '日期': dates,
    '销售额': np.random.randn(100).cumsum() + 100,
    '订单数': np.random.randint(10, 100, 100),
    '产品类别': np.random.choice(['A', 'B', 'C'], 100),
    '地区': np.random.choice(['北京', '上海', '广州', '深圳'], 100)
})

print("1. 线图:")
plt.figure(figsize=(12, 8))
# Subplot 1: raw sales trend.
plt.subplot(2, 2, 1)
sales_data.set_index('日期')['销售额'].plot(title='销售额趋势', color='blue')
plt.xlabel('日期')
plt.ylabel('销售额')
plt.grid(True, alpha=0.3)
print(" 绘制销售额趋势线图")
# Subplot 2: 7-day rolling mean.
plt.subplot(2, 2, 2)
sales_data.set_index('日期')['销售额'].rolling(window=7).mean().plot(
    title='7日移动平均', color='red')
plt.xlabel('日期')
plt.ylabel('滚动平均')
plt.grid(True, alpha=0.3)
print(" 绘制7日移动平均线图")

print("\n2. 柱状图:")
category_sales = sales_data.groupby('产品类别')['销售额'].sum()
# Subplot 3: bar chart per product category.
plt.subplot(2, 2, 3)
category_sales.plot(kind='bar', title='按产品类别销售额', color=['red', 'green', 'blue'])
plt.xlabel('产品类别')
plt.ylabel('销售额')
plt.xticks(rotation=0)
plt.grid(True, alpha=0.3, axis='y')
print(" 绘制按产品类别的销售额柱状图")

print("\n3. 饼图:")
region_orders = sales_data.groupby('地区')['订单数'].sum()
# Subplot 4: pie chart of orders per region.
plt.subplot(2, 2, 4)
region_orders.plot(
    kind='pie', title='各地区订单数占比', autopct='%1.1f%%',
    startangle=90, explode=(0.05, 0.05, 0.05, 0.05))
plt.ylabel('')  # suppress the default ylabel
print(" 绘制各地区订单数占比饼图")
plt.tight_layout()
plt.show()

print("\n4. 散点图和相关性:")
product_data = pd.DataFrame({
    '价格': np.random.uniform(10, 100, 50),
    '销量': np.random.uniform(100, 1000, 50),
    '广告投入': np.random.uniform(1000, 10000, 50),
    '产品类型': np.random.choice(['高', '中', '低'], 50)
})
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Plain scatter.
axes[0].scatter(product_data['价格'], product_data['销量'], alpha=0.6)
axes[0].set_xlabel('价格')
axes[0].set_ylabel('销量')
axes[0].set_title('价格 vs 销量')
axes[0].grid(True, alpha=0.3)
# Scatter colored by product type.
colors = {'高': 'red', '中': 'green', '低': 'blue'}
for level, color in colors.items():
    subset = product_data[product_data['产品类型'] == level]
    axes[1].scatter(subset['广告投入'], subset['销量'], color=color, label=level, alpha=0.6)
axes[1].set_xlabel('广告投入')
axes[1].set_ylabel('销量')
axes[1].set_title('广告投入 vs 销量(按产品类型)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
# Correlation heatmap with numeric annotations.
correlation = product_data[['价格', '销量', '广告投入']].corr()
im = axes[2].imshow(correlation, cmap='coolwarm', vmin=-1, vmax=1)
axes[2].set_xticks(range(len(correlation.columns)))
axes[2].set_yticks(range(len(correlation.columns)))
axes[2].set_xticklabels(correlation.columns)
axes[2].set_yticklabels(correlation.columns)
axes[2].set_title('相关矩阵热力图')
for i in range(len(correlation.columns)):
    for j in range(len(correlation.columns)):
        text = axes[2].text(j, i, f'{correlation.iloc[i, j]:.2f}',
                            ha="center", va="center", color="black")
plt.colorbar(im, ax=axes[2])
plt.tight_layout()
plt.show()

print("\n5. 箱线图和直方图:")
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Box plot grouped by category.
sales_data.boxplot(column='销售额', by='产品类别', ax=axes[0])
axes[0].set_title('各产品类别销售额箱线图')
axes[0].set_xlabel('产品类别')
axes[0].set_ylabel('销售额')
axes[0].grid(True, alpha=0.3)
# Histogram.
axes[1].hist(sales_data['销售额'], bins=30, edgecolor='black', alpha=0.7)
axes[1].set_xlabel('销售额')
axes[1].set_ylabel('频数')
axes[1].set_title('销售额分布直方图')
axes[1].grid(True, alpha=0.3)
# Kernel density estimate.
sales_data['销售额'].plot(kind='density', ax=axes[2])
axes[2].set_xlabel('销售额')
axes[2].set_ylabel('密度')
axes[2].set_title('销售额密度图')
axes[2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n6. 时间序列分解图:")
# Synthetic series: trend + 12-period sinusoid + noise.
np.random.seed(42)
n = 100
t = np.arange(n)
trend = 0.1 * t
seasonal = 5 * np.sin(2 * np.pi * t / 12)
noise = np.random.randn(n) * 2
data = trend + seasonal + noise
dates = pd.date_range('2023-01-01', periods=n, freq='M')
ts = pd.Series(data, index=dates)
# Simple decomposition via a centered 12-month rolling mean.
rolling_mean = ts.rolling(window=12, center=True).mean()
trend_component = rolling_mean
seasonal_component = ts - trend_component
fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
axes[0].plot(ts, label='原始数据', color='blue')
axes[0].set_ylabel('原始数据')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[1].plot(trend_component, label='趋势成分', color='red')
axes[1].set_ylabel('趋势')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[2].plot(seasonal_component, label='季节性成分', color='green')
axes[2].set_ylabel('季节性')
axes[2].legend()
axes[2].grid(True, alpha=0.3)
residual = ts - trend_component - seasonal_component.mean()
axes[3].plot(residual, label='残差成分', color='purple')
axes[3].set_ylabel('残差')
axes[3].set_xlabel('日期')
axes[3].legend()
axes[3].grid(True, alpha=0.3)
plt.suptitle('时间序列分解')
plt.tight_layout()
plt.show()

print("\n7. 多轴图:")
dates = pd.date_range('2023-01-01', periods=30, freq='D')
dual_data = pd.DataFrame({
    '日期': dates,
    '销售额': np.random.randn(30).cumsum() + 100,
    '订单数': np.random.randint(10, 100, 30)
}).set_index('日期')
fig, ax1 = plt.subplots(figsize=(10, 6))
# Left y-axis: sales.
color1 = 'tab:red'
ax1.set_xlabel('日期')
ax1.set_ylabel('销售额', color=color1)
ax1.plot(dual_data.index, dual_data['销售额'], color=color1)
ax1.tick_params(axis='y', labelcolor=color1)
# Right y-axis: order count, sharing the x-axis.
ax2 = ax1.twinx()
color2 = 'tab:blue'
ax2.set_ylabel('订单数', color=color2)
ax2.plot(dual_data.index, dual_data['订单数'], color=color2)
ax2.tick_params(axis='y', labelcolor=color2)
plt.title('销售额和订单数趋势(双轴图)')
fig.tight_layout()
plt.show()
总结
今天学习了Pandas的基础知识,掌握了以下核心内容:
关键收获
Pandas核心优势
最佳实践
了解数据:使用head()、info()、describe()了解数据概况
处理缺失值:根据业务需求选择填充或删除缺失值
使用向量化操作:避免Python循环,使用Pandas内置函数
内存优化:对于大型数据集,注意数据类型和内存使用
保持数据整洁:遵循整洁数据(Tidy Data)原则
常见错误和解决方案
# common_mistakes.py
"""Common pandas pitfalls and their fixes: chained assignment, mixed
dtypes, memory usage, loop-vs-vectorized operations, and time zones."""
import pandas as pd
import numpy as np

print("=== Pandas常见错误和解决方案 ===\n")

print("1. SettingWithCopyWarning:")
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(f" 正确做法 - 使用.loc:")
# Single .loc assignment avoids the chained-indexing copy ambiguity.
df.loc[df['A'] > 1, 'B'] = 0
print(df)

print("\n2. 数据类型错误:")
# A column mixing numeric strings and words ends up as object dtype.
df = pd.DataFrame({'value': ['1', '2', 'three', '4']})
print(f" 混合数据类型: {df['value'].dtype}")
print(f" 转换为数值(错误处理):")
# errors='coerce' turns unparseable entries into NaN instead of raising.
df['value_numeric'] = pd.to_numeric(df['value'], errors='coerce')
print(df)

print("\n3. 内存使用问题:")
large_df = pd.DataFrame(np.random.randn(1000000, 4), columns=['A', 'B', 'C', 'D'])
print(f" 原始数据类型: {large_df['A'].dtype}")
print(f" 内存使用: {large_df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
# Downcasting to float32 halves that column's footprint.
large_df['A'] = large_df['A'].astype('float32')
print(f" 优化后数据类型: {large_df['A'].dtype}")
print(f" 优化后内存使用: {large_df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

print("\n4. 性能问题:")
df = pd.DataFrame({'A': range(10000), 'B': range(10000)})
# Anti-pattern — a Python loop over rows:
# for i in range(len(df)):
#     df.loc[i, 'C'] = df.loc[i, 'A'] + df.loc[i, 'B']
# Correct approach — a single vectorized expression:
df['C'] = df['A'] + df['B']
print(f" 向量化操作示例: df['A'] + df['B']")

print("\n5. 时区问题:")
ts = pd.Timestamp('2023-01-01 12:00:00')
print(f" 无时区时间: {ts}")
# tz_localize attaches a zone; tz_convert then moves between zones.
ts_utc = ts.tz_localize('UTC')
print(f" 本地化为UTC: {ts_utc}")
print(f" 转换为其他时区: {ts_utc.tz_convert('Asia/Shanghai')}")
明天我们将学习数据清洗技术,这是数据分析中至关重要的一步,确保数据质量和准确性。