import os

# Join path components safely; os.path.join inserts the platform separator.
log_path = os.path.join("/var/log", "nginx", "access.log")
print(log_path)  # /var/log/nginx/access.log (on POSIX systems)

# Check whether the file exists.
if os.path.exists(log_path):
    print("日志文件存在")
else:
    print("日志文件不存在")

# Split the path into its file name and its directory.
print(os.path.basename(log_path))  # access.log
print(os.path.dirname(log_path))   # /var/log/nginx

Python 3.4 引入了 pathlib，用面向对象的方式操作路径，代码更直观：

from pathlib import Path

# Join path segments with the / operator.
log_dir = Path("/var/log") / "nginx"
log_file = log_dir / "access.log"

# Walk /var/log recursively and report every .log file.
for f in Path("/var/log").glob("**/*.log"):
    print(f.name, f.stat().st_size)  # file name + size in bytes

操作	os.path	pathlib
拼接路径	os.path.join(a, b)	Path(a) / b
获取文件名	os.path.basename(p)	Path(p).name
检查存在	os.path.exists(p)	Path(p).exists()
遍历目录	os.listdir(d)	Path(d).iterdir()

⚠️ 常见错误：硬编码路径分隔符。写 "data/" + filename 在 Windows 上会出问题。应该用 os.path.join() 或 Path / filename，让 Python 自动选择正确的分隔符。

② datetime：日期和时间

处理日期和时间是编程中的高频需求——生成时间戳、计算工期、格式化显示。Python 的 datetime 模块提供了四个核心类型。

datetime：日期时间处理

date — 年月日；time — 时分秒；datetime — 年月日时分秒；timedelta — 时间间隔

from datetime import datetime, timedelta

# Current local time.
now = datetime.now()
print(now)  # e.g. 2026-06-28 14:30:00.123456

# Project deadline: 45 days from now.
deadline = now + timedelta(days=45)
print(f"项目截止：{deadline.strftime('%Y-%m-%d')}")

# Days elapsed since January 1 of the current year. The year is taken from
# `now` so the message stays correct whenever the script is run.
start = datetime(now.year, 1, 1)
diff = now - start
print(f"今年已过 {diff.days} 天")  # e.g. 178 when run on 2026-06-28

# Human-readable formatting.
print(now.strftime("%Y年%m月%d日 %H:%M"))  # e.g. 2026年06月28日 14:30

strftime / strptime 常用格式化代码

代码	含义	示例
%Y	四位年份	2026
%m	月份（补零）	06
%d	日期（补零）	28
%H	小时（24小时制）	14
%M	分钟（补零）	30
%S	秒（补零）	05

💡 小贴士：获取时间戳用 datetime.now().timestamp()，但显示给用户时要先用 strftime() 转成可读格式。时间戳是给程序看的，格式化字符串是给人看的。

③ random：随机数

随机抽奖、洗牌、生成验证码、模拟实验……这些场景都需要 random 模块。它提供了各种「靠运气」的功能。

random：随机数生成

random() — 0~1 浮点数；randint(a,b) — 区间整数；choice(seq) — 随机选一个；shuffle(lst) — 打乱列表；sample(pop,k) — 不重复抽样

import random

# Lucky draw: pick 3 distinct winners from the list.
candidates = ["张伟", "李娜", "王磊", "赵敏",
              "陈晨", "刘洋", "周杰", "吴芳"]
winners = random.sample(candidates, 3)
print(f"中奖者：{winners}")

# Build an 8-character random string. Demo only: `random` is predictable, so
# real passwords and tokens should be generated with the `secrets` module.
chars = "abcdefghijklmnopqrstuvwxyz0123456789!@#$"
password = "".join(random.choice(chars) for _ in range(8))
print(f"随机密码：{password}")

# Shuffle the task list in place.
tasks = ["写报告", "开会", "回邮件", "审代码", "部署"]
random.shuffle(tasks)
print(f"今日顺序：{tasks}")

⚠️ 常见错误：random 生成的是「伪随机数」，可以被预测。如果用于密码、令牌、验证码等安全场景，请使用 secrets 模块（Python 3.6+），它使用操作系统提供的密码学安全随机源。

④ collections：高级数据容器

Python 内置的 dict 和 list 已经很强大了，但 collections 模块在它们的基础上提供了更专业的工具——就像从普通工具箱升级到了精密工具箱。

collections：高级数据容器

Counter — 计数器；defaultdict — 带默认值的字典；OrderedDict — 有序字典

Counter 是统计频率的利器。给它一个列表或字符串，它自动计算每个元素出现的次数：

from collections import Counter, defaultdict

# Counter: count how often each error type appears in the log.
errors = ["Timeout", "404", "Timeout", "500",
          "404", "Timeout", "403", "404"]
error_count = Counter(errors)
print(error_count)
# Counter({'Timeout': 3, '404': 3, '500': 1, '403': 1})

# Show the 2 most common entries; ties keep first-seen order.
print(error_count.most_common(2))
# [('Timeout', 3), ('404', 3)]

defaultdict 解决了「键不存在时报错」的问题。访问不存在的键时，它会自动创建一个默认值，而不是抛出 KeyError：

# defaultdict: group employees by department.
groups = defaultdict(list)

employees = [
    ("技术部", "张三"),
    ("市场部", "李四"),
    ("技术部", "王五"),
    ("市场部", "赵六"),
]

for dept, name in employees:
    groups[dept].append(name)  # no need to check whether the key exists first

print(dict(groups))
# {'技术部': ['张三', '王五'], '市场部': ['李四', '赵六']}

场景	普通写法	collections 写法
统计词频	手动循环 + dict 计数	Counter(text.split())
分组数据	if key not in d: d[key]=[]	defaultdict(list)
获取前N名	sorted() + 切片	counter.most_common(N)

⑤ re：正则表达式

正则表达式是一门嵌入在 Python 中的「微型语言」，专门用来匹配文本模式。验证手机号、提取邮箱、替换敏感信息——这些文本处理任务用正则表达式可以一行搞定。

re：正则表达式匹配

re.match() — 从头匹配；re.search() — 搜索第一个；re.findall() — 找所有匹配；re.sub() — 替换

正则语法速查

语法	含义	示例
\d	任意数字	\d{11} 匹配11位数字
\w	字母/数字/下划线	\w+ 匹配一个单词
.	任意字符（除换行）	a.b 匹配 axb、acb
*	0次或多次	ab* 匹配 a、ab、abb
+	1次或多次	ab+ 匹配 ab、abb
?	0次或1次	ab? 匹配 a 或 ab
[]	字符集合	[aeiou] 匹配元音
()	分组捕获	(\d+) 捕获数字串

import re

# Validate a mobile number: 11 digits, first digit 1, second digit 3-9.
phone = "13812345678"
if re.match(r"^1[3-9]\d{9}$", phone):
    print("手机号格式正确")

# Extract every e-mail address from a piece of text.
text = "联系我：alice@gmail.com 或 bob@qq.com"
emails = re.findall(r"[\w.]+@[\w.]+\.\w+", text)
print(emails)  # ['alice@gmail.com', 'bob@qq.com']

# Mask sensitive data: hide the middle 4 digits of a phone number.
msg = "客服电话：13812345678"
hidden = re.sub(r"(\d{3})\d{4}(\d{4})", r"\1****\2", msg)
print(hidden)  # 客服电话：138****5678

⚠️ 常见错误：贪婪匹配 vs 非贪婪匹配。.* 会尽可能多地匹配（贪婪），而 .*? 会尽可能少地匹配（非贪婪）。比如要从 "<b>粗体</b>和<i>斜体</i>" 中逐个匹配标签，<.*> 会从第一个 < 一直匹配到最后一个 >（也就是整个字符串），用 <.*?> 才能正确匹配每个标签。

⑥ json 进阶与 csv 模块

第五篇我们学过 json 的基础用法。这里补充两个实用技巧，再认识一下处理表格数据的 csv 模块。

import json

data = {"姓名": "张三", "城市": "北京"}

# Tip 1: ensure_ascii=False keeps Chinese text readable.
# By default non-ASCII characters are escaped as \uXXXX.
print(json.dumps(data, ensure_ascii=False))
# {"姓名": "张三", "城市": "北京"}

# Tip 2: indent pretty-prints the output.
print(json.dumps(data, ensure_ascii=False, indent=2))
# {
#   "姓名": "张三",
#   "城市": "北京"
# }

csv 模块用来读写 CSV 文件（逗号分隔的表格数据）。其中 DictReader 最方便——直接用列名访问数据：

import csv
from collections import Counter

# Assumed contents of scores.csv:
# 姓名,科目,分数
# 张三,数学,88
# 李四,英语,72
# 王五,数学,95
# 赵六,数学,63
# 陈七,英语,81

# Read the CSV and count how many scores fall into each band.
score_ranges = Counter()

# newline="" lets the csv module do its own newline handling, as the csv
# documentation requires for file objects passed to a reader.
with open("scores.csv", encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        score = int(row["分数"])
        if score >= 90:
            score_ranges["优秀(90+)"] += 1
        elif score >= 70:
            score_ranges["良好(70-89)"] += 1
        else:
            score_ranges["待提高(<70)"] += 1

print(dict(score_ranges))
# {'良好(70-89)': 3, '优秀(90+)': 1, '待提高(<70)': 1}

💡 小贴士：处理表格数据时，csv.DictReader 比普通 csv.reader 更方便——它自动把第一行作为表头，你可以直接用 row["列名"] 访问数据，不需要记住列的索引号。

⑦ 实战项目：数据分析小工具

把今天学的 4 个模块（re、collections、datetime、json）组合起来，做一个完整的日志分析工具：读取日志文本、用正则提取信息、用 Counter 统计错误、用 datetime 计算时间范围、用 json 输出报告。

import re
import json
import os
import random
from datetime import datetime
from collections import Counter


def analyze_log(log_text):
    """Analyze log text and build a summary report.

    Entries are expected in the form
    ``YYYY-MM-DD HH:MM:SS [LEVEL] message``, one per line; text that does not
    match this pattern is skipped.

    Args:
        log_text: Raw log contents as a single string.

    Returns:
        A dict holding the entry count, the time span between the earliest
        and latest entry, per-level counts, the five most frequent words in
        ERROR/CRITICAL messages and the report generation time. When no
        entry matches, ``{"error": "未找到有效日志条目"}`` is returned instead.

    Raises:
        ValueError: If a matched timestamp is not a valid date/time
            (for example month 13).
    """
    # 1. Extract (timestamp, level, message) triples with a regex.
    pattern = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)"
    entries = re.findall(pattern, log_text)

    if not entries:
        return {"error": "未找到有效日志条目"}

    # 2. Count entries per log level.
    level_count = Counter(level for _, level, _ in entries)

    # 3. Time span. min/max are used instead of first/last so that
    #    out-of-order logs do not yield a negative duration.
    fmt = "%Y-%m-%d %H:%M:%S"
    timestamps = [datetime.strptime(ts, fmt) for ts, _, _ in entries]
    duration = max(timestamps) - min(timestamps)

    # 4. Count the words (longer than 3 characters) in error messages.
    error_words = Counter(
        word
        for _, level, msg in entries
        if level in ("ERROR", "CRITICAL")
        for word in msg.split()
        if len(word) > 3
    )

    # 5. Assemble the report.
    report = {
        "总日志数": len(entries),
        "时间跨度": str(duration),
        "各级别统计": dict(level_count),
        "错误高频词": dict(error_words.most_common(5)),
        "报告生成时间": datetime.now().strftime(fmt),
    }
    return report


# Simulated log data: one entry per line, because the message group (.+)
# runs to the end of the line.
sample_log = """2026-06-28 08:00:01 [INFO] 服务启动成功
2026-06-28 08:15:33 [WARNING] 内存使用率超过80%
2026-06-28 08:30:12 [ERROR] 数据库连接超时 timeout
2026-06-28 09:00:45 [INFO] 定时任务执行完成
2026-06-28 09:12:08 [ERROR] 数据库连接超时 timeout
2026-06-28 09:45:22 [CRITICAL] 磁盘空间不足 disk full
2026-06-28 10:00:00 [INFO] 健康检查通过"""

# Run the analysis and print the report as pretty-printed JSON.
report = analyze_log(sample_log)
print(json.dumps(report, ensure_ascii=False, indent=2))