上一篇我们介绍了帧对象,今天我们就来看看生成器的帧对象。标准的系统栈(数组实现)是LIFO结构(后进先出):最后调用的函数,最先执行完毕,帧对象也最先释放。但Python的生成器打破了这个限制——生成器的帧对象会被持久化,直到生成器被关闭。换句话说,同一个生成器对象,从头到尾使用同一个帧对象,无论被调用多少次,帧对象都不会被释放——这就是生成器能“暂停/恢复”的核心秘密。
import gc
import inspect


def gen_numbers(n):
    """Generator that yields the numbers 1..n and can be paused/resumed.

    Sending -1 into the generator makes it dump the current call stack
    (demonstrating that the generator's frame object is reused across
    resumptions) and reset the counter for the next round.
    """
    i = 1
    while True:
        while i <= n:
            # Suspend here and hand i back to the caller; the next
            # next()/send() call resumes execution at exactly this point,
            # in the same frame object.
            x = yield i
            if x == -1:
                # Print the current stack-frame chain; the generator's own
                # frame address is expected to stay constant across rounds.
                print(f"Inner loop done. show stack at level {i}:")
                stack = inspect.stack()
                for frame in stack:
                    print(f" frame at 0x{id(frame[0]):x}, function: {frame[3]}")
                i = 1  # reset i, ready for the next round of values
            else:
                i += 1


def create_frames(generator):
    """Build a multi-level call chain that drives the generator."""
    def level_3(generator):
        # send(-1) resumes the generator and triggers the stack dump
        return generator.send(-1)

    def level_2(generator):
        # next() resumes the generator to fetch the following value
        return next(generator) + level_3(generator)

    def level_1(generator):
        return next(generator) + level_2(generator)

    level_1(generator)  # kick off the call chain


print("=== FRAME ALLOCATION PATTERNS ===")
# Create the generator object exactly once.
generator = gen_numbers(3)
# Run 3 times and observe whether the generator's frame address changes.
for iteration in range(3):
    print(f"\nIteration {iteration + 1}:")
    create_frames(generator=generator)
    # Force a garbage-collection pass between iterations.
    gc.collect()
运行结果如下:
=== FRAME ALLOCATION PATTERNS ===Iteration 1:Inner loop done. show stack at level 3: frame at 0x7f0944582c50, function: gen_numbers frame at 0x7f0944344a90, function: level_3 frame at 0x7f0944535180, function: level_2 frame at 0x7f0944535000, function: level_1 frame at 0x7f0944340880, function: create_frames frame at 0x7f09443407c0, function: <module>Iteration 2:Inner loop done. show stack at level 3: frame at 0x7f0944582c50, function: gen_numbers frame at 0x7f0944344720, function: level_3 frame at 0x7f0944537dc0, function: level_2 frame at 0x7f0944534a00, function: level_1 frame at 0x7f0944340040, function: create_frames frame at 0x7f09443407c0, function: <module>Iteration 3:Inner loop done. show stack at level 3: frame at 0x7f0944582c50, function: gen_numbers frame at 0x7f0944344a90, function: level_3 frame at 0x7f0944535180, function: level_2 frame at 0x7f0944535000, function: level_1 frame at 0x7f0944340880, function: create_frames frame at 0x7f09443407c0, function: <module>
我们来观察「gen_numbers生成器的帧地址」:3次迭代中,生成器的帧地址始终是0x7f0944582c50(你的地址会不同,但始终不变)——这证明了:同一个生成器对象,从头到尾使用同一个帧对象。而其他函数(level_3、level_2等)的帧地址,每次迭代都会变化——它们的帧对象会在调用结束后被释放、复用。
理解了PyFrame对象,我们就能轻松解释一个经典问题:为什么Python的协程,比线程更轻量、性能更高?核心原因:线程的切换,需要操作系统参与(切换系统栈、保存/恢复寄存器、调度线程),还要加上Python GIL的开销;而协程的切换,只需要在Python解释器内部切换PyFrame对象,无需操作系统介入,开销极低。
我们用10000个IO任务做实战对比,用代码直观感受两者的性能差距——代码完整,复制就能运行,重点看CPU、内存、上下文切换的指标。
import time
import asyncio
import os
import resource
from argparse import ArgumentParser
from concurrent.futures import ThreadPoolExecutor, as_completed

# Benchmark configuration.
N_TASKS = 10_000        # total number of tasks
IO_DELAY = 0.1          # simulated IO latency per task, in seconds
MAX_CONCURRENCY = 500   # maximum number of concurrent workers


class PerformanceMonitor:
    """Context manager that measures wall-clock time, CPU time, peak
    memory, and context-switch counts for the code run inside the block.
    """

    def __enter__(self):
        # Snapshot wall clock and resource usage at block entry.
        self.start_perf = time.perf_counter()
        self.ru_start = resource.getrusage(resource.RUSAGE_SELF)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Snapshot again at block exit; deltas are computed lazily below.
        self.end_perf = time.perf_counter()
        self.ru_end = resource.getrusage(resource.RUSAGE_SELF)

    def perf_counter_time(self):
        """Elapsed wall-clock time in seconds."""
        return self.end_perf - self.start_perf

    def user_time(self):
        """CPU seconds spent in user mode (the program's own execution)."""
        return self.ru_end.ru_utime - self.ru_start.ru_utime

    def sys_time(self):
        """CPU seconds spent in kernel mode (syscalls, OS scheduling)."""
        return self.ru_end.ru_stime - self.ru_start.ru_stime

    def cpu_time(self):
        """Total CPU seconds (user + system)."""
        return self.user_time() + self.sys_time()

    def cpu_percent(self):
        """CPU utilisation as a percentage of wall-clock time."""
        wall = self.perf_counter_time()  # hoisted: original evaluated it twice
        return (self.cpu_time() / wall * 100) if wall > 0 else 0

    def max_rss_mb(self):
        """Peak resident set size, in MB.

        NOTE(review): ru_maxrss is in KB on Linux but in bytes on macOS,
        so this value is only meaningful on Linux — confirm target OS.
        """
        return self.ru_end.ru_maxrss / 1024.

    def max_rss_kb(self):
        """Deprecated alias kept for backward compatibility.

        The original method carried this name but already returned
        megabytes (and was printed with an "MB" label); prefer
        max_rss_mb(), which says what it returns.
        """
        return self.max_rss_mb()

    def voluntary_switches(self):
        """Voluntary context switches (process yielded the CPU, e.g. on IO)."""
        return self.ru_end.ru_nvcsw - self.ru_start.ru_nvcsw

    def involuntary_switches(self):
        """Involuntary context switches (the OS preempted the process)."""
        return self.ru_end.ru_nivcsw - self.ru_start.ru_nivcsw

    def print_results(self):
        """Pretty-print all collected metrics, one per line."""
        print(
            f"{'='*50}",
            "TIME METRICS:",
            f" CPU time: {self.cpu_time():.6f}s",
            f" User: {self.user_time():.6f}s",
            f" System: {self.sys_time():.6f}s",
            f" CPU Usage Percent: {self.cpu_percent():.1f}%",
            f" Max RSS: {self.max_rss_mb():,} MB",
            f" Voluntary Context switch: {self.voluntary_switches():,}",
            f" Involuntary Context switch: {self.involuntary_switches():,}",
            sep="\n"
        )


def sync_stack():
    """Synchronous task for the thread benchmark: blocking IO wait."""
    time.sleep(IO_DELAY)
    return 1


async def make_async_stack():
    """Asynchronous task for the coroutine benchmark: non-blocking IO wait."""
    await asyncio.sleep(IO_DELAY)
    return 1


def run_threads():
    """Run N_TASKS blocking tasks on a bounded thread pool."""
    print("Running in THREAD mode")
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENCY) as executor:
        # Submit all tasks, then block until every one has completed.
        futures = [executor.submit(sync_stack) for _ in range(N_TASKS)]
        for f in as_completed(futures):
            f.result()  # re-raises any exception from the worker


async def run_async():
    """Run N_TASKS coroutine tasks, capped at MAX_CONCURRENCY in flight."""
    print("Running in ASYNC mode ")
    sem = asyncio.Semaphore(MAX_CONCURRENCY)  # bound the concurrency

    async def limited_task():
        async with sem:
            await make_async_stack()

    tasks = [asyncio.create_task(limited_task()) for _ in range(N_TASKS)]
    await asyncio.gather(*tasks)


def run_benchmark(mode: str):
    """Dispatch to the requested benchmark mode ('threads' or 'async')."""
    # elif: the two modes are mutually exclusive (original used two ifs).
    if mode == "threads":
        run_threads()
    elif mode == "async":
        asyncio.run(run_async())


if __name__ == "__main__":
    # Parse the run mode from the command line: threads or async.
    parser = ArgumentParser()
    parser.add_argument("mode", choices=["threads", "async"])
    args = parser.parse_args()
    # Run the benchmark under the performance monitor, then report.
    with PerformanceMonitor() as mon:
        run_benchmark(args.mode)
    mon.print_results()
在终端中运行以下两个命令,来测试线程和协程的性能:
# 测试线程模式python concurrent_benchmark.py threads# 测试协程模式python concurrent_benchmark.py async
具体结果自己运行看吧,这里就不再贴了。我们可以看到二者的差距。线程模式,每次切换都需要操作系统介入,切换系统栈、保存/恢复寄存器,还要争夺GIL,系统耗时和上下文切换次数暴增;协程模式,切换只在Python解释器内部进行,本质就是切换PyFrame对象的引用,无需操作系统调度,开销极低。
到这里,Python调用栈的核心内容我们就都梳理了一遍。