# encoding: utf-8
# Copyright 2026 © 涂聚文有限公司™ ®
# License info : "Words have become the hero who claims the credit and fulfils
#                the duty -- do deeds still need to report for duty every day?"
# Description  : pip install moviepy SpeechRecognition
#                pip install openai-whisper
# Author    : geovindu, Geovin Du 涂聚文.
# IDE       : PyCharm 2024.3.6 python 3.11
# os        : windows 10
# database  : mysql 9.0 sql server 2019, postgreSQL 17.0 Oracle 21c Neo4j
# Datetime  : 2026/6/14 17:40
# User      : geovindu
# Product   : PyCharm
# Project   : Pysimple
# File      : MP4totextdest.py
"""Extract a video's audio track, transcribe it with Whisper, and save a
timestamped transcript to a text file."""

import warnings

# The filters are installed before the remaining imports so that warnings
# raised while importing those modules are silenced too.
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=RuntimeWarning)

from moviepy import VideoFileClip
import speech_recognition as sr
import os
import whisper
import numpy as np
import wave

try:
    # audioop was removed from the standard library in Python 3.13 (PEP 594).
    # The code below no longer needs it; the import is kept only so the name
    # stays available on interpreters that still ship the module.
    import audioop
except ImportError:
    audioop = None

import whisper.audio

VIDEO_FILE = "geovindu.mp4"
TEMP_WAV = "temp_audio.wav"
OUTPUT_TXT = "完整演讲文稿_带时间戳medium.txt"
# Sample rate (Hz) used for the temporary WAV export and checked by the loader.
TARGET_SAMPLE_RATE = 16000


def load_audio_without_ffmpeg(path, sample_rate=TARGET_SAMPLE_RATE):
    """Read a 16-bit PCM WAV file into a mono float32 NumPy array.

    Uses only the standard-library ``wave`` module and NumPy, so no external
    ffmpeg process is started.

    Args:
        path: Path of the WAV file to read.
        sample_rate: Frame rate (Hz) the file is required to have.

    Returns:
        A 1-D ``float32`` array with samples scaled into ``[-1.0, 1.0)``.
        Multi-channel audio is downmixed by averaging the channels.

    Raises:
        ValueError: If the file is not 16-bit or its frame rate differs from
            ``sample_rate``. No resampling is performed here, so such a file
            would otherwise be decoded incorrectly without any error.
    """
    with wave.open(path, "rb") as wav_file:
        n_channels = wav_file.getnchannels()
        width = wav_file.getsampwidth()
        frame_rate = wav_file.getframerate()
        frames = wav_file.readframes(wav_file.getnframes())

    if width != 2:
        raise ValueError(
            f"{path}: expected 16-bit samples, got {8 * width}-bit"
        )
    if frame_rate != sample_rate:
        raise ValueError(
            f"{path}: expected {sample_rate} Hz audio, got {frame_rate} Hz"
        )

    # WAV sample data is little-endian; "<i2" decodes it the same way on any host.
    samples = np.frombuffer(frames, dtype="<i2").astype(np.float32)
    if n_channels > 1:
        # Frames are interleaved (ch0, ch1, ..., ch0, ch1, ...): average per frame.
        samples = samples.reshape(-1, n_channels).mean(axis=1)
    return samples / 32768.0


def _format_timestamp(seconds):
    """Return non-negative *seconds* as zero-padded ``MM:SS`` (fraction dropped)."""
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"


# 2. Replace Whisper's own audio loader with the pure-Python reader above so
#    that loading the WAV no longer goes through ffmpeg.
whisper.audio.load_audio = load_audio_without_ffmpeg

try:
    # 1. Extract the audio track from the video.
    print("正在提取音频...")
    video = VideoFileClip(VIDEO_FILE)
    try:
        if video.audio is None:
            raise RuntimeError(f"{VIDEO_FILE} has no audio track to transcribe")
        # Export 16-bit PCM at the target rate. The channel count is not
        # forced here; the loader downmixes to mono when reading the file.
        video.audio.write_audiofile(
            TEMP_WAV, codec="pcm_s16le", fps=TARGET_SAMPLE_RATE
        )
    finally:
        # Release the clip even when the export fails.
        video.close()

    # 3. Load the Whisper "medium" model.
    print("加载离线语音模型 medium ...")
    model = whisper.load_model("medium")

    # 4. Run recognition; the per-segment start/end times are used below.
    print("开始逐段识别演讲内容,请等待...")
    result = model.transcribe(
        audio=TEMP_WAV,
        language="zh",
        verbose=False,
        word_timestamps=True,
    )
finally:
    # The temporary WAV is only needed up to this point; remove it whether or
    # not the steps above succeeded.
    try:
        os.remove(TEMP_WAV)
    except FileNotFoundError:
        pass

# 5. Format the transcript with timestamps, printing each segment as it is built.
transcript_lines = []
print("\n====================识别结果====================")
for seg in result["segments"]:
    seg_start = _format_timestamp(seg["start"])
    seg_end = _format_timestamp(seg["end"])
    seg_text = seg["text"].strip()
    line = f"[{seg_start} - {seg_end}] {seg_text}"
    print(line)
    transcript_lines.append(line)

full_content = (
    "=== EV录屏 2026年6月13日 第十八届海峡论坛苏恒演讲 完整转写稿 ===\n\n"
    + "".join(f"{line}\n" for line in transcript_lines)
)

# 6. Save the transcript to a local text file.
with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
    f.write(full_content)

print("\n================================================")
print(f"✅ 全部识别完成!文稿已保存至:{OUTPUT_TXT}")