研究生看完会哭,科研狗看完想请我吃烧烤
场景:导师周五下午扔了10篇英文PDF,周一要汇报。
你的选择:继续手动硬读,还是用下面这套自动化方案?
完整代码(复制即用)
第一步:环境准备
pip install openai pymupdf xmind
第二步:核心代码(60行)
import os
import json

import fitz  # PyMuPDF, for PDF parsing
import openai
from xmind import xmind

# Read the API key from the environment so it is never hard-coded.
# NOTE(review): this uses the legacy (pre-1.0) openai SDK interface
# (openai.api_key / openai.ChatCompletion) — confirm the installed version.
openai.api_key = os.getenv("OPENAI_API_KEY")


def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Concatenated page text, truncated to 15,000 characters
        (roughly 10-15 pages) to stay within the GPT-4 context limit.
    """
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text[:15000]


def generate_summary(text):
    """Call GPT-4 to produce a structured JSON summary of a paper.

    Args:
        text: Extracted paper text (already truncated).

    Returns:
        A dict parsed from the model's JSON output, or
        {"原始分析": <raw text>} when the output is not valid JSON.
    """
    prompt = f"""
请阅读以下学术论文,输出JSON格式:
{{
  "一句话摘要": "用一句话概括核心贡献",
  "创新点": ["要点1", "要点2", "要点3"],
  "实验结果": "关键数据和结论",
  "局限性": "作者提到的不足",
  "思维导图结构": {{
    "根节点": "论文标题",
    "一级分支": ["分支1", "分支2"],
    "分支1的二级": ["细节1", "细节2"]
  }}
}}
论文内容:
{text}
"""
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
    )
    content = response.choices[0].message.content
    # Strip markdown code fences the model sometimes wraps around JSON.
    content = content.replace("```json", "").replace("```", "")
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Parsing failed: fall back to returning the raw analysis text.
        return {"原始分析": content}


def create_mindmap(summary, output_path):
    """Render the structured summary as an XMind mind map.

    Args:
        summary: Dict produced by generate_summary().
        output_path: Destination .xmind file path.

    Returns:
        The output_path that was written.
    """
    workbook = xmind.load(output_path)
    sheet = workbook.getPrimarySheet()
    sheet.setTitle(summary.get("一句话摘要", "论文摘要")[:20])
    root = sheet.getRootTopic()
    root.setTitle(summary.get("思维导图结构", {}).get("根节点", "论文核心"))

    # Innovation branch
    innovations = summary.get("创新点", [])
    if innovations:
        innovation_branch = root.addSubTopic()
        innovation_branch.setTitle("核心创新")
        for item in innovations:
            sub = innovation_branch.addSubTopic()
            sub.setTitle(item)

    # Experimental-results branch
    results = summary.get("实验结果", "")
    if results:
        result_branch = root.addSubTopic()
        result_branch.setTitle("实验结果")
        sub = result_branch.addSubTopic()
        sub.setTitle(str(results)[:50])

    # Limitations branch (skip the placeholder "无" = "none")
    limitations = summary.get("局限性", "")
    if limitations and limitations != "无":
        limit_branch = root.addSubTopic()
        limit_branch.setTitle("局限性")
        sub = limit_branch.addSubTopic()
        sub.setTitle(str(limitations)[:50])

    xmind.save(workbook, output_path)
    return output_path


def process_paper(pdf_path, output_dir="output"):
    """Full pipeline for one paper: PDF → summary → mind map + text digest.

    Args:
        pdf_path: Path to the input PDF.
        output_dir: Directory for the generated files (created if missing).

    Returns:
        The summary dict for this paper.
    """
    os.makedirs(output_dir, exist_ok=True)
    # splitext only strips the extension; str.replace(".pdf", "") would
    # also mangle filenames that contain ".pdf" in the middle.
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    print(f"📖 正在读取: {os.path.basename(pdf_path)}")
    text = extract_text_from_pdf(pdf_path)
    print(f"🤖 正在分析...(调用GPT-4)")
    summary = generate_summary(text)
    print(f"🗺️ 正在生成思维导图...")
    mindmap_path = os.path.join(output_dir, f"{filename}.xmind")
    create_mindmap(summary, mindmap_path)

    # Save a plain-text digest next to the mind map.
    txt_path = os.path.join(output_dir, f"{filename}_摘要.txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(f"一句话摘要:\n{summary.get('一句话摘要', 'N/A')}\n\n")
        f.write(f"创新点:\n")
        for i, point in enumerate(summary.get("创新点", []), 1):
            f.write(f"{i}. {point}\n")
        f.write(f"\n实验结果:\n{summary.get('实验结果', 'N/A')}\n")
        f.write(f"\n局限性:\n{summary.get('局限性', 'N/A')}\n")

    print(f"✅ 完成!输出文件:")
    print(f"  📄 {txt_path}")
    print(f"  🗺️ {mindmap_path}")
    return summary


def batch_process(pdf_folder):
    """Process every PDF in a folder; build a comparison table for 2+ papers.

    Args:
        pdf_folder: Directory containing the PDFs.

    Returns:
        List of summary dicts (each tagged with its source "文件名").
    """
    # Case-insensitive match so ".PDF" files are not silently skipped.
    pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]
    print(f"🔍 发现 {len(pdf_files)} 个PDF文件")
    print("=" * 50)
    all_summaries = []
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)
        try:
            summary = process_paper(pdf_path)
            summary["文件名"] = pdf_file
            all_summaries.append(summary)
            print("-" * 50)
        except Exception as e:
            # Keep going on per-file failure so one bad PDF doesn't abort the batch.
            print(f"❌ 处理失败 {pdf_file}: {str(e)}")
    # Only worth comparing when more than one paper succeeded.
    if len(all_summaries) > 1:
        generate_comparison_table(all_summaries)
    return all_summaries


def generate_comparison_table(summaries):
    """Write a side-by-side text comparison table for multiple papers.

    Args:
        summaries: List of summary dicts, each carrying a "文件名" key.
    """
    comparison = "📊 论文对比汇总\n"
    comparison += "=" * 80 + "\n"
    comparison += f"{'文件名':<30} | {'核心创新':<40}\n"
    comparison += "-" * 80 + "\n"
    for s in summaries:
        filename = s.get("文件名", "未知")[:28]
        innovation = s.get("一句话摘要", "N/A")[:38]
        comparison += f"{filename:<30} | {innovation}\n"
    comparison += "=" * 80 + "\n"
    # Ensure the directory exists even when this is called standalone,
    # before any process_paper() run has created it.
    os.makedirs("output", exist_ok=True)
    with open("output/论文对比表.txt", "w", encoding="utf-8") as f:
        f.write(comparison)
    print(f"\n📊 已生成对比表: output/论文对比表.txt")


# Entry point
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        # A single PDF path was given on the command line.
        process_paper(sys.argv[1])
    else:
        # Default: batch-process the local "papers" folder.
        batch_process("papers")
运行效果演示
$ python paper_reader.py papers/🔍 发现 3 个PDF文件==================================================📖 正在读取: attention_is_all_you_need.pdf🤖 正在分析...(调用GPT-4)🗺️ 正在生成思维导图...✅ 完成! 📄 output/attention_is_all_you_need_摘要.txt 🗺️ output/attention_is_all_you_need.xmind--------------------------------------------------📖 正在读取: bert_pretraining.pdf🤖 正在分析...(调用GPT-4)🗺️ 正在生成思维导图...✅ 完成! 📄 output/bert_pretraining_摘要.txt 🗺️ output/bert_pretraining.xmind--------------------------------------------------📖 正在读取: gpt3_language_models.pdf🤖 正在分析...(调用GPT-4)🗺️ 正在生成思维导图...✅ 完成! 📄 output/gpt3_language_models_摘要.txt 🗺️ output/gpt3_language_models.xmind--------------------------------------------------📊 已生成对比表: output/论文对比表.txt🎉 全部完成!总耗时: 7分12秒
进阶魔改(收藏后慢慢玩)
魔改1:接入本地模型,论文不上云
# Run Llama2 locally through Ollama so sensitive papers (e.g. unpublished
# work) never leave the machine.
import requests


def local_summary(text):
    """Summarize a paper with a locally served Llama2 model via the Ollama API."""
    payload = {
        "model": "llama2:13b",
        "prompt": f"总结以下论文:{text[:8000]}",
        "stream": False,
    }
    response = requests.post("http://localhost:11434/api/generate", json=payload)
    return response.json()["response"]


# Swap this in for the API call inside generate_summary:
# summary = local_summary(text)  # zero cost, privacy-safe
魔改2:生成PPT直接汇报
from pptx import Presentation
from pptx.util import Inches, Pt


def create_ppt(summary, output_path):
    """Auto-generate a briefing PPT from the structured summary dict."""
    prs = Presentation()

    # Title slide: headline plus the one-sentence abstract.
    title_slide = prs.slides.add_slide(prs.slide_layouts[0])
    title_slide.shapes.title.text = "论文精读汇报"
    title_slide.placeholders[1].text = summary.get("一句话摘要", "")

    # Bullet slide: one bullet per innovation point.
    bullet_slide = prs.slides.add_slide(prs.slide_layouts[1])
    bullet_slide.shapes.title.text = "核心创新点"
    body = bullet_slide.shapes.placeholders[1].text_frame
    for point in summary.get("创新点", []):
        body.add_paragraph().text = point

    # Experimental-results slide
    # ...

    prs.save(output_path)
    print(f"📊 PPT已生成: {output_path}")


# Usage: create_ppt(summary, "output/汇报.pptx")
成本与替代方案
避坑指南
❌ 这些论文别用AI读
✅ AI读论文的正确姿势
第一遍:AI速读 → 判断值不值得精读(5分钟)
第二遍:人工精读核心章节(30分钟)
第三遍:AI辅助做笔记+思维导图(10分钟)
时间节省:从2小时 → 45分钟,且理解更深
写在最后
这个工具不是为了让你"不读论文",而是让你把有限的时间花在值得读的论文上。
科研的本质不是拼谁读得多,而是拼谁想得深。AI帮你省下的时间,应该用来思考、实验、写代码、喝咖啡、或者睡觉。
"最好的研究者,不是最勤奋的读者,而是最聪明的筛选者。"