"""Render PDF pages to images and insert them, in order, into a Word document.

Each input PDF becomes a level-1 heading (so it shows up in the optional
table of contents) followed by one centered picture per page.
"""

import argparse
import glob
import io
import os
import sys

import fitz  # PyMuPDF
from PIL import Image
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.oxml import OxmlElement


def add_toc(doc):
    """Append a "目录" heading followed by an updatable TOC field to *doc*.

    The field is written as three runs (begin / instruction / end). Word only
    fills in the entries once the user updates the field.
    """
    doc.add_paragraph("目录", style="Heading 1")
    p = doc.add_paragraph()

    # Field begin marker.
    run = p.add_run()
    fld_begin = OxmlElement('w:fldChar')
    fld_begin.set(qn('w:fldCharType'), 'begin')
    run._r.append(fld_begin)

    # Field instruction: build the TOC from heading levels 1-3.
    run2 = p.add_run()
    instr_text = OxmlElement('w:instrText')
    instr_text.set(qn('xml:space'), 'preserve')
    instr_text.text = ' TOC \\o "1-3" '
    run2._r.append(instr_text)

    # Field end marker.
    run3 = p.add_run()
    fld_end = OxmlElement('w:fldChar')
    fld_end.set(qn('w:fldCharType'), 'end')
    run3._r.append(fld_end)


def calculate_size(img_width_px, img_height_px, dpi, args):
    """Return the picture's ``(width, height)`` in Word as ``Inches`` values.

    The natural size is ``pixels / dpi``. It is then adjusted by the first
    option that is set, in this priority order:

    1. ``args.scale``: multiply both dimensions by the factor.
    2. ``args.width`` and ``args.height``: use both as-is (may distort).
    3. ``args.width`` only: fix the width, keep the aspect ratio.
    4. ``args.height`` only: fix the height, keep the aspect ratio.

    Options are tested by truthiness, so ``None`` and ``0`` both mean "unset".
    """
    w_inch = img_width_px / dpi
    h_inch = img_height_px / dpi

    if args.scale:
        w_inch *= args.scale
        h_inch *= args.scale
    elif args.width and args.height:
        # Both given: force the exact size even if it stretches the image.
        w_inch = args.width
        h_inch = args.height
    elif args.width:
        ratio = args.width / w_inch
        w_inch = args.width
        h_inch *= ratio
    elif args.height:
        ratio = args.height / h_inch
        h_inch = args.height
        w_inch *= ratio

    return Inches(w_inch), Inches(h_inch)


def collect_pdf_files(args):
    """Return the list of PDF paths to process.

    With ``args.input_dir`` set, every regular file in that folder whose name
    ends in ``.pdf`` (any letter case) is returned, sorted by path. Otherwise
    ``args.pdf_files`` is returned in the order given on the command line.

    Raises:
        ValueError: the folder does not exist or contains no PDF, no input
            was given at all, or a listed file does not exist.
    """
    if args.input_dir:
        folder = args.input_dir
        if not os.path.isdir(folder):
            raise ValueError(f"文件夹不存在: {folder}")

        # Escape the folder so glob metacharacters in its name ("[", "*", "?")
        # are taken literally, then filter on a lower-cased suffix so mixed
        # case extensions such as ".Pdf" match on case-sensitive filesystems.
        candidates = glob.glob(os.path.join(glob.escape(folder), "*"))
        pdf_files = sorted(
            path
            for path in set(candidates)
            if path.lower().endswith(".pdf") and os.path.isfile(path)
        )
        if not pdf_files:
            raise ValueError(f"文件夹中没有找到 PDF 文件: {folder}")
        return pdf_files

    if not args.pdf_files:
        raise ValueError("请指定要处理的 PDF 文件,或使用 --input-dir 指定文件夹")
    for f in args.pdf_files:
        if not os.path.isfile(f):
            raise ValueError(f"文件不存在: {f}")
    return args.pdf_files


def _encode_pixmap(pix, img_format):
    """Return a rewound ``BytesIO`` holding *pix* encoded as *img_format*."""
    png_data = pix.tobytes("png")
    if img_format == "PNG":
        # PyMuPDF already produced PNG; no need to decode and re-encode it.
        return io.BytesIO(png_data)

    buffer = io.BytesIO()
    with Image.open(io.BytesIO(png_data)) as img:
        # JPEG cannot store alpha or palette data, so normalize to RGB first.
        converted = img.convert("RGB") if img_format == "JPEG" else img
        converted.save(buffer, format=img_format)
    buffer.seek(0)
    return buffer


def process_pdfs(pdf_files, output_docx, args):
    """Render every page of *pdf_files* into a new document at *output_docx*.

    Args:
        pdf_files: paths of the PDFs, processed in the given order.
        output_docx: path the resulting ``.docx`` is saved to.
        args: parsed options; reads ``margin_*``, ``toc``, ``dpi``,
            ``image_format``, ``separate_page`` and the sizing options used
            by ``calculate_size``.
    """
    doc = Document()

    # Page margins of the first section.
    section = doc.sections[0]
    section.top_margin = Inches(args.margin_top)
    section.bottom_margin = Inches(args.margin_bottom)
    section.left_margin = Inches(args.margin_left)
    section.right_margin = Inches(args.margin_right)

    if args.toc:
        add_toc(doc)
        doc.add_page_break()

    total_files = len(pdf_files)
    img_format = args.image_format.upper()

    for idx, pdf_path in enumerate(pdf_files):
        file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        # The chapter heading doubles as the TOC entry for this file.
        doc.add_heading(file_name, level=1)

        # The context manager closes the PDF even if rendering a page fails.
        with fitz.open(pdf_path) as pdf_doc:
            page_count = len(pdf_doc)
            for page_num in range(page_count):
                page = pdf_doc.load_page(page_num)
                pix = page.get_pixmap(dpi=args.dpi)
                img_bytes = _encode_pixmap(pix, img_format)

                width_inch, height_inch = calculate_size(
                    pix.width, pix.height, args.dpi, args
                )

                # Insert the picture in its own centered paragraph.
                p = doc.add_paragraph()
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                run = p.add_run()
                run.add_picture(img_bytes, width=width_inch, height=height_inch)

                # Page break between pages, but not after the last page.
                if args.separate_page and page_num < page_count - 1:
                    doc.add_page_break()

        # Page break between files, but not after the last file.
        if args.separate_page and idx < total_files - 1:
            doc.add_page_break()

    doc.save(output_docx)
    print(f"处理完成!共 {total_files} 个 PDF 文件,输出: {output_docx}")


def _build_parser():
    """Create the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="将多个 PDF 拆分为图片并按序插入 Word,支持批量处理文件夹"
    )
    parser.add_argument("pdf_files", nargs="*", help="要处理的 PDF 文件(可同时指定多个)")
    parser.add_argument("-d", "--input-dir", help="批量处理指定文件夹下的所有 PDF 文件")
    parser.add_argument("-o", "--output", default="output.docx", help="输出的 Word 文件名")
    parser.add_argument("--margin-top", type=float, default=1.0, help="上边距(英寸)")
    parser.add_argument("--margin-bottom", type=float, default=1.0, help="下边距(英寸)")
    parser.add_argument("--margin-left", type=float, default=1.0, help="左边距(英寸)")
    parser.add_argument("--margin-right", type=float, default=1.0, help="右边距(英寸)")
    parser.add_argument("--dpi", type=int, default=200, help="PDF 渲染 DPI")
    parser.add_argument("--scale", type=float, help="统一缩放比例(如 0.5 缩小一半)")
    parser.add_argument("--width", type=float, help="指定图片宽度(英寸),高度自适应")
    parser.add_argument("--height", type=float, help="指定图片高度(英寸),宽度自适应")
    parser.add_argument("--image-format", choices=["png", "jpeg"], default="png", help="图片格式")
    parser.add_argument("--no-separate-page", dest="separate_page", action="store_false",
                        help="不在每张图片后自动分页")
    parser.add_argument("--no-toc", dest="toc", action="store_false", help="不生成目录")
    parser.set_defaults(separate_page=True, toc=True)
    return parser


def main():
    """Parse the command line, run the conversion, exit with 1 on failure."""
    args = _build_parser().parse_args()
    try:
        pdf_list = collect_pdf_files(args)
        process_pdfs(pdf_list, args.output, args)
    except Exception as e:
        # Top-level boundary: report any failure and signal it via exit code.
        print(f"错误: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()