#!/usr/bin/env python3"""PDF to Word converter with better layout/table retention.Features:1) Automatically detect scanned PDFs.2) Run OCR for scanned PDFs (via ocrmypdf) to add a text layer.3) Convert to .docx using pdf2docx to preserve tables/paragraph layout.4) Support single-file and batch directory conversion.Install: pip install pdf2docx pymupdf # Optional but recommended for scanned PDFs: # 1) Install Tesseract OCR on your system. # 2) Install Ghostscript on your system. # 3) pip install ocrmypdfExamples: python pdf_to_word.py python pdf_to_word.py -i input.pdf -o output.docx python pdf_to_word.py -i ./pdfs -o ./docx --recursive python pdf_to_word.py -i scan.pdf --force-ocr --ocr-lang chi_sim+eng python pdf_to_word.py -i ./pdfs -o ./docx --recursive --retries 2 --error-log ./failed.csv"""from __future__ import annotationsimport argparseimport csvimport shutilimport subprocessimport sysimport tempfileimport timefrom datetime import datetimefrom pathlib import Pathfrom typing import Dict, Iterable, List, Optional, Tupleimport fitz # pymupdffrom pdf2docx import Converter# ====== Fixed batch folder configuration ======# 直接修改该路径后,执行 `python doc/pdf_to_word.py` 即可批量转换。# 输入和输出使用同一个目录:会在该目录(及其子目录)生成同名 .docx。FIXED_BATCH_DIR = Path(r"D:\pdf\input")def find_pdf_files(input_path: Path, recursive: bool) -> List[Path]: """收集待处理 PDF 文件列表。 - 输入为文件时:仅接受 .pdf - 输入为目录时:按 recursive 决定是否递归扫描 """ if input_path.is_file(): if input_path.suffix.lower() != ".pdf": raise ValueError(f"Input file is not a PDF: {input_path}") return [input_path] if not input_path.is_dir(): raise ValueError(f"Input path does not exist: {input_path}") pattern = "**/*.pdf" if recursive else "*.pdf" return sorted(input_path.glob(pattern))def is_scanned_pdf(pdf_path: Path, sample_pages: int = 3, min_text_chars: int = 60) -> bool: """ Heuristic: - If first N pages have very little extractable text, treat as scanned PDF. """ doc = fitz.open(str(pdf_path)) try: pages = min(sample_pages, doc.page_count) if pages <= 0: return False total_chars = 0 for i in range(pages): total_chars += len(doc.load_page(i).get_text("text").strip()) return total_chars < min_text_chars finally: doc.close()def require_command(name: str) -> None: """检查外部命令是否存在于 PATH。""" if shutil.which(name) is None: raise RuntimeError( f"Required command not found: {name}. " f"Please install it and ensure it is in PATH." )def run_ocr(input_pdf: Path, output_pdf: Path, ocr_lang: str) -> None: """调用 ocrmypdf 给扫描件补文本层,便于后续版面/表格识别。""" require_command("ocrmypdf") cmd = [ "ocrmypdf", "--skip-text", "--redo-ocr", "-l", ocr_lang, str(input_pdf), str(output_pdf), ] print(f"[OCR] {' '.join(cmd)}") result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: msg = result.stderr.strip() or result.stdout.strip() raise RuntimeError(f"OCR failed for {input_pdf}:\n{msg}")def convert_pdf_to_docx(input_pdf: Path, output_docx: Path, start: int = 0, end: Optional[int] = None) -> None: """执行 PDF 到 DOCX 的核心转换。""" output_docx.parent.mkdir(parents=True, exist_ok=True) cv = Converter(str(input_pdf)) try: cv.convert(str(output_docx), start=start, end=end) finally: cv.close()def build_output_path(src_pdf: Path, input_root: Path, output_root: Path) -> Path: """根据输入根目录和输出根目录,构建对应的 .docx 路径。""" if input_root.is_file(): return output_root rel = src_pdf.relative_to(input_root) return output_root / rel.with_suffix(".docx")def convert_one( pdf_file: Path, output_docx: Path, force_ocr: bool, ocr_lang: str, start_page: int, end_page: Optional[int],) -> None: """转换单个 PDF。 处理流程: 1) 扫描件检测 2) 必要时 OCR 3) 执行 pdf2docx 转换 """ scanned = is_scanned_pdf(pdf_file) need_ocr = force_ocr or scanned print(f"\n[INFO] Processing: {pdf_file}") print(f"[INFO] Scanned detection: {'YES'if scanned else'NO'}") print(f"[INFO] OCR step: {'ENABLED'if need_ocr else'SKIPPED'}") with tempfile.TemporaryDirectory(prefix="pdf2docx_") as tmp_dir: source_pdf = pdf_file if need_ocr: ocr_pdf = Path(tmp_dir) / f"{pdf_file.stem}.ocr.pdf" run_ocr(pdf_file, ocr_pdf, ocr_lang=ocr_lang) source_pdf = ocr_pdf convert_pdf_to_docx( input_pdf=source_pdf, output_docx=output_docx, start=max(0, start_page), end=end_page, ) print(f"[OK] Output: {output_docx}")def convert_with_retry( pdf_file: Path, output_docx: Path, force_ocr: bool, ocr_lang: str, start_page: int, end_page: Optional[int], retries: int, retry_delay: float,) -> Tuple[bool, int, Optional[str]]: """带重试机制的单文件转换。 返回: - 是否成功 - 实际尝试次数 - 错误信息(成功时为 None) """ # retries 表示“失败后重试次数”,总尝试次数 = retries + 1 max_attempts = max(1, retries + 1) last_error: Optional[str] = None for attempt in range(1, max_attempts + 1): try: if attempt > 1: print(f"[RETRY] {pdf_file.name} attempt {attempt}/{max_attempts}") convert_one( pdf_file=pdf_file, output_docx=output_docx, force_ocr=force_ocr, ocr_lang=ocr_lang, start_page=start_page, end_page=end_page, ) return True, attempt, None except Exception as exc: # noqa: BLE001 last_error = str(exc) # 失败后按设定间隔重试,减小瞬时环境波动的影响 if attempt < max_attempts and retry_delay > 0: time.sleep(retry_delay) return False, max_attempts, last_errordef write_failure_csv(failures: List[Dict[str, str]], error_log_path: Path) -> None: """将失败任务写入 CSV,便于批量复盘与二次处理。""" error_log_path.parent.mkdir(parents=True, exist_ok=True) fieldnames = [ "timestamp", "input_pdf", "output_docx", "attempts", "error", ] with error_log_path.open("w", newline="", encoding="utf-8-sig") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(failures)def parse_args(argv: Optional[Iterable[str]] = None) -> argparse.Namespace: """解析命令行参数。""" parser = argparse.ArgumentParser( description="Convert PDF to Word with OCR support for scanned PDFs." ) parser.add_argument( "-i", "--input", required=False, default=None, help=( "Input PDF file or directory. " "If omitted, the script uses FIXED_BATCH_DIR." ), ) parser.add_argument( "-o", "--output", required=False, help=( "Output .docx file (single input) or output directory (batch input). " "Default: same folder as input." ), ) parser.add_argument( "--recursive", action="store_true", help="Recursively scan input directory for PDFs.", ) parser.add_argument( "--force-ocr", action="store_true", help="Force OCR even if PDF already has extractable text.", ) parser.add_argument( "--ocr-lang", default="chi_sim+eng", help="OCR language for ocrmypdf (default: chi_sim+eng).", ) parser.add_argument( "--start-page", type=int, default=0, help="Start page index for conversion (0-based, default: 0).", ) parser.add_argument( "--end-page", type=int, default=None, help="End page index for conversion (0-based, exclusive).", ) parser.add_argument( "--retries", type=int, default=2, help="Retry count after a failure (default: 2).", ) parser.add_argument( "--retry-delay", type=float, default=1.0, help="Delay seconds between retries (default: 1.0).", ) parser.add_argument( "--error-log", default=None, help=( "CSV path for failed tasks. " "Default: single file -> same dir 'pdf_to_word_failures.csv'; " "batch -> output dir 'pdf_to_word_failures.csv'." ), ) return parser.parse_args(argv)def main(argv: Optional[Iterable[str]] = None) -> int: """程序入口:支持单文件模式与批量目录模式。""" args = parse_args(argv) use_fixed_folder_mode = args.input is None input_path = ( FIXED_BATCH_DIR.expanduser().resolve() if use_fixed_folder_mode else Path(args.input).expanduser().resolve() ) try: pdf_files = find_pdf_files(input_path, recursive=args.recursive) except ValueError as exc: print(f"[ERROR] {exc}") return 2 if not pdf_files: print(f"[WARN] No PDF files found in: {input_path}") return 0 if input_path.is_file(): # 单文件模式:输出必须是 .docx(或默认同名) if args.output: out_path = Path(args.output).expanduser().resolve() if out_path.suffix.lower() != ".docx": print("[ERROR] For single input file, output must end with .docx") return 2 output_path = out_path else: output_path = input_path.with_suffix(".docx") ok, attempts, err = convert_with_retry( pdf_file=pdf_files[0], output_docx=output_path, force_ocr=args.force_ocr, ocr_lang=args.ocr_lang, start_page=args.start_page, end_page=args.end_page, retries=args.retries, retry_delay=args.retry_delay, ) if ok: return 0 # 单文件失败也写 CSV,统一日志格式 error_log_path = ( Path(args.error_log).expanduser().resolve() if args.error_log else output_path.parent / "pdf_to_word_failures.csv" ) failures = [ { "timestamp": datetime.now().isoformat(timespec="seconds"), "input_pdf": str(pdf_files[0]), "output_docx": str(output_path), "attempts": str(attempts), "error": err or "Unknown error", } ] write_failure_csv(failures, error_log_path) print(f"[ERROR] {pdf_files[0]}: {err}") print(f"[LOG] Failure CSV: {error_log_path}") return 1 output_root = ( Path(args.output).expanduser().resolve() if args.output else input_path ) output_root.mkdir(parents=True, exist_ok=True) # 批量模式:记录失败任务,最后统一落盘 CSV failed = 0 failure_records: List[Dict[str, str]] = [] for pdf_file in pdf_files: output_docx = build_output_path( src_pdf=pdf_file, input_root=input_path, output_root=output_root, ) ok, attempts, err = convert_with_retry( pdf_file=pdf_file, output_docx=output_docx, force_ocr=args.force_ocr, ocr_lang=args.ocr_lang, start_page=args.start_page, end_page=args.end_page, retries=args.retries, retry_delay=args.retry_delay, ) if not ok: failed += 1 print(f"[ERROR] {pdf_file}: {err}") failure_records.append( { "timestamp": datetime.now().isoformat(timespec="seconds"), "input_pdf": str(pdf_file), "output_docx": str(output_docx), "attempts": str(attempts), "error": err or "Unknown error", } ) total = len(pdf_files) success = total - failed print(f"\n[DONE] Total: {total}, Success: {success}, Failed: {failed}") if failure_records: # 优先使用用户指定路径,否则默认写到输出目录 error_log_path = ( Path(args.error_log).expanduser().resolve() if args.error_log else output_root / "pdf_to_word_failures.csv" ) write_failure_csv(failure_records, error_log_path) print(f"[LOG] Failure CSV: {error_log_path}") return 1 if failed else 0if __name__ == "__main__": sys.exit(main())