"""Download a PDF and list its top-level chapters with their page ranges.

Chapters are taken from the PDF bookmarks (outline) when available;
otherwise they are detected heuristically from text lines that start with
a Chinese ordinal such as "一、" and look like headings.
"""

import re
import statistics
from collections import defaultdict
from typing import Dict, List, Tuple

import fitz  # PyMuPDF
import requests


# ----------------------------
# 1. Download a PDF from a URL
# ----------------------------
def download_pdf(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* and return the response body as bytes.

    Args:
        url: Address of the PDF file.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The raw bytes of the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # The whole body is needed at once, so there is no point in streaming;
    # the context manager guarantees the connection is released.
    with requests.get(url, headers=headers, timeout=timeout) as resp:
        resp.raise_for_status()
        return resp.content


# ----------------------------
# 2. Chinese heading matching and feature scoring
# ----------------------------
# A heading starts with one or more Chinese numerals followed by an
# enumeration comma, an ASCII period or a full-width period.
CHINESE_NUM_PATTERN = re.compile(r'^[一二三四五六七八九十百千]+[、.．]')

# Bit 4 of a PyMuPDF span's "flags" field marks bold text.
_BOLD_FLAG = 1 << 4


def is_chinese_heading(text: str) -> bool:
    """Return True if *text* (ignoring outer whitespace) starts with a Chinese ordinal."""
    return bool(CHINESE_NUM_PATTERN.match(text.strip()))


def score_heading_line(line, global_median_size, page_width, page_height):
    """Score how strongly a text line looks like a chapter heading.

    Args:
        line: A line dict as found in ``page.get_text("dict")`` output; its
            "spans" and "bbox" entries are read.
        global_median_size: Median font size over the whole document.
        page_width: Width of the page the line is on.
        page_height: Height of the page the line is on.

    Returns:
        0 if the line does not start with a Chinese ordinal; otherwise an
        integer from 1 to 5: one base point plus one point each for a larger
        than usual font, bold text, a position in the upper 40% of the page,
        and a left edge within the leftmost 10% of the page.
    """
    spans = line["spans"]
    text = "".join(s["text"] for s in spans).strip()
    if not is_chinese_heading(text):
        return 0

    score = 1

    max_size = max((s["size"] for s in spans), default=0)
    if max_size > global_median_size * 1.1:
        score += 1

    # Bold is signalled either by the font name or by the span flag bit.
    is_bold = any(
        "Bold" in s.get("font", "") or s.get("flags", 0) & _BOLD_FLAG
        for s in spans
    )
    if is_bold:
        score += 1

    bbox = line["bbox"]
    if bbox[1] < page_height * 0.4:  # located in the upper part of the page
        score += 1
    if bbox[0] < page_width * 0.1:  # flush left, no indentation
        score += 1
    return score


# ----------------------------
# 3. Page number for display (with fallback)
# ----------------------------
def get_display_page(doc: fitz.Document, physical_index: int) -> str:
    """Return the page number to show for a page.

    The page's logical label is preferred; if it is missing or blank the
    1-based physical page number is returned instead.

    Args:
        doc: The open document.
        physical_index: Zero-based page index.
    """
    label = doc[physical_index].get_label()
    if label and label.strip():
        return label.strip()
    return str(physical_index + 1)  # convert the physical index to 1-based


# ----------------------------
# 4. Extract chapters and their page ranges
# ----------------------------
def _build_chapters(doc: fitz.Document, starts: List[Tuple[str, int]]) -> List[Dict]:
    """Turn ``(title, zero_based_start_page)`` pairs into chapter dicts.

    Each chapter ends on the page before the next chapter starts (but never
    before its own start page); the last chapter runs to the end of the
    document.
    """
    last_page = doc.page_count - 1
    chapters = []
    for i, (title, start) in enumerate(starts):
        if i + 1 < len(starts):
            end = max(starts[i + 1][1] - 1, start)
        else:
            end = last_page
        chapters.append({
            "title": title,
            "start": get_display_page(doc, start),
            "end": get_display_page(doc, end),
        })
    return chapters


def _chapter_starts_from_toc(doc: fitz.Document, numbered_only: bool) -> List[Tuple[str, int]]:
    """Collect level-1 bookmark entries as ``(title, zero_based_page)`` pairs."""
    page_count = doc.page_count
    starts = []
    for level, title, page_num in doc.get_toc():
        if level != 1:
            continue
        # Entries without a usable in-document target (e.g. page -1) would
        # otherwise be turned into a negative index and address a wrong page.
        if not 1 <= page_num <= page_count:
            continue
        if numbered_only and not is_chinese_heading(title):
            continue
        starts.append((title, page_num - 1))  # TOC page numbers are 1-based
    return starts


def _chapter_starts_from_text(doc: fitz.Document, score_threshold: int) -> List[Tuple[str, int]]:
    """Detect chapter headings from text features.

    Returns at most one ``(title, zero_based_page)`` pair per page (the
    highest-scoring candidate, first one on ties), ordered by page.
    """
    all_sizes = []
    heading_lines = []  # (page index, page width, page height, text, line)

    # Single pass: each page is parsed once, collecting the font sizes needed
    # for the median as well as the lines that could be headings.
    for pnum, page in enumerate(doc):
        pw, ph = page.rect.width, page.rect.height
        for block in page.get_text("dict")["blocks"]:
            for line in block.get("lines", []):
                spans = line["spans"]
                all_sizes.extend(s["size"] for s in spans)
                text = "".join(s["text"] for s in spans).strip()
                if is_chinese_heading(text):
                    heading_lines.append((pnum, pw, ph, text, line))

    if not all_sizes:
        return []

    global_median_size = statistics.median(all_sizes)

    # Keep only the best-scoring heading per page.
    page_best = {}
    for pnum, pw, ph, text, line in heading_lines:
        score = score_heading_line(line, global_median_size, pw, ph)
        if score < score_threshold:
            continue
        if pnum not in page_best or score > page_best[pnum][1]:
            page_best[pnum] = (text, score)

    return [(page_best[pnum][0], pnum) for pnum in sorted(page_best)]


def extract_chapters(
    pdf_bytes: bytes,
    score_threshold: int = 3,
    numbered_only: bool = False  # if True, keep only bookmark titles starting with a Chinese ordinal
) -> List[Dict]:
    """Extract top-level chapters and their page ranges from a PDF.

    Level-1 bookmarks are used when they yield at least one chapter;
    otherwise headings are detected from the page text.

    Args:
        pdf_bytes: The PDF file content.
        score_threshold: Minimum ``score_heading_line`` score for a text line
            to count as a chapter heading (text detection only).
        numbered_only: If True, bookmark entries whose title does not start
            with a Chinese ordinal are ignored. Text detection only ever
            considers such titles.

    Returns:
        A list of dicts with the keys "title", "start" and "end", where
        "start" and "end" are display page numbers (see ``get_display_page``).
        The list is empty if no chapter could be identified.
    """
    # The context manager closes the document even if parsing fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # ===== Prefer bookmarks =====
        starts = _chapter_starts_from_toc(doc, numbered_only)

        # ===== Without usable bookmarks, fall back to text features =====
        if not starts:
            starts = _chapter_starts_from_text(doc, score_threshold)

        return _build_chapters(doc, starts)


# ----------------------------
# 5. Example run
# ----------------------------
def main() -> None:
    """Download the sample PDF and print the chapters that were found."""
    url = "https://sthjj.suzhou.gov.cn/szhbj/jsslgs/202605/3a30a2711bb5402389ffa4882a5a71d4/files/e493759743c242e69f9d6cb572f61d75.pdf"
    try:
        pdf_data = download_pdf(url)
        # numbered_only=True keeps only titles such as "一、" or "二、"
        results = extract_chapters(pdf_data, score_threshold=3, numbered_only=False)
        if results:
            print(f"共识别到 {len(results)} 个章节:")
            for ch in results:
                if ch["start"] == ch["end"]:
                    print(f" {ch['title']} → 第 {ch['start']} 页")
                else:
                    print(f" {ch['title']} → 第 {ch['start']} 页 至 第 {ch['end']} 页")
        else:
            print("未识别到任何章节。")
    except Exception as e:  # top-level boundary: report any failure to the user
        print(f"处理出错: {e}")


if __name__ == "__main__":
    main()