当前位置：首页>python>【Python】依据序号识别PDF章节
【Python】依据序号识别PDF章节

2026-07-01 23:46:59
用 Python 实现：通过网页链接下载 PDF，根据中文大写序号（一、二、三、……）识别章节标题，并据此划分各章节的起止页码范围。
import re
import statistics
from collections import defaultdict
from typing import List, Dict

import fitz  # PyMuPDF
import requests


# ----------------------------
# 1. Download the PDF from a URL
# ----------------------------
def download_pdf(url: str, timeout: int = 30) -> bytes:
    """Download the resource at *url* and return its raw bytes.

    Args:
        url: Address of the PDF file.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # With stream=True the connection is only released once the body has been
    # consumed or the response is closed; the context manager guarantees the
    # latter even when raise_for_status() raises before the body is read.
    with requests.get(url, headers=headers, timeout=timeout, stream=True) as resp:
        resp.raise_for_status()
        return resp.content


# ----------------------------
# 2. Chinese heading matching and feature scoring
# ----------------------------
CHINESE_NUM_PATTERN = re.compile(r'^[一二三四五六七八九十百千]+[、．.]')


def is_chinese_heading(text: str) -> bool:
    """Return True if *text* (after stripping) starts with a Chinese numeral
    followed by one of the separators ``、``, ``．`` or ``.``."""
    return bool(CHINESE_NUM_PATTERN.match(text.strip()))


def score_heading_line(line, global_median_size, page_width, page_height):
    """Score how strongly a text line looks like a chapter heading.

    Args:
        line: A line dict from ``page.get_text("dict")``; its ``"spans"``
            and ``"bbox"`` entries are read.
        global_median_size: Median span font size of the whole document.
        page_width: Width of the page the line is on.
        page_height: Height of the page the line is on.

    Returns:
        0 if the line text does not start with a Chinese ordinal; otherwise
        1 plus one point for each of: font size more than 10% above the
        median, a span whose font name contains "Bold", a position in the
        top 40% of the page, and a left edge within 10% of the page width.
    """
    spans = line["spans"]
    text = "".join(s["text"] for s in spans).strip()
    if not is_chinese_heading(text):
        return 0

    score = 1
    max_size = max((s["size"] for s in spans), default=0)
    if max_size > global_median_size * 1.1:
        score += 1
    if any("Bold" in s.get("font", "") for s in spans):
        score += 1

    bbox = line["bbox"]
    if bbox[1] < page_height * 0.4:  # upper part of the page
        score += 1
    if bbox[0] < page_width * 0.1:   # flush left, no indentation
        score += 1
    return score


# ----------------------------
# 3. Page number lookup (with a default)
# ----------------------------
def get_display_page(doc: fitz.Document, physical_index: int) -> str:
    """Return the page number to display for a page.

    The page's label is preferred; if it is empty or blank, the 1-based
    physical page number is returned instead.

    Args:
        doc: The open document.
        physical_index: Zero-based page index.
    """
    label = doc[physical_index].get_label()
    if label and label.strip():
        return label.strip()
    return str(physical_index + 1)  # physical index converted to 1-based


# ----------------------------
# 4. Extract chapters and their page ranges
# ----------------------------
def _starts_from_toc(doc, numbered_only: bool) -> List[tuple]:
    """Return ``(title, zero_based_page)`` for the usable level-1 bookmarks."""
    page_count = doc.page_count
    starts = []
    for level, title, page_num in doc.get_toc():
        if level != 1:
            continue
        # Bookmark pages are 1-based; values outside 1..page_count do not
        # point at a page of this document and would index the wrong page
        # (negative index) or raise IndexError.
        if not 1 <= page_num <= page_count:
            continue
        if numbered_only and not is_chinese_heading(title):
            continue
        starts.append((title, page_num - 1))
    return starts


def _starts_from_text(doc, score_threshold: int) -> List[tuple]:
    """Return ``(title, zero_based_page)`` for headings detected in the text.

    At most one heading is kept per page: the highest-scoring one, and on a
    tie the one that appears first in extraction order.
    """
    all_sizes = []
    heading_lines = []  # (page index, page width, page height, text, line)

    # Single pass over the document: gather every span size for the median
    # and remember only the lines that can be headings at all.
    for pnum, page in enumerate(doc):
        pw, ph = page.rect.width, page.rect.height
        for block in page.get_text("dict")["blocks"]:
            for line in block.get("lines", []):
                spans = line["spans"]
                all_sizes.extend(span["size"] for span in spans)
                text = "".join(span["text"] for span in spans).strip()
                if is_chinese_heading(text):
                    heading_lines.append((pnum, pw, ph, text, line))

    if not all_sizes:
        return []
    global_median_size = statistics.median(all_sizes)

    page_best = {}
    for pnum, pw, ph, text, line in heading_lines:
        score = score_heading_line(line, global_median_size, pw, ph)
        if score < score_threshold:
            continue
        if pnum not in page_best or score > page_best[pnum][1]:
            page_best[pnum] = (text, score)
    return [(page_best[pnum][0], pnum) for pnum in sorted(page_best)]


def _build_chapters(doc, starts: List[tuple]) -> List[Dict]:
    """Turn ``(title, zero_based_start_page)`` pairs into chapter dicts.

    A chapter ends on the page before the next chapter starts (never before
    its own start page); the last chapter runs to the end of the document.
    """
    last_page = doc.page_count - 1
    chapters = []
    for i, (title, start) in enumerate(starts):
        if i + 1 < len(starts):
            end = max(starts[i + 1][1] - 1, start)
        else:
            end = last_page
        chapters.append({
            "title": title,
            "start": get_display_page(doc, start),
            "end": get_display_page(doc, end)
        })
    return chapters


def extract_chapters(
    pdf_bytes: bytes,
    score_threshold: int = 3,
    numbered_only: bool = False  # if True, keep only bookmarks that start with a Chinese ordinal
) -> List[Dict]:
    """Identify the chapters of a PDF and the page range each one covers.

    Level-1 bookmarks are used when the document has usable ones. If none
    remain (no bookmarks, only entries without a valid target page, or all
    removed by *numbered_only*), headings are detected from the page text by
    scoring lines that start with a Chinese ordinal.

    Args:
        pdf_bytes: Raw content of the PDF file.
        score_threshold: Minimum ``score_heading_line`` score for a text
            line to count as a heading (text-detection path only).
        numbered_only: Keep only bookmark titles that start with a Chinese
            ordinal. Text-detected headings always do.

    Returns:
        A list of dicts with the keys ``"title"``, ``"start"`` and
        ``"end"``; start and end are display page numbers as strings.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        starts = _starts_from_toc(doc, numbered_only)
        if not starts:
            starts = _starts_from_text(doc, score_threshold)
        return _build_chapters(doc, starts)
    finally:
        # Release the document whether or not extraction succeeded.
        doc.close()


# ----------------------------
# 5. Example run
# ----------------------------
def main() -> None:
    """Download the sample PDF and print the detected chapters."""
    url = "https://sthjj.suzhou.gov.cn/szhbj/jsslgs/202605/3a30a2711bb5402389ffa4882a5a71d4/files/e493759743c242e69f9d6cb572f61d75.pdf"
    try:
        pdf_data = download_pdf(url)
        # numbered_only=True keeps only headings such as "一、" and "二、"
        results = extract_chapters(pdf_data, score_threshold=3, numbered_only=False)
        if results:
            print(f"共识别到 {len(results)} 个章节：")
            for ch in results:
                if ch["start"] == ch["end"]:
                    print(f"  {ch['title']}  →  第 {ch['start']} 页")
                else:
                    print(f"  {ch['title']}  →  第 {ch['start']} 页 至 第 {ch['end']} 页")
        else:
            print("未识别到任何章节。")
    except Exception as e:
        # Top-level boundary of the example script: report and exit quietly.
        print(f"处理出错: {e}")


if __name__ == "__main__":
    main()
本文来自网友投稿或网络内容，如有侵犯您的权益请联系我们删除，联系邮箱：wyl860211@qq.com 。
【Python】依据序号识别PDF章节

最新文章

热门文章

随机文章

【Python】依据序号识别PDF章节

被3万人看过的Python学习流程...

python办公自动化值得考的8个证书

最新文章

热门文章

随机文章