🕐 预计用时:2-3 小时 | 🎯 目标:掌握 HTML 结构、标签选择、CSS 选择器、find/find_all
<!-- HTML 是由标签组成的树形结构 -->
<html>
<head>
<title>我的网页</title>
</head>
<body>
<div class="container">
<h1 id="title">欢迎</h1>
<p class="intro">这是一个段落</p>
<ul>
<li>项目1</li>
<li>项目2</li>
</ul>
<a href="https://example.com" target="_blank">链接</a>
</div>
</body>
</html>
<!-- 标签 = 元素
class = CSS 类名(可重复)
id = 唯一标识
href/target = 属性 -->
# 安装: pip install beautifulsoup4
from bs4 import BeautifulSoup

# Small sample document used by the lookups below.
html = """
<html>
<head><title>测试页面</title></head>
<body>
<h1 id="main-title">Hello World</h1>
<p class="intro">欢迎来到 Python 世界</p>
<p class="content">学习爬虫很有趣</p>
</body>
</html>
"""

# Build the BeautifulSoup object with Python's built-in parser.
soup = BeautifulSoup(html, "html.parser")

# Basic operations: tags can be reached as attributes of the soup object.
title_tag = soup.title
heading = soup.h1
print(title_tag)       # <title>测试页面</title>
print(title_tag.text)  # 测试页面
print(heading)         # <h1 id="main-title">Hello World</h1>
print(heading.text)    # Hello World
print(soup.body) # 整个 body 内容
💡 解析器选择:
• html.parser — Python 内置,不需要安装
• lxml — 更快,需要安装(pip install lxml)
• html5lib — 最宽容,能处理最烂的 HTML
from bs4 import BeautifulSoup

# Two product cards, each with a name, a price and a stock status.
html = """
<div class="products">
<div class="item">
<h2>商品A</h2>
<span class="price">¥99</span>
<span class="stock">有货</span>
</div>
<div class="item">
<h2>商品B</h2>
<span class="price">¥199</span>
<span class="stock">缺货</span>
</div>
</div>
"""
soup = BeautifulSoup(html, "html.parser")

# Look up by tag name (returns the first match).
heading = soup.find("h2")
print(heading.text)  # 商品A

# Look up by class.
price_tag = soup.find("span", class_="price")
print(price_tag.text)  # ¥99

# Look up by id. The sample markup has no id attributes, so this form is
# shown for reference only:
#     soup.find("div", id="main")

# Look up by an attribute dictionary.
stock_tag = soup.find("span", attrs={"class": "stock"})
print(stock_tag.text)  # 有货

# Combine conditions: find a container first, then search inside it.
item = soup.find("div", class_="item")
item_heading = item.h2
print(item_heading.text)  # 商品A
print(item.find("span", class_="price").text) # ¥99
from bs4 import BeautifulSoup
import re

# NOTE(review): `html` is not defined in this snippet; presumably it is the
# product markup from the preceding example - confirm before running alone.
soup = BeautifulSoup(html, "html.parser")

# Find every <h2> tag.
h2s = soup.find_all("h2")
for h2 in h2s:
    print(h2.text)
# 商品A
# 商品B

# Find every <span> whose class is "price".
prices = soup.find_all("span", class_="price")
for p in prices:
    print(p.text)
# ¥99
# ¥199

# Limit the number of results returned.
first_price = soup.find_all("span", class_="price", limit=1)

# Match tag names with a regular expression.
tags = soup.find_all(re.compile(r"^h[1-6]"))  # all heading tags
for tag in tags:
    print(f"<{tag.name}>: {tag.text}")
# 提取所有商品信息
# Collect one dictionary per product card and print a summary line for each.
# NOTE(review): `soup` comes from the preceding snippet - confirm before
# running this fragment on its own.
items = soup.find_all("div", class_="item")
products = []
for item in items:
    name = item.h2.text
    price = item.find("span", class_="price").text
    stock = item.find("span", class_="stock").text
    products.append({
        "name": name,
        "price": price,
        "stock": stock,
    })
    print(f"{name}: {price} ({stock})")
# 商品A: ¥99 (有货)
# 商品B: ¥199 (缺货)
from bs4 import BeautifulSoup
# Sample markup: a menu list plus a content area with paragraphs and links.
html = """
<div class="container">
<ul id="menu">
<li class="active">首页</li>
<li>产品</li>
<li>关于</li>
</ul>
<div class="content">
<p>段落1</p>
<p>段落2</p>
<a href="/link1">链接1</a>
<a href="/link2">链接2</a>
</div>
</div>
"""
soup = BeautifulSoup(html, "html.parser")

# CSS selector syntax
print(soup.select("li"))           # every <li>
print(soup.select(".active"))      # class="active"
print(soup.select("#menu"))        # id="menu"
print(soup.select("#menu li"))     # every <li> under #menu
print(soup.select("ul > li"))      # <li> that is a direct child of a <ul>
print(soup.select("div.content"))  # <div> with class="content"
print(soup.select("a[href]"))      # <a> tags that have an href attribute

# Combined selector:
# <div class="container"> -> <ul id="menu"> -> <li class="active">
print(soup.select("div.container ul#menu li.active"))

# Print the text of each selected element.
for li in soup.select("#menu li"):
    print(li.text)
# 首页
# 产品
# 关于
💡 CSS 选择器速查:
• tag — 标签名
• .class — 类名
• #id — ID
• tag.class — 标签+类名
• A > B — A 的直接子元素 B
• A B — A 的后代元素 B
• A, B — A 或 B
• [attr] — 有某属性
from bs4 import BeautifulSoup

# One link, one image and one form input to read text and attributes from.
html = """
<a href="https://example.com" target="_blank" class="link">示例链接</a>
<img src="photo.jpg" alt="照片" width="300">
<input type="text" name="username" value="admin">
"""
soup = BeautifulSoup(html, "html.parser")

# Getting text: three spellings that print the same value for this link.
link = soup.find("a")
print(link.text)        # 示例链接
print(link.get_text())  # 示例链接
print(link.string)      # 示例链接

# Getting attributes: subscript access, get(), and the whole attribute dict.
print(link["href"])        # https://example.com
print(link.get("target"))  # _blank
print(link.attrs)  # {'href': 'https://example.com', 'target': '_blank', 'class': ['link']}

# <img> attributes, printed in order: photo.jpg, then 照片
image = soup.find("img")
for attr_name in ("src", "alt"):
    print(image[attr_name])

# <input> attributes
inp = soup.find("input")
print(inp["type"])  # text
print(inp["value"]) # admin
from bs4 import BeautifulSoup
# A <div> with three element children, used to demonstrate tree navigation.
html = """
<div id="root">
<p>第一段</p>
<span>span文本</span>
<p>第二段</p>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
root = soup.find("div", id="root")

# Walking down
print(root.children)  # direct children (an iterator)
for child in root.children:
    # Only children with a tag name are printed; presumably this filters out
    # the whitespace text nodes that sit between the tags.
    if child.name:
        print(f"子元素: <{child.name}> {child.text}")
print(root.descendants)  # all descendants (recursive)

# Walking up
p = soup.find("p")
print(p.parent)       # parent element
print(p.parent.name)  # div

# Siblings
print(p.next_sibling)         # next sibling (may be blank text)
print(p.find_next_sibling())  # next sibling that is a tag
print(p.find_next_sibling("span")) # 下一个 span 兄弟
from bs4 import BeautifulSoup
import requests
# Three news cards, each with a linked title, a date and a source.
html = """
<div class="news-list">
<div class="news-item">
<h3><a href="/news/1">Python 3.13 发布</a></h3>
<span class="time">2024-01-15</span>
<span class="source">官方博客</span>
</div>
<div class="news-item">
<h3><a href="/news/2">AI 编程助手大比拼</a></h3>
<span class="time">2024-01-14</span>
<span class="source">科技日报</span>
</div>
<div class="news-item">
<h3><a href="/news/3">机器学习入门指南</a></h3>
<span class="time">2024-01-13</span>
<span class="source">数据科学杂志</span>
</div>
</div>
"""
soup = BeautifulSoup(html, "html.parser")

# Build one record per news card.
news_list = []
for item in soup.find_all("div", class_="news-item"):
    title_tag = item.find("a")
    news_list.append({
        "title": title_tag.text,
        # get() with a default yields "" instead of None if href is absent.
        "link": title_tag.get("href", ""),
        "time": item.find("span", class_="time").text,
        "source": item.find("span", class_="source").text,
    })

# Print one summary line per record.
for news in news_list:
    print(f"📰 [{news['time']}] {news['title']} ({news['source']})")
# 📰 [2024-01-15] Python 3.13 发布 (官方博客)
# 📰 [2024-01-14] AI 编程助手大比拼 (科技日报)
# 📰 [2024-01-13] 机器学习入门指南 (数据科学杂志)
from bs4 import BeautifulSoup
# A score table: a header row plus one row per student.
html = """
<table id="scores">
<thead>
<tr><th>姓名</th><th>语文</th><th>数学</th><th>英语</th></tr>
</thead>
<tbody>
<tr><td>张三</td><td>90</td><td>95</td><td>88</td></tr>
<tr><td>李四</td><td>85</td><td>80</td><td>92</td></tr>
<tr><td>王五</td><td>78</td><td>88</td><td>76</td></tr>
</tbody>
</table>
"""
soup = BeautifulSoup(html, "html.parser")

# Extract the column headers.
headers = [th.text for th in soup.select("thead th")]
print(f"列名: {headers}")

# Extract the data rows, pairing each cell with its column header.
rows = [
    dict(zip(headers, [td.text for td in tr.find_all("td")]))
    for tr in soup.select("tbody tr")
]

for row in rows:
    # Sum only the numeric cells; the name column is skipped. isdecimal() is
    # used rather than isdigit() because isdigit() also accepts characters
    # (for example superscript digits) that int() cannot parse.
    total = sum(int(v) for v in row.values() if v.isdecimal())
    print(f" {row['姓名']}: 总分 {total}")
# 列名: ['姓名', '语文', '数学', '英语']
# 张三: 总分 273
# 李四: 总分 257
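# 王五: 总分 242
| 方法 | 作用 |
| --- | --- |
| find(tag, class_=, id=) | 查找第一个匹配的元素 |
| find_all(tag, class_=) | 查找所有匹配的元素 |
| select(css_selector) | 用 CSS 选择器查找元素 |
| tag.text | 获取文本 |
| tag["attr"] | 获取某个属性的值 |
| tag.attrs | 获取全部属性(字典) |
| tag.find_next_sibling() | 下一个标签兄弟 |
| tag.parent | 父元素 |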
🎯 练习建议:
1. 用 BeautifulSoup 解析一个真实网页,提取所有链接
2. 提取一个电商页面的商品名、价格、评分
3. 解析维基百科的一个页面,提取所有段落的标题和内容
📚 Day42 完成!明天学习 XPath + lxml — 另一种强大的解析方式
请在微信客户端打开