Course Objectives
- Master XPath syntax and expression writing
- Learn to use the lxml library for efficient parsing
- Understand the differences between XPath and CSS selectors
- Master techniques for handling complex XML and HTML documents
1. Introduction to XPath
XPath (XML Path Language) is a language for locating information in XML and HTML documents. It uses path expressions to select nodes or node-sets from a document.
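As a first taste, here is a minimal sketch (the HTML snippet is made up for illustration) of selecting nodes with path expressions:

from lxml import html

tree = html.fromstring('<div><p class="intro">Hello</p><p>World</p></div>')

# A path expression selects nodes: every <p> anywhere in the tree
print(tree.xpath('//p/text()'))                  # ['Hello', 'World']
# A predicate narrows the selection by attribute
print(tree.xpath('//p[@class="intro"]/text()'))  # ['Hello']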
1.1 Advantages of XPath
- Powerful and highly expressive
- Supports complex conditional queries
- Can perform arithmetic and string operations (see the sketch after this list)
- Natively supported by browsers
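A brief sketch of the arithmetic and string operations mentioned above (the snippet and its attribute names are made up for illustration):

from lxml import html

tree = html.fromstring("""
<ul>
  <li data-price="30">A</li>
  <li data-price="70">B</li>
</ul>
""")

# Arithmetic in a predicate: items still above 50 after a 10% discount
print(tree.xpath('//li[@data-price * 0.9 > 50]/text()'))  # ['B']

# String and numeric functions: concat() builds strings, sum() aggregates
print(tree.xpath('concat("total: ", sum(//li/@data-price))'))  # 'total: 100'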
1.2 Installing lxml
pip install lxml
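To verify the installation, you can print the library version (LXML_VERSION is exposed by lxml.etree):

from lxml import etree
print(etree.LXML_VERSION)  # a tuple such as (5, 2, 1, 0); the exact value depends on your install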
2. Basic XPath Syntax
2.1 Path Expressions

from lxml import html

# Sample HTML
html_content = """
<html>
<body>
  <div class="container">
    <h1 id="title">Main Title</h1>
    <div class="content">
      <p>First paragraph</p>
      <p>Second paragraph</p>
      <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        <li>Item 3</li>
      </ul>
    </div>
  </div>
</body>
</html>
"""

# Build the parse tree
tree = html.fromstring(html_content)

# Basic path expressions:
# /  : select from the root node
# // : select from anywhere in the document
# .  : the current node
# .. : the parent node
# @  : an attribute

# Absolute path
title = tree.xpath('/html/body/div/h1/text()')
print(title)  # ['Main Title']

# Relative path
paragraphs = tree.xpath('//p/text()')
print(paragraphs)  # ['First paragraph', 'Second paragraph']
2.2 Node Selection

# Select all div elements
divs = tree.xpath('//div')

# Note: //div[1] selects every div that is the first div child of its
# parent, not the first div in the document; for the latter use (//div)[1]
first_div = tree.xpath('(//div)[1]')

# Select the last li element
last_li = tree.xpath('//li[last()]')

# Select the first two li elements
first_two_li = tree.xpath('//li[position()<=2]')

# Select all divs that have a class attribute
divs_with_class = tree.xpath('//div[@class]')

# Select divs whose class is exactly "container"
container = tree.xpath('//div[@class="container"]')
2.3 Attribute Selection

html_content = """
<div class="article">
  <a href="https://example.com" title="Example">Link 1</a>
  <a href="https://test.com" title="Test">Link 2</a>
  <img src="image1.jpg" alt="Image 1" width="300">
  <img src="image2.png" alt="Image 2" width="400">
</div>
"""
tree = html.fromstring(html_content)

# The href attribute of every link
hrefs = tree.xpath('//a/@href')
print(hrefs)  # ['https://example.com', 'https://test.com']

# The src attribute of every image
srcs = tree.xpath('//img/@src')
print(srcs)  # ['image1.jpg', 'image2.png']

# Images whose width is greater than 300
wide_images = tree.xpath('//img[@width>300]/@src')
print(wide_images)  # ['image2.png']
3. Advanced XPath Syntax
3.1 Conditional Expressions

html_content = """
<div class="products">
  <div class="product" data-price="100">
    <h3>Product A</h3>
    <span class="price">¥100</span>
  </div>
  <div class="product" data-price="200">
    <h3>Product B</h3>
    <span class="price">¥200</span>
  </div>
  <div class="product" data-price="50">
    <h3>Product C</h3>
    <span class="price">¥50</span>
  </div>
</div>
"""
tree = html.fromstring(html_content)

# Products priced above 100
expensive_products = tree.xpath('//div[@data-price>100]/h3/text()')
print(expensive_products)  # ['Product B']

# Elements with an exact text match
product_a = tree.xpath('//h3[text()="Product A"]')

# The contains() function matches substrings
products = tree.xpath('//h3[contains(text(), "Product")]')

# The starts-with() function matches prefixes
price_elements = tree.xpath('//span[starts-with(@class, "price")]')
3.2 Axes

html_content = """
<div class="container">
  <div class="header">Header</div>
  <div class="content">
    <p>Paragraph 1</p>
    <p class="highlight">Paragraph 2</p>
    <p>Paragraph 3</p>
  </div>
  <div class="footer">Footer</div>
</div>
"""
tree = html.fromstring(html_content)

# The parent element of the highlighted paragraph
parent = tree.xpath('//p[@class="highlight"]/parent::div')

# All preceding sibling <p> elements (only one here)
preceding_sibling = tree.xpath('//p[@class="highlight"]/preceding-sibling::p/text()')
print(preceding_sibling)  # ['Paragraph 1']

# All following sibling <p> elements
following_sibling = tree.xpath('//p[@class="highlight"]/following-sibling::p/text()')
print(following_sibling)  # ['Paragraph 3']

# All ancestor elements
ancestors = tree.xpath('//p[@class="highlight"]/ancestor::*')

# All descendant elements
descendants = tree.xpath('//div[@class="content"]/descendant::*')
3.3 Using Functions

# Text functions: paragraphs whose text is not just whitespace
text_content = tree.xpath('//p[normalize-space(text())!=""]/text()')

# String length
long_text = tree.xpath('//p[string-length(text())>5]/text()')

# Position functions
first_p = tree.xpath('//p[position()=1]/text()')
last_p = tree.xpath('//p[position()=last()]/text()')

# Counting
p_count = tree.xpath('count(//p)')
print(f"Number of paragraphs: {p_count}")

# translate() maps characters; XPath 1.0 has no case functions, so this is
# the usual idiom for a case-insensitive match
matches = tree.xpath('//p[contains(translate(text(), "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), "PARAGRAPH")]')
4. The lxml Library in Detail
4.1 Parsing HTML

from lxml import html
import requests

def parse_html_with_lxml(url):
    """Parse HTML with lxml"""
    response = requests.get(url)
    # lxml automatically repairs many common HTML errors
    tree = html.fromstring(response.content)
    return tree

# Parse from a string
html_string = "<div><p>Hello World</p></div>"
tree = html.fromstring(html_string)

# Parse from a file (note: parse() returns an ElementTree, not an Element)
tree = html.parse('example.html')
4.2 Parsing XML

from lxml import etree

xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<bookstore>
  <book id="1" category="fiction">
    <title>Python Programming</title>
    <author>Zhang San</author>
    <price>59.99</price>
  </book>
  <book id="2" category="technical">
    <title>Data Structures</title>
    <author>Li Si</author>
    <price>79.99</price>
  </book>
</bookstore>"""

# Parse the XML. lxml rejects str input that carries an encoding
# declaration, so encode to bytes first.
root = etree.fromstring(xml_content.encode('utf-8'))

# All book titles
titles = root.xpath('//title/text()')
print(titles)  # ['Python Programming', 'Data Structures']

# Technical books
tech_books = root.xpath('//book[@category="technical"]/title/text()')
print(tech_books)  # ['Data Structures']

# Books priced above 60
expensive_books = root.xpath('//book[price>60]/title/text()')
print(expensive_books)  # ['Data Structures']
4.3 Namespace Handling

xml_with_ns = """<?xml version="1.0"?>
<root xmlns:book="http://example.com/book"
      xmlns:author="http://example.com/author">
  <book:catalog>
    <book:item>
      <book:title>Python Guide</book:title>
      <author:name>Wang Wu</author:name>
    </book:item>
  </book:catalog>
</root>"""

root = etree.fromstring(xml_with_ns)

# Map prefixes to namespace URIs
namespaces = {
    'book': 'http://example.com/book',
    'author': 'http://example.com/author',
}

# Query using the prefixes
titles = root.xpath('//book:title/text()', namespaces=namespaces)
authors = root.xpath('//author:name/text()', namespaces=namespaces)
print(titles)   # ['Python Guide']
print(authors)  # ['Wang Wu']
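Documents that declare a default namespace (xmlns without a prefix) are a common stumbling block: XPath 1.0 has no notion of a default namespace, so you must bind it to a prefix you invent in the namespaces mapping. A minimal sketch (the document and the 'ns' prefix are illustrative):

from lxml import etree

xml_default_ns = """<root xmlns="http://example.com/default">
  <item>value</item>
</root>"""

root = etree.fromstring(xml_default_ns)

# A bare '//item' finds nothing because <item> lives in the default namespace
ns = {'ns': 'http://example.com/default'}
print(root.xpath('//item/text()', namespaces=ns))     # []
print(root.xpath('//ns:item/text()', namespaces=ns))  # ['value']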
5. Hands-On Case Study: Scraping E-commerce Product Information

import requests
from lxml import html
import csv
import time
import random


class ProductSpider:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_product_list(self, url, max_pages=5):
        """Fetch the product list"""
        all_products = []
        for page in range(1, max_pages + 1):
            print(f"Scraping page {page}...")
            page_url = f"{url}?page={page}"
            products = self.parse_product_page(page_url)
            if not products:
                print("No more products, stopping")
                break
            all_products.extend(products)
            # Random delay to avoid anti-scraping measures
            time.sleep(random.uniform(1, 3))
        return all_products

    def parse_product_page(self, url):
        """Parse a product listing page"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            tree = html.fromstring(response.content)
            return self.extract_products(tree)
        except Exception as e:
            print(f"Failed to parse page: {e}")
            return []

    def extract_products(self, tree):
        """Extract product information"""
        products = []
        # Locate the product containers with XPath
        product_nodes = tree.xpath('//div[@class="product-item"]')
        for node in product_nodes:
            product = self.extract_single_product(node)
            if product:
                products.append(product)
        return products

    def extract_single_product(self, node):
        """Extract a single product's information"""
        try:
            # Product name
            name_nodes = node.xpath('.//h3[@class="product-title"]/a/text()')
            name = name_nodes[0].strip() if name_nodes else ''

            # Product link
            link_nodes = node.xpath('.//h3[@class="product-title"]/a/@href')
            link = link_nodes[0] if link_nodes else ''

            # Price
            price_nodes = node.xpath('.//span[@class="price"]/text()')
            price = price_nodes[0].strip() if price_nodes else ''

            # Rating
            rating_nodes = node.xpath('.//div[@class="rating"]/@data-rating')
            rating = rating_nodes[0] if rating_nodes else ''

            # Review count
            review_nodes = node.xpath('.//span[@class="review-count"]/text()')
            review_count = review_nodes[0].strip() if review_nodes else ''

            # Image
            img_nodes = node.xpath('.//img[@class="product-img"]/@src')
            image_url = img_nodes[0] if img_nodes else ''

            # Shop name
            shop_nodes = node.xpath('.//span[@class="shop-name"]/text()')
            shop_name = shop_nodes[0].strip() if shop_nodes else ''

            return {
                'name': name,
                'link': link,
                'price': price,
                'rating': rating,
                'review_count': review_count,
                'image_url': image_url,
                'shop_name': shop_name,
            }
        except Exception as e:
            print(f"Failed to extract product info: {e}")
            return None

    def get_product_detail(self, product_url):
        """Fetch a product's detail page"""
        try:
            response = self.session.get(product_url, timeout=10)
            response.raise_for_status()
            tree = html.fromstring(response.content)

            # Full description
            desc_nodes = tree.xpath('//div[@class="product-description"]//text()')
            description = ''.join(desc_nodes).strip()

            # Specification table
            specs = {}
            spec_rows = tree.xpath('//table[@class="specs-table"]//tr')
            for row in spec_rows:
                key_nodes = row.xpath('./td[1]/text()')
                value_nodes = row.xpath('./td[2]/text()')
                if key_nodes and value_nodes:
                    specs[key_nodes[0].strip()] = value_nodes[0].strip()

            # Product image gallery
            image_nodes = tree.xpath('//div[@class="product-images"]//img/@src')
            images = [img for img in image_nodes if img]

            return {
                'description': description,
                'specifications': specs,
                'images': images,
            }
        except Exception as e:
            print(f"Failed to fetch product detail: {e}")
            return {}

    def save_to_csv(self, products, filename='products.csv'):
        """Save results to a CSV file"""
        if not products:
            print("No data to save")
            return
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['name', 'link', 'price', 'rating', 'review_count',
                          'image_url', 'shop_name']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for product in products:
                writer.writerow(product)
        print(f"Data saved to {filename}")


# Usage example
if __name__ == "__main__":
    spider = ProductSpider()

    # Scrape the product list
    products = spider.get_product_list('https://example-shop.com/search?q=手机')

    # Save the basic information
    spider.save_to_csv(products)

    # Fetch details for the first 5 products
    for i, product in enumerate(products[:5]):
        print(f"Fetching details for product {i+1}...")
        detail = spider.get_product_detail(product['link'])
        product.update(detail)
        time.sleep(random.uniform(1, 2))
6. XPath Debugging Tips
6.1 Debugging in the Browser

# Test XPath expressions in the browser's developer tools:
# 1. Press F12 to open the developer tools
# 2. In the Console, type:
#    $x('//div[@class="content"]')      // test an XPath expression
#    $x('//div[@class="content"]')[0]   // get the first matching element
6.2 Debugging in Python

def debug_xpath(tree, xpath_expr):
    """Debug an XPath expression"""
    try:
        result = tree.xpath(xpath_expr)
        print(f"XPath: {xpath_expr}")
        print(f"Number of results: {len(result)}")
        if result:
            print("First 3 results:")
            for i, item in enumerate(result[:3]):
                if hasattr(item, 'text'):
                    print(f"  {i+1}: {item.text}")
                elif hasattr(item, 'tag'):
                    print(f"  {i+1}: <{item.tag}>")
                else:
                    print(f"  {i+1}: {item}")
        else:
            print("No matching elements found")
    except Exception as e:
        print(f"Invalid XPath expression: {e}")

# Usage example
tree = html.fromstring(html_content)
debug_xpath(tree, '//div[@class="product"]//h3/text()')
7. Performance Optimization
7.1 Choosing the Right Parser

# lxml is generally much faster than BeautifulSoup
from lxml import html
from bs4 import BeautifulSoup
import time

def benchmark_parsers(html_content, iterations=1000):
    """Compare parser performance"""
    # lxml
    start_time = time.time()
    for _ in range(iterations):
        tree = html.fromstring(html_content)
        titles = tree.xpath('//h1/text()')
    lxml_time = time.time() - start_time
    print(f"lxml: {lxml_time:.4f}s")

    # BeautifulSoup
    start_time = time.time()
    for _ in range(iterations):
        soup = BeautifulSoup(html_content, 'html.parser')
        titles = soup.find_all('h1')
    bs4_time = time.time() - start_time
    print(f"BeautifulSoup: {bs4_time:.4f}s")

    print(f"lxml is {bs4_time/lxml_time:.2f}x faster than BeautifulSoup")
7.2 XPath Optimization Tips

# Before: an inefficient XPath with wildcard tags everywhere
slow_xpath = '//*[@class="content"]//*[@class="item"]//*[@class="title"]'

# After: a more specific path with explicit tag names
fast_xpath = '//div[@class="content"]//div[@class="item"]/h3[@class="title"]'

# Prefer a concrete index over last() when the position is known
# Slower: //li[last()]
# Faster: //li[3]  (if you know the exact position)

# Avoid a leading // when the exact path is known
# Slower: //div//span//text()
# Faster: /html/body/div/span/text()
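When the same expression is evaluated many times, lxml can also precompile it with etree.XPath, which avoids re-parsing the expression on every call. A brief sketch:

from lxml import etree, html

# Compile the expression once...
get_titles = etree.XPath('//h3[@class="title"]/text()')

# ...and reuse the compiled object across many documents
tree = html.fromstring('<div><h3 class="title">Hello</h3></div>')
print(get_titles(tree))  # ['Hello']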
8. Practice Exercises
Exercise 1: Scrape a News Site
Use XPath to scrape a news site's article list, extracting the title, link, publication time, and other fields.
Exercise 2: Parse XML Data
Process an XML file containing product information and extract the details of every product.
Exercise 3: Parse a Complex Table
Use XPath to parse a complex HTML table containing merged cells; a starter sketch follows.
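As a starting point for Exercise 3, here is a minimal sketch of handling colspan when flattening a table row (the sample table is made up; real merged-cell tables also need rowspan tracking):

from lxml import html

table = html.fromstring("""
<table>
  <tr><td colspan="2">A</td><td>B</td></tr>
  <tr><td>C</td><td>D</td><td>E</td></tr>
</table>""")

for row in table.xpath('.//tr'):
    cells = []
    for td in row.xpath('./td'):
        text = td.text_content().strip()
        # Repeat the value so merged cells keep the columns aligned
        cells.extend([text] * int(td.get('colspan', 1)))
    print(cells)
# ['A', 'A', 'B']
# ['C', 'D', 'E']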
9. Lesson Summary
In this lesson we covered:
- XPath syntax and basic expressions
- Advanced XPath features and functions
- How to use the lxml library
- Techniques for parsing XML and HTML
- Namespace handling
- A hands-on case study and performance optimization
10. Coming Up Next
In the next lesson we will cover:
- Regular expressions in web scraping
- Data cleaning and preprocessing techniques
- Handling various data formats
- Data validation and quality control
11. Homework
- Use XPath to scrape product information from an e-commerce site
- Practice writing complex XPath expressions
- Compare the performance of XPath and CSS selectors (see the sketch after this list)
- Process an XML document that uses namespaces
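For the third homework item, lxml supports CSS selectors through the cssselect package (pip install cssselect), so both can be timed against the same tree. A minimal sketch (the sample document and loop counts are illustrative):

import time
from lxml import html

tree = html.fromstring('<div>' + '<p class="row">Hi</p>' * 200 + '</div>')

start = time.time()
for _ in range(5000):
    tree.xpath('//p[@class="row"]')
print(f"XPath:        {time.time() - start:.3f}s")

start = time.time()
for _ in range(5000):
    tree.cssselect('p.row')  # requires the cssselect package
print(f"CSS selector: {time.time() - start:.3f}s")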
Tip: XPath is a powerful data-extraction tool; a solid command of its syntax can greatly improve both the efficiency and the accuracy of your scrapers.