从入门到精通,掌握这些指令让你成为爬虫高手
欢迎大家关注此公众号,后台点击按钮【免费资料】可免费获取【Python入门30节课】电子书
此外,小庄推荐一本适合新手/小白入门的 Python 基础书籍,欢迎大家订阅,也感谢大家的支持,这是我持续更新的动力。
作为一名Python爬虫工程师,你需要掌握一系列核心指令和库,才能高效地从互联网上获取数据。本文将系统性地介绍爬虫工程师必须掌握的指令,从基础到进阶,助你成为爬虫领域的专家。
# Install the third-party packages used by the snippets that follow.
pip install requests
pip install beautifulsoup4
pip install lxml
pip install selenium
pip install scrapy
pip install aiohttp
pip install parsel
pip install fake-useragent
# Install every package listed in requirements.txt.
pip install -r requirements.txt
# List the installed packages.
pip list
# Show details of the installed requests package.
pip show requests
import requests
# Fetch a page. The timeout keeps the call from blocking indefinitely when
# the server never answers (requests applies no timeout by default).
response = requests.get('https://www.example.com', timeout=5)
print(response.status_code)  # status code
print(response.text)  # body as text
print(response.content)  # body as raw bytes
print(response.headers)  # response headers
print(response.url)  # final URL
import requests
# Form fields sent in the body of the POST request.
data = {
    'username': 'admin',
    'password': '123456',
}
# The timeout keeps the call from blocking indefinitely on a silent server.
response = requests.post('https://www.example.com/login', data=data, timeout=5)
print(response.json())  # response body parsed as JSON
import requests
# Query-string parameters passed to the request via ``params``.
params = {
    'q': 'python',
    'page': 1,
}
# The timeout keeps the call from blocking indefinitely on a silent server.
response = requests.get('https://www.example.com/search', params=params, timeout=5)
import requests
# Custom request headers sent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.example.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
# The timeout keeps the call from blocking indefinitely on a silent server.
response = requests.get('https://www.example.com', headers=headers, timeout=5)
import requests
# Use the session as a context manager so it is closed when the block
# exits, even if one of the requests raises.
with requests.Session() as session:
    # Log in (a real login would normally POST credentials here).
    session.get('https://www.example.com/login', timeout=5)
    # Visit the profile page through the same session.
    response = session.get('https://www.example.com/profile', timeout=5)
import requests
# Proxy URL to use for each URL scheme.
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
# The timeout keeps the call from blocking indefinitely when the proxy or
# the target server never answers.
response = requests.get('https://www.example.com', proxies=proxies, timeout=5)
import requests
# Request with a 5-second timeout.
response = requests.get(
    'https://www.example.com',
    timeout=5,
)
from bs4 import BeautifulSoup
# Parse a small HTML string with the lxml parser.
# NOTE: this sample document only contains a <p>, so the div/a lookups
# below illustrate the call forms rather than return matches.
html = '<html><body><p>Hello World</p></body></html>'
soup = BeautifulSoup(html, 'lxml')
# Find a single element
soup.find('div', class_='content')
soup.find('div', id='main')
soup.find('a', href=True)
# Find all matching elements
soup.find_all('p')
soup.find_all('a', limit=10) # limit the number of results
# CSS selectors
soup.select('.content')
soup.select('#main')
soup.select('div.item > a')
element = soup.find('p')
print(element.text)  # get the text
print(element.get_text())  # get the text
# Subscript access raises KeyError when the attribute is absent, so only
# use it once the attribute is known to exist.
if element.get('href') is not None:
    print(element['href'])  # get an attribute
print(element.get('href', ''))  # get an attribute (with a default value)
from parsel import Selector
# Build a Selector from an HTML string.
html = '<html><body><p>Hello</p></body></html>'
sel = Selector(text=html)
# XPath selection
sel.xpath('//p/text()').get()
sel.xpath('//p/text()').getall()
# CSS selection
sel.css('p::text').get()
sel.css('p::text').getall()
# Select all p tags
sel.xpath('//p')
# Select div elements whose class attribute is "content"
sel.xpath('//div[@class="content"]')
# Select href attributes
sel.xpath('//a/@href')
# Select text
sel.xpath('//p/text()')
# class attribute contains "item"
sel.xpath('//div[contains(@class, "item")]')
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
# Configure Chrome and start the browser.
chrome_options = Options()
chrome_options.add_argument('--headless')  # headless mode
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get('https://www.example.com')
    # Find by ID
    element = driver.find_element(By.ID, 'username')
    # Find by CSS selector
    element = driver.find_element(By.CSS_SELECTOR, '.content')
    # Find by XPath
    element = driver.find_element(By.XPATH, '//div[@class="item"]')
    # Find by class name
    element = driver.find_element(By.CLASS_NAME, 'btn')
    # Find multiple elements
    elements = driver.find_elements(By.TAG_NAME, 'a')
    # Click the element
    element.click()
    # Type text
    element.send_keys('Hello')
    # Clear the input box
    element.clear()
    # Get an attribute
    element.get_attribute('href')
    # Get the text
    element.text
    # Wait for an element to appear
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'myElement'))
    )
    # Run JavaScript in the page: scroll to the bottom, then click via script.
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    driver.execute_script('arguments[0].click()', element)
finally:
    # Always shut the browser down; without this the browser process stays
    # alive when a lookup or wait above raises.
    driver.quit()
# Create a new Scrapy project and enter its directory.
scrapy startproject myproject
cd myproject
# Generate a spider named "example" for example.com.
scrapy genspider example example.com
# Open the interactive Scrapy shell on a URL.
scrapy shell 'https://www.example.com'
# Used inside the shell
response.xpath('//title/text()').get()
response.css('title::text').get()
# Run a spider; -o writes the output to the given file.
scrapy crawl myspider
scrapy crawl myspider -o output.json
scrapy crawl myspider -o output.csv
scrapy crawl myspider -o output.xml
import scrapy
class ProductItem(scrapy.Item):
    """Scrapy item declaring the ``name``, ``price`` and ``url`` fields."""

    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()
import scrapy
class ExampleSpider(scrapy.Spider):
    """Spider that starts at https://www.example.com and follows "next" links."""

    name = 'example'
    start_urls = ['https://www.example.com']

    def parse(self, response):
        """Yield one dict per ``.product`` element, then follow the next page.

        Each yielded dict carries the product's ``name`` (text of its
        ``h2``) and ``price`` (text of its ``.price`` element).
        """
        for item in response.css('.product'):
            yield {
                'name': item.css('h2::text').get(),
                'price': item.css('.price::text').get(),
            }
        # Continue with the page linked by ``a.next``, if there is one,
        # parsing it with this same callback.
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
# settings.py
# Downloader middlewares to enable, mapped to their numeric values.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    'myproject.middlewares.ProxyMiddleware': 410,
}

# Candidate User-Agent strings.
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
]
import aiohttp
import asyncio
async def fetch(url):
    """Download ``url`` with aiohttp and return the response body as text.

    A new ClientSession is opened for the call and closed when it finishes.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()
async def main():
    """Fetch three example pages concurrently and return their bodies.

    The results are returned in the same order as the URLs.
    """
    urls = [
        'https://www.example.com/page1',
        'https://www.example.com/page2',
        'https://www.example.com/page3',
    ]
    # Create one coroutine per URL and let gather() run them concurrently.
    tasks = [fetch(url) for url in urls]
    results = await asyncio.gather(*tasks)
    return results


asyncio.run(main())
import aiohttp
import asyncio
async def fetch_with_params():
    """GET https://api.example.com with query parameters and return the JSON body."""
    params = {'key': 'value'}
    async with aiohttp.ClientSession() as session:
        async with session.get('https://api.example.com', params=params) as resp:
            # Read the body while the response is still open.
            data = await resp.json()
            return data


asyncio.run(fetch_with_params())
import json
# Records to persist.
data = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes;
# indent=2 pretty-prints the output.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
import csv
# Header row followed by the data rows.
data = [['Name', 'Age'], ['Alice', 25], ['Bob', 30]]
# newline='' lets the csv module control line endings itself; the
# utf-8-sig encoding prefixes the file with a byte-order mark.
with open('data.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerows(data)
import sqlite3
# Store a row in a local SQLite database file.
conn = sqlite3.connect('data.db')
try:
    cursor = conn.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS users (name TEXT, age INTEGER)')
    # Values are bound through ? placeholders rather than formatted into the SQL.
    cursor.execute('INSERT INTO users VALUES (?, ?)', ('Alice', 25))
    conn.commit()
finally:
    # Close the connection even when one of the statements above raises.
    conn.close()
from fake_useragent import UserAgent
# Build request headers whose User-Agent comes from fake_useragent's
# ``random`` attribute.
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
}
import time
import random
# Sleep for a random duration between 1 and 3 seconds.
delay = random.uniform(1, 3)
time.sleep(delay)
# Use a third-party captcha-recognition service,
# or the ddddocr library.
import ddddocr
ocr = ddddocr.DdddOcr()
# Read the captcha image as raw bytes; 'captcha.png' is a placeholder path
# to replace with the actual image file.
with open('captcha.png', 'rb') as f:
    image_bytes = f.read()
result = ocr.classification(image_bytes)
import requests
# Cookies sent along with the request.
cookies = {
    'session_id': 'abc123',
    'user_token': 'xyz789',
}
# The timeout keeps the call from blocking indefinitely on a silent server.
response = requests.get('https://www.example.com', cookies=cookies, timeout=5)
import logging
# Module-level logger named after the current module.
logger = logging.getLogger(__name__)

# Configure the root logger to handle records at INFO level and above.
logging.basicConfig(level=logging.INFO)

logger.info('开始爬取...')
logger.error('爬取失败!')
# Install the fake-useragent package used by the snippet below.
pip install fake-useragent
from fake_useragent import UserAgent
# Print the User-Agent strings exposed by the chrome, firefox and random
# attributes, in that order.
ua = UserAgent()
for attribute in ('chrome', 'firefox', 'random'):
    print(getattr(ua, attribute))
from urllib.robotparser import RobotFileParser
# Download and parse the site's robots.txt, then ask whether any user agent
# ('*') is allowed to fetch the given page.
rp = RobotFileParser('https://www.example.com/robots.txt')
rp.read()
can_fetch = rp.can_fetch('*', 'https://www.example.com/page')
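作为Python爬虫工程师,掌握这些指令是基本功:依赖安装(pip)、网络请求(requests、aiohttp)、页面解析(BeautifulSoup、parsel/XPath)、浏览器自动化(Selenium)、爬虫框架(Scrapy)、数据存储(JSON、CSV、SQLite),以及常见的反爬应对(随机User-Agent、随机延时、Cookie、robots.txt)。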
掌握这些工具,你就能应对绝大多数爬虫场景。
关注我,获取更多Python技术干货!