🕷️ Python爬虫反反爬实战:5招搞定90%的网站反爬机制,附完整代码
📌 导语
你辛辛苦苦写的爬虫,刚跑几分钟就返回403、被封IP、看到"您访问过于频繁"?别急,这不是你技术不行——只是你没踩对反爬的点。本文从底层原理出发,手把手教你5种最实用的爬虫反反爬技巧,涵盖请求头伪装、IP代理池、请求频率控制、Selenium 反检测(应对动态渲染)和验证码处理,让你轻松拿捏90%的网站反爬机制。文末有完整项目代码,建议收藏!
一、🤔 为什么你的爬虫总被封?
在动手写代码前,我们先弄清楚一个核心问题:服务器是怎么识别你是爬虫,而不是正常用户的?
简单来说,网站的反爬虫系统会从三个维度进行检测:
| | |
|---|---|
| User-Agent、Headers、Cookie | |
| | |
| | |
| | |
知道了对方怎么检测,我们就知道怎么应对了。接下来,我们逐一击破。
二、🛡️ 第一招:请求头伪装——最基础也最关键
很多爬虫新手连 User-Agent 都不换,直接用 requests 默认的 python-requests/2.x,这等于在脸上写着"我是爬虫"。
2.1 轮换 User-Agent
import requests
from random import choice
# Pool of real-browser User-Agent strings; get_headers() picks one per call.
USER_AGENTS = [
    # Chrome on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    # Chrome on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    # Edge on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
    # Firefox on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    # Safari on iPhone (this is a mobile UA, not desktop Safari)
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1",
]


def get_headers():
    """Build a browser-like set of request headers.

    Returns:
        dict: header name -> value. ``User-Agent`` is drawn at random from
        ``USER_AGENTS``; the remaining headers are a fixed set that mimics a
        top-level page navigation.
    """
    return {
        "User-Agent": choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        # NOTE: "br" is advertised here; requests/urllib3 can only decode
        # Brotli bodies when the optional brotli/brotlicffi package is
        # installed, otherwise resp.text will be unreadable for such replies.
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
# Usage: send one GET with the spoofed headers.
# An explicit timeout keeps the call from blocking forever on a stalled server
# (requests applies no timeout by default).
resp = requests.get("https://example.com", headers=get_headers(), timeout=10)
💡 关键点:仅靠 User-Agent 远远不够!现代反爬系统会检查 Accept-*、Sec-Fetch-* 等十几个请求头。上面的代码模拟了一个完整浏览器的请求头组合。
2.2 Cookie 策略
有些网站第一次访问时服务器会设置 Cookie,第二次请求时必须携带。用 Session 对象自动管理:
# A Session stores the cookies the server sets and replays them automatically.
session = requests.Session()
session.headers.update(get_headers())
# First request: gives the server the chance to set its cookies.
# Timeouts are explicit because requests never times out by default.
resp = session.get("https://example.com/login", timeout=10)
# Later requests on the same session carry those cookies.
resp2 = session.get("https://example.com/dashboard", timeout=10)
三、🌐 第二招:IP代理池——让封IP成为过去式
当你的爬虫需要爬取大量数据时,单 IP 高频请求迟早会被封。这时候就需要 IP代理池。
3.1 免费代理采集 + 验证
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict
class ProxyPool:
    """Minimal in-memory pool of HTTP proxies.

    Each proxy is stored as a ``requests``-style mapping
    (``{"http": ..., "https": ...}``) in ``self.proxies``.
    """

    def __init__(self):
        # Proxies that passed the most recent validation run.
        self.proxies: List[Dict[str, str]] = []
        # Endpoint used to probe a proxy, and the per-probe timeout (seconds).
        self._test_url = "https://httpbin.org/ip"
        self._test_timeout = 5

    def fetch_free_proxies(self) -> List[Dict[str, str]]:
        """Scrape candidate proxies from a free proxy listing page.

        Returns:
            A list of ``{"http": ..., "https": ...}`` mappings, one per table
            row that has at least two cells (IP, port). Not validated yet.
        """
        url = "https://www.kuaidaili.com/free/inha/"
        resp = requests.get(url, headers=get_headers(), timeout=10)
        # The selectors below are an example; adapt them to the real page.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")
        proxy_list = []
        for tr in soup.select("table tbody tr"):
            tds = tr.select("td")
            if len(tds) < 2:
                continue
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            if not ip or not port:
                # Blank cells would produce an unusable "http://:" address.
                continue
            address = f"http://{ip}:{port}"
            # Both schemes are tunnelled through the same plain-HTTP proxy.
            proxy_list.append({"http": address, "https": address})
        return proxy_list

    def validate_proxy(self, proxy: Dict[str, str]) -> bool:
        """Return True when a probe request through ``proxy`` answers 200."""
        try:
            resp = requests.get(
                self._test_url,
                proxies=proxy,
                timeout=self._test_timeout,
            )
        except Exception:
            # Any failure (refused, timeout, malformed address, ...) means the
            # proxy is unusable. Unlike a bare ``except:``, this still lets
            # KeyboardInterrupt / SystemExit propagate.
            return False
        return resp.status_code == 200

    def refresh(self, max_workers=20):
        """Re-scrape candidates and keep only those that pass validation.

        Args:
            max_workers: number of threads used to probe proxies in parallel.
        """
        raw_proxies = self.fetch_free_proxies()
        valid_proxies = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Map each future back to the proxy it is probing.
            future_map = {
                executor.submit(self.validate_proxy, p): p for p in raw_proxies
            }
            for future in as_completed(future_map):
                if future.result():
                    valid_proxies.append(future_map[future])
        self.proxies = valid_proxies
        print(f"✅ 代理池刷新完成:可用 {len(self.proxies)}/{len(raw_proxies)}")

    def get_random_proxy(self) -> Dict[str, str]:
        """Return a random proxy from the pool.

        Raises:
            RuntimeError: if the pool is empty.
        """
        if not self.proxies:
            raise RuntimeError("代理池为空,请先调用 refresh()")
        return choice(self.proxies)

    def remove_proxy(self, proxy: Dict[str, str]):
        """Drop ``proxy`` from the pool; a proxy that is not present is ignored."""
        try:
            self.proxies.remove(proxy)
        except ValueError:
            pass
# Usage example: build the pool, then route requests through random proxies.
pool = ProxyPool()
pool.refresh()
for i in range(100):
    # Pick the proxy outside the request's try block so the except clause
    # below can never see an unbound or stale `proxy`.
    try:
        proxy = pool.get_random_proxy()
    except RuntimeError:
        # Pool exhausted: no later iteration can succeed, so stop.
        break
    try:
        resp = requests.get(
            "https://httpbin.org/ip",
            proxies=proxy,
            headers=get_headers(),
            timeout=10,
        )
        print(f"第{i+1}次请求,IP: {resp.json()['origin']}")
    except (requests.RequestException, ValueError, KeyError):
        # Transport failure or an unparseable reply: evict the proxy used.
        pool.remove_proxy(proxy)
⚠️ 注意:免费代理质量参差不齐,生产环境建议使用付费代理服务(如快代理、芝麻代理等),稳定性和速度有保障。
四、🔄 第三招:请求频率控制——别像个机器人
如果你每秒发 100 个请求,正常用户不可能做到,服务器一眼就能识别。
4.1 随机延时 + 自动退避
import time
import random
from functools import wraps
def polite_crawl(
    min_delay: float = 1.0,
    max_delay: float = 3.0,
    max_retries: int = 3,
    backoff_factor: float = 2.0
):
    """Decorator factory that paces a fetch function and retries failures.

    Args:
        min_delay: lower bound (seconds) of the random pause after a success.
        max_delay: upper bound (seconds) of the random pause after a success.
        max_retries: retries allowed after the first attempt.
        backoff_factor: base of the exponential back-off between retries.

    Returns:
        A decorator. The wrapped function returns whatever the original
        returns.

    Raises:
        requests.ConnectionError, requests.Timeout: re-raised once all
            attempts are used up.
        requests.HTTPError: re-raised immediately for any status other than
            429, and for 429 once all attempts are used up.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    result = func(*args, **kwargs)
                except (requests.ConnectionError, requests.Timeout):
                    if attempt == max_retries:
                        raise
                    # Exponential back-off plus jitter so that retries from
                    # parallel workers do not line up.
                    wait = backoff_factor ** attempt + random.uniform(0, 1)
                    print(f"⚠️ 请求失败,{wait:.1f}s 后重试(第{attempt+1}次)")
                    time.sleep(wait)
                except requests.HTTPError as e:
                    response = e.response
                    rate_limited = (
                        response is not None and response.status_code == 429
                    )
                    # Out of attempts: surface the error instead of sleeping
                    # once more and silently returning None.
                    if not rate_limited or attempt == max_retries:
                        raise
                    fallback = int(backoff_factor ** attempt)
                    try:
                        retry_after = int(
                            response.headers.get("Retry-After", fallback)
                        )
                    except (TypeError, ValueError):
                        # Retry-After may also be an HTTP-date; fall back to
                        # our own back-off rather than crash on int().
                        retry_after = fallback
                    retry_after = max(retry_after, 0)  # sleep() rejects < 0
                    print(f"🚫 被限流,等待 {retry_after}s...")
                    time.sleep(retry_after)
                else:
                    # Success: pause like a human reader before returning.
                    delay = random.uniform(min_delay, max_delay)
                    time.sleep(delay)
                    return result
            # Only reachable when max_retries < 0 (func was never called).
            return None
        return wrapper
    return decorator
# Usage example: pace and retry every page fetch via the decorator.
@polite_crawl(min_delay=1.5, max_delay=4.0)
def fetch_page(url: str) -> str:
    """Download ``url`` and return the response body as text.

    ``raise_for_status()`` turns 4xx/5xx replies into ``requests.HTTPError``
    so the decorator can react to them (e.g. to a 429).
    """
    resp = requests.get(url, headers=get_headers(), timeout=10)
    resp.raise_for_status()
    return resp.text
💡 核心逻辑:每次请求后随机等待 1.5~4 秒,失败后指数退避重试,遇到 429 状态码按服务器要求等待。这样既不会太快被封,也不会因为一次失败就放弃。
五、🖥️ 第四招:Selenium 反检测——搞定JS动态渲染
现在越来越多的网站使用 JavaScript 动态加载数据,直接用 requests 拿不到内容。Selenium 可以模拟真实浏览器,但网站也有检测手段。
5.1 隐藏 WebDriver 特征
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def create_stealth_driver():
    """Create a Chrome WebDriver with common automation tell-tales masked.

    Returns:
        A ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when done.
    """
    options = Options()
    # Drop the Blink "AutomationControlled" feature and the automation switch
    # and extension that Chrome enables under WebDriver.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    # Window size matching a common real screen resolution.
    options.add_argument("--window-size=1920,1080")
    # Intended to hide the "controlled by automated software" banner.
    # NOTE(review): recent Chrome builds may ignore this flag - confirm.
    options.add_argument("--disable-infobars")
    # Flags commonly needed in containers / restricted environments.
    # NOTE(review): --no-sandbox turns off Chrome's sandbox; drop it when the
    # environment does not require it.
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=options)
    try:
        # Register a script that runs before any page script on every new
        # document, so the overrides are in place when detection code runs.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                // 覆盖 WebDriver 检测属性
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
                // 覆盖 Chrome 检测
                Object.defineProperty(navigator, 'plugins', {
                    get: () => [1, 2, 3, 4, 5]
                });
                // 覆盖 languages 属性
                Object.defineProperty(navigator, 'languages', {
                    get: () => ['zh-CN', 'zh', 'en']
                });
                // 模拟 Canvas 指纹(避免 Canvas fingerprinting)
                const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
                HTMLCanvasElement.prototype.toDataURL = function(type) {
                    const context = this.getContext('2d');
                    // getContext('2d') is null on WebGL canvases, and
                    // getImageData throws on empty or tainted canvases;
                    // in those cases skip the noise instead of breaking
                    // the page's own toDataURL call.
                    if (context && this.width > 0 && this.height > 0) {
                        try {
                            // 添加微小随机噪声,规避指纹追踪
                            const shift = Math.floor(Math.random() * 10) - 5;
                            const imageData = context.getImageData(0, 0, this.width, this.height);
                            for (let i = 0; i < imageData.data.length; i += 4) {
                                imageData.data[i] = imageData.data[i] + shift;
                            }
                            context.putImageData(imageData, 0, 0);
                        } catch (e) {
                            // leave the canvas untouched
                        }
                    }
                    return originalToDataURL.apply(this, arguments);
                };
            """
        })
    except Exception:
        # Do not leak a running browser if the injection fails.
        driver.quit()
        raise
    return driver
# Usage example: open a bot-detection test page with the stealth driver.
driver = create_stealth_driver()
try:
    driver.get("https://bot.sannysoft.com/")  # detection test page
    print("页面标题:", driver.title)
    # Text of the fully rendered page, including JS-loaded content.
    content = driver.find_element("tag name", "body").text
finally:
    # Always shut the browser down, even if navigation or lookup fails.
    driver.quit()
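🎯 为什么这招有效? 网站通过检查 navigator.webdriver、navigator.plugins.length、chrome.runtime 等 JS 属性来判断是否自动化。上述代码在页面加载前就覆盖了其中的 navigator.webdriver、navigator.plugins 和 navigator.languages,并给 Canvas 指纹加入随机噪声,让浏览器看起来更像真人用户;chrome.runtime 等其余检测点需要另行处理。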
5.2 模拟人类操作行为
from selenium.webdriver.common.action_chains import ActionChains
import random
def human_like_click(driver, element):
    """Click ``element`` the way a person would: move, hesitate, click, linger.

    Args:
        driver: the Selenium WebDriver that owns ``element``.
        element: the WebElement to click.
    """
    # 1. Move onto the element, landing a few pixels off its reference point.
    x_offset = random.randint(-5, 5)
    y_offset = random.randint(-5, 5)
    ActionChains(driver).move_to_element_with_offset(
        element, x_offset, y_offset
    ).perform()
    # 2. Short pause, as if looking before clicking.
    time.sleep(random.uniform(0.1, 0.3))
    # 3. Click at the current pointer position. Passing the element to
    #    click() would first move the pointer back to the element's centre
    #    and throw away the random offset chosen above.
    ActionChains(driver).click().perform()
    # 4. Linger after the click.
    time.sleep(random.uniform(0.3, 0.8))
def human_like_scroll(driver, target_scroll_px: int = None):
    """Scroll down in small random steps with pauses between them.

    Args:
        driver: the Selenium WebDriver to scroll.
        target_scroll_px: absolute vertical offset to reach; when ``None`` a
            random target between 300 and 1500 px is chosen. Nothing happens
            if the page is already at or past the target.
    """
    current = driver.execute_script("return window.pageYOffset")
    # `is None` rather than `or`: an explicit 0 is a real target, not "unset".
    if target_scroll_px is None:
        target = random.randint(300, 1500)
    else:
        target = target_scroll_px
    while current < target:
        # Advance by a random small distance, never past the target.
        step = random.randint(100, 300)
        current = min(current + step, target)
        driver.execute_script(f"window.scrollTo(0, {current})")
        time.sleep(random.uniform(0.3, 0.8))
六、🔐 第五招:验证码绕过——最后的堡垒
如果你已经做了前四招还是被拦,大概率是遇到验证码了。根据不同验证码类型,有不同的策略:
6.1 文字验证码 → OCR 识别
import ddddocr # pip install ddddocr
def solve_text_captcha(image_path: str) -> str:
    """Recognise the characters in a text captcha image.

    Args:
        image_path: path of the captcha image file.

    Returns:
        The text returned by the OCR engine.
    """
    ocr = ddddocr.DdddOcr(show_ad=False)
    # The classifier takes the raw image bytes, hence binary mode.
    with open(image_path, "rb") as f:
        img_bytes = f.read()
    return ocr.classification(img_bytes)
# Usage example: recognise a captcha image saved on disk.
code = solve_text_captcha("captcha.png")
print(f"识别结果: {code}")
# Next step (illustrative): type the recognised code into the form and submit.
# driver.find_element(By.ID, "captcha-input").send_keys(code)
6.2 滑块验证码 → 轨迹模拟
from selenium.webdriver import ActionChains
def slide_captcha(driver, slider_element, track_element=None, distance=258):
    """Drag a slider captcha along a human-looking track.

    People slide in a "fast - slow - settle" pattern, neither at constant
    speed nor instantly; the generated track imitates that.

    Args:
        driver: the Selenium WebDriver that owns the slider.
        slider_element: the draggable slider handle.
        track_element: unused; kept for interface compatibility.
        distance: horizontal distance to drag, in pixels. The default 258 is
            only an example value; real use must compute the gap position.
    """
    distance = int(round(distance))  # pointer offsets must be integers

    def generate_track(distance):
        """Return the cumulative x positions of the drag, ending at distance."""
        track = []
        current = 0
        mid = distance * 3 / 4  # the first 3/4 is covered quickly
        # Phase 1: fast.
        while current < mid:
            current += random.randint(2, 5)
            track.append(current)
        # Phase 2: slow down while approaching the target.
        while current < distance - 10:
            current += random.randint(1, 3)
            track.append(current)
        # Phase 3: pixel-by-pixel fine adjustment.
        while current < distance:
            current += 1
            track.append(current)
        # Sometimes overshoot slightly, as people often do.
        if random.random() > 0.5:
            track.append(distance + random.randint(1, 3))
        # Always settle exactly on the target.
        if not track or track[-1] != distance:
            track.append(distance)
        return track

    # Press and hold the handle. Each stage uses its own ActionChains so that
    # nothing queued earlier can be replayed by a later perform().
    ActionChains(driver).click_and_hold(slider_element).perform()

    drag = ActionChains(driver)
    previous = 0
    for position in generate_track(distance):
        # Move by the delta between consecutive track points; the track holds
        # cumulative positions, so a fixed 1 px step would ignore it.
        drag.move_by_offset(
            xoffset=position - previous, yoffset=random.randint(-1, 1)
        )
        # Queue the delay inside the chain: a time.sleep() here would run
        # while the moves are only being queued, not while they execute.
        drag.pause(random.uniform(0.002, 0.01))
        previous = position
    # Brief hesitation, then let go of the slider.
    drag.pause(random.uniform(0.1, 0.2))
    drag.release()
    drag.perform()
🤫 进阶提示:高级滑块验证码(如极验4.0、某盾)还会检测鼠标轨迹的贝塞尔曲线特征、加速度变化、点击压力(Web API)等。这种情况下,建议使用付费的第三方打码平台。
七、🎯 实战案例:爬取某招聘网站职位信息
整合上面所有技巧,来一个完整的实战:
"""
爬取某招聘网站Python岗位信息
使用:请求头伪装 + IP代理 + 随机延时 + 失败页跳过(本示例未使用 Selenium,也不做自动重试)
"""
import json
import time
import random
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
class JobSpider:
    """Job-listing spider combining spoofed headers, proxies and pacing."""

    def __init__(self, proxy_pool=None):
        """
        Args:
            proxy_pool: optional pool offering ``get_random_proxy()`` and
                ``remove_proxy(proxy)``; when omitted, requests go out
                directly.
        """
        # One session for the whole crawl so cookies persist across pages.
        self.session = requests.Session()
        self.session.headers.update(get_headers())
        self.proxy_pool = proxy_pool

    def fetch_jobs(self, keyword: str, pages: int = 5) -> List[Dict]:
        """Crawl ``pages`` result pages for ``keyword``.

        A page whose request fails is reported and skipped; the proxy used
        for it (if any) is removed from the pool.

        Returns:
            All jobs parsed from the pages that succeeded.
        """
        all_jobs = []
        for page in range(1, pages + 1):
            proxies = None
            try:
                # Random pause between pages.
                time.sleep(random.uniform(2, 5))
                if self.proxy_pool:
                    try:
                        proxies = self.proxy_pool.get_random_proxy()
                    except RuntimeError:
                        # Empty pool: fall back to a direct connection.
                        proxies = None
                # Pass the query via `params` so requests URL-encodes the
                # keyword (spaces, '+', non-ASCII text) correctly.
                resp = self.session.get(
                    "https://www.zhipin.com/web/geek/job",
                    params={"query": keyword, "page": page},
                    proxies=proxies,
                    timeout=15,
                )
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")
                jobs = self._parse_job_list(soup)
                all_jobs.extend(jobs)
                print(f"📄 第{page}页完成,获取到 {len(jobs)} 个职位")
            except requests.RequestException as e:
                print(f"❌ 第{page}页失败: {e}")
                if self.proxy_pool and proxies:
                    self.proxy_pool.remove_proxy(proxies)
                continue
        return all_jobs

    def _parse_job_list(self, soup: "BeautifulSoup") -> List[Dict]:
        """Extract job records from a parsed result page.

        The selectors are examples and must be adapted to the real page.
        A field whose element is missing becomes ``""`` instead of raising,
        so one malformed card cannot abort the whole crawl.
        """
        selectors = {
            "title": ".job-name",
            "company": ".company-name",
            "salary": ".salary",
            "location": ".job-area",
        }
        jobs = []
        for item in soup.select(".job-list-item"):
            job = {}
            for field, selector in selectors.items():
                node = item.select_one(selector)
                job[field] = node.text.strip() if node is not None else ""
            jobs.append(job)
        return jobs
# Run the spider through the shared proxy pool.
spider = JobSpider(proxy_pool=pool)
results = spider.fetch_jobs("Python", pages=3)
# Save the results; ensure_ascii=False keeps Chinese text readable in the file.
with open("python_jobs.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print(f"🎉 共获取 {len(results)} 个职位信息,已保存到 python_jobs.json")
八、📝 总结
核心要点回顾
| | | |
|---|---|---|
| | UA轮换 + 完整Headers + Cookie管理 | |
| | | |
| | | |
| | | |
| | | |
最后的建议
**爬虫的最高境界不是技术,而是"像人"**。你越接近一个真实用户的行为,越不容易被识别。
💬 互动环节:你写爬虫时遇到过最离谱的反爬是什么?用的是什么骚操作绕过的?欢迎在评论区分享你的故事!
🏷️ 标签
#Python#爬虫#反爬虫#Selenium#BeautifulSoup#Web开发#自动化#效率工具