How to Scrape Web Data
Web scraping is a common technical task. Below is a walkthrough of web scraping methods, from the basics to more advanced techniques.
1. Basic Web Scraping Methods
1.1 Using the Python requests library
import requests

url = "https://example.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    html_content = response.text
    print(html_content[:500])  # Print the first 500 characters
else:
    print(f"Request failed, status code: {response.status_code}")
1.2 Parsing HTML with BeautifulSoup
from bs4 import BeautifulSoup

# Assume html_content is the page content fetched above
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the title
title = soup.title.string if soup.title else "No title"
print(f"Page title: {title}")

# Extract all links
for link in soup.find_all('a', href=True):
    print(link['href'])
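Besides find_all, BeautifulSoup also supports CSS selectors via select() and select_one(), which are often more concise. A small sketch; the selectors below are placeholders, not selectors from any real page:

# Select elements with CSS selectors (placeholder selectors)
for heading in soup.select('h2.article-title'):
    print(heading.get_text(strip=True))

# Attributes are read the same way as with find_all
first_nav_link = soup.select_one('nav a')
if first_nav_link:
    print(first_nav_link.get('href'))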
2. Handling Dynamically Loaded Content
2.1 Using Selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

chrome_options = Options()
chrome_options.add_argument("--headless")  # Headless mode

driver = webdriver.Chrome(options=chrome_options)
driver.get("https://example.com")

# Wait for JavaScript to load
time.sleep(2)

# Get the rendered page source
html_content = driver.page_source

# Continue parsing with BeautifulSoup if needed
soup = BeautifulSoup(html_content, 'html.parser')
# ...parsing logic...

driver.quit()  # Close the browser
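A fixed time.sleep() either wastes time or races the page. Selenium's explicit waits are usually more reliable; a minimal sketch, assuming the page exposes an element with id "content" (the same assumption as the Pyppeteer example in the next subsection):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block until the target element appears, or raise TimeoutException after 10 seconds
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "content"))
)
html_content = driver.page_source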
2.2 Using Pyppeteer (headless Chrome)
import asyncio
from pyppeteer import launch

async def scrape():
    browser = await launch(headless=True)
    page = await browser.newPage()
    await page.goto('https://example.com')
    # Wait for a specific element to appear
    await page.waitForSelector('#content')
    content = await page.content()
    # ...process content...
    await browser.close()
    return content

html_content = asyncio.get_event_loop().run_until_complete(scrape())
3. Dealing with Common Anti-Scraping Mechanisms
3.1 Using proxy IPs
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}

response = requests.get(url, headers=headers, proxies=proxies)
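With a pool of proxies, rotating them per request spreads the load and lowers the chance of a single IP being blocked. A minimal sketch, assuming PROXY_POOL holds working proxy addresses (the ones below are placeholders):

import random
import requests

# Placeholder proxy addresses; replace with your own pool
PROXY_POOL = [
    'http://10.10.1.10:3128',
    'http://10.10.1.11:3128',
    'http://10.10.1.12:3128',
]

def fetch_with_random_proxy(url, headers):
    # Pick a proxy at random for each request
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)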
3.2 Randomizing the User-Agent
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'User-Agent': ua.random}
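If fake_useragent is unavailable or fails to load its data, a hard-coded fallback list works without any extra dependency. The strings below are only examples of common User-Agent formats:

import random

# A few example User-Agent strings to rotate through (illustrative values)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
]

headers = {'User-Agent': random.choice(USER_AGENTS)}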
3.3 Handling CAPTCHAs
# Use a third-party CAPTCHA-recognition service
def solve_captcha(image_url):
    # APIs from services such as 2captcha or 打码兔 can be called here
    pass
4. Using the Scrapy Framework
4.1 Creating a Scrapy project
scrapy startproject myproject
cd myproject
scrapy genspider example example.com
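Once the spider is written (next subsection), it can be run from the project directory and its output exported to a file; results.json here is just an example output name:

scrapy crawl example -o results.json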
4.2 Writing the spider
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]

    def parse(self, response):
        # Extract data
        title = response.css('title::text').get()
        links = response.css('a::attr(href)').getall()
        yield {
            'title': title,
            'links': links
        }
        # Follow links
        for link in links:
            yield response.follow(link, callback=self.parse)
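Scrapy's settings control how politely the spider crawls. A few commonly adjusted options in settings.py; the values below are illustrative, not recommendations for any particular site:

# settings.py (illustrative values)
ROBOTSTXT_OBEY = True               # Respect robots.txt
DOWNLOAD_DELAY = 1                  # Seconds to wait between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # Limit parallel requests per domain
AUTOTHROTTLE_ENABLED = True         # Adapt the delay to server response times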
5. Storing the Data
5.1 Saving to CSV
import csv

with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'URL'])
    writer.writerow([title, url])
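When scraping produces a list of dictionaries (as in the complete example in section 8), csv.DictWriter writes them all in one pass. A small sketch, assuming data is a list like [{'title': ..., 'url': ...}, ...]:

import csv

# 'data' is assumed to be a list of dicts with 'title' and 'url' keys
with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'url'])
    writer.writeheader()
    writer.writerows(data)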
5.2 Saving to a database
import sqlite3

conn = sqlite3.connect('scraped_data.db')
cursor = conn.cursor()

# Create the table
cursor.execute('''CREATE TABLE IF NOT EXISTS pages
                  (id INTEGER PRIMARY KEY AUTOINCREMENT,
                   title TEXT, url TEXT, content TEXT)''')

# Insert data
cursor.execute("INSERT INTO pages (title, url, content) VALUES (?, ?, ?)",
               (title, url, content))

conn.commit()
conn.close()
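For many records at once, executemany() batches the inserts in a single call. A small sketch, assuming rows is a list of (title, url, content) tuples collected during the crawl:

import sqlite3

# 'rows' is assumed to be a list of (title, url, content) tuples
conn = sqlite3.connect('scraped_data.db')
conn.executemany(
    "INSERT INTO pages (title, url, content) VALUES (?, ?, ?)",
    rows
)
conn.commit()
conn.close()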
6. Advanced Techniques
6.1 Distributed crawling
Use Scrapy-Redis to build a distributed crawler:
# settings.py
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = 'redis://localhost:6379'

# spider.py
from scrapy_redis.spiders import RedisSpider

class MySpider(RedisSpider):
    name = 'myspider'
    redis_key = 'myspider:start_urls'
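With RedisSpider, start URLs are read from the Redis list named by redis_key, so each crawl is seeded by pushing URLs into Redis, for example:

redis-cli lpush myspider:start_urls https://example.com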
6.2 Incremental crawling
# Record crawled URLs in a database
def is_url_crawled(url):
    # Check the database
    return False  # Assume not yet crawled

if not is_url_crawled(url):
    # Crawling logic
    pass
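A minimal concrete version of this idea, reusing the SQLite database from section 5.2 to remember which URLs have already been fetched (the table and column names here are illustrative):

import sqlite3

conn = sqlite3.connect('scraped_data.db')
conn.execute('CREATE TABLE IF NOT EXISTS crawled_urls (url TEXT PRIMARY KEY)')

def is_url_crawled(url):
    # True if the URL has already been recorded
    row = conn.execute('SELECT 1 FROM crawled_urls WHERE url = ?', (url,)).fetchone()
    return row is not None

def mark_url_crawled(url):
    # INSERT OR IGNORE keeps repeated calls idempotent
    conn.execute('INSERT OR IGNORE INTO crawled_urls (url) VALUES (?)', (url,))
    conn.commit()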
7. Legal and Ethical Considerations
- Check robots.txt: review https://example.com/robots.txt (substitute the target site) before crawling.
- Limit the request rate: use time.sleep() to throttle the crawl speed (see the sketch after this list).
- Respect copyright: do not scrape copyright-protected content.
- Do not scrape personal or private data: comply with data-protection regulations.
- Consider using an API: prefer the official API when the site provides one.
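Python's standard library can check robots.txt directly. A minimal sketch combining that check with a polite delay; the crawler name, page URL, and 2-second delay are arbitrary examples:

import time
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

url = "https://example.com/some-page"  # placeholder page
if rp.can_fetch("MyCrawler/1.0", url):
    # ...fetch the page here...
    time.sleep(2)  # Be polite between requests
else:
    print(f"Disallowed by robots.txt: {url}")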
8. Complete Example
import requests
from bs4 import BeautifulSoup
import time
import random
from fake_useragent import UserAgent

ua = UserAgent()

def scrape_website(base_url, max_pages=10):
    visited = set()
    to_visit = {base_url}
    data = []

    while to_visit and len(data) < max_pages:
        url = to_visit.pop()
        if url in visited:
            continue
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Extract data
                title = soup.title.string if soup.title else "No title"
                data.append({'url': url, 'title': title})
                # Collect new links
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http') and base_url in href:
                        to_visit.add(href)
            visited.add(url)
            print(f"Scraped: {url}")
            # Random delay
            time.sleep(random.uniform(1, 3))
        except Exception as e:
            print(f"Failed to scrape {url}: {e}")

    return data

# Usage example
results = scrape_website("https://example.com", max_pages=5)
for item in results:
    print(f"{item['title']} - {item['url']}")
The above covers the basic methods and techniques of web scraping; in practice, you will need to adapt them to the specifics of the target site.