Python Web Scraping
Python is a popular language for writing web scrapers thanks to its rich ecosystem of libraries and frameworks.
Basic Scraper Components
1. Core Libraries
```python
import requests                # Send HTTP requests
from bs4 import BeautifulSoup  # Parse HTML
import csv                     # Data storage
import pandas as pd            # Data processing
import time                    # Throttle crawl speed
import random                  # Randomize requests
```
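The third-party packages can be installed with `pip install requests beautifulsoup4 pandas`; `csv`, `time`, and `random` are part of the standard library.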
2. Simple Scraper Example
```python
def simple_spider(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Example data extraction
        titles = [h1.text for h1 in soup.find_all('h1')]
        links = [a['href'] for a in soup.find_all('a', href=True)]
        return titles, links
    except Exception as e:
        print(f"Scraping failed: {e}")
        return [], []
```
Advanced Scraping Techniques
1. Handling Dynamic Content
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

def dynamic_spider(url):
    options = Options()
    options.add_argument('--headless')  # Headless mode (options.headless is deprecated in Selenium 4)
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(3)  # Wait for the page to load
        # Use Selenium to extract dynamically loaded content
        elements = driver.find_elements(By.CSS_SELECTOR, '.dynamic-content')
        data = [el.text for el in elements]
        return data
    finally:
        driver.quit()
```
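A fixed `time.sleep(3)` is fragile: too short and the content is missed, too long and the crawl slows down. A minimal sketch using an explicit wait instead, assuming the same hypothetical `.dynamic-content` selector:

```python
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def dynamic_spider_waiting(url, timeout=10):
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Block until at least one matching element appears (raises TimeoutException otherwise)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.dynamic-content'))
        )
        return [el.text for el in driver.find_elements(By.CSS_SELECTOR, '.dynamic-content')]
    finally:
        driver.quit()
```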
2. Handling Pagination
```python
def pagination_spider(base_url, max_pages=10):
    all_data = []
    page = 1
    while page <= max_pages:
        url = f"{base_url}?page={page}"
        print(f"Scraping: {url}")
        try:
            # simple_spider returns (titles, links); pair them up per page
            titles, links = simple_spider(url)
            if not titles and not links:  # Stop when a page yields no data
                break
            all_data.extend(zip(titles, links))
            page += 1
            time.sleep(random.uniform(1, 3))  # Random delay
        except Exception as e:
            print(f"Failed to scrape page {page}: {e}")
            break
    return all_data
```
Data Storage
1. Saving to CSV
```python
def save_to_csv(data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Link'])  # Header row
        writer.writerows(data)
```
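A quick usage sketch tying the pieces above together (the URL is a placeholder):

```python
# Scrape up to 5 pages and write the (title, link) pairs to disk
rows = pagination_spider('https://example.com/articles', max_pages=5)
save_to_csv(rows, 'articles.csv')
```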
2. Saving to a Database
```python
import sqlite3

def save_to_db(data, db_name='spider_data.db'):
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    # Create the table if it does not exist
    c.execute('''CREATE TABLE IF NOT EXISTS scraped_data
                 (id INTEGER PRIMARY KEY, title TEXT, url TEXT)''')
    # Insert the rows
    for item in data:
        c.execute("INSERT INTO scraped_data (title, url) VALUES (?, ?)", item)
    conn.commit()
    conn.close()
```
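Since pandas is already imported above, the stored rows can be read back into a DataFrame for later analysis; a minimal sketch:

```python
def load_from_db(db_name='spider_data.db'):
    # Read the whole table back into a pandas DataFrame
    conn = sqlite3.connect(db_name)
    try:
        return pd.read_sql_query("SELECT id, title, url FROM scraped_data", conn)
    finally:
        conn.close()
```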
Complete Examples
Reference code: https://github.com/Python-World/python-mini-projects
1. Image Downloader
```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import os
from bs4 import BeautifulSoup
import time

# User settings (hard-coded here; swap in input() to prompt interactively)
path = "D:/Program Files/chromedriver-win64/chromedriver.exe"  # or input("Enter ChromeDriver Path: ").strip()
url = "https://image.baidu.com/"                               # or input("Enter URL: ").strip()
output_dir = "output"  # Directory where downloaded images are saved

def get_driver(path, url):
    try:
        # Configure Chrome options
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # Headless mode
        options.add_argument('--disable-gpu')
        options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        # Initialize the WebDriver
        service = Service(executable_path=path)
        driver = webdriver.Chrome(service=service, options=options)
        print("Loading page...")
        driver.get(url)
        # Wait for the page to finish loading
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )
        # Scroll the page to trigger lazy-loaded images
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
        return driver
    except Exception as e:
        print(f"Failed to initialize browser: {e}")
        return None

def get_img_links(driver):
    try:
        # Grab and parse the rendered page source
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        # Find all img tags
        img_tags = soup.find_all('img')
        # Keep only absolute http(s) image links
        img_links = []
        for img in img_tags:
            src = img.get('src', '')
            if src and (src.startswith('http://') or src.startswith('https://')):
                img_links.append(src)
        return list(set(img_links))  # Deduplicate
    except Exception as e:
        print(f"Error extracting image links: {e}")
        return []

def download_image(img_url, index):
    try:
        # Work out the file extension
        extension = os.path.splitext(img_url)[1]
        if not extension:  # URL has no extension
            extension = '.jpg'
        # Create the output directory if needed
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Download the image
        response = requests.get(img_url, stream=True, timeout=10)
        response.raise_for_status()
        # Save the image to disk
        filename = f"{output_dir}/image_{index}{extension}"
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {img_url}: {e}")
        return False

def main():
    # Start the browser
    driver = get_driver(path, url)
    if not driver:
        return
    try:
        # Collect image links
        print("Extracting image links...")
        img_links = get_img_links(driver)
        print(f"Found {len(img_links)} images")
        # Download the images
        success_count = 0
        for index, img_url in enumerate(img_links):
            if download_image(img_url, index):
                success_count += 1
        print(f"\nDownload complete! Successfully downloaded {success_count}/{len(img_links)} images")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()
```
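Note: with Selenium 4.6 and later, Selenium Manager can usually locate a matching driver automatically, so hard-coding the ChromeDriver path is often unnecessary; `webdriver.Chrome(options=options)` alone may suffice.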
2. Article Scraping
```python
from bs4 import BeautifulSoup
import requests
import csv
from urllib.parse import urljoin

def scrape_quotes(base_url, output_file='quote_list.csv'):
    """
    Scrape quotes from quotes.toscrape.com and save to CSV file.

    Args:
        base_url (str): The base URL of the website to scrape
        output_file (str): Path to the output CSV file
    """
    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
            fieldnames = ['quote', 'author', 'tags']
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()

            current_url = base_url
            while current_url:
                try:
                    # Fetch the page with timeout and error handling
                    response = requests.get(current_url, timeout=10)
                    response.raise_for_status()
                    bs = BeautifulSoup(response.text, 'html.parser')

                    # Process all quotes on the page
                    for quote in bs.find_all('div', class_='quote'):
                        try:
                            text = quote.find('span', class_='text').get_text(strip=True)
                            author = quote.find('small', class_='author').get_text(strip=True)
                            tags = [tag.get_text(strip=True)
                                    for tag in quote.find_all('a', class_='tag')]
                            writer.writerow({
                                'quote': text,
                                'author': author,
                                'tags': ', '.join(tags)  # Store tags as comma-separated string
                            })
                        except AttributeError as e:
                            print(f"Skipping malformed quote: {e}")
                            continue

                    # Find next page link
                    next_button = bs.find('li', class_='next')
                    current_url = urljoin(base_url, next_button.a['href']) if next_button else None
                except requests.RequestException as e:
                    print(f"Error fetching {current_url}: {e}")
                    break
                except Exception as e:
                    print(f"Unexpected error processing {current_url}: {e}")
                    break
    except IOError as e:
        print(f"Error writing to file {output_file}: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    else:
        print(f"Successfully scraped quotes to {output_file}")

if __name__ == '__main__':
    BASE_URL = 'http://quotes.toscrape.com'
    scrape_quotes(BASE_URL)
```
Scraping Best Practices
- Respect robots.txt: check the target site's robots.txt file
- Set a User-Agent: mimic normal browser behavior
- Limit request frequency: avoid getting banned
- Handle exceptions: network problems, page structure changes, etc.
- Use proxy IPs: for large-scale scraping
```python
# Example: checking robots.txt before scraping
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def check_robots(url, user_agent="*"):
    # Build the robots.txt URL from the site root rather than the full page URL
    parts = urlparse(url)
    rp = RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, url)
```
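For example (using a placeholder URL):

```python
target = 'https://example.com/articles'
if check_robots(target):
    titles, links = simple_spider(target)
else:
    print("Disallowed by robots.txt, skipping")
```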
Countering Anti-Scraping Measures
- Random delays: `time.sleep(random.uniform(0.5, 3))`
- Rotate User-Agents:

  ```python
  user_agents = [
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ...'
  ]
  headers = {'User-Agent': random.choice(user_agents)}
  ```

- Use a proxy pool:

  ```python
  proxies = {
      'http': 'http://proxy1.example.com:8080',
      'https': 'https://proxy2.example.com:8080'
  }
  response = requests.get(url, proxies=proxies)
  ```
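A minimal sketch combining the three techniques above into a single request helper; the User-Agent strings and proxy values are placeholders:

```python
import random
import time

import requests

# Placeholder User-Agent pool (reuse the user_agents list from above)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ...',
]

def polite_get(url, proxies=None, min_delay=0.5, max_delay=3.0):
    """Fetch a URL with a random delay, a rotating User-Agent, and optional proxies."""
    time.sleep(random.uniform(min_delay, max_delay))       # random delay
    headers = {'User-Agent': random.choice(USER_AGENTS)}   # rotate User-Agent
    return requests.get(url, headers=headers, proxies=proxies, timeout=10)
```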
Scraping Frameworks
1. Scrapy Framework Example
```python
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['http://example.com']

    def parse(self, response):
        for item in response.css('div.item'):
            yield {
                'title': item.css('h2::text').get(),
                'link': item.css('a::attr(href)').get()
            }
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
```
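A standalone spider like this can be run without a full project via `scrapy runspider myspider.py -o results.json`, assuming Scrapy is installed and the class is saved in `myspider.py`.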
Scrapy vs. Requests + BeautifulSoup
Scrapy and a Requests + BeautifulSoup scraper differ significantly in architecture, features, and suitable use cases. Below is a comparison focused on the image/article download scenario.
1. Architectural Differences

| Feature | Scrapy | Requests + BeautifulSoup |
|---|---|---|
| Architecture type | Full crawler framework (explicit project structure) | Script-style development (linear execution) |
| Componentization | Highly componentized (Spider/Middleware/Pipeline) | Each module implemented by hand |
| Built-in features | Complete request scheduling, deduplication, concurrency, etc. | Basic functionality must be written yourself |
2. Image/Article Download Comparison
Scrapy implementation advantages:

```python
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class ArticleSpider(scrapy.Spider):
    name = 'article_downloader'
    custom_settings = {
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 1,
            'myproject.pipelines.TextPipeline': 300
        },
        'IMAGES_STORE': './downloads/images'
    }

    def parse(self, response):
        yield {
            'title': response.css('h1::text').get(),
            'content': response.css('article::text').getall(),
            'image_urls': response.css('img.article-image::attr(src)').getall()
        }
```
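Note that `ImagesPipeline` requires Pillow to be installed, and by default it reads image URLs from the `image_urls` item field (writing results to `images`), which is why the yielded dict uses that key.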
Key points:
- Built-in media pipelines: image download, storage, and validation are handled automatically
- Asynchronous processing: articles and images are downloaded concurrently (16 concurrent requests by default)
- Resumable crawls: crawl state can be recorded automatically
- Automatic deduplication: the same image/article is not downloaded twice
Requests + BeautifulSoup implementation:

```python
import os
import requests
from bs4 import BeautifulSoup

def download_article(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the text content
    content = '\n'.join([p.text for p in soup.select('article p')])
    # Download the images
    os.makedirs('./downloads', exist_ok=True)  # make sure the target directory exists
    for img in soup.select('img.article-image'):
        img_url = img['src']
        img_data = requests.get(img_url).content
        with open(f'./downloads/{os.path.basename(img_url)}', 'wb') as f:
            f.write(img_data)
    return content
```
Key points:
- Synchronous and blocking: resources are downloaded one by one (image downloads block the main thread)
- Manual management: retries, deduplication, and storage all have to be handled yourself
- Highly flexible: parsing logic can be adjusted quickly
3. Core Capability Comparison

| Requirement | Scrapy solution | Requests solution |
|---|---|---|
| Concurrent downloads | Built-in Twisted async engine (high performance) | Manual multithreading/async (e.g. `aiohttp`; see the sketch below) |
| Media file downloads | Built-in ImagesPipeline/FilesPipeline | Download and storage logic written by hand |
| Request deduplication | Automatic fingerprint-based dedup | URL set maintained by hand |
| Crawl rules | CrawlSpider rules follow links automatically | Links parsed and followed manually |
| Data cleaning | Systematic processing via Item Loaders | Ad-hoc cleaning code |
| Anti-scraping countermeasures | Built-in middleware (User-Agent handling, retries, etc.) | Request-header management written by hand |
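For the Requests approach, one common way to add the missing concurrency is a thread pool; a minimal sketch using the standard library (the URL list is a placeholder):

```python
from concurrent.futures import ThreadPoolExecutor

import requests

def fetch(url):
    # One blocking request per worker thread
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return url, len(response.content)

urls = [f"https://example.com/articles?page={i}" for i in range(1, 6)]
with ThreadPoolExecutor(max_workers=8) as pool:
    for url, size in pool.map(fetch, urls):
        print(f"{url}: {size} bytes")
```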
4. Which to Choose
Use Scrapy when:
- You need to download a large number of articles and images (>1000)
- You need reliable resumable crawling
- You need periodic incremental crawls
- The project will be maintained long term
Use Requests + BS4 when:
- You are quickly grabbing a small number of articles (<100)
- You need to debug parsing logic flexibly
- The target site is simple and has no anti-scraping measures
- The scraping need is temporary
5. Hybrid Approach (Recommended)
For mixed content such as articles with images, a middle-ground approach works well:

```python
# Use Scrapy as the core, falling back to requests for special pages
class HybridSpider(scrapy.Spider):
    def parse(self, response):
        if 'captcha' in response.text:
            # Switch to requests when a captcha page is encountered
            import requests
            new_response = requests.get(response.url, headers=...)
            # Handle the special case here...
        else:
            # Normal Scrapy processing
            yield {
                'images': response.css('img::attr(src)').getall(),
                'text': response.css('article::text').get()
            }
```

This combination keeps Scrapy's efficient scheduling while retaining the flexibility to handle special cases.
Important Notes
- Respect the site's robots.txt rules
- Do not overload the server with excessive requests
- Comply with data protection regulations (e.g. GDPR)
- Only scrape publicly available data
- Prefer an official API when one is available
Following these guidelines makes it possible to build efficient, reliable, and compliant Python web scrapers.