Python Programming: Web Crawlers

Python Web Crawlers

Python is a popular language for writing web crawlers because of its rich ecosystem of libraries and frameworks.

Basic Crawler Components

1 Core Libraries

import requests       # Send HTTP requests
from bs4 import BeautifulSoup  # HTML parsing
import csv            # Data storage
import pandas as pd   # Data processing
import time           # Throttle the crawl rate
import random         # Randomize requests

2 Simple Crawler Example

def simple_spider(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Example data extraction
        titles = [h1.text for h1 in soup.find_all('h1')]
        links = [a['href'] for a in soup.find_all('a', href=True)]
        
        return titles, links
    except Exception as e:
        print(f"Crawl failed: {e}")
        return [], []

Advanced Crawling Techniques

1 Handling Dynamic Content

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

def dynamic_spider(url):
    options = Options()
    options.add_argument('--headless')  # Headless mode (the .headless attribute is deprecated in recent Selenium)
    driver = webdriver.Chrome(options=options)
    
    try:
        driver.get(url)
        time.sleep(3)  # Wait for the page to load
        
        # Extract dynamically loaded content with Selenium
        elements = driver.find_elements(By.CSS_SELECTOR, '.dynamic-content')
        data = [el.text for el in elements]
        
        return data
    finally:
        driver.quit()
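
A fixed time.sleep is a blunt instrument; as an alternative sketch, Selenium's explicit waits (WebDriverWait plus expected_conditions) block only until the target elements actually appear. The '.dynamic-content' selector is the same illustrative placeholder used above.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def dynamic_spider_with_wait(url, timeout=10):
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Wait until at least one matching element is present, or time out
        elements = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.dynamic-content'))
        )
        return [el.text for el in elements]
    finally:
        driver.quit()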

2 Handling Pagination

def pagination_spider(base_url, max_pages=10):
    all_data = []
    page = 1
    
    while page <= max_pages:
        url = f"{base_url}?page={page}"
        print(f"Crawling: {url}")
        
        try:
            titles, links = simple_spider(url)  # simple_spider returns two lists
            if not titles and not links:  # Stop when a page yields no data
                break
                
            all_data.extend(zip(titles, links))  # Store (title, link) pairs
            page += 1
            time.sleep(random.uniform(1, 3))  # Random delay
        except Exception as e:
            print(f"Failed to crawl page {page}: {e}")
            break
            
    return all_data

Data Storage

1 Saving to CSV

def save_to_csv(data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'link'])  # Header row
        writer.writerows(data)  # data is an iterable of (title, link) pairs
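
Since pandas is already imported in the core-library list, a DataFrame can serve as a drop-in alternative for tabular output. This is a minimal sketch; the function name is not from the original example.

def save_to_csv_pandas(data, filename):
    # data is an iterable of (title, link) pairs, as produced by pagination_spider
    df = pd.DataFrame(data, columns=['title', 'link'])
    df.to_csv(filename, index=False, encoding='utf-8')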

2 Saving to a Database

import sqlite3

def save_to_db(data, db_name='spider_data.db'):
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    
    # Create the table if it does not exist
    c.execute('''CREATE TABLE IF NOT EXISTS scraped_data
                 (id INTEGER PRIMARY KEY, title TEXT, url TEXT)''')
    
    # Insert the rows
    for item in data:
        c.execute("INSERT INTO scraped_data (title, url) VALUES (?, ?)", item)
    
    conn.commit()
    conn.close()
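
Putting the pieces together, a short end-to-end sketch; the listing URL is a placeholder, not a real target from the original.

if __name__ == '__main__':
    # Hypothetical listing page; replace with a real target
    rows = pagination_spider('http://example.com/articles', max_pages=3)
    save_to_csv(rows, 'results.csv')
    save_to_db(rows)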

Complete Examples

Reference code: https://github.com/Python-World/python-mini-projects

1. Image Downloader

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import os
from urllib.parse import urlparse  # Used to strip query strings from image URLs
from bs4 import BeautifulSoup
import time

# Get user input (hard-coded here; the interactive prompts are kept as comments)
path = "D:/Program Files/chromedriver-win64/chromedriver.exe"  # input("Enter ChromeDriver Path: ").strip()  # e.g. D:\chromedriver\chromedriver.exe
url = "https://image.baidu.com/"  # input("Enter URL: ").strip()  # e.g. https://image.baidu.com/
output_dir = "output"  # Directory where images are saved

def get_driver(path, url):
    try:
        # Configure Chrome options
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # Headless mode
        options.add_argument('--disable-gpu')
        options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        
        # Initialize the WebDriver
        service = Service(executable_path=path)
        driver = webdriver.Chrome(service=service, options=options)
        
        print("Loading page...")
        driver.get(url)
        
        # Wait for the page to finish loading
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )
        
        # Scroll the page to trigger lazy-loaded images
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
        
        return driver
    except Exception as e:
        print(f"Failed to initialize browser: {e}")
        return None

def get_img_links(driver):
    try:
        # Get the page source and parse it
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        
        # Find all img tags
        img_tags = soup.find_all('img')
        
        # Keep only absolute http(s) image links
        img_links = []
        for img in img_tags:
            src = img.get('src', '')
            if src and (src.startswith('http://') or src.startswith('https://')):
                img_links.append(src)
        
        return list(set(img_links))  # Deduplicate
    except Exception as e:
        print(f"Error extracting image links: {e}")
        return []

def download_image(img_url, index):
    try:
        # Derive the file extension from the URL path (ignoring any query string)
        extension = os.path.splitext(urlparse(img_url).path)[1]
        if not extension:  # Fall back when the URL has no extension
            extension = '.jpg'
        
        # Create the output directory
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        # Download the image
        response = requests.get(img_url, stream=True, timeout=10)
        response.raise_for_status()
        
        # Save the image
        filename = f"{output_dir}/image_{index}{extension}"
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
        
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {img_url}: {e}")
        return False

def main():
    # Initialize the browser
    driver = get_driver(path, url)
    if not driver:
        return
    
    try:
        # Collect image links
        print("Extracting image links...")
        img_links = get_img_links(driver)
        print(f"Found {len(img_links)} images")
        
        # Download the images
        success_count = 0
        for index, img_url in enumerate(img_links):
            if download_image(img_url, index):
                success_count += 1
        
        print(f"\nDownload complete! Successfully downloaded {success_count}/{len(img_links)} images")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()

2. Article Scraping

from bs4 import BeautifulSoup
import requests
import csv
from urllib.parse import urljoin

def scrape_quotes(base_url, output_file='quote_list.csv'):
    """
    Scrape quotes from quotes.toscrape.com and save to CSV file.
    
    Args:
        base_url (str): The base URL of the website to scrape
        output_file (str): Path to the output CSV file
    """
    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
            fieldnames = ['quote', 'author', 'tags']
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()

            current_url = base_url
            
            while current_url:
                try:
                    # Fetch the page with timeout and error handling
                    response = requests.get(current_url, timeout=10)
                    response.raise_for_status()
                    bs = BeautifulSoup(response.text, 'html.parser')

                    # Process all quotes on the page
                    for quote in bs.find_all('div', class_='quote'):
                        try:
                            text = quote.find('span', class_='text').get_text(strip=True)
                            author = quote.find('small', class_='author').get_text(strip=True)
                            tags = [tag.get_text(strip=True) 
                                   for tag in quote.find_all('a', class_='tag')]
                            
                            writer.writerow({
                                'quote': text,
                                'author': author,
                                'tags': ', '.join(tags)  # Store tags as comma-separated string
                            })
                        except AttributeError as e:
                            print(f"Skipping malformed quote: {e}")
                            continue

                    # Find next page link
                    next_button = bs.find('li', class_='next')
                    current_url = urljoin(base_url, next_button.a['href']) if next_button else None
                    
                except requests.RequestException as e:
                    print(f"Error fetching {current_url}: {e}")
                    break
                except Exception as e:
                    print(f"Unexpected error processing {current_url}: {e}")
                    break

    except IOError as e:
        print(f"Error writing to file {output_file}: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    else:
        print(f"Successfully scraped quotes to {output_file}")

if __name__ == '__main__':
    BASE_URL = 'http://quotes.toscrape.com'
    scrape_quotes(BASE_URL)

 

Crawling Best Practices

  1. Respect robots.txt: check the target site's robots.txt file

  2. Set a User-Agent: mimic normal browser behavior

  3. Limit request frequency: avoid getting banned (see the throttled-fetch sketch after the robots.txt example below)

  4. Handle exceptions: network problems, page layout changes, etc.

  5. Use proxy IPs: for large-scale crawling

# Example: respecting robots.txt
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def check_robots(url):
    base = "{0.scheme}://{0.netloc}".format(urlparse(url))  # robots.txt lives at the site root
    rp = RobotFileParser()
    rp.set_url(base + "/robots.txt")
    rp.read()
    return rp.can_fetch("*", url)
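
For items 3 and 4 above, a minimal sketch of a throttled fetch with retries, reusing the requests/time/random imports from earlier; the delay range and retry count are arbitrary illustration values.

def polite_get(url, retries=3, delay_range=(1, 3)):
    headers = {'User-Agent': 'Mozilla/5.0'}
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"Attempt {attempt} failed for {url}: {e}")
        time.sleep(random.uniform(*delay_range))  # Back off between attempts
    return None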

Countering Anti-Crawler Measures

  1. Random delays: time.sleep(random.uniform(0.5, 3))

  2. Rotate User-Agent headers

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ...'
]
headers = {'User-Agent': random.choice(user_agents)}

  3. Use a proxy pool

proxies = {
    'http': 'http://proxy1.example.com:8080',
    'https': 'https://proxy2.example.com:8080'
}
response = requests.get(url, proxies=proxies)
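
A minimal sketch combining the three techniques above (random delay, rotating User-Agent, proxies); it assumes the user_agents and proxies variables defined in the preceding snippets.

def stealth_get(url):
    time.sleep(random.uniform(0.5, 3))                     # 1. Random delay
    headers = {'User-Agent': random.choice(user_agents)}   # 2. Rotate the User-Agent
    return requests.get(url, headers=headers,
                        proxies=proxies, timeout=10)       # 3. Route through a proxy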

Crawler Frameworks

1 Scrapy Framework Example

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['http://example.com']
    
    def parse(self, response):
        for item in response.css('div.item'):
            yield {
                'title': item.css('h2::text').get(),
                'link': item.css('a::attr(href)').get()
            }
        
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
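
As a usage note, a standalone spider like this can be run without creating a full Scrapy project, for example: scrapy runspider myspider.py -o items.json (the output filename here is illustrative).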

Scrapy vs. Requests + BeautifulSoup

The Scrapy framework and a Requests + BeautifulSoup crawler differ significantly in architecture, features, and suitable use cases. Below is a comparison focused on the image and article download scenarios shown above:

1. Architectural Differences

Feature            | Scrapy framework                                     | Requests + BeautifulSoup
Architecture       | Full crawler framework (explicit project structure)  | Script-style development (linear execution)
Componentization   | Highly componentized (Spider/Middleware/Pipeline)    | Each feature implemented by hand
Built-in features  | Request scheduling, deduplication, concurrency, etc. | Basic functionality must be built yourself

2. Image and Article Download Comparison

Advantages of the Scrapy implementation
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class ArticleSpider(scrapy.Spider):
    name = 'article_downloader'
    
    custom_settings = {
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 1,
            'myproject.pipelines.TextPipeline': 300  # Hypothetical custom pipeline for article text
        },
        'IMAGES_STORE': './downloads/images'
    }

    def parse(self, response):
        yield {
            'title': response.css('h1::text').get(),
            'content': response.css('article::text').getall(),
            'image_urls': response.css('img.article-image::attr(src)').getall()
        }

Features:

  • Built-in media pipelines: image downloading, storage, and validation handled automatically

  • Asynchronous processing: articles and images downloaded concurrently (16 concurrent requests by default)

  • Resumable crawls: crawl state recorded automatically (see the settings sketch below)

  • Automatic deduplication: the same image/article is not downloaded twice
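
A minimal sketch of the settings behind these points; the values and paths are illustrative, and a resumable crawl is enabled by pointing JOBDIR at a state directory.

# settings.py (or custom_settings) -- illustrative values only
CONCURRENT_REQUESTS = 16          # Default concurrency mentioned above
DOWNLOAD_DELAY = 1                # Be polite between requests
JOBDIR = 'crawls/article_spider'  # Persist state so an interrupted crawl can resume
IMAGES_STORE = './downloads/images'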

Requests + BeautifulSoup implementation
import os
import requests
from bs4 import BeautifulSoup

def download_article(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Download the text content
    content = '\n'.join([p.text for p in soup.select('article p')])
    
    # Download the images
    os.makedirs('./downloads', exist_ok=True)  # Make sure the target directory exists
    for img in soup.select('img.article-image'):
        img_url = img['src']
        img_data = requests.get(img_url).content
        with open(f'./downloads/{os.path.basename(img_url)}', 'wb') as f:
            f.write(img_data)
    
    return content

Features:

  • Synchronous and blocking: resources are downloaded one by one (image downloads block the main thread)

  • Manual management: retries, deduplication, and storage must be handled yourself

  • Highly flexible: parsing logic can be adjusted quickly

3. Core Capability Comparison

Requirement           | Scrapy solution                                       | Requests approach
Concurrent downloads  | Built-in Twisted async engine (high performance)      | Manual threading/async (e.g. aiohttp)
Media file downloads  | Built-in ImagesPipeline/FilesPipeline                 | Download and storage logic written by hand
Request deduplication | Automatic fingerprint-based dedup                     | URL set maintained manually
Crawl rules           | CrawlSpider rules follow links automatically          | Links parsed and followed manually
Data cleaning         | Systematic processing via Item Loaders                | Ad-hoc cleaning code
Anti-crawler measures | Downloader middleware hooks (User-Agent, retries, etc.) | Request headers managed yourself
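
To make the concurrency row concrete, a minimal sketch of manual async downloading with aiohttp (assumed to be installed separately); the 16-task limit mirrors Scrapy's default concurrency.

import asyncio
import aiohttp

async def fetch(session, url):
    async with session.get(url) as resp:
        resp.raise_for_status()
        return await resp.text()

async def fetch_all(urls, limit=16):
    sem = asyncio.Semaphore(limit)  # Cap the number of concurrent requests
    async with aiohttp.ClientSession() as session:
        async def bounded(url):
            async with sem:
                return await fetch(session, url)
        return await asyncio.gather(*(bounded(u) for u in urls))

# Usage: pages = asyncio.run(fetch_all(['http://quotes.toscrape.com/page/1/']))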

4. Which to Choose

Use Scrapy when:

  • You need to download a large number of articles and images (>1000)

  • Reliable resumable crawling is required

  • Periodic incremental crawls are needed

  • The project will be maintained long-term

Use Requests + BS4 when:

  • You need to grab a small number of articles quickly (<100)

  • You need to debug parsing logic flexibly

  • The target site has a simple structure and no anti-crawler measures

  • The scraping need is temporary

5. A Hybrid Approach (Recommended)

For mixed content such as images plus articles, a compromise is possible:

# Use Scrapy as the core, with Requests for special-case pages
class HybridSpider(scrapy.Spider):
    def parse(self, response):
        if 'captcha' in response.text:
            # Fall back to requests when a captcha page is encountered
            import requests
            new_response = requests.get(response.url, headers=...)
            # Handle it here...
        else:
            # Normal Scrapy processing
            yield {
                'images': response.css('img::attr(src)').getall(),
                'text': response.css('article::text').get()
            }

This combination keeps Scrapy's efficient scheduling while retaining the flexibility to handle special cases.

Caveats

  1. Respect the site's robots.txt rules

  2. Do not overload the server with excessive requests

  3. Comply with data protection regulations (such as GDPR)

  4. Only crawl publicly available data

  5. Prefer an official API when one is available (see the sketch below)
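
To illustrate point 5, fetching structured data from an official API is usually simpler and more robust than scraping HTML. A minimal sketch using GitHub's public REST API for the repository referenced earlier:

# Query GitHub's public API instead of scraping the repository page
api_url = 'https://api.github.com/repos/Python-World/python-mini-projects'
resp = requests.get(api_url, timeout=10)
resp.raise_for_status()
repo = resp.json()
print(repo['full_name'], repo['stargazers_count'])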

By following these guidelines, you can build efficient, reliable, and compliant Python web crawlers.
