How to Scrape Web Data
Web scraping is a common technical task. Below is a walkthrough of web scraping methods, from the basics to more advanced techniques.
1. Basic Web Scraping Methods
1.1 Using the Python requests library
import requests

url = "https://example.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    html_content = response.text
    print(html_content[:500])  # Print the first 500 characters
else:
    print(f"Request failed, status code: {response.status_code}")
1.2 Parsing HTML with BeautifulSoup
from bs4 import BeautifulSoup

# Assume html_content is the page content fetched above
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the title
title = soup.title.string if soup.title else "No title"
print(f"Page title: {title}")

# Extract all links
for link in soup.find_all('a', href=True):
    print(link['href'])
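Besides find_all, BeautifulSoup also supports CSS selectors via select() and select_one(), which are often more concise. A small sketch; the selectors below are placeholders, not selectors from any real page:

# Select elements with CSS selectors (placeholder selectors)
for heading in soup.select('h2.article-title'):
    print(heading.get_text(strip=True))

# Attributes are read the same way as with find_all
first_nav_link = soup.select_one('nav a')
if first_nav_link:
    print(first_nav_link.get('href'))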
2. Handling Dynamically Loaded Content
2.1 Using Selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

chrome_options = Options()
chrome_options.add_argument("--headless")  # Headless mode

driver = webdriver.Chrome(options=chrome_options)
driver.get("https://example.com")

# Wait for JavaScript to load
time.sleep(2)

# Get the rendered page source
html_content = driver.page_source

# Continue parsing with BeautifulSoup if needed
soup = BeautifulSoup(html_content, 'html.parser')
# ...parsing logic...

driver.quit()  # Close the browser
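A fixed time.sleep() either wastes time or races the page. Selenium's explicit waits are usually more reliable; a minimal sketch, assuming the page exposes an element with id "content" (the same assumption as the Pyppeteer example in the next subsection):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block until the target element appears, or raise TimeoutException after 10 seconds
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "content"))
)
html_content = driver.page_source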
2.2 Using Pyppeteer (headless Chrome)
import asyncio
from pyppeteer import launch

async def scrape():
    browser = await launch(headless=True)
    page = await browser.newPage()
    await page.goto('https://example.com')
    # Wait for a specific element to appear
    await page.waitForSelector('#content')
    content = await page.content()
    # ...process content...
    await browser.close()
    return content

html_content = asyncio.get_event_loop().run_until_complete(scrape())
3. Dealing with Common Anti-Scraping Mechanisms
3.1 Using proxy IPs
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}

response = requests.get(url, headers=headers, proxies=proxies)
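With a pool of proxies, rotating them per request spreads the load and lowers the chance of a single IP being blocked. A minimal sketch, assuming PROXY_POOL holds working proxy addresses (the ones below are placeholders):

import random
import requests

# Placeholder proxy addresses; replace with your own pool
PROXY_POOL = [
    'http://10.10.1.10:3128',
    'http://10.10.1.11:3128',
    'http://10.10.1.12:3128',
]

def fetch_with_random_proxy(url, headers):
    # Pick a proxy at random for each request
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)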
3.2 Randomizing the User-Agent
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'User-Agent': ua.random}
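If fake_useragent is unavailable or fails to load its data, a hard-coded fallback list works without any extra dependency. The strings below are only examples of common User-Agent formats:

import random

# A few example User-Agent strings to rotate through (illustrative values)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
]

headers = {'User-Agent': random.choice(USER_AGENTS)}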
3.3 Handling CAPTCHAs
# Use a third-party CAPTCHA-recognition service
def solve_captcha(image_url):
    # APIs from services such as 2captcha or 打码兔 can be called here
    pass
4. Using the Scrapy Framework
4.1 Creating a Scrapy project
scrapy startproject myproject
cd myproject
scrapy genspider example example.com
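Once the spider is written (next subsection), it can be run from the project directory and its output exported to a file; results.json here is just an example output name:

scrapy crawl example -o results.json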
4.2 Writing the spider
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]

    def parse(self, response):
        # Extract data
        title = response.css('title::text').get()
        links = response.css('a::attr(href)').getall()
        yield {
            'title': title,
            'links': links
        }
        # Follow links
        for link in links:
            yield response.follow(link, callback=self.parse)
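Scrapy's settings control how politely the spider crawls. A few commonly adjusted options in settings.py; the values below are illustrative, not recommendations for any particular site:

# settings.py (illustrative values)
ROBOTSTXT_OBEY = True               # Respect robots.txt
DOWNLOAD_DELAY = 1                  # Seconds to wait between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # Limit parallel requests per domain
AUTOTHROTTLE_ENABLED = True         # Adapt the delay to server response times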
5. Storing the Data
5.1 Saving to CSV
import csv

with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'URL'])
    writer.writerow([title, url])
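When scraping produces a list of dictionaries (as in the complete example in section 8), csv.DictWriter writes them all in one pass. A small sketch, assuming data is a list like [{'title': ..., 'url': ...}, ...]:

import csv

# 'data' is assumed to be a list of dicts with 'title' and 'url' keys
with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'url'])
    writer.writeheader()
    writer.writerows(data)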
5.2 Saving to a database
import sqlite3

conn = sqlite3.connect('scraped_data.db')
cursor = conn.cursor()

# Create the table
cursor.execute('''CREATE TABLE IF NOT EXISTS pages
                  (id INTEGER PRIMARY KEY AUTOINCREMENT,
                   title TEXT, url TEXT, content TEXT)''')

# Insert data
cursor.execute("INSERT INTO pages (title, url, content) VALUES (?, ?, ?)",
               (title, url, content))

conn.commit()
conn.close()
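For many records at once, executemany() batches the inserts in a single call. A small sketch, assuming rows is a list of (title, url, content) tuples collected during the crawl:

import sqlite3

# 'rows' is assumed to be a list of (title, url, content) tuples
conn = sqlite3.connect('scraped_data.db')
conn.executemany(
    "INSERT INTO pages (title, url, content) VALUES (?, ?, ?)",
    rows
)
conn.commit()
conn.close()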
6. Advanced Techniques
6.1 Distributed crawling
Use Scrapy-Redis to build a distributed crawler:
# settings.py
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = 'redis://localhost:6379'

# spider.py
from scrapy_redis.spiders import RedisSpider

class MySpider(RedisSpider):
    name = 'myspider'
    redis_key = 'myspider:start_urls'
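With RedisSpider, start URLs are read from the Redis list named by redis_key, so each crawl is seeded by pushing URLs into Redis, for example:

redis-cli lpush myspider:start_urls https://example.com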
6.2 Incremental crawling
# Record crawled URLs in a database
def is_url_crawled(url):
    # Check the database
    return False  # Assume not yet crawled

if not is_url_crawled(url):
    # Crawling logic
    pass
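A minimal concrete version of this idea, reusing the SQLite database from section 5.2 to remember which URLs have already been fetched (the table and column names here are illustrative):

import sqlite3

conn = sqlite3.connect('scraped_data.db')
conn.execute('CREATE TABLE IF NOT EXISTS crawled_urls (url TEXT PRIMARY KEY)')

def is_url_crawled(url):
    # True if the URL has already been recorded
    row = conn.execute('SELECT 1 FROM crawled_urls WHERE url = ?', (url,)).fetchone()
    return row is not None

def mark_url_crawled(url):
    # INSERT OR IGNORE keeps repeated calls idempotent
    conn.execute('INSERT OR IGNORE INTO crawled_urls (url) VALUES (?)', (url,))
    conn.commit()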
7. Legal and Ethical Considerations
- Check robots.txt: review https://example.com/robots.txt (substitute the target site) before crawling.
- Limit the request rate: use time.sleep() to throttle the crawl speed (see the sketch after this list).
- Respect copyright: do not scrape copyright-protected content.
- Do not scrape personal or private data: comply with data-protection regulations.
- Consider using an API: prefer the official API when the site provides one.
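Python's standard library can check robots.txt directly. A minimal sketch combining that check with a polite delay; the crawler name, page URL, and 2-second delay are arbitrary examples:

import time
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

url = "https://example.com/some-page"  # placeholder page
if rp.can_fetch("MyCrawler/1.0", url):
    # ...fetch the page here...
    time.sleep(2)  # Be polite between requests
else:
    print(f"Disallowed by robots.txt: {url}")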
8. Complete Example
import requests
from bs4 import BeautifulSoup
import time
import random
from fake_useragent import UserAgent

ua = UserAgent()

def scrape_website(base_url, max_pages=10):
    visited = set()
    to_visit = {base_url}
    data = []

    while to_visit and len(data) < max_pages:
        url = to_visit.pop()
        if url in visited:
            continue
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Extract data
                title = soup.title.string if soup.title else "No title"
                data.append({'url': url, 'title': title})
                # Collect new links
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http') and base_url in href:
                        to_visit.add(href)
            visited.add(url)
            print(f"Scraped: {url}")
            # Random delay
            time.sleep(random.uniform(1, 3))
        except Exception as e:
            print(f"Failed to scrape {url}: {e}")

    return data

# Usage example
results = scrape_website("https://example.com", max_pages=5)
for item in results:
    print(f"{item['title']} - {item['url']}")
The above covers the basic methods and techniques of web scraping; in practice, you will need to adapt them to the specifics of the target site.