Project Background
As an environmental engineering practitioner, I regularly consult environmental impact assessment (EIA) reports as study material. These reports are all publicly available, but finding and downloading them one by one by hand is very time-consuming. I recently decided to solve this properly and built an automated collection tool in Python that batch-downloads publicly disclosed EIA announcements.
Technical Implementation
System Architecture
The crawler is organized into the following functional modules (a condensed class outline follows the list):
- Pagination detection module: automatically determines the total number of listing pages
- Link extraction module: collects every announcement link from each listing page
- Content parsing module: extracts key information such as each announcement's title, date, and body text
- Attachment handling module: identifies and downloads attachments of various types
- Local storage module: saves the content as structured Markdown files
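Mapped onto the full script attached at the end of this post, those modules correspond to methods of the EIAAnnouncementCrawler class. The outline below is only a condensed view of that class, with helper methods omitted and the method bodies elided:

class EIAAnnouncementCrawler:
    def get_total_pages(self): ...                  # pagination detection
    def get_page_url(self, page_num): ...           # listing-page URL construction
    def extract_announcement_links(self, page_url): ...  # link extraction
    def extract_announcement_content(self, url): ...     # content parsing (title, date, body, attachments)
    def download_attachment(self, attachment_info, announcement_title, publish_date): ...  # attachment handling
    def save_as_markdown(self, announcement_data): ...   # local storage as Markdown
    def save_statistics(self, announcements): ...        # run statistics as JSON
    def crawl_all_pages(self): ...                  # orchestrates the whole run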
Core Feature Highlights
- Smart pagination handling: a three-tier detection mechanism to reliably determine the total page count (a condensed restatement follows this list)
  - Parse the pagination DOM element for numeric page links
  - Match the "共N页" (total pages) text in the page body
  - Probe for the last page with trial requests
- Enhanced content extraction: several CSS selectors are tried in turn to keep the extraction success rate high
- Smart attachment naming: meaningful attachment file names are generated automatically from the announcement content
- Thorough error handling: every key step is wrapped in exception handling, so individual failures are logged and skipped rather than aborting the whole run
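For reference, here is that pagination fallback in condensed form. It simply restates the logic of get_total_pages() in the attached script (it is not a drop-in replacement) and assumes the same index_N.html paging scheme used by the target site:

import re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def detect_total_pages(list_url, headers):
    soup = BeautifulSoup(requests.get(list_url, headers=headers, timeout=15).text, 'html.parser')
    # 1) Numeric page links inside the pagination element
    pager = soup.find('div', class_='pagination') or soup.find('div', class_='page')
    if pager:
        nums = [int(a.get_text(strip=True)) for a in pager.find_all('a')
                if a.get_text(strip=True).isdigit()]
        if nums:
            return max(nums)
    # 2) A "共N页" (N pages in total) string anywhere in the page text
    match = re.search(r'共(\d+)页', soup.get_text())
    if match:
        return int(match.group(1))
    # 3) Probe index_1.html, index_2.html, ... until a page stops returning HTTP 200
    for i in range(1, 50):
        if requests.get(urljoin(list_url, f"index_{i}.html"),
                        headers=headers, timeout=5).status_code != 200:
            return i
    return 1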
Results
In real-world testing, the system was able to:
- Automatically identify all 22 pages, containing 200+ announcements in total
- Accurately extract the full content of each announcement
- Download PDF, Word, and other attachment types
- Store everything locally, organized into per-project folders
Future Improvements
The current version covers my basic needs, but there is still plenty of room to improve:
- Structured data: extract key fields such as the construction entity and project location and load them into a database (a rough sketch follows this list)
- Content analysis: apply NLP to the report text to pull out key information
- Visualization: generate geographic distribution maps, time-trend charts, and so on
- Automated updates: run the crawler on a schedule to pick up the latest announcements
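As a minimal sketch of the structured-data idea: the dictionaries returned by extract_announcement_content() could be written straight into SQLite. The database file name and table layout below are hypothetical, and fields such as the construction entity or project location would still need additional parsing on top of this:

import sqlite3

def save_to_db(announcements, db_path="eia_announcements.db"):
    """Store the basic fields of each crawled announcement in a SQLite table."""
    conn = sqlite3.connect(db_path)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS announcements ("
        "title TEXT, publish_date TEXT, url TEXT UNIQUE, attachments_count INTEGER)"
    )
    for ann in announcements:
        conn.execute(
            "INSERT OR IGNORE INTO announcements VALUES (?, ?, ?, ?)",
            (ann['title'], ann['publish_date'], ann['url'], len(ann['attachments'])),
        )
    conn.commit()
    conn.close()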
Legal Notice
A few points need to be stated explicitly:
- This tool is intended for personal study and research only
- Keep the crawl rate low to avoid putting load on the server
- The collected data must not be used for any commercial purpose
- Follow the rules in the site's robots.txt (a small snippet for checking this programmatically follows the list)
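For the robots.txt point, Python's standard library already includes a parser you can use to check a URL before crawling it. This snippet is only an illustration and is not part of the attached script; the example.gov.cn addresses are placeholders for whichever site you adapt the crawler to:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://example.gov.cn/robots.txt")   # placeholder robots.txt URL
rp.read()
# Check whether a generic user agent is allowed to fetch a given listing page
print(rp.can_fetch("*", "https://example.gov.cn/hpspgg/index.html"))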
Closing Thoughts
This project gave me a real appreciation for how capable Python is at collecting and processing information. I plan to keep refining the tool, and anyone interested is welcome to get in touch and compare notes.
One final caveat: every city's website is laid out differently, so the script needs site-specific adjustments. My code targets a single city's site and is not a universal solution. To stay on the safe side, I have redacted the URL.
The complete code is attached below:
import os
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import re
from datetime import datetime
import json
class EIAAnnouncementCrawler:
def __init__(self):
self.base_url = " 网址~脱敏 "
self.output_dir = "环评公告"
self.attachments_dir = os.path.join(self.output_dir, "附件")
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}
# 创建目录
os.makedirs(self.output_dir, exist_ok=True)
os.makedirs(self.attachments_dir, exist_ok=True)
# 统计信息
self.stats = {
'total_pages': 0,
'total_announcements': 0,
'successful_downloads': 0,
'failed_downloads': 0,
'attachments_downloaded': 0
}
def get_total_pages(self):
"""第一步:识别公告总共有多少页"""
try:
print("正在获取总页数...")
response = requests.get(self.base_url, headers=self.headers, timeout=15)
response.raise_for_status()
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
# 方法1:查找分页信息
pagination = soup.find('div', class_='pagination') or soup.find('div', class_='page')
if pagination:
page_links = pagination.find_all('a')
if page_links:
# 查找最后一个数字页码
page_numbers = []
for link in page_links:
text = link.get_text(strip=True)
if text.isdigit():
page_numbers.append(int(text))
if page_numbers:
total_pages = max(page_numbers)
print(f"通过分页信息找到总页数: {total_pages}")
return total_pages
# 方法2:查找页面中的总页数文本
page_text = soup.find(text=re.compile(r'共.*页'))
if page_text:
match = re.search(r'共(\d+)页', page_text)
if match:
total_pages = int(match.group(1))
print(f"通过文本匹配找到总页数: {total_pages}")
return total_pages
# 方法3:尝试访问第二页,看是否存在
test_url = urljoin(self.base_url, "index_1.html")
response = requests.get(test_url, headers=self.headers, timeout=10)
if response.status_code == 200:
# 如果第二页存在,继续尝试更多页
for i in range(2, 50): # 最多尝试50页
test_url = urljoin(self.base_url, f"index_{i}.html")
response = requests.get(test_url, headers=self.headers, timeout=5)
if response.status_code != 200:
total_pages = i
print(f"通过测试找到总页数: {total_pages}")
return total_pages
# 默认返回1页
print("无法确定总页数,默认返回1页")
return 1
except Exception as e:
print(f"获取总页数失败: {e}")
return 1
def get_page_url(self, page_num):
"""生成指定页面的URL"""
if page_num == 1:
return self.base_url
else:
return urljoin(self.base_url, f"index_{page_num-1}.html")
def extract_announcement_links(self, page_url):
"""从页面中提取所有公告链接"""
try:
response = requests.get(page_url, headers=self.headers, timeout=15)
response.raise_for_status()
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
links = []
# 查找公告列表
# 尝试多种可能的选择器
selectors = [
'ul.list-unstyled li a[href]',
'.news_list a[href]',
'.list a[href]',
'a[href*="html"]',
'a[href*="hpspgg"]'
]
for selector in selectors:
elements = soup.select(selector)
for element in elements:
href = element.get('href', '').strip()
if href and ('html' in href or 'hpspgg' in href):
# 构建完整URL
if href.startswith('http'):
full_url = href
else:
full_url = urljoin(page_url, href)
# 确保是公告链接
if 'hpspgg' in full_url and full_url not in links:
links.append(full_url)
if links:
break
return links
except Exception as e:
print(f"提取公告链接失败 {page_url}: {e}")
return []
def extract_announcement_content(self, url):
"""提取单个公告的内容"""
try:
response = requests.get(url, headers=self.headers, timeout=15)
response.raise_for_status()
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
# 提取标题
title = ""
title_selectors = ['h1', '.title', '.article-title', 'h2']
for selector in title_selectors:
title_elem = soup.select_one(selector)
if title_elem:
title = title_elem.get_text(strip=True)
break
if not title:
title = "无标题"
# 提取发布日期
publish_date = ""
date_selectors = ['.date', '.time', '.publish-date', '.article-date']
for selector in date_selectors:
date_elem = soup.select_one(selector)
if date_elem:
date_text = date_elem.get_text(strip=True)
# 提取日期格式
date_match = re.search(r'(\d{4}-\d{2}-\d{2}|\d{4}/\d{2}/\d{2}|\d{4}\.\d{2}\.\d{2})', date_text)
if date_match:
publish_date = date_match.group(1)
break
if not publish_date:
publish_date = datetime.now().strftime('%Y-%m-%d')
# 提取正文内容
content = ""
content_selectors = ['.content', '.article-content', '.main-content', '.text']
for selector in content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
content = content_elem.get_text(strip=True, separator='\n')
break
if not content:
# 如果没有找到特定内容区域,尝试获取整个body
body = soup.find('body')
if body:
content = body.get_text(strip=True, separator='\n')
# 提取附件 - 优化版本
attachments = []
attachment_links = []
# 查找所有可能的附件链接
for a_tag in soup.find_all('a', href=True):
href = a_tag.get('href', '')
text = a_tag.get_text(strip=True)
# 检查是否是附件
if (href.endswith(('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar')) or
'下载' in text or '附件' in text or '文件' in text):
attachment_url = urljoin(url, href)
attachment_links.append({
'url': attachment_url,
'text': text,
'href': href
})
# 智能处理附件信息
for i, link_info in enumerate(attachment_links, 1):
attachment_name = self.extract_attachment_name(link_info, title, i)
attachments.append({
'name': attachment_name,
'url': link_info['url'],
'filename': os.path.basename(link_info['href']),
'index': i
})
return {
'title': title,
'publish_date': publish_date,
'content': content,
'attachments': attachments,
'url': url
}
except Exception as e:
print(f"提取公告内容失败 {url}: {e}")
return None
def extract_attachment_name(self, link_info, announcement_title, index):
"""智能提取附件名称"""
text = link_info['text']
href = link_info['href']
original_filename = os.path.basename(href)
# 方法1:从链接文本中提取有意义的名称
if text and text != original_filename and len(text) > 2:
# 清理文本,移除常见的无用词汇
            clean_text = re.sub(r'下载|附件|文件|点击|查看', '', text).strip()
if clean_text and len(clean_text) > 2:
return clean_text
# 方法2:从原始文件名中提取(去除扩展名)
if original_filename:
name_without_ext = os.path.splitext(original_filename)[0]
if name_without_ext and len(name_without_ext) > 2:
return name_without_ext
# 方法3:从公告标题中提取关键词作为附件名称
keywords = self.extract_keywords_from_title(announcement_title)
if keywords:
return f"{keywords}_附件{index}"
# 方法4:默认命名
return f"附件{index}"
def extract_keywords_from_title(self, title):
"""从公告标题中提取关键词"""
# 移除常见的无意义词汇
stop_words = ['关于', '的', '项目', '环评', '审批', '决定', '公告', '公示', '信息', '通知']
# 提取可能的关键词(公司名、项目名等)
keywords = []
# 查找公司名称模式
company_patterns = [
r'([^有限公司]+有限公司)',
r'([^公司]+公司)',
r'([^集团]+集团)',
r'([^科技]+科技)',
r'([^发展]+发展)'
]
for pattern in company_patterns:
matches = re.findall(pattern, title)
if matches:
keywords.extend(matches)
# 查找项目名称模式
project_patterns = [
r'([^项目]+项目)',
r'([^工程]+工程)',
r'([^建设]+建设)',
r'([^制造]+制造)',
r'([^加工]+加工)'
]
for pattern in project_patterns:
matches = re.findall(pattern, title)
if matches:
keywords.extend(matches)
# 清理和过滤关键词
clean_keywords = []
for keyword in keywords:
# 移除停用词
clean_keyword = keyword
for stop_word in stop_words:
clean_keyword = clean_keyword.replace(stop_word, '')
if clean_keyword and len(clean_keyword) > 1:
clean_keywords.append(clean_keyword)
# 返回最长的关键词(通常最有意义)
if clean_keywords:
return max(clean_keywords, key=len)
return None
def generate_attachment_filename(self, attachment_info, announcement_title, publish_date):
"""生成有意义的附件文件名"""
attachment_name = attachment_info['name']
original_filename = attachment_info['filename']
index = attachment_info['index']
# 获取文件扩展名
file_ext = os.path.splitext(original_filename)[1].lower()
# 从公告标题中提取简短标识
title_identifier = self.get_title_identifier(announcement_title)
# 生成文件名格式:日期_标题标识_附件名称_序号.扩展名
safe_attachment_name = re.sub(r'[<>:"/\\|?*]', '_', attachment_name)
safe_title_identifier = re.sub(r'[<>:"/\\|?*]', '_', title_identifier)
# 如果附件名称太长,截取前20个字符
if len(safe_attachment_name) > 20:
safe_attachment_name = safe_attachment_name[:20]
filename = f"{publish_date}_{safe_title_identifier}_{safe_attachment_name}_{index:02d}{file_ext}"
return filename
def get_title_identifier(self, title):
"""从标题中提取简短标识"""
# 提取公司名称或项目名称的前几个字符
if '有限公司' in title:
# 提取公司名称
company_match = re.search(r'([^有限公司]+)有限公司', title)
if company_match:
company_name = company_match.group(1)
if len(company_name) <= 10:
return company_name
else:
return company_name[:10]
# 如果没有公司名称,提取前几个字符
if len(title) <= 15:
return title
else:
return title[:15]
def download_attachment(self, attachment_info, announcement_title, publish_date):
"""下载附件并保存到本地"""
try:
url = attachment_info['url']
# 生成有意义的文件名
filename = self.generate_attachment_filename(attachment_info, announcement_title, publish_date)
# 创建以公告标题命名的子文件夹
safe_title = re.sub(r'[<>:"/\\|?*]', '_', announcement_title)
attachment_folder = os.path.join(self.attachments_dir, safe_title)
os.makedirs(attachment_folder, exist_ok=True)
filepath = os.path.join(attachment_folder, filename)
# 如果文件已存在,跳过下载
if os.path.exists(filepath):
print(f"附件已存在,跳过: {filename}")
return True
response = requests.get(url, headers=self.headers, timeout=10, stream=True)
if response.status_code == 200:
with open(filepath, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"已下载附件: {filename}")
self.stats['attachments_downloaded'] += 1
return True
else:
print(f"附件下载失败: {url} (状态码: {response.status_code})")
return False
except Exception as e:
print(f"下载附件出错: {url} | 错误: {e}")
return False
def save_as_markdown(self, announcement_data):
"""保存公告内容为markdown格式"""
try:
title = announcement_data['title']
publish_date = announcement_data['publish_date']
content = announcement_data['content']
url = announcement_data['url']
attachments = announcement_data['attachments']
# 清理文件名
safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)
filename = f"{publish_date}_{safe_title}.md"
filepath = os.path.join(self.output_dir, filename)
# 如果文件已存在,跳过
if os.path.exists(filepath):
print(f"文件已存在,跳过: {filename}")
return True
with open(filepath, 'w', encoding='utf-8') as f:
# 写入markdown格式
f.write(f"# {title}\n\n")
f.write(f"**发布日期:** {publish_date}\n\n")
f.write(f"**原文链接:** [{url}]({url})\n\n")
f.write("---\n\n")
f.write("## 公告内容\n\n")
f.write(content)
f.write("\n\n")
if attachments:
f.write("## 附件\n\n")
for i, attachment in enumerate(attachments, 1):
f.write(f"{i}. [{attachment['name']}]({attachment['url']})\n")
f.write("\n")
print(f"已保存为markdown: {filename}")
return True
except Exception as e:
print(f"保存markdown失败: {e}")
return False
def crawl_all_pages(self):
"""爬取所有页面的公告"""
print("开始爬取环评公告...")
print(f"基础URL: {self.base_url}")
print(f"输出目录: {self.output_dir}")
print(f"附件目录: {self.attachments_dir}")
print("-" * 50)
# 第一步:获取总页数
total_pages = self.get_total_pages()
self.stats['total_pages'] = total_pages
print(f"总页数: {total_pages}")
print("-" * 50)
all_announcements = []
# 第二步和第三步:从第一页开始依次处理
for page_num in range(1, total_pages + 1):
print(f"\n正在处理第 {page_num}/{total_pages} 页...")
# 获取当前页面的URL
page_url = self.get_page_url(page_num)
print(f"页面URL: {page_url}")
# 提取当前页面的所有公告链接
announcement_links = self.extract_announcement_links(page_url)
print(f"本页找到 {len(announcement_links)} 个公告链接")
# 处理每个公告
for i, link in enumerate(announcement_links, 1):
print(f"\n 处理公告 {i}/{len(announcement_links)}: {link}")
# 提取公告内容
announcement_data = self.extract_announcement_content(link)
if not announcement_data:
print(f" 提取内容失败,跳过")
self.stats['failed_downloads'] += 1
continue
# 保存为markdown格式
if self.save_as_markdown(announcement_data):
self.stats['successful_downloads'] += 1
all_announcements.append(announcement_data)
else:
self.stats['failed_downloads'] += 1
# 下载附件
if announcement_data['attachments']:
print(f" 发现 {len(announcement_data['attachments'])} 个附件")
for attachment in announcement_data['attachments']:
self.download_attachment(attachment, announcement_data['title'], announcement_data['publish_date'])
# 礼貌爬取,添加延迟
time.sleep(2)
# 页面间延迟
if page_num < total_pages:
time.sleep(3)
# 保存统计信息
self.save_statistics(all_announcements)
print("\n" + "=" * 50)
print("爬取完成!")
print(f"总页数: {self.stats['total_pages']}")
print(f"成功下载: {self.stats['successful_downloads']}")
print(f"失败下载: {self.stats['failed_downloads']}")
print(f"附件下载: {self.stats['attachments_downloaded']}")
print("=" * 50)
def save_statistics(self, announcements):
"""保存统计信息"""
stats_file = os.path.join(self.output_dir, "爬取统计.json")
stats_data = {
'crawl_time': datetime.now().isoformat(),
'statistics': self.stats,
'announcements': [
{
'title': ann['title'],
'publish_date': ann['publish_date'],
'url': ann['url'],
'attachments_count': len(ann['attachments'])
}
for ann in announcements
]
}
with open(stats_file, 'w', encoding='utf-8') as f:
json.dump(stats_data, f, ensure_ascii=False, indent=2)
print(f"统计信息已保存到: {stats_file}")
def main():
crawler = EIAAnnouncementCrawler()
crawler.crawl_all_pages()
if __name__ == "__main__":
main()
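A quick note on running it: save the script under any name you like (eia_crawler.py here is just an example), fill in base_url with the announcement listing page of your target site, install the two third-party dependencies, and launch it from the command line:

pip install requests beautifulsoup4
python eia_crawler.py

The Markdown files are written to the 环评公告 directory in the working directory, attachments go into its 附件 subfolder, and a 爬取统计.json summary is saved at the end of the run.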