Paginated scraping of Guangzhou municipal government news pages with Scrapy
The core Scrapy code, starting with the spider:
import scrapy

from ..items import TencentItem  # assumes the standard Scrapy project layout


class TencentSpider(scrapy.Spider):
    name = 'tencent'

    # Alternatively, start_requests() can yield several start pages at once:
    # def start_requests(self):
    #     urls = [
    #         'http://www.gz.gov.cn/gzgov/snzc/common_list.shtml'
    #     ]
    #     for url in urls:
    #         yield scrapy.Request(url=url, callback=self.parse)

    # Numbered pages follow the pattern common_list_<n>.shtml; the first page
    # has no number, so it is listed separately in start_urls.
    baseURL = 'http://www.gz.gov.cn/gzgov/snzc/common_list_'
    offset = 1
    end = '.shtml'
    start_urls = ['http://www.gz.gov.cn/gzgov/snzc/common_list.shtml']

    def parse(self, response):
        # Each news entry is an <li> under <ul class="news_list">, containing
        # a titled link (<a>) and a publish date (<span>).
        node_list = response.xpath('//ul[@class="news_list"]/li')
        for node in node_list:
            item = TencentItem()
            # extract_first() returns None instead of raising IndexError
            # when an entry is missing one of the expected child elements.
            item['title'] = node.xpath('./a/text()').extract_first()
            item['time'] = node.xpath('./span/text()').extract_first()
            item['link'] = node.xpath('./a/@href').extract_first()
            yield item

        # Follow the numbered pages common_list_2.shtml ... common_list_16.shtml.
        if self.offset < 16:
            self.offset += 1
            url = self.baseURL + str(self.offset) + self.end
            yield scrapy.Request(url, callback=self.parse)
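Once the spider is saved, the crawl is started from the project root with Scrapy's command-line tool; the argument matches the spider's name attribute:

scrapy crawl tencent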
The item definition (items.py):
import scrapy


class TencentItem(scrapy.Item):
    # One Field per piece of data pulled from each news entry.
    title = scrapy.Field()
    time = scrapy.Field()
    link = scrapy.Field()
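A scrapy.Item behaves like a dictionary, which is why the pipeline below can call dict(item) before serializing. A minimal sketch of that behaviour, assuming it runs inside the project where TencentItem is importable (the values are placeholders, not real scraped data):

item = TencentItem()
item['title'] = 'example title'
item['link'] = '/gzgov/snzc/example.shtml'
# Only fields that have been set appear in the dict:
print(dict(item))  # {'title': 'example title', 'link': '/gzgov/snzc/example.shtml'}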
The pipeline (pipelines.py):
import json


class TencentPipeline(object):
    # Open the output file once, when the pipeline is instantiated.
    def __init__(self):
        self.f = open("detail.json", 'w', encoding='utf-8')

    # Serialize each item to a JSON string; ensure_ascii=False keeps the
    # Chinese text readable instead of escaping it to \uXXXX sequences.
    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    # Close the output file after the spider finishes.
    def close_spider(self, spider):
        self.f.close()
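The pipeline only runs if it is enabled in settings.py. A minimal sketch, assuming the project package is named myproject (substitute your own project name):

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.TencentPipeline': 300,  # the number orders pipelines (0-1000)
}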
Git link to the full code