# Spider for crawling paginated call-for-bid announcements (Tianjin).
import scrapy
from datetime import datetime
from ..items import CallBidItem
import re
from scrapy.http import Request
import html2text as ht # pip install html2text
import requests
import time
import pandas
import os
import sys
# Make the Django project importable so its ORM model (CallBid) can be reused here.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from BuildingPlat.models import CallBid
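# Dedup baseline: the two most recent announcement dates already stored for
# Tianjin. parse() skips rows whose date matches or predates this baseline,
# assuming they were picked up on an earlier run.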
startaffich_mysql = CallBid.objects.filter(province="天津").order_by("-startaffich").values_list("startaffich").distinct()[0:2]
df = pandas.DataFrame(startaffich_mysql, columns=['startaffich'])
g_province = '天津'  # province label stored with every scraped item
class callBidSpider(scrapy.spiders.Spider):
name = "callbidVtj"
    allowed_domains = ["tjconstruct.cn"]  # Scrapy expects `allowed_domains`, not `allow_domains`
start_urls = ['http://www.tjconstruct.cn/Zbgg/Index/1?type=sgzb']
# custom_settings ={
# 'ITEM_PIPELINES':{'BuildingSpider.pipelines.callbidVtjPipeline':300}
# }
def parse(self, response):
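        """Parse one listing page: extract each announcement row, dedup
        against the dates preloaded from MySQL, persist new rows through
        the Django ORM, then follow the "下页" (next page) link."""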
        bids = response.css('table .t1 tr')  # CSS selector for the listing rows
        # bids = response.xpath('//table[@class="t1"]/tr')  # equivalent XPath selector
for bid in bids[1:]:
item = CallBidItem()
            try:
                # Title, detail-page URL, then the three remaining columns:
                # tenderee, document number, and publish date.
                nname = re.search(r'_blank\">(.*?)</a>', bid.extract()).group(1)
                npurl = bid.re_first(r'href=\"(.*?)\"')
                kw = re.search(r'a>.*?;\">\s+(.*?)\s+<.*?;\">\s+(.*?)\s+<.*?;\">\s+(.*?)\s+<', bid.extract(), re.S)
                ntenderee = kw.group(1)
                ndocnmb = kw.group(2)
                ndate = datetime.strptime(kw.group(3), "%Y/%m/%d").strftime("%Y-%m-%d")
                bloomnmb = nname + ndocnmb  # dedup key: title + document number
                # Markdown conversion of the detail page (currently disabled:
                # only the raw page is fetched). Note that requests.get blocks
                # Scrapy's reactor; see the parse_detail sketch below for a
                # non-blocking alternative.
                text_maker = ht.HTML2Text()
                text_maker.bypass_tables = False
                htmlfile = requests.get(response.urljoin(npurl))  # npurl may be site-relative
                htmlfile.encoding = 'gbk'
                htmlpage = htmlfile.text
                # text = text_maker.handle(htmlpage)
                # md = text.split('#')  # split post content
                item['name'] = nname
                item['province'] = g_province
                item['dom'] = self.allowed_domains[0]
item['purl'] = npurl
item['docnmb'] = ndocnmb
item['startaffich'] = ndate
item['endaffich'] = None
item['startRegistration'] = None
item['endRegistration'] = None
item['type'] = '--'
item['tenderee'] = ntenderee
item['tenderer'] = '--'
item['district'] = '--'
item['bloomnb'] = bloomnmb
# item['md'] = ''.join(md)
item['md'] = None
item['content'] = '--'
# item['content'] = htmlpage
item['crawltime'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            except Exception:
                # Row did not match the expected layout; skip it.
                continue
            # Dedup against the baseline dates loaded at module level.
            item_date = datetime.strptime(item['startaffich'], "%Y-%m-%d").date()
            if df['startaffich'].empty:
                # No baseline yet: every row is new.
                CallBid.objects.create(**item)
            elif item_date in df['startaffich'].unique() \
                    or time.mktime(item_date.timetuple()) < time.mktime(df['startaffich'][0].timetuple()):
                print('Entry already crawled, skipping')
            else:
                try:
                    CallBid.objects.create(**item)
                except Exception:
                    # Likely a duplicate key or validation error; ignore.
                    pass
        # Follow the "下页" (next page) link, if any.
        link = response.css(u'div[data-ajax="true"] a:contains("下页")::attr(href)').extract_first()
        if link:
            print('callbidtj next page: ' + link)
            next_page = response.urljoin(link)
            yield Request(next_page, callback=self.parse)  # generator callback into the next page
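
    # A non-blocking alternative to the requests.get call in parse(), shown
    # as a sketch and not wired in: yield a second Request for the detail
    # page and convert it to Markdown in a dedicated callback. The method
    # name `parse_detail` and the meta hand-off are assumptions, not part
    # of the original spider.
    def parse_detail(self, response):
        item = response.meta['item']  # item passed along by the listing callback
        text_maker = ht.HTML2Text()
        text_maker.bypass_tables = False
        item['md'] = text_maker.handle(response.text)  # HTML -> Markdown
        yield item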