# Spider for crawling paginated call-for-bid announcements (Tianjin).
import scrapy
from datetime import datetime
from ..items import CallBidItem
import re
from scrapy.http import Request
import html2text as ht # pip install html2text
import requests
import time
import pandas
import os
import sys
# Make the Django project importable so its ORM model (CallBid) can be reused here.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from BuildingPlat.models import CallBid
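# Dedup baseline: the two most recent announcement dates already stored for
# Tianjin. parse() skips rows whose date matches or predates this baseline,
# assuming they were picked up on an earlier run.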
startaffich_mysql = CallBid.objects.filter(province="天津").order_by("-startaffich").values_list("startaffich").distinct()[0:2]
df = pandas.DataFrame(startaffich_mysql, columns=['startaffich'])
g_province = '天津'  # province label stored with every scraped item
class callBidSpider(scrapy.spiders.Spider):
name = "callbidVtj"
    allowed_domains = ["tjconstruct.cn"]  # Scrapy expects `allowed_domains`, not `allow_domains`
start_urls = ['http://www.tjconstruct.cn/Zbgg/Index/1?type=sgzb']
# custom_settings ={
# 'ITEM_PIPELINES':{'BuildingSpider.pipelines.callbidVtjPipeline':300}
# }
def parse(self, response):
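        """Parse one listing page: extract each announcement row, dedup
        against the dates preloaded from MySQL, persist new rows through
        the Django ORM, then follow the "下页" (next page) link."""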
        bids = response.css('table .t1 tr')  # CSS selector for the listing rows
        # bids = response.xpath('//table[@class="t1"]/tr')  # equivalent XPath selector
for bid in bids[1:]:
item = CallBidItem()
            try:
                # Title, detail-page URL, then the three remaining columns:
                # tenderee, document number, and publish date.
                nname = re.search(r'_blank\">(.*?)</a>', bid.extract()).group(1)
                npurl = bid.re_first(r'href=\"(.*?)\"')
                kw = re.search(r'a>.*?;\">\s+(.*?)\s+<.*?;\">\s+(.*?)\s+<.*?;\">\s+(.*?)\s+<', bid.extract(), re.S)
                ntenderee = kw.group(1)
                ndocnmb = kw.group(2)
                ndate = datetime.strptime(kw.group(3), "%Y/%m/%d").strftime("%Y-%m-%d")
                bloomnmb = nname + ndocnmb  # dedup key: title + document number
                # Markdown conversion of the detail page (currently disabled:
                # only the raw page is fetched). Note that requests.get blocks
                # Scrapy's reactor; see the parse_detail sketch below for a
                # non-blocking alternative.
                text_maker = ht.HTML2Text()
                text_maker.bypass_tables = False
                htmlfile = requests.get(response.urljoin(npurl))  # npurl may be site-relative
                htmlfile.encoding = 'gbk'
                htmlpage = htmlfile.text
                # text = text_maker.handle(htmlpage)
                # md = text.split('#')  # split post content
                item['name'] = nname
                item['province'] = g_province
                item['dom'] = self.allowed_domains[0]
item['purl'] = npurl
item['docnmb'] = ndocnmb
item['startaffich'] = ndate
item['endaffich'] = None
item['startRegistration'] = None
item['endRegistration'] = None
item['type'] = '--'
item['tenderee'] = ntenderee
item['tenderer'] = '--'
item['district'] = '--'
item['bloomnb'] = bloomnmb
# item['md'] = ''.join(md)
item['md'] = None
item['content'] = '--'
# item['content'] = htmlpage
item['crawltime'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            except Exception:
                # Row did not match the expected layout; skip it.
                continue
            # Dedup against the baseline dates loaded at module level.
            item_date = datetime.strptime(item['startaffich'], "%Y-%m-%d").date()
            if df['startaffich'].empty:
                # No baseline yet: every row is new.
                CallBid.objects.create(**item)
            elif item_date in df['startaffich'].unique() \
                    or time.mktime(item_date.timetuple()) < time.mktime(df['startaffich'][0].timetuple()):
                print('Entry already crawled, skipping')
            else:
                try:
                    CallBid.objects.create(**item)
                except Exception:
                    # Likely a duplicate key or validation error; ignore.
                    pass
        # Follow the "下页" (next page) link, if any.
        link = response.css(u'div[data-ajax="true"] a:contains("下页")::attr(href)').extract_first()
        if link:
            print('callbidtj next page: ' + link)
            next_page = response.urljoin(link)
            yield Request(next_page, callback=self.parse)  # generator callback into the next page
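
    # A non-blocking alternative to the requests.get call in parse(), shown
    # as a sketch and not wired in: yield a second Request for the detail
    # page and convert it to Markdown in a dedicated callback. The method
    # name `parse_detail` and the meta hand-off are assumptions, not part
    # of the original spider.
    def parse_detail(self, response):
        item = response.meta['item']  # item passed along by the listing callback
        text_maker = ht.HTML2Text()
        text_maker.bypass_tables = False
        item['md'] = text_maker.handle(response.text)  # HTML -> Markdown
        yield item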