import requests
from lxml import etree
import json
import csv
import os
import time
from datetime import datetime
# Names of China's 34 province-level divisions, in the original order.
province_list = [
    "北京", "天津", "河北", "山西", "内蒙古", "辽宁",
    "吉林", "黑龙江", "上海", "江苏", "浙江", "安徽",
    "福建", "江西", "山东", "河南", "湖北", "湖南",
    "广东", "广西", "海南", "重庆", "四川", "贵州",
    "云南", "西藏", "陕西", "甘肃", "青海", "宁夏",
    "新疆", "台湾", "香港", "澳门",
]
class spider(object):
# 初始化数据
def __init__(self):
# 城市景点列表的ajax请求网址:keyword:城市名称 page:第几页 sort:pp=》按人气排名
self.url = 'https://piao.qunar.com/ticket/list.json?keyword=%s®ion=&from=mpl_search_suggest&page=%s&sort=pp'
# 景点url
self.travel_url = 'https://piao.qunar.com/ticket/detail_%s.html'
# https://piao.qunar.com/ticket/detailLight/sightCommentList.json?sightId=13728&index=1&page=1&pageSize=10&tagType=0
# 景点评论url
self.comment_url = 'https://piao.qunar.com/ticket/detailLight/sightCommentList.json?sightId=%s&index=1&page=%s&pageSize=10&tagType=0'
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
"Chrome/125.0.0.0 Safari/537.36",
"Cookie": """SECKEY_ABVK=2anBSiiH/Blr/wkyCcDwTXF0EsPCC/lk6glOqMaU1Vg%3D; BMAP_SECKEY=XkTjMln_lU_mCoCBDmPz7uo3dpODwHBrdvIk_Mkn9HzbXbA-irSglXaPlXjV4xSGSjAuYwTSHFq91Vwa4Z71wZQQiWWpIq_XrGt_G96cDb9_jy-GUGWoSNmB62cAmKK8NXJQzobpJGcuvRO9n0f7fYNdLXt2Ch5juzyx1fSnktETMW4BylLAv7nW8qdfJcuB; QN1=00013a80306c6097a8f84b0b; QN99=7575; qunar-assist={%22version%22:%2220211215173359.925%22%2C%22show%22:false%2C%22audio%22:false%2C%22speed%22:%22middle%22%2C%22zomm%22:1%2C%22cursor%22:false%2C%22pointer%22:false%2C%22bigtext%22:false%2C%22overead%22:false%2C%22readscreen%22:false%2C%22theme%22:%22default%22}; QN269=6A0222B020E111EFA884C68241B7B338; fid=3854ee48-b5d9-4db4-939b-729b0d854dac; QN277=s%3Dbing; QN48=tc_c652c6704bf0bad7_18fd912ac32_2157; QN300=s%3Dbing; QN205=s%3Dbing; QN601=5cccfb678265181e61e4e020915750d9; ariaDefaultTheme=null; quinn=434bffd7f0dc297385480c1911d2e2dbf0717f3301ebedf049150f058afb9a4571430528114ccba477d0187067bbdd37; QN57=17173346638190.9571355881571146; QunarGlobal=10.68.204.216_-51a614aa_18f7c8916a5_6032|1717334665766; ctt_june=1683616182042##iK3wasanVuPwawPwasaOW2kIVK0GERjwX%3DGDaSDwVRoDX2WGWK0hER38XPX8iK3siK3saKgnWsa%2BVKvAWKjOahPwaUvt; ctf_june=1683616182042##iK3wWSjsWwPwawPwa%3DjAW2j%3DaSGRXSHGW2aAWPXAWK3sWSXsEPPAaRXOEDP%3DiK3siK3saKgnWsa%2BVKvAWKj%2BahPwaUvt; cs_june=f6eb45fb177ffc7b1a1adcaf4660700ac0aa2dd64e1fd40ba7c431a5a772eaea204e2da782fb1b0b8373bd775579573192de14385b88bb72b63f8d0d3f1253ddb17c80df7eee7c02a9c1a6a5b97c11793f60b97952cd65ddcf77dc3cf9a2cf7c5a737ae180251ef5be23400b098dd8ca; QN271AC=register_pc; QN271SL=82ff5934cad5e560a6bfcaec8c37203c; QN271RC=82ff5934cad5e560a6bfcaec8c37203c; _q=U.uyfetru1921; csrfToken=vu3zgaCwhJ3g6fucnulczWdzfpbpbxv1; _s=s_GFG76Z4DKCE4GE7K6LKDXHLFPM; _t=28732424; _v=g5-1QWto2OobEtgP816t8cP3_jTE2_1kq2vwB4mVnkIC8LC4ORuq9M2a4Rw3xGqtM0mvMbG4gQVHIMgezGdMXgvP5-iewvn1H0OfYnhDW23JTtYUJJoxcPDVWvqTC6ILYQcuDJrKQIYVcvJiUCs7uw_GOMW6Vef37HgSQd2nwrhE; QN43=""; QN42=%E5%8E%BB%E5%93%AA%E5%84%BF%E7%94%A8%E6%88%B7; QN44=uyfetru1921; 
_i=DFiEuYALRrSwIKi6tc2mTe5qDK6w; activityClose=1; QN243=3; QNSPU=4065721161%2C4272920601%2C3394745388%2C2664232863; QN71="MTEzLjE2LjI0NS44MDrljJfkuqw6MQ=="; QN63=%E5%8C%97%E4%BA%AC; QN67=14763; _vi=MsSK3np0a5nqRkzbcGBdbxf0t5qX5_XZdhDR7_QJ9RoHm0Hwaa1xAuIpnD7QjuMZ--JjrohKPiDawmnBhPTE9ofMLydzFTZIyLkOI1EjU-0ydgwOHLh1wG9yoZyJDPhHf739NRktIWEHV5upKT0vSPXFMZ-kKKY7NjuW0JzQXJLU; QN271=d4400887-38ce-426c-88b8-2036b72c6f05; __qt=v1%7CVTJGc2RHVmtYMSsxanBaOU12SmZkbS9PdWVmYUlscnpOdyttYnFwakUwZW44ajFMeEZhNzJwQmpRSHlhVXNEZ2twN1NYRXN5bEdCb0xsRlRvMVhzb2RlWGt6YjhHbVpteGhCRG81K25waEozekhQWStoYmtQTDZDQ2JCUnpRZmtRYmtRdWlySUFUTGhHMDlaMDRvZ041bEwwbkRzdHc3bTdEblo3TTRBVmU4PQ%3D%3D%7C1717381770078%7CVTJGc2RHVmtYMS9SQlhFL1JKSkYrTkJQT3ZydEdpUGtXMjExQVd6N1MvZ3JNZW90bEZBRjhjQzFHcEZsUGdjR0xTV3k4MjVPZXcvZHBWeFBMclpzc3c9PQ%3D%3D%7CVTJGc2RHVmtYMS9GOFQyelVrVDBKY0VKUForTGkrbnZtbitjV200cTJLamlHUjZhZm9UOGVCa01ZS3ByUWVIV3Y4M0ZQUnhtNmhiMnN4Y0U0Y3NvYWIvZDgxWG1YekNVV3gyYnJGdDVMS09oOGxIbU8xYVMwNmI3dmRUZ25ZV2FSckxZRkd4czNoOHFWMnRTZko1dUtKVUkrTDZ0MTM0WUxidEN5OTRUVHZpQlk0bmxGTkdwb2xZclVST2prRkd3REY1S2lIdkZxcExxemVNQjN4Vld2YVZ2ems1UkVVei90dkQwbStRYVZEcGJ0aGhRaWdVaG1CZzdVQ1J0K0VDSldaa0wyWHcxVG1KY2lrMzB6R3pZTEdUNDZ5cjhBcWRMNytvcjE5bHBmZ0FrNVR4cERuVVBkekZ1SmVOUzZtWG0yOXQwT2lqdzB2SU1QMGpLRlBFYm5oU09ZRS9HellTUHMwZ2NaNGpsOVJSZ2VPMCtTdk9oUFJXdWtZaWZFektOd1JlTFRQN0U1VThvVk5ucHFoWDdEL3M5U0NSMTlJcU9QWFZqUkZWVGExTENCREdwN0h4NnE3N21Xd0RBSmRSSUFVQXBxUmwrKzNPZ0tBQytKRTVUWFN6cXJDY3R4eHkyR3Y4SGJoNWMzM0p2SDNmRFhPQkFpd3lvZUVtSit6a213dE5WVExMWjNvNHV3R0lqYTN0Mi9RRmhqbFBtZE1SY0VjeWxuMUV6Si83ZXAwMzhrbXV4dDcrYkJHeGtrYitLZXlaaERFTDJPSjMwKzdSZ0t0U29tMEFqUzZRNlcvbVNyYTY2d21qLzFXK1N3VkNKQmtjNHg2b2VpdG9rTTNjZkZUdFVUcXBQdTA2Q2V3K0VBc2FkdjZJSWNjVG5SWlFUZEhuTVFVMW83bjRPN1NRdE85VW9OcWUxWmI4dVp3UEhzaGJRbUFGeWhpRVpaWVQ3RHU5N0VGWEpQT1A5dnhQdk5rU2syYm1nQWNuTzR6VWJZbkxwbDFhSGpzeTFRMkNj; JSESSIONID=1F722A5AD1B5DE01654252574E750FE7; QN267=310146450aa655dec; QN58=1717379098256%7C1717381776823%7C31"""
}
# 初始化
def init(self):
    """Create ``travel.csv`` and ``comments.csv`` with header rows if missing.

    Each file is written only when it does not exist yet; a file that is
    already present is left untouched.
    """
    header_rows = (
        ('travel.csv', [
            'name',
            'price',
            'province',
            'star',
            'address',
            'detail_url',
            'comment_total',
            'detail_intro',
            'short_intro',
            'img_list',
            'cover',
            'sale_count',
            'districts',
            'score'
        ]),
        ('comments.csv', [
            'travel_name',
            'content',
            'date'
        ]),
    )
    for filename, columns in header_rows:
        if os.path.exists(filename):
            continue
        with open(filename, 'w', encoding='utf8', newline='') as out:
            # Header row
            csv.writer(out).writerow(columns)
# 发送请求
def send_request(self, url, timeout=30):
    """Send a GET request to *url* using the spider's headers.

    Args:
        url: URL to fetch.
        timeout: seconds to wait for the server before ``requests`` gives
            up and raises ``requests.exceptions.Timeout``. Without a
            timeout a stalled connection would block indefinitely.

    Returns:
        The ``requests.Response`` when the status code is 200, otherwise
        ``None``.
    """
    response = requests.get(url, headers=self.headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response
# 将数据写入csv文件
def save_to_csv(self, row):
    """Append *row* as a single record to ``travel.csv``."""
    with open('travel.csv', mode='a', encoding='utf8', newline='') as out:
        csv.writer(out).writerow(row)
def comments_to_csv(self, row):
    """Append *row* as a single record to ``comments.csv``."""
    with open('comments.csv', mode='a', encoding='utf8', newline='') as out:
        csv.writer(out).writerow(row)
# 主程序 resp:爬取的页面信息 province:城市
def spider_main(self, resp, province):
# 城市景点列表
travel_list = resp.json()['data']['sightList']
for index, travel in enumerate(travel_list):
try:
print("正在爬取该页 %s 条数据" % str(index + 1))
time.sleep(2)
# 景点名称
name = travel['sightName']
# 门票价格
try:
price = travel['qunarPrice']
except:
price = 0
# 有点景点可能没有评等级,所以需要判断一下
try:
star = travel['star']
except:
star = '未评'
# 详细地址
address = travel['address']
# 短评
short_intro = travel['intro']
# �