方法一:
pip install curl_cffi==0.7.4
pip install scrapy-fingerprint==0.1.3
在 settings.py 中启用中间件:
# Enable the scrapy-fingerprint downloader middleware (registered with order value 100).
DOWNLOADER_MIDDLEWARES = {
    "scrapy_fingerprint.fingerprintmiddlewares.FingerprintMiddleware": 100,
}
将 yield scrapy.Request(url=url,callback=self.parse) 改为以下写法:
from scrapy_fingerprint.request import FingerprintRequest
class AaSpider(scrapy.Spider):
    """Example spider whose initial request is issued as a FingerprintRequest."""

    def start_requests(self):
        """Yield the first request, using FingerprintRequest instead of scrapy.Request.

        The response is handed to ``self.parse``.
        """
        # The address in the notes had been mangled by a link rewriter
        # ('https://<proxy-host>:443/https/www.Aa.com/'); this is the plain target URL.
        url = 'https://www.Aa.com/'
        yield FingerprintRequest(url=url, callback=self.parse)
方法二:
安装 tls_client 库
pip install tls-client==1.0.1
设置中间件
# Enable the project's tls_client-based downloader middleware (registered with order value 100).
DOWNLOADER_MIDDLEWARES = {
    "reel_rush_daily.middlewares.PassJa3TlsMiddleware": 100,
}
middlewares.py增加代码
from scrapy.http import HtmlResponse
from tls_client import Session
class PassJa3TlsMiddleware(object):
    """Downloader middleware that fetches selected requests through ``tls_client``.

    Requests whose URL contains ``.agoramt.com`` are sent with a shared
    ``tls_client`` session created with the ``chrome_104`` client identifier,
    and the result is handed back to Scrapy as an ``HtmlResponse``.
    """

    def __init__(self):
        # A single session is created once and reused for every request.
        self.session: Session = Session(
            client_identifier="chrome_104"
        )

    def process_request(self, request, spider):
        """Fetch ``request`` via tls_client and return the wrapped response.

        Args:
            request: The Scrapy request being downloaded.
            spider: The spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` built from the tls_client response, or
            ``None`` when the URL does not contain ``.agoramt.com``.
        """
        # NOTE(review): the notes lost their indentation, so the extent of this
        # domain check is an assumption -- it is taken to gate the whole
        # tls_client path. Confirm against the real middlewares.py.
        if '.agoramt.com' not in request.url:
            return None

        print(f'request.url:{request.url}')
        proxies = request.meta.get("proxies") or None
        headers = request.headers.to_unicode_dict()
        if request.method == "GET":
            response = self.session.get(
                url=request.url,
                headers=headers,
                proxy=proxies,
                timeout_seconds=60,
            )
        else:
            response = self.session.post(
                url=request.url,
                headers=headers,
                # Forward the request payload; it was previously dropped, so
                # form/JSON POSTs reached the server with an empty body.
                data=request.body or None,
                proxy=proxies,
                timeout_seconds=60,
            )
        return HtmlResponse(
            url=request.url,
            status=response.status_code,
            body=response.content,
            encoding="utf-8",
            request=request,
        )
方法三(推荐):
安装 curl-cffi 库
pip install curl-cffi==0.2.4
设置中间件
# Enable the project's curl_cffi-based downloader middleware (registered with order value 100).
DOWNLOADER_MIDDLEWARES = {
    "reel_rush_daily.middlewares.PassJa3TlsMiddleware": 100,
}
middlewares.py增加代码
from scrapy.http import HtmlResponse
from curl_cffi import requests
class PassJa3TlsMiddleware(object):
    """Downloader middleware that performs every download with ``curl_cffi``.

    Each request is sent with ``impersonate="chrome101"`` and the result is
    handed back to Scrapy as an ``HtmlResponse``.
    """

    def __init__(self, settings):
        """Read the timeout and proxy configuration from ``settings``.

        Args:
            settings: Mapping-like settings object; ``DOWNLOAD_TIMEOUT`` and
                ``REQUESTS_PROXIES`` are looked up with ``.get``.
        """
        self.timeout = settings.get('DOWNLOAD_TIMEOUT')
        self.proxies = settings.get('REQUESTS_PROXIES')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def process_request(self, request, spider):
        """Download ``request`` with curl_cffi and return the wrapped response.

        Args:
            request: The Scrapy request being downloaded.
            spider: The spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` carrying the status code and body of the
            curl_cffi response, decoded as UTF-8.
        """
        headers = request.headers.to_unicode_dict()
        method = request.method
        # GET is sent without a payload, exactly as before; every other verb
        # forwards the Scrapy request body.
        payload = None if method == "GET" else request.body
        # Dispatch on the real HTTP method. Previously anything that was not a
        # GET (PUT, DELETE, PATCH, HEAD, ...) was silently sent as a POST.
        response = requests.request(
            method,
            url=request.url,
            headers=headers,
            data=payload,
            proxies=self.proxies,
            timeout=self.timeout,
            impersonate="chrome101",
        )
        return HtmlResponse(
            url=request.url,
            status=response.status_code,
            body=response.content,
            encoding="utf-8",
            request=request,
        )