import requests
from retrying import retry
from lxml import etree
import time
from queue import Queue
import threading
class QiuBai:
    """Multi-threaded crawler for the ximalaya.com tech ("keji") category.

    A pipeline of daemon worker threads connected by queues:
    list-page URLs -> list-page HTML -> album URLs -> album HTML ->
    album titles -> appended to a text file on disk.
    """

    def __init__(self):
        # Desktop Chrome UA so the site serves the regular HTML pages.
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
        self.url_temp = "https://www.ximalaya.com/keji/p{}/"
        self.url_first_queue = Queue()     # category list-page URLs
        self.url_second_queue = Queue()    # individual album URLs
        self.html_first_queue = Queue()    # parsed list-page HTML trees (or None on failure)
        self.html_second_queue = Queue()   # parsed album-page HTML trees (or None on failure)
        self.content_list_queue = Queue()  # extracted album titles

    def get_url_list(self):
        """Build the list-page URLs (pages 1-2) and feed the first queue.

        Returns:
            list[str]: the generated URLs (also printed and enqueued).
        """
        url_list = [self.url_temp.format(i) for i in range(1, 3)]
        for url in url_list:
            self.url_first_queue.put(url)
            print(url)
        return url_list

    def _parse_url(self, url, return_str=False, max_attempts=4):
        """Fetch *url* and return it parsed as an lxml HTML tree.

        Retries up to *max_attempts* times on any request/parse failure —
        a stdlib loop replacing the third-party ``@retry`` decorator
        (which used ``stop_max_attempt_number=4``); the last exception is
        re-raised when every attempt fails.

        Args:
            url: page to download.
            return_str: kept for interface compatibility; currently unused.
            max_attempts: total attempts before giving up (default 4,
                matching the original decorator).

        Raises:
            requests.HTTPError: via ``raise_for_status()`` on a non-2xx
                response (replaces the original ``assert``, which would be
                stripped under ``python -O``).
        """
        last_exc = None
        for _ in range(max_attempts):
            try:
                r = requests.get(url, headers=self.headers, timeout=5)
                r.raise_for_status()
                print(r.status_code)
                time.sleep(1)  # be polite between requests
                return etree.HTML(r.content.decode("utf-8"))
            except Exception as exc:  # noqa: BLE001 — retry on any failure, then re-raise
                last_exc = exc
        raise last_exc

    def parse_first_url(self):
        """Worker: fetch each list-page URL; queue its HTML (None on failure)."""
        while 1:
            url = self.url_first_queue.get()
            print(url)
            try:
                html = self._parse_url(url)
            except Exception:
                # Keep the pipeline moving on failure; the None marker lets
                # the downstream stage skip this page.
                html = None
                print('Error')
            self.html_first_queue.put(html)
            self.url_first_queue.task_done()

    def get_firstUrl_list(self):
        """Worker: extract album links from list pages, queue absolute URLs."""
        while 1:
            html = self.html_first_queue.get()
            if html is not None:
                html_first_list = html.xpath("//a[@class = 'u0jN album-title lg']/@href")
                for u in html_first_list:
                    # hrefs are site-relative; prefix the host.
                    self.url_second_queue.put("https://www.ximalaya.com" + u)
            self.html_first_queue.task_done()

    def parse_second_url(self):
        """Worker: fetch each album URL; queue its HTML (None on failure)."""
        while 1:
            url = self.url_second_queue.get()
            print(url)
            try:
                html = self._parse_url(url)
            except Exception:
                html = None
                print('Error')
            self.html_second_queue.put(html)
            print(self.html_second_queue.qsize())
            print('@' * 100)
            self.url_second_queue.task_done()
            print('具体内容页面数量', self.url_second_queue.qsize())

    def get_concent_list(self):
        """Worker: pull album HTML, extract the <h1> title, queue it."""
        while 1:
            html_1 = self.html_second_queue.get()
            print(html_1)
            if html_1 is not None:
                titles = html_1.xpath('//h1/text()')
                # Guard against pages without an <h1>: the original indexed
                # [0] unconditionally, so an empty result raised IndexError,
                # killed this worker thread, and deadlocked queue.join().
                if titles:
                    print(titles[0])
                    self.content_list_queue.put(titles[0])
            self.html_second_queue.task_done()
            print('数量', self.html_second_queue.qsize())

    def save_content_list(self):
        """Worker: append each extracted title to the output file, one per line."""
        while 1:
            content_list = self.content_list_queue.get()
            with open("C:/Users/JZG/Desktop/threads8-15/output/title.txt", "a", encoding="utf-8") as fp:
                fp.write(content_list + "\n")
            self.content_list_queue.task_done()
            print('内容数量', self.content_list_queue.qsize())

    def run(self):
        """Start all worker threads, then block until every queue drains.

        The RLock acquire/release that wrapped Thread() construction in the
        original was a no-op (only the main thread runs here) and has been
        removed.
        """
        thread_list = [threading.Thread(target=self.get_url_list)]
        for _ in range(2):   # two fetchers for list pages
            thread_list.append(threading.Thread(target=self.parse_first_url))
        thread_list.append(threading.Thread(target=self.get_firstUrl_list))
        for _ in range(5):   # five fetchers for album pages
            thread_list.append(threading.Thread(target=self.parse_second_url))
        for _ in range(3):   # three title extractors
            thread_list.append(threading.Thread(target=self.get_concent_list))
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # Daemon threads die with the main thread once the joins return.
            # (t.setDaemon(True) is deprecated since Python 3.10.)
            t.daemon = True
            t.start()
        print('哈哈哈哈哈')
        # Give the producer a head start so the queues are non-empty before
        # join() is called — joining an empty queue returns immediately.
        time.sleep(5)
        for q in [self.url_first_queue, self.url_second_queue,
                  self.html_first_queue, self.html_second_queue,
                  self.content_list_queue]:
            q.join()
if __name__ == '__main__':
    # Print wall-clock timestamps around the crawl as a crude duration log.
    print(time.ctime(time.time()))
    spider = QiuBai()
    spider.run()
    print(time.ctime(time.time()))