Python爬虫使用实例-漫kzhan_python爬虫漫画资源-CSDN下载

共4个文件

py：4个

需积分: 5 57 浏览量 2024-09-05 18:10:13 上传评论收藏 8KB ZIP 举报

在当前的网络环境下，Python爬虫技术已经成为数据抓取的重要手段，尤其在漫画网站资源抓取方面具有广泛的应用。本文档“Python爬虫使用实例-漫kzhan”详细介绍了如何使用Python爬虫技术实现对特定漫画网站内容的自动化抓取，包括单话、多话以及全系列漫画的抓取过程。文档首先引导读者理解如何实现对漫画网站单话内容的抓取。这一部分会介绍爬虫的基础知识点，例如如何设置请求头、如何解析网页内容以及如何提取漫画图片链接。这些基础操作是后续更复杂功能实现的前提。在单话抓取的基础上，文档还将继续深入讲解如何扩展到多话内容的批量抓取，这通常涉及到如何遍历网站的章节列表，并且保存每一话中每一页的漫画图片。进一步地，文档将展示如何将抓取到的单话图片合成为一张长图，这一功能对于那些想要在电子设备上阅读漫画的用户非常有用。合成长图的过程涉及到图片处理技术，比如图像的拼接和裁剪，确保最终的图片在视觉上连贯且不失真。最后，文档将介绍如何将多个长图合并为一个PDF文件。这一功能可以方便用户将漫画内容导出为PDF格式进行离线阅读或分享。实现该功能需要使用到能够生成和合并PDF的库：本项目的代码使用 Pillow 将图片保存为PDF，并使用 pypdf 的 PdfWriter 进行合并，也可以选用 reportlab 等其他库，文档将提供相应的代码示例和实现方法。整个文档不仅提供了一个完整的Python爬虫项目实例，还包括了对于项目中使用到的关键技术点的解释说明，使得读者不仅能够复现项目，还能够深入理解其背后的原理。这包括但不限于HTTP请求、HTML解析、图像处理以及PDF生成等，是学习Python爬虫和网页自动化处理技术的宝贵资源。此外，文档还可能介绍实践中可能遇到的问题及其处理方法，比如如何处理网络请求的异常、如何应对反爬虫机制以及如何优化爬虫运行效率等，这些都是爬虫实践中的重要环节。整体来说，“Python爬虫使用实例-漫kzhan”不仅是一个针对特定网站的爬虫项目，更是一个深入浅出地介绍Python爬虫技术的实用教程，对于想要学习或提高Python爬虫技术的人员有着较高的参考价值。通过本实例的详细解读，用户可以快速掌握如何运用Python进行网络数据的抓取、处理和输出，进而在自己的项目中实现自动化数据采集和处理。

资源推荐

资源详情

资源评论

收起资源包目录

mkzhan.zip （4个子文件）

mkzhan

longPicture.py 4KB

OptionalmultiChapter.py 15KB

singleChapter.py 1KB

multiChapter.py 13KB

# Download a comic. get_data(start=0, end=None) selects which chapters to
# fetch through its start and end parameters.
# Mind the ordering of the chapter list, and note that side stories exist too.
# Author's indexing notes: in a reverse-ordered listing count from the front
# (the first entry is index 0); in a forward-ordered listing count from the
# back (the last entry is index 0).
# chapter_name = i.css('a::text').getall()[-1].strip() takes the last text
# node ([-1]); changing it to [0] for a reversed listing does not work here.
# reversed() could not be used on the chapter list:
#     for i in list(reversed(list))[start:end]:
#     TypeError: 'SelectorList' object is not callable
# NOTE(review): that TypeError happens because the module-level variable is
# itself named `list`, so `list(...)` calls the SelectorList instead of the
# builtin. The name is kept because code outside this excerpt may read it.
# Adjust start instead: 10th chapter of a reverse-ordered listing -> start=9;
# 10th chapter of a forward-ordered listing -> start=len(list)-10;
# first chapter -> start=len(list)-1. end=None is the same as end=len(list).
# NOTE(review): the two triple-quote markers below are kept exactly where the
# original has them; this excerpt does not show whether they open or close a
# string, so everything between them is documented with # comments only.
"""
import os
import shutil
import requests
import parsel
from PIL import Image
from pypdf import PdfWriter
from collections import deque

# Seconds to wait on a single HTTP request, so a stalled connection cannot
# hang the script forever (requests has no timeout by default).
REQUEST_TIMEOUT = 30

# Characters replaced or dropped to obtain a usable file/folder name:
# separators and similar become '-', the remaining ones are removed.
_UNSAFE_NAME_CHARS = str.maketrans({
    '/': '-', '\\': '-', ':': '-', '*': '-',
    '?': None, '"': None, '<': None, '>': None, '|': None,
})

# Request address of the comic's index page.
# NOTE(review): the '' prefix in this URL (and in the
# chapter-content URL inside get_data) looks like an artifact of a mirroring
# proxy; presumably the intended host is www.mkzhan.com -- confirm before use.
url1 = 'https://www.mkzhan.com/209405/'
# Pretend to be a desktop browser.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
html_data = requests.get(url=url1, headers=headers, timeout=REQUEST_TIMEOUT).text
# print(html_data)

# Parse the page with CSS selectors; one entry per chapter item.
selector = parsel.Selector(html_data)
list = selector.css('.chapter__list .chapter__list-box .chapter__item')

long_images = []  # paths of every merged long image produced by get_data

# Create the main output folder.
# main_folder = 'XⅪ.Awaken'
main_folder = 'output\\非人哉'
os.makedirs(main_folder, exist_ok=True)


# Return `name` with characters that are unsafe in file names replaced/removed.
def _safe_name(name):
    return name.translate(_UNSAFE_NAME_CHARS)


# Download every page of one chapter into `chapter_folder`.
#   pages: the list found under ['data']['page'] in the chapter-content JSON;
#          each entry's 'image' value is used as the download URL.
# Returns the page file paths in reading order. Pages already on disk are
# reported and reused without issuing another request.
def _download_pages(pages, chapter_folder, chapter_name):
    page_paths = []
    for page_number, page in enumerate(pages, start=1):
        page_path = os.path.join(chapter_folder, f'{chapter_name}_{page_number}.png')
        if os.path.exists(page_path):
            print(f"图 {page_path} 已存在。")
        else:
            reply = requests.get(url=page['image'], headers=headers, timeout=REQUEST_TIMEOUT)
            # Fail loudly rather than storing an error page as a ".png" that
            # a later run would treat as an already-downloaded image.
            reply.raise_for_status()
            with open(page_path, mode='wb') as f:
                f.write(reply.content)
        page_paths.append(page_path)
    return page_paths


# Stack the images in `page_paths` top to bottom and save the result to
# `target_path`. The canvas is as wide as the widest page and as tall as the
# sum of all page heights. Every source image is closed as soon as it is used.
def _merge_vertically(page_paths, target_path):
    sizes = []
    for page_path in page_paths:
        with Image.open(page_path) as page:
            sizes.append(page.size)
    canvas = Image.new('RGB', (max(w for w, _ in sizes), sum(h for _, h in sizes)))
    offset = 0
    for page_path, (_, height) in zip(page_paths, sizes):
        with Image.open(page_path) as page:
            canvas.paste(page, (0, offset))
        offset += height
    canvas.save(target_path)


# Download the chapters list[start:end] and build one long image per chapter.
#   start: index of the first chapter to fetch.
#   end:   index one past the last chapter to fetch; pass end=None explicitly
#          to run to the end of the list (the default is NOT None).
# Examples: get_data(start=0, end=10) fetches the first 10 list entries;
#           get_data(start=5, end=None) fetches from index 5 to the end.
# Author's note on the defaults: "chapter 10; this comic is listed in forward
# order with updates at the bottom, whereas most comics are reverse-ordered".
# Side effects: creates main_folder/<chapter>/ with the page images, writes
# main_folder/<chapter>.png, and appends that path to long_images.
def get_data(start=1048, end=1049):
    # A None bound is handled by the slice itself.
    for item in list[start:end]:
        chapter_id = item.css('a::attr(data-chapterid)').get()
        raw_name = item.css('a::text').getall()[-1].strip()
        print(chapter_id, raw_name)
        # Sanitise BEFORE creating the folder so the folder and the files in
        # it share the same valid name.
        chapter_name = _safe_name(raw_name)

        chapter_folder = os.path.join(main_folder, chapter_name)
        os.makedirs(chapter_folder, exist_ok=True)

        # Query parameters of the chapter-content request.
        data = {
            'chapter_id': chapter_id,
            'comic_id': 209405,  # same id as in url1
            'format': '1',
            'quality': '1',
            'type': '1',
        }
        url = 'https://comic.mkzhan.com/chapter/content/v1/'
        response = requests.get(url=url, params=data, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        page_paths = _download_pages(response.json()['data']['page'], chapter_folder, chapter_name)
        if not page_paths:
            continue

        # The long image lives one level up, directly in main_folder, which is
        # the folder merged_pdf reads by default.
        long_img_path = os.path.join(main_folder, f'{chapter_name}.png')
        if os.path.exists(long_img_path):
            print(f"长图 {long_img_path} 已存在。")
        else:
            _merge_vertically(page_paths, long_img_path)
        long_images.append(long_img_path)


# Sort key for chapter titles: prologue first, numbered chapters by number,
# final/special chapters last.
def _chapter_sort_key(title):
    if "序章" in title:
        return (0, title)
    if "最终话" in title or "最后话" in title or "新oc" in title:
        return (2, title)
    digits = ''.join(filter(str.isdigit, title))
    return (1, int(digits) if digits else 0, title)


# Combine every image found directly in `image_folder` into one PDF.
#   image_folder:    folder scanned (non-recursively) for .png/.jpg/.jpeg/.gif
#   output_pdf_path: where the combined PDF is written
# Each image is first saved as a temporary single-image PDF next to it; the
# temporary files are removed afterwards, even when writing fails.
def merged_pdf(image_folder='output\\非人哉\\', output_pdf_path='output\\非人哉\\非人哉.pdf'):
    image_names = [
        name for name in os.listdir(image_folder)
        if name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif'))
    ]
    # Sort the file names themselves so that a title and its image can never
    # get out of step.
    image_names.sort(key=lambda name: _chapter_sort_key(os.path.splitext(name)[0]))

    pdf_writer = PdfWriter()
    temp_paths = []
    try:
        for name in image_names:
            title = os.path.splitext(name)[0]
            temp_pdf_path = os.path.join(image_folder, f"temp_{title}.pdf")
            temp_paths.append(temp_pdf_path)
            # Open one image at a time instead of holding all of them open.
            with Image.open(os.path.join(image_folder, name)) as img:
                img.save(temp_pdf_path, "PDF", quality=100)
            pdf_writer.append(temp_pdf_path)

        with open(output_pdf_path, 'wb') as f:
            pdf_writer.write(f)
    finally:
        for temp_pdf_path in temp_paths:
            if os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)
    print(f'PDF 文件已生成：{output_pdf_path}')


get_data()
# merged_pdf()
"""
# NOTE(review): the statements below repeat the setup above and continue
# beyond this excerpt, so they are reproduced without functional changes.
import os
import shutil
import requests
import parsel
from PIL import Image
from pypdf import PdfWriter
from collections import deque

url1='https://www.mkzhan.com/209405/'  # request address
# Pretend to be a desktop browser.
headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
html_data=requests.get(url=url1,headers=headers).text
#print(html_data)
# Parse the page with CSS selectors.
selector = parsel.Selector(html_data)
list = selector.css('.chapter__list .chapter__list-box .chapter__item')

评论收藏

内容反馈