file_path = "爬取.txt" withopen(file_path, "w", encoding="utf-8") as file: file.write(f"小说标题: {title}\n\n")
# 设置自定义的窗口数,例如3个窗口(线程) max_workers = 3
# 使用ThreadPoolExecutor创建线程池,指定窗口数 with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: # 使用map方法并行爬取章节内容 chapter_contents = executor.map(get_chapter_content, [base_url + chapter["href"] for chapter in chapter_list])
# 将爬取到的内容写入文件 for content in chapter_contents: file.write(content)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import concurrent.futures
import time
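The Options import only matters for the headless ("hidden window") setup mentioned later; the snippet shown here never constructs a driver, so the following is an illustrative sketch only, assuming Chrome:

options = Options()
options.add_argument("--headless")  # run Chrome without a visible window
driver = webdriver.Chrome(options=options)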
file_path = "爬取.txt" withopen(file_path, "w", encoding="utf-8") as file: file.write(f"小说标题: {title}\n\n")
# 设置自定义的窗口数,例如3个窗口(线程) max_workers = 3
# 使用ThreadPoolExecutor创建线程池,指定窗口数 with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: # 使用map方法并行爬取章节内容 chapter_contents = executor.map(get_chapter_content, [base_url + chapter["href"] for chapter in chapter_list])
# 将爬取到的内容写入文件 for content in chapter_contents: file.write(content)
print(f"\n全部章节已爬取并写入到 {file_path}")
Analysis of this code (not covering the headless-window library added later)

This Python script extracts chapter content from a specific website hosting the desired novel. The analysis follows:
Imported libraries:

BeautifulSoup (from bs4) for parsing the scraped HTML.
selenium for browser automation.
concurrent.futures for running tasks in parallel.
time for introducing delays.
from bs4 import BeautifulSoup
from selenium import webdriver
import concurrent.futures
import time
file_path = "爬取.txt" withopen(file_path, "w", encoding="utf-8") as file: file.write(f"小说标题: {title}\n\n")
Parallel extraction of chapter content:

The max_workers variable sets the number of concurrent threads (3 in this example).
ThreadPoolExecutor creates a thread pool with the specified number of workers.
The map method runs get_chapter_content in parallel to fetch each chapter's content.
The chapter contents are then written to the file.
max_workers = 3
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    chapter_contents = executor.map(
        get_chapter_content,
        [base_url + chapter["href"] for chapter in chapter_list],
    )

    for content in chapter_contents:
        file.write(content)
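One point worth noting: executor.map yields results in the same order as the input list even though the threads run concurrently, which is why the chapters end up in the file in reading order. A small self-contained illustration (not from the original script):

import concurrent.futures

def square(n):
    return n * n

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # map returns results in input order, regardless of which thread finishes first
    results = list(executor.map(square, [1, 2, 3, 4]))

print(results)  # [1, 4, 9, 16]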