Preface

In the earlier posts we used bs4 to scrape 笔趣阁 and stripped out the site's spammy "牛皮癣" boilerplate. In practice, though, a crawler never just prints to the terminal; the results always get written to a file. So how do we write the scraped data to a text file?
For example, the novel from the previous 笔趣阁 article: click to jump.
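Before the scraping demo, here is a minimal sketch of the file-writing pattern used throughout this post; the path and the strings are placeholders I made up, not values from the scraper:

import os

# Minimal file-writing sketch; demo.txt and the strings are placeholders.
file_path = "demo.txt"

# "w" overwrites the file on every run; encoding="utf-8" keeps Chinese text intact.
with open(file_path, "w", encoding="utf-8") as file:
    file.write("第一行\n")

# "a" appends instead of overwriting; the whole-book versions later rely on this.
with open(file_path, "a", encoding="utf-8") as file:
    file.write("第二行\n")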
Sample demo: scraping a single chapter

Save the data straight to a txt file, without any formatting of the content.

file_path = "C:\\Users\\你的电脑用户名\\Desktop\\文件名.txt"
with open(file_path, "w", encoding="utf-8") as file:
    for h1 in h1_elements:
        file.write("本文标题: " + h1.text + "\n")
    for p in p_elements:
        cleaned_text = (p.text
                        .replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "")
                        .replace("『点此报错』", "")
                        .replace("『加入书签』", ""))
        file.write("本文正文: " + cleaned_text + "\n")
print(f"内容已写入 {file_path} 文件")
Complete code:
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

session = requests.session()  # created but not actually used in this demo

host = "https://www.bqgam.com/index/11303/"
page = "https://www.bqgam.com/index/11303/1.html"

def requestUrl(url):
    # Fetch the page with a browser-like User-Agent and return the decoded HTML.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    }
    req = Request(url, headers=headers)
    html = urlopen(req)
    html = html.read().decode('utf-8')
    return html

def getPage(page):
    html = requestUrl(page)
    soup = BeautifulSoup(html, "html.parser")
    h1_elements = soup.find_all('h1', attrs={'class': 'wap_none'})
    p_elements = soup.find_all('div', attrs={'class': 'Readarea ReadAjax_content'})
    file_path = "C:\\Users\\xingchen\\Desktop\\novel_content.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        for h1 in h1_elements:
            file.write("本文标题: " + h1.text + "\n")
        for p in p_elements:
            # Strip the site's promo line and in-page widgets before writing.
            cleaned_text = (p.text
                            .replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "")
                            .replace("『点此报错』", "")
                            .replace("『加入书签』", ""))
            file.write("本文正文: " + cleaned_text + "\n")
    print(f"内容已写入 {file_path} 文件")

getPage(page)
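The desktop path above is hard-coded to one Windows account. As a small optional tweak (not part of the original code), the path can be derived from whoever is running the script:

import os

# Optional tweak: build the desktop path from the current user's home directory
# instead of hard-coding C:\Users\<username>. Assumes a standard "Desktop" folder exists.
file_path = os.path.join(os.path.expanduser("~"), "Desktop", "novel_content.txt")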
Formatting the scraped data

Make the written file break lines automatically so it displays more readably:
cleaned_text_with_breaks = cleaned_text.replace("。", "。\n").replace("!", "!\n").replace("?", "?\n")
file.write("本文正文: " + cleaned_text_with_breaks + "\n\n")
Complete code:
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

session = requests.session()

host = "https://www.bqgam.com/index/11303/"
page = "https://www.bqgam.com/index/11303/1.html"

def requestUrl(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    }
    req = Request(url, headers=headers)
    html = urlopen(req)
    html = html.read().decode('utf-8')
    return html

def getPage(page):
    html = requestUrl(page)
    soup = BeautifulSoup(html, "html.parser")
    h1_elements = soup.find_all('h1', attrs={'class': 'wap_none'})
    p_elements = soup.find_all('div', attrs={'class': 'Readarea ReadAjax_content'})
    file_path = "C:\\Users\\xingchen\\Desktop\\novel_content.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        for h1 in h1_elements:
            file.write(h1.text + "\n\n")
        for p in p_elements:
            cleaned_text = (p.text
                            .replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "")
                            .replace("『点此报错』", "")
                            .replace("『加入书签』", ""))
            cleaned_text_with_breaks = cleaned_text.replace("。", "。\n").replace("!", "!\n").replace("?", "?\n")
            file.write(cleaned_text_with_breaks + "\n\n")
    print(f"内容已写入 {file_path} 文件")

getPage(page)
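The chained replace() calls get the job done. Just as an alternative (not what the code above uses), the same line-breaking can be done in one pass with a regular expression:

import re

text = "第一句。第二句!第三句?结尾"
# \1 re-inserts the matched punctuation mark, then appends a newline after it.
text_with_breaks = re.sub(r"([。!?])", r"\1\n", text)
print(text_with_breaks)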
Going further: scrape every chapter of the novel and save it to a text file

The code so far can only scrape a single chapter. If we want to pull the whole book down in one go, the code needs some rework; the refactor follows.
I hit a lot of pitfalls along the way, and the (admittedly messy) code went through the following versions:
Version 1: creates a "novel name + author name".txt file in the current working directory and writes every chapter into it; there is no custom save-path option yet.

import os
import requests
from bs4 import BeautifulSoup

host = "https://www.bqgam.com/index/11303/"

def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

def getPage(page):
    # Scrape one chapter: title from the h1, body from the #chaptercontent div.
    soup = getSoup(page)
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    i = divText.rfind("请")  # cut off the trailing "请收藏本站…" promo line
    body = title + "\n" + divText[:i]
    return body

def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

def spider():
    soup = getSoup(host)
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    with open(save_path, "a", encoding='utf-8') as file:
        for a in soup.find(attrs={'class': 'listmain'}).find_all("a"):
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

spider()
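A quick note on getPage(): rfind("请") cuts the chapter at the last "请", which is where the site's "请收藏本站…" promo line starts, but it would also clip a chapter whose own ending happens to contain "请". A more targeted cleanup, sketched here as a hypothetical helper, drops only the lines that carry the site's URLs:

def cleanChapter(text):
    # Hypothetical alternative to rfind("请"): keep every line except the ones
    # that contain the site's own promo URLs.
    kept = [line for line in text.split("\n") if "bqgam.com" not in line]
    return "\n".join(kept)

getPage() could then return title + "\n" + cleanChapter(divText) instead of slicing at i.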
Version 2: builds on the previous version's dynamic "novel name + author name" lookup and adds a custom save-path option.

import os
import requests
from bs4 import BeautifulSoup

host = "https://www.bqgam.com/index/11303/"

def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

def getPage(page):
    soup = getSoup(page)
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    i = divText.rfind("请")
    body = title + "\n" + divText[:i]
    return body

def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

def spider(custom_save_path=None):
    soup = getSoup(host)
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    if custom_save_path is None:
        # Fall back to the working directory when no path is given.
        custom_save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    with open(custom_save_path, "a", encoding='utf-8') as file:
        for a in soup.find(attrs={'class': 'listmain'}).find_all("a"):
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

soup = getSoup(host)
fileName = soup.find(attrs={'class': 'info'}).h1.string
author = getAuthor(soup)
custom_save_path = os.path.join("D:\\Downloads\\", "{}_{}.txt".format(fileName, author))
spider(custom_save_path)
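One caveat worth flagging (my own note, not from the original version): open(..., "a") raises FileNotFoundError if the target folder does not exist, so it can help to create it first:

import os

save_dir = "D:\\Downloads"  # example folder; use whatever path you pass to spider()
# Create the folder if it is missing so the later open(..., "a") call succeeds.
os.makedirs(save_dir, exist_ok=True)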
Going further again: scrape only a specified range of chapters

For example, with 1-100 only chapters 1 through 100 are scraped and written to the txt file.
import os
import requests
from bs4 import BeautifulSoup

host = "https://www.bqgam.com/index/11303/"

def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

def getPage(page):
    soup = getSoup(page)
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    i = divText.rfind("请")
    body = title + "\n" + divText[:i]
    return body

def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

def spider(custom_save_path=None, start_chapter=None, end_chapter=None):
    soup = getSoup(host)
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    if custom_save_path is None:
        custom_save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    with open(custom_save_path, "a", encoding='utf-8') as file:
        chapters = soup.find(attrs={'class': 'listmain'}).find_all("a")
        if start_chapter is not None and end_chapter is not None:
            # The slice end is exclusive, so this keeps chapters start_chapter..end_chapter inclusive.
            chapters = chapters[start_chapter-1:end_chapter]
        for a in chapters:
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

soup = getSoup(host)
fileName = soup.find(attrs={'class': 'info'}).h1.string
author = getAuthor(soup)
custom_save_path = os.path.join("D:\\Downloads\\", "{}_{}.txt".format(fileName, author))
spider(custom_save_path, start_chapter=1, end_chapter=100)
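If you run this often, a thin command-line wrapper keeps the range and path configurable without editing the script. This is my own optional addition; it assumes the functions above live in the same file, and the flag names are made up:

import argparse

# Hypothetical CLI wrapper around spider(); flag names are my own choice.
parser = argparse.ArgumentParser(description="Scrape a chapter range into a txt file")
parser.add_argument("--save-path", default=None, help="where to write the txt file")
parser.add_argument("--start", type=int, default=None, help="first chapter (1-based)")
parser.add_argument("--end", type=int, default=None, help="last chapter, inclusive")
args = parser.parse_args()

spider(args.save_path, start_chapter=args.start, end_chapter=args.end)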
About this crawler code

The demos above all use the novel 全球崩坏 for the walkthrough. Other books can be scraped as well; just swap in the URL of another book from the same site.
For example, this novel: 宇宙职业选手.
If you want to scrape other 笔趣阁 mirrors or other sites, adjust the source code to match their actual tags and page structure; the code above only works for the www.bqgam.com site.
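To switch books on www.bqgam.com, the only thing that changes is the host URL. A minimal sketch of the idea; <book_id> is a placeholder, not a real value:

# Point host at the index page of another book on the same site before calling spider().
# <book_id> is a placeholder; take it from that book's URL on www.bqgam.com.
host = "https://www.bqgam.com/index/<book_id>/"
spider()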
Finally, a disclaimer: everything on this site is for learning and exchange only. Do not use it for commercial purposes; otherwise all consequences are the user's own responsibility. Thank you.