import requests,re
class Spider:
    """Minimal regex-based page scraper built on ``requests``."""

    # Upper bound on how many matches are kept for each pattern.
    MAX_MATCHES = 20

    def take_html(self, url, timeout=10):
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely on
                an unresponsive host.

        Returns:
            The decoded body of the response.
        """
        response = requests.get(url, timeout=timeout)
        # Decode with the encoding detected from the body itself rather
        # than the one guessed from the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text

    def take_info(self, url, **regex):
        """Fetch *url* and apply every named pattern to its HTML.

        Args:
            url: Address of the page to scrape.
            **regex: Mapping of result name to regular expression.

        Returns:
            A dict with one entry per keyword argument, holding the
            ``re.findall`` results for that pattern, truncated to the
            first ``MAX_MATCHES`` matches.
        """
        html = self.take_html(url)
        limit = self.MAX_MATCHES
        return {
            key: re.findall(pattern, html)[:limit]
            for key, pattern in regex.items()
        }
if __name__ == '__main__':
    # Script entry point: scrape one book index page and print what was found.
    url = 'https://www.x23us.com/html/69/69937/'

    # Result name -> pattern; insertion order fixes the order of the printed dict.
    patterns = {
        'book_title': '<title>(.*?)</title>',
        'book_author': '<meta name="description" content="冰与火之凛冬已至最新章节及全集列表免费在线订阅,本小说作者:(.*?),由顶点小说会员整理上传。" />',
        'book_chapter': '<td class="L"><a href="(.*?)">(.*?)</a></td>',
        'book_content': '<dd id="contents">(.*?)</dd>',
    }

    spider = Spider()
    info = spider.take_info(url, **patterns)
    print(info)