Spider小说模型

import requests,re

class Spider:
    """Download a web page and extract fields from it with regular expressions."""

    # Maximum number of matches kept per pattern; set to None to keep them all.
    MAX_MATCHES = 20

    def take_html(self, url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait on the server before ``requests`` raises
                a timeout error. Without it a stalled server would block the
                caller indefinitely. Pass ``None`` to wait without limit.

        Returns:
            The decoded body of the response.
        """
        r = requests.get(url, timeout=timeout)
        # Decode with the encoding detected from the body itself rather than
        # whatever the response headers declare.
        r.encoding = r.apparent_encoding
        return r.text

    def take_info(self, url, **regex):
        """Fetch ``url`` and collect the matches of every named pattern.

        Args:
            url: Address of the page to download.
            **regex: Field name -> regular expression pattern. Each pattern is
                applied to the page with ``re.findall``.

        Returns:
            A dict with one entry per keyword argument, mapping the field name
            to a list of at most ``MAX_MATCHES`` matches (an empty list when
            the pattern does not match).
        """
        return self._extract_info(self.take_html(url), regex)

    def _extract_info(self, html, patterns):
        """Apply each pattern in ``patterns`` to ``html``; no network access."""
        limit = self.MAX_MATCHES
        return {
            name: re.findall(pattern, html)[:limit]
            for name, pattern in patterns.items()
        }

if __name__ == '__main__':
    # Page to scrape.
    index_url = 'https://www.x23us.com/html/69/69937/'

    # One pattern per field. The keys become the keys of the printed dict and
    # the capture groups of each pattern are what gets collected.
    patterns = {
        'book_title': '<title>(.*?)</title>',
        'book_author': '<meta name="description" content="冰与火之凛冬已至最新章节及全集列表免费在线订阅，本小说作者：(.*?)，由顶点小说会员整理上传。" />',
        'book_chapter': '<td class="L"><a href="(.*?)">(.*?)</a></td>',
        'book_content': '<dd id="contents">(.*?)</dd>',
    }

    spider = Spider()
    print(spider.take_info(index_url, **patterns))

相关阅读:
PHP 数组对象按照某个字段进行排序
laravel 多条件查询
PHP QR CODE 类库生成二维码
TypeError：Cannot read property 'type' of undefined
input禁止输入的4种方法
QQ会话调用地址
禁止左右键复制
设置Meta标签清除页面缓存
百度统计,百度自动推送合并
强制使用360浏览器使用webkit内核

原文地址：https://www.cnblogs.com/cwx-0324/p/10193999.html