Crawling the Qidian novel site with the Scrapy framework
Step 1:
Open the Qidian homepage and click 全部作品 (All Works) to reach a page that lists every novel category. Categories come in two levels, major categories and sub-categories, and the first task is to crawl them all. Before crawling, look at the URL at the top of that page, https://www.qidian.com/all: that is where this crawl starts, not the Qidian homepage itself. The category names are stored in MongoDB and the category links in Redis. The code for crawling the categories follows:
import scrapy
from scrapy.selector import HtmlXPathSelector  # XPath-based extraction
from scrapy.http import Request
import pymongo

client = pymongo.MongoClient('localhost', 27017)  # connect to MongoDB
db = client.novel                 # novel is the database name
collection = db.novelclass        # novelclass is the collection inside the novel database

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # connect to Redis

class qidianClassSpider(scrapy.Spider):
    name = "qidianClass"              # the spider name used when running the crawl
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to visit
    start_urls = [
        "https://www.qidian.com/all",
    ]

    # parse is called back for every page that finishes downloading
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        hx = hxs.select('//div[@class="work-filter type-filter"]/ul[@type="category"]/li[@class=""]/a')  # major category nodes
        for secItem in hx:
            url = secItem.select("@href").extract()    # major category link
            name = secItem.select("text()").extract()  # major category name
            url = "https:" + url[0]  # prepend the scheme (Qidian uses https); url[0] takes the bare link string out of the list that extract() returns
            print(url)
            print(name[0])
            classid = self.insertMongo(name[0], None)  # store the category name in MongoDB; major category links need not go into Redis, since clicking a sub-category already leads to the novel listings
            request = Request(url, callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))  # hand pid to parse_subClass
            yield request
            print('-----------------')
The lambda above is an anonymous function; the crucial part is pid=str(classid), which fixes the sub-category's pid to the major category's _id at the moment the Request is built. pid points at a record's parent: major categories are top-level, so their pid is None, while each sub-category's pid is the _id of its major category. The reason this code can visit the related URLs "recursively" is that parse yields Request objects: the generator issues a request for each URL and registers a callback, so the category information is collected level by level.
    def parse_subClass(self, response, pid):
        print('-----------------')
        print('pid=' + pid)
        print('-----------------')
        hxs = HtmlXPathSelector(response)
        hx = hxs.select('//div[@class="sub-type"]/dl[@class=""]/dd[@class=""]/a')  # sub-category nodes
        for secItem in hx:
            urls = secItem.select("@href").extract()
            url = "https:" + urls[0]
            names = secItem.select("text()").extract()
            print(names[0])
            print(url)
            classid = self.insertMongo(names[0], pid)
            self.pushRedis(classid, pid, url)

    def insertMongo(self, classname, pid):
        classid = collection.insert({'classname': classname, "pid": pid})
        return classid

    def pushRedis(self, classid, pid, url):
        novelurl = '%s,%s,%s' % (classid, pid, url)
        r.lpush('novelurl', novelurl)  # push onto the Redis list named novelurl
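The pid=str(classid) default argument deserves a closer look: Python closures bind variables late, so without it every callback created in the loop would see the classid of the last iteration. A minimal standalone sketch of the difference (not Qidian-specific):

# Late binding: every closure sees the final value of i.
callbacks_late = [lambda: i for i in range(3)]
print([f() for f in callbacks_late])    # [2, 2, 2]

# Default argument: each closure captures i at creation time,
# which is exactly what pid=str(classid) does in the spider above.
callbacks_bound = [lambda i=i: i for i in range(3)]
print([f() for f in callbacks_bound])   # [0, 1, 2]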
A sample of the results queried in the MongoDB shell:
> db.novelclass.find()
{ "_id" : ObjectId("5a27dfb218eca4393c488f22"), "classname" : "玄幻", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f23"), "classname" : "奇幻", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f24"), "classname" : "武侠", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f25"), "classname" : "仙侠", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f26"), "classname" : "都市", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f27"), "classname" : "现实", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f28"), "classname" : "军事", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f29"), "classname" : "历史", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f2a"), "classname" : "游戏", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f2b"), "classname" : "体育", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f2c"), "classname" : "科幻", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f2d"), "classname" : "灵异", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f2e"), "classname" : "二次元", "pid" : null }
{ "_id" : ObjectId("5a27dfb218eca4393c488f2f"), "classname" : "短篇", "pid" : null }
{ "_id" : ObjectId("5a27dfb418eca4393c488f30"), "classname" : "现代魔法", "pid" : "5a27dfb218eca4393c488f23" }
{ "_id" : ObjectId("5a27dfb418eca4393c488f31"), "classname" : "剑与魔法", "pid" : "5a27dfb218eca4393c488f23" }
{ "_id" : ObjectId("5a27dfb418eca4393c488f32"), "classname" : "史诗奇幻", "pid" : "5a27dfb218eca4393c488f23" }
{ "_id" : ObjectId("5a27dfb418eca4393c488f33"), "classname" : "黑暗幻想", "pid" : "5a27dfb218eca4393c488f23" }
{ "_id" : ObjectId("5a27dfb418eca4393c488f34"), "classname" : "历史神话", "pid" : "5a27dfb218eca4393c488f23" }
{ "_id" : ObjectId("5a27dfb418eca4393c488f35"), "classname" : "另类幻想", "pid" : "5a27dfb218eca4393c488f23" }
Six sub-categories are shown so far, and each has a pid, i.e. a parent. The major categories all have a pid of null, which marks them as top-level. The pid "5a27dfb218eca4393c488f23" shared by the six sub-categories equals the _id ObjectId("5a27dfb218eca4393c488f23") of the major category 奇幻, so they all belong to that category. The _id values, at both levels, are generated by MongoDB itself.
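With this layout, listing the sub-categories of a major category is a single query on pid. A small sketch using the same connection settings as in the spider above (the _id string is taken from the 奇幻 row in the sample output):

import pymongo
client = pymongo.MongoClient('localhost', 27017)
db = client.novel
# Sub-categories store the parent's _id as a string in pid,
# so query with the string form of the major category's ObjectId.
for sub in db.novelclass.find({"pid": "5a27dfb218eca4393c488f23"}):
    print(sub["classname"])   # 现代魔法, 剑与魔法, ...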
A sample of the results in Redis:
69) "5a27dfb518eca4393c488f3d,5a27dfb218eca4393c488f27,https://www.qidian.com/all?chanId=15&subCateId=20106&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
70) "5a27dfb518eca4393c488f3c,5a27dfb218eca4393c488f27,https://www.qidian.com/all?chanId=15&subCateId=20105&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
71) "5a27dfb518eca4393c488f3b,5a27dfb218eca4393c488f27,https://www.qidian.com/all?chanId=15&subCateId=20104&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
72) "5a27dfb518eca4393c488f3a,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=20103&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
73) "5a27dfb518eca4393c488f39,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=20102&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
74) "5a27dfb518eca4393c488f38,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=240&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
75) "5a27dfb518eca4393c488f37,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=70&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
76) "5a27dfb518eca4393c488f36,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=7&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
77) "5a27dfb418eca4393c488f35,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=20093&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
78) "5a27dfb418eca4393c488f34,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=20092&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
79) "5a27dfb418eca4393c488f33,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=202&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
80) "5a27dfb418eca4393c488f32,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=201&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
81) "5a27dfb418eca4393c488f31,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=62&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
82) "5a27dfb418eca4393c488f30,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=38&orderId=&page=1&style=1&pageSize=20&
In total there are 82 sub-category links.
Step 2:
Crawl the novel names and links from the sub-category links stored in Redis during step 1. Since the links all sit in Redis, the first task is to read them back out. The code follows:
import scrapy
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector  # needed for the XPath extraction in parse
from time import sleep
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client.novel
collection = db.novellist  # this step crawls novel names, so a new collection, novellist, is used

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovel"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to visit

    def __init__(self):
        start_urls = []
        urlList = r.lrange('novelurl', 0, -1)  # novelurl is the list holding the sub-category links; 0, -1 takes the whole list (to take only four entries, use 0, 3)
        self.dict = {}  # keyed by category url; the value holds classid, pid, and a page counter
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}  # num counts pages crawled
        print(start_urls)
        qidianNovelSpider.start_urls = start_urls  # assign to the class attribute so Scrapy schedules these URLs
    # parse is called back for every page that finishes downloading
    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        num = classInfo['num']
        if num > 3:  # crawl at most four pages per sub-category
            return
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="book-mid-info"]/h4/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            classUrl = 'https:' + classUrl[0]
            print(className[0])
            print(classUrl)
            classid = collection.insert({'classname': className[0], "pid": objectid})  # insert the novel name into novellist
            r.lpush('novelnameurl', "%s,%s,%s" % (classid, objectid, classUrl))  # push the novel link onto the Redis list novelnameurl
            sleep(0.3)  # throttle a little between inserts
        print('-----------recursion-------------')
        nexturls = hxs.select('//a[@class="lbf-pagination-next "]')  # the "next page" link; the trailing space in the class name matches the site's markup
        if not nexturls:  # last page of this sub-category
            return
        nexturl = 'https:' + nexturls.select('@href').extract()[0]
        print('nexturl=%s' % nexturl)
        classInfo['num'] += 1
        self.dict[nexturl] = classInfo
        request = Request(nexturl, callback=self.parse)  # recurse into the next page
        yield request
        print('-----------end------------')
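One caveat with keying self.dict on response.url: if the site redirects (for example by changing query parameters), response.url no longer matches the URL read from Redis and the lookup raises a KeyError. A more robust variant, sketched below rather than used above, carries the state in Request.meta, which Scrapy copies onto the response:

import scrapy
from scrapy.http import Request

class MetaStateSpider(scrapy.Spider):
    # Hypothetical spider for this sketch; assumes self.dict is built
    # in __init__ exactly as in the spider above.
    name = "metaStateSketch"

    def start_requests(self):
        for url, info in self.dict.items():
            yield Request(url, meta={'info': info}, callback=self.parse)

    def parse(self, response):
        info = response.meta['info']  # same classid/pid/num dict, no URL lookup needed
        self.logger.info('classid=%s pid=%s num=%s',
                         info['classid'], info['pid'], info['num'])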
Step 3:
Crawl each work's details and update them into the novellist collection in MongoDB. Note that steps 2 and 3 share the same collection, novellist: after step 2 it holds only the novel names, and step 3 adds the author name, serialization status, and so on.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel  # database name is novel
collection = db.novellist

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelWorksInfo"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to visit

    def __init__(self):
        # read the novel links back out of the Redis list novelnameurl
        start_urls = []
        urlList = r.lrange('novelnameurl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)  # the stored id is a string; convert back to an ObjectId for the query
        pid = classInfo['pid']
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        workName = selector.xpath('//div[@class="book-info "]/h1/span/a[@class="writer"]/text()')  # author name
        novelName = selector.xpath('//div[@class="book-info "]/h1/em/text()')  # novel title
        novelState = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/span[@class="blue"]/text()')  # serialization status
        novelClass = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/a[@class="red"]/text()')  # major and sub category
        objClass = novelClass[0]
        sonClass = novelClass[1]
        print("Novel: " + novelName[0])
        print("Author: " + workName[0])
        print("Status: " + novelState[0])
        print("Category: " + objClass)
        print("Sub-category: " + sonClass)
        # xpath returns lists, so store the first element of each rather than the list itself
        db.novellist.update({"_id": objectid2}, {"$set": {'workName': workName[0], 'novelName': novelName[0], 'novelState': novelState[0], 'objClass': objClass, 'sonClass': sonClass}})
        print('--------end--------------')
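To check that the $set actually enriched the records, one can be read back; a quick sanity check, assuming the same connection as above:

# Fetch any record that has already been enriched with author info.
doc = db.novellist.find_one({"workName": {"$exists": True}})
print(doc['novelName'], doc['workName'], doc['novelState'])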
Step 4:
Crawl the novel chapters, following the novelnameurl list in Redis:
import scrapy
from lxml import etree
import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelChapterInfo  # new MongoDB collection for novel chapters

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelChapterInfo"
    allowed_domains = ["qidian.com"]

    def __init__(self):
        start_urls = []
        urlList = r.lrange('novelnameurl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        novelChapters = selector.xpath('//ul[@class="cf"]/li/a')  # chapter links in the table of contents
        for item in novelChapters:
            novelChapter = item.text
            print(item.text)
            novelChapterUrl = 'https:' + item.get('href')
            print(novelChapterUrl)
            classid = collection.insert({'novelChapter': novelChapter, 'pid': objectid})
            r.lpush('novelChapterUrl', '%s,%s,%s' % (classid, pid, novelChapterUrl))  # push onto a new Redis list, novelChapterUrl
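Every entry pushed here is a plain "classid,pid,chapterUrl" string, which is exactly what step 5 splits apart again. A quick way to inspect the newest entry, assuming the same Redis connection as above:

raw = r.lindex('novelChapterUrl', 0)  # lpush puts the newest entry at index 0
classid, pid, url = str(raw, encoding='utf-8').split(',')
print(classid, pid, url)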
Step 5:
Crawl the chapter contents from the novelChapterUrl list in Redis. This mirrors the relationship between steps 2 and 3: step 5 updates the chapter records that step 4 created, filling in the text.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel  # database name is novel
collection = db.novelChapterInfo

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelChapterContent"
    allowed_domains = ["qidian.com"]

    def __init__(self):
        start_urls = []
        urlList = r.lrange('novelChapterUrl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)
        pid = classInfo['pid']
        num = classInfo['num']
        content = ""
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        novelChaptersContents = selector.xpath('//div[@class ="read-content j_readContent"]/p')  # paragraphs of the chapter body
        print(novelChaptersContents)
        for item in novelChaptersContents:
            novelChaptersContent = item.text or ''  # empty <p> tags yield None
            print(novelChaptersContent)
            content = content + novelChaptersContent  # append in document order so the paragraphs stay in sequence
        db.novelChapterInfo.update({"_id": objectid2}, {"$set": {'novelChaptersContent': content}})
        print('------------------------------------------------------')
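Once step 5 has run, each chapter document created in step 4 carries its text as well. Reading one back ties the pipeline together; a sketch assuming the same connection as above:

# Fetch any chapter whose content has been filled in by step 5.
chapter = db.novelChapterInfo.find_one({"novelChaptersContent": {"$exists": True}})
print(chapter['novelChapter'])                # chapter title stored in step 4
print(chapter['novelChaptersContent'][:200])  # first 200 characters of the text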