• Crawling the Qidian novel site (qidian.com) with the Scrapy framework


    Step 1:

    Go to the Qidian homepage and click "All Works" (全部作品) to reach a page that
    lists every novel category. Categories come in major classes and subclasses, and
    the first task is to scrape them. Before crawling, note the URL shown at the top
    of that page, https://www.qidian.com/all; the crawl starts from this page rather
    than from the Qidian homepage. Category names are stored in MongoDB and category
    links in Redis. The code for scraping the categories:
    import scrapy
    from scrapy.selector import HtmlXPathSelector  # legacy Scrapy XPath selector (extracts via lxml)
    from scrapy.http import Request

    import pymongo
    client = pymongo.MongoClient('localhost', 27017)  # connect to MongoDB
    db = client.novel            # database: novel
    collection = db.novelclass   # collection: novelclass

    import redis
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # connect to Redis

    class qidianClassSpider(scrapy.Spider):
        name = "qidianClass"              # spider name passed to "scrapy crawl"
        allowed_domains = ["qidian.com"]  # domains the spider may visit
        start_urls = [
            "https://www.qidian.com/all",
        ]

        # parse() is called back with each crawled page's response
        def parse(self, response):
            hxs = HtmlXPathSelector(response)
            hx = hxs.select('//div[@class="work-filter type-filter"]/ul[@type="category"]/li[@class=""]/a')  # major-category links
            for secItem in hx:
                url = secItem.select("@href").extract()    # major-category link
                name = secItem.select("text()").extract()  # major-category name
                url = "https:" + url[0]  # hrefs are protocol-relative; Qidian uses https (url[0] takes the bare link out of the list)
                print(url)
                print(name[0])
                # Store the category name in MongoDB. Novel listings are reached by
                # clicking a subcategory, so major-category links are not pushed to Redis.
                classid = self.insertMongo(name[0], None)
                request = Request(url, callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))  # hand pid to parse_subClass
                yield request
                print('-----------------')
    The lambda is an anonymous function; the key part is pid=str(classid): a
    subcategory's pid is the _id of its parent major category. pid always points one
    level up, so major categories, which have no parent, get pid None, while each
    subcategory's pid is the id of its major category.

    The code can "recursively" visit the related URLs because parse() yields Request
    objects: each yield sends a request for a URL and registers a callback to handle
    the response, and Scrapy keeps scheduling those callbacks, which is how the whole
    category tree is collected.

        def parse_subClass(self, response, pid):
            print('-----------------')
            print('pid=' + pid)
            print('-----------------')
            hxs = HtmlXPathSelector(response)
            hx = hxs.select('//div[@class="sub-type"]/dl[@class=""]/dd[@class=""]/a')  # subcategory links
            for secItem in hx:
                urls = secItem.select("@href").extract()
                url = "https:" + urls[0]
                names = secItem.select("text()").extract()
                print(names[0])
                print(url)
                classid = self.insertMongo(names[0], pid)
                self.pushRedis(classid, pid, url)

        def insertMongo(self, classname, pid):
            classid = collection.insert({'classname': classname, "pid": pid})
            return classid

        def pushRedis(self, classid, pid, url):
            novelurl = '%s,%s,%s' % (classid, pid, url)
            r.lpush('novelurl', novelurl)  # push onto the Redis list named novelurl
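
    The default argument above matters because Python closures bind names late: by
    the time a callback runs, the loop variable already holds its final value. A
    minimal standalone sketch of the difference (no Scrapy involved):

    # Late binding: every lambda sees the loop variable's final value.
    late = [lambda: i for i in range(3)]
    print([f() for f in late])    # [2, 2, 2]

    # A default argument freezes the current value per iteration, which is
    # exactly what pid=str(classid) does in the spider above.
    bound = [lambda i=i: i for i in range(3)]
    print([f() for f in bound])   # [0, 1, 2]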

    Partial results, as queried in the mongo shell:
    > db.novelclass.find()
    { "_id" : ObjectId("5a27dfb218eca4393c488f22"), "classname" : "玄幻", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f23"), "classname" : "奇幻", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f24"), "classname" : "武侠", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f25"), "classname" : "仙侠", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f26"), "classname" : "都市", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f27"), "classname" : "现实", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f28"), "classname" : "军事", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f29"), "classname" : "历史", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f2a"), "classname" : "游戏", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f2b"), "classname" : "体育", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f2c"), "classname" : "科幻", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f2d"), "classname" : "灵异", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f2e"), "classname" : "二次元", "pid" : null }
    { "_id" : ObjectId("5a27dfb218eca4393c488f2f"), "classname" : "短篇", "pid" : null }
    { "_id" : ObjectId("5a27dfb418eca4393c488f30"), "classname" : "现代魔法", "pid" : "5a27dfb218eca4393c488f23" }
    { "_id" : ObjectId("5a27dfb418eca4393c488f31"), "classname" : "剑与魔法", "pid" : "5a27dfb218eca4393c488f23" }
    { "_id" : ObjectId("5a27dfb418eca4393c488f32"), "classname" : "史诗奇幻", "pid" : "5a27dfb218eca4393c488f23" }
    { "_id" : ObjectId("5a27dfb418eca4393c488f33"), "classname" : "黑暗幻想", "pid" : "5a27dfb218eca4393c488f23" }
    { "_id" : ObjectId("5a27dfb418eca4393c488f34"), "classname" : "历史神话", "pid" : "5a27dfb218eca4393c488f23" }
    { "_id" : ObjectId("5a27dfb418eca4393c488f35"), "classname" : "另类幻想", "pid" : "5a27dfb218eca4393c488f23" }

    Six subcategories are shown here, and each has a pid, i.e. a parent. The major
    categories all have pid null, which marks them as top-level. The six
    subcategories share the pid "5a27dfb218eca4393c488f23", which equals the
    "_id" : ObjectId("5a27dfb218eca4393c488f23") of the major category "奇幻", so they
    all belong to that category. The _id values at both levels are generated by
    MongoDB itself.
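
    To walk the hierarchy in code, query the collection by pid. A small pymongo
    sketch (same database and collection as above; note that the spider stored the
    pids as strings, not ObjectIds):

    import pymongo

    db = pymongo.MongoClient('localhost', 27017).novel
    parent = db.novelclass.find_one({'classname': '奇幻', 'pid': None})
    # Subcategory pids were stored as strings, so match on str(parent['_id']).
    for child in db.novelclass.find({'pid': str(parent['_id'])}):
        print(child['classname'])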

    Partial contents of the Redis list:

    69) "5a27dfb518eca4393c488f3d,5a27dfb218eca4393c488f27,https://www.qidian.com/all?chanId=15&subCateId=20106&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    70) "5a27dfb518eca4393c488f3c,5a27dfb218eca4393c488f27,https://www.qidian.com/all?chanId=15&subCateId=20105&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    71) "5a27dfb518eca4393c488f3b,5a27dfb218eca4393c488f27,https://www.qidian.com/all?chanId=15&subCateId=20104&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    72) "5a27dfb518eca4393c488f3a,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=20103&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    73) "5a27dfb518eca4393c488f39,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=20102&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    74) "5a27dfb518eca4393c488f38,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=240&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    75) "5a27dfb518eca4393c488f37,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=70&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    76) "5a27dfb518eca4393c488f36,5a27dfb218eca4393c488f2a,https://www.qidian.com/all?chanId=7&subCateId=7&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    77) "5a27dfb418eca4393c488f35,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=20093&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    78) "5a27dfb418eca4393c488f34,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=20092&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    79) "5a27dfb418eca4393c488f33,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=202&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    80) "5a27dfb418eca4393c488f32,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=201&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    81) "5a27dfb418eca4393c488f31,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=62&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    82) "5a27dfb418eca4393c488f30,5a27dfb218eca4393c488f23,https://www.qidian.com/all?chanId=1&subCateId=38&orderId=&page=1&style=1&pageSize=20&

    There are 82 subcategory links in total.
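
    A quick way to confirm that count and peek at the stored entries with redis-py
    (a sketch; the key name is the one used by pushRedis above):

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)
    print(r.llen('novelurl'))  # expected: 82
    for raw in r.lrange('novelurl', 0, 2):
        classid, pid, url = raw.decode('utf-8').split(',', 2)
        print(classid, pid, url)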

    Step 2:

    Using the subcategory links now stored in Redis, scrape each category's novel
    names and links. The links all live in the Redis list, so the spider first reads
    them back out. The code:

    import scrapy
    from scrapy.http import Request
    from scrapy.selector import HtmlXPathSelector  # needed by parse() below
    from time import sleep

    import pymongo
    client = pymongo.MongoClient('localhost', 27017)
    db = client.novel
    collection = db.novellist  # this step scrapes novel names, so a new collection, novellist

    import redis
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0

    class qidianNovelSpider(scrapy.Spider):
        name = "qidianNovel"
        allowed_domains = ["qidian.com"]  # domains the spider may visit

        def __init__(self):
            start_urls = []
            # 'novelurl' is the list holding the subcategory links; lrange(0, -1)
            # reads the whole list (e.g. lrange(0, 3) would read only the first four entries).
            urlList = r.lrange('novelurl', 0, -1)
            self.dict = {}  # keyed by category URL; value holds classid, pid and a page counter
            for item in urlList:
                itemStr = str(item, encoding="utf-8")
                arr = itemStr.split(',')
                classid = arr[0]
                pid = arr[1]
                url = arr[2]
                start_urls.append(url)
                self.dict[url] = {"classid": classid, "pid": pid, "num": 0}  # num counts pages crawled

            print(start_urls)
            qidianNovelSpider.start_urls = start_urls

        # parse() is called back with each crawled page's response
        def parse(self, response):
            classInfo = self.dict[response.url]
            objectid = classInfo['classid']
            pid = classInfo['pid']
            num = classInfo['num']
            if num > 3:  # stop after four pages per category
                return None
            hxs = HtmlXPathSelector(response)
            hxsObj = hxs.select('//div[@class="book-mid-info"]/h4/a')
            for secItem in hxsObj:
                className = secItem.select('text()').extract()
                classUrl = secItem.select('@href').extract()
                classUrl = 'https:' + classUrl[0]
                print(className[0])
                print(classUrl)
                classid = collection.insert({'classname': className[0], "pid": objectid})  # insert into novellist
                r.lpush('novelnameurl', "%s,%s,%s" % (classid, objectid, classUrl))  # push name/link entries onto the Redis list novelnameurl

            sleep(0.3)
            print('-----------recursion-------------')
            nexturls = hxs.select('//a[@class="lbf-pagination-next "]/@href').extract()  # "next page" link
            nexturl = 'https:' + nexturls[0]
            print('nexturl=%s' % nexturl)

            classInfo['num'] += 1
            self.dict[nexturl] = classInfo  # so the next page can find its category info
            request = Request(nexturl, callback=self.parse)
            yield request
            print('-----------end------------')
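
    These spiders are normally launched with the Scrapy CLI (scrapy crawl
    qidianNovel); a sketch of running one programmatically instead, in case the
    project is driven from a plain script:

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(qidianNovelSpider)  # the spider class defined above
    process.start()                   # blocks until the crawl finishes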

    Step 3:

    Scrape each novel's details and update them into the novellist collection in
    MongoDB. Note that steps 2 and 3 use the same collection, novellist: after step
    2 it holds only the novel names, and step 3 fills in the author name, work
    status, and so on.


    # -*- coding: utf-8 -*-
    import scrapy
    from lxml import etree
    import pymongo
    from bson.objectid import ObjectId

    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.novel  # database: novel
    collection = db.novellist

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0


    class qidianNovelSpider(scrapy.Spider):
        name = "qidianNovelWorksInfo"
        allowed_domains = ["qidian.com"]  # domains the spider may visit

        def __init__(self):
            # read the novel links back out of the Redis list novelnameurl
            start_urls = []
            urlList = r.lrange('novelnameurl', 0, -1)
            self.dict = {}
            for item in urlList:
                itemStr = str(item, encoding="utf-8")
                arr = itemStr.split(',')
                classid = arr[0]
                pid = arr[1]
                url = arr[2]
                start_urls.append(url)
                self.dict[url] = {"classid": classid, "pid": pid, "num": 0}

            print(start_urls)
            self.start_urls = start_urls

        def parse(self, response):
            classInfo = self.dict[response.url]
            objectid = classInfo['classid']
            objectid2 = ObjectId(objectid)  # Redis stores the id as a string; the update needs a real ObjectId
            pid = classInfo['pid']
            html = response.body.decode('utf-8')
            selector = etree.HTML(html)
            workName = selector.xpath('//div[@class="book-info "]/h1/span/a[@class="writer"]/text()')
            novelName = selector.xpath('//div[@class="book-info "]/h1/em/text()')
            novelState = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/span[@class="blue"]/text()')
            novelClass = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/a[@class="red"]/text()')
            objClass = novelClass[0]  # major category
            sonClass = novelClass[1]  # subcategory
            print("novel: " + novelName[0])
            print("author: " + workName[0])
            print("state: " + novelState[0])
            print("category: " + objClass)
            print("subcategory: " + sonClass)

            # xpath() returns lists, so store the first element of each
            db.novellist.update({"_id": objectid2}, {"$set": {'workName': workName[0], 'novelName': novelName[0], 'novelState': novelState[0], 'objClass': objClass, 'sonClass': sonClass}})

            print('--------end--------------')
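
    A quick sanity check after this step, sketched with pymongo (field names as
    written by the update above):

    doc = db.novellist.find_one({'workName': {'$exists': True}})
    if doc:
        print(doc['novelName'], doc['workName'], doc['novelState'])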

    Step 4:


    Scrape the chapter list of every novel, driven by the novelnameurl entries in Redis.

    import scrapy
    from lxml import etree

    import pymongo
    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.novel
    collection = db.novelChapterInfo  # a new MongoDB collection for chapters

    import redis
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0

    class qidianNovelSpider(scrapy.Spider):
        name = "qidianNovelChapterInfo"
        allowed_domains = ["qidian.com"]

        def __init__(self):
            start_urls = []
            urlList = r.lrange('novelnameurl', 0, -1)
            self.dict = {}
            for item in urlList:
                itemStr = str(item, encoding="utf-8")
                arr = itemStr.split(',')
                classid = arr[0]
                pid = arr[1]
                url = arr[2]
                start_urls.append(url)
                self.dict[url] = {"classid": classid, "pid": pid, "num": 0}

            self.start_urls = start_urls

        def parse(self, response):
            classInfo = self.dict[response.url]
            objectid = classInfo['classid']
            pid = classInfo['pid']
            html = response.body.decode('utf-8')
            selector = etree.HTML(html)
            novelChapters = selector.xpath('//ul[@class="cf"]/li/a')  # chapter links
            for item in novelChapters:
                novelChapter = item.text
                print(item.text)
                novelChapterUrl = 'https:' + item.get('href')
                print(novelChapterUrl)

                classid = collection.insert({'novelChapter': novelChapter, 'pid': objectid})
                r.lpush('novelChapterUrl', '%s,%s,%s' % (classid, pid, novelChapterUrl))  # a new Redis list, novelChapterUrl
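
    One caveat: lpush prepends, so lrange returns entries in the reverse of push
    order. A sketch of reading the chapter queue back in insertion order (key name
    as above):

    entries = r.lrange('novelChapterUrl', 0, -1)
    for raw in reversed(entries):  # with lpush, the oldest entry sits at the tail
        chapterid, pid, url = raw.decode('utf-8').split(',', 2)
        print(chapterid, url)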


    Step 5:
    Scrape the chapter text, driven by the novelChapterUrl entries in Redis. This
    mirrors steps 2 and 3: step 5 updates the chapter documents created in step 4
    with their content.

    # -*- coding: utf-8 -*-
    import scrapy
    from lxml import etree
    import pymongo
    from bson.objectid import ObjectId

    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.novel  # database: novel
    collection = db.novelChapterInfo

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0

    class qidianNovelSpider(scrapy.Spider):
        name = "qidianNovelChapterContent"
        allowed_domains = ["qidian.com"]

        def __init__(self):
            start_urls = []
            urlList = r.lrange('novelChapterUrl', 0, -1)
            self.dict = {}
            for item in urlList:
                itemStr = str(item, encoding="utf-8")
                arr = itemStr.split(',')
                classid = arr[0]
                pid = arr[1]
                url = arr[2]
                start_urls.append(url)
                self.dict[url] = {"classid": classid, "pid": pid, "num": 0}

            self.start_urls = start_urls

        def parse(self, response):
            classInfo = self.dict[response.url]
            objectid = classInfo['classid']
            objectid2 = ObjectId(objectid)
            pid = classInfo['pid']
            num = classInfo['num']
            content = ""
            html = response.body.decode('utf-8')
            selector = etree.HTML(html)
            novelChaptersContents = selector.xpath('//div[@class ="read-content j_readContent"]/p')
            print(novelChaptersContents)
            for item in novelChaptersContents:
                novelChaptersContent = item.text
                print(novelChaptersContent)
                content = content + novelChaptersContent  # append paragraphs in page order
            db.novelChapterInfo.update({"_id": objectid2}, {"$set": {'novelChaptersContent': content}})
            print('------------------------------------------------------')
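
    After this last step each chapter document carries its text. A sketch to read
    one back (field names as written above):

    doc = db.novelChapterInfo.find_one({'novelChaptersContent': {'$exists': True}})
    if doc:
        print(doc['novelChapter'])
        print(doc['novelChaptersContent'][:200])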
