• 糗事百科python爬虫


    # -*- coding: utf-8 -*-
    #coding=utf-8
    
    import urllib
    import urllib2
    import re
    import thread
    import time
    
    class QSBK(object):
        """Interactive console reader for the "hot" pages of qiushibaike.com.

        Pages are downloaded one at a time, parsed into ``[author, text]``
        pairs and buffered in ``self.store``. ``start()`` then prints one story
        per line of user input until the user types ``Q``.

        Attributes:
            pageIndex: number of the next page to download (starts at 1).
            user_agent: User-Agent string sent with every request.
            header: HTTP headers sent with every request.
            store: buffered pages; each entry is a list of ``[author, text]``.
            enable: True while the reader loop is allowed to run.
        """

        # Captures (author name, story text) for every story on a page.
        # ``\s*`` trims the whitespace surrounding the text inside <span>.
        _STORY_PATTERN = re.compile(
            r'<div class="author.*?<a.*?<img.*?</a>.*?<a.*?<h2>(.*?)</h2>'
            r'.*?class="content.*?<span>\s*(.*?)\s*</span>',
            re.S)

        def __init__(self):
            self.pageIndex = 1
            self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            self.header = {'User-Agent': self.user_agent}
            self.store = []
            self.enable = False

        def getPage(self, pageIndex):
            """Download page ``pageIndex`` and return its HTML as unicode text.

            Returns None (after printing a message) when the request fails
            with ``urllib2.URLError``.
            """
            url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
            request = urllib2.Request(url, headers=self.header)
            try:
                response = urllib2.urlopen(request)
                try:
                    return response.read().decode('utf-8')
                finally:
                    # Always release the connection, even if read/decode fails.
                    response.close()
            except urllib2.URLError as e:
                # ``reason`` may be an exception object rather than a string,
                # so convert it before concatenating.
                reason = getattr(e, 'reason', e)
                print('链接网络失败' + str(reason))
                return None

        def _parsePage(self, page):
            """Extract the stories from the HTML text ``page``.

            Returns a list of ``[author, text]`` lists, empty when nothing
            in ``page`` matches the story pattern.
            """
            return [[author, text]
                    for author, text in self._STORY_PATTERN.findall(page)]

        def getPageItem(self, pageIndex):
            """Download and parse page ``pageIndex``.

            Returns a list of ``[author, text]`` lists, or None when the page
            could not be downloaded.
            """
            page = self.getPage(pageIndex)
            if page is None:
                print('页面获得失败')
                return None
            return self._parsePage(page)

        def loadPage(self):
            """Buffer the next page if reading is enabled and fewer than two
            pages are waiting in ``self.store``.

            ``pageIndex`` only advances when the download succeeded, so a
            failed page is retried on the next call.
            """
            if not self.enable or len(self.store) >= 2:
                return
            pageStories = self.getPageItem(self.pageIndex)
            if pageStories is not None:
                self.store.append(pageStories)
                self.pageIndex += 1

        def getOneStory(self, pageStories):
            """Print the stories of one page, one per line of user input.

            Before each story the method waits for the user to press Enter
            and tops up the page buffer. Typing ``Q`` (or ``q``) disables the
            reader and returns immediately.
            """
            for story in pageStories:
                command = raw_input()
                self.loadPage()
                if command in ('Q', 'q'):
                    self.enable = False
                    return
                print(u'%s %s' % (story[0], story[1]))

        def start(self):
            """Run the interactive reader until the user quits or no more
            stories can be obtained.
            """
            print(u"正在读取糗事百科的数据,按Q退出")
            self.enable = True
            while self.enable:
                if not self.store:
                    self.loadPage()
                # Stop instead of spinning forever when the download failed
                # or the page yielded no stories.
                if not self.store or not self.store[0]:
                    print(u"没有更多内容,程序结束")
                    self.enable = False
                    break
                self.getOneStory(self.store.pop(0))
    
    
    
    if __name__ == '__main__':
        # Only start the interactive reader when run as a script, so that
        # importing this module does not hit the network or block on stdin.
        spider = QSBK()
        spider.start()
  • 相关阅读:
    Linux定时任务实现每秒执行一次
    go 操作 Excel
    带你十天轻松搞定 Go 微服务系列全集+勘误
    debian修改crontab默认编辑器为vim
    LeetCode 537 复数乘法
    LeetCode 219 存在重复元素 II
    centos8 安装docker
    Linux 文件管理之vim命令详解
    linux 网络管理之nmcli命令详解
    Windows通过计划任务定时执行bat文件
  • 原文地址:https://www.cnblogs.com/norm/p/7425193.html
Copyright © 2020-2023  润新知