• Python爬虫实战(一):爬糗事百科段子


    代码:

    # -*- coding: utf-8 -*-
    import urllib2
    import re
    from datetime import datetime
    
    class QSBK:
        """Interactive console reader for the Qiushibaike "hot" joke pages.

        Written for Python 2 (relies on ``urllib2`` and ``raw_input``).  Pages
        are downloaded one at a time, parsed with a regular expression and
        buffered in ``self.stories``; the user presses Enter to show the next
        story and types ``Q`` to quit.

        Attributes:
            pageIndex: number of the next page to download (starts at 1).
            user_agent: User-Agent string sent with every request.
            headers: HTTP headers dict built from ``user_agent``.
            stories: buffer of downloaded pages; each entry is the list of
                stories returned by ``getPageItems`` for one page.
            enable: True while the interactive loop should keep running.
        """

        # Compiled once at class creation instead of on every page / story.
        # Groups: 0 = text between the avatar <img> tag and its closing </a>,
        # 1 = <h2> contents, 2 = body of the "content" div, 3 = contents of
        # the HTML comment following the body, 4 = contents of the "number"
        # element inside the "stats" div.
        _ITEM_PATTERN = re.compile(
            '<div.*?author clearfix">.*?<a.*?<img.*?>(.*?)</a>.*?<a.*?<h2>(.*?)</h2>'
            '.*?</a>.*?<div.*?content">(.*?)<!--(.*?)-->.*?</div>'
            '.*?<div class="stats.*?class="number">(.*?)</i>',
            re.S)
        _BR_PATTERN = re.compile('<br/>')

        def __init__(self):
            self.pageIndex = 1
            self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            self.headers = {'User-Agent':self.user_agent}
            self.stories = []
            self.enable = False

        def getPage(self,pageIndex):
            """Download one listing page and return its HTML as unicode.

            Args:
                pageIndex: page number appended to the listing URL.

            Returns:
                The UTF-8 decoded page source, or None when the request fails
                with ``urllib2.URLError`` (the reason is printed if present).
            """
            # NOTE(review): there is no '/' between 'page' and the number;
            # confirm this matches the site's URL scheme.
            url = 'http://www.qiushibaike.com/hot/page'+str(pageIndex)
            request = urllib2.Request(url,headers = self.headers)
            try:
                response = urllib2.urlopen(request)
                try:
                    return response.read().decode('utf-8')
                finally:
                    # Release the connection even if read()/decode() raises.
                    response.close()
            except urllib2.URLError as e:
                if hasattr(e,'reason'):
                    # Same output as the former `print u"...reason: ",e.reason`.
                    print("QSBK connect Error,reason:  %s" % (e.reason,))
                return None

        def getPageItems(self,pageIndex):
            """Fetch a page and extract its stories.

            Args:
                pageIndex: page number passed on to ``getPage``.

            Returns:
                None if the page could not be loaded (or was empty); otherwise
                a list with one ``[h2 text, body, comment text, number]`` entry
                per matched item, each field stripped and with ``<br/>`` in the
                body replaced by a newline.  ``getOneStory`` prints these as
                poster, text, timestamp and like count respectively.
            """
            pageCode = self.getPage(pageIndex)
            if not pageCode:
                print("Page Loading Error...")
                return None
            pageStories = []
            for item in self._ITEM_PATTERN.findall(pageCode):
                # Skip items whose first captured group contains "img".
                if "img" in item[0]:
                    continue
                text = self._BR_PATTERN.sub("\n",item[2])
                pageStories.append([item[1].strip(),text.strip(),item[3].strip(),item[4].strip()])
            return pageStories

        def loadPage(self):
            """Top up the story buffer with the next page when it runs low.

            Does nothing unless the reader is enabled and fewer than two pages
            are buffered.  ``pageIndex`` only advances when the fetched page
            yielded at least one story, so a failed page is retried next time.
            """
            if not self.enable or len(self.stories) >= 2:
                return
            pageStories = self.getPageItems(self.pageIndex)
            if pageStories:
                self.stories.append(pageStories)
                self.pageIndex += 1

        def getOneStory(self,pageStories,page):
            """Show the stories of one page, one per Enter key press.

            Args:
                pageStories: list of stories as produced by ``getPageItems``.
                page: page counter shown in the output line.

            Typing ``Q`` clears ``self.enable`` and returns immediately.  The
            third story field is converted with ``int()`` and shown through
            ``datetime.fromtimestamp``.
            """
            for story in pageStories:
                command = raw_input()
                # Prefetch while the user is reading so the buffer stays filled.
                self.loadPage()
                if command == 'Q':
                    self.enable = False
                    return
                print(u"第%d页\t发布人:%s\t发布时间:%s\t赞:%s\n%s" % (page,story[0],datetime.fromtimestamp(int(story[2])),story[3],story[1]))

        def start(self):
            """Run the interactive loop until the user quits or loading fails."""
            print(u"正在读取糗事百科,按回车查看新段子,Q退出")
            self.enable = True
            self.loadPage()
            nowPage = 0
            while self.enable:
                if not self.stories:
                    # Buffer is empty: retry once, then stop instead of
                    # spinning forever on an empty buffer.
                    self.loadPage()
                    if not self.stories:
                        self.enable = False
                        break
                pageStories = self.stories.pop(0)
                nowPage += 1
                self.getOneStory(pageStories,nowPage)
    
    # Script entry: build the reader and run its interactive loop immediately.
    # start() returns once QSBK.enable becomes False (the user typed 'Q').
    spider = QSBK()
    spider.start()
    

      

  • 相关阅读:
    在vim中设置将tab自动转化为4个空格
    nginx1.4.6+php5.5.11+mysql5.6.17+mecache+opcache
    Centos7安装杀毒软件ClamAV
    网页中meta标记
    js刷新页面方法大全
    微信第三方登陆,无需注册一键登录,获取用户信息,PHP实现方法
    phpcms v9 如何实现用户登录
    web页面自适应手机屏幕宽度
    微信公共平台消息回复类
    自动回复微信消息
  • 原文地址:https://www.cnblogs.com/AndyJee/p/4997101.html
Copyright © 2020-2023  润新知