• Python爬虫实战(一):爬糗事百科段子


    代码:

    # _*_ coding:utf-8 _*_
    import urllib2
    import re
    from datetime import datetime
    
    class QSBK:
        """Interactive console reader for the "hot" listing on qiushibaike.com.

        Pages are downloaded one at a time, text-only posts are extracted with
        a regular expression, and one post is printed per line of console
        input until the user enters ``Q``.

        The class targets Python 2: it relies on the file-level ``urllib2``
        import and on the ``raw_input`` builtin.
        """

        # Compiled once for the whole class instead of on every page parse.
        # Capture groups, in order: markup between the avatar <img> and its
        # closing </a>, the <h2> text, the post body, the text of the HTML
        # comment that follows the body, and the "number" counter.
        _ITEM_PATTERN = re.compile(
            '<div.*?author clearfix">.*?<a.*?<img.*?>(.*?)</a>.*?<a.*?<h2>(.*?)</h2>.*?</a>.*?<div.*?'
            'content">(.*?)<!--(.*?)-->.*?</div>.*?<div class="stats.*?class="number">(.*?)</i>',
            re.S)

        def __init__(self):
            # Index of the next page to download.
            self.pageIndex = 1
            self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            self.headers = {'User-Agent': self.user_agent}
            # Buffer of already-parsed pages; each entry is a list of stories.
            self.stories = []
            # True while the interactive loop should keep running.
            self.enable = False

        def getPage(self, pageIndex):
            """Download one listing page and return it as decoded text.

            Args:
                pageIndex: page number appended to the listing URL.

            Returns:
                The page body decoded as UTF-8, or None when the request
                fails with ``urllib2.URLError``.
            """
            url = 'http://www.qiushibaike.com/hot/page' + str(pageIndex)
            try:
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request)
                try:
                    return response.read().decode('utf-8')
                finally:
                    # Release the connection even if read/decode fails.
                    response.close()
            except urllib2.URLError as e:
                if hasattr(e, 'reason'):
                    print("QSBK connect Error,reason:  %s" % (e.reason,))
                return None

        def getPageItems(self, pageIndex):
            """Parse one listing page into a list of text-only stories.

            Args:
                pageIndex: page number to download and parse.

            Returns:
                None when the page could not be loaded; otherwise a list of
                ``[h2 text, body, comment text, counter]`` lists, all
                stripped, with ``<br/>`` in the body turned into newlines.
                Entries whose first capture group contains ``img`` are
                skipped.
            """
            pageCode = self.getPage(pageIndex)
            if not pageCode:
                print("Page Loading Error...")
                return None
            pageStories = []
            for item in self._ITEM_PATTERN.findall(pageCode):
                if "img" in item[0]:
                    continue
                text = item[2].replace('<br/>', "\n")
                pageStories.append(
                    [item[1].strip(), text.strip(), item[3].strip(), item[4].strip()])
            return pageStories

        def loadPage(self):
            """Fetch the next page into the buffer when fewer than two are queued.

            Does nothing while the reader is disabled. ``pageIndex`` only
            advances when the fetched page yielded at least one story.
            """
            if self.enable and len(self.stories) < 2:
                pageStories = self.getPageItems(self.pageIndex)
                if pageStories:
                    self.stories.append(pageStories)
                    self.pageIndex += 1

        def getOneStory(self, pageStories, page):
            """Print the stories of one page, one per line of console input.

            Args:
                pageStories: stories of a single page, as built by
                    getPageItems.
                page: page counter shown in the printed header.

            Entering ``Q`` disables the reader and returns immediately.
            """
            for story in pageStories:
                command = raw_input()
                # Top up the buffer while the user is reading.
                self.loadPage()
                if command == 'Q':
                    self.enable = False
                    return
                # story[2] is passed through int() and treated as a Unix
                # timestamp; a non-numeric value raises ValueError here.
                print(u"第%d页\t发布人:%s\t发布时间:%s\t赞:%s\n%s" % (
                    page, story[0], datetime.fromtimestamp(int(story[2])),
                    story[3], story[1]))

        def start(self):
            """Run the interactive loop until the user quits or no page loads."""
            print(u"正在读取糗事百科,按回车查看新段子,Q退出")
            self.enable = True
            nowPage = 0
            while self.enable:
                if not self.stories:
                    self.loadPage()
                if not self.stories:
                    # Nothing buffered and nothing could be loaded: stop
                    # instead of spinning forever on an empty buffer.
                    self.enable = False
                    break
                pageStories = self.stories.pop(0)
                nowPage += 1
                self.getOneStory(pageStories, nowPage)
    
    # Script entry: build the crawler and hand control to its interactive
    # loop. There is no __main__ guard, so this also runs on import.
    spider = QSBK()
    spider.start()
    

      

  • 相关阅读:
    Webwork【04】Configuration 详解
    Webwork【03】核心类 ServletDispatcher 的初始化
    Webwork【02】前端OGNL试练
    Webwork【01】Webwork与 Struct 的前世今生
    Oracle 数据库日常巡检
    php jquery ajax select 二级联动【get方式】
    PHP+ajax实现二级联动【post+json方式】
    thinkphp中在页面怎么格式输出小数和时间
    DataTables Bootstrap 3 example
    Bootstrap表格动态加载内容和排序功能
  • 原文地址:https://www.cnblogs.com/AndyJee/p/4997101.html
Copyright © 2020-2023  润新知