• 糗事百科正则爬虫


    参考博客:http://cuiqingcai.com/990.html

    # -*- coding:utf-8 -*- 
    import urllib
    import urllib2
    import re
    
    page = 1
    
    url = "https://www.qiushibaike.com/8hr/page/" + str(page)
    headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
        # 匹配有图的帖子
        #pattern = re.compile('<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<img src="(.*?.jpg)" .*?stats-vote.*?number">(d+)',re.S) # re.S 多行匹配
        # 匹配无图的帖子
        pattern = re.compile('<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?.*?stats-vote.*?number">(d+)',re.S) # re.S 多行匹配
        items = re.findall(pattern,content)
        for item in items:
            print item[0], item[1].strip(), item[2]
    except urllib2.URLError, e:
        # 确定错误的属性
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
        

    第二版:面向对象改写,支持与用户交互(按键逐条查看段子)

    # -*- coding:utf-8 -*-
    
    import urllib, urllib2
    import re
    import thread
    import time 
    stories = []  # shared queue of formatted jokes; filled by Qsbk.get_page_items, drained by get_one_story
    
    class Qsbk():
        """定义一个丑事百科类"""
        def __init__(self):
            """初始方法"""
            self.url = "https://www.qiushibaike.com/8hr/page/"        
            self.headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
    
        def get_page(self, page):
            """传入某一页索引的代码"""
            fullurl = self.url + str(page)
            try:
                request = urllib2.Request(url=fullurl, headers=self.headers)
                response = urllib2.urlopen(request).read()
                self.get_page_items(response)
            except urllib2.URLError, e:
                if hasattr(e, "code"):
                    print e.code
                if hasattr(e, "reason"):
                    print e.reason
           
    
        def get_page_items(self, response):
            """获取段子列表"""
            global stories
            pattern = re.compile('<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?.*?stats-vote.*?number">(d+)',re.S) # re.S 多行匹配
            items = re.findall(pattern,response)
            for item in items:
                stories.append(item[0].strip()+"
    "+ item[2].strip()+"
    "+ item[1].strip().replace("<br>", "").replace("<br/>", ""))
    
        def load_page(self, page):
            """如果当列表中少于10,则加载新一页"""
            self.get_page(page)
    
        def get_one_story(self):
            """调用此方法,打印一个段子"""
            global stories
            print "--------------------------------------------------------------------------------------"
            print stories.pop(0)
            print "--------------------------------------------------------------------------------------
    "        
        
    def main():
        """控制函数"""
        print "段子加载中..."
        qsbk = Qsbk()
        page = 0       
        qsbk.load_page(page)
        while True:
    
            option = raw_input("按任意键看段,按q退出:")
            if "q" == option:
                break
            else:
                if len(stories) < 10:
                    page += 1
                    qsbk.load_page(page)
                qsbk.get_one_story()
    
    
    if __name__ == "__main__":
        main()
         
  • 相关阅读:
    c# 正则表达式 首字母转大写
    c# WebBrowser获取cookie
    c# 求最小公倍数
    Response.Redirect与Server.Transfer区别-转
    asp 读文件 比较ip
    asp数组的使用
    如何解决#1045
    mysql limit分页查询效率
    Docker 容器管理:rancher
    Docker监控:google/cadvisor
  • 原文地址:https://www.cnblogs.com/cuzz/p/7707596.html
Copyright © 2020-2023  润新知