• Web Crawler Basics, Case 1: Scraping Baidu Tieba Posts


    References:

    Python: http://www.runoob.com/python/python-intro.html

    Python crawler tutorial series: http://www.cnblogs.com/xin-xin/p/4297852.html

    Regular expressions: http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html

    Goals of this post:

    1. Scrape any given Baidu Tieba thread

    2. Optionally fetch only the original poster's (OP's) replies

    3. Parse the scraped content and save it to a text file

    4. Download the images that appear in the thread
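    Before diving into the full script, here is a minimal sketch of the core fetch step. It assumes the Tieba URL scheme used at the time of writing: a thread lives at http://tieba.baidu.com/p/<id>, see_lz=1 restricts the page to the original poster, and pn selects the page number. The thread ID below is a hypothetical placeholder, not a real thread.

    # -*- coding: utf-8 -*-
    # Minimal fetch sketch (Python 2). The thread ID is a placeholder.
    import urllib2

    url = 'http://tieba.baidu.com/p/1234567890' + '?see_lz=1' + '&pn=1'
    try:
        # Fetch one page of the thread, OP-only view, and decode it as UTF-8
        page = urllib2.urlopen(urllib2.Request(url)).read().decode('utf-8')
        print page[:200]  # peek at the start of the HTML
    except urllib2.URLError, e:
        print 'fetch failed:', getattr(e, 'reason', e)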

    # -*- coding: utf-8 -*-
    """
    Created on Fri Apr 15 11:47:02 2016
    
    @author: wuhan
    """
    import urllib
    import urllib2
    import re
    import time
    import os
    
    
    #reload(sys)
    #sys.setdefaultencoding("utf-8")
    
    class Tool:
        # Regexes that strip or convert tags so a post's HTML reads as plain text
        removeImg = re.compile('<img.*?>| {12}')            # inline images and 12-space runs
        removeAddr = re.compile('<a.*?>|</a>')              # hyperlink tags
        replaceLine = re.compile('<tr>|<div>|</div>|</p>')  # tags that become a newline
        replaceTD = re.compile('<td>')                      # table cells become tabs
        replacePara = re.compile('<p.*?>')                  # paragraph openers become newline + indent
        replaceBR = re.compile('<br><br>|<br>')             # line breaks become newlines
        removeExtraTag = re.compile('<.*?>')                # any remaining tag
        
        def replace(self, x):
            # Apply the substitutions in order, then trim surrounding whitespace
            x = re.sub(self.removeImg, "", x)
            x = re.sub(self.removeAddr, "", x)
            x = re.sub(self.replaceLine, "\n", x)
            x = re.sub(self.replaceBR, "\n", x)
            x = re.sub(self.replacePara, "\n  ", x)
            x = re.sub(self.replaceTD, "\t", x)
            x = re.sub(self.removeExtraTag, "", x)
            return x.strip()
            
    
    class BDTB:
        def __init__(self, baseUrl, seeLZ, floorTag):
            self.baseURL = baseUrl
            self.seeLZ = '?see_lz=' + str(seeLZ)
            self.tool = Tool()
            self.file = None
            self.floor = 1
            self.defaultTitle = u'百度贴吧'
            self.floorTag = floorTag
            
        def getPage(self, pageNum):
            try:
                url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
                request = urllib2.Request(url)
                response = urllib2.urlopen(request)
                return response.read().decode('utf-8')
            except urllib2.URLError, e:
                if hasattr(e, "reason"):
                    print u'Failed to reach Baidu Tieba. Reason:', e.reason
                    return None
                    
        def getTitle(self, page):
            pattern = re.compile('<h1 class="core_title_txt.*?>(.*?)</h1>', re.S)
            result = re.search(pattern, page)
            if result:
                return result.group(1).strip()
            else:
                return None
                 
        def getPageNum(self, page):
            pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>',re.S)
            result = re.search(pattern, page)
            if result:
                return result.group(1).strip()
            else:
                return None
        
        def getContents(self,page):
            pattern = re.compile('<div id="post_content.*?>(.*?)</div>', re.S)
            items = re.findall(pattern, page)
            contents = []
            for item in items:
                content = "
    " + self.tool.replace(item) + "
    "
                contents.append(content.encode('utf-8'))
            return contents
            
        def setFileTitle(self, title):
            if title is not None:
                self.file = open(title + ".txt" , "w+")
            else:
                self.file = open(self.defaultTitle + ".txt" , "w+")
                
        def writeData(self, contents):
            for item in contents:
                if self.floorTag == '1':
                    floorLine = "
    " + str(self.floor) + u"-----------------------------------------------------------------------------------------------------------------------------------------
    "
                    self.file.write(floorLine)
                self.file.write(item)
                self.floor += 1
        
        def start(self):
            indexPage = self.getPage(1)
            if indexPage is None:   # the fetch already printed the error
                return
            pageNum = self.getPageNum(indexPage)
            title = self.getTitle(indexPage)
            self.setFileTitle(title)
            if pageNum is None:
                print "The URL is no longer valid. Please try again."
                return
            try:
                print "This thread has " + str(pageNum) + " pages"
                for i in range(1, int(pageNum) + 1):
                    print "Writing data for page " + str(i)
                    page = self.getPage(i)
                    contents = self.getContents(page)
                    self.writeData(contents)
                    self.getPicture(page, i)
            except IOError, e:
                print "Write failed. Reason: " + e.message
            finally:
                if self.file:
                    self.file.close()   # release the output file handle
                print "Writing finished"
                
        def getPicture(self, page, PageNum):
            reg = r'<img class="BDE_Image".*?src="(.+?\.jpg)'
            imgre = re.compile(reg)             # compile the pattern into a regex object
            imglist = re.findall(imgre, page)   # collect every image URL on the page
            t = time.localtime(time.time())
            foldername = "%s-%s-%s" % (t.tm_year, t.tm_mon, t.tm_mday)
            picpath = r'E:\Python\ImageDownload\%s' % foldername  # local download directory
            if not os.path.exists(picpath):     # create the directory if it does not exist
                os.makedirs(picpath)

            x = 0
            for imgurl in imglist:
                target = os.path.join(picpath, '%s_%s.jpg' % (PageNum, x))
                urllib.urlretrieve(imgurl, target)  # download the remote image to disk
                x += 1
            
    print u"请输入帖子代号"
    baseURL = 'http://tieba.baidu.com/p/' + str(raw_input(u'http://tieba.baidu.com/p/'))
    seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0
    ".decode('utf-8').encode('gbk'))
    floorTag = raw_input("是否写入楼层信息,是输入1,否输入0
    ".decode('utf-8').encode('gbk'))
    bdtb = BDTB(baseURL,seeLZ,floorTag)
    bdtb.start()        
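    To see what the Tool class does in isolation, here is a small sketch that runs its replace() method on a hand-written fragment. The fragment only imitates the markup the regexes target; it is not real Tieba output.

    # Sketch: exercise Tool.replace() on a made-up fragment (not real Tieba HTML).
    # Assumes the Tool class above is in scope.
    sample = ('<p class="content">First line<br>'
              '<a href="http://example.com">a link</a>'
              '<img src="x.jpg"></p><div>Second line</div>')
    print Tool().replace(sample)
    # Prints:
    # First line
    # a link
    # (blank line)
    # Second line

    For a non-interactive run you can also construct the crawler directly, e.g. BDTB('http://tieba.baidu.com/p/1234567890', 1, '1').start() with a placeholder thread ID. Note that writeData() compares floorTag against the string '1', so pass it as a string, not an integer.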
  • Original post: https://www.cnblogs.com/jingyuewutong/p/5569059.html