• Python 百度贴吧爬虫


    # -*- coding: utf-8 -*-
    #coding=utf-8
    
    import urllib
    import urllib2
    import re
    import thread
    import time
    
    class BDTB:
        """Minimal scraper for a single Baidu Tieba thread.

        Pages are downloaded with urllib2, post bodies are pulled out with
        regular expressions, and each body is cleaned by ``Tool.replace``
        before being printed.
        """

        def __init__(self,baseurl,seeLz):
            """Remember the thread URL and the ``see_lz`` query flag.

            baseurl -- thread URL without a query string.
            seeLz   -- value appended to the URL as ``?see_lz=<value>``.
            """
            self.baseUrl=baseurl
            self.seeLz='?see_lz='+str(seeLz)
            self.Tool=Tool()

        def getPage(self,pageNum):
            """Download page ``pageNum`` of the thread.

            Returns the raw response body, or None when urllib2 reports a
            URLError (the failure is printed, not raised).
            """
            url=self.baseUrl+self.seeLz+'&pn='+str(pageNum)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                try:
                    return response.read()
                finally:
                    # Always release the connection, even if read() fails.
                    response.close()
            except urllib2.URLError as e:
                # ``reason`` is not guaranteed to be a string (it can be a
                # socket error object) and may be missing on some errors, so
                # convert explicitly instead of concatenating it directly.
                reason = getattr(e,'reason',e)
                print("链接网络失败"+str(reason))
                return None

        def getTitle(self):
            """Print and return the thread title found on page 1.

            Returns None when the page could not be downloaded or the title
            markup is not present.
            """
            html=self.getPage(1)
            if html is None:
                # getPage already reported the network failure.
                return None
            pattern = re.compile('core_title_txt pull-left text-overflow.*?>(.*?)</h3>',re.S)
            result = pattern.search(html)
            if not result:
                return None
            title = result.group(1)
            print(title)
            return title

        def getContent(self,page):
            """Print every post body found in ``page``, numbered from floor 1.

            page -- HTML as returned by ``getPage``; None or an empty string
                    prints nothing.
            """
            if not page:
                return
            pattern = re.compile('<div id="post_content_.*?>(.*?)</div>',re.S)
            for floor,item in enumerate(pattern.findall(page),1):
                # Floor header: number, separator line, then a blank line.
                print(u'%d 楼--------------------------------------------\n' % floor)
                print(self.Tool.replace(item))
    
    
    
    
    class Tool:
        """Reduce the HTML of a post body to plain text.

        The class attributes are the compiled patterns; ``replace`` applies
        them one after another in a fixed order.
        """

        # <img> tags, runs of 1-7 spaces and &nbsp; entities are deleted.
        removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
        # Opening and closing hyperlink tags are deleted (link text is kept).
        removeAddr = re.compile('<a.*?>|</a>')
        # Block-level tags that end a line become a single space.
        replaceLine = re.compile('<tr>|<div>|</div>|</p>')
        # Table cell openers become a single space.
        replaceTD= re.compile('<td>')
        # Single or doubled <br> tags become a single space.
        replaceBR = re.compile('<br><br>|<br>')
        # Any tag still left after the steps above is deleted.
        removeExtraTag = re.compile('<.*?>')
        # Runs of spaces are collapsed into one space.
        removeNoneLine = re.compile(' +')

        def replace(self,x):
            """Return ``x`` with the patterns above applied, then stripped.

            Order matters: the specific tags are handled first so that the
            catch-all tag removal only sees what is left over.
            """
            steps = (
                (self.removeImg, ""),
                (self.removeAddr, ""),
                (self.replaceLine, " "),
                (self.replaceTD, " "),
                (self.replaceBR, " "),
                (self.removeExtraTag, ""),
                (self.removeNoneLine, " "),
            )
            cleaned = x
            for pattern, substitute in steps:
                cleaned = pattern.sub(substitute, cleaned)
            # Drop leading/trailing whitespace left by the substitutions.
            return cleaned.strip()
    
    
    
    
    # Driver: fetch page 2 of the thread below with see_lz=2 and print its posts.
    baseURL = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseurl=baseURL, seeLz=2)
    bdtb.getContent(page=bdtb.getPage(pageNum=2))
  • 相关阅读:
    ASP.NET学习笔记(1)
    vs2005新建项目中没有ASP.NET WEB应用程序
    IE无法安装Activex控件
    【Android】SDK工具学习
    【英语】Bingo口语笔记(22)
    【Python】实践笔记
    【学习】纪录片笔记
    【英语】Bingo口语笔记(20)
    【英文】20141027 生词
    【英文】Bingo口语笔记(18)
  • 原文地址:https://www.cnblogs.com/norm/p/7426279.html
Copyright © 2020-2023  润新知