参考资料:
Python:http://www.runoob.com/python/python-intro.html
Python爬虫系列教程:http://www.cnblogs.com/xin-xin/p/4297852.html
正则表达式:http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html
本帖目标:
1.对百度贴吧的任意帖子进行抓取
2.指定是否只抓取楼主发帖内容
3.将抓取到的内容分析并保存到文件
4.抓取帖子中出现的美图
# -*- coding: utf-8 -*- """ Created on Fri Apr 15 11:47:02 2016 @author: wuhan """ import urllib import urllib2 import re import time import os #reload(sys) #sys.setdefaultencoding("utf-8") class Tool: removeImg = re.compile('<img.*?>| {12}') removeAddr = re.compile('<a.*?>|</a>') replaceLine = re.compile('<tr>|<div>|</div>|</p>') replaceTD = re.compile('<td>') replacePara = re.compile('<p.*?>') replaceBR = re.compile('<br><br>|<br>') removeExtraTag = re.compile('<.*?>') def replace(self,x): x = re.sub(self.removeImg, "", x) x = re.sub(self.removeAddr, "", x) x = re.sub(self.replaceLine, " ", x) x = re.sub(self.replaceBR, " ", x) x = re.sub(self.replacePara, " ", x) x = re.sub(self.replaceTD, " ", x) x = re.sub(self.removeExtraTag, "", x) return x.strip() class BDTB: def __init__(self, baseUrl, seeLZ, floorTag): self.baseURL = baseUrl self.seeLZ = '?see_lz=' + str(seeLZ) self.tool = Tool() self.file = None self.floor = 1 self.defaultTitle = u'百度贴吧' self.floorTag = floorTag def getPage(self, pageNum): try: url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum) request = urllib2.Request(url) response = urllib2.urlopen(request) return response.read().decode('utf-8') except urllib2.URLError, e: if hasattr(e, "reason"): print u'百度贴吧链接失败,错误原因 :', e.reason return None def getTitle(self, page): pattern = re.compile('<h1 class="core_title_txt.*?>(.*?)</h1>',re.S) result = re.search(pattern, page) if result: return result.group(1).strip() else: return None def getPageNum(self, page): pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>',re.S) result = re.search(pattern, page) if result: return result.group(1).strip() else: return None def getContents(self,page): pattern = re.compile('<div id="post_content.*?>(.*?)</div>', re.S) items = re.findall(pattern, page) contents = [] for item in items: content = " " + self.tool.replace(item) + " " contents.append(content.encode('utf-8')) return contents def setFileTitle(self, title): if title is not None: 
self.file = open(title + ".txt" , "w+") else: self.file = open(self.defaultTitle + ".txt" , "w+") def writeData(self, contents): for item in contents: if self.floorTag == '1': floorLine = " " + str(self.floor) + u"----------------------------------------------------------------------------------------------------------------------------------------- " self.file.write(floorLine) self.file.write(item) self.floor += 1 def start(self): indexPage = self.getPage(1) pageNum = self.getPageNum(indexPage) title = self.getTitle(indexPage) self.setFileTitle(title) if pageNum == None: print "URL已失效,请重试" return try: print "该贴子共有" + str(pageNum) + "页" for i in range(1, int(pageNum)+1): print "正在写入第" + str(i) + "页数据" page = self.getPage(i) contents = self.getContents(page) self.writeData(contents) self.getPicture(page, i) except IOError, e: print "写入异常,原因" + e.message finally: print "写入任务完成" def getPicture(self, page, PageNum): reg = r'<img class="BDE_Image".*?src="(.+?.jpg)' imgre = re.compile(reg)#可以把正则表达式编译成一个正则表达式对象 imglist = re.findall(imgre,page)#读取html 中包含 imgre(正则表达式)的数据 t = time.localtime(time.time()) foldername = str(t.__getattribute__("tm_year"))+"-"+str(t.__getattribute__("tm_mon"))+"-"+str(t.__getattribute__("tm_mday")) picpath = 'E:\Python\ImageDownload\%s' % (foldername) #下载到的本地目录 if not os.path.exists(picpath): #路径不存在时创建一个 os.makedirs(picpath) x = 0 for imgurl in imglist: target = picpath+'\%s_%s.jpg' % (PageNum, x) urllib.urlretrieve(imgurl, target)#直接将远程数据下载到本地 x+=1 print u"请输入帖子代号" baseURL = 'http://tieba.baidu.com/p/' + str(raw_input(u'http://tieba.baidu.com/p/')) seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0 ".decode('utf-8').encode('gbk')) floorTag = raw_input("是否写入楼层信息,是输入1,否输入0 ".decode('utf-8').encode('gbk')) bdtb = BDTB(baseURL,seeLZ,floorTag) bdtb.start()