原文链接:http://cuiqingcai.com/993.html
划重点:
1.提取帖子内容时,对图片,贴吧自动增加的超链接,制表符,换行符要做删除或替换处理
2.decode是把bytes转换为str,encode是把str转换为bytes。原帖中的代码第100行多了一个encode,导致出错
3.代码中用到了文件相关操作
4.原文中获取标题的正则表达式我觉得不太对,做了修改。原文只提取了<h1></h1>之间的标题,但实际上标题也可能在<h3></h3>之间
最终代码如下,在python3.4.3中实现
import re
import urllib.error
import urllib.parse
import urllib.request


class Tool:
    """Convert the HTML of a single Tieba post body into plain text."""

    # Inline images and runs of seven spaces are dropped entirely.
    removeImg = re.compile('<img.*?>| {7}')
    # Hyperlink tags are dropped; the link text itself is kept.
    removeAddr = re.compile('<a.*?>|</a>')
    # Block-level tags that should become a line break.
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile('<td>')
    # Paragraph openings become a line break followed by an indent.
    replacePara = re.compile('<p.*?>')
    # Single or double <br> become one line break.
    replaceBR = re.compile('<br><br>|<br>')
    # Whatever tags are left after the steps above are removed.
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return *x* with HTML tags removed or replaced by whitespace.

        The substitutions run in a fixed order: the specific tags are
        handled first, and the catch-all tag removal runs last so it does
        not swallow tags that should turn into newlines or tabs. Leading
        and trailing whitespace is stripped from the result.
        """
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n    ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()


class BDTB:
    """Baidu Tieba crawler that dumps one thread into a text file."""

    # Characters that cannot safely appear in a file name.
    _unsafeFileChars = re.compile(r'[\\/:*?"<>|\r\n\t]')

    def __init__(self, baseUrl, seeLZ, floorTag):
        """Store the crawl settings.

        baseUrl  -- thread URL without query string.
        seeLZ    -- value appended as the ``see_lz`` query parameter.
        floorTag -- the string '1' to write a separator line before
                    every floor; any other value writes none.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        self.file = None
        self.floor = 1
        self.defaultTitle = u"百度贴吧"
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the thread and return it as ``str``.

        Returns ``None`` when the request fails with ``URLError``.
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            request = urllib.request.Request(url)
            # The context manager closes the HTTP response once read.
            with urllib.request.urlopen(request) as response:
                # Undecodable bytes are ignored instead of aborting the crawl.
                return response.read().decode('utf-8', 'ignore')
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print(u"连接百度贴吧失败,错误原因:", e.reason)
            return None

    def getTitle(self):
        """Return the thread title from page 1, or ``None`` if not found."""
        pageCode = self.getPage(1)
        if pageCode is None:
            return None
        # The title sits in a heading tag (<h1> or <h3>), so match any
        # heading level rather than one fixed tag name.
        pattern = re.compile(
            r'''<h\d class="core_title_txt.*?title="(.*?)" style=".*?</h\d>''',
            re.S)
        result = re.search(pattern, pageCode)
        if result:
            return result.group(1).strip()
        return None

    def getPageNum(self):
        """Return the page count of the thread as a string, or ``None``."""
        pageCode = self.getPage(1)
        if pageCode is None:
            return None
        pattern = re.compile(
            '''<span class=.*?</span>.*?回复贴,共.*?<span class=.*?>(.*?)</span>''',
            re.S)
        result = re.search(pattern, pageCode)
        if result:
            return result.group(1).strip()
        return None

    def getContent(self, page):
        """Return the cleaned text of every post found in *page*.

        Each entry is wrapped in a leading and trailing newline so that
        consecutive posts stay separated in the output file.
        """
        pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
        return ["\n" + self.tool.replace(item) + "\n"
                for item in re.findall(pattern, page)]

    def setFileTitle(self, title):
        """Open ``<title>.txt`` for writing; use the default title for ``None``."""
        name = title if title is not None else self.defaultTitle
        # Thread titles are arbitrary user text; neutralise path characters.
        name = self._unsafeFileChars.sub("_", name)
        # Explicit UTF-8 so that writing Chinese text does not depend on
        # the platform's default encoding.
        self.file = open(name + ".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Append every post in *contents* to the open output file.

        The floor counter advances once per post whether or not the
        separator line is written.
        """
        for item in contents:
            if self.floorTag == '1':
                # Separator line between floors.
                floorLine = "\n" + str(self.floor) + "楼-------------------------------------"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Crawl every page of the thread and write it to the output file."""
        pageNum = self.getPageNum()
        # Check before opening the output file, so a dead URL does not
        # leave an empty file behind.
        if pageNum is None:
            print(u"URL已失效,请重试")
            return
        title = self.getTitle()
        self.setFileTitle(title)
        try:
            print("该帖子共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页数据")
                page = self.getPage(i)
                if page is None:
                    # getPage already reported the failure; skip this page.
                    continue
                contents = self.getContent(page)
                self.writeData(contents)
        except IOError as e:
            # Python 3 exceptions have no ``.message`` attribute.
            print("写入异常,原因" + str(e))
        finally:
            if self.file is not None:
                self.file.close()
                self.file = None
            print("写入任务完成")


def main():
    """Prompt for the thread id and options, then run the crawler."""
    print(u"请输入帖子代号")
    baseURL = 'http://tieba.baidu.com/p/' + str(input(u'http://tieba.baidu.com/p/'))
    seeLZ = input("是否只看楼主发言,是输入1,否输入0 ")
    floorTag = input("是否写入楼层信息,是输入1,否输入0 ")
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()


if __name__ == "__main__":
    main()