Scraping Baidu Tieba post information
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: imcati
import requests, re, time


class TiebaSpider(object):
    def __init__(self, tiebaName):
        self.tiebaName = tiebaName
        self.base_url = 'https://tieba.baidu.com/f?kw=' + tiebaName + '&ie=utf-8&pn={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

    # Build the request URLs: 5 pages, 50 posts per page, paged via the pn parameter
    def get_url_list(self):
        url_list = []
        for i in range(5):
            url_list.append(self.base_url.format(i * 50))
        return url_list

    # Fetch a page and hand the decoded HTML to the parser
    def get_pageInfo(self, url):
        response = requests.get(url=url, headers=self.headers)
        return self.parse_pageInfo(response.content.decode('utf-8'))

    # Parse the HTML: extract (link, title) pairs for each post
    def parse_pageInfo(self, html):
        pattern = re.compile(
            '<div class="t_con cleafix".*?<a rel="noreferrer" href="(.*?)" title="(.*?)" target=.*?</div>',
            re.S)
        return re.findall(pattern, html)

    # Save the scraped info: append each title and link to a local file
    def save_info(self, info):
        for value_info in info:
            info_str = ('Post title: ' + value_info[1] +
                        '  Post link: https://tieba.baidu.com' + value_info[0] + '\n')
            with open('./tieba', 'ab') as f:
                f.write(info_str.encode('utf-8'))

    def run(self):
        url_list = self.get_url_list()
        for url in url_list:
            info = self.get_pageInfo(url)
            self.save_info(info)
            time.sleep(1)


if __name__ == "__main__":
    tiebaspider = TiebaSpider('python')
    tiebaspider.run()
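To make the parsing step concrete, here is a minimal sketch that runs the same regular expression against a hand-written HTML fragment. The fragment and the href/title values in it are illustrative placeholders, not taken from a real Tieba page; they only mimic the markup the spider's pattern targets.

import re

# Illustrative fragment mimicking Tieba's post markup
# (the href and title values are made up for demonstration)
sample_html = '''
<div class="t_con cleafix">
  <a rel="noreferrer" href="/p/1234567" title="Example post title" target="_blank">Example post title</a>
</div>
'''

pattern = re.compile(
    '<div class="t_con cleafix".*?<a rel="noreferrer" href="(.*?)" title="(.*?)" target=.*?</div>',
    re.S)

print(re.findall(pattern, sample_html))
# -> [('/p/1234567', 'Example post title')]

Each match is a (href, title) tuple; save_info then prepends https://tieba.baidu.com to the relative href before writing the line to the output file.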