1. Pick a topic that interests you.
2. Write a crawler in Python to scrape data about that topic from the web.
3. Run text analysis on the scraped data and generate a word cloud (a sketch of this step follows the crawler code below).
4. Interpret and explain the results of the text analysis.
5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the ideas and conclusions of the data analysis.
6. Finally, submit all of the scraped data together with the crawler and the data-analysis source code.
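The crawler below (Python 2, BeautifulSoup plus urllib2) covers step 2: it scrapes the first three listing pages of the Hupu 步行街 board (https://bbs.hupu.com/bxj), writes the post titles to Hupu_bxj.txt for later analysis, and downloads any images embedded in the scraped posts.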
from bs4 import BeautifulSoup
import urllib2
import logging
import sys

# Python 2 workaround so implicit str/unicode conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")


class Item(object):
    title = None        # post title
    firstAuthor = None  # original poster
    firstTime = None    # post creation time
    reNum = None        # reply/view counts
    LastTime = None     # time of the last reply
    LastAuthor = None   # author of the last reply
    link = None         # link to the post


def getResponseContent(url):
    """Fetch a page and return its raw bytes, or None on failure."""
    try:
        response = urllib2.urlopen(url.encode('utf8'), timeout=20)
    except Exception:
        logging.error(u'Failed to fetch URL: {}'.format(url))
        return None
    logging.info(u'Fetched URL: {}'.format(url))
    return response.read()


class getHupuInfo(object):
    def __init__(self, url):
        self.url = url
        self.pageSum = 3  # number of listing pages to scrape
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        # Page 1 is the board URL itself; later pages append '-2', '-3', ...
        urls = [self.url]
        for pn in range(1, pageSum):
            urls.append(self.url + '-' + str(pn + 1))
        logging.info(u'Built the URL list')
        return urls

    def spider(self, urls):
        items = []
        for url in urls:
            htmlContent = getResponseContent(url)
            if htmlContent is None:
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            tagtable = soup.find('table', attrs={'id': 'pl'})
            tagstr = tagtable.find_all('tr')
            for tag in tagstr[1:]:  # skip the header row
                item = Item()
                item.link = '/' + tag.get('mid') + '.html'
                item.title = tag.find('td', attrs={'class': 'p_title'}).find('a', href=item.link).get_text()
                item.firstAuthor = tag.find('td', attrs={'class': 'p_author'}).a.get_text()
                item.firstTime = tag.find('td', attrs={'class': 'p_author'}).get_text()
                item.reNum = tag.find('td', attrs={'class': 'p_re'}).get_text()
                item.LastAuthor = tag.find('td', attrs={'class': 'p_retime'}).a.get_text()
                item.LastTime = tag.find('td', attrs={'class': 'p_retime'}).get_text()
                items.append(item)
        logging.info(u'Fetched the post listings')
        return items

    def pipelines(self, items):
        # Write the post titles to a text file for later analysis.
        fileName = u'Hupu_bxj.txt'
        with open(fileName, 'w') as fp:
            for item in items:
                fp.write('{} '.format(item.title).encode('utf8'))
        logging.info(u'Wrote the titles to {}'.format(fileName))

    def getpiclink(self):
        # self.url[0:20] is the scheme plus host, 'https://bbs.hupu.com'.
        piclink = [self.url[0:20] + item.link for item in self.items]
        logging.info(u'Built the post links for image scraping')
        return piclink


class picInfo(object):
    def __init__(self, links):
        self.links = links
        self.imgurls = []
        self.spider()
        self.pipeline()

    def spider(self):
        if not self.links:
            logging.error(u'No post links to scrape for images')
            return
        for link in self.links:
            htmlContent = getResponseContent(link)
            if htmlContent is None:
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            tagDiv = soup.find('div', attrs={'id': 'tpc'})
            imgs = tagDiv.find('div', attrs={'class': 'quote-content'}).find_all('img')
            for subimg in imgs:
                # Lazily loaded images keep the real URL in 'data-original'.
                imgurl = subimg.get('data-original') or subimg.get('src')
                self.imgurls.append(imgurl)
        logging.info(u'Collected the image URLs')

    def pipeline(self):
        for i, imgurl in enumerate(self.imgurls):
            # Name each file after its position, keeping the original extension.
            if imgurl[-3:] == 'png':
                imgname = str(i) + '.png'
            elif imgurl[-3:] == 'jpg':
                imgname = str(i) + '.jpg'
            elif imgurl[-4:] == 'jpeg':
                imgname = str(i) + '.jpeg'
            elif imgurl[-3:] == 'gif':
                imgname = str(i) + '.gif'
            else:
                continue
            img = getResponseContent(imgurl)
            if img is None:
                continue
            with open(imgname, 'wb') as fp:
                fp.write(img)
        logging.info(u'Saved the images')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    url = u'https://bbs.hupu.com/bxj'
    HUPU = getHupuInfo(url)
    picurls = HUPU.getpiclink()
    PIC = picInfo(picurls)
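The crawler stops at step 2, so here is a minimal sketch of step 3 under a few stated assumptions: the jieba and wordcloud packages are installed (pip install jieba wordcloud), the Hupu_bxj.txt file produced above sits in the working directory, and the font path is a placeholder that must point to a local font with Chinese glyphs, or the cloud will render as empty boxes.

# -*- coding: utf-8 -*-
# Minimal word-cloud sketch; jieba and wordcloud are assumed installed,
# and 'simhei.ttf' is a placeholder path to a font with Chinese glyphs.
import codecs
import jieba
from wordcloud import WordCloud

# Read the titles written by the crawler above.
with codecs.open('Hupu_bxj.txt', 'r', encoding='utf-8') as fp:
    text = fp.read()

# jieba segments Chinese text into words; join them with spaces so that
# WordCloud's default tokenizer can count word frequencies.
segmented = ' '.join(jieba.cut(text))

cloud = WordCloud(
    font_path='simhei.ttf',   # assumption: adjust to a local Chinese font
    width=800,
    height=600,
    background_color='white',
).generate(segmented)
cloud.to_file('Hupu_bxj_wordcloud.png')

WordCloud counts space-separated tokens by default, which is why the jieba segments are joined with spaces; if common particles dominate the picture, pass a set of them via the stopwords parameter.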