# _*_ coding:utf-8 _*_
"""Print posts scraped from the "hot" listing pages of qiushibaike.com.

For every listing page the script prints the author, the vote count and the
comment count of each entry, then downloads the linked article page and
prints its text content.

The code runs on Python 2 and Python 3: every ``print`` call takes a single
parenthesised argument, and ``urllib.request`` is used where ``urllib2`` is
unavailable.
"""
from contextlib import closing

try:
    import urllib2
except ImportError:
    # Python 3: urllib.request provides the same Request/urlopen API.
    import urllib.request as urllib2

from bs4 import BeautifulSoup

user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
headers = {'User-Agent': user_agent}
url = "https://www.qiushibaike.com"

# Upper bound on the number of entries handled per listing page.  The
# original loop was hard-coded to exactly this many iterations.
MAX_ITEMS_PER_PAGE = 25


def _fetch_soup(page_url, headers):
    """Download *page_url* and return its HTML parsed by BeautifulSoup.

    The response body is decoded as UTF-8.  The HTTP response is closed as
    soon as the body has been read, even if reading or decoding fails.
    Errors raised by ``urlopen`` propagate to the caller.
    """
    request = urllib2.Request(page_url, data=None, headers=headers)
    with closing(urllib2.urlopen(request, timeout=60)) as response:
        html = response.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser')


# Scrape the text content behind an article link.
def getContent(article_url, headers):
    """Print the text of the first ``.content`` element of an article page.

    Args:
        article_url: Absolute URL of the article page.
        headers: Dict of HTTP request headers sent with the request.

    Nothing is printed when the page has no ``.content`` element.
    """
    soup = _fetch_soup(article_url, headers)
    content_nodes = soup.select('.content')
    if not content_nodes:
        # No content block on this page: skip the article instead of
        # aborting the whole crawl with an IndexError.
        return
    print(u"内容:")
    for content in content_nodes[0].strings:
        print(u"%s" % content.strip())
    # Separator line between articles.
    print(' ')


# ----------------------------
# Collect author, vote count, comment count and article address from listing
# pages such as "https://www.qiushibaike.com/hot/page/1/".
def getData(url, headers, pages=1):
    """Print every entry of the first *pages* "hot" listing pages.

    Args:
        url: Site root; listing and article paths are appended to it.
        headers: Dict of HTTP request headers sent with every request.
        pages: Number of listing pages to walk, starting at page 1.

    For each entry the author, vote count and comment count are printed,
    followed by the article text (see ``getContent``).
    """
    for page in range(1, pages + 1):
        page_url = url + "/hot/page/" + str(page)
        print("正在爬取第 %s 页+++" % page)
        soup = _fetch_soup(page_url, headers)

        authors = soup.select('h2')
        smile_nums = soup.select('.stats-vote > .number')
        comment_nums = soup.select('.stats-comments > .qiushi_comments > .number')
        article_urls = soup.select('.contentHerf')

        # zip() stops at the shortest list, so a short page or a selector
        # that matched fewer nodes no longer raises IndexError.  The slice
        # keeps the original cap on entries per page.
        entries = list(zip(authors, smile_nums, comment_nums, article_urls))
        entries = entries[:MAX_ITEMS_PER_PAGE]

        for index, entry in enumerate(entries, 1):
            author_tag, smile_tag, comment_tag, link_tag = entry
            print("正在爬取第 %s 页的第 %s 条数据---" % (page, index))
            # get_text() also copes with an <h2> that contains nested tags,
            # where .string would be None.
            author = author_tag.get_text(strip=True)
            print(u"作者: %s" % author)
            funny_num = smile_tag.string
            comment_num = comment_tag.string
            print(u"好笑: %s" % funny_num)
            print(u"评论: %s" % comment_num)

            # The href is site-relative; prefix the site root.
            article_url = url + link_tag['href']
            getContent(article_url, headers)


# ---------------------------------
# Script entry: crawl the first ten listing pages.
getData(url, headers, pages=10)