每天一点linux命令:新建文件夹
一,使用python获得acfun的所有番剧的信息,评论,弹幕
1 #! /usr/bin/env python 2 # -*- coding=utf-8 -*- 3 import re 4 import requests 5 import sys 6 import json 7 reload(sys) 8 sys.setdefaultencoding("utf-8") 9 num = 1 10 head = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'} #防陷阱 11 def dm(ht): 12 oldURL= 'http://danmu.aixifan.com/V2/' + ht + '?pageSize=500&pageNo=0' 13 #print oldURL 14 for i in range(1,5): 15 newURL = re.sub('pageNo=d+','pageNo=%d'%i,oldURL,re.S) 16 print newURL 17 html = requests.get(newURL,headers = head) 18 type = sys.getfilesystemencoding() 19 aa = json.loads(html.text) 20 #print len(aa[1]) 21 try: 22 for i in range(0,501): 23 print aa[2][i]['m'] 24 except Exception,e: 25 break 26 def PL(ht): 27 url = 'http://www.acfun.tv/comment/bangumi/web/list?bangumiId=' + ht #评论首地址,可获得评论数,评论的 28 print url 29 jscontent = requests.get(url,headers = head).content 30 jsDict = json.loads(jscontent) 31 pag = jsDict['data']['totalPage'] 32 print pag 33 nurl = url + '&pageNo=1' 34 for i in range(1,pag+1): 35 ourl = re.sub('pageNo=d+','pageNo=%d'%i,nurl,re.S) 36 jscontent = requests.get(ourl,headers = head).content 37 jsDict = json.loads(jscontent) 38 39 def geturl(): 40 ourl = 'http://www.acfun.tv/bangumi/bangumi/page?pageSize=42&isWeb=1&pageNo=1&sort=1' 41 for i in range(1,8): 42 nurl = re.sub('pageNo=d+','pageNo=%d'%i,ourl,re.S) 43 print nurl 44 jscontent = requests.get(nurl,headers = head).content 45 jsDict = json.loads(jscontent) 46 for j in range(1,42): 47 info( str(jsDict['data']['list'][j]['id']) ) 48 break 49 break 50 def info(ht): 51 url = "http://www.acfun.tv/v/ab" + ht 52 sc = "http://www.acfun.tv/bangumi/stow/isStowed?bangumiId=" + ht #收藏数 53 pl = "http://www.acfun.tv/bangumi/count/bangumi_view.aspx?bangumiId="+ht #评论数 54 html = requests.get(url) 55 htpl = requests.get(pl) 56 title = re.findall('h3 class="title">(.*?)</h3><span',html.text,re.S)[0] 57 print '名称:' + title 58 up = re.findall('</h3><span class="last">(.*?)</span>',html.text,re.S)[0] 59 print '更新:'+ up 60 pp = re.search('[(.*?)]',htpl.text,re.S).group(1) 61 print '评论总数:' + pp 62 jsconten = requests.get(sc,headers = head).content 63 jsDict = json.loads(jsconten) 64 print '收藏总数:' + str(jsDict['data']['stowCount']) 65 jianjie = re.findall('pan class="desc">(.*?)</span>',html.text,re.S)[0] 66 print '简介:' + jianjie 67 page = re.findall('" data-count="(.*?)" data-index="',html.text,re.S)[0] 68 page = int(page) 69 nurl = url + '_1' 70 for i in range(1,page+1):#有多少话 多少页 71 nurl = re.sub('_d+','_%d'%i,nurl,re.S)#每个话的地址 72 print nurl 73 print '第' + str(i) + '话弹幕:' 74 html = requests.get(nurl) 75 id = re.findall('data-vid="(.*?)" data-sid',html.text,re.S)[0]#获取每个话的弹幕,地址 76 # dm(id) 77 print '第' + str(i) + '话评论:' 78 PL(ht) 79 if __name__ == "__main__": 80 geturl()