1. Packet capture
Open a channel in the app and capture the traffic with Charles to find the real request; the request whose JSON response matches the data shown on the page is usually the real one.
The request method is POST, so the request headers and form data have to be added. Because the requests go through Charles, the Charles proxy IP and certificate file must be configured as well; a test request then succeeds.
Capturing each channel separately with Charles shows that the request URL is always the same; only the tab_id field in the form data changes per channel. So a dict mapping tab_id to channel name is created, and when switching channels the tab_id is taken from that dict and substituted into the form (see the sketch below).
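A minimal sketch of that request flow, assuming the captured form data is a JSON string containing a "tab_id" field. URL, PROXIES, CERT, FORM, and fetch_channel are illustrative placeholders for the values you capture and configure yourself; the full script follows further down.

import re
import requests

URL = 'https://mbd.baidu.com/searchbox?...'    # placeholder: capture the real URL with Charles
PROXIES = {'https': 'https://127.0.0.1:8888'}  # placeholder: your Charles proxy address
CERT = 'charles.pem'                           # placeholder: exported Charles certificate
FORM = {'data': '{"tab_id": "1", ...}'}        # placeholder: captured form payload

def fetch_channel(tab_id):
    # Swap the tab_id inside the captured form data to switch channels.
    FORM['data'] = re.sub(r'"tab_id": "\d+"', '"tab_id": "{}"'.format(tab_id), FORM['data'])
    resp = requests.post(URL, data=FORM, proxies=PROXIES, verify=CERT)
    return resp.text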
2. IP-ban test
Continuous requests were never blocked, so just crawl away.
3. Overview page
Normally the overview-page table only stores the article links, but the Baidu APP response already contains all the fields, so the entire response is stored in the database.
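A minimal sketch of a table that could hold those raw responses. The table name gly matches the script below, but the column names (channel, data) are assumptions, since the real columns are elided in the original code.

import pymysql

# Hypothetical schema: one row per channel, raw JSON kept as-is.
con = pymysql.connect(host='127.0.0.1', db='bd', user='root', passwd='123456', charset='utf8')
cur = con.cursor()
cur.execute(
    "CREATE TABLE IF NOT EXISTS gly ("
    "  channel VARCHAR(64),"   # channel name from the tab_id dict, e.g. '推荐'
    "  data    LONGTEXT"       # full JSON response returned by the app
    ")"
)
con.commit()
cur.close()
con.close()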
4. Detail page
Read the rows from the overview-page table, parse out the individual fields with regular expressions, drop the invalid information, encrypt (hash) the key field, and store everything in the database (see the hashing sketch below).
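In the script below, the "encryption" is an MD5 digest of the article URL, stored as the hkey field; a minimal sketch of that step (make_key is an illustrative helper name, urlname and hkey come from the script):

import hashlib

def make_key(urlname):
    # MD5 digest of the article URL, used as the record key (hkey in the script).
    m = hashlib.md5()
    m.update(urlname.encode('utf8'))
    return m.hexdigest()

print(make_key('https://mbd.baidu.com/newspage/data/landingpage?...'))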
Note: set the Charles proxy IP yourself; only the shape of the URL is given, capture it yourself; set up the database yourself; set up the certificate file yourself; capture the form data yourself; in the parsing module, parse whichever fields you need, none are provided here; the handling is still not perfect, adjust it yourself.
gailanye.py
import requests
import re
import time
import pymysql


class BD(object):
    def __init__(self):
        self.url = 'https://mbd.baidu.com/searchbox?-omitted-7ig'  # capture the full URL yourself
        self.form = {
            'data': '''omitted...'''                               # captured form payload
        }
        self.proxy = {
            'https': 'https://omitted'                             # Charles proxy address
        }
        # tab_id -> channel name
        self.channel = {
            '1': '推荐',
            '3': '娱乐',
            '4': '体育',
            '5': '时尚',
            '6': '国际',
            '8': '热点',
            '12': '汽车',
            '13': '军事',
            '14': '科技',
            '15': '财经',
            '16': '游戏',
            '17': '女人',
            '18': '历史',
            '28': '搞笑',
            '35': '情感',
            '34': '美食',
            '41': '居家',
            '42': '政务',
            '43': '旅游',
            '44': '辟谣',
            '51': '健康',
            '54': '萌宠',
            '72': '新华社',
            '75': '虎扑',
            '81': '澎湃新闻',
            '85': '人民日报',
            '106': '36氪',
            '88': '虎嗅',
            '309999289': '上海',
            '309999257': '广州',
            '309999340': '深圳',
            '309999332': '天津',
            '309999179': '杭州',
            '309999315': '南京',
            '309999218': '武汉',
            '109999131': '北京',
        }

    def modify_tab_id(self, tab_id):
        # Replace the tab_id in the form data to switch channels.
        self.form['data'] = re.sub(r'"tab_id": "\d+"', '"tab_id": "{}"'.format(tab_id), self.form['data'])
        # self.form['data'] = re.sub(r'"last_update_time": \d+,', '"last_update_time": {}000,'.format(int(time.time())), self.form['data'])
        return self.form['data']

    def get_data(self):
        # Fetch every channel and collect [channel name, raw response] pairs.
        list_d = []
        for tab_id in self.channel:
            print('=' * 20)
            print(tab_id)
            self.form['data'] = self.modify_tab_id(tab_id)
            response = requests.post(self.url, data=self.form, proxies=self.proxy, verify='*.pem')
            datas = response.text
            channel = self.channel[tab_id]
            data_channel = [channel, datas]
            print(data_channel)
            list_d.append(data_channel)
        return list_d

    def save_data(self, list_d):
        # Write the results into MySQL.
        host = '127.0.0.1'
        db = 'bd'
        user = 'root'
        psd = '123456'
        charset = 'utf8'

        con = pymysql.connect(host=host, db=db, user=user, passwd=psd, charset=charset)
        cur = con.cursor()

        for i in list_d:
            print(i)
            sql = (
                "insert into gly(omitted)"
                "values(omitted)")
            list_m = [i[0], i[1]]  # i[0] is the channel name, i[1] is the raw data
            try:
                cur.execute(sql, list_m)
                print('insert success')
            except Exception as e:
                print('insert error', e)
                con.rollback()
            else:
                con.commit()
        cur.close()
        con.close()


if __name__ == '__main__':
    bd = BD()
    list_d = bd.get_data()
    bd.save_data(list_d)
xilanye.py
import pymysql
import json
import time
import hashlib
import requests
from lxml import etree
import re


# The entertainment channel is removed first.
# The sports channel has a navigation bar and its data cannot be fetched yet, so it is filtered out.


class XLY(object):
    def __init__(self):
        self.no_results_channel = []  # channels that returned no data
        self.proxy = {
            'https': '....'           # Charles proxy address, set it yourself
        }
        self.sum_data = 0

    def get_data(self):
        # Read the raw channel responses from the overview table.
        host = '127.0.0.1'
        db = 'bd'
        user = 'root'
        pwd = '123456'
        charset = 'utf8'
        con = pymysql.connect(host=host, db=db, user=user, passwd=pwd, charset=charset)

        datas = []
        cur = con.cursor()
        sql = 'select * from gly'
        try:
            cur.execute(sql)
            results = cur.fetchall()
            i = 0
            for result in results:
                i += 1
                # Rows come back as tuples; convert to a list.
                result = list(result)
                if '{"100":[]}' in result[1]:
                    self.no_results_channel.append(result[0])
                    print('no results channel:', result[0])
                elif 'navigatorItems' in result[1]:
                    print('channel with a navigation bar, not handled yet')
                else:
                    data = [result[0], result[1]]
                    datas.append(data)
                    print('get_data')
                print('=' * 20, i)
                # if i == 5:
                #     break
        except Exception as e:
            print('error', e)
            con.rollback()
        else:
            con.commit()
        return datas

    def parse_data(self, datas):
        items = []
        for data in datas:
            channel = data[0]
            channel_data = data[1]
            channel_data = json.loads(channel_data)
            channel_data = channel_data['data']['100']['itemlist']['items']

            for text in channel_data:
                print('=' * 20)
                item = {}
                try:
                    mode = text['data']['mode']
                except Exception:
                    mode = ''
                    print('mode not found')
                # Use mode to keep only articles and filter out galleries and ads.
                if mode == 'text':
                    # ...field extraction omitted in the original...

                    m1 = hashlib.md5()
                    m1.update(item['urlname'].encode("utf8"))
                    item['hkey'] = m1.hexdigest()

                    try:
                        item['comments'] = text['data']['comment_num'][:-2]
                    except Exception:
                        item['comments'] = ''
                        print('no comment_num')

                    # Parse the article page itself.
                    content, url_time = self.parse_content(item['urlname'])

                    print(item)
                    self.save_data(item)
                    if item != {}:
                        items.append(item)
        return items

    def parse_content(self, url):
        # Fetch one article page and extract content and url_time.
        response = requests.get(url, proxies=self.proxy, verify='omitted.pem')
        text = response.text
        element = etree.HTML(text)
        contents = element.xpath('//p[@class="contentText contentSize contentPadding"]//text()')
        url_time = element.xpath('//div[@class="infoSet"]//text()')
        try:
            raw_time = str(url_time[1])
            if '17-' in raw_time:
                url_time = re.sub('17', '2018', raw_time)
                print(url_time)
            else:
                url_time = '2018-' + raw_time
        except Exception:
            url_time = ''
        if not contents:
            contents = ''
        else:
            contents = ''.join(contents)
        return contents, url_time

    def save_data(self, item):
        host = '127.0.0.1'
        db = 'bd'
        user = 'root'
        pwd = '123456'
        charset = 'utf8'

        con = pymysql.connect(host=host, db=db, user=user, passwd=pwd, charset=charset)
        cur = con.cursor()
        sql = ('insert into xly(omitted)'
               'values(omitted)')
        values = [...]  # omitted: item fields to insert, in column order
        try:
            cur.execute(sql, values)
            print('insert success')
            self.sum_data += 1
            print('successfully inserted record {} into the database'.format(self.sum_data))
        except Exception as e:
            print('error~~', e)
            con.rollback()
        else:
            con.commit()
        # cur.execute(sql, values)
        cur.close()
        con.close()


if __name__ == '__main__':
    xly = XLY()
    datas = xly.get_data()
    items = xly.parse_data(datas)