Analyzing Ajax Requests and Scraping Toutiao Street Photography Images
Analyzing the page
The parameters above are passed as data. Note that offset is not a page number: as you scroll down the page and new content is loaded, the offset value keeps growing.
Locate the request URL:
The highlighted part of the request URL is fixed; the second half is just the url-encoded data parameters.
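A minimal sketch of how the full URL is assembled (the parameter names come from the captured request shown above; only a few of them are used here). As offset grows by 20 per scroll, the query string changes while the prefix stays fixed.

from urllib.parse import urlencode

BASE = 'https://www.toutiao.com/api/search/content/?'   # the fixed, highlighted part

def build_url(offset, keyword='人像摄影'):
    # the second half of the URL is just the url-encoded data parameters
    params = {'offset': offset, 'format': 'json', 'keyword': keyword,
              'autoload': 'true', 'count': 20}
    return BASE + urlencode(params)

for offset in (0, 20, 40):   # each scroll loads 20 more items, so offset grows by 20
    print(build_url(offset))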
If the request is sent without a Cookie, you run into the following problem:
the data returned is incomplete.
So find the Cookie in the browser and put it into the request headers.
When requests.get returns wrong or missing content, the Cookie is the most likely cause. When downloading the images themselves, do not send the headers.
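A minimal sketch of the two cases (the Cookie value is whatever you copy from your own browser's developer tools, and the image URL below is only a placeholder):

import requests

index_url = ('https://www.toutiao.com/api/search/content/'
             '?offset=0&format=json&keyword=%E4%BA%BA%E5%83%8F%E6%91%84%E5%BD%B1')

headers = {
    'Cookie': '<Cookie string copied from the browser>',   # without this, the returned data is incomplete
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
    'Host': 'www.toutiao.com',
}
index_resp = requests.get(index_url, headers=headers)        # index/detail pages: send the headers

image_resp = requests.get('https://example.com/some.jpg')    # image downloads: plain request, no headers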
import json
import os
import re
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import config
import pymongo
from hashlib import md5
from multiprocessing import Pool

client = pymongo.MongoClient(config.MONGO_URL, connect=False)
db = client[config.MONGO_DB]


# Request one page of the search index (the Ajax interface).
def get_page_index(offset, keyword):
    data = {
        'aid': 24,
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'en_qc': 1,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(data)
    headers = {
        'Cookie': 'UM_distinctid=16b3761e1ad453-0595b1308b57738-70226752-100200-16b3761e1ae16e;'
                  'tt_webid=6700158723510421000;csrftoken=9529582d3a82b38c3de771c3142adee4;'
                  'tt_webid=6700158723510421000;'
                  'WEATHER_CITY=%E5%8C%97%E4%BA%AC;'
                  'CNZZDATA1259612802=627663064-1560000420-%7C1560062522;'
                  '__tasessionId=3e436xfba1560065889568;'
                  's_v_web_id=fc9f6b68f10a508f45a92c3bc7ca0400',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
        'Host': 'www.toutiao.com',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the index page')
        return None


# Extract the URL of each article from the index response.
def parse_page_index(html):
    data = json.loads(html)  # the response is JSON
    if data and 'data' in data.keys():  # make sure the request succeeded and 'data' is present
        try:
            for item in data.get('data'):
                yield item.get('article_url')  # URL of the article detail page
        except TypeError:
            pass


# Request an article detail page.
def get_page_detail(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
        'Host': 'www.toutiao.com',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the detail page', url)
        return None


# Parse a detail page: pull out the title and all image URLs.
def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    image_pattern = re.compile('pgc-img">.*?(http.*?)"', re.S)
    results = re.findall(image_pattern, html)
    for image in results:
        print(image)
        download_image(image)
    if results:
        return {
            'title': title,
            'url': url,
            'image': results
        }


# Save one record to MongoDB.
def save_to_mongo(result):
    if db[config.MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB:', result)
        return True
    return False


# Download a single image (no headers needed here).
def download_image(url):
    print('Saving image:', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)  # binary content
        return None
    except RequestException:
        print('Error requesting the image', url)
        return None


# Write the image bytes under ./picture, named by the MD5 of the content.
def save_image(content):
    os.makedirs(os.path.join(os.getcwd(), 'picture'), exist_ok=True)
    file_path = '{0}/picture/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    html = get_page_index(offset, '人像摄影')
    print('offset:', offset)
    if html is None:  # index request failed, nothing to parse
        return
    for url in parse_page_index(html):
        if url is None:  # some items carry no article URL
            continue
        html_detail = get_page_detail(url)  # fetch the article detail page
        if html_detail:
            result = parse_page_detail(html_detail, url)  # title and images
            if result is None:
                continue
            save_to_mongo(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(config.GROUP_START, config.GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
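parse_page_detail pulls the image URLs out with the regular expression pgc-img">.*?(http.*?)". A quick self-contained check on a made-up HTML fragment (illustrative only, not a real Toutiao page) shows what it captures:

import re

html = ('<div class="pgc-img"><img src="http://example.com/a.jpg"></div>'
        '<div class="pgc-img"><img src="http://example.com/b.jpg"></div>')

image_pattern = re.compile('pgc-img">.*?(http.*?)"', re.S)
print(re.findall(image_pattern, html))
# ['http://example.com/a.jpg', 'http://example.com/b.jpg']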
Configuration file (config.py):
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
The records are saved in MongoDB, and the images are saved under the picture directory.
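To spot-check what was stored, you can query the collection back with pymongo, reusing the same config module as the spider (a minimal sketch):

import pymongo
import config

client = pymongo.MongoClient(config.MONGO_URL)
db = client[config.MONGO_DB]

# print the title and number of images for the first few stored articles
for doc in db[config.MONGO_TABLE].find().limit(5):
    print(doc['title'], len(doc['image']))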
To browse the database visually, you can use Robomongo, a MongoDB GUI client.