• Analyzing AJAX requests to scrape Toutiao's street-photography images


    The Toutiao page to scrape:

    Analyzing the AJAX requests of the street-photography page:

    Inspecting the XHR requests in the browser's developer tools yields the base URL and the params; concatenating the two produces the complete request URL. In the returned data, article_url holds the link to each detail page.

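    Concretely, urlencode turns the params dict into the query string that is appended to the base API URL. A minimal sketch, using only a few of the captured parameters:

        from urllib.parse import urlencode

        params = {'aid': 24, 'offset': 0, 'keyword': '街拍', 'count': 20}
        url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
        print(url)
        # https://www.toutiao.com/api/search/content/?aid=24&offset=0&keyword=%E8%A1%97%E6%8B%8D&count=20
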
    The code:

        # Fetch one page of the search index
        def one_page_index(offset, keyword, headers):
            params = {
                'aid': 24,
                'app_name': 'web_search',
                'offset': offset,
                'format': 'json',
                'keyword': keyword,
                'autoload': 'true',
                'count': 20,
                'en_qc': 1,
                'cur_tab': 1,
                'from': 'search_tab',
                'pd': 'synthesis',
            }
            url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
            try:
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    return response.text
                return None
            except RequestException:
                print('Error requesting the index page!')
                return None
        # Parse the index page and yield detail-page links
        def parse_one_page(html):
            data = json.loads(html)
            if data and 'data' in data.keys():
                for item in data.get('data'):
                    yield item.get('article_url')
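
    A quick usage sketch of the two functions together (offset 0 and a headers dict are assumed):

        html = one_page_index(0, '街拍', headers)
        if html:
            for article_url in parse_one_page(html):
                print(article_url)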

    Parsing the detail pages:

    The captured data must have its format adjusted before it can be parsed as JSON; a worked example follows the code below.

    The code:

        def parse_detail(html, url):
            soup = BeautifulSoup(html, 'lxml')
            title = soup.select('title')[0].get_text()
            image_pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\)', re.S)
            result = re.search(image_pattern, html)
            if result:
                # group(1) is the content of the first capture group: the
                # escaped JSON string inside JSON.parse("...").
                # Normalize it so json.loads can parse it.
                new_result = result.group(1).replace('\\u002F', '/')
                new_result = new_result.replace('\\', '')
                data = json.loads(new_result)
                if data and 'sub_images' in data.keys():
                    sub_images = data.get('sub_images')
                    images = [item.get('url') for item in sub_images]
                    for image in images:
                        download_image(image)
                    return {
                        "title": title,
                        "url": url,
                        "images": images
                    }
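
    To see what the two replace calls do, here is a minimal sketch with a made-up captured string (the real gallery JSON is much larger): the page embeds the JSON with escaped quotes and with \u002F in place of slashes, and the replacements undo that escaping so json.loads succeeds.

        import json

        # Hypothetical capture from gallery: JSON.parse("...")
        captured = '{\\"url\\": \\"http:\\u002F\\u002Fexample.com\\u002Fimg.jpg\\"}'
        step1 = captured.replace('\\u002F', '/')  # restore the slashes
        step2 = step1.replace('\\', '')           # drop the escaping backslashes
        print(json.loads(step2))                  # {'url': 'http://example.com/img.jpg'}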

    An edge case:

    On the index page, some results are not galleries: they display all of their images inline on a single article page. Their HTML differs from the gallery pages, so the parser finds no match and returns None. This must be checked before writing to the database.
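
    A minimal guard (a sketch; save_to_mongodb below also re-checks the result internally):

        result = parse_detail(html, url)
        if result:  # single-page articles yield None and are skipped
            save_to_mongodb(result)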

    Finally, the scraped records are saved to a MongoDB database, and the images are saved to local files.
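
    The local filenames come from the MD5 of the image bytes, so identical content always maps to the same name and a duplicate is written only once. A tiny sketch with dummy bytes:

        from hashlib import md5

        content = b'dummy image bytes'
        print(md5(content).hexdigest() + '.jpg')  # same bytes, same filename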


    The complete code:

        # config.py

        MONGO_URL = "localhost"
        MONGO_DB = 'toutiao'
        MONGO_TABLE = 'toutiao'

        GROUP_START = 1
        GROUP_END = 20
        KEYWORD = '街拍'  # the search keyword ("street photography")
    And the crawler script itself:

        import json
        import os
        import re
        from hashlib import md5
        from urllib.parse import urlencode
        import pymongo
        from bs4 import BeautifulSoup
        from requests.exceptions import RequestException
        import requests
        from toutiao.config import *
        from multiprocessing import Pool


        client = pymongo.MongoClient(MONGO_URL)
        db = client[MONGO_DB]


        # Fetch one page of the search index
        def one_page_index(offset, keyword, headers):
            params = {
                'aid': 24,
                'app_name': 'web_search',
                'offset': offset,
                'format': 'json',
                'keyword': keyword,
                'autoload': 'true',
                'count': 20,
                'en_qc': 1,
                'cur_tab': 1,
                'from': 'search_tab',
                'pd': 'synthesis',
            }
            url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
            try:
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    return response.text
                return None
            except RequestException:
                print('Error requesting the index page!')
                return None


        # Fetch the HTML of a detail page
        def get_detail_page(url, headers):
            try:
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    return response.text
                return None
            except RequestException:
                print('Error requesting the detail page!')
                return None


        # Parse the index page and yield detail-page links
        def parse_one_page(html):
            data = json.loads(html)
            if data and 'data' in data.keys():
                for item in data.get('data'):
                    yield item.get('article_url')


        # Parse a detail page with a regular expression
        def parse_detail(html, url):
            soup = BeautifulSoup(html, 'lxml')
            title = soup.select('title')[0].get_text()
            image_pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\)', re.S)
            result = re.search(image_pattern, html)
            if result:
                # group(1) is the escaped JSON string inside JSON.parse("...");
                # normalize it so json.loads can parse it.
                new_result = result.group(1).replace('\\u002F', '/')
                new_result = new_result.replace('\\', '')
                data = json.loads(new_result)
                if data and 'sub_images' in data.keys():
                    sub_images = data.get('sub_images')
                    images = [item.get('url') for item in sub_images]
                    for image in images:
                        download_image(image)
                    return {
                        "title": title,
                        "url": url,
                        "images": images
                    }


        # Download an image
        def download_image(url):
            print('Downloading', url)
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    # Binary data lives in response.content
                    save_image(response.content)
                return None
            except RequestException:
                print('Error requesting the image!')
                return None


        # Save an image to the local directory.
        # The filename is the MD5 of the content: identical content gives an
        # identical name, which prevents storing duplicate images.
        def save_image(content):
            file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(content)


        # Store a record in MongoDB.
        # Single-page articles differ from gallery pages, so parsing them
        # matches nothing; None results must not be inserted.
        def save_to_mongodb(result):
            if result and db[MONGO_TABLE].insert_one(result):
                print('Saved to MongoDB:', result)
                return True
            return False


        def main(offset):
            headers = {
                "User-Agent": 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
                "cookie": 'tt_webid=6741657574736889357; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6741657574736889357; csrftoken=9e7ac598957d2ec36c80c6f1e05b9622; s_v_web_id=2ab2b8ff35fc91cacdec489ca9a5570f; __tasessionId=d6osr6e6r1569719745562; UM_distinctid=16d7a9f7f4724e-0d4cc2a69d799-3c375d0d-100200-16d7a9f7f4a4f1'
            }
            html = one_page_index(offset, KEYWORD, headers)
            if not html:
                return
            for url in parse_one_page(html):
                if url:
                    html = get_detail_page(url, headers)
                    if html:
                        result = parse_detail(html, url)
                        save_to_mongodb(result)


        # Run with a process pool, one offset (page of results) per task
        if __name__ == '__main__':
            groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
            pool = Pool()
            pool.map(main, groups)
  • Original post: https://www.cnblogs.com/liqiongming/p/11609032.html