• 通过分析Ajax请求 抓取今日头条街拍图集


    代码:

    import os
    import re
    import json
    import time
    from hashlib import md5
    from multiprocessing import Pool
    
    import requests
    from requests.exceptions import RequestException
    from pymongo import MongoClient
    
    # --- Scraper configuration ---
    OFFSET_START = 0   # first page offset to crawl (inclusive)
    OFFSET_END = 20    # last page offset to crawl (exclusive)
    KEYWORD = '街拍'   # search keyword ("street snap")
    
    # --- MongoDB configuration ---
    MONGO_URL = 'localhost'
    MONGO_DB = 'toutiao'   # database name
    MONGO_TABLE = 'jiepai'  # collection name
    
    # Directory (relative to the working dir) where images are saved
    IMAGE_PATH = 'images'
    
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    
    # Module-level MongoDB handle; each multiprocessing worker re-imports
    # this module and therefore gets its own connection.
    client = MongoClient(host=MONGO_URL)
    db = client[MONGO_DB]
    jiepai_table = db[MONGO_TABLE]
    
    # Make sure the image directory exists before any download starts.
    if not os.path.exists(IMAGE_PATH):
        os.mkdir(IMAGE_PATH)
    
    
    def get_html(url, params=None, timeout=10):
        '''
        GET *url* with optional query *params* and return the response body.

        Returns the page text on HTTP 200, otherwise None (including on any
        request error, which is logged).  *timeout* (seconds, new optional
        parameter) is forwarded to requests so a stalled connection cannot
        hang a worker process forever.
        '''
        try:
            response = requests.get(url, params=params,
                                    headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException as e:
            print("请求%s失败: " % url, e)
            return None
    
    # 获取索引页内容
    def get_index_page(offset, keyword):
        '''
        Download one page of search results from the Toutiao search API.

        Returns the raw JSON text for the given *offset* and *keyword*,
        or None when the request fails.
        '''
        query = {
            'offset': offset,
            'format': 'json',
            'keyword': keyword,
            'autoload': 'true',
            'count': 20,
            'cur_tab': 3
        }
        return get_html('http://www.toutiao.com/search_content/', params=query)
    
    
    def parse_index_page(html):
        '''
        Parse the JSON text of an index page.

        Yields every detail-page URL found in the page's 'data' entries.
        Invalid JSON, a null 'data' field, and entries without an
        'article_url' key are all skipped instead of raising, so one bad
        API response cannot kill a worker.  Only Toutiao group-page URLs
        are yielded.
        '''
        if not html:
            return
        try:
            data = json.loads(html)
        except ValueError:
            # Malformed JSON from the API: treat as an empty page.
            return
        for item in data.get('data') or []:
            article_url = item.get('article_url', '')
            if 'toutiao.com/group' in article_url:
                yield article_url
    
    
    # 获取详情页
    def get_detail_page(url):
        '''Fetch one detail page and return its HTML text (None on failure).'''
        detail_html = get_html(url)
        return detail_html
    
    # 解析详情页
    
    
    def parse_detail_page(url, html):
        '''
        Parse a detail page's HTML.

        Returns a dict with the page 'title', its 'url' and the list of
        image URLs taken from the embedded ``var gallery = ...;`` JSON,
        or None when the page has no parsable gallery.

        Pages without a <title> tag used to crash with AttributeError,
        and a truncated gallery blob used to crash json.loads; both cases
        now degrade gracefully.
        '''
        title_match = re.search(r'<title>(.*?)</title>', html)
        title = title_match.group(1) if title_match else ''
        gallery_match = re.search(r'var gallery = (.*?);', html)
        if not gallery_match:
            return None
        try:
            gallery = json.loads(gallery_match.group(1))
        except ValueError:
            # Non-greedy match cut the JSON short, or the blob is malformed.
            return None
        if not isinstance(gallery, dict) or 'sub_images' not in gallery:
            return None
        image_list = [image['url'] for image in gallery['sub_images']]
        return {
            'title': title,
            'url': url,
            'images': image_list
        }
    
    
    def save_to_mongodb(content):
        '''Insert one crawled record (title/url/images dict) into MongoDB.'''
        # Collection.insert() was deprecated and removed in pymongo 4;
        # insert_one() is the supported single-document API.
        jiepai_table.insert_one(content)
        print("存储到mongodb成功", content)
    
    
    def download_images(image_list):
        '''
        Download every URL in *image_list* and persist each via save_image().

        Failed downloads are logged and skipped so one bad URL does not
        abort the rest of the gallery.  A timeout keeps a dead image host
        from hanging the worker process.
        '''
        for image_url in image_list:
            try:
                response = requests.get(image_url, timeout=10)
                if response.status_code == 200:
                    save_image(response.content)
            except RequestException as e:
                print("下载图片失败: ", e)
    
    
    def save_image(content):
        '''
        Persist raw image bytes to disk.

        The filename is the MD5 digest of the bytes, so byte-identical
        images map to the same path and are written only once.
        '''
        file_name = md5(content).hexdigest() + '.jpg'
        file_path = '{}/{}/{}'.format(os.getcwd(), IMAGE_PATH, file_name)
        # Skip duplicates: an existing file means these bytes were saved before.
        if os.path.exists(file_path):
            return
        with open(file_path, 'wb') as f:
            f.write(content)
    
    
    def jiepai(offset):
        '''
        Crawl one index page: fetch the search results at *offset*, then for
        every detail page listed, store the parsed record in MongoDB and
        download its images.  Sleeps 1s after each stored record to be
        polite to the server.
        '''
        index_html = get_index_page(offset, KEYWORD)
        if index_html is None:
            return
        for detail_url in parse_index_page(index_html):
            print('get detail page:', detail_url)
            detail_html = get_detail_page(detail_url)
            if detail_html is None:
                continue
            record = parse_detail_page(detail_url, detail_html)
            if record:
                save_to_mongodb(record)
                download_images(record['images'])
                time.sleep(1)
        print('-------------------------------------')
    
    
    if __name__ == '__main__':
        # One task per page offset, fanned out across worker processes.
        offset_list = range(OFFSET_START, OFFSET_END)
        # Use the pool as a context manager so its workers are reliably
        # cleaned up; map() blocks until every offset has been crawled.
        with Pool() as pool:
            pool.map(jiepai, offset_list)

     备注:

    其实通过url请求返回的json数据中已经包含了图片列表

    import requests
    
    
    # The search API already embeds each article's image list in its JSON
    # response, so the detail pages never need to be fetched separately.
    basic_url = 'http://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=3'
    payload = requests.get(basic_url.format(0)).json()
    for entry in payload['data']:
        title = entry['media_name']
        image_list = [detail['url'] for detail in entry['image_detail']]
        print(title, image_list)
  • 相关阅读:
    HCIA-IoT 华为认证物联网工程师
    [书目20210522]投资最重要的事
    [书目20210414]海龟交易法则
    [书目20210224]陆蓉 行为金融学讲义
    [书目20210207]肖星的财务思维课
    [转]昂克英文君 一张图告诉你英语该怎么学
    Cloud Native
    Aerospike-介绍
    Groovy使用场景
    javaStream与响应式流
  • 原文地址:https://www.cnblogs.com/hupeng1234/p/7112440.html
Copyright © 2020-2023  润新知