• python爬虫练习--爬取今日头条街拍图片


    #! /usr/bin/env python
    # coding: utf-8
    
    import requests
    import os
    from hashlib import md5
    from multiprocessing.pool import Pool
    
    headers = {
    
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    
    }
    
    
    
    def get_pase(url,offset):
    
        params = {
            'offset':offset,
            'format':'json',
            'keyword':'街拍',
            'autoload':'true',
            'count':'20',
            'cur_tab':'1',
            'from':'search_tab'
        }
    
        response = requests.get(url,headers=headers,params=params)
        try:
            if response.status_code == 200:
                return response.json()
    
        except Exception as e:
            print(e.args)
    
    def get_images(json):
        if json:
            items = json.get('data')
            for item in items:
                title = item.get('title')
                images_list = item.get('image_list')
                if images_list:
                    for image in images_list:
                        yield {
                            'title': title,
                            'images': 'http:' + image.get('url')
                        }
    
    
    def save_image(item):
        if not os.path.exists(item.get('title')):
            os.mkdir(item.get('title'))
        try:
    
            image_url = item.get('images')
            new_image_url = image_url.replace('list','large')
            response = requests.get(new_image_url)
            if response.status_code == 200:
                file_path = '{0}/{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
                if not os.path.exists(file_path):
                    with open(file_path, 'wb')as f:
                        f.write(response.content)
                else:
                    print('Already Downloaded', file_path)
        except requests.ConnectionError:
            print('Failed to save image')
    
    
    def main(offset):
        url = 'https://www.toutiao.com/search_content/'
        json = get_pase(url,offset)
        for item in get_images(json):
            print('33[32;1m%s33[0m' % item)
            save_image(item)
    
    GROUP_START = 1
    GROUP_END = 5
    
    if __name__ == '__main__':
        pool = Pool()
        groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
        pool.map(main, groups)
        pool.close()
        pool.join()
  • 相关阅读:
    struts2的在aJax中无法传参数到后台使用:解决方法
    jqGrid的属性(2)特指内容属性
    [leetcode]Binary Tree Maximum Path Sum
    判断二叉树是否平衡(Bottomup)
    [转]反向迭代器(rbegin,rend)
    Crack Interview 3.3
    Crack Interview 9.1 合并排序数组
    字符串转整数
    [转]了解如何通过reverse_iterator的base得到iterator
    通过bitmap的方式用8个int实现对256个char是否出现过做记录
  • 原文地址:https://www.cnblogs.com/watchslowly/p/9148051.html
Copyright © 2020-2023  润新知