• 今日头条街拍


    spider.py

    import json
    import os
    from urllib.parse import urlencode
    import pymongo
    import requests
    from bs4 import BeautifulSoup
    from requests.exceptions import ConnectionError
    import re
    from multiprocessing import Pool
    from hashlib import md5
    from json.decoder import JSONDecodeError
    from config import *
    
    client = pymongo.MongoClient(MONGO_URL, connect=False)
    db = client[MONGO_DB]
    
    
    def get_page_index(offset, keyword):
        data = {
            'autoload': 'true',
            'count': 20,
            'cur_tab': 3,
            'format': 'json',
            'keyword': keyword,
            'offset': offset,
        }
        params = urlencode(data)
        base = 'http://www.toutiao.com/search_content/'
        url = base + '?' + params
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            print('Error occurred')
            return None
    
    
    def download_image(url):
        print('Downloading', url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                save_image(response.content)
            return None
        except ConnectionError:
            return None
    
    
    def save_image(content):
        file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
        print(file_path)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(content)
                f.close()
    
    
    def parse_page_index(text):
        try:
            data = json.loads(text)
            if data and 'data' in data.keys():
                for item in data.get('data'):
                    yield item.get('article_url')
        except JSONDecodeError:
            pass
    
    
    def get_page_detail(url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            print('Error occurred')
            return None
    
    
    def parse_page_detail(html, url):
        soup = BeautifulSoup(html, 'lxml')
        result = soup.select('title')
        title = result[0].get_text() if result else ''
        images_pattern = re.compile('gallery: JSON.parse("(.*)")', re.S)
        result = re.search(images_pattern, html)
        if result:
            data = json.loads(result.group(1).replace('\', ''))
            if data and 'sub_images' in data.keys():
                sub_images = data.get('sub_images')
                images = [item.get('url') for item in sub_images]
                for image in images: download_image(image)
                return {
                    'title': title,
                    'url': url,
                    'images': images
                }
    
    
    def save_to_mongo(result):
        if db[MONGO_TABLE].insert(result):
            print('Successfully Saved to Mongo', result)
            return True
        return False
    
    
    def main(offset):
        text = get_page_index(offset, KEYWORD)
        urls = parse_page_index(text)
        for url in urls:
            html = get_page_detail(url)
            result = parse_page_detail(html, url)
            if result: save_to_mongo(result)
    
    
    if __name__ == '__main__':
        pool = Pool()
        groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
        pool.map(main, groups)
        pool.close()
        pool.join()
    

    config.py

    MONGO_URL = 'localhost'
    MONGO_DB = 'toutiao'
    MONGO_TABLE = 'toutiao'
    
    GROUP_START = 1
    GROUP_END = 20
    KEYWORD='街拍'
    

    来源于微信公众号: 进击的Coder (ID:FightingCoder)

  • 相关阅读:
    请用正则实现String.trim()
    事件委托的原理是什么?有什么作用?
    请简述get请求和post请求的区别
    用原生js实现,点击一个列表时,输出对应的索引
    请用js写一个函数,实现获取浏览器url中查询字符串中的参数并返回一个数组
    请描述一下cookies、sessionStorage、localStorage、session四者的区别?
    清除浮动的几种方式,各自的优缺点?
    如何使用离线存储(localStorage)?
    使用css怎么让谷歌支持小于12px的文字比如10px
    ajax有哪些方法可以实现跨域?他们都有哪些局限性?
  • 原文地址:https://www.cnblogs.com/hankleo/p/11489793.html
Copyright © 2020-2023  润新知