• Python AjaxSpider 代码演示



    
    import re # 引入正则表达式
    import json #  引入 json
    import pymongo # 引入mongo数据库
    import requests # 引入HTTP请求协议
    from hashlib import md5 # 引入MD5
    from bs4 import BeautifulSoup #引入BeautifulSoup 信息查询框架
    from multiprocessing import Pool # 引入 多线程池
    from urllib.parse import urlencode #引入网页解析
    from json.decoder import JSONDecodeError #引入json错误异常
    from requests.exceptions import RequestException #引入 HTTP异常
    
    from config import * #导入数据库配置信息
    
    # Module-level MongoDB client shared by the functions below.
    # MONGO_URL is not defined in this file; presumably it comes from the
    # star-import of config -- TODO confirm.
    # NOTE(review): connect=False is documented by PyMongo as deferring the
    # connection until the first operation -- confirm for the installed version.
    client = pymongo.MongoClient(MONGO_URL,connect=False)
    # Database handle looked up by name (MONGO_DB, presumably also from config).
    db = client[MONGO_DB]
    
    # 抓取索引
    def get_page_index(offset, keyword):
        """Request one page of search results and return the raw response text.

        Args:
            offset: Result offset of the page to request.
            keyword: Search term to send to the endpoint.

        Returns:
            The response body as text when the server answers with HTTP 200;
            ``None`` for any other status or when the request raises a
            ``RequestException``.
        """
        # Query-string parameters for the search endpoint.
        params = {
            # The key was previously misspelled 'office', so the offset
            # argument was never sent under the intended name.
            'offset': offset,
            'format': 'json',
            # Send the caller's keyword; previously the literal text
            # 'keyword' was sent and the argument was ignored.
            'keyword': keyword,
            'autoload': 'true',
            'count': '20',
            'cur_tab': 3,
        }
        url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
        try:
            response = requests.get(url)
            # Only a 200 response is treated as usable data.
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print('请求索引出错')
            return None
    
    def parse_page_index(html):
        """Yield the article URLs contained in a search-index JSON payload.

        Args:
            html: JSON text of the index response. May be ``None`` or empty,
                which is what the index request returns on failure.

        Yields:
            Each non-empty ``article_url`` value found in the top-level
            ``data`` list. Nothing is yielded for missing, malformed or
            unexpectedly shaped input.
        """
        # A failed index request hands us None; json.loads(None) would raise
        # TypeError, which the JSONDecodeError handler below does not cover.
        if not html:
            return
        try:
            payload = json.loads(html)
        except JSONDecodeError:
            return
        if not isinstance(payload, dict):
            return
        # 'data' may be absent or null; treat both as "no results".
        for item in payload.get('data') or []:
            if not isinstance(item, dict):
                continue
            article_url = item.get('article_url')
            # Skip entries without a URL so callers never receive None.
            if article_url:
                yield article_url
    
    def get_page_detail(url):
        """Download an article detail page and return its body text.

        Args:
            url: Address of the detail page to fetch.

        Returns:
            The response body as text when the server answers with HTTP 200;
            ``None`` for any other status or when the request raises a
            ``RequestException``.
        """
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            # The URL is already part of this message; the former second
            # print(url) only duplicated it.
            print('请求详情页出错',url)
            return None
    
    
    def parse_page_detail(html, url):
        """Extract the title and gallery image URLs from a detail page.

        Every image URL found is also passed to ``download_image``.

        Args:
            html: Source text of the detail page.
            url: Address the page was fetched from; stored in the result.

        Returns:
            A dict with keys ``title``, ``url`` and ``images`` when the page
            embeds a ``var gallery = ...;`` JSON object that has a
            ``sub_images`` key; otherwise ``None``.
        """
        soup = BeautifulSoup(html, 'lxml')
        title_nodes = soup.select('title')
        # Pages without a <title> element previously raised IndexError here.
        title = title_nodes[0].get_text() if title_nodes else ''
        print(title)

        # The lazy group needs a terminator: without the trailing ';' it
        # always captured an empty string and json.loads('') then raised.
        gallery_pattern = re.compile(r'var gallery = (.*?);', re.S)
        match = gallery_pattern.search(html)
        if not match:
            return None
        try:
            data = json.loads(match.group(1))
        except JSONDecodeError:
            # E.g. the capture was cut short at a ';' inside the JSON text.
            return None
        if not isinstance(data, dict) or 'sub_images' not in data:
            return None

        # 'sub_images' may be present but null; treat that as an empty list.
        sub_images = data.get('sub_images') or []
        images = [item.get('url') for item in sub_images]
        for image in images:
            download_image(image)
        return {
            'title': title,
            'url': url,
            'images': images,
        }
    
    def save_to_monogo(result):
        """Insert one parsed record into the configured MongoDB collection.

        Args:
            result: Document (dict) to store.

        Returns:
            ``True`` when the insert produced a document id, else ``False``.
        """
        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        outcome = db[MONGO_TABLE].insert_one(result)
        if outcome.inserted_id is not None:
            print('存储到MonogoDB成功',result)
            return True
        return False
    
    def download_image(url):
        """Fetch an image and hand its bytes to ``save_image``.

        Args:
            url: Address of the image to download.

        Returns:
            Always ``None``. The image is saved only when the server answers
            with HTTP 200; request errors are reported via ``print``.
        """
        print('正在下载',url)
        try:
            reply = requests.get(url)
            fetched_ok = reply.status_code == 200
            if fetched_ok:
                save_image(reply.content)
        except RequestException:
            print('请求图片出错出错',url)
        return None
    
    def save_image(content):
        """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

        Naming the file after the MD5 digest of its content means identical
        images map to the same path and are written only once.

        Args:
            content: Raw image bytes.
        """
        # Local import: os is not among the imports visible at the top of
        # this file, and the original referenced an undefined name `ls`.
        import os

        file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
        file_path = os.path.join(os.getcwd(), file_name)
        if not os.path.exists(file_path):
            # The with-block closes the file; the original called
            # f.writable(content), which is not a write operation.
            with open(file_path, 'wb') as f:
                f.write(content)
    
    
    def main(offset):
        """Crawl one search-result page and store every gallery found on it.

        Args:
            offset: Result offset of the index page to process.
        """
        index_html = get_page_index(offset, KEYWORD)
        # get_page_index returns None on failure; stop here instead of
        # passing None on to the JSON parser.
        if not index_html:
            return
        for url in parse_page_index(index_html):
            detail_html = get_page_detail(url)
            if detail_html:
                result = parse_page_detail(detail_html, url)
                if result:
                    save_to_monogo(result)
                print(result)
    if __name__ == '__main__':
        # One task per index page: offsets are multiples of 20 over the
        # inclusive range GROUP_START..GROUP_END.
        groups = [x*20 for x in range(GROUP_START,GROUP_END + 1)]
        # Pool() starts worker processes. map() blocks until every task has
        # finished, and the context manager then releases the workers
        # instead of leaving the pool open.
        with Pool() as pool:
            pool.map(main, groups)
    
    
    
  • 相关阅读:
    FHDe2Net:Full High Definition Demoireing Network
    Single Image Reflection Removal through Cascaded Refinement
    GFN___Gated Fusion Network for Single Image Dehazing
    127. 单词接龙 哈希表 BFS 优化建图 双向搜索
    面试题 02.08. 环路检测 快慢指针
    503. 下一个更大元素 II (暴力、单调栈)
    GINet:Graph Interaction Network for Scene Parsing
    FFDNet: Toward a Fast and Flexible Solution for CNN-Based Image Denoising
    搬家完成!
    Lucas定理
  • 原文地址:https://www.cnblogs.com/wordgao/p/9824675.html
Copyright © 2020-2023  润新知