• Elasticsearch导出数据存入本地文件


    直接上脚本

      

    # -*- coding: utf-8 -*-
    
    from elasticsearch import Elasticsearch
    
    
    # Logging setup: exported records are written to a size-rotated log file.
    import platform
    import logging.handlers
    sys_platform = platform.system()
    # The log file location depends on the operating system.
    if sys_platform == "Windows":
        LOG_FILE_check = './app_cic.txt'
    else:
        LOG_FILE_check = '/cic1.log'
    # Rotate when a file reaches 1200 MiB and keep at most ten backups.
    # NOTE(review): an earlier comment said "200M" -- confirm the intended size.
    # The encoding is pinned to UTF-8 so non-ASCII records are written the same
    # way on every platform instead of depending on the locale default.
    handler = logging.handlers.RotatingFileHandler(
        LOG_FILE_check,
        maxBytes=1200 * 1024 * 1024,
        backupCount=10,
        encoding='utf-8',
    )
    # Each record is emitted as a newline followed by the bare message, so
    # consecutive entries are separated by an empty line in the file.
    fmt = '\n' + '%(message)s'
    formatter = logging.Formatter(fmt)  # build the formatter
    handler.setFormatter(formatter)  # attach the formatter to the handler
    logger = logging.getLogger('check')  # logger named 'check'
    logger.addHandler(handler)  # attach the handler to the logger
    logger.setLevel(logging.DEBUG)
    
    
    # Client for the node at 20.0.0.11:9200, with sniffing enabled at start-up
    # and after a connection failure (sniff timeout: 60).
    es = Elasticsearch(
        ["20.0.0.11:9200"],
        sniff_on_start=True,
        sniff_on_connection_fail=True,
        sniff_timeout=60,
    )

    import time

    # Search body: a "terms" query on the "site" field with a single value.
    query_json = {"query": {"terms": {"site": ["百度搜索"]}}}
    
    
    
    # Export every hit of the query through the scroll API, de-duplicate the
    # "<other6> <source>" lines and write them out through `logger`.
    page_num = 1000  # number of hits requested per page
    query = es.search(index='guoyan_index_v1', body=query_json, scroll='5m', size=page_num)
    results = query['hits']['hits']  # first page of hits, returned by the search itself
    total = query['hits']['total']  # total number of matching documents
    if isinstance(total, dict):
        # Elasticsearch 7+ reports the total as {"value": N, "relation": ...}.
        total = total['value']
    scroll_id = query['_scroll_id']  # cursor used to fetch the following pages
    every_num = int(total/page_num)  # full-page count; informational only
    # print(results)
    print("total",total)
    print("scroll_id",scroll_id)
    print("every_num",every_num)

    end_data_list = []
    skipped = 0  # hits that could not be turned into an output line
    print("----------",int(total/page_num)+1)
    i = 0
    try:
        # Process the first page too, then keep scrolling until an empty page
        # comes back. The loop does not rely on `total`, which may only be a
        # lower bound on newer servers.
        while results:
            print("正在读取的位置是:",i)
            for key in results:
                try:
                    source = key['_source']["source"]
                    other6 = key['_source']["other6"]
                    result_str = other6 + " " + source
                except (KeyError, TypeError):
                    # A field is missing or is not a string: skip this hit,
                    # but keep count so the loss is visible.
                    skipped += 1
                    continue
                end_data_list.append(result_str)
            page = es.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = page['_scroll_id']  # the cursor may change between pages
            results = page['hits']['hits']
            i += 1
    finally:
        # Release the server-side scroll context instead of letting it
        # linger until the 5m keep-alive expires.
        es.clear_scroll(scroll_id=scroll_id)
    if skipped:
        print("skipped hits without usable source/other6:", skipped)
    # De-duplicate while keeping first-seen order so the output is reproducible.
    end_data_list = list(dict.fromkeys(end_data_list))
    print("去重以后的数据是条数是:",len(end_data_list))
    for end_data in end_data_list:
        logger.info(end_data)
    如果觉得对您有帮助,麻烦您点一下推荐,谢谢!



    好记忆不如烂笔头
  • 相关阅读:
    java jdk 安装后目录下没有 jre
    解决leader-line生成的svg线不能被html2canvas转成图片问题
    css 计数实现目录索引
    前端架构入门
    js 尺寸信息
    阿里云25端口被封,换465端口发送
    appache官网下载 httpClient
    oc基础:类的定义
    指针的地址(地址的地址)
    0 ‘与‘’、 ‘0’
  • 原文地址:https://www.cnblogs.com/xuchunlin/p/14556875.html
Copyright © 2020-2023  润新知