• Python crawler that uploads to Elasticsearch (including date fields)


    Python crawler

    1 Install the required libraries with pip and import them

    from bs4 import BeautifulSoup
    import requests
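
    A minimal install command for these two packages (assuming pip points at the interpreter in use):

    pip install requests beautifulsoup4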
    
    

    2 Build the request

    Using one site as an example. This one is a POST request; the exact request depends on the target site.

    
    headers = {'Host': 'xxxxx',
               'Origin': 'xxxxx',
               'Content-Type': 'xxxx',
               'Referer': 'xxxx',
               'User-Agent': 'xxxxxxx',
               'X-Requested-With': 'xxxxx'
               }

    content = {'m': 'xflist',
               'city': 'wf',
               'district': ''}

    url = 'xxxxx'

    res = requests.post(url, data=content, headers=headers)
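
    Before parsing, it helps to confirm that the request succeeded; a minimal check reusing the variables from the snippet above:

    # the POST should return 200; the body is the HTML parsed in the next step
    print(res.status_code)
    html = res.text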
    
    

    3 Parse the returned page

    
    # html is the response body from the request above (html = res.text)
    soup = BeautifulSoup(html, "html.parser")
    jsonArray = []
    ul_list = soup.select('body > div.main > section.mBox.mb8.dtbk > div > ul')
    for i in ul_list:
        lilist = i.find_all("li")
        for j in lilist:
            create_time = j.select("div.time")[0].getText()
            print(create_time)
            content = j.select("h4")[0].getText()
            print(content)
            record = {"createTime": create_time, "content": content}
            jsonArray.append(record)
    
    

    A truncated sample of the crawled data, including the date fields, looks like this:

    
    json = {
        "floor_area": "57535 ㎡",
        "building_area": "250000 ㎡",
        "volume_rate": "3.78",
        "greening_rate": "30%",
        "parking_rate": "项目规划车位数量为1889个",
        "record": [
            {
                "createTime": "2018-11-20 11:03:33",
                "content": "新瑞都"
            },
            {
                "createTime": "2020-12-31",
                "content": "3号楼"
            },
            {
                "createTime": "2018-11-03",
                "content": "3号楼"
            }
        ]
    }
    

    4 Import the elasticsearch client library and make sure Elasticsearch is running

    from elasticsearch import Elasticsearch
    # make_request1 is the author's own crawler routine from the steps above;
    # it returns the JSON document to be indexed
    from fang import make_request1


    es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
    json = make_request1()

    es.index(index="house_test", doc_type="fang", body=json)
    
    

    Running this raises an error.

    Clearly, although the crawled values are plain strings, once they are uploaded Elasticsearch's dynamic mapping recognizes them as a date format.

    So the field types are specified explicitly when the Elasticsearch index is created.
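
    As an aside, another way to avoid this is to turn off dynamic date detection when the index is created, so string fields are never auto-mapped to date. A minimal sketch, assuming the same local node and the "fang" type used below:

    from elasticsearch import Elasticsearch

    es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
    # date_detection=False stops Elasticsearch from guessing a date type
    # for string values; explicit per-field mappings can still be declared.
    es.indices.create(index="house_test",
                      body={"mappings": {"fang": {"date_detection": False}}})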

    5 Create the Elasticsearch index with an explicit mapping; the implementation is as follows

    
    from elasticsearch import Elasticsearch

    es = Elasticsearch('192.168.1.1:9200')

    mappings = {
        "mappings": {
            "fang": {
                "properties": {
                    "open_time": {
                        "type": "keyword",
                        "index": False
                    },
                    "volume_rate": {
                        "type": "keyword",
                        "index": False
                    },
                    "greening_rate": {
                        "type": "keyword",
                        "index": False
                    },
                    "parking_rate": {
                        "type": "keyword",
                        "index": False
                    },
                    "house_type": {
                        "type": "keyword",
                        "index": False
                    },
                    "property_company": {
                        "type": "keyword",
                        "index": False
                    },
                    # an object field can hold nested JSON; sub-fields are
                    # addressed as projectComment.createTime / projectComment.content
                    "projectComment": {
                        "type": "object",
                        "properties": {
                            "createTime": {"type": "keyword", "index": False},
                            "content": {"type": "keyword", "index": False},
                        }
                    },
                }
            }
        }
    }


    res = es.indices.create(index='index_test', body=mappings)
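
    To confirm the mapping took effect, it can be read back with the same client (a quick sanity check):

    # prints the stored mapping for index_test as a Python dict
    print(es.indices.get_mapping(index='index_test'))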
    
    

    Bulk-insert the documents:

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    es = Elasticsearch('192.168.1.1:9200')

    ACTIONS = []

    json1 = {
        "floor_area": "57535 ㎡",
        "building_area": "250000 ㎡",
        "volume_rate": "3.78",
        "greening_rate": "30%",
        "parking_rate": "项目规划车位数量为1889个",
        "record": [
            {
                "createTime": "2018-11-20 11:03:33",
                "content": "新瑞都"
            },
            {
                "createTime": "2020-12-31",
                "content": "3号楼"
            },
            {
                "createTime": "2018-11-03",
                "content": "3号楼"
            }
        ]
    }
    json2 = {
        "floor_area": "354345 ㎡",
        "building_area": "234500 ㎡",
        "volume_rate": "453",
        "greening_rate": "43%",
        "parking_rate": "项目规划车位数量为1889个",
        "record": [
            {
                "createTime": "2018-11-20 11:03:33",
                "content": "新瑞都"
            },
            {
                "createTime": "2020-12-31",
                "content": "3号楼"
            },
            {
                "createTime": "2018-11-03",
                "content": "3号楼"
            }
        ]
    }

    ACTIONS.append(json1)
    ACTIONS.append(json2)

    # index name matches the index created above; doc_type matches the "fang"
    # type from the mapping (needed on Elasticsearch versions that still use types)
    res, _ = bulk(es, ACTIONS, index="index_test", doc_type="fang", raise_on_error=True)
    
    
    

    6 Query the index

    Send the search request through Postman,

    and the stored documents come back in the response.
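
    The query itself is just an HTTP GET against the index, e.g. http://192.168.1.1:9200/index_test/_search in Postman. An equivalent query from Python, a minimal sketch using the index and client from the steps above:

    from elasticsearch import Elasticsearch

    es = Elasticsearch('192.168.1.1:9200')

    # match_all returns every document that was bulk-inserted above
    res = es.search(index="index_test", body={"query": {"match_all": {}}})
    for hit in res["hits"]["hits"]:
        print(hit["_source"]["volume_rate"], hit["_source"]["record"])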

  • Original article: https://www.cnblogs.com/gloria-liu/p/9995642.html