• lakefs integration with presto/trinodb


    Presto/Trino integrates with lakefs mainly through the Hive connector.

    Prerequisites

    A Hive metastore is required; the docker-compose file below brings one up alongside lakefs, MariaDB, MinIO, and Trino.

    • docker-compose
    version: "3"
    services:
      lakefs:
        image: "treeverse/lakefs:${VERSION:-latest}"
        ports:
          - "8000:8000"
        depends_on:
          - "postgres"
        environment:
          - LAKEFS_AUTH_ENCRYPT_SECRET_KEY=${LAKEFS_AUTH_ENCRYPT_SECRET_KEY:-some random secret string}
          - LAKEFS_DATABASE_CONNECTION_STRING=${LAKEFS_DATABASE_CONNECTION_STRING:-postgres://lakefs:lakefs@postgres/postgres?sslmode=disable}
          - LAKEFS_BLOCKSTORE_TYPE=${LAKEFS_BLOCKSTORE_TYPE:-s3}
          - LAKEFS_BLOCKSTORE_LOCAL_PATH=${LAKEFS_BLOCKSTORE_LOCAL_PATH:-/home/lakefs}
          - LAKEFS_GATEWAYS_S3_DOMAIN_NAME=${LAKEFS_GATEWAYS_S3_DOMAIN_NAME:-s3.local.lakefs.io:8000}
          - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-minio}
          - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_SECRET_KEY=${AWS_SECRET_ACCESS_KEY:-minio123}
          - LAKEFS_LOGGING_LEVEL=${LAKEFS_LOGGING_LEVEL:-INFO}
          - LAKEFS_STATS_ENABLED
          - LAKEFS_BLOCKSTORE_S3_ENDPOINT=http://s3:9000
          - LAKEFS_BLOCKSTORE_S3_FORCE_PATH_STYLE=true
          - LAKEFS_COMMITTED_LOCAL_CACHE_DIR=${LAKEFS_COMMITTED_LOCAL_CACHE_DIR:-/home/lakefs/.local_tier}
        entrypoint:
          [
            "/app/wait-for",
            "postgres:5432",
            "--",
            "/app/lakefs",
            "run"
          ]
      postgres:
        image: postgres:11
        container_name: postgres
        environment:
          POSTGRES_USER: lakefs
          POSTGRES_PASSWORD: lakefs
     
      mariadb:
        image: mariadb:10
        container_name: mariadb
        ports:
          - "3306:3306"
        environment:
          MYSQL_ROOT_PASSWORD: admin
          MYSQL_USER: admin
          MYSQL_PASSWORD: admin
          MYSQL_DATABASE: metastore_db
      s3:
        image: minio/minio
        environment:
          - "MINIO_ACCESS_KEY=minio"
          - "MINIO_SECRET_KEY=minio123"
        command: server /data --console-address ":9001"
        ports:
          - "9000:9000"
          - "9001:9001"
      hive-metastore:
        image: dalongrong/hive-metastore:3.2.0
        container_name: hive
        depends_on:
          - mariadb
        ports:
          - 9083:9083
        environment:
          - DB_URI=mariadb:3306
        volumes:
          - ./etc/hive-site.xml:/opt/apache-hive-bin/conf/hive-site.xml
     
      trino:
        image: trinodb/trino:370
        container_name: trino
        environment:
          - AWS_ACCOUNT_ID
          - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_KEY_ID
          - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_SECRET_ACCESS_KEY
          - LAKECTL_METASTORE_GLUE_CREDENTIALS_ACCESS_KEY_ID
          - LAKECTL_METASTORE_GLUE_CREDENTIALS_ACCESS_SECRET_KEY
        volumes:
          # - ./etc/s3.properties:/etc/trino/catalog/s3.properties
          - ./etc/lakefs.properties:/etc/trino/catalog/lakefs.properties
        ports:
          - "8080:8080"
     
      trino-client:
        image: trinodb/trino:358
        profiles: [ "client" ]
        entrypoint:
          [
            "trino",
            "--server",
            "trino:8080",
            "--catalog",
            "s3",
            "--schema",
            "default"
          ]
     
      create-dbt-schema-main:
        image: trinodb/trino:358
        profiles: [ "client" ]
        entrypoint:
          [
            "trino",
            "--server",
            "trino:8080",
            "--catalog",
            "lakefs",
            "--execute",
            "drop schema if exists dbt_main; create schema dbt_main with (location = 's3://example/main/dbt' )"
          ]
     
    • Hive metastore configuration
      Adjust the values below (hive-site.xml) to match your actual environment:
     
    <configuration>
        <property>
            <name>hive.metastore.uris</name>
            <value>thrift://hive:9083</value>
            <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
        </property>
        <property>
            <name>hive.metastore.event.db.notification.api.auth</name>
            <value>false</value>
        </property>
        <property>
            <name>javax.jdo.option.ConnectionDriverName</name>
            <value>com.mysql.jdbc.Driver</value>
        </property>
        <property>
            <name>javax.jdo.option.ConnectionURL</name>
            <value>jdbc:mysql://mariadb:3306/metastore_db</value>
        </property>
        <property>
            <name>javax.jdo.option.ConnectionUserName</name>
            <value>admin</value>
        </property>
        <property>
            <name>javax.jdo.option.ConnectionPassword</name>
            <value>admin</value>
        </property>
        <property>
            <name>hive.metastore.schema.verification</name>
            <value>false</value>
        </property>
        <property>
            <name>fs.s3a.path.style.access</name>
            <value>true</value>
        </property>
        <property>
            <name>fs.s3.impl</name>
            <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
        </property>
        <property>
            <name>fs.s3a.impl</name>
            <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
        </property>
        <property>
            <name>fs.s3a.endpoint</name>
            <value>http://lakefs:8000</value>
        </property>
        <property>
            <name>fs.s3a.access.key</name>
            <value>AKIAJXJ3AX5EWAAYO3GQ</value>
        </property>
        <property>
            <name>fs.s3a.secret.key</name>
            <value>ayPIL/XuHlPCNUp1x+0dQfyvXCZFp8g+BewEy7bX</value>
        </property>
    </configuration>
    • trinodb configuration
      lakefs.properties:
     
    connector.name=hive-hadoop2
    hive.metastore.uri=thrift://hive-metastore:9083
    hive.non-managed-table-writes-enabled=true
    hive.s3select-pushdown.enabled=true
    hive.s3.aws-access-key=AKIAJXJ3AX5EWAAYO3GQ
    hive.s3.aws-secret-key=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
    hive.s3.endpoint=http://lakefs:8000
    hive.s3.path-style-access=true
    hive.s3.ssl.enabled=false
     
    hive.allow-drop-table=true
    hive.allow-add-column=true
    hive.allow-drop-column=true
    hive.allow-rename-table=true
    hive.allow-rename-column=true

    Connecting to trinodb

    You can use DBeaver (or the trino-client service defined above). Note that the catalog name is taken from the properties file name, so lakefs.properties above is exposed as the lakefs catalog.
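
    Once connected (for DBeaver the Trino JDBC URL would typically be jdbc:trino://localhost:8080, with any username), a quick sanity check confirms the catalog and metastore are wired together — a minimal sketch against the setup above:

    SHOW SCHEMAS FROM lakefs;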

    • 创建lakefs schema 以及table
     
    CREATE SCHEMA lakefs.main
    WITH (location = 's3a://dalong/main');


    CREATE TABLE lakefs.main.page_views2 (
      user_id bigint,
      page_url varchar,
      country varchar
    )
    WITH (
      format = 'ORC',
      bucketed_by = ARRAY['user_id'],
      bucket_count = 50
    );

    insert into lakefs.main.page_views2 (user_id, page_url, country) values (333, 'https://www.baidu.com', 'dalong');
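
    The schema location s3a://dalong/main maps to the main branch of the dalong repository via the lakefs S3 gateway. To verify the write, read the row back:

    select * from lakefs.main.page_views2;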
    • Results

    (screenshot: the inserted data queried back through Trino)

    (screenshot: the written objects visible in the lakefs UI)

    Notes

    The above is a simple integration. The table format here is ORC, but Parquet works as well, which would also allow integrating Dremio. Dremio actually offers quite a few ways to integrate other formats, but because it is not fully open source (features are largely dictated by the commercial edition), none of them are ideal. Two possible extension paths: expose additional storage through the Hive metastore service and handle it with a custom storage plugin (performance may not be great), or go through a JDBC driver.
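
    For example, switching the table to Parquet only changes the format property — a sketch, where the table name page_views_parquet is illustrative:

    CREATE TABLE lakefs.main.page_views_parquet (
      user_id bigint,
      page_url varchar,
      country varchar
    )
    WITH (
      format = 'PARQUET'
    );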

    References

    https://docs.lakefs.io/integrations/presto_trino.html
    https://github.com/rongfengliang/lakefs-hive-trino

  • Original post: https://www.cnblogs.com/rongfengliang/p/15890568.html