- I have recently been looking into connecting Flink SQL to Hive. I hit quite a few pitfalls the first time, so I am writing them down here.
- First, running from IDEA requires a Hadoop installation on Windows with the environment variables configured; otherwise the job fails with an error that $HADOOP_HOME cannot be found. A small fallback sketch follows below.
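If changing the Windows system environment variables is inconvenient, the hadoop.home.dir JVM system property can point at the unpacked Hadoop directory instead (Hadoop's Shell utility checks it before HADOOP_HOME). A minimal sketch, assuming a hypothetical install path D:/hadoop-3.2.2 with winutils.exe under its bin/ folder:

public class HadoopHomeCheck {
    public static void main(String[] args) {
        if (System.getenv("HADOOP_HOME") == null) {
            // Fallback: Hadoop also honors this system property when locating winutils.exe.
            // D:/hadoop-3.2.2 is a placeholder; use your own unpacked Hadoop directory.
            System.setProperty("hadoop.home.dir", "D:/hadoop-3.2.2");
        }
        System.out.println("HADOOP_HOME = " + System.getenv("HADOOP_HOME"));
    }
}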
- Once that is configured, go to the existing Hadoop environment on the Linux server and copy its core-site.xml and hdfs-site.xml into the resources directory of the IDEA project; the short check below verifies that they are picked up.
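A quick way to confirm the two files are actually on the classpath is to build a Hadoop Configuration and check that fs.defaultFS resolves to the cluster address rather than the local file system. A minimal sketch (listing the HDFS root is just an example):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsConfigCheck {
    public static void main(String[] args) throws Exception {
        // core-site.xml on the classpath is loaded automatically;
        // hdfs-site.xml is read once the HDFS client classes initialize.
        Configuration conf = new Configuration();
        System.out.println("fs.defaultFS = " + conf.get("fs.defaultFS"));

        // Listing the HDFS root proves connectivity to the NameNode.
        FileSystem fs = FileSystem.get(conf);
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath());
        }
        fs.close();
    }
}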
- At this point, try running an MR WordCount job to verify that the environment is fully set up; a minimal sketch is included below.
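For reference, a minimal WordCount in the classic Hadoop MapReduce style. The input and output paths are placeholders (the output directory must not exist before the run):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    // Mapper: split each line into words and emit (word, 1).
    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }
    }

    // Reducer: sum the counts for each word.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Placeholder HDFS paths; adjust to your cluster.
        FileInputFormat.addInputPath(job, new Path("/tmp/wordcount/input"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/wordcount/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}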
- Next, go to the existing Hive environment on the Linux server and copy its hive-site.xml into the resources directory of the IDEA project as well; the snippet below shows the properties that matter most for Flink.
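The key entry Flink's HiveCatalog needs from hive-site.xml is the metastore address. A minimal sketch of the relevant properties; the host and port are placeholders and the warehouse path is only an example:

<configuration>
    <!-- Thrift address of the Hive metastore service; host/port are placeholders. -->
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hadoop102:9083</value>
    </property>
    <!-- Warehouse location on HDFS (example value). -->
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
    </property>
</configuration>

Note that the HiveCatalog talks to the metastore service, so the metastore (typically started on the server with hive --service metastore) must be reachable at that address.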
- With the preparation done, development of the code can begin.
- The dependencies that need to be included are as follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>flink_hive</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <flink.version>1.14.5</flink.version>
        <hive.version>3.1.2</hive.version>
        <flink.scala.version>2.12</flink.scala.version>
        <hadoop.version>3.2.2</hadoop.version>
        <scala.binary.version>2.12</scala.binary.version>
    </properties>

    <dependencies>
        <!-- Hadoop dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- MapReduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Flink dependencies -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table</artifactId>
            <version>${flink.version}</version>
            <type>pom</type>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- Hive dependency -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Java Compiler -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- We use the maven-shade plugin to create a fat jar that contains all necessary dependencies. -->
            <!-- Change the value of <mainClass>...</mainClass> if your program entry point changes. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.1.1</version>
                <executions>
                    <!-- Run shade goal on package phase -->
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <!-- Merge the META-INF/services files of multiple connectors -->
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>reference.conf</resource>
                                </transformer>
                                <!-- The service transformer is needed to merge META-INF/services files -->
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
                                    <projectName>Apache Flink</projectName>
                                    <encoding>UTF-8</encoding>
                                </transformer>
                            </transformers>
                            <!-- Automatically exclude unused classes to shrink the jar -->
                            <!-- <minimizeJar>true</minimizeJar> -->
                            <artifactSet>
                                <excludes>
                                    <exclude>org.apache.flink:force-shading</exclude>
                                    <exclude>org.slf4j:*</exclude>
                                    <exclude>org.apache.logging.log4j:*</exclude>
                                </excludes>
                            </artifactSet>
                            <filters>
                                <filter>
                                    <!-- Do not copy the signatures in the META-INF folder.
                                         Otherwise, this might cause SecurityExceptions when using the JAR. -->
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>module-info.class</exclude>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
- The code is as follows:
import org.apache.flink.table.api.*;
import org.apache.flink.table.catalog.hive.HiveCatalog;

public class HiveJdbcMain {
    public static void main(String[] args) throws Exception {
        // Run as the "hadoop" user, which has permission to write to HDFS.
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        System.setProperty("HADOOP_USER_PASSWORD", "hadoop");

        // Use the Blink (Alibaba) planner; it is the default in Flink 1.14, so no explicit setting is needed.
        EnvironmentSettings settings = EnvironmentSettings.newInstance()/*.inBatchMode()*/.build();
        // EnvironmentSettings settings = EnvironmentSettings.newInstance()
        //         .useBlinkPlanner()
        //         .inStreamingMode() // streaming mode, or inBatchMode() for batch
        //         .build();

        // Build the table environment.
        TableEnvironment tableEnv = TableEnvironment.create(settings);
        // Set the SQL dialect; different databases have slightly different syntax.
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // Construct the Hive catalog from the constants defined below.
        // Catalog name: a unique identifier for this catalog.
        String NAME = "myhive";
        // Default Hive database name.
        String DEFAULTDATABASE = "default";
        // Path to the directory containing hive-site.xml.
        // String HIVECONFDIRPATH = "/opt/module/hive-3.1.2/conf/"; // location on the Linux server running Flink
        String HIVECONFDIRPATH = "src/main/resources"; // local location
        // Hive version.
        String VERSION = "3.1.2";
        HiveCatalog myHive = new HiveCatalog(NAME, DEFAULTDATABASE, HIVECONFDIRPATH, VERSION);

        // Register the catalog under the given name.
        tableEnv.registerCatalog("myhive", myHive);
        // Use the catalog registered above.
        tableEnv.useCatalog("myhive");

        // Query logic; the Hive database and table must already exist.
        String sql = "select * from default.ems_data";
        Table tableResult1 = tableEnv.sqlQuery(sql);
        tableResult1.execute().print();

        // Alternatively, obtain an iterator over the result and loop through it.
        /*CloseableIterator<Row> collect = tableResult1.execute().collect();
        System.out.println(collect.next());*/

        // Or run executeSql to insert into / update a table.
        /*String executeSql = "insert into table xxxx select * from default.ems_data";
        TableResult tableResult6 = tableEnv.executeSql(executeSql);*/
    }
}
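The query above assumes that default.ems_data already exists in Hive. If it does not, it can also be created from the same program once the Hive dialect and catalog are active, for example right after tableEnv.useCatalog("myhive"). A minimal sketch with a made-up schema (the column names are assumptions, not the real ems_data layout):

// Hypothetical schema for default.ems_data; replace the columns with the real ones.
tableEnv.executeSql(
        "CREATE TABLE IF NOT EXISTS default.ems_data (" +
        "  device_id STRING," +
        "  ts BIGINT," +
        "  metric_value DOUBLE" +
        ") STORED AS ORC");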
- The code and dependencies above went through multiple rounds of debugging and adjustment, and in the end they run successfully.