• elasticsearch-hadoop使用


         elasticsearch-hadoop是一个深度集成Hadoop和ElasticSearch的项目,也是由ES官方维护的一个子项目。它实现了Hadoop和ES之间的输入输出,因此可以在Hadoop里面对ES集群的数据进行读取和写入,充分发挥Map-Reduce并行处理的优势,为Hadoop数据带来实时搜索的可能。 
    项目网址:http://www.elasticsearch.org/overview/hadoop/

    运行环境: 
    CDH4、ElasticSearch0.90.2

    http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Quick-Start/cdh4qs_topic_3_3.html

    https://github.com/medcl/elasticsearch-rtf

    Hive和ES的互操作: 
    #安装,HIVE里面添加ElasticSearch-Hadoop的JAR路径 
    #下载hadoop-es jar包,https://download.elasticsearch.org/hadoop/hadoop-latest.zip

    #Hive加载的JAR路径为本地路径

    [medcl@node-1 ~]$ ls
    elasticsearch-hadoop-1.3.0.M1.jar
    [medcl@node-1 ~]$ pwd
    /home/medcl
    [medcl@node-1 ~]$ hive -hiveconf hive.aux.jars.path=/home/medcl/elasticsearch-hadoop-1.3.0.M1.jar
    Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
    Hive history file=/tmp/medcl/hive_job_log_94db3616-e210-4aab-b07b-6fb159e217ec_1758848920.txt

    #ElasticSearch集群名为"elasticsearch",和Hadoop在一个机器上

    #Hive里面创建一个Table(user),并使用Hadoop-ElasticSearch关联一个索引(/index/user),2个字段,id和name

    CREATE EXTERNAL TABLE user  (id INT, name STRING,site STRING)
    STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
    TBLPROPERTIES('es.resource' = 'index/user/',
                  'es.index.auto.create' = 'true')
    在medcl用户下操作:
    
    CREATE EXTERNAL TABLE user  (id INT, name STRING)
    STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
    TBLPROPERTIES('es.resource' = '/index/user/',
                  'es.index.auto.create' = 'true');
    
    FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask
    hive> CREATE EXTERNAL TABLE user  (id INT, name STRING)
        > STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
        > TBLPROPERTIES('es.resource' = 'medcl/',
        >               'es.index.auto.create' = 'false');
    FAILED: Error in metadata: MetaException(message:Got exception: org.apache.hadoop.security.AccessControlException Permission denied: user=medcl, access=WRITE, inode="/user":hdfs:supergroup:drwxr-xr-x
     
    
    #擦,看下权限
    [medcl@node-1 ~]$ hadoop fs -lsr /
    lsr: DEPRECATED: Please use 'ls -R' instead.
    drwxrwxrwt   - hdfs supergroup          0 2013-12-16 22:19 /tmp
    drwxr-xr-x   - hdfs supergroup          0 2013-12-16 22:25 /user
    drwxr-xr-x   - medcl supergroup          0 2013-12-17 00:30 /user/medcl
    drwxr-xr-x   - medcl supergroup          0 2013-12-16 22:32 /user/medcl/input
    -rw-r--r--   1 medcl supergroup    2801897 2013-12-16 22:32 /user/medcl/input/file1.txt
    drwxr-xr-x   - medcl supergroup          0 2013-12-17 00:30 /user/medcl/lib
    -rw-r--r--   1 medcl supergroup     160414 2013-12-17 00:30 /user/medcl/lib/elasticsearch-hadoop-1.3.0.M1.jar
    drwxr-xr-x   - hdfs  supergroup          0 2013-12-16 22:20 /var
    drwxr-xr-x   - hdfs  supergroup          0 2013-12-16 22:20 /var/lib
    #原来/user目录的属主是hdfs,medcl用户没有写权限。ok,切换到hdfs用户,jar也换个hdfs用户可以访问到的位置,就/tmp吧
    [root@node-1 medcl]# cp elasticsearch-hadoop-1.3.0.M1.jar  /tmp/
    [root@node-1 medcl]# ^C
    [root@node-1 medcl]# sudo -u hdfs hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
    Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
    Hive history file=/tmp/hdfs/hive_job_log_bdad4d7a-f929-43d7-a56e-e026fdd7e3b4_1219802521.txt
    hive> CREATE EXTERNAL TABLE user  (id INT, name STRING)
        > STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
        > TBLPROPERTIES('es.resource' = '/index/user/',
        >               'es.index.auto.create' = 'false');
    2013-12-16 17:09:29.560 GMT Thread[main,5,main] java.io.FileNotFoundException: derby.log (Permission denied)
    ----------------------------------------------------------------
    2013-12-16 17:09:29.877 GMT:
     Booting Derby version The Apache Software Foundation - Apache Derby - 10.4.2.0 - (689064): instance a816c00e-0142-fc62-4b5c-000000cec758
    on database directory /var/lib/hive/metastore/metastore_db in READ ONLY mode 
     
    Database Class Loader started - derby.database.classpath=''
    FAILED: Error in metadata: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.metastore.HiveMetaStoreClient
    FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask
     
    #ok,干掉lock
    [root@node-1 ~]# ls /var/lib/hive/metastore/metastore_db
    dbex.lck  db.lck  log  seg0  service.properties  tmp
    [root@node-1 ~]# rm /var/lib/hive/metastore/metastore_db/dbex.lck 
    rm: remove regular file `/var/lib/hive/metastore/metastore_db/dbex.lck'? y
    [root@node-1 ~]# rm /var/lib/hive/metastore/metastore_db/db.lck 
    rm: remove regular file `/var/lib/hive/metastore/metastore_db/db.lck'? y
     
    #另外忘记关另外一个hive实例了,难怪呢。
    [root@node-1 tmp]# ps -aux|grep hive
    Warning: bad syntax, perhaps a bogus '-'? See /usr/share/doc/procps-3.2.8/FAQ
    root     10855  0.0  0.1 148024  2064 pts/0    S+   01:09   0:00 sudo -u hdfs hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
    hdfs     10856  1.8  5.7 858344 109892 pts/0   Sl+  01:09   0:06 /usr/lib/jvm/java-openjdk/bin/java -Xmx256m -Dhadoop.log.dir=/usr/lib/hadoop/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/usr/lib/hadoop -Dhadoop.id.str= -Dhadoop.root.logger=INFO,console -Djava.library.path=/usr/lib/hadoop/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,NullAppender org.apache.hadoop.util.RunJar /usr/lib/hive/lib/hive-cli-0.10.0-cdh4.5.0.jar org.apache.hadoop.hive.cli.CliDriver -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
     
     
    #权限问题
    [root@node-1 tmp]# ll /var/lib/hive/metastore/metastore_db/
    total 16
    drwxrwxr-x 2 medcl medcl 4096 Dec 17 00:56 log
    drwxrwxr-x 2 medcl medcl 4096 Dec 17 00:56 seg0
    -rw-rw-r-- 1 medcl medcl  860 Dec 17 00:56 service.properties
    drwxrwxr-x 2 medcl medcl 4096 Dec 17 01:01 tmp
    [root@node-1 tmp]# sudo -u hdfs hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar^C
    [root@node-1 tmp]# chmod 777 /var/lib/hive/metastore/metastore_db/ -R
    [root@node-1 tmp]# sudo -u hdfs hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
    Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
    Hive history file=/tmp/hdfs/hive_job_log_d5749cb0-fde0-4da2-9094-c85cf4673885_252074310.txt
    hive> show tables;
    OK
    Time taken: 6.934 seconds
    hive> CREATE EXTERNAL TABLE user  (id INT, name STRING)
        > STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
        > TBLPROPERTIES('es.resource' = '/index/user/',
        >               'es.index.auto.create' = 'true');
    OK
    Time taken: 1.115 seconds
     
    #ok,创建成功了
    hive> show tables;
    OK
    user
    Time taken: 0.15 seconds
    hive> 
     
    #权限问题是Hive默认仓库路径造成的,生疏了
    [root@node-1 tmp]# sudo su hdfs
    bash-4.1$ hadoop fs -lsr /
    lsr: DEPRECATED: Please use 'ls -R' instead.
    drwxrwxrwt   - hdfs supergroup          0 2013-12-16 22:19 /tmp
    drwxr-xr-x   - hdfs supergroup          0 2013-12-17 01:20 /user
    drwxr-xr-x   - hdfs  supergroup          0 2013-12-17 01:20 /user/hive
    drwxr-xr-x   - hdfs  supergroup          0 2013-12-17 01:20 /user/hive/warehouse
    drwxr-xr-x   - hdfs  supergroup          0 2013-12-17 01:20 /user/hive/warehouse/user
     
    #好了,开始往HIVE里面导数据了,先来几行数据
    [root@node-1 tmp]# cat files1.txt 
    1,medcl
    2,lcdem
    3,tom
    4,jack
     
    #传上去
    [root@node-1 tmp]# sudo su hdfs
    bash-4.1$ hadoop fs -put files1.txt /tmp/
    bash-4.1$ hadoop fs -ls /tmp/
    Found 1 items
    -rw-r--r--   1 hdfs supergroup         29 2013-12-17 01:28 /tmp/files1.txt
     
    #加载到Hive里面
    hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
    #LOAD DATA LOCAL INPATH '/tmp/files1.txt' OVERWRITE INTO TABLE user_source; 
    #CREATE EXTERNAL TABLE user_source  (id INT, name STRING);
     
    #user表不是Hive原生表(non-native,由ES的StorageHandler管理),不能直接LOAD,需要先建一个普通的源表
    bash-4.1$ hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
    Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
    Hive history file=/tmp/hdfs/hive_job_log_a9516f87-6e2d-44db-9d38-18eed77d9dec_1583221137.txt
    hive> LOAD DATA LOCAL INPATH '/tmp/files1.txt' OVERWRITE INTO TABLE user; 
    FAILED: SemanticException [Error 10101]: A non-native table cannot be used as target for LOAD
    hive> CREATE EXTERNAL TABLE user_source  (id INT, name STRING);
    OK
    Time taken: 1.104 seconds
    hive> LOAD DATA LOCAL INPATH '/tmp/files1.txt' OVERWRITE INTO TABLE user_source; 
    Copying data from file:/tmp/files1.txt
    Copying file: file:/tmp/files1.txt
    Loading data to table default.user_source
    Table default.user_source stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 29, raw_data_size: 0]
    OK
    Time taken: 0.911 seconds
    hive> show tables;
    OK
    user
    user_source
    Time taken: 0.226 seconds
     
    #下面这个错误是因为es-hadoop的jar文件没有传到HDFS上面,看来本地和HDFS都要上传,并且路径要一致
    hive> select id,name from  user_source;
    Total MapReduce jobs = 1
    Launching Job 1 out of 1
    Number of reduce tasks is set to 0 since there's no reduce operator
    java.io.FileNotFoundException: File does not exist: /tmp/elasticsearch-hadoop-1.3.0.M1.jar
      at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:824)
      at org.apache.hadoop.filecache.DistributedCache.getFileStatus(DistributedCache.java:185)
      at org.apache.hadoop.filecache.TrackerDistributedCacheManager.determineTimestamps(TrackerDistributedCacheManager.java:821)
      at org.apache.hadoop.filecache.TrackerDistributedCacheManager.determineTimestampsAndCacheVisibilities(TrackerDistributedCacheManager.java:778)
      at org.apache.hadoop.mapred.JobClient.copyAndConfigureFiles(JobClient.java:855)
      at org.apache.hadoop.mapred.JobClient.copyAndConfigureFiles(JobClient.java:746)
      at org.apache.hadoop.mapred.JobClient.access$400(JobClient.java:177)
      at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:963)
      at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:948)
      at java.security.AccessController.doPrivileged(Native Method)
      at javax.security.auth.Subject.doAs(Subject.java:415)
      at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
      at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:948)
      at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:922)
      at org.apache.hadoop.hive.ql.exec.ExecDriver.execute(ExecDriver.java:448)
      at org.apache.hadoop.hive.ql.exec.MapRedTask.execute(MapRedTask.java:138)
      at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:138)
      at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:66)
      at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1383)
      at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1169)
      at org.apache.hadoop.hive.ql.Driver.run(Driver.java:982)
      at org.apache.hadoop.hive.ql.Driver.run(Driver.java:902)
      at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:259)
      at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:216)
      at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:412)
      at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:759)
      at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:613)
      at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
      at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
      at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
      at java.lang.reflect.Method.invoke(Method.java:606)
      at org.apache.hadoop.util.RunJar.main(RunJar.java:208)
    Job Submission failed with exception 'java.io.FileNotFoundException(File does not exist: /tmp/elasticsearch-hadoop-1.3.0.M1.jar)'
    FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MapRedTask
     
     
    #ok,再看看
    bash-4.1$ hadoop fs -put elasticsearch-hadoop-1.3.0.M1.jar  /tmp/
    bash-4.1$ hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
    Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
    Hive history file=/tmp/hdfs/hive_job_log_28ea1fbc-dc3b-4e62-9f47-1a88eed30069_1310993479.txt
    hive> select id,name from  user_source;
    Total MapReduce jobs = 1
    Launching Job 1 out of 1
    Number of reduce tasks is set to 0 since there's no reduce operator
    Starting Job = job_201312162220_0004, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201312162220_0004
    Kill Command = /usr/lib/hadoop/bin/hadoop job  -kill job_201312162220_0004
    Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
    2013-12-17 01:36:28,086 Stage-1 map = 0%,  reduce = 0%
    2013-12-17 01:36:34,141 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.88 sec
    2013-12-17 01:36:35,162 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.88 sec
    2013-12-17 01:36:36,177 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.88 sec
    2013-12-17 01:36:37,184 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.88 sec
    2013-12-17 01:36:38,204 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 0.88 sec
    MapReduce Total cumulative CPU time: 880 msec
    Ended Job = job_201312162220_0004
    MapReduce Jobs Launched: 
    Job 0: Map: 1   Cumulative CPU: 0.88 sec   HDFS Read: 247 HDFS Write: 24 SUCCESS
    Total MapReduce CPU Time Spent: 880 msec
    OK
    NULL    NULL
    NULL    NULL
    NULL    NULL
    NULL    NULL
    Time taken: 25.999 seconds
     
    #慢,数据怎么是空的?建表时建成了外部表(EXTERNAL),而且没有指定字段分隔符(文件是逗号分隔的),所以解析出来全是NULL,好纠结
    hive> drop table user_source;                                                                        
    OK
    Time taken: 0.649 seconds
    hive> CREATE TABLE user_source  (id INT, name STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
    OK
    Time taken: 0.109 seconds
    hive> LOAD DATA LOCAL INPATH '/tmp/files1.txt' INTO TABLE user_source;                              
    Copying data from file:/tmp/files1.txt
    Copying file: file:/tmp/files1.txt
    Loading data to table default.user_source
    Table default.user_source stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 29, raw_data_size: 0]
    OK
    Time taken: 0.348 seconds
    hive> select * from  user_source;                                                                   
    OK
    1    medcl
    2    lcdem
    3    tom
    4    jack
    Time taken: 0.155 seconds
     
    #源表现在有了,导入到ES所在的表里面去
     
    hive> INSERT OVERWRITE TABLE user
        > SELECT s.id, s.name FROM user_source s;
    Total MapReduce jobs = 1
    Launching Job 1 out of 1
    Number of reduce tasks is set to 0 since there's no reduce operator
    Starting Job = job_201312162220_0005, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201312162220_0005
    Kill Command = /usr/lib/hadoop/bin/hadoop job  -kill job_201312162220_0005
    Hadoop job information for Stage-0: number of mappers: 1; number of reducers: 0
    2013-12-17 01:50:52,141 Stage-0 map = 0%,  reduce = 0%
    2013-12-17 01:51:03,220 Stage-0 map = 100%,  reduce = 0%, Cumulative CPU 1.16 sec
    2013-12-17 01:51:04,243 Stage-0 map = 100%,  reduce = 0%, Cumulative CPU 1.16 sec
    2013-12-17 01:51:05,254 Stage-0 map = 100%,  reduce = 0%, Cumulative CPU 1.16 sec
    2013-12-17 01:51:06,266 Stage-0 map = 100%,  reduce = 0%, Cumulative CPU 1.16 sec
    2013-12-17 01:51:07,294 Stage-0 map = 100%,  reduce = 100%, Cumulative CPU 1.16 sec
    MapReduce Total cumulative CPU time: 1 seconds 160 msec
    Ended Job = job_201312162220_0005
    4 Rows loaded to user
    MapReduce Jobs Launched: 
    Job 0: Map: 1   Cumulative CPU: 1.16 sec   HDFS Read: 247 HDFS Write: 0 SUCCESS
    Total MapReduce CPU Time Spent: 1 seconds 160 msec
    OK
    Time taken: 21.849 seconds
    hive> select * from user;
    OK
    Failed with exception java.io.IOException:java.lang.IllegalStateException: [GET] on [/index/user/&search_type=scan&scroll=10m&size=50&preference=_shards:4;_only_node:MP7Zl3owTRm8O2V6cWvOSg] failed; server[http://10.0.2.15:9200] returned [{"_index":"index","_type":"user","_id":"&search_type=scan&scroll=10m&size=50&preference=_shards:4;_only_node:MP7Zl3owTRm8O2V6cWvOSg","exists":false}]
    Time taken: 0.387 seconds
     
    #可以看出来elasticsearch-hadoop生成的查询请求好像有问题(请求路径里缺少了_search?,整串参数被ES当成了文档ID)!不过elasticsearch里面已经有数据了,反正暂时不需要用hive来执行查询,先给官方提个issue吧。
     
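    #ES查询结果(注意:下面的URL没有加引号,shell把&当成了后台执行符,所以curl被放到后台运行,pretty=true也没有生效;正确写法是 curl 'localhost:9200/index/user/_search?q=*&pretty=true')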
    bash-4.1$ curl localhost:9200/index/user/_search?q=*&pretty=true
    [1] 13588
    bash-4.1$ {"took":3,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":4,"max_score":1.0,"hits":[{"_index":"index","_type":"user","_id":"3x4bEcriRvS6AHkX2Sb7UA","_score":1.0, "_source" : {"id":2,"name":"lcdem"}},{"_index":"index","_type":"user","_id":"_3rGVWhaTSCixYxRzBUSLQ","_score":1.0, "_source" : {"id":4,"name":"jack"}},{"_index":"index","_type":"user","_id":"T-Q_icjgR8ehsH3IV-twWw","_score":1.0, "_source" : {"id":1,"name":"medcl"}},{"_index":"index","_type":"user","_id":"Vdz0sryBT5u0e9hfoMY8Tg","_score":1.0, "_source" : {"id":3,"name":"tom"}}]}}
    #接下来试试大量数据bulk导入的性能,是不是真的做到data locality。
    
    

    elasticsearch-hadoop下载地址:https://github.com/elastic/elasticsearch-hadoop

  • 相关阅读:
    装箱、拆箱操作发生在
    @Data的注解使用以及在IDEA上安装
    Mysql中 BLOB字段转String的方法
    不属于java语言鲁棒性特点的是
    java object默认的基本方法
    哪个类可用于处理 Unicode?
    类和接口的继承
    抽象类的叙述:
    Hashtable 和 HashMap 的区别是:
    编程之美初赛第一场--焦距
  • 原文地址:https://www.cnblogs.com/momoyan/p/9179561.html
Copyright © 2020-2023  润新知