• hadoop2.8_hive_sqoop_spark_impala_cdh_centos7.3


    Reference: https://blog.csdn.net/jssg_tzw/article/details/70314184
    # Environment (note: Impala does not yet seem to support Hive 2.0)
    centos7.3_x86_64
    hadoop-2.8.0
    hbase-1.2.6
    zookeeper-3.4.10
    mysql-community-server-5.7.22-1.el7 & client & common & libs
    mysql-connector-java-5.1.41
    apache-hive-2.1.1
    sqoop-1.4.7.bin__hadoop-2.6.0
    spark-2.1.1-bin-hadoop2.7
    impala-2.6.0+cdh5.8.3 (not configured yet)

    # Web consoles
    # Hadoop overview console
    http://192.168.92.241:50070/
    # Hadoop cluster console
    http://192.168.92.241:8088/
    # HBase console
    http://192.168.92.241:16030/
    # Databases and tables created by Hive
    http://192.168.92.241:50070/explorer.html#/user/hive/warehouse/db_hive_edu.db
    # Spark cluster console
    http://192.168.92.241:8080/
    # Kudu console, for viewing tables created from impala-shell
    http://192.168.92.241:8051/

    # Part 1: Configure Hadoop
    # 1. Prepare the host nodes
    # Change the hostname (on nodes 2 and 3)
    vi /etc/hostname
    hadoop2

    # Disable the firewall
    systemctl stop firewalld.service
    systemctl disable firewalld.service
    systemctl is-enabled firewalld.service
    systemctl status firewalld.service

    cat >> /etc/hosts <<EOF
    192.168.92.241 node1
    192.168.92.242 node2
    192.168.92.243 node3
    EOF

    # Generate SSH key files on all 3 machines
    ls /root/.ssh
    # Run on all 3 nodes **************************
    ssh-keygen -t rsa -P ''
    # Run on the namenode
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
    #cat ~/.ssh/id_rsa.pub|ssh hadoop1 'sh -c "cat - >>~/.ssh/authorized_keys"'
    #cat ~/.ssh/id_rsa.pub|ssh hadoop2 'sh -c "cat - >>~/.ssh/authorized_keys"'
    ssh node2 cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
    ssh node3 cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys

    # Copy the aggregated authorized_keys to the datanodes
    scp ~/.ssh/authorized_keys root@node2:~/.ssh/authorized_keys
    scp ~/.ssh/authorized_keys root@node3:~/.ssh/authorized_keys

    # Verify passwordless SSH
    ssh node1 date
    ssh node2 date
    ssh node3 date
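    # A simpler alternative sketch (assuming ssh-copy-id from openssh-clients is available): run the loop
    # on each of the three nodes so every node's key ends up in every authorized_keys, giving the same full mesh.
    for h in node1 node2 node3; do ssh-copy-id root@$h; done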

    # 2. Install Java SE & Hadoop on all 3 nodes
    # Install Java SE
    mkdir -p /opt/software
    mkdir -p /opt/java
    cd /opt/software
    # Upload jdk-8u171-linux-x64.tar.gz to /opt/software
    tar -zxvf jdk-8u171-linux-x64.tar.gz -C /opt/java/

    scp -r /opt/java/ node2:/opt/
    scp -r /opt/java/ node3:/opt/

    # Create the directories referenced by the environment variables below
    mkdir -p /opt/hadoop/hadoop-2.8.0
    mkdir -p /opt/hive/apache-hive-2.1.1-bin
    mkdir -p /opt/sqoop/sqoop-1.4.7.bin__hadoop-2.6.0
    mkdir -p /opt/hbase/hbase-1.2.6
    mkdir -p /opt/zookeeper/zookeeper-3.4.10

    cat >> /etc/profile << 'EOF'
    export JAVA_HOME=/opt/java/jdk1.8.0_171
    export HADOOP_HOME=/opt/hadoop/hadoop-2.8.0
    export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
    export HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_HOME}/lib/native
    export HADOOP_OPTS="-Djava.library.path=${HADOOP_HOME}/lib"
    export HIVE_HOME=/opt/hive/apache-hive-2.1.1-bin
    export HIVE_CONF_DIR=${HIVE_HOME}/conf
    export SQOOP_HOME=/opt/sqoop/sqoop-1.4.7.bin__hadoop-2.6.0
    export HBASE_HOME=/opt/hbase/hbase-1.2.6
    export ZK_HOME=/opt/zookeeper/zookeeper-3.4.10
    export CLASSPATH=.:${JAVA_HOME}/lib:${HIVE_HOME}/lib:$CLASSPATH
    export PATH=.:${JAVA_HOME}/bin:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${HIVE_HOME}/bin:${SQOOP_HOME}/bin:${HBASE_HOME}/bin:${ZK_HOME}/bin:$PATH
    EOF

    # Apply the environment variables
    source /etc/profile
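    # Quick sanity check that the JDK is on the PATH (the jdk1.8.0_171 directory name above is assumed to match what the tarball extracts to):
    java -version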

    mkdir -p /opt/hadoop
    mkdir /root/hadoop
    mkdir /root/hadoop/tmp
    mkdir /root/hadoop/var
    mkdir /root/hadoop/dfs
    mkdir /root/hadoop/dfs/name
    mkdir /root/hadoop/dfs/data

    tar -zxvf hadoop-2.8.0.tar.gz -C /opt/hadoop

    cd /opt/hadoop/hadoop-2.8.0/etc/hadoop

    3.2.3.1 Add the following inside the <configuration> element:
    vi /opt/hadoop/hadoop-2.8.0/etc/hadoop/core-site.xml

    <property>
    <name>hadoop.tmp.dir</name>
    <value>/root/hadoop/tmp</value>
    <description>Abase for other temporary directories.</description>
    </property>
    <property>
    <name>fs.default.name</name>
    <value>hdfs://node1:9000</value>
    </property>


    3.2.3.2 Change export JAVA_HOME=${JAVA_HOME} to:
    vi /opt/hadoop/hadoop-2.8.0/etc/hadoop/hadoop-env.sh

    export JAVA_HOME=/opt/java/jdk1.8.0_171

    3.2.3.3 Add the following inside the <configuration> element:
    # Note: setting dfs.permissions to false lets files be written to DFS without permission checks. Convenient, but it removes protection against accidental deletion; set it to true, or simply delete the property (true is the default).
    vi /opt/hadoop/hadoop-2.8.0/etc/hadoop/hdfs-site.xml

    <property>
    <name>dfs.name.dir</name>
    <value>/root/hadoop/dfs/name</value>
    <description>Path on the local filesystem where theNameNode stores the namespace and transactions logs persistently.</description>
    </property>
    <property>
    <name>dfs.data.dir</name>
    <value>/root/hadoop/dfs/data</value>
    <description>Comma separated list of paths on the localfilesystem of a DataNode where it should store its blocks.</description>
    </property>
    <property>
    <name>dfs.replication</name>
    <value>2</value>
    </property>
    <property>
    <name>dfs.permissions</name>
    <value>false</value>
    <description>need not permissions</description>
    </property>

    3.2.3.4 Create and edit mapred-site.xml

    cp /opt/hadoop/hadoop-2.8.0/etc/hadoop/mapred-site.xml.template /opt/hadoop/hadoop-2.8.0/etc/hadoop/mapred-site.xml

    # Add the following inside the <configuration> element:
    vi /opt/hadoop/hadoop-2.8.0/etc/hadoop/mapred-site.xml

    <property>
    <name>mapred.job.tracker</name>
    <value>node1:49001</value>
    </property>
    <property>
    <name>mapred.local.dir</name>
    <value>/root/hadoop/var</value>
    </property>
    <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
    </property>


    3.2.3.5 Edit the slaves file
    # Remove localhost and add the following:
    vi /opt/hadoop/hadoop-2.8.0/etc/hadoop/slaves

    node2
    node3

    3.2.3.6 Edit yarn-site.xml
    # Add the following inside the <configuration> element (set the memory values as high as the machines allow):
    # Note: yarn.nodemanager.vmem-check-enabled=false disables the virtual-memory check. This is very useful when running on virtual machines and avoids problems later; on physical machines with plenty of memory, the setting can be removed.
    vi /opt/hadoop/hadoop-2.8.0/etc/hadoop/yarn-site.xml

    <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>node1</value>
    </property>
    <property>
    <description>The address of the applications manager interface in the RM.</description>
    <name>yarn.resourcemanager.address</name>
    <value>${yarn.resourcemanager.hostname}:8032</value>
    </property>
    <property>
    <description>The address of the scheduler interface.</description>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>${yarn.resourcemanager.hostname}:8030</value>
    </property>
    <property>
    <description>The http address of the RM web application.</description>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>${yarn.resourcemanager.hostname}:8088</value>
    </property>
    <property>
    <description>The https adddress of the RM web application.</description>
    <name>yarn.resourcemanager.webapp.https.address</name>
    <value>${yarn.resourcemanager.hostname}:8090</value>
    </property>
    <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>${yarn.resourcemanager.hostname}:8031</value>
    </property>
    <property>
    <description>The address of the RM admin interface.</description>
    <name>yarn.resourcemanager.admin.address</name>
    <value>${yarn.resourcemanager.hostname}:8033</value>
    </property>
    <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
    </property>
    <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>1024</value>
    <description>Available memory per node in MB; the default is 8192 MB</description>
    </property>
    <property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>2.1</value>
    </property>
    <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>1024</value>
    </property>
    <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
    </property>
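    # Optional sketch: check that the edited config files are still well-formed XML (xmllint comes from the libxml2 package, which may need to be installed):
    cd /opt/hadoop/hadoop-2.8.0/etc/hadoop
    for f in core-site.xml hdfs-site.xml mapred-site.xml yarn-site.xml; do xmllint --noout $f && echo "$f OK"; done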

    # Format the namenode (run on the namenode only)
    cd /opt/hadoop/hadoop-2.8.0/bin
    /opt/hadoop/hadoop-2.8.0/bin/hadoop namenode -format

    cd /opt/hadoop
    scp -r /opt/hadoop root@node2:/opt
    scp -r /opt/hadoop root@node3:/opt

    # Start Hadoop
    /opt/hadoop/hadoop-2.8.0/sbin/start-all.sh

    # The output looks like:
    This script is Deprecated. Instead use start-dfs.sh and start-yarn.sh
    21/01/18 22:56:53 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    Starting namenodes on [node1]
    node1: starting namenode, logging to /opt/hadoop/hadoop-2.8.0/logs/hadoop-root-namenode-node1.out
    node2: starting datanode, logging to /opt/hadoop/hadoop-2.8.0/logs/hadoop-root-datanode-node2.out
    node3: starting datanode, logging to /opt/hadoop/hadoop-2.8.0/logs/hadoop-root-datanode-node3.out
    Starting secondary namenodes [0.0.0.0]
    The authenticity of host '0.0.0.0 (0.0.0.0)' can't be established.
    ECDSA key fingerprint is 27:e4:1d:35:19:f4:f8:58:57:07:4c:0b:97:42:19:25.
    Are you sure you want to continue connecting (yes/no)? yes
    0.0.0.0: Warning: Permanently added '0.0.0.0' (ECDSA) to the list of known hosts.
    0.0.0.0: starting secondarynamenode, logging to /opt/hadoop/hadoop-2.8.0/logs/hadoop-root-secondarynamenode-node1.out
    21/01/18 22:57:22 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    starting yarn daemons
    starting resourcemanager, logging to /opt/hadoop/hadoop-2.8.0/logs/yarn-root-resourcemanager-node1.out
    node3: starting nodemanager, logging to /opt/hadoop/hadoop-2.8.0/logs/yarn-root-nodemanager-node3.out
    node2: starting nodemanager, logging to /opt/hadoop/hadoop-2.8.0/logs/yarn-root-nodemanager-node2.out

    #3228 -- main class information unavailable
    jps

    47520 Jps
    47259 SecondaryNameNode
    3228 -- main class information unavailable
    47054 NameNode
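    # The datanode-side daemons can be checked the same way from node1 (DataNode and NodeManager are expected in the output):
    ssh node2 jps
    ssh node3 jps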

    # Test Hadoop
    # Overview page (did not open here) *********************************************
    http://192.168.92.241:50070/
    # Cluster page
    http://192.168.92.241:8088/
    ###########################################
    # Part 2: Configure ZooKeeper
    cd /opt/software
    mkdir -p /opt/zookeeper
    tar -zxvf zookeeper-3.4.10.tar.gz -C /opt/zookeeper
    cp /opt/zookeeper/zookeeper-3.4.10/conf/zoo_sample.cfg /opt/zookeeper/zookeeper-3.4.10/conf/zoo.cfg

    mkdir /opt/zookeeper/data
    mkdir /opt/zookeeper/dataLog

    cat >> /opt/zookeeper/data/myid << EOF
    1
    EOF

    cat >> /opt/zookeeper/zookeeper-3.4.10/conf/zoo.cfg << EOF
    dataDir=/opt/zookeeper/data
    dataLogDir=/opt/zookeeper/dataLog
    server.1=node1:2888:3888
    server.2=node2:2888:3888
    server.3=node3:2888:3888
    EOF

    scp -r /opt/zookeeper node2:/opt
    scp -r /opt/zookeeper node3:/opt

    # On node2 and node3, edit the myid file so its contents are 2 and 3 respectively, for example:
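    # (relies on the passwordless SSH set up earlier and the paths used above)
    ssh node2 "echo 2 > /opt/zookeeper/data/myid"
    ssh node3 "echo 3 > /opt/zookeeper/data/myid"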

    # Start ZooKeeper on every node
    /opt/zookeeper/zookeeper-3.4.10/bin/zkServer.sh start
    # Check the ZooKeeper status
    /opt/zookeeper/zookeeper-3.4.10/bin/zkServer.sh status
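    # Once all three servers are up, one node should report "Mode: leader" and the other two "Mode: follower".
    # A quick client-side check (zkCli.sh ships with ZooKeeper):
    /opt/zookeeper/zookeeper-3.4.10/bin/zkCli.sh -server node1:2181 ls /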

    # Set ZooKeeper to start at boot (optional); run on node1, node2, and node3
    cat >> /etc/rc.d/init.d/zookeeper << 'EOF'
    #!/bin/bash
    #chkconfig: 2345 10 90
    #description: service zookeeper
    export JAVA_HOME=/opt/java/jdk1.8.0_171
    export ZOO_LOG_DIR=/opt/zookeeper/log
    ZOOKEEPER_HOME=/opt/zookeeper/zookeeper-3.4.10
    case "$1" in
    start) su root ${ZOOKEEPER_HOME}/bin/zkServer.sh start;;
    start-foreground) su root ${ZOOKEEPER_HOME}/bin/zkServer.sh start-foreground;;
    stop) su root ${ZOOKEEPER_HOME}/bin/zkServer.sh stop;;
    status) su root ${ZOOKEEPER_HOME}/bin/zkServer.sh status;;
    restart) su root ${ZOOKEEPER_HOME}/bin/zkServer.sh restart;;
    upgrade)su root ${ZOOKEEPER_HOME}/bin/zkServer.sh upgrade;;
    print-cmd)su root ${ZOOKEEPER_HOME}/bin/zkServer.sh print-cmd;;
    *) echo "require start|start-foreground|stop|status|restart|upgrade|print-cmd";;
    esac
    EOF

    chmod +x /etc/rc.d/init.d/zookeeper
    chkconfig --add zookeeper
    chkconfig --list zookeeper
    # After a datanode host reboots, restart all Hadoop processes from the namenode so the datanode's daemons come back up
    reboot
    # After the reboot, check whether ZooKeeper started at boot
    service zookeeper status
    lsof -i:2181
    netstat -lntup

    ###########################################
    # Part 3: Configure HBase
    cd /opt/software
    mkdir -p /opt/hbase
    tar -zxvf hbase-1.2.6-bin.tar.gz -C /opt/hbase

    cd /opt/hbase/hbase-1.2.6/conf

    cat >> /opt/hbase/hbase-1.2.6/conf/hbase-env.sh << EOF
    export JAVA_HOME=/opt/java/jdk1.8.0_171
    export HADOOP_HOME=/opt/hadoop/hadoop-2.8.0
    export HBASE_HOME=/opt/hbase/hbase-1.2.6
    export HBASE_CLASSPATH=/opt/hadoop/hadoop-2.8.0/etc/hadoop
    export HBASE_PID_DIR=/root/hbase/pids
    export HBASE_MANAGES_ZK=false
    EOF

    # Note: not yet executed on node2 and node3
    mkdir /root/hbase
    mkdir /root/hbase/tmp
    mkdir /root/hbase/pids

    # Add the following inside the <configuration> element:
    vi /opt/hbase/hbase-1.2.6/conf/hbase-site.xml
    <property>
    <name>hbase.rootdir</name>
    <value>hdfs://node1:9000/hbase</value>
    <description>The directory shared byregion servers.</description>
    </property>
    <property>
    <name>hbase.zookeeper.property.clientPort</name>
    <value>2181</value>
    <description>Property from ZooKeeper'sconfig zoo.cfg. The port at which the clients will connect.
    </description>
    </property>
    <property>
    <name>zookeeper.session.timeout</name>
    <value>120000</value>
    </property>
    <property>
    <name>hbase.zookeeper.quorum</name>
    <value>node1,node2,node3</value>
    </property>
    <property>
    <name>hbase.tmp.dir</name>
    <value>/root/hbase/tmp</value>
    </property>
    <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
    </property>

    cat > /opt/hbase/hbase-1.2.6/conf/regionservers << EOF
    node1
    node2
    node3
    EOF

    # Copy HBase to the other nodes (node2 and node3)
    scp -r /opt/hbase node2:/opt/
    scp -r /opt/hbase node3:/opt/

    HBase runs on top of Hadoop and depends on ZooKeeper; the configuration above points it at our own ZooKeeper cluster.
    HBase can be installed only on a namenode or on every Hadoop node, but it only needs to be started from a single node.

    /opt/hbase/hbase-1.2.6/bin/start-hbase.sh

    # HBase web UI
    http://192.168.92.241:16030/

    # Open the HBase shell
    /opt/hbase/hbase-1.2.6/bin/hbase shell
    status
    exit
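    # A minimal smoke test to run inside the HBase shell (the table and column family names are arbitrary examples):
    # create 't_test', 'cf'
    # put 't_test', 'row1', 'cf:a', 'value1'
    # scan 't_test'
    # disable 't_test'
    # drop 't_test'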

    ###########################################
    # Part 4: Configure Hive
    # Install MySQL
    cd /opt/software/
    rpm -qa | grep mariadb
    yum remove mariadb-libs -y
    # or: rpm -e mariadb-libs-5.5.44-2.el7.centos.x86_64 --nodeps

    2. Download the latest RPM bundle from the MySQL site: mysql-5.7.18-1.el7.x86_64.rpm-bundle.tar
    # Extract the rpm-bundle.tar; only the following packages actually need to be installed
    rpm -ivh mysql-community-common-5.7.22-1.el7.x86_64.rpm
    rpm -ivh mysql-community-libs-5.7.22-1.el7.x86_64.rpm
    rpm -ivh mysql-community-client-5.7.22-1.el7.x86_64.rpm
    rpm -ivh mysql-community-server-5.7.22-1.el7.x86_64.rpm

    When MySQL is installed from the RPMs, the relevant paths are:
    # Data directory
    /var/lib/mysql/
    # Configuration files
    /usr/share/mysql (the mysql.server script and config templates)
    # Commands
    /usr/bin (mysqladmin, mysqldump, etc.)
    # Startup script
    /etc/rc.d/init.d/ (directory containing the mysql startup script)
    /etc/my.cnf

    /* To make sure the data directory and its files are owned by the mysql user, run the initialization below
    if you run the mysql service as root; if it runs as the mysql user, the --user option can be dropped.
    --initialize runs in "secure" mode: it generates a password for root and marks it expired, so you must set
    a new password after the first login. --initialize-insecure skips that and generates no root password.
    --initialize is used here, so a temporary root password is generated and written to the log file.
    */
    mysqld --initialize --user=mysql

    # The temporary MySQL password appears near the end of the log, e.g.: A temporary password is generated for root@localhost: %kWTz,Ml?3Zs
    cat /var/log/mysqld.log

    systemctl start mysqld.service

    # Stopping, starting, and restarting MySQL:
    # Start:
    service mysqld start
    /etc/init.d/mysqld start
    mysqld_safe &
    # Stop:
    service mysqld stop
    /etc/init.d/mysqld stop
    mysqladmin shutdown
    # Restart:
    service mysqld restart
    /etc/init.d/mysqld restart

    # Connect to the database
    mysql -u root -p
    Enter password:
    # Enter the temporary password: %kWTz,Ml?3Zs

    # Change the password:
    ALTER USER 'root'@'localhost' IDENTIFIED BY 'mysql';
    # Grant privileges so that root can log in from any host, including via the node1 hostname (needed for the Hive initialization)
    grant all privileges on *.* to root@'%' identified by 'mysql' with grant option;
    flush privileges;
    use mysql;
    select host,user from user;

    # Create a user test that is only allowed to log in locally (from localhost)
    grant select,update,insert,delete on *.* to test@localhost identified by "mysql";
    flush privileges;

    # To allow the test user to log in remotely as well, additionally run (or use a GRANT):
    update user set host = '%' where user = 'test';
    # Check the mysql system database again:
    select host,user from user;
    # Flush privileges so the server reloads the grant tables and allows the remote connection
    flush privileges;

    # The grant tables are reloaded automatically when you use GRANT and REVOKE, but not when you modify them directly, so the simpler approach is:
    grant select,update,insert,delete on mas.* to test@'%' identified by "mysql";

    #########################
    tar -zxvf apache-hive-2.1.1-bin.tar.gz -C /opt/hive
    cd /opt/hive/apache-hive-2.1.1-bin/conf
    cp hive-default.xml.template hive-site.xml
    # Create the HDFS directories
    $HADOOP_HOME/bin/hadoop fs -mkdir -p /user/hive/warehouse
    $HADOOP_HOME/bin/hadoop fs -ls /user/hive/warehouse

    # Warning: 18/04/23 07:02:50 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    # Fix:
    cat >> /etc/profile << EOF
    export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native
    EOF

    # If you get: Failed to connect to server: node1/192.168.92.241:9000: try once and fail.
    /opt/hadoop/hadoop-2.8.0/sbin/stop-all.sh
    /opt/hadoop/hadoop-2.8.0/bin/hadoop namenode -format
    /opt/hadoop/hadoop-2.8.0/sbin/start-all.sh
    $HADOOP_HOME/bin/hadoop fs -mkdir -p /user/hive/warehouse

    $HADOOP_HOME/bin/hadoop fs -chmod 777 /user/hive/warehouse
    $HADOOP_HOME/bin/hadoop fs -mkdir -p /tmp/hive/
    $HADOOP_HOME/bin/hadoop fs -chmod 777 /tmp/hive
    # Verify the HDFS directory was created
    $HADOOP_HOME/bin/hadoop fs -ls /user/hive/
    # Verify that /tmp/hive was created
    $HADOOP_HOME/bin/hadoop fs -ls /tmp/

    # Update the temporary directories in hive-site.xml
    mkdir /opt/hive/tmp

    vi hive-site.xml

    # 1. Replace every ${system:java.io.tmpdir} in hive-site.xml with Hive's temp directory (here /opt/hive/tmp; create it manually if it does not exist and grant read/write permission),
    # and replace every ${system:user.name} with root, for example with the sed commands below.
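    # A possible way to do both replacements with sed (back up the file first; the paths assume the layout above):
    cp hive-site.xml hive-site.xml.bak
    sed -i 's#\${system:java.io.tmpdir}#/opt/hive/tmp#g' hive-site.xml
    sed -i 's#\${system:user.name}#root#g' hive-site.xml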

    # 2. Update the database settings in hive-site.xml; if the host IP ever changes, this setting and the hosts file must be updated as well
    # Search for javax.jdo.option.ConnectionURL and change its value to the MySQL address, e.g.:
    # (appending &useSSL=false made it impossible to log in to Hive here) ##########################################################################
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://192.168.92.241:3306/hive?createDatabaseIfNotExist=true</value>

    # 3. Search for javax.jdo.option.ConnectionDriverName and change its value to the MySQL driver class:
    <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    </property>

    # 4. Search for javax.jdo.option.ConnectionUserName and change its value to the MySQL login user:
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>

    # 5. Search for javax.jdo.option.ConnectionPassword and change its value to the MySQL login password:
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>mysql</value>

    # 6. Search for hive.metastore.schema.verification and change its value to false:
    <name>hive.metastore.schema.verification</name>
    <value>false</value>

    # Put the MySQL driver jar into Hive's lib directory, here /opt/hive/apache-hive-2.1.1-bin/lib
    tar -zxvf /opt/software/mysql-connector-java-5.1.41.tar.gz -C /opt/software
    cp /opt/software/mysql-connector-java-5.1.41/mysql-connector-java-5.1.41-bin.jar /opt/hive/apache-hive-2.1.1-bin/lib
    cp hive-env.sh.template hive-env.sh

    cat >> hive-env.sh << EOF
    export HADOOP_HOME=/opt/hadoop/hadoop-2.8.0
    export HIVE_CONF_DIR=/opt/hive/apache-hive-2.1.1-bin/conf
    export HIVE_AUX_JARS_PATH=/opt/hive/apache-hive-2.1.1-bin/lib
    EOF

    cd /opt/hive/apache-hive-2.1.1-bin/bin
    # Initialize the metastore database:
    schematool -initSchema -dbType mysql
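    # A quick check that the initialization worked (assumes the MySQL credentials configured above):
    mysql -uroot -pmysql -e "use hive; show tables;" | head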

    ./hive
    # List functions and describe one:
    show functions;
    desc function sum;

    create database db_hive_edu;
    use db_hive_edu;
    create table student(id int,name string) row format delimited fields terminated by ' ';
    # Exit Hive (or use quit;)
    exit;
    touch /opt/hive/student.txt

    # Edit the file by hand so fields are separated by a tab character and the file uses unix line endings (see the printf alternative after the heredoc)
    cat >> /opt/hive/student.txt << EOF
    1 zhangsan
    2 lisi
    3 wangwu
    4 zhaoliu
    5 chenqi
    EOF
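    # An alternative sketch that writes real tab separators directly, avoiding the manual edit (assumes the table's field delimiter is a tab):
    printf '%s\t%s\n' 1 zhangsan 2 lisi 3 wangwu 4 zhaoliu 5 chenqi > /opt/hive/student.txt
    cat -A /opt/hive/student.txt    # ^I marks tabs, $ marks unix line endings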

    # Load the data from the Hive CLI:
    hive
    load data local inpath '/opt/hive/student.txt' into table db_hive_edu.student;
    # The data showed up as NULL here (typically a sign that the file's delimiter does not match the table's field delimiter)
    use db_hive_edu;
    select * from student;
    # If no database was selected, use this instead:
    select * from db_hive_edu.student;

    # My Hadoop namenode IP is 192.168.92.241; browse the databases and tables created by Hive at:
    http://192.168.92.241:50070/explorer.html#/user/hive/warehouse/db_hive_edu.db

    # Run a SELECT in MySQL to see the tables Hive registered in the metastore:
    SELECT * FROM hive.TBLS;

    ###########################################
    # Part 5: Configure Sqoop
    # Like Hive, Sqoop only needs to be installed on the Hadoop namenode
    cd /opt/software/
    mkdir -p /opt/sqoop
    tar -zxvf sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz -C /opt/sqoop

    cd /opt/sqoop/sqoop-1.4.7.bin__hadoop-2.6.0/conf
    cp sqoop-env-template.sh sqoop-env.sh

    cat >> sqoop-env.sh << EOF
    export HADOOP_COMMON_HOME=/opt/hadoop/hadoop-2.8.0
    export HADOOP_MAPRED_HOME=/opt/hadoop/hadoop-2.8.0
    export HIVE_HOME=/opt/hive/apache-hive-2.1.1-bin
    EOF

    scp /opt/software/mysql-connector-java-5.1.41/mysql-connector-java-5.1.41-bin.jar /opt/sqoop/sqoop-1.4.7.bin__hadoop-2.6.0/lib/

    # Sqoop is a client-side tool; once installed, commands that do not involve Hive or Hadoop can be tested directly:
    sqoop help

    # It prints the following warnings:
    Warning: /opt/sqoop/sqoop-1.4.7.bin__hadoop-2.6.0/../hcatalog does not exist! HCatalog jobs will fail.
    Please set $HCAT_HOME to the root of your HCatalog installation.
    Warning: /opt/sqoop/sqoop-1.4.7.bin__hadoop-2.6.0/../accumulo does not exist! Accumulo imports will fail.
    Please set $ACCUMULO_HOME to the root of your Accumulo installation.
    Warning: /opt/sqoop/sqoop-1.4.7.bin__hadoop-2.6.0/../zookeeper does not exist! Accumulo imports will fail.
    Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
    Try 'sqoop help' for usage.

    # Note: the warnings are for components (HCatalog, Accumulo) that are not installed here, so the related commands are unavailable; commands that operate on HDFS work fine.
    # List the tables in the MySQL database hive
    sqoop list-tables --username root --password 'mysql' --connect jdbc:mysql://192.168.92.241:3306/hive?characterEncoding=UTF-8

    # Creating a Hive table from a MySQL table failed:
    sqoop create-hive-table --connect jdbc:mysql://192.168.92.241:3306/hive?characterEncoding=UTF-8 --table TYPES --username root -password 'mysql' --hive-database db_hive_edu

    # Error:
    21/01/19 18:17:34 ERROR hive.HiveConfig: Could not load org.apache.hadoop.hive.conf.HiveConf. Make sure HIVE_CONF_DIR is set correctly.
    21/01/19 18:17:34 ERROR tool.CreateHiveTableTool: Encountered IOException running create table job: java.io.IOException: java.lang.ClassNotFoundException: org.apache.hadoop.hive.conf.HiveConf
    at org.apache.sqoop.hive.HiveConfig.getHiveConf(HiveConfig.java:50)
    at org.apache.sqoop.hive.HiveImport.getHiveArgs(HiveImport.java:392)
    at org.apache.sqoop.hive.HiveImport.executeExternalHiveScript(HiveImport.java:379)
    at org.apache.sqoop.hive.HiveImport.executeScript(HiveImport.java:337)
    at org.apache.sqoop.hive.HiveImport.importTable(HiveImport.java:241)
    at org.apache.sqoop.tool.CreateHiveTableTool.run(CreateHiveTableTool.java:57)
    at org.apache.sqoop.Sqoop.run(Sqoop.java:147)
    at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)
    at org.apache.sqoop.Sqoop.runSqoop(Sqoop.java:183)
    at org.apache.sqoop.Sqoop.runTool(Sqoop.java:234)
    at org.apache.sqoop.Sqoop.runTool(Sqoop.java:243)
    at org.apache.sqoop.Sqoop.main(Sqoop.java:252)
    Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.conf.HiveConf
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    at java.lang.Class.forName0(Native Method)
    at java.lang.Class.forName(Class.java:264)
    at org.apache.sqoop.hive.HiveConfig.getHiveConf(HiveConfig.java:44)
    ... 11 more
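    # A possible fix (an assumption, not verified in these notes): Sqoop cannot find the Hive classes on its classpath.
    # Putting Hive's jars on HADOOP_CLASSPATH before re-running the command usually resolves the
    # ClassNotFoundException for org.apache.hadoop.hive.conf.HiveConf:
    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HIVE_HOME/lib/*
    # Alternatively, copy hive-exec-*.jar and hive-common-*.jar from $HIVE_HOME/lib into $SQOOP_HOME/lib.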

    ###########################################
    # Part 6: Configure Scala & Spark
    cd /opt/software/
    mkdir /opt/scala

    tar -zxvf scala-2.12.2.tgz -C /opt/scala

    # Configure Spark on all nodes
    mkdir /opt/spark
    tar -zxvf spark-2.1.1-bin-hadoop2.7.tgz -C /opt/spark

    cd /opt/spark/spark-2.1.1-bin-hadoop2.7/conf
    cp spark-env.sh.template spark-env.sh
    cp slaves.template slaves

    vi spark-env.sh

    export SCALA_HOME=/opt/scala/scala-2.12.2
    export JAVA_HOME=/opt/java/jdk1.8.0_171
    export HADOOP_HOME=/opt/hadoop/hadoop-2.8.0
    export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
    export SPARK_HOME=/opt/spark/spark-2.1.1-bin-hadoop2.7
    export SPARK_MASTER_IP=node1
    export SPARK_EXECUTOR_MEMORY=1G

    # Edit the slaves file so that it contains:
    vi slaves

    node2
    node3

    source /etc/profile

    scp -r /opt/scala node2:/opt
    scp -r /opt/scala node3:/opt

    scp -r /opt/spark node2:/opt
    scp -r /opt/spark node3:/opt

    # Run the startup script:
    /opt/spark/spark-2.1.1-bin-hadoop2.7/sbin/start-all.sh
    # Check the Scala version
    /opt/scala/scala-2.12.2/bin/scala -version

    # Make all the processes start automatically by appending the start commands to /etc/profile
    vi /etc/profile

    /opt/hadoop/hadoop-2.8.0/sbin/start-all.sh
    /opt/zookeeper/zookeeper-3.4.10/bin/zkServer.sh start
    /opt/hbase/hbase-1.2.6/bin/start-hbase.sh
    /opt/spark/spark-2.1.1-bin-hadoop2.7/sbin/start-all.sh

    # Open the Spark cluster web UI
    http://192.168.92.241:8080/

    # Run the bundled SparkPi example (the estimate of pi is quite rough)
    cd /opt/spark/spark-2.1.1-bin-hadoop2.7
    ./bin/spark-submit --class org.apache.spark.examples.SparkPi --master local examples/jars/spark-examples_2.11-2.1.1.jar
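    # The example accepts the number of partitions as an optional last argument; a larger value gives a closer estimate of pi, e.g.:
    ./bin/spark-submit --class org.apache.spark.examples.SparkPi --master local examples/jars/spark-examples_2.11-2.1.1.jar 100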

    ##############################################################-
    Integrating Apache Impala with Apache Kudu
    Apache Impala 2.7 + Apache Kudu 1.3 on CentOS 7
    # Environment preparation
    # Server layout
    ID IP_address Hostname Notes
    1 192.168.92.241 node1 1.Hadoop Master 2.Spark Master 3.Kudu Master 4.Impala Master 5.JDK 6.Scala
    2 192.168.92.242 node2 1.Hadoop Slave 2.Spark Slave 3.Kudu tserver 4.Impalad 5.JDK 6.Scala
    3 192.168.92.243 node3 1.Hadoop Slave 2.Spark Slave 3.Kudu tserver 4.Impalad 5.JDK 6.Scala

    # Environment prerequisites
    1. For installing and configuring JDK 1.8 on CentOS 7.0, see the referenced JDK setup guide
    2. For setting up the Hadoop 2.7.3 cluster, see the referenced CentOS 7 Hadoop cluster guide
    3. For setting up the Spark 2.1 cluster, see the referenced CentOS 7 Spark cluster guide
    4. For installing and configuring Scala, see the referenced CentOS 7 Scala guide

    # Impala package downloads
    # Download the Impala packages from: http://archive.cloudera.com/beta/impala-kudu/redhat/7/x86_64/impala-kudu/0/RPMS/x86_64/
    impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    impala-kudu-catalog-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    impala-kudu-debuginfo-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    impala-kudu-server-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    impala-kudu-shell-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    impala-kudu-state-store-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    impala-kudu-udf-devel-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm

    # Download bigtop-utils from: http://archive.cloudera.com/cdh5/redhat/7/x86_64/cdh/5.9.0/RPMS/noarch/
    bigtop-utils-0.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.30.el7.noarch.rpm
    ##########################-
    # Configure a local yum repository
    mkdir /media/CentOS
    mount -t auto /dev/cdrom /media/CentOS
    mount
    cd /etc/yum.repos.d/
    cp CentOS-Base.repo CentOS-Base.repo.bak

    # Edit CentOS-Base.repo: comment out every mirrorlist entry, uncomment baseurl, and point baseurl at the mount point
    vi /etc/yum.repos.d/CentOS-Base.repo

    baseurl=file:///media/CentOS/

    yum clean all

    yum list

    vi /etc/fstab

    /dev/sr0 /media/CentOS iso9660 ro,relatime 0 0
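    # Verify the fstab entry without rebooting (mount -a mounts anything in fstab that is not already mounted):
    mount -a
    df -h /media/CentOS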

    ##########################-
    # Install the dependency packages
    rpm -qa | grep cyrus-sasl-plain
    rpm -qa | grep lsb
    rpm -qa | grep ntp

    yum -y install cyrus-sasl-plain lsb ntp

    # Install Impala
    # Install bigtop-utils (on the master and all slaves)
    rpm -ivh bigtop-utils-0.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.30.el7.noarch.rpm
    # On the slave nodes
    mkdir -p /opt/software
    # Copy to the slave nodes
    scp bigtop-utils-0.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.30.el7.noarch.rpm node2:/opt/software
    scp bigtop-utils-0.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.30.el7.noarch.rpm node3:/opt/software
    # On the slave nodes
    cd /opt/software
    rpm -ivh bigtop-utils-0.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.30.el7.noarch.rpm

    # Install impala-kudu-2.7.0 (master and slaves)
    # The --nodeps option is required, otherwise installation fails because of missing dependencies
    rpm -ivh impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm --nodeps
    scp impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node2:/opt/software
    scp impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node3:/opt/software
    # Slave nodes
    rpm -ivh impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm --nodeps

    # Install impala-kudu-catalog (master only)
    rpm -ivh impala-kudu-catalog-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    #scp impala-kudu-catalog-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node2:/opt/software
    #scp impala-kudu-catalog-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node3:/opt/software
    # Slave nodes (installing the catalog there turned out to be unnecessary)
    #rpm -ivh impala-kudu-catalog-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm

    # Install impala-kudu-state-store (master only)
    rpm -ivh impala-kudu-state-store-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm

    # Install impala-kudu-server (master and slaves)
    rpm -ivh impala-kudu-server-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    scp impala-kudu-server-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node2:/opt/software
    scp impala-kudu-server-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node3:/opt/software
    # Slave nodes
    rpm -ivh impala-kudu-server-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm

    # Install impala-kudu-shell (master and slaves)
    rpm -ivh impala-kudu-shell-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    scp impala-kudu-shell-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node2:/opt/software
    scp impala-kudu-shell-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node3:/opt/software
    # Slave nodes
    rpm -ivh impala-kudu-shell-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm

    # Install impala-kudu-udf-devel (master and slaves)
    rpm -ivh impala-kudu-udf-devel-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm
    scp impala-kudu-udf-devel-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node2:/opt/software
    scp impala-kudu-udf-devel-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm node3:/opt/software
    # Slave nodes
    rpm -ivh impala-kudu-udf-devel-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64.rpm

    # Configure Impala (master and slaves)
    # Set the JDK in /etc/default/bigtop-utils
    # All nodes
    cat >> /etc/default/bigtop-utils << EOF
    # export JAVA_HOME
    export JAVA_HOME=/opt/java/jdk1.8.0_171
    EOF

    # Configure /etc/default/impala (master and slaves)
    # All nodes
    vim /etc/default/impala

    IMPALA_CATALOG_SERVICE_HOST=node1
    IMPALA_STATE_STORE_HOST=node1

    # Keep the three machines' clocks in sync (master and slaves)
    # All nodes
    systemctl restart ntpd
    systemctl enable ntpd


    # Copy the Hadoop core-site.xml and hdfs-site.xml into /etc/impala/conf.dist/ and make the changes below
    # Copy core-site.xml and hdfs-site.xml (master and slaves)
    # All nodes
    cp $HADOOP_HOME/etc/hadoop/core-site.xml /etc/impala/conf.dist/
    cp $HADOOP_HOME/etc/hadoop/hdfs-site.xml /etc/impala/conf.dist/


    6.2 Detailed configuration (add the following properties inside <configuration>, keeping the existing Hadoop content)
    cd /etc/impala/conf.dist/

    vi core-site.xml

    <property>
    <name>dfs.client.read.shortcircuit</name>
    <value>true</value>
    </property>
    <property>
    <name>dfs.client.read.shortcircuit.skip.checksum</name>
    <value>false</value>
    </property>
    <property>
    <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
    <value>true</value>
    </property>

    vi hdfs-site.xml

    <!-- impala configuration -->
    <property>
    <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
    <value>true</value>
    </property>
    <property>
    <name>dfs.block.local-path-access.user</name>
    <value>impala</value>
    </property>
    <property>
    <name>dfs.client.file-block-storage-locations.timeout.millis</name>
    <value>60000</value>
    </property>

    # Set up the short-circuit socket path
    Create the directory hadoop-hdfs under /var/run/ (master and slaves).
    Note: the directory may already exist; make sure the impala user can read and write it.
    If it already exists, add the impala user to the directory's group and fix the group permissions, e.g.: chmod -R 775 hadoop-hdfs/
    # All nodes
    cd /var/run/
    mkdir hadoop-hdfs
    chmod -R 775 hadoop-hdfs

    # Permission configuration (all machines)
    If Impala is to work with YARN, the impala user must be added to the hdfs group; the Llama project is the relevant background.
    When Impala executes DROP TABLE it moves the files to the HDFS trash, so create an HDFS directory /user/impala and make it writable by the impala user.
    Likewise, Impala needs to read the data under the Hive warehouse, so the impala user should also be added to the hive group.
    # All nodes
    groupadd hadoop
    usermod -G hdfs,hadoop impala
    groups impala

    impala : impala hdfs hadoop

    # Extension: create Impala's directory on HDFS and set its ownership
    hadoop fs -mkdir -p /user/impala
    hadoop fs -chown impala /user/impala
    hadoop fs -ls /user/

    # Running this from a slave node fails ********************************************************
    [root@node2 run]# hadoop fs -mkdir -p /user/impala
    Exception in thread "main" java.lang.RuntimeException: core-site.xml not found
    at org.apache.hadoop.conf.Configuration.loadResource(Configuration.java:2617)
    at org.apache.hadoop.conf.Configuration.loadResources(Configuration.java:2543)
    at org.apache.hadoop.conf.Configuration.getProps(Configuration.java:2426)
    at org.apache.hadoop.conf.Configuration.set(Configuration.java:1151)
    at org.apache.hadoop.conf.Configuration.set(Configuration.java:1123)
    at org.apache.hadoop.conf.Configuration.setBoolean(Configuration.java:1459)
    at org.apache.hadoop.util.GenericOptionsParser.processGeneralOptions(GenericOptionsParser.java:322)
    at org.apache.hadoop.util.GenericOptionsParser.parseGeneralOptions(GenericOptionsParser.java:488)
    at org.apache.hadoop.util.GenericOptionsParser.<init>(GenericOptionsParser.java:170)
    at org.apache.hadoop.util.GenericOptionsParser.<init>(GenericOptionsParser.java:153)
    at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
    at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)
    at org.apache.hadoop.fs.FsShell.main(FsShell.java:378)
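    # A likely cause (an assumption): HADOOP_CONF_DIR on the slave node does not point at a directory containing
    # core-site.xml, e.g. because /etc/profile was not sourced in this shell. Something like this is worth a try:
    source /etc/profile
    export HADOOP_CONF_DIR=/opt/hadoop/hadoop-2.8.0/etc/hadoop
    hadoop fs -mkdir -p /user/impala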

    # Start Impala
    # Master: node1
    service impala-state-store restart --kudu_master_hosts=node1:7051
    service impala-catalog restart --kudu_master_hosts=node1:7051
    # Run on master and slaves
    service impala-server restart --kudu_master_hosts=node1:7051

    # Log in to impala-shell
    impala-shell

    # Run on node1 (Impala SQL, written MySQL-style), after adjustments:
    CREATE TABLE dept (
    id BIGINT,
    name STRING
    ) distribute by range(name) split rows(('it'),('op'),('hr'))
    TBLPROPERTIES(
    'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
    'kudu.table_name' = 'dept',
    'kudu.master_addresses' = 'node1:7051',
    'kudu.key_columns' = 'id,name'
    );

    # Error: *******************************************************************
    ERROR:
    ImpalaRuntimeException: Error creating Kudu table
    CAUSED BY: NonRecoverableException: RPC can not complete before timeout: KuduRpc(method=ListTables, tablet=null, attempt=25, DeadlineTracker(timeout=30000, elapsed=29214))
    CAUSED BY: NoLeaderMasterFoundException: Master config (node1:7051) has no leader.. Exceptions received: org.kududb.client.RecoverableException: [Peer Kudu Master - node1:7051] Connection reset on [id: 0xf5a4934f]
    CAUSED BY: RecoverableException: [Peer Kudu Master - node1:7051] Connection reset on [id: 0xf5a4934f]
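    # A possible explanation (an assumption; Kudu itself is never installed in these notes): the error means no Kudu
    # master is reachable at node1:7051. Kudu has to be installed and running before Impala can create Kudu-backed
    # tables; with Cloudera's packages that would look roughly like:
    # service kudu-master start        # on node1
    # service kudu-tserver start       # on node2 and node3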

    # View the tables created from impala-shell in the Kudu console; more examples of Impala working with Kudu will be covered in a later post.
    # (the console did not open here)
    http://192.168.92.241:8051/

    # If the ZooKeeper and HBase processes are not running
    # Run on the master and slave nodes
    sh /home/zk_start.sh
    # Run on the master node
    sh /home/hbase_start.sh
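    # The helper scripts themselves are not included in these notes; hypothetical contents, reusing the start
    # commands shown earlier, might be:
    # zk_start.sh:    /opt/zookeeper/zookeeper-3.4.10/bin/zkServer.sh start
    # hbase_start.sh: /opt/hbase/hbase-1.2.6/bin/start-hbase.sh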

  • Original post: https://www.cnblogs.com/buffercache/p/14264938.html