• Spark Notes - A Simple Example of Reading Hive Data with Local Spark


    Note: copy the MySQL driver jar into spark/lib, copy hive-site.xml into the project's resources directory, and do not use a hostname when debugging remotely (use the IP address instead).
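    For reference, a minimal hive-site.xml sketch is shown below. It assumes the remote metastore address used in the example code; a real file typically carries more properties (for example, the JDBC settings of the MySQL-backed metastore):

    <configuration>
      <property>
        <!-- Thrift address of the remote Hive metastore (taken from the example below) -->
        <name>hive.metastore.uris</name>
        <value>thrift://192.168.66.66:9083</value>
      </property>
    </configuration>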

    import org.apache.spark._
    import org.apache.spark.SparkConf
    import org.apache.spark.SparkContext
    import org.apache.spark.sql.hive.HiveContext
    import java.io.FileNotFoundException
    import java.io.IOException

    object HiveSelect {
      def main(args: Array[String]): Unit = {
        System.setProperty("hadoop.home.dir", "D:\\hadoop") // locate the local Hadoop binaries
        val conf = new SparkConf().setAppName("HiveApp")
          .setMaster("spark://192.168.66.66:7077")
          .set("spark.executor.memory", "1g")
          .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
          .setJars(Seq("D:\\workspace\\scala\\out\\scala.jar")) // ship the application jar to the remote cluster
          //.set("hive.metastore.uris", "thrift://192.168.66.66:9083") // remote Hive metastore address
          //.set("spark.driver.extraClassPath", "D:\\json\\mysql-connector-java-5.1.39.jar")
        val sparkcontext = new SparkContext(conf)
        try {
          val hiveContext = new HiveContext(sparkcontext)
          hiveContext.sql("use siat") // switch to the target database
          hiveContext.sql("DROP TABLE IF EXISTS src") // drop the table if it already exists
          hiveContext.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) " +
            "ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '") // create the table
          hiveContext.sql("LOAD DATA LOCAL INPATH 'D:\\workspace\\scala\\src.txt' INTO TABLE src") // load the data
          hiveContext.sql("SELECT * FROM src").collect().foreach(println) // query the data
        } catch {
          // specific exceptions first; Throwable last, otherwise the later cases are unreachable
          case e: FileNotFoundException => println("Missing file exception")
          case e: IOException => println("IO Exception")
          case e: ArithmeticException => println(e)
          case e: NumberFormatException => println(e)
          case e: IllegalArgumentException => println("illegal arg. exception")
          case e: IllegalStateException => println("illegal state exception")
          case e: Exception => println(e)
          case e: Throwable => println("found an unknown exception: " + e)
        } finally {
          sparkcontext.stop()
        }
      }
    }
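    Note that src.txt must contain space-separated key/value pairs (for example, a line such as "1 hello"), matching the FIELDS TERMINATED BY ' ' clause in the CREATE TABLE statement.

    The example above uses the Spark 1.x HiveContext API. On Spark 2.x and later, HiveContext is deprecated in favor of SparkSession with Hive support enabled. A minimal sketch of the query step under that API, assuming the same cluster, database, and table as above:

    import org.apache.spark.sql.SparkSession

    object HiveSelect2 {
      def main(args: Array[String]): Unit = {
        // SparkSession replaces SQLContext/HiveContext in Spark 2.x+;
        // enableHiveSupport() picks up the metastore configured in hive-site.xml.
        val spark = SparkSession.builder()
          .appName("HiveApp")
          .master("spark://192.168.66.66:7077")
          .enableHiveSupport()
          .getOrCreate()
        try {
          spark.sql("use siat")
          spark.sql("SELECT * FROM src").show() // show() prints a tabular preview instead of raw Row objects
        } finally {
          spark.stop()
        }
      }
    }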

     Appendix 1: Scala Spark API - http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.package

    org.apache.spark
    
    org.apache.spark.api.java
    
    org.apache.spark.api.java.function
    
    org.apache.spark.broadcast
    
    org.apache.spark.graphx
    
    org.apache.spark.graphx.impl
    
    org.apache.spark.graphx.lib
    
    org.apache.spark.graphx.util
    
    org.apache.spark.input
    
    org.apache.spark.internal
    
    org.apache.spark.internal.io
    
    org.apache.spark.io
    
    org.apache.spark.launcher
    
    org.apache.spark.mapred
    
    org.apache.spark.metrics.source
    
    org.apache.spark.ml
    
    org.apache.spark.ml.attribute
    
    org.apache.spark.ml.classification
    
    org.apache.spark.ml.clustering
    
    org.apache.spark.ml.evaluation
    
    org.apache.spark.ml.feature
    
    org.apache.spark.ml.fpm
    
    org.apache.spark.ml.linalg
    
    org.apache.spark.ml.param
    
    org.apache.spark.ml.recommendation
    
    org.apache.spark.ml.regression
    
    org.apache.spark.ml.source.libsvm
    
    org.apache.spark.ml.stat
    
    org.apache.spark.ml.stat.distribution
    
    org.apache.spark.ml.tree
    
    org.apache.spark.ml.tuning
    
    org.apache.spark.ml.util
    
    org.apache.spark.mllib
    
    org.apache.spark.mllib.classification
    
    org.apache.spark.mllib.clustering
    
    org.apache.spark.mllib.evaluation
    
    org.apache.spark.mllib.feature
    
    org.apache.spark.mllib.fpm
    
    org.apache.spark.mllib.linalg
    
    org.apache.spark.mllib.linalg.distributed
    
    org.apache.spark.mllib.optimization
    
    org.apache.spark.mllib.pmml
    
    org.apache.spark.mllib.random
    
    org.apache.spark.mllib.rdd
    
    org.apache.spark.mllib.recommendation
    
    org.apache.spark.mllib.regression
    
    org.apache.spark.mllib.stat
    
    org.apache.spark.mllib.stat.distribution
    
    org.apache.spark.mllib.stat.test
    
    org.apache.spark.mllib.tree
    
    org.apache.spark.mllib.tree.configuration
    
    org.apache.spark.mllib.tree.impurity
    
    org.apache.spark.mllib.tree.loss
    
    org.apache.spark.mllib.tree.model
    
    org.apache.spark.mllib.util
    
    org.apache.spark.partial
    
    org.apache.spark.rdd
    
    org.apache.spark.scheduler
    
    org.apache.spark.scheduler.cluster
    
    org.apache.spark.security
    
    org.apache.spark.serializer
    
    org.apache.spark.sql
    
    org.apache.spark.sql.api.java
    
    org.apache.spark.sql.catalog
    
    org.apache.spark.sql.expressions
    
    org.apache.spark.sql.expressions.javalang
    
    org.apache.spark.sql.expressions.scalalang
    
    org.apache.spark.sql.hive
    
    org.apache.spark.sql.hive.execution
    
    org.apache.spark.sql.hive.orc
    
    org.apache.spark.sql.jdbc
    
    org.apache.spark.sql.sources
    
    org.apache.spark.sql.streaming
    
    org.apache.spark.sql.types
    
    org.apache.spark.sql.util
    
    org.apache.spark.status.api.v1
    
    org.apache.spark.status.api.v1.streaming
    
    org.apache.spark.storage
    
    org.apache.spark.streaming
    
    org.apache.spark.streaming.api.java
    
    org.apache.spark.streaming.dstream
    
    org.apache.spark.streaming.flume
    
    org.apache.spark.streaming.kafka
    
    org.apache.spark.streaming.kinesis
    
    org.apache.spark.streaming.receiver
    
    org.apache.spark.streaming.scheduler
    
    org.apache.spark.streaming.scheduler.rate
    
    org.apache.spark.streaming.util
    
    org.apache.spark.ui.env
    
    org.apache.spark.ui.exec
    
    org.apache.spark.ui.jobs
    
    org.apache.spark.ui.storage
    
    org.apache.spark.util
    
    org.apache.spark.util.random
    
    org.apache.spark.util.sketch
    

      

  • Original post: https://www.cnblogs.com/xinfang520/p/7832274.html