• Spark 2.2.1 SQL 001: querying a Parquet file directly with SQL


    package sql

    import org.apache.spark.sql.SparkSession

    object Parquet extends App {

      // Build a local SparkSession for this example.
      val spark = SparkSession
        .builder()
        .appName("Spark SQL basic example")
        .master("local[*]")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()

      // For implicit conversions like converting RDDs to DataFrames
      import spark.implicits._

      // Generic load/save: read a Parquet file and save selected columns back out.
      // val usersDF = spark.read.load("E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet")
      // usersDF.select("name", "favorite_color").write.save("E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/namesAndFavColors.parquet")

      // Run SQL directly against the Parquet file, without registering a table first.
      val sqlDF = spark.sql("SELECT * FROM parquet.`E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet`")
      sqlDF.show()

      // Read JSON and write the selected columns out as Parquet.
      // val peopleDF = spark.read.format("json").load("E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/people.json")
      // peopleDF.select("name", "age").write.format("parquet").save("namesAndAges.parquet")
    }
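
    The SQL-over-file query above can also be written with the DataFrame reader/writer API, which is what the two commented-out blocks hint at. Below is a minimal sketch, not the post's original code, assuming the same local users.parquet path and reusing the namesAndFavColors.parquet output path from the commented-out example:

    // Sketch only: DataFrame-API equivalent of the SQL query above.
    // Paths are the ones used elsewhere in this post; adjust for your machine.
    val usersDF = spark.read.parquet(
      "E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet")

    // Project the same two columns as the commented-out example and show them.
    usersDF.select("name", "favorite_color").show()

    // Write the projection back out as Parquet; overwrite in case the target already exists.
    usersDF.select("name", "favorite_color")
      .write
      .mode("overwrite")
      .parquet("E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/namesAndFavColors.parquet")

    Running the original SQL version produces the console output below: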
    //Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
    //18/01/11 13:08:41 INFO SparkContext: Running Spark version 2.2.1
    //18/01/11 13:08:42 INFO SparkContext: Submitted application: Spark SQL basic example
    //18/01/11 13:08:42 INFO SecurityManager: Changing view acls to: fangping
    //18/01/11 13:08:42 INFO SecurityManager: Changing modify acls to: fangping
    //18/01/11 13:08:42 INFO SecurityManager: Changing view acls groups to: 
    //18/01/11 13:08:42 INFO SecurityManager: Changing modify acls groups to: 
    //18/01/11 13:08:42 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(fangping); groups with view permissions: Set(); users  with modify permissions: Set(fangping); groups with modify permissions: Set()
    //18/01/11 13:08:43 INFO Utils: Successfully started service 'sparkDriver' on port 53757.
    //18/01/11 13:08:43 INFO SparkEnv: Registering MapOutputTracker
    //18/01/11 13:08:43 INFO SparkEnv: Registering BlockManagerMaster
    //18/01/11 13:08:43 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
    //18/01/11 13:08:43 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
    //18/01/11 13:08:43 INFO DiskBlockManager: Created local directory at C:\Users\fangping\AppData\Local\Temp\blockmgr-3ccc835a-5d8d-4ef7-be37-82967f2a72ad
    //18/01/11 13:08:43 INFO MemoryStore: MemoryStore started with capacity 339.6 MB
    //18/01/11 13:08:43 INFO SparkEnv: Registering OutputCommitCoordinator
    //18/01/11 13:08:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
    //18/01/11 13:08:43 INFO Utils: Successfully started service 'SparkUI' on port 4041.
    //18/01/11 13:08:43 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://172.18.3.4:4041
    //18/01/11 13:08:43 INFO Executor: Starting executor ID driver on host localhost
    //18/01/11 13:08:43 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 53778.
    //18/01/11 13:08:43 INFO NettyBlockTransferService: Server created on 172.18.3.4:53778
    //18/01/11 13:08:43 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
    //18/01/11 13:08:43 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 172.18.3.4, 53778, None)
    //18/01/11 13:08:43 INFO BlockManagerMasterEndpoint: Registering block manager 172.18.3.4:53778 with 339.6 MB RAM, BlockManagerId(driver, 172.18.3.4, 53778, None)
    //18/01/11 13:08:43 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 172.18.3.4, 53778, None)
    //18/01/11 13:08:43 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 172.18.3.4, 53778, None)
    //18/01/11 13:08:44 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:/E:/back/scalaWs/Spark2Demo/spark-warehouse/').
    //18/01/11 13:08:44 INFO SharedState: Warehouse path is 'file:/E:/back/scalaWs/Spark2Demo/spark-warehouse/'.
    //18/01/11 13:08:44 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
    //18/01/11 13:08:44 INFO SparkSqlParser: Parsing command: SELECT * FROM parquet.`E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet`
    //18/01/11 13:08:53 INFO SparkContext: Starting job: sql at Parquet.scala:18
    //18/01/11 13:08:54 INFO DAGScheduler: Got job 0 (sql at Parquet.scala:18) with 1 output partitions
    //18/01/11 13:08:54 INFO DAGScheduler: Final stage: ResultStage 0 (sql at Parquet.scala:18)
    //18/01/11 13:08:54 INFO DAGScheduler: Parents of final stage: List()
    //18/01/11 13:08:54 INFO DAGScheduler: Missing parents: List()
    //18/01/11 13:08:54 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[1] at sql at Parquet.scala:18), which has no missing parents
    //18/01/11 13:08:54 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 63.2 KB, free 339.5 MB)
    //18/01/11 13:08:54 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 22.1 KB, free 339.5 MB)
    //18/01/11 13:08:54 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 172.18.3.4:53778 (size: 22.1 KB, free: 339.6 MB)
    //18/01/11 13:08:54 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
    //18/01/11 13:08:55 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at sql at Parquet.scala:18) (first 15 tasks are for partitions Vector(0))
    //18/01/11 13:08:55 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
    //18/01/11 13:08:55 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 5010 bytes)
    //18/01/11 13:08:55 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
    //18/01/11 13:08:59 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 1753 bytes result sent to driver
    //18/01/11 13:08:59 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 4155 ms on localhost (executor driver) (1/1)
    //18/01/11 13:08:59 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
    //18/01/11 13:09:00 INFO DAGScheduler: ResultStage 0 (sql at Parquet.scala:18) finished in 4.222 s
    //18/01/11 13:09:00 INFO DAGScheduler: Job 0 finished: sql at Parquet.scala:18, took 6.652536 s
    //18/01/11 13:09:06 INFO BlockManagerInfo: Removed broadcast_0_piece0 on 172.18.3.4:53778 in memory (size: 22.1 KB, free: 339.6 MB)
    //18/01/11 13:09:06 INFO FileSourceStrategy: Pruning directories with: 
    //18/01/11 13:09:06 INFO FileSourceStrategy: Post-Scan Filters: 
    //18/01/11 13:09:06 INFO FileSourceStrategy: Output Data Schema: struct<name: string, favorite_color: string, favorite_numbers: array<int> ... 1 more fields>
    //18/01/11 13:09:06 INFO FileSourceScanExec: Pushed Filters: 
    //18/01/11 13:09:08 INFO CodeGenerator: Code generated in 451.653537 ms
    //18/01/11 13:09:08 INFO CodeGenerator: Code generated in 20.894987 ms
    //18/01/11 13:09:08 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 225.1 KB, free 339.4 MB)
    //18/01/11 13:09:08 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 21.3 KB, free 339.4 MB)
    //18/01/11 13:09:08 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 172.18.3.4:53778 (size: 21.3 KB, free: 339.6 MB)
    //18/01/11 13:09:08 INFO SparkContext: Created broadcast 1 from show at Parquet.scala:18
    //18/01/11 13:09:08 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 bytes, open cost is considered as scanning 4194304 bytes.
    //18/01/11 13:09:08 INFO SparkContext: Starting job: show at Parquet.scala:18
    //18/01/11 13:09:08 INFO DAGScheduler: Got job 1 (show at Parquet.scala:18) with 1 output partitions
    //18/01/11 13:09:08 INFO DAGScheduler: Final stage: ResultStage 1 (show at Parquet.scala:18)
    //18/01/11 13:09:08 INFO DAGScheduler: Parents of final stage: List()
    //18/01/11 13:09:08 INFO DAGScheduler: Missing parents: List()
    //18/01/11 13:09:08 INFO DAGScheduler: Submitting ResultStage 1 (MapPartitionsRDD[4] at show at Parquet.scala:18), which has no missing parents
    //18/01/11 13:09:08 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 8.8 KB, free 339.4 MB)
    //18/01/11 13:09:08 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 4.5 KB, free 339.3 MB)
    //18/01/11 13:09:08 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on 172.18.3.4:53778 (size: 4.5 KB, free: 339.6 MB)
    //18/01/11 13:09:08 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1006
    //18/01/11 13:09:08 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (MapPartitionsRDD[4] at show at Parquet.scala:18) (first 15 tasks are for partitions Vector(0))
    //18/01/11 13:09:08 INFO TaskSchedulerImpl: Adding task set 1.0 with 1 tasks
    //18/01/11 13:09:08 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, localhost, executor driver, partition 0, PROCESS_LOCAL, 5323 bytes)
    //18/01/11 13:09:08 INFO Executor: Running task 0.0 in stage 1.0 (TID 1)
    //18/01/11 13:09:08 INFO FileScanRDD: Reading File path: file:///E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet, range: 0-615, partition values: [empty row]
    //18/01/11 13:09:09 INFO ParquetReadSupport: Going to read the following fields from the Parquet file:
    //
    //Parquet form:
    //message spark_schema {
    //  required binary name (UTF8);
    //  optional binary favorite_color (UTF8);
    //  required group favorite_numbers (LIST) {
    //    repeated int32 array;
    //  }
    //}
    //
    //Catalyst form:
    //StructType(StructField(name,StringType,true), StructField(favorite_color,StringType,true), StructField(favorite_numbers,ArrayType(IntegerType,true),true))
    //       
    //18/01/11 13:09:09 INFO CodeGenerator: Code generated in 13.695856 ms
    //18/01/11 13:09:09 INFO InternalParquetRecordReader: RecordReader initialized will read a total of 2 records.
    //18/01/11 13:09:09 INFO InternalParquetRecordReader: at row 0. reading next block
    //18/01/11 13:09:09 INFO CodecPool: Got brand-new decompressor [.snappy]
    //18/01/11 13:09:09 INFO InternalParquetRecordReader: block read in memory in 347 ms. row count = 2
    //18/01/11 13:09:09 INFO Executor: Finished task 0.0 in stage 1.0 (TID 1). 1279 bytes result sent to driver
    //18/01/11 13:09:09 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 713 ms on localhost (executor driver) (1/1)
    //18/01/11 13:09:09 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool 
    //18/01/11 13:09:09 INFO DAGScheduler: ResultStage 1 (show at Parquet.scala:18) finished in 0.712 s
    //18/01/11 13:09:09 INFO DAGScheduler: Job 1 finished: show at Parquet.scala:18, took 0.760571 s
    //+------+--------------+----------------+
    //|  name|favorite_color|favorite_numbers|
    //+------+--------------+----------------+
    //|Alyssa|          null|  [3, 9, 15, 20]|
    //|   Ben|           red|              []|
    //+------+--------------+----------------+
    //
    //18/01/11 13:09:09 INFO SparkContext: Invoking stop() from shutdown hook
    //18/01/11 13:09:09 INFO SparkUI: Stopped Spark web UI at http://172.18.3.4:4041
    //18/01/11 13:09:09 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
    //18/01/11 13:09:10 INFO MemoryStore: MemoryStore cleared
    //18/01/11 13:09:10 INFO BlockManager: BlockManager stopped
    //18/01/11 13:09:10 INFO BlockManagerMaster: BlockManagerMaster stopped
    //18/01/11 13:09:10 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
    //18/01/11 13:09:10 INFO SparkContext: Successfully stopped SparkContext
    //18/01/11 13:09:10 INFO ShutdownHookManager: Shutdown hook called
    //18/01/11 13:09:10 INFO ShutdownHookManager: Deleting directory C:\Users\fangping\AppData\Local\Temp\spark-300f3c5b-8383-4972-b9e4-bbd55a0407b7
    //
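
    Near the end of the log, ParquetReadSupport prints the schema twice: the raw Parquet form (required/optional fields and the LIST group) and the Catalyst form (a StructType with nullable string and array<int> fields). The same Catalyst schema can be inspected straight from a DataFrame; a minimal sketch, assuming the same users.parquet path as above:

    // Sketch only: print the Catalyst schema Spark derives from the Parquet footer.
    val df = spark.read.parquet(
      "E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet")

    df.printSchema()
    // Expected output, matching the Catalyst form shown in the log:
    // root
    //  |-- name: string (nullable = true)
    //  |-- favorite_color: string (nullable = true)
    //  |-- favorite_numbers: array (nullable = true)
    //  |    |-- element: integer (containsNull = true)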
• Original post: https://www.cnblogs.com/alamps/p/8267868.html