package sql

import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext

/**
 * Demonstrates Spark SQL's ability to query a Parquet file in place,
 * without first registering it as a table, via the
 * `SELECT ... FROM parquet.`<path>`` syntax.
 *
 * Runs entirely locally (`local[*]`) and prints the query result to stdout.
 */
object Parquet {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .master("local[*]")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    // For implicit conversions like converting RDDs to DataFrames
    import spark.implicits._

    // val usersDF = spark.read.load("E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet")
    // usersDF.select("name", "favorite_color").write.save("E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/namesAndFavColors.parquet")

    // Query the Parquet file directly; show() is the side effect that
    // triggers execution and prints the rows.
    val sqlDF = spark.sql(
      "SELECT * FROM parquet.`E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet`")
    sqlDF.show()

    //val peopleDF = spark.read.format("json").load("E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/people.json")
    //peopleDF.select("name", "age").write.format("parquet").save("namesAndAges.parquet")

    // Release driver resources explicitly instead of relying on the JVM
    // shutdown hook (which the log below shows firing).
    spark.stop()
  }
}

//Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
//18/01/11 13:08:41 INFO SparkContext: Running Spark version 2.2.1
//18/01/11 13:08:42 INFO SparkContext: Submitted application: Spark SQL basic example
//18/01/11 13:08:42 INFO SecurityManager: Changing view acls to: fangping
//18/01/11 13:08:42 INFO SecurityManager: Changing modify acls to: fangping
//18/01/11 13:08:42 INFO SecurityManager: Changing view acls groups to:
//18/01/11 13:08:42 INFO SecurityManager: Changing modify acls groups to:
//18/01/11 13:08:42 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(fangping); groups with view permissions: Set(); users with modify permissions: Set(fangping); groups with modify permissions: Set()
//18/01/11 13:08:43 INFO Utils: Successfully started service 'sparkDriver' on port 53757.
//18/01/11 13:08:43 INFO SparkEnv: Registering MapOutputTracker //18/01/11 13:08:43 INFO SparkEnv: Registering BlockManagerMaster //18/01/11 13:08:43 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information //18/01/11 13:08:43 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up //18/01/11 13:08:43 INFO DiskBlockManager: Created local directory at C:UsersfangpingAppDataLocalTemplockmgr-3ccc835a-5d8d-4ef7-be37-82967f2a72ad //18/01/11 13:08:43 INFO MemoryStore: MemoryStore started with capacity 339.6 MB //18/01/11 13:08:43 INFO SparkEnv: Registering OutputCommitCoordinator //18/01/11 13:08:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041. //18/01/11 13:08:43 INFO Utils: Successfully started service 'SparkUI' on port 4041. //18/01/11 13:08:43 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://172.18.3.4:4041 //18/01/11 13:08:43 INFO Executor: Starting executor ID driver on host localhost //18/01/11 13:08:43 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 53778. 
//18/01/11 13:08:43 INFO NettyBlockTransferService: Server created on 172.18.3.4:53778 //18/01/11 13:08:43 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy //18/01/11 13:08:43 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 172.18.3.4, 53778, None) //18/01/11 13:08:43 INFO BlockManagerMasterEndpoint: Registering block manager 172.18.3.4:53778 with 339.6 MB RAM, BlockManagerId(driver, 172.18.3.4, 53778, None) //18/01/11 13:08:43 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 172.18.3.4, 53778, None) //18/01/11 13:08:43 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 172.18.3.4, 53778, None) //18/01/11 13:08:44 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:/E:/back/scalaWs/Spark2Demo/spark-warehouse/'). //18/01/11 13:08:44 INFO SharedState: Warehouse path is 'file:/E:/back/scalaWs/Spark2Demo/spark-warehouse/'. 
//18/01/11 13:08:44 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint //18/01/11 13:08:44 INFO SparkSqlParser: Parsing command: SELECT * FROM parquet.`E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet` //18/01/11 13:08:53 INFO SparkContext: Starting job: sql at Parquet.scala:18 //18/01/11 13:08:54 INFO DAGScheduler: Got job 0 (sql at Parquet.scala:18) with 1 output partitions //18/01/11 13:08:54 INFO DAGScheduler: Final stage: ResultStage 0 (sql at Parquet.scala:18) //18/01/11 13:08:54 INFO DAGScheduler: Parents of final stage: List() //18/01/11 13:08:54 INFO DAGScheduler: Missing parents: List() //18/01/11 13:08:54 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[1] at sql at Parquet.scala:18), which has no missing parents //18/01/11 13:08:54 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 63.2 KB, free 339.5 MB) //18/01/11 13:08:54 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 22.1 KB, free 339.5 MB) //18/01/11 13:08:54 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 172.18.3.4:53778 (size: 22.1 KB, free: 339.6 MB) //18/01/11 13:08:54 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006 //18/01/11 13:08:55 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at sql at Parquet.scala:18) (first 15 tasks are for partitions Vector(0)) //18/01/11 13:08:55 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks //18/01/11 13:08:55 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 5010 bytes) //18/01/11 13:08:55 INFO Executor: Running task 0.0 in stage 0.0 (TID 0) //18/01/11 13:08:59 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 
1753 bytes result sent to driver //18/01/11 13:08:59 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 4155 ms on localhost (executor driver) (1/1) //18/01/11 13:08:59 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool //18/01/11 13:09:00 INFO DAGScheduler: ResultStage 0 (sql at Parquet.scala:18) finished in 4.222 s //18/01/11 13:09:00 INFO DAGScheduler: Job 0 finished: sql at Parquet.scala:18, took 6.652536 s //18/01/11 13:09:06 INFO BlockManagerInfo: Removed broadcast_0_piece0 on 172.18.3.4:53778 in memory (size: 22.1 KB, free: 339.6 MB) //18/01/11 13:09:06 INFO FileSourceStrategy: Pruning directories with: //18/01/11 13:09:06 INFO FileSourceStrategy: Post-Scan Filters: //18/01/11 13:09:06 INFO FileSourceStrategy: Output Data Schema: struct<name: string, favorite_color: string, favorite_numbers: array<int> ... 1 more fields> //18/01/11 13:09:06 INFO FileSourceScanExec: Pushed Filters: //18/01/11 13:09:08 INFO CodeGenerator: Code generated in 451.653537 ms //18/01/11 13:09:08 INFO CodeGenerator: Code generated in 20.894987 ms //18/01/11 13:09:08 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 225.1 KB, free 339.4 MB) //18/01/11 13:09:08 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 21.3 KB, free 339.4 MB) //18/01/11 13:09:08 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 172.18.3.4:53778 (size: 21.3 KB, free: 339.6 MB) //18/01/11 13:09:08 INFO SparkContext: Created broadcast 1 from show at Parquet.scala:18 //18/01/11 13:09:08 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 bytes, open cost is considered as scanning 4194304 bytes. 
//18/01/11 13:09:08 INFO SparkContext: Starting job: show at Parquet.scala:18 //18/01/11 13:09:08 INFO DAGScheduler: Got job 1 (show at Parquet.scala:18) with 1 output partitions //18/01/11 13:09:08 INFO DAGScheduler: Final stage: ResultStage 1 (show at Parquet.scala:18) //18/01/11 13:09:08 INFO DAGScheduler: Parents of final stage: List() //18/01/11 13:09:08 INFO DAGScheduler: Missing parents: List() //18/01/11 13:09:08 INFO DAGScheduler: Submitting ResultStage 1 (MapPartitionsRDD[4] at show at Parquet.scala:18), which has no missing parents //18/01/11 13:09:08 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 8.8 KB, free 339.4 MB) //18/01/11 13:09:08 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 4.5 KB, free 339.3 MB) //18/01/11 13:09:08 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on 172.18.3.4:53778 (size: 4.5 KB, free: 339.6 MB) //18/01/11 13:09:08 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1006 //18/01/11 13:09:08 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (MapPartitionsRDD[4] at show at Parquet.scala:18) (first 15 tasks are for partitions Vector(0)) //18/01/11 13:09:08 INFO TaskSchedulerImpl: Adding task set 1.0 with 1 tasks //18/01/11 13:09:08 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, localhost, executor driver, partition 0, PROCESS_LOCAL, 5323 bytes) //18/01/11 13:09:08 INFO Executor: Running task 0.0 in stage 1.0 (TID 1) //18/01/11 13:09:08 INFO FileScanRDD: Reading File path: file:///E:/bigdata/spark-2.2.1-bin-hadoop2.6/examples/src/main/resources/users.parquet, range: 0-615, partition values: [empty row] //18/01/11 13:09:09 INFO ParquetReadSupport: Going to read the following fields from the Parquet file: // //Parquet form: //message spark_schema { // required binary name (UTF8); // optional binary favorite_color (UTF8); // required group favorite_numbers (LIST) { // repeated int32 array; // } //} // 
//Catalyst form: //StructType(StructField(name,StringType,true), StructField(favorite_color,StringType,true), StructField(favorite_numbers,ArrayType(IntegerType,true),true)) // //18/01/11 13:09:09 INFO CodeGenerator: Code generated in 13.695856 ms //18/01/11 13:09:09 INFO InternalParquetRecordReader: RecordReader initialized will read a total of 2 records. //18/01/11 13:09:09 INFO InternalParquetRecordReader: at row 0. reading next block //18/01/11 13:09:09 INFO CodecPool: Got brand-new decompressor [.snappy] //18/01/11 13:09:09 INFO InternalParquetRecordReader: block read in memory in 347 ms. row count = 2 //18/01/11 13:09:09 INFO Executor: Finished task 0.0 in stage 1.0 (TID 1). 1279 bytes result sent to driver //18/01/11 13:09:09 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 713 ms on localhost (executor driver) (1/1) //18/01/11 13:09:09 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool //18/01/11 13:09:09 INFO DAGScheduler: ResultStage 1 (show at Parquet.scala:18) finished in 0.712 s //18/01/11 13:09:09 INFO DAGScheduler: Job 1 finished: show at Parquet.scala:18, took 0.760571 s //+------+--------------+----------------+ //| name|favorite_color|favorite_numbers| //+------+--------------+----------------+ //|Alyssa| null| [3, 9, 15, 20]| //| Ben| red| []| //+------+--------------+----------------+ // //18/01/11 13:09:09 INFO SparkContext: Invoking stop() from shutdown hook //18/01/11 13:09:09 INFO SparkUI: Stopped Spark web UI at http://172.18.3.4:4041 //18/01/11 13:09:09 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped! //18/01/11 13:09:10 INFO MemoryStore: MemoryStore cleared //18/01/11 13:09:10 INFO BlockManager: BlockManager stopped //18/01/11 13:09:10 INFO BlockManagerMaster: BlockManagerMaster stopped //18/01/11 13:09:10 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped! 
//18/01/11 13:09:10 INFO SparkContext: Successfully stopped SparkContext
//18/01/11 13:09:10 INFO ShutdownHookManager: Shutdown hook called
//18/01/11 13:09:10 INFO ShutdownHookManager: Deleting directory C:\Users\fangping\AppData\Local\Temp\spark-300f3c5b-8383-4972-b9e4-bbd55a0407b7
//