import org.apache.spark.sql.SparkSession

object ParquetFileTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("ParquetFileTest")
      .getOrCreate()

    // Assumption: BASE_PATH points at the test-data directory; adjust it for your environment.
    val BASE_PATH = "file:///tmp/spark-test"

    // 1: Convert the JSON data into Parquet files
    val df = spark.read.json(s"${BASE_PATH}/people.json")
    df.show()

    // Supported codecs include gzip, lzo and snappy
    df.write.option("compression", "snappy").parquet(s"${BASE_PATH}/parquet")

    // 2: Read the Parquet files back
    val parquetDF = spark.read.parquet(s"${BASE_PATH}/parquet")
    parquetDF.show()

    // 3: Parquet schema merge
    // Schema merging can also be enabled globally with spark.sql.parquet.mergeSchema = true
    df.toDF("age", "first_name").write.parquet(s"${BASE_PATH}/parquet_schema_change")
    val changedDF = spark.read.parquet(s"${BASE_PATH}/parquet_schema_change")
    changedDF.show()

    val schemaMergeDF = spark.read.option("mergeSchema", "true")
      .parquet(s"${BASE_PATH}/parquet", s"${BASE_PATH}/parquet_schema_change")
    schemaMergeDF.show()

    spark.stop()
  }
}
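Assuming people.json is the standard Spark example file (records carrying name and age fields, so the JSON read infers the columns age and name), the merged read returns the union of the two layouts: schemaMergeDF should contain age, name, and first_name, with null filled in for columns a given file does not have. Without mergeSchema, Spark typically takes the schema from a single footer, so the extra column would silently be dropped.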
import org.apache.spark.sql.SparkSession

object OrcFileTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("OrcFileTest")
      .getOrCreate()

    // Assumption: BASE_PATH points at the test-data directory; adjust it for your environment.
    val BASE_PATH = "file:///tmp/spark-test"

    // 1: Convert the JSON data into ORC files, writing with snappy compression
    val df = spark.read.json(s"${BASE_PATH}/people.json")
    df.show()

    df.write.option("compression", "snappy").orc(s"${BASE_PATH}/orc")

    // 2: Read the ORC files back
    val orcFileDF = spark.read.orc(s"${BASE_PATH}/orc")
    orcFileDF.show()

    spark.stop()
  }
}
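Both examples set the codec per write with option("compression", ...); in recent Spark versions the same thing can be configured session-wide through spark.sql.parquet.compression.codec and spark.sql.orc.compression.codec, and the per-write option takes precedence when both are set.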