• Spark SQL External Data Sources JDBC官方实现读测试


    在最新的master分支上官方提供了Spark JDBC外部数据源的实现,先尝为快。

    通过spark-shell测试

    import org.apache.spark.sql.SQLContext  
    val sqlContext  = new SQLContext(sc)
    import sqlContext._
    
    val TBLS_JDBC_DDL = s"""
    |CREATE TEMPORARY TABLE spark_tbls
    |USING org.apache.spark.sql.jdbc
    |OPTIONS (
    |  url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
    |  dbtable     'TBLS'
    |)""".stripMargin
    
    sqlContext.sql(TBLS_JDBC_DDL)

    指定列查询:

    sql("SELECT * FROM spark_tbls").collect.foreach(println)
    [1,1423100397,1,0,spark,0,1,page_views,MANAGED_TABLE,A,D] [6,1423116106,1,0,spark,0,6,order_created,MANAGED_TABLE,B,E] [7,1423116131,1,0,spark,0,7,test_load1,MANAGED_TABLE,C,F] [8,1423116145,1,0,spark,0,8,order_picked,MANAGED_TABLE,null,null] [9,1423116160,1,0,spark,0,9,order_shipped,MANAGED_TABLE,null,null] [10,1423116168,1,0,spark,0,10,order_received,MANAGED_TABLE,null,null] [11,1423116179,1,0,spark,0,11,order_cancelled,MANAGED_TABLE,null,null] [12,1423116193,1,0,spark,0,12,order_tracking,MANAGED_TABLE,null,null] [13,1423116248,1,0,spark,0,13,order_tracking_join,MANAGED_TABLE,null,null] [14,1423116298,1,0,spark,0,14,click_log,MANAGED_TABLE,null,null] [15,1423116316,1,0,spark,0,15,ad_list,MANAGED_TABLE,null,null][16,1423116324,1,0,spark,0,16,ad_list_string,MANAGED_TABLE,null,null] [17,1423116338,1,0,spark,0,17,cookie_cats,MANAGED_TABLE,null,null]

    查询表中指定列:

    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls").collect.foreach(println)
    [1,page_views,MANAGED_TABLE] [6,order_created,MANAGED_TABLE] [7,test_load1,MANAGED_TABLE] [8,order_picked,MANAGED_TABLE] [9,order_shipped,MANAGED_TABLE] [10,order_received,MANAGED_TABLE] [11,order_cancelled,MANAGED_TABLE] [12,order_tracking,MANAGED_TABLE] [13,order_tracking_join,MANAGED_TABLE] [14,click_log,MANAGED_TABLE] [15,ad_list,MANAGED_TABLE] [16,ad_list_string,MANAGED_TABLE] [17,cookie_cats,MANAGED_TABLE]

    指定查询条件查询:

    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID = 1").collect.foreach(println)
    [1,page_views,MANAGED_TABLE]
    
    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID < 7").collect.foreach(println)
    [1,page_views,MANAGED_TABLE]
    [6,order_created,MANAGED_TABLE]
    
    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID <= 7").collect.foreach(println)
    [1,page_views,MANAGED_TABLE]
    [6,order_created,MANAGED_TABLE]
    [7,test_load1,MANAGED_TABLE]
    
    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID > 7").collect.foreach(println)
    [8,order_picked,MANAGED_TABLE]
    [9,order_shipped,MANAGED_TABLE]
    [10,order_received,MANAGED_TABLE]
    [11,order_cancelled,MANAGED_TABLE]
    [12,order_tracking,MANAGED_TABLE]
    [13,order_tracking_join,MANAGED_TABLE]
    [14,click_log,MANAGED_TABLE]
    [15,ad_list,MANAGED_TABLE]
    [16,ad_list_string,MANAGED_TABLE]
    [17,cookie_cats,MANAGED_TABLE]
    
    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID >= 7").collect.foreach(println)
    [7,test_load1,MANAGED_TABLE]
    [8,order_picked,MANAGED_TABLE]
    [9,order_shipped,MANAGED_TABLE]
    [10,order_received,MANAGED_TABLE]
    [11,order_cancelled,MANAGED_TABLE]
    [12,order_tracking,MANAGED_TABLE]
    [13,order_tracking_join,MANAGED_TABLE]
    [14,click_log,MANAGED_TABLE]
    [15,ad_list,MANAGED_TABLE]
    [16,ad_list_string,MANAGED_TABLE]
    [17,cookie_cats,MANAGED_TABLE]
    
    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE VIEW_EXPANDED_TEXT IS NULL").collect.foreach(println)
    [8,order_picked,MANAGED_TABLE,null]
    [9,order_shipped,MANAGED_TABLE,null]
    [10,order_received,MANAGED_TABLE,null]
    [11,order_cancelled,MANAGED_TABLE,null]
    [12,order_tracking,MANAGED_TABLE,null]
    [13,order_tracking_join,MANAGED_TABLE,null]
    [14,click_log,MANAGED_TABLE,null]
    [15,ad_list,MANAGED_TABLE,null]
    [16,ad_list_string,MANAGED_TABLE,null]
    [17,cookie_cats,MANAGED_TABLE,null]
    
    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE VIEW_EXPANDED_TEXT IS NOT NULL").collect.foreach(println)
    [1,page_views,MANAGED_TABLE,A]
    [6,order_created,MANAGED_TABLE,B]
    [7,test_load1,MANAGED_TABLE,C]
    
    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE TBL_ID>=7 AND TBL_ID <=10").collect.foreach(println)
    [7,test_load1,MANAGED_TABLE,C]
    [8,order_picked,MANAGED_TABLE,null]
    [9,order_shipped,MANAGED_TABLE,null]
    [10,order_received,MANAGED_TABLE,null]

    多partition并行执行: 可以通过http://hadoop000:4040/jobs/的tasks数查看

    val TBLS_PARTS_JDBC_DDL = s"""
    |CREATE TEMPORARY TABLE spark_tbls_parts
    |USING org.apache.spark.sql.jdbc
    |OPTIONS (
    |  url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
    |  dbtable     'TBLS',
    |  partitionColumn 'TBL_ID', 
    |  lowerBound '1', 
    |  upperBound '50', 
    |  numPartitions '3'
    |)""".stripMargin
    
    sqlContext.sql(TBLS_PARTS_JDBC_DDL)
    sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls_parts WHERE VIEW_EXPANDED_TEXT IS NULL").collect.foreach(println)
    [8,order_picked,MANAGED_TABLE,null] [9,order_shipped,MANAGED_TABLE,null] [10,order_received,MANAGED_TABLE,null] [11,order_cancelled,MANAGED_TABLE,null] [12,order_tracking,MANAGED_TABLE,null] [13,order_tracking_join,MANAGED_TABLE,null] [14,click_log,MANAGED_TABLE,null] [15,ad_list,MANAGED_TABLE,null] [16,ad_list_string,MANAGED_TABLE,null] [17,cookie_cats,MANAGED_TABLE,null] [21,emp,MANAGED_TABLE,null] [22,dept,MANAGED_TABLE,null]

    多表关联查询:

    val COLUMNS_V2_JDBC_DDL = s"""
    |CREATE TEMPORARY TABLE spark_column_v2
    |USING org.apache.spark.sql.jdbc
    |OPTIONS (
    |  url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
    |  dbtable     'COLUMNS_V2'
    |)""".stripMargin
    
    sqlContext.sql(COLUMNS_V2_JDBC_DDL)
    sql("SELECT CD_ID, COLUMN_NAME FROM spark_column_v2").collect.foreach(println)
    [1,city_id] [1,end_user_id] [1,ip] [1,referer] [1,session_id] [1,track_time] [1,url] [6,event_time] [6,ordernumber] [7,id] [7,name] [8,event_time] [8,ordernumber] [9,event_time] [9,ordernumber] [10,event_time] [10,ordernumber] [11,event_time] [11,ordernumber] [12,order_cancelled_ts] [12,order_created_ts] [12,order_picked_ts] [12,order_received_ts] [12,order_shipped_ts] [12,ordernumber] [13,order_cancelled_ts] [13,order_created_ts] [13,order_picked_ts] [13,order_received_ts] [13,order_shipped_ts] [13,ordernumber] [14,ad_id] [14,cookie_id] [14,ts] [15,ad_id] [15,catalogs] [15,url] [16,ad_id] [16,catalogs] [16,url] [17,catalog] [17,cookie_id] [17,weight] [21,comm] [21,deptno] [21,empno] [21,ename] [21,hiredate] [21,job] [21,mgr] [21,sal] [22,deptno] [22,dname] [22,loc]

    sql("SELECT a.TBL_ID, a.TBL_NAME, a.TBL_TYPE, b.CD_ID, b.COLUMN_NAME FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID WHERE a.TBL_ID = 1").collect.foreach(println)
    [1,page_views,MANAGED_TABLE,1,city_id] [1,page_views,MANAGED_TABLE,1,end_user_id] [1,page_views,MANAGED_TABLE,1,ip] [1,page_views,MANAGED_TABLE,1,referer] [1,page_views,MANAGED_TABLE,1,session_id] [1,page_views,MANAGED_TABLE,1,track_time] [1,page_views,MANAGED_TABLE,1,url]

    sql("SELECT a.TBL_ID, COUNT(b.CD_ID) FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID GROUP BY a.TBL_ID").collect.foreach(println)
    [1,7] [6,2] [7,2] [8,2] [9,2] [10,2] [11,2] [12,6] [13,6] [14,3] [15,3] [16,3] [17,3] [21,8] [22,3]

    通过spark-sql测试

    CREATE TEMPORARY TABLE spark_tbls
    USING org.apache.spark.sql.jdbc
    OPTIONS (
    url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
    dbtable     'TBLS'
    );
    SELECT * FROM spark_tbls;
    
    CREATE TEMPORARY TABLE spark_tbls_parts
    USING org.apache.spark.sql.jdbc
    OPTIONS (
    url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
    dbtable     'TBLS',
    partitionColumn 'TBL_ID',
    lowerBound '1',
    upperBound '50',
    numPartitions '3'
    );
    SELECT * FROM spark_tbls_parts;
    CREATE TEMPORARY TABLE spark_column_v2
    USING org.apache.spark.sql.jdbc
    OPTIONS (
    url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
    dbtable     'COLUMNS_V2'
    );
    SELECT * FROM spark_column_v2;
    SELECT a.TBL_ID, a.TBL_NAME, a.TBL_TYPE, b.CD_ID, b.COLUMN_NAME FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID WHERE a.TBL_ID = 1;
  • 相关阅读:
    3.5 操作系统习题
    04_jni开发常见错误_本地方法没有找到
    3.4 目录和spooling
    3.3 作业管理
    03_jni_helloworld_完成
    ASP.NET MVC 4 过滤器(Authorize)
    MVC实现文件流打包成压缩包
    MVC实现文件流打包成压缩包
    MVC实现文件流打包成压缩包
    图解分布式架构的演进过程!
  • 原文地址:https://www.cnblogs.com/luogankun/p/4274959.html
Copyright © 2020-2023  润新知