• Spark工程开发常用函数与方法(Scala语言)


    import org.apache.spark.{SparkContext, SparkConf}
    import org.apache.spark.sql.{SaveMode, DataFrame}
    import scala.collection.mutable.ArrayBuffer
    import main.asiainfo.coc.tools.Configure
    import org.apache.spark.sql.hive.HiveContext
    import java.sql.DriverManager
    import java.sql.Connection

     1 连接前台数据源 查询前台MYSQL中的数据

    val DIM_COC_INDEX_INFO_DDL = s"""
    CREATE TEMPORARY TABLE DIM_COC_INDEX_INFO
    USING org.apache.spark.sql.jdbc
    OPTIONS (
    url '${mySQLUrl}',
    dbtable 'DIM_COC_INDEX_INFO'
    )""".stripMargin
    
    sqlContext.sql(DIM_COC_INDEX_INFO_DDL)
    val DIM_COC_INDEX_INFO = sql("SELECT * FROM DIM_COC_INDEX_INFO").cache()
    

      

    2   在A表中筛选出 B表中获取的TARGET_TABLE_CODE 然后再按照DATA_SRC_CODE排序,查询出源表的集合

    val sources = DIM_COC_INDEX_INFO.filter("TARGET_TABLE_CODE ='"+TARGET_TABLE_CODE+"'")
            .select("DATA_SRC_CODE").groupBy("DATA_SRC_CODE").agg(DIM_COC_INDEX_INFO("DATA_SRC_CODE")).collect
    

    3 将表进行关联

    resultIndexTableDF = resultIndexTableDF.join(SOURCE_TABLE,ALL_USERS.col(ALL_USER_JOIN_COLUMN_NAME) === SOURCE_TABLE.col(SOURCE_TABLE_JOIN_COLUMN_NAME),"left_outer")
    resultIndexTableDF.dtypes.foreach(println)
    

    4 根据条件筛选

    val labels = CI_MDA_SYS_TABLE.join(CI_MDA_SYS_TABLE_COLUMN,CI_MDA_SYS_TABLE("TABLE_ID") === CI_MDA_SYS_TABLE_COLUMN("TABLE_ID"),"inner")
          .join(CI_LABEL_EXT_INFO,CI_MDA_SYS_TABLE_COLUMN("COLUMN_ID") === CI_LABEL_EXT_INFO("COLUMN_ID"),"inner")
          .join(CI_LABEL_INFO,CI_LABEL_EXT_INFO("LABEL_ID") === CI_LABEL_INFO("LABEL_ID"),"inner")
          .join(CI_APPROVE_STATUS,CI_LABEL_INFO("LABEL_ID") === CI_APPROVE_STATUS("RESOURCE_ID"),"inner")
          .filter(CI_APPROVE_STATUS("CURR_APPROVE_STATUS_ID") === CI_APPROVE_STATUS_SUCCESS_CODE
          and (CI_LABEL_INFO("DATA_STATUS_ID") === 1 || CI_LABEL_INFO("DATA_STATUS_ID") === 2)
          and (CI_LABEL_EXT_INFO("COUNT_RULES_CODE") isNotNull  //TODO   trim.length>0
          )
          and CI_MDA_SYS_TABLE("UPDATE_CYCLE") === TABLE_DATA_CYCLE
          ).cache()
    

    5 根据某字段对表进行排序

        val labelTargetTables = labels.groupBy("CI_MDA_SYS_TABLE.TABLE_ID","CI_MDA_SYS_TABLE.TABLE_NAME").agg(labels("CI_MDA_SYS_TABLE.TABLE_ID"),labels("CI_MDA_SYS_TABLE.TABLE_NAME")).collect
    

    6 创建parquet格式的表 可使用schema.生成到指定的schema.

            sqlContext.sql("create table "+labelTargetTableName+" stored as parquet as select * from default."+labelTargetTableNameJson)
    

    7 保存数据格式,可以指定生成的格式

     resultLabelTable.saveAsTable(tableName = labelTargetTableName, source="parquet", mode=SaveMode.Overwrite)
    

    8 根据筛选查询出相应数据,由于cache方法并不属于action操作,接下来的操作需要这一步所执行的数据信息,所以这里使用collect方法,再执行遍历方法

          val r0000Labels = labelInThisTargetTable.filter("COUNT_RULES_CODE = 'R_00000'").select("CI_LABEL_INFO.LABEL_ID","COLUMN_NAME").collect
    for(r0000Label <- r0000Labels){
       ........
    }
    

      

      

      

  • 相关阅读:
    C#下载文件代码更新20070920
    判断IP地址是否合法的sql2000使用存储过程跟函数
    百万级SQL分页存储过程
    C#批量重命名文件代码的实现
    单个分页的存储过程
    阿江网站访问统计系统设计构思分析
    c#查询QQ状态是否在线查询代码
    C# public class Person
    C#禁止应用程序的多重运行
    Outlook最小化到托盘的设置方法
  • 原文地址:https://www.cnblogs.com/yangsy0915/p/4867780.html
Copyright © 2020-2023  润新知