• 算子:sample(false, 0.1) 抽样数据(第一个参数 withReplacement = false 表示无放回抽样,第二个参数 fraction = 0.1 表示每条记录约以 10% 的概率被抽中)


    抽样示例操作(在 spark-shell 中先对 objectid 数据抽样约 10%,再用 countByKey 统计各 key 在样本中的出现次数):

    scala> import org.apache.spark.sql.hive.HiveContext
    import org.apache.spark.sql.hive.HiveContext
    
    scala> val hiveContext = new HiveContext(sc)
    17/11/07 17:19:36 WARN SessionState: load mapred-default.xml, HIVE_CONF_DIR env not found!
    17/11/07 17:19:37 WARN SessionState: load mapred-default.xml, HIVE_CONF_DIR env not found!
    hiveContext: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@14cc2fdd
    
    scala> hiveContext.sql("use my_hive_db")
    17/11/07 17:19:40 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
    17/11/07 17:19:40 WARN UserGroupInformation: No groups available for user acount_rc
    res20: org.apache.spark.sql.DataFrame = [result: string]
    
    scala> val sampledPairs = hiveContext.sql("select objectid from myobjectid")
        .map(s=>(s.getAs[String]("objectid"),1))
        .sample(false, 0.1)
    17/11/07 17:19:40 WARN UserGroupInformation: No groups available for user acount_rc
    17/11/07 17:19:40 WARN UserGroupInformation: No groups available for user acount_rc
    sampledPairs: org.apache.spark.rdd.RDD[(String, Int)] = PartitionwiseSampledRDD[1059] at sample at <console>:32
    
    scala> val sampledWordCounts = sampledPairs.countByKey
    sampledWordCounts: scala.collection.Map[String,Long] = Map(193700355 -> 32348, 101549569 -> 81388, 100890370 -> 66425, 184703237 -> 60943, 
    184563457 -> 77401, 100692995 -> 55021, 184756482 -> 88707, 193611009 -> 1588, 185257985 -> 16457, 190035714 -> 14209, 153225089 -> 41515, 
    100811782 -> 115963, 100782849 -> 54729, 184581890 -> 70271, 185388291 -> 76225, 185278978 -> 40917, 80085891 -> 66957, 184957442 -> 59129, 
    153127554 -> 146, 101362179 -> 18600, 193658626 -> 48758, 79805058 -> 17477, 101623810 -> 263451, 184637699 -> 23640, 185363457 -> 24341, 
    153561730 -> 19010, 184722690 -> 2516, 79906177 -> 21106, 193805313 -> 78224, 184739585 -> 34405, 101342210 -> 60860, 193511427 -> 77125, 
    101244675 -> 624, 80425606 -> 12167, 189870594 -> 6944, 101441025 -> 39970, 185549825 -> 322, 101125633...
    
    scala> sampledWordCounts.foreach(println(_))
    (193700355,32348)
    (101549569,81388)
    (100890370,66425)
    (184703237,60943)
    (184563457,77401)
    (100692995,55021)
    (184756482,88707)
    (193611009,1588)
    (185257985,16457)
    (190035714,14209)
    (153225089,41515)
    (100811782,115963)
    (100782849,54729)
    (184581890,70271)
  • 相关阅读:
    makefile编写---单个子目录编译自动变量模板ok
    任务22:课程介绍 & 任务23:Http请求的处理过程
    任务20:DI初始化的源码解读 & 任务21:依赖注入的使用
    任务19:单元测试
    任务18:控制反转
    任务17:从UML角度来理解依赖
    任务16:介绍-
    任务15:配置框架设计浅析
    任务14:配置的热更新
    任务13:在Core Mvc中使用Options
  • 原文地址:https://www.cnblogs.com/yy3b2007com/p/7800749.html
Copyright © 2020-2023  润新知