• SparkR functions verified to work (including sorting)


    The idea behind SparkR in Spark 1.4.0: extract a small dataset (a SparkR DataFrame) from a large Spark dataset, then analyze it in R (as an R data.frame).

    These two DataFrames are different things: the former is distributed, a DF living on the cluster, so the usual R packages cannot operate on it; the latter is a local, single-machine DF on which all package functions work.

    Personally, I expect the SparkR roadmap is to port the functions from today's R packages onto the SparkR DataFrame, which would open up a whole new world.
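    All the snippets below assume a running SparkR 1.4 session with a sqlContext and a hiveContext already created. A minimal setup sketch (the master URL and app name here are assumptions):

    library(SparkR)
    sc <- sparkR.init(master = "local[2]", appName = "anjuke-demo")  # start SparkR
    sqlContext  <- sparkRSQL.init(sc)   # needed for createDataFrame and temp-table SQL
    hiveContext <- sparkRHive.init(sc)  # needed for the Hive-backed anjuke_scores queries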

    > a<- sql(hiveContext, "SELECT count(*) FROM anjuke_scores where restaurant>=10");
    
    > a<- sql(hiveContext, "SELECT * FROM anjuke_scores limit 5")
    > a
    DataFrame[city:string, housingname:string, ori_traffic_score:int, ori_traffic_score_normal:double, metro_station:double, metro_station_normal:double,...
    > first(a)  # show the first row of the Formal DataFrame
     

    > head(a) ;  # list the first 6 rows of a
    > columns(a)      # list all the column names
    [1] "city"                      "housingname"               "ori_traffic_score"         "ori_traffic_score_normal"

    [5] "metro_station"             "metro_station_normal"      "bus_station"               "bus_station_normal"  ...

    > showDF(a)
    > b<-filter(a, a$ori_comfort>8); # row filtering; ori_comfort_normal: double
    
    > print(a);    # print column names and types
    DataFrame[city:string, housingname:string, ori_traffic_score:int, ......

    > printSchema(a); # print the schema as a tree of column names and types
    root
     |-- city: string (nullable = true)
     |-- housingname: string (nullable = true)
     |-- ori_traffic_score: integer (nullable = true)
     |-- ori_traffic_score_normal: double (nullable = true)
     |-- metro_station: double (nullable = true)

    > take(a,10)   ;  # take the first num rows of a Formal class DataFrame and return them as an ordinary R data frame; take(x, num)

         city                  housingname ori_traffic_score ori_traffic_score_normal metro_station metro_station_normal
    1  x9a                   xddrwxb8                NA                        0            NA                    0
    2  x9a         xe4xf04u03a221~                NA                        0            NA                    0
    3  x9a                xf6xe3wxb8                NA                        0            NA                    0
    4  x9a               x8e=xb0wxb8                NA                        0            NA                    0
    5  x9a x9axe4xf04xcexe4xf0~                NA                        0            NA                    0
    6  x9a                      q4xfdE                NA                        0            NA                    0
    7  x9a                xe4xf04xce                NA                        0            NA                    0
    8  x9a                      )xfdVT                NA                        0            NA                    0
    9  x9a                       q177V                NA                        0            NA                    0
    10 x9a           xe4xf04xceWxb8                NA                        0            NA                    0

    > b<-take(a,10)
    > dim(b)
    [1] 10 41
    > aa <- withColumn(a, "ori_comfort_aa", a$ori_comfort * 5)   # derive a new column from an existing one; adds ori_comfort_aa, and the result is still a Formal DataFrame
    > printSchema(aa)
    root
     |-- city: string (nullable = true)
    .........
     |-- comfort_normal: double (nullable = true)
     |-- ori_comfort_aa: double (nullable = true)

    > aa <- mutate(a, newCol1 = a$commerce_normal * 5, newCol2 = a$bank_normal * 2) ;   # similar to withColumn
    > printSchema(aa)
    root
     |-- city: string (nullable = true)
     .........
     |-- comfort_normal: double (nullable = true)
     |-- newCol1: double (nullable = true)
     |-- newCol2: double (nullable = true)

    a1<-arrange(a,asc(a$level_tow));  # sort by a column: asc = ascending, desc = descending

    a1<-orderBy(a,asc(a$level_tow));  # sort by a column (same behavior as arrange); see the multi-key sketch below
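    Several sort keys can be passed at once; a sketch (this particular column combination is illustrative):

    a2 <- arrange(a, asc(a$city), desc(a$ori_traffic_score))  # city ascending, then score descending
    head(a2)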

    count(a) ;  # count how many rows the Formal DataFrame has

    > dtypes(a);  # list all column names and types of the Formal DataFrame, as a list
    [[1]]
    [1] "city"   "string"
    
    [[2]]
    [1] "housingname" "string"  
    
    
    > a<-withColumnRenamed(a,"comfort_normal","AA");  # rename a column
    > printSchema(a)
    root
     |-- city: string (nullable = true)
     |-- housingname: string (nullable = true)
    ..........
     |-- AA: double (nullable = true)

    createDataFrame: the function for creating a SparkR DataFrame

    > df<-createDataFrame(sqlContext,a.df);  # a.df is an R data.frame, df is a SparkR DataFrame; note: creating a SparkR DataFrame requires sqlContext

    > str(a.df)
    'data.frame':    5 obs. of  41 variables:

    > str(df)
    Formal class 'DataFrame' [package "SparkR"] with 2 slots
      ..@ env:<environment: 0x4fce350> 
      ..@ sdf:Class 'jobj' <environment: 0x4fc70b0> 
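    For a self-contained illustration of the round trip, here is a sketch with a hypothetical toy data.frame (toy.df and its columns are made up, not the 41-column a.df above):

    toy.df <- data.frame(city = c("SH", "BJ"), score = c(8.5, 9.1))  # ordinary R data.frame
    sdf <- createDataFrame(sqlContext, toy.df)  # distributed SparkR DataFrame
    printSchema(sdf)
    collect(sdf)  # coerce back into an R data.frame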

    > destDF <- select(SFO_DF, "dest", "cancelled");  # select columns

    > showDF(destDF);   # display the SparkR DF
    +----+---------+
    |dest|cancelled|
    +----+---------+
    | SFO|        0|
    ................

    > registerTempTable(SFO_DF, "flightsTable");  # to run SQL against a Spark DF, first register it as a table
     
    > wa <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable"); # run the SQL statement under sqlContext

    > showDF(wa);   # the query result is still a Spark DF
    +----+---------+
    |dest|cancelled|
    +----+---------+
    | SFO|        0|
    ................
    > local_df <- collect(wa);   # convert the Spark DF into an R data.frame
    > str(local_df)
    'data.frame':    2818 obs. of  2 variables:
     $ dest     : chr  "SFO" "SFO" "SFO" "SFO" ...
     $ cancelled: int  0 0 0 0 0 0 0 0 0 0 ...

    > wa<-flights_df[1:1000,];   # wa is an ordinary R data.frame
    > flightsDF<-createDataFrame(sqlContext,wa) ;   # flightsDF is a SparkR DF
    > library(magrittr); # the pipe package also works on SparkR DFs
    > groupBy(flightsDF, flightsDF$date) %>%
    +     summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay)) -> dailyDelayDF;  # note: the syntax differs a little from dplyr's; the result is still a SparkR DF

    > str(dailyDelayDF)
    Formal class 'DataFrame' [package "SparkR"] with 2 slots
      ..@ env:<environment: 0x4cd3118> 
      ..@ sdf:Class 'jobj' <environment: 0x4cd6968> 
    > showDF(dailyDelayDF)
    +----------+--------------------+--------------------+
    |      date|      AVG(dep_delay)|      AVG(arr_delay)|
    +----------+--------------------+--------------------+
    |2011-01-01|                 5.2|                 5.8|
    |2011-01-02|  1.8333333333333333|                -2.0|
    ................
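    For reference, the same aggregation can also be written as plain nested SparkR calls, without magrittr (a sketch; dailyDelayDF2 is just an illustrative name):

    dailyDelayDF2 <- summarize(groupBy(flightsDF, flightsDF$date),
                               avg(flightsDF$dep_delay),
                               avg(flightsDF$arr_delay))
    showDF(dailyDelayDF2)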

    (The following was run on machine 39.)

    collect converts a Spark DF into an R data.frame.
    Collects all the elements of a Spark DataFrame and coerces them into an R data.frame.
    collect(x, stringsAsFactors = FALSE), where x is a SparkSQL DataFrame.
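    The stringsAsFactors argument controls whether character columns are turned into factors in the local result; a quick sketch reusing wa from the flights example above:

    local_chr <- collect(wa)                           # character columns stay character
    local_fac <- collect(wa, stringsAsFactors = TRUE)  # character columns become factors
    str(local_fac$dest)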

    > dist_df<- sql(hiveContext, "SELECT * FROM anjuke_scores where restaurant<=1");
    > local_df <- dist_df %>% 
          groupBy(dist_df$city) %>% 
          summarize(count = n(dist_df$housingname)) %>% 
          collect
    > local_df
               city count
    1         x9a     5
    2         8xde     7
    3      xf0xde     2
    ..........
    ..........

    take can also convert a Spark DF into an R data.frame.
    Take the first num rows of a DataFrame and return the results as a data.frame.
    take(x, num)


    > local_df <- dist_df %>% 
          groupBy(dist_df$city) %>% 
          summarize(count = n(dist_df$housingname))
    > a<-take(local_df,100)
    [Stage 16:=========================================>            (154 + 1) / 199]
    > View(a)
    > a
               city count
    1         x9a     5
    2         8xde     7
    3      xf0xde     2
    ..........
    ..........

    Functions that do not work:

    > describe(a)
    Error in x[present, drop = FALSE] : 
      object of type 'S4' is not subsettable
    > jfkDF <- filter(flightsDF, flightsDF$dest == "DFW")
    Error in filter(flightsDF, flightsDF$dest == "DFW") : 
      no method for coercing this S4 class to a vector
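    The filter error looks like namespace masking: once certain packages are attached, base R's stats::filter can shadow SparkR's filter, and dispatching the wrong generic yields exactly this S4 coercion error. A possible workaround, untested here and offered as an assumption, is to qualify the call:

    jfkDF <- SparkR::filter(flightsDF, flightsDF$dest == "DFW")  # bypass the masked generic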

     

• Original article: https://www.cnblogs.com/awishfullyway/p/6645790.html