• R语言包_dplyr_1


    有5个基础的函数: 
    - filter 
    - select 
    - arrange 
    - mutate 
    - summarise 
    - group_by (plus)

    可以和databases以及data tables中的数据打交道。

    plyr包的特点

    其基础函数有以下特点:

    1. 第一个参数df
    2. 返回df
    3. 没有数据更改in place

    正是因为有这些特点,才可以使用%>%操作符,方便逻辑式编程。

    载入数据

    library(plyr)
    library(dplyr)
    
    # load packages
    suppressMessages(library(dplyr))
    install.packages("hflights")
    library(hflights)
    # explore data
    data(hflights)
    head(hflights)
    # convert to local data frame
    flights <- tbl_df(hflights)
    # printing only shows 10 rows and as many columns as can fit on your screen
    flights
    # you can specify that you want to see more rows
    print(flights, n=20)
    # convert to a normal data frame to see all of the columns
    data.frame(head(flights))
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18

    filter

    keep rows matching criteria

    # base R approach to view all flights on January 1
    flights[flights$Month==1 & flights$DayofMonth==1, ]
    # dplyr approach
    # note: you can use comma or ampersand to represent AND condition
    filter(flights, Month==1, DayofMonth==1)
    # use pipe for OR condition
    filter(flights, UniqueCarrier=="AA" | UniqueCarrier=="UA")
    # you can also use %in% operator
    filter(flights, UniqueCarrier %in% c("AA", "UA"))
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

    select

    pick columns by name

    # base R approach to select DepTime, ArrTime, and FlightNum columns
    flights[, c("DepTime", "ArrTime", "FlightNum")]
    # dplyr approach
    select(flights, DepTime, ArrTime, FlightNum)
    # use colon to select multiple contiguous columns, and use `contains` to match columns by name
    # note: `starts_with`, `ends_with`, and `matches` (for regular expressions) can also be used to match columns by name
    select(flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8

    “chaining” or “pipelining”

    # nesting method to select UniqueCarrier and DepDelay columns and filter for delays over 60 minutes
    filter(select(flights, UniqueCarrier, DepDelay), DepDelay > 60)
    # chaining method
    flights %>%
        select(UniqueCarrier, DepDelay) %>%
        filter(DepDelay > 60)
    
    # create two vectors and calculate Euclidian distance between them
    x1 <- 1:5; x2 <- 2:6
    sqrt(sum((x1-x2)^2))
    # chaining method
    (x1-x2)^2 %>% sum() %>% sqrt()
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12

    arrange

    reorder rows

    # base R approach to select UniqueCarrier and DepDelay columns and sort by DepDelay
    flights[order(flights$DepDelay), c("UniqueCarrier", "DepDelay")]
    # dplyr approach
    flights %>%
        select(UniqueCarrier, DepDelay) %>%
        arrange(DepDelay)
    # use `desc` for descending
    flights %>%
        select(UniqueCarrier, DepDelay) %>%
        arrange(desc(DepDelay))
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10

    mutate

    add new variable 
    create new variables that are functions of exciting variables 
    which is d 
    ifferent form transform

    # base R approach to create a new variable Speed (in mph)
    flights$Speed <- flights$Distance / flights$AirTime*60
    flights[, c("Distance", "AirTime", "Speed")]
    # dplyr approach (prints the new variable but does not store it)
    flights %>%
        select(Distance, AirTime) %>%
        mutate(Speed = Distance/AirTime*60)
    # store the new variable
    flights <- flights %>% mutate(Speed = Distance/AirTime*60)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10

    summarise

    reduce variables to values

    # base R approaches to calculate the average arrival delay to each destination
    head(with(flights, tapply(ArrDelay, Dest, mean, na.rm=TRUE)))
    head(aggregate(ArrDelay ~ Dest, flights, mean))
    # dplyr approach: create a table grouped by Dest, and then summarise each group by taking the mean of ArrDelay
    flights %>%
        group_by(Dest) %>%
        summarise(avg_delay = mean(ArrDelay, na.rm=TRUE))
    #summarise_each allows you to apply the same summary function to multiple columns at once
    #Note: mutate_each is also available
    # for each carrier, calculate the percentage of flights cancelled or diverted
    flights %>%
        group_by(UniqueCarrier) %>%
        summarise_each(funs(mean), Cancelled, Diverted)
    # for each carrier, calculate the minimum and maximum arrival and departure delays
    flights %>%
        group_by(UniqueCarrier) %>%
        summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Delay"))
    #Helper function n() counts the number of rows in a group
    #Helper function n_distinct(vector) counts the number of unique items in that vector
    # for each day of the year, count the total number of flights and sort in descending order
    flights %>%
        group_by(Month, DayofMonth) %>%
        summarise(flight_count = n()) %>%
        arrange(desc(flight_count))
    # rewrite more simply with the `tally` function
    flights %>%
        group_by(Month, DayofMonth) %>%
        tally(sort = TRUE)
    # for each destination, count the total number of flights and the number of distinct planes that flew there
    flights %>%
        group_by(Dest) %>%
        summarise(flight_count = n(), plane_count = n_distinct(TailNum))
    # Grouping can sometimes be useful without summarising
    # for each destination, show the number of cancelled and not cancelled flights
    flights %>%
        group_by(Dest) %>%
        select(Cancelled) %>%
        table() %>%
        head()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40

    Window Functions

    • Aggregation function (like mean) takes n inputs and returns 1 value
    • Window function takes n inputs and returns n values 
      Includes ranking and ordering functions (like min_rank), offset functions (lead and lag), and cumulative aggregates (like cummean).
    # for each carrier, calculate which two days of the year they had their longest departure delays
    # note: smallest (not largest) value is ranked as 1, so you have to use `desc` to rank by largest value
    flights %>%
        group_by(UniqueCarrier) %>%
        select(Month, DayofMonth, DepDelay) %>%
        filter(min_rank(desc(DepDelay)) <= 2) %>%
        arrange(UniqueCarrier, desc(DepDelay))
    # rewrite more simply with the `top_n` function
    flights %>%
        group_by(UniqueCarrier) %>%
        select(Month, DayofMonth, DepDelay) %>%
        top_n(2,DepDelay) %>%
        arrange(UniqueCarrier, desc(DepDelay))
    
    # for each month, calculate the number of flights and the change from the previous month
    flights %>%
        group_by(Month) %>%
        summarise(flight_count = n()) %>%
        mutate(change = flight_count - lag(flight_count))
    
    # rewrite more simply with the `tally` function
    flights %>%
        group_by(Month) %>%
        tally() %>%
        mutate(change = n - lag(n))
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25

    Other functions

    # randomly sample a fixed number of rows, without replacement
    flights %>% sample_n(5)
    
    # randomly sample a fraction of rows, with replacement
    flights %>% sample_frac(0.25, replace=TRUE)
    
    # base R approach to view the structure of an object
    str(flights)
    
    # dplyr approach: better formatting, and adapts to your screen width
    glimpse(flights)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12

    Connecting Databases

    • dplyr can connect to a database as if the data was loaded into a data frame
    • Use the same syntax for local data frames and databases
    • Only generates SELECT statements
    • Currently supports SQLite, PostgreSQL/Redshift, MySQL/MariaDB, BigQuery, MonetDB
    • Example below is based upon an SQLite database containing the hflights data
    • Instructions for creating this database are in the databases vignette
    # connect to an SQLite database containing the hflights data
    my_db <- src_sqlite("my_db.sqlite3")
    
    # connect to the "hflights" table in that database
    flights_tbl <- tbl(my_db, "hflights")
    
    # example query with our data frame
    flights %>%
        select(UniqueCarrier, DepDelay) %>%
        arrange(desc(DepDelay))
    
    # identical query using the database
    flights_tbl %>%
        select(UniqueCarrier, DepDelay) %>%
        arrange(desc(DepDelay))
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15

    You can write the SQL commands yourself 
    dplyr can tell you the SQL it plans to run and the query execution plan

    # send SQL commands to the database
    tbl(my_db, sql("SELECT * FROM hflights LIMIT 100"))
    
    # ask dplyr for the SQL commands
    flights_tbl %>%
        select(UniqueCarrier, DepDelay) %>%
        arrange(desc(DepDelay)) %>%
        explain()
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8

    参考资料

    1. justmarkham的教程1
    2. justmarkdown的教程2
     
     
  • 相关阅读:
    【转】B树以及B+树的区别
    epoll 笔记
    HTTP 请求的读取和分析
    获取指定网页的所有链接
    Python爬取表情包
    Python重命名文件夹下的文件(包含子文件夹下的文件)
    BeautifulSoup根据class的属性查找标签的方法
    群聊斗神
    Python简单的下载图片
    Python 记录一个坑
  • 原文地址:https://www.cnblogs.com/awishfullyway/p/6594618.html
Copyright © 2020-2023  润新知