R language — efficient operation data frame (dplyr package) (3)

Time:2022-1-4

This paper is the third in the R language – efficient operation data frame (dplyr package) series. It mainly introduces the common data operations such as data frame connection operation, data frame collection operation and data frame grouping calculation.

Data frame connection join

Join series functions are used to connect two data frames.

#Data frame
df1 <- tibble(id=1:2,v1=c("a1","a2"))
df2 <- tibble(id=2:4,v2=c("b1","b2","b3"))

#Inner connection
inner_join(df1,df2)
## # A tibble: 1 x 3
##      id v1    v2   
##   <int> <chr> <chr>
## 1     2 a2    b1
#Left connection
left_join(df1,df2)
## # A tibble: 2 x 3
##      id v1    v2   
##   <int> <chr> <chr>
## 1     1 a1    <NA> 
## 2     2 a2    b1
#Right connection
right_join(df1,df2)
## # A tibble: 3 x 3
##      id v1    v2   
##   <int> <chr> <chr>
## 1     2 a2    b1   
## 2     3 <NA>  b2   
## 3     4 <NA>  b3
#Full connection
full_join(df1,df2)
## # A tibble: 4 x 3
##      id v1    v2   
##   <int> <chr> <chr>
## 1     1 a1    <NA> 
## 2     2 a2    b1   
## 3     3 <NA>  b2   
## 4     4 <NA>  b3
#Filter left connection
semi_ Join (DF1, df2) # keep the connection part
## # A tibble: 1 x 2
##      id v1   
##   <int> <chr>
## 1     2 a2
anti_ Join (DF1, df2) # delete the connection part
## # A tibble: 1 x 2
##      id v1   
##   <int> <chr>
## 1     1 a1
#List connection
nest_join(df1,df2)
## # A tibble: 2 x 3
##      id v1    df2             
## * <int> <chr> <list>          
## 1     1 a1    <tibble [0 × 1]>
## 2     2 a2    <tibble [1 × 1]>
nest_ Join (DF1, df2)% >% tidyr:: unnest (cols = 3) # turn to inner join
## # A tibble: 1 x 3
##      id v1    v2   
##   <int> <chr> <chr>
## 1     2 a2    b1
#Specify column name connection
#Have the same column
inner_ Join (DF1, df2, by = "Id") # eliminates the prompt message
## # A tibble: 1 x 3
##      id v1    v2   
##   <int> <chr> <chr>
## 1     2 a2    b1
#No identical columns
df3 <- tibble(index=2:4,v2=c("b1","b2","b3"))
inner_join(df1,df3,by=c("id"="index"))
## # A tibble: 1 x 3
##      id v1    v2   
##   <int> <chr> <chr>
## 1     2 a2    b1

Data frame collection operation intersect / Union / setdiff

Calculate the intersection, union and complement of two data frames.

#Data frame
tbl_df1 <- tibble(var1=1:3,var2=5:7)
tbl_df2 <- tibble(var1=2:4,var2=6:8)
#Find intersection
intersect(tbl_df1,tbl_df2)
## # A tibble: 2 x 2
##    var1  var2
##   <int> <int>
## 1     2     6
## 2     3     7
#Union set
union(tbl_df1,tbl_df2)
## # A tibble: 4 x 2
##    var1  var2
##   <int> <int>
## 1     1     5
## 2     2     6
## 3     3     7
## 4     4     8
#Union (do not delete intersection)
union_all(tbl_df1,tbl_df2)
## # A tibble: 6 x 2
##    var1  var2
##   <int> <int>
## 1     1     5
## 2     2     6
## 3     3     7
## 4     2     6
## 5     3     7
## 6     4     8
#Complement (tbl#u the complement of DF1)
setdiff(tbl_df1,tbl_df2)
## # A tibble: 1 x 2
##    var1  var2
##   <int> <int>
## 1     1     5
#Judge whether the quantity data frame is the same true
setequal(tbl_df1,tbl_df1[3:1,])
## [1] TRUE

Data grouping calculation group_ by+summarise

Data frame grouping and calculation by label column is a common scenario in data analysis.group_by+sumariseThe combination of functions can flexibly meet the needs of various grouping calculations.

Single function application single column

A single function is applied to a single column of grouped data for calculation. Can be insummariseSpecify the new calculated column name directly in.

#Data
tbl_df <- tibble(var1=1:4,var2=2:5,var3=3:6,label=c("a","b","a","b"))
#Common way
tbl_df %>% 
  group_by(label) %>% 
  Summarize (mean = mean (VAR1)) # computes a column and specifies the column name
## # A tibble: 2 x 2
##   label  mean
##   <chr> <dbl>
## 1 a         2
## 2 b         3

Single function application multi column

A single function is applied to multiple columns of grouped data for batch calculation. When usedsummarise_at、summarize_all、summarise_ifIf the calculation column needs to be renamed during function, the calculation function needs to adoptlistAnd the mode for calculating column names will beOriginal column name + suffixForm of.

#Common way
tbl_df %>% 
  group_by(label) %>% 
  Summarize (mean1 = mean (VAR1), mean2 = mean (var2)) # computes the column name and specifies the column name
## # A tibble: 2 x 3
##   label mean1 mean2
##   <chr> <dbl> <dbl>
## 1 a         2     3
## 2 b         3     4
#Specify multiple columns (calculated column has no column name)
tbl_df %>% 
  group_by(label) %>% 
  summarise_at(c("var1","var2"),mean)
## # A tibble: 2 x 3
##   label  var1  var2
##   <chr> <dbl> <dbl>
## 1 a         2     3
## 2 b         3     4
#Specify multiple columns (calculated column rename)
tbl_df %>% 
  group_by(label) %>% 
  summarise_at(c("var1","var2"),list(mean=mean))
## # A tibble: 2 x 3
##   label var1_mean var2_mean
##   <chr>     <dbl>     <dbl>
## 1 a             2         3
## 2 b             3         4
#All columns
tbl_df %>% 
  group_by(label) %>% 
  summarise_ All (list (mean = ~ mean (., Na. RM = t))) # calculate column rename + anonymous function
## # A tibble: 2 x 4
##   label var1_mean var2_mean var3_mean
##   <chr>     <dbl>     <dbl>     <dbl>
## 1 a             2         3         4
## 2 b             3         4         5
#Condition column
tbl_df %>% 
  group_by(label) %>% 
  summarise_if(is.numeric,list(mean=mean))
## # A tibble: 2 x 4
##   label var1_mean var2_mean var3_mean
##   <chr>     <dbl>     <dbl>     <dbl>
## 1 a             2         3         4
## 2 b             3         4         5

Apply multiple functions to a single column

Multiple different functions are applied to the same column of grouped data.

#Common way
tbl_df %>% 
  group_by(label) %>% 
  Summarize (mean = mean (VAR1), max = max (VAR1)) # computes the column and specifies the column name
## # A tibble: 2 x 3
##   label  mean   max
##   <chr> <dbl> <int>
## 1 a         2     3
## 2 b         3     4
#Specify column method (calculated column has no column name)
tbl_df %>% 
  group_by(label) %>% 
  summarise_at("var1",list(mean,max))
## # A tibble: 2 x 3
##   label   fn1   fn2
##   <chr> <dbl> <int>
## 1 a         2     3
## 2 b         3     4
#Specify column method (calculate column rename)
tbl_df %>% 
  group_by(label) %>% 
  summarise_at("var1",list(mean=~mean(.),max=~max(.)))
## # A tibble: 2 x 3
##   label  mean   max
##   <chr> <dbl> <int>
## 1 a         2     3
## 2 b         3     4

Apply multiple functions to multiple columns

Multiple functions are applied to multiple columns of grouped data, that is, each column performs the same multiple function calculations.

#Common way
tbl_df %>% 
  group_by(label) %>% 
  summarise(mean1=mean(var1),max1=max(var1),mean2=mean(var2),max2=max(var2))
## # A tibble: 2 x 5
##   label mean1  max1 mean2  max2
##   <chr> <dbl> <int> <dbl> <int>
## 1 a         2     3     3     4
## 2 b         3     4     4     5
#Specify column method (calculated column has no column name)
tbl_df %>% 
  group_by(label) %>% 
  summarise_at(c("var1","var2"),list(mean,max))
## # A tibble: 2 x 5
##   label var1_fn1 var2_fn1 var1_fn2 var2_fn2
##   <chr>    <dbl>    <dbl>    <int>    <int>
## 1 a            2        3        3        4
## 2 b            3        4        4        5
#Specify column method (calculate column rename)
tbl_df %>% 
  group_by(label) %>% 
  summarise_at(c("var1","var2"),list(mean=~mean(.),max=~max(.)))
## # A tibble: 2 x 5
##   label var1_mean var2_mean var1_max var2_max
##   <chr>     <dbl>     <dbl>    <int>    <int>
## 1 a             2         3        3        4
## 2 b             3         4        4        5
#All columns
tbl_df %>% 
  group_by(label) %>% 
  summarise_all(list(mean=~mean(.),max=~max(.)))
## # A tibble: 2 x 7
##   label var1_mean var2_mean var3_mean var1_max var2_max
##   <chr>     <dbl>     <dbl>     <dbl>    <int>    <int>
## 1 a             2         3         4        3        4
## 2 b             3         4         5        4        5
## # … with 1 more variable: var3_max <int>
#Condition column
tbl_df %>% 
  group_by(label) %>% 
  summarise_if(is.numeric,list(mean=~mean(.),max=~max(.)))
## # A tibble: 2 x 7
##   label var1_mean var2_mean var3_mean var1_max var2_max
##   <chr>     <dbl>     <dbl>     <dbl>    <int>    <int>
## 1 a             2         3         4        3        4
## 2 b             3         4         5        4        5
## # … with 1 more variable: var3_max <int>