This paper is the third in the R language – efficient operation data frame (dplyr package) series. It mainly introduces the common data operations such as data frame connection operation, data frame collection operation and data frame grouping calculation.
Data frame connection join
Join series functions are used to connect two data frames.
#Data frame
df1 <- tibble(id=1:2,v1=c("a1","a2"))
df2 <- tibble(id=2:4,v2=c("b1","b2","b3"))
#Inner connection
inner_join(df1,df2)
## # A tibble: 1 x 3
## id v1 v2
## <int> <chr> <chr>
## 1 2 a2 b1
#Left connection
left_join(df1,df2)
## # A tibble: 2 x 3
## id v1 v2
## <int> <chr> <chr>
## 1 1 a1 <NA>
## 2 2 a2 b1
#Right connection
right_join(df1,df2)
## # A tibble: 3 x 3
## id v1 v2
## <int> <chr> <chr>
## 1 2 a2 b1
## 2 3 <NA> b2
## 3 4 <NA> b3
#Full connection
full_join(df1,df2)
## # A tibble: 4 x 3
## id v1 v2
## <int> <chr> <chr>
## 1 1 a1 <NA>
## 2 2 a2 b1
## 3 3 <NA> b2
## 4 4 <NA> b3
#Filter left connection
semi_ Join (DF1, df2) # keep the connection part
## # A tibble: 1 x 2
## id v1
## <int> <chr>
## 1 2 a2
anti_ Join (DF1, df2) # delete the connection part
## # A tibble: 1 x 2
## id v1
## <int> <chr>
## 1 1 a1
#List connection
nest_join(df1,df2)
## # A tibble: 2 x 3
## id v1 df2
## * <int> <chr> <list>
## 1 1 a1 <tibble [0 × 1]>
## 2 2 a2 <tibble [1 × 1]>
nest_ Join (DF1, df2)% >% tidyr:: unnest (cols = 3) # turn to inner join
## # A tibble: 1 x 3
## id v1 v2
## <int> <chr> <chr>
## 1 2 a2 b1
#Specify column name connection
#Have the same column
inner_ Join (DF1, df2, by = "Id") # eliminates the prompt message
## # A tibble: 1 x 3
## id v1 v2
## <int> <chr> <chr>
## 1 2 a2 b1
#No identical columns
df3 <- tibble(index=2:4,v2=c("b1","b2","b3"))
inner_join(df1,df3,by=c("id"="index"))
## # A tibble: 1 x 3
## id v1 v2
## <int> <chr> <chr>
## 1 2 a2 b1
Data frame collection operation intersect / Union / setdiff
Calculate the intersection, union and complement of two data frames.
#Data frame
tbl_df1 <- tibble(var1=1:3,var2=5:7)
tbl_df2 <- tibble(var1=2:4,var2=6:8)
#Find intersection
intersect(tbl_df1,tbl_df2)
## # A tibble: 2 x 2
## var1 var2
## <int> <int>
## 1 2 6
## 2 3 7
#Union set
union(tbl_df1,tbl_df2)
## # A tibble: 4 x 2
## var1 var2
## <int> <int>
## 1 1 5
## 2 2 6
## 3 3 7
## 4 4 8
#Union (do not delete intersection)
union_all(tbl_df1,tbl_df2)
## # A tibble: 6 x 2
## var1 var2
## <int> <int>
## 1 1 5
## 2 2 6
## 3 3 7
## 4 2 6
## 5 3 7
## 6 4 8
#Complement (tbl#u the complement of DF1)
setdiff(tbl_df1,tbl_df2)
## # A tibble: 1 x 2
## var1 var2
## <int> <int>
## 1 1 5
#Judge whether the quantity data frame is the same true
setequal(tbl_df1,tbl_df1[3:1,])
## [1] TRUE
Data grouping calculation group_ by+summarise
Data frame grouping and calculation by label column is a common scenario in data analysis.group_by+sumarise
The combination of functions can flexibly meet the needs of various grouping calculations.
Single function application single column
A single function is applied to a single column of grouped data for calculation. Can be insummarise
Specify the new calculated column name directly in.
#Data
tbl_df <- tibble(var1=1:4,var2=2:5,var3=3:6,label=c("a","b","a","b"))
#Common way
tbl_df %>%
group_by(label) %>%
Summarize (mean = mean (VAR1)) # computes a column and specifies the column name
## # A tibble: 2 x 2
## label mean
## <chr> <dbl>
## 1 a 2
## 2 b 3
Single function application multi column
A single function is applied to multiple columns of grouped data for batch calculation. When usedsummarise_at、summarize_all、summarise_if
If the calculation column needs to be renamed during function, the calculation function needs to adoptlist
And the mode for calculating column names will beOriginal column name + suffix
Form of.
#Common way
tbl_df %>%
group_by(label) %>%
Summarize (mean1 = mean (VAR1), mean2 = mean (var2)) # computes the column name and specifies the column name
## # A tibble: 2 x 3
## label mean1 mean2
## <chr> <dbl> <dbl>
## 1 a 2 3
## 2 b 3 4
#Specify multiple columns (calculated column has no column name)
tbl_df %>%
group_by(label) %>%
summarise_at(c("var1","var2"),mean)
## # A tibble: 2 x 3
## label var1 var2
## <chr> <dbl> <dbl>
## 1 a 2 3
## 2 b 3 4
#Specify multiple columns (calculated column rename)
tbl_df %>%
group_by(label) %>%
summarise_at(c("var1","var2"),list(mean=mean))
## # A tibble: 2 x 3
## label var1_mean var2_mean
## <chr> <dbl> <dbl>
## 1 a 2 3
## 2 b 3 4
#All columns
tbl_df %>%
group_by(label) %>%
summarise_ All (list (mean = ~ mean (., Na. RM = t))) # calculate column rename + anonymous function
## # A tibble: 2 x 4
## label var1_mean var2_mean var3_mean
## <chr> <dbl> <dbl> <dbl>
## 1 a 2 3 4
## 2 b 3 4 5
#Condition column
tbl_df %>%
group_by(label) %>%
summarise_if(is.numeric,list(mean=mean))
## # A tibble: 2 x 4
## label var1_mean var2_mean var3_mean
## <chr> <dbl> <dbl> <dbl>
## 1 a 2 3 4
## 2 b 3 4 5
Apply multiple functions to a single column
Multiple different functions are applied to the same column of grouped data.
#Common way
tbl_df %>%
group_by(label) %>%
Summarize (mean = mean (VAR1), max = max (VAR1)) # computes the column and specifies the column name
## # A tibble: 2 x 3
## label mean max
## <chr> <dbl> <int>
## 1 a 2 3
## 2 b 3 4
#Specify column method (calculated column has no column name)
tbl_df %>%
group_by(label) %>%
summarise_at("var1",list(mean,max))
## # A tibble: 2 x 3
## label fn1 fn2
## <chr> <dbl> <int>
## 1 a 2 3
## 2 b 3 4
#Specify column method (calculate column rename)
tbl_df %>%
group_by(label) %>%
summarise_at("var1",list(mean=~mean(.),max=~max(.)))
## # A tibble: 2 x 3
## label mean max
## <chr> <dbl> <int>
## 1 a 2 3
## 2 b 3 4
Apply multiple functions to multiple columns
Multiple functions are applied to multiple columns of grouped data, that is, each column performs the same multiple function calculations.
#Common way
tbl_df %>%
group_by(label) %>%
summarise(mean1=mean(var1),max1=max(var1),mean2=mean(var2),max2=max(var2))
## # A tibble: 2 x 5
## label mean1 max1 mean2 max2
## <chr> <dbl> <int> <dbl> <int>
## 1 a 2 3 3 4
## 2 b 3 4 4 5
#Specify column method (calculated column has no column name)
tbl_df %>%
group_by(label) %>%
summarise_at(c("var1","var2"),list(mean,max))
## # A tibble: 2 x 5
## label var1_fn1 var2_fn1 var1_fn2 var2_fn2
## <chr> <dbl> <dbl> <int> <int>
## 1 a 2 3 3 4
## 2 b 3 4 4 5
#Specify column method (calculate column rename)
tbl_df %>%
group_by(label) %>%
summarise_at(c("var1","var2"),list(mean=~mean(.),max=~max(.)))
## # A tibble: 2 x 5
## label var1_mean var2_mean var1_max var2_max
## <chr> <dbl> <dbl> <int> <int>
## 1 a 2 3 3 4
## 2 b 3 4 4 5
#All columns
tbl_df %>%
group_by(label) %>%
summarise_all(list(mean=~mean(.),max=~max(.)))
## # A tibble: 2 x 7
## label var1_mean var2_mean var3_mean var1_max var2_max
## <chr> <dbl> <dbl> <dbl> <int> <int>
## 1 a 2 3 4 3 4
## 2 b 3 4 5 4 5
## # … with 1 more variable: var3_max <int>
#Condition column
tbl_df %>%
group_by(label) %>%
summarise_if(is.numeric,list(mean=~mean(.),max=~max(.)))
## # A tibble: 2 x 7
## label var1_mean var2_mean var3_mean var1_max var2_max
## <chr> <dbl> <dbl> <dbl> <int> <int>
## 1 a 2 3 4 3 4
## 2 b 3 4 5 4 5
## # … with 1 more variable: var3_max <int>