R-tidyverse包-數據轉換dplyr包進階
R語言||干貨!tidyverse包-數據轉換dplyr包進階
目錄
[TOC]
簡介
Dplyr(https://dplyr.tidyverse.org/)是一種數據操作語法,提供了一組一致的動詞,幫助我們解決最常見的數據操作,比如行操作(filter、slice、arrange)、列操作(select、rename、mutate、relocate)、折疊操作(summarise)、合并table(left_join、right_join、inner_join)。查看包中的所有函數:
# Attach dplyr, then list every object found in the attached package.
library(dplyr)
ls(name = "package:dplyr")
- mutate:添加新列或者修改已存在列
- select:選擇列變量
- filter:依據(jù)值過濾行
- summarise:將多個值降為單個summary
- arrange:行排序
- left_join:合并兩個數據框
mutate函數介紹
常規用法
library(tidyverse) # or library(dplyr)

# Create one or several new columns. `.before` / `.after` set where the new
# columns are inserted.
starwars %>%
  select(name, mass) %>%
  mutate(
    mass2 = mass * 2,
    mass2_squared = mass2 * mass2,
    test = "test",
    .before = name
  )

# Create a new column computed row by row.
starwars %>%
  select(name, height, mass) %>%
  rowwise() %>%
  mutate(m = mean(c(height, mass)))

# Create a ranking column from a numeric column, ranked within each group.
starwars %>%
  select(name, mass, homeworld) %>%
  group_by(homeworld) %>%
  mutate(rank = min_rank(desc(mass)))

# Drop a column by assigning NULL to it.
starwars %>%
  select(name, height, mass, homeworld) %>%
  mutate(mass = NULL)

# Modify an existing column with an arithmetic expression.
starwars %>%
  select(name, height, mass, homeworld) %>%
  mutate(height = height * 0.0328084) # convert to feet

# Modify several columns at once: across() converts every selected column
# except `name` to a factor.
starwars %>%
  select(name, homeworld, species) %>%
  mutate(across(!name, as.factor))

# Keep or drop columns through `.keep`; all columns are kept by default.
df <- tibble(x = 1, y = 2, a = "a", b = "b")
mutate(df, z = x + y, .keep = "all") # the default
mutate(df, z = x + y, .keep = "used")
mutate(df, z = x + y, .keep = "unused")
mutate(df, z = x + y, .keep = "none")
特殊用法
# Add a column whose name is held in a variable.
col_name <- "new_column"
starwars %>% mutate(!!col_name := mass * height)

# Operate on columns whose names are held in a character vector.
col_names <- c("height", "mass")
starwars %>% mutate(across(all_of(col_names), ~ .x * 2))

# Add a column from a set of conditions. `.default` supplies the value for
# rows that match none of the cases; it replaces the older `TRUE ~ value`
# fallback and matches the other case_when() examples in this file.
starwars %>%
  mutate(
    gender = case_when(
      sex == "male" ~ "Male",
      sex == "female" ~ "Female",
      .default = "Unknown"
    )
  )

# Overwrite every column whose name starts with "height", multiplying it by
# 10; `.keep = "used"` retains only the columns that were used.
starwars %>%
  mutate(across(starts_with("height"), ~ .x * 10), .keep = "used")

# Overwrite a column based on a condition.
starwars %>%
  mutate(mass = if_else(mass > 100, "Yes", "No"))

# Create a column from a condition that spans several columns.
starwars %>%
  mutate(
    special_condition = if_else(mass > 100 & height > 200, "Yes", "No"),
    .keep = "used"
  )
select函數介紹
常規用法
library(tidyverse)

# Convert iris to a tibble for better printing.
iris <- as_tibble(iris)

# Select one or several columns by name.
select(starwars, homeworld, height, mass)

# Select columns by position.
select(starwars, 1:3)

# Select a range of columns.
select(starwars, name:mass)

# Invert a selection with `!` or `-`.
select(starwars, !(name:mass))
select(iris, -c(Sepal.Length, Petal.Length))

# Invert a selection helper.
select(iris, !ends_with("Width"))

# Combine helpers: intersection (`&`).
select(iris, starts_with("Petal") & ends_with("Width"))

# Combine helpers: union (`|`).
select(iris, starts_with("Petal") | ends_with("Width"))

# Combine helpers: intersection (`&`) with the second helper negated.
select(iris, starts_with("Petal") & !ends_with("Width"))
特殊用法
# Select columns whose names are stored in a character vector.
col_names <- c("Sepal.Length", "Sepal.Width")
iris %>% select(all_of(col_names))

# "test" is not a column of iris. any_of() selects the names that exist and
# ignores the missing ones; it replaces the superseded one_of().
col_names <- c("Sepal.Length", "Sepal.Width", "test")
iris %>% select(any_of(col_names))
filter函數介紹
常規用法
# Keep rows that satisfy a single condition.
starwars %>% filter(species == "Human")
starwars %>% filter(mass > 1000)

# Keep rows that satisfy several conditions: `&`, `|`, or conditions passed
# as separate comma-separated arguments.
starwars %>% filter(hair_color == "none" & eye_color == "black")
starwars %>% filter(hair_color == "none" | eye_color == "black")
starwars %>% filter(hair_color == "none", eye_color == "black")

# Negate a condition.
starwars %>% filter(!(hair_color == "none" & eye_color == "black"))

# Filter on a computed value: first over the whole table, then per group.
filter(starwars, mass > mean(mass, na.rm = TRUE))
filter(group_by(starwars, gender), mass > mean(mass, na.rm = TRUE))

# Column names held in variables are looked up through the `.data` pronoun.
vars <- c("mass", "height")
cond <- c(80, 150)
filter(
  starwars,
  .data[[vars[[1]]]] > cond[[1]],
  .data[[vars[[2]]]] > cond[[2]]
)
特殊用法
# Column name held in a variable: turn the string into a symbol with sym()
# and unquote it with `!!` inside filter().
col_name <- "Species"
value_to_filter <- "setosa"
filtered_iris <- filter(iris, !!sym(col_name) == value_to_filter)
summarise函數介紹
常規用法
# Mean of `disp` and the total number of rows.
summarise(mtcars, mean = mean(disp), n = n())

# The same two statistics computed per `cyl` group.
summarise(group_by(mtcars, cyl), mean = mean(disp), n = n())

# Column name held in a variable: through the `.data` pronoun, or by
# unquoting a symbol built with sym().
var <- "mass"
starwars %>% summarise(avg = mean(.data[[var]], na.rm = TRUE))
starwars %>% summarise(avg = mean(!!sym(var), na.rm = TRUE))
特殊用法
# Per group, collapse many rows into one comma-separated string.
iris %>%
  group_by(Species) %>%
  summarise(test = paste(Petal.Length, collapse = ","))

# Apply several functions to every numeric column via a named list of
# lambdas.
min_max <- list(
  min = ~ min(.x, na.rm = TRUE),
  max = ~ max(.x, na.rm = TRUE)
)
starwars %>%
  summarise(across(where(is.numeric), min_max))

# The same statistics written as two separate across() calls wrapped in
# tibble(), with explicit "min_" / "max_" prefixes on the output names.
starwars %>%
  summarise(
    tibble(
      across(
        where(is.numeric),
        ~ min(.x, na.rm = TRUE),
        .names = "min_{.col}"
      ),
      across(
        where(is.numeric),
        ~ max(.x, na.rm = TRUE),
        .names = "max_{.col}"
      )
    )
  )
arrange函數介紹
常規用法
# Sort by several columns.
mtcars %>% arrange(cyl, disp)

# Sort in descending order.
mtcars %>% arrange(desc(disp))

# Ascending on the first column, descending on the second.
mtcars %>% arrange(mpg, desc(hp))

# IMPORTANT: to sort within groups, `.by_group` must be set to TRUE.
mtcars %>%
  group_by(cyl) %>%
  arrange(desc(wt), .by_group = TRUE) %>%
  print(n = 32)

# Column name held in a variable.
var <- "mpg"
mtcars %>% arrange(!!sym(var))
mtcars %>% arrange(.data[[var]])

# Sort by several columns chosen with a selection helper.
arrange(iris, pick(starts_with("Sepal")))
arrange(iris, across(starts_with("Sepal"), desc))
left_join函數介紹
# Install nycflights13 only when it is missing, so re-running the script does
# not reinstall the package every time.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)

flights2 <- flights %>%
  select(year:day, hour, origin, dest, tailnum, carrier)

# Left join on an explicitly named key column.
flights2 %>% left_join(planes, by = "tailnum")

# The two small tables below share only the column `x`. The key is spelled
# out with `by = "x"` instead of relying on the implicit natural join, which
# prints a "Joining with ..." message and silently depends on whichever
# column names the two tables happen to share.
df1 <- tibble(x = c(1, 2), y = 2:1)
df2 <- tibble(x = c(3, 1), a = 10, b = "a")

# Inner join: keep only keys present in both tables.
df1 %>% inner_join(df2, by = "x")

# Right join: keep every row of the right-hand table.
df1 %>% right_join(df2, by = "x")

# Full join: keep keys present in either table.
df1 %>% full_join(df2, by = "x")
其它常用函數介紹
rename重命名列
# Rename a single column.
iris <- as_tibble(iris)
iris %>% rename(petal_length = Petal.Length)

# Rename several columns from a named character vector (new = "old").
lookup <- c(pl = "Petal.Length", sl = "Sepal.Length")
iris %>% rename(all_of(lookup))

# When some of the old names may not exist, use any_of(); the all_of() call
# is wrapped in try() so the script continues past its error.
lookup <- c(lookup, new = "unknown")
try(rename(iris, all_of(lookup)))
iris %>% rename(any_of(lookup))

# Change the case of column names, or replace characters in them.
iris %>% rename_with(toupper)
iris %>% rename_with(toupper, starts_with("Petal"))
iris %>% rename_with(~ tolower(gsub(".", "_", .x, fixed = TRUE)))

# When renaming with paste0(), set `recycle0 = TRUE` to guard against an
# empty column selection.
iris %>%
  rename_with(
    .fn = ~ paste0("prefix_", .x, recycle0 = TRUE),
    .cols = starts_with("nonexistent")
  )

# set_names() from rlang: pass the new names directly, or a function plus
# its extra arguments.
head(mtcars) %>% set_names(paste0(colnames(mtcars), "_foo"))
head(mtcars) %>% set_names(paste0, "_foo")
relocate調整列順序
df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")

# With no position given, the column moves to the front.
relocate(df, f)

# Move a column to an explicit position.
relocate(df, a, .after = c)
relocate(df, f, .before = b)
relocate(df, a, .after = last_col())

# Move and rename in one step.
relocate(df, ff = f)

# Pick the columns to move with a predicate or a selection helper.
relocate(df, where(is.character))
relocate(df, any_of(c("a", "e", "i", "o", "u")))
slice選擇行
# Pick the first row.
slice(mtcars, 1)

# Pick the last row, or every row from the fifth to the last.
slice(mtcars, n())
slice(mtcars, 5:n())

# Drop rows by position.
mtcars %>% slice(-(1:4))

# Pick a number of rows from the head or the tail.
slice_head(mtcars, n = 5)
slice_tail(mtcars, n = 5)

# Pick rows with the smallest / largest values of a column.
slice_min(mtcars, mpg, n = 5)
slice_max(mtcars, mpg, n = 5)

# slice_min() and slice_max() may return more rows than requested when values
# are tied; set `with_ties = FALSE` to get exactly `n` rows.
slice_min(mtcars, cyl, n = 1)
slice_min(mtcars, cyl, n = 1, with_ties = FALSE)

# Pick rows at random, without and with replacement.
slice_sample(mtcars, n = 5)
slice_sample(mtcars, n = 5, replace = TRUE)

# Pick rows at random, weighted by a column.
slice_sample(mtcars, weight_by = wt, n = 5)
across選擇列
# Select columns and apply a function to each of them.
iris %>%
  mutate(across(c(Sepal.Length, Sepal.Width), round))
iris %>%
  group_by(Species) %>%
  summarise(
    across(starts_with("Sepal"), ~ mean(.x, na.rm = TRUE))
  )

# Select columns, apply several functions, and build the output names with
# `.names`. `{.col}` is the original column name; `{.fn}` is the function's
# name in the list, or its position when the list is unnamed, as it is here.
iris %>%
  group_by(Species) %>%
  summarise(
    across(
      starts_with("Sepal"),
      list(mean, sd),
      .names = "{.col}.fn{.fn}"
    )
  )

# Keep a row as soon as any of the selected columns satisfies the condition.
iris %>%
  filter(if_any(ends_with("Width"), ~ .x > 4))
case_when條件語句
# Conditions on a single vector; the first matching case wins and `.default`
# covers values that match none of them.
x <- 1:70
case_when(
  x %% 35 == 0 ~ "fizz buzz",
  x %% 5 == 0 ~ "fizz",
  x %% 7 == 0 ~ "buzz",
  .default = as.character(x)
)

# Conditions that span several columns, used inside mutate(); pull() then
# extracts the new column as a vector.
starwars %>%
  mutate(
    type = case_when(
      height > 200 | mass > 200 ~ "large",
      species == "Droid" ~ "robot",
      .default = "other"
    )
  ) %>%
  pull(type)
pull選擇列
# Extract a single column as a vector; this is the main difference from
# select(), which returns a data frame.
pull(mtcars, cyl)

# Extract a column as a named vector, taking the names from a second column.
pull(starwars, height, name)