使用nycflights13
數(shù)據(jù)包來進(jìn)行說明翘狱,這個數(shù)據(jù)框包含了2013年從紐約市出發(fā)的所有336776次航班的信息蝶棋,該數(shù)據(jù)來自美國交通統(tǒng)計局耘拇,可以使用?flights
查看說明文檔
1.數(shù)據(jù)準(zhǔn)備
library(tidyverse)
library(nycflights13)
head(flights)
# A tibble: 6 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5 15
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5 29
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5 45
# 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0
# 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58
# ... with 1 more variable: time_hour <dttm>
# 查看整個數(shù)據(jù)
View(flights)
其中
- int:表示整數(shù)型變量
- dbl:表示雙精度浮點(diǎn)數(shù)型變量煮嫌,或稱為實(shí)數(shù)
- chr:表示字符型向量脏答,或者稱為字符串
- dttm:表示日期時間(日期+時間)型變量
- lgl:邏輯性變量丢间,僅包含F(xiàn)ALSE和TRUE
- fctr:表示因子探熔,R用其來表示具有固定數(shù)目的值的分類變量
- date:表示日期型變量
2. dplyr基礎(chǔ)函數(shù)
2.1 filter() 按行篩選
filter()可以基于觀測的值篩選出一個觀測的子集
filter(flights,month==1,day==1)
# A tibble: 842 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5 15
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5 29
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5 45
# 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0
# 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6 0
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6 0
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6 0
# 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6 0
# # ... with 832 more rows, and 1 more variable: time_hour <dttm>
# dplyr函數(shù)不會修改輸入,因此需要保存函數(shù)結(jié)果烘挫,需要使用賦值操作符 <-
jan1 <- filter(flights,month==1,day==1)
# R要么輸出結(jié)果诀艰,要么將結(jié)果保存在一個變量中,如果想要同時完成這兩種操作饮六,那么需要用括號()將賦值語句括起來
(dec25 <- filter(flights,month==12,day==25))
# # A tibble: 719 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr>
# 1 2013 12 25 456 500 -4 649 651 -2 US 1895 N156UW EWR CLT
# 2 2013 12 25 524 515 9 805 814 -9 UA 1016 N32404 EWR IAH
# 3 2013 12 25 542 540 2 832 850 -18 AA 2243 N5EBAA JFK MIA
# 4 2013 12 25 546 550 -4 1022 1027 -5 B6 939 N665JB JFK BQN
# 5 2013 12 25 556 600 -4 730 745 -15 AA 301 N3JLAA LGA ORD
# 6 2013 12 25 557 600 -3 743 752 -9 DL 731 N369NB LGA DTW
# 7 2013 12 25 557 600 -3 818 831 -13 DL 904 N397DA LGA ATL
# 8 2013 12 25 559 600 -1 855 856 -1 B6 371 N608JB LGA FLL
# 9 2013 12 25 559 600 -1 849 855 -6 B6 605 N536JB EWR FLL
# 10 2013 12 25 600 600 0 850 846 4 B6 583 N746JB JFK MCO
# # ... with 709 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
2.1.1 比較運(yùn)算符
- 為有效的進(jìn)行篩選其垄,需要知道如何使用比較運(yùn)算符來選擇觀測
>
<
<=
>=
==
!=
不等于 - 計算機(jī)使用的是有限精度運(yùn)算,無法存儲無限位的數(shù)卤橄,因此绿满,我們所看到的的每個數(shù)都是一個近似值
在比較浮點(diǎn)數(shù)是否相等時,不能使用==
窟扑,而應(yīng)該使用near()
# 返回值令人目瞪口呆
1/49*49==1
# [1] FALSE
# 正確的打開方式
near(1/49*49,1)
# [1] TRUE
2.1.2 邏輯運(yùn)算符
&
與喇颁,相交
|
或,相合
!
非
# 如果想要過濾得到11月與12月出發(fā)的所有航班
filter(flights,month == 11 | month == 12)
# # A tibble: 55,403 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr>
# 1 2013 11 1 5 2359 6 352 345 7 B6 745 N568JB JFK PSE
# 2 2013 11 1 35 2250 105 123 2356 87 B6 1816 N353JB JFK SYR
# 3 2013 11 1 455 500 -5 641 651 -10 US 1895 N192UW EWR CLT
# 4 2013 11 1 539 545 -6 856 827 29 UA 1714 N38727 LGA IAH
# 5 2013 11 1 542 545 -3 831 855 -24 AA 2243 N5CLAA JFK MIA
# 6 2013 11 1 549 600 -11 912 923 -11 UA 303 N595UA JFK SFO
# 7 2013 11 1 550 600 -10 705 659 6 US 2167 N748UW LGA DCA
# 8 2013 11 1 554 600 -6 659 701 -2 US 2134 N742PS LGA BOS
# 9 2013 11 1 554 600 -6 826 827 -1 DL 563 N912DE LGA ATL
# 10 2013 11 1 554 600 -6 749 751 -2 DL 731 N315NB LGA DTW
# # ... with 55,393 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights,month == 11 | 12)
# # A tibble: 336,776 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN
# 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL
# 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO
# 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD
# # ... with 336,766 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# # time_hour <dttm>
# 11 | 12返回的是邏輯值
11 | 12
# [1] TRUE
# 查看一下flights$month == 11 | 12的統(tǒng)計情況
table(flights$month == 11 | 12)
# TRUE
# 336776
# 查看一下flights$month == 11 | flights$month == 12的統(tǒng)計情況
table(flights$month == 11 | flights$month == 12)
# FALSE TRUE
# 281373 55403
# x %in% y嚎货,選出x是y中一個值的所有行
# 所以month %in% c(11,12)與month == 11 | month == 12是等價的
filter(flights,month %in% c(11,12))
# # A tibble: 55,403 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr>
# 1 2013 11 1 5 2359 6 352 345 7 B6 745 N568JB JFK PSE
# 2 2013 11 1 35 2250 105 123 2356 87 B6 1816 N353JB JFK SYR
# 3 2013 11 1 455 500 -5 641 651 -10 US 1895 N192UW EWR CLT
# 4 2013 11 1 539 545 -6 856 827 29 UA 1714 N38727 LGA IAH
# 5 2013 11 1 542 545 -3 831 855 -24 AA 2243 N5CLAA JFK MIA
# 6 2013 11 1 549 600 -11 912 923 -11 UA 303 N595UA JFK SFO
# 7 2013 11 1 550 600 -10 705 659 6 US 2167 N748UW LGA DCA
# 8 2013 11 1 554 600 -6 659 701 -2 US 2134 N742PS LGA BOS
# 9 2013 11 1 554 600 -6 826 827 -1 DL 563 N912DE LGA ATL
# 10 2013 11 1 554 600 -6 749 751 -2 DL 731 N315NB LGA DTW
# # ... with 55,393 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 使用德摩根定律可以將復(fù)雜的篩選條件進(jìn)行簡化
# !(x & y) 等價于 !x | !y
# !(x | y) 等價于 !x & !y
# 如果想要找出延誤時間(到達(dá)活出發(fā))不多于兩小時的航班无牵,可以使用以下兩種篩選方式
filter(flights,!(arr_delay > 120 | dep_delay > 120))
filter(flights,arr_delay <= 120,dep_delay <= 120)
2.1.3 缺失值 NA
NA在R中表示未知的值,NA是可傳染的厂抖,如果運(yùn)算中包含了NA茎毁,那么計算的結(jié)果也是NA
NA > 5
# [1] NA
10 == NA
# [1] NA
NA + 10
# [1] NA
NA /2
# [1] NA
is.na()
可以用來判斷是否為NA
y=c(1,NA,2)
is.na(y)
# [1] FALSE TRUE FALSE
# tibble包是一個輕量級的包,它實(shí)現(xiàn)的data.frame的重新塑造忱辅,保留了data.frame中經(jīng)過實(shí)踐證明有效的部分七蜘,吸取了專注于數(shù)據(jù)操作的dplyr包的基本思想。tibble包提供了更優(yōu)于data.frame的性能
(df <- tibble(y=c(1,NA,2)))
# # A tibble: 3 x 1
# y
# <dbl>
# 1 1
# 2 NA
# 3 2
# filter()只能篩選出條件為TRUE的行墙懂,它會排除那些條件為FALSE和NA的行
filter(df,y>=1)
# # A tibble: 2 x 1
# y
# <dbl>
# 1 1
# 2 2
# 如果需要保留NA值橡卤,需要在篩選條件中指明
filter(df,is.na(y) | y>=1)
# A tibble: 3 x 1
# y
# <dbl>
# 1 1
# 2 NA
# 3 2
2.2 arrange() 重排
arrange()與filter() 非常相似
# 如果列名不止一個,會將后面的列在前面列的基礎(chǔ)上進(jìn)行排序
arrange(flights,year,month,day)
# # A tibble: 336,776 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5 15
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5 29
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5 45
# 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0
# 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6 0
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6 0
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6 0
# 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6 0
# # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
# 使用desc函數(shù)進(jìn)行降序排列
arrange(flights,desc(month))
# # A tibble: 336,776 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2013 12 1 13 2359 14 446 445 1 B6 745 N715JB JFK PSE 195 1617 23 59
# 2 2013 12 1 17 2359 18 443 437 6 B6 839 N593JB JFK BQN 186 1576 23 59
# 3 2013 12 1 453 500 -7 636 651 -15 US 1895 N197UW EWR CLT 86 529 5 0
# 4 2013 12 1 520 515 5 749 808 -19 UA 1487 N69804 EWR IAH 193 1400 5 15
# 5 2013 12 1 536 540 -4 845 850 -5 AA 2243 N634AA JFK MIA 144 1089 5 40
# 6 2013 12 1 540 550 -10 1005 1027 -22 B6 939 N821JB JFK BQN 189 1576 5 50
# 7 2013 12 1 541 545 -4 734 755 -21 EV 3819 N13968 EWR CVG 95 569 5 45
# 8 2013 12 1 546 545 1 826 835 -9 UA 1441 N23708 LGA IAH 204 1416 5 45
# 9 2013 12 1 549 600 -11 648 659 -11 US 2167 N945UW LGA DCA 42 214 6 0
# 10 2013 12 1 550 600 -10 825 854 -29 B6 605 N706JB EWR FLL 140 1065 6 0
# # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
# df <- tibble(y=c(1,NA,2))
# 重新排序時损搬,NA總是排在最后
arrange(df,y)
# # A tibble: 3 x 1
# y
# <dbl>
# 1 1
# 2 2
# 3 NA
# 對y進(jìn)行降序
arrange(df,desc(y))
# # A tibble: 3 x 1
# y
# <dbl>
# 1 2
# 2 1
# 3 NA
2.3 select() 選擇特定的列
# 按名稱選擇列year碧库,month柜与,day
select(flights,year,month,day)
# # A tibble: 336,776 x 3
# year month day
# <int> <int> <int>
# 1 2013 1 1
# 2 2013 1 1
# 3 2013 1 1
# 4 2013 1 1
# 5 2013 1 1
# 6 2013 1 1
# 7 2013 1 1
# 8 2013 1 1
# 9 2013 1 1
# 10 2013 1 1
# # ... with 336,766 more rows
# 選擇在year:day之間的所有列
select(flights,year:day)
# # A tibble: 336,776 x 3
# year month day
# <int> <int> <int>
# 1 2013 1 1
# 2 2013 1 1
# 3 2013 1 1
# 4 2013 1 1
# 5 2013 1 1
# 6 2013 1 1
# 7 2013 1 1
# 8 2013 1 1
# 9 2013 1 1
# 10 2013 1 1
# # ... with 336,766 more rows
# 選擇不在year:day之間的所有列
select(flights,-(year:day))
# # A tibble: 336,776 x 16
# dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour
# <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dttm>
# 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00
# 2 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5 29 2013-01-01 05:00:00
# 3 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40 2013-01-01 05:00:00
# 4 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5 45 2013-01-01 05:00:00
# 5 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0 2013-01-01 06:00:00
# 6 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58 2013-01-01 05:00:00
# 7 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6 0 2013-01-01 06:00:00
# 8 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6 0 2013-01-01 06:00:00
# 9 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6 0 2013-01-01 06:00:00
# 10 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6 0 2013-01-01 06:00:00
# # ... with 336,766 more rows
select()函數(shù)可以搭配使用一些輔助函數(shù):
- starts_with(“abc”):匹配以abc開頭的名稱
- end_with(abc"):匹配以abc結(jié)尾的名稱
- contains(“xyz”):匹配含xyz的名稱
- matches(“”)
- num_range(“x”,1:3):匹配x1嵌灰,x2弄匕,x3
# 使用select()函數(shù)的變種rename()進(jìn)行重命名
rename(flights,YEAR=year)
# # A tibble: 336,776 x 19
# YEAR month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
# n<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5 15
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5 29
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5 45
# 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0
# 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6 0
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6 0
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6 0
# 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6 0
# # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
# select()與everything()連用可以將某幾列移到數(shù)據(jù)庫的開頭
select(flights,day,dep_time,everything())
# # A tibble: 336,776 x 19
# day dep_time year month sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 517 2013 1 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5 15
# 2 1 533 2013 1 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5 29
# 3 1 542 2013 1 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40
# 4 1 544 2013 1 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5 45
# 5 1 554 2013 1 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0
# 6 1 554 2013 1 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58
# 7 1 555 2013 1 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6 0
# 8 1 557 2013 1 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6 0
# 9 1 557 2013 1 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6 0
# 10 1 558 2013 1 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6 0
# # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
2.4 mutate() 添加新變量
除了篩選現(xiàn)有的列,我們還需要添加新列
mutate()總是將新列添加在數(shù)據(jù)框最后
(flights_sml <- select(flights,year:day,ends_with("delay"),distance,air_time))
# # A tibble: 336,776 x 7
# year month day dep_delay arr_delay distance air_time
# <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
# 1 2013 1 1 2 11 1400 227
# 2 2013 1 1 4 20 1416 227
# 3 2013 1 1 2 33 1089 160
# 4 2013 1 1 -1 -18 1576 183
# 5 2013 1 1 -6 -25 762 116
# 6 2013 1 1 -4 12 719 150
# 7 2013 1 1 -5 19 1065 158
# 8 2013 1 1 -3 -14 229 53
# 9 2013 1 1 -3 -8 944 140
# 10 2013 1 1 -2 8 733 138
# # ... with 336,766 more rows
mutate(flights_sml,gain=arr_delay-dep_delay,speed=distance/air_time*60)
# # A tibble: 336,776 x 9
# year month day dep_delay arr_delay distance air_time gain speed
# <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2013 1 1 2 11 1400 227 9 370.
# 2 2013 1 1 4 20 1416 227 16 374.
# 3 2013 1 1 2 33 1089 160 31 408.
# 4 2013 1 1 -1 -18 1576 183 -17 517.
# 5 2013 1 1 -6 -25 762 116 -19 394.
# 6 2013 1 1 -4 12 719 150 16 288.
# 7 2013 1 1 -5 19 1065 158 24 404.
# 8 2013 1 1 -3 -14 229 53 -11 259.
# 9 2013 1 1 -3 -8 944 140 -5 405.
# 10 2013 1 1 -2 8 733 138 10 319.
# # ... with 336,766 more rows
# 如果只想保留新變量沽瞭,可以使用transmute()
transmute(flights,gain=arr_delay - dep_delay,hours=air_time/60,gain_per_hour=gain/hours)
# # A tibble: 336,776 x 3
# gain hours gain_per_hour
# <dbl> <dbl> <dbl>
# 1 9 3.78 2.38
# 2 16 3.78 4.23
# 3 31 2.67 11.6
# 4 -17 3.05 -5.57
# 5 -19 1.93 -9.83
# 6 16 2.5 6.4
# 7 24 2.63 9.11
# 8 -11 0.883 -12.5
# 9 -5 2.33 -2.14
# 10 10 2.3 4.35
# # ... with 336,766 more rows
2.4.1 常用創(chuàng)建函數(shù)
創(chuàng)建新變量的多種函數(shù)可以同mutate()一同使用迁匠,需要注意的是,這種函數(shù)必須是向量化的驹溃,也即必須要接收一個向量的輸入城丧,并返回一個向量作為輸出,而且輸入與輸出具有同樣數(shù)目的分量
算術(shù)運(yùn)算符
+,-,*,/,^
模運(yùn)算符
%/%
整數(shù)除法
%%
求余
10%/%3
# [1] 3
10%%3
# [1] 1
對數(shù)函數(shù)
在處理的數(shù)據(jù)橫跨多個數(shù)量級的時候豌鹤,對數(shù)轉(zhuǎn)換是一種特別有用的轉(zhuǎn)換方式
log()
log2()
log10()
偏移函數(shù)
lead()
返回一個序列的領(lǐng)先值
lag()
返回一個序列的滯后值
lead(seq(1:20))
[1] 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 NA
lead(seq(1:20),3)
[1] 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 NA NA NA
lag(seq(1:20))
[1] NA 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
lag(seq(1:20),3)
[1] NA NA NA 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
- 累加及滾動聚合
cumsum
累加和
cumprod
累加積
cummin
累加最小值
cummax
累加最大值
cummean
累加均值
cumsum(seq(1:10))
# [1] 1 3 6 10 15 21 28 36 45 55
cumprod(seq(1:5))
# [1] 1 2 6 24 120
cummean(seq(1:5))
# [1] 1.0 1.5 2.0 2.5 3.0
邏輯比較
<,<=,>,>=,==,!=
排秩
min_rank()
z <- c(1,2,2,NA,3,4)
min_rank(z)
# [1] 1 2 2 NA 4 5
min_rank(desc(z))
# [1] 5 3 3 NA 2 1
2.5 group_by(),summarize() 分組
summarize()必須要與group_by()函數(shù)一起使用
summarise(flights,delay=mean(dep_delay, na.rm = T))
# A tibble: 1 x 1
# delay
# <dbl>
# 1 12.6
by.day <- group_by(flights,year,month,day)
summarise(by.day,delay=mean(dep_delay,na.rm = T))
`summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
# # A tibble: 365 x 4
# # Groups: year, month [12]
# year month day delay
# <int> <int> <int> <dbl>
# 1 2013 1 1 11.5
# 2 2013 1 2 13.9
# 3 2013 1 3 11.0
# 4 2013 1 4 8.95
# 5 2013 1 5 5.73
# 6 2013 1 6 7.15
# 7 2013 1 7 5.42
# 8 2013 1 8 2.55
# 9 2013 1 9 2.28
# 10 2013 1 10 2.84
# # ... with 355 more rows
2.6 %>% 管道符
2.6.1 用法:
x %>% f(y) == f(x,y)
x %>% f(y) %>% g(z) == g(f(x,y),z)
使用管道符亡哄,可以更清晰的看到數(shù)據(jù)的轉(zhuǎn)換過程,使得代碼具有更好的可讀性布疙,可以將%>%讀作“然后”
by.dest <- group_by(flights,dest)
delay <- summarise(by.dest,count=n(),dist=mean(distance,na.rm = T),delay=mean(arr_delay,na.rm = T))
(delay <- filter(delay,count>20,dest != "HNL"))
# # A tibble: 96 x 4
# dest count dist delay
# <chr> <int> <dbl> <dbl>
# 1 ABQ 254 1826 4.38
# 2 ACK 265 199 4.85
# 3 ALB 439 143 14.4
# 4 ATL 17215 757. 11.3
# 5 AUS 2439 1514. 6.02
# 6 AVL 275 584. 8.00
# 7 BDL 443 116 7.05
# 8 BGR 375 378 8.03
# 9 BHM 297 866. 16.9
# 10 BNA 6333 758. 11.8
# # ... with 86 more rows
# 下述兩種方式磺平,可以與以上輸出得到相同的結(jié)果
by.dest <- group_by(flights,dest) %>%
summarise(count=n(),dist=mean(distance,na.rm = T),delay=mean(arr_delay,na.rm = T)) %>%
filter(count>20,dest != "HNL"))
# 將代碼()擴(kuò)出來,可以同時完成賦值及輸出
(# 賦值
by.dest <- flights %>%
# 然后分組
group_by(dest) %>%
#然后統(tǒng)計拐辽,輸出
summarise(count=n(),dist=mean(distance,na.rm = T),delay=mean(arr_delay,na.rm = T)) %>%
filter(count>20,dest != "HNL") )
2.6.2 缺失值
(
flights %>%
group_by(year,month,day) %>%
summarise(mean=mean(dep_delay))
)
# # A tibble: 365 x 4
# # Groups: year, month [12]
# year month day mean
# <int> <int> <int> <dbl>
# 1 2013 1 1 NA
# 2 2013 1 2 NA
# 3 2013 1 3 NA
# 4 2013 1 4 NA
# 5 2013 1 5 NA
# 6 2013 1 6 NA
# 7 2013 1 7 NA
# 8 2013 1 8 NA
# 9 2013 1 9 NA
# 10 2013 1 10 NA
# # ... with 355 more rows
# 聚合函數(shù)在使用的時候:如果輸入值為NA,那么輸出值也是NA
# 通過設(shè)置na.rm = T擦酌,可以在計算前出去缺失值NA
(
flights %>%
group_by(year,month,day) %>%
summarise(mean=mean(dep_delay,na.rm = T))
)
# # A tibble: 365 x 4
# # Groups: year, month [12]
# year month day mean
# <int> <int> <int> <dbl>
# 1 2013 1 1 11.5
# 2 2013 1 2 13.9
# 3 2013 1 3 11.0
# 4 2013 1 4 8.95
# 5 2013 1 5 5.73
# 6 2013 1 6 7.15
# 7 2013 1 7 5.42
# 8 2013 1 8 2.55
# 9 2013 1 9 2.28
# 10 2013 1 10 2.84
# # ... with 355 more rows
# 本例中俱诸,缺失值NA表示取消的航班,所以我們可以通過先去除取消的航班來解決缺失值問題
not_cancelled <- flights %>%
filter(!is.na(dep_delay),!is.na(arr_delay))
not_cancelled %>%
group_by(year,month,day) %>%
summarise(mean=mean(dep_delay))
# # A tibble: 365 x 4
# # Groups: year, month [12]
# year month day mean
# <int> <int> <int> <dbl>
# 1 2013 1 1 11.4
# 2 2013 1 2 13.7
# 3 2013 1 3 10.9
# 4 2013 1 4 8.97
# 5 2013 1 5 5.73
# 6 2013 1 6 7.15
# 7 2013 1 7 5.42
# 8 2013 1 8 2.56
# 9 2013 1 9 2.30
# 10 2013 1 10 2.84
# # ... with 355 more rows
2.6.3 計數(shù)n()
聚合函數(shù)操作中包括一個計數(shù)n()
或者非缺失值的計數(shù)sum(!is.na())
赊舶,可以幫助我們檢查是否基于非常少量的數(shù)據(jù)做出結(jié)論
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(delay=mean(arr_delay))
ggplot(delays,aes(x=delay)) + geom_freqpoly(binwidth=10)
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(delay=mean(arr_delay,na.rm = T),n=n())
ggplot(delays,aes(x=n,y=delay)) + geom_point(alpha=1/10)
- 將ggplot2集成到dplyr工作流
delays %>%
filter(n<600 & n>20) %>%
ggplot(aes(x=n,y=delay)) + geom_point(alpha=1/10)
delays %>%
filter(!(n>600|n<20)) %>%
ggplot(aes(x=n,y=delay)) + geom_point(alpha=1/10)
delays %>%
filter(n<=600,n>=20) %>%
ggplot(aes(x=n,y=delay)) + geom_point(alpha=1/10)
ctrl+shift+P可以將上一次發(fā)送至控制臺的代碼段重新發(fā)送一次睁搭,在本實(shí)驗(yàn)中,可以用于多次修改n值后笼平,查看輸出的圖形
2.7 常用的摘要函數(shù)
只是使用均值园骆,計數(shù)和求和是遠(yuǎn)遠(yuǎn)不夠的
2.7.1 位置度量 median(x)
mean(x)
:均值
median(x)
:中位數(shù),指的是50%的數(shù)會大于x寓调,50%的數(shù)會小于x
not_cancelled %>%
group_by(year,month,day) %>%
summarise(
ave_delay1=mean(arr_delay),
ave_delay2=mean(arr_delay[arr_delay>0])
)
# # A tibble: 365 x 5
# # Groups: year, month [12]
# year month day ave_delay1 ave_delay2
# <int> <int> <int> <dbl> <dbl>
# 1 2013 1 1 12.7 32.5
# 2 2013 1 2 12.7 32.0
# 3 2013 1 3 5.73 27.7
# 4 2013 1 4 -1.93 28.3
# 5 2013 1 5 -1.53 22.6
# 6 2013 1 6 4.24 24.4
# 7 2013 1 7 -4.95 27.8
# 8 2013 1 8 -3.23 20.8
# 9 2013 1 9 -0.264 25.6
# 10 2013 1 10 -5.90 27.3
# # ... with 355 more rows
2.7.2 分散程度度量 sd(x), IQR(x), mad(x)
sd(x)
標(biāo)準(zhǔn)誤差
IQR(x)
四分位距
mad(x)
絕對中位差
not_cancelled %>%
group_by(dest) %>%
summarise(
sd=sd(distance),
IQR=IQR(distance),
mad=mad(distance)
) %>%
arrange(desc(sd))
# # A tibble: 104 x 4
# dest sd IQR mad
# <chr> <dbl> <dbl> <dbl>
# 1 EGE 10.5 21 1.48
# 2 SAN 10.4 21 0
# 3 SFO 10.2 21 0
# 4 HNL 10.0 20 0
# 5 SEA 9.98 20 0
# 6 LAS 9.91 21 0
# 7 PDX 9.87 20 0
# 8 PHX 9.86 20 0
# 9 LAX 9.66 21 0
# 10 IND 9.46 20 0
2.7.3 秩的度量 min(x), quantitle(x,0.25), max(x)
not_cancelled %>%
group_by(year,month,day) %>%
summarise(
first=min(dep_time),
last=max(dep_time)
)
# # A tibble: 365 x 5
# # Groups: year, month [12]
# year month day first last
# <int> <int> <int> <int> <int>
# 1 2013 1 1 517 2356
# 2 2013 1 2 42 2354
# 3 2013 1 3 32 2349
# 4 2013 1 4 25 2358
# 5 2013 1 5 14 2357
# 6 2013 1 6 16 2355
# 7 2013 1 7 49 2359
# 8 2013 1 8 454 2351
# 9 2013 1 9 2 2252
# 10 2013 1 10 3 2320
# # ... with 355 more rows
2.7.4 定位度量 first(x), nth(x), last(x)
not_cancelled %>%
group_by(year,month,day) %>%
summarise(
first=first(dep_time),
nth=nth(dep_time,1),
last=last(dep_time)
)
# # A tibble: 365 x 6
# # Groups: year, month [12]
# year month day first nth last
# <int> <int> <int> <int> <int> <int>
# 1 2013 1 1 517 517 2356
# 2 2013 1 2 42 42 2354
# 3 2013 1 3 32 32 2349
# 4 2013 1 4 25 25 2358
# 5 2013 1 5 14 14 2357
# 6 2013 1 6 16 16 2355
# 7 2013 1 7 49 49 2359
# 8 2013 1 8 454 454 2351
# 9 2013 1 9 2 2 2252
# 10 2013 1 10 3 3 2320
# # ... with 355 more rows
2.7.5 計數(shù) n()
- sum(!is.na(x)) 計算非缺失值的數(shù)目
- n_distinct(x)
- count()
not_cancelled %>%
group_by(dest) %>%
summarise(
carrier.na = sum(!is.na(carrier)),
carriers = n_distinct(carrier),
)
# # A tibble: 104 x 3
# dest carrier.na carriers
# <chr> <int> <int>
# 1 ABQ 254 1
# 2 ACK 264 1
# 3 ALB 418 1
# 4 ANC 8 1
# 5 ATL 16837 7
# 6 AUS 2411 6
# 7 AVL 261 2
# 8 BDL 412 2
# 9 BGR 358 1
# 10 BHM 269 1
# # ... with 94 more rows
not_cancelled %>%
count(dest)
# # A tibble: 104 x 2
# dest n
# <chr> <int>
# 1 ABQ 254
# 2 ACK 264
# 3 ALB 418
# 4 ANC 8
# 5 ATL 16837
# 6 AUS 2411
# 7 AVL 261
# 8 BDL 412
# 9 BGR 358
# 10 BHM 269
# # ... with 94 more rows
not_cancelled %>%
count(tailnum,wt=distance)
# # A tibble: 4,037 x 2
# tailnum n
# <chr> <dbl>
# 1 D942DN 3418
# 2 N0EGMQ 239143
# 3 N10156 109664
# 4 N102UW 25722
# 5 N103US 24619
# 6 N104UW 24616
# 7 N10575 139903
# 8 N105UW 23618
# 9 N107US 21677
# 10 N108UW 32070
邏輯值的計數(shù)及比例
- sum(x>10)
- mean(y==0)
not_cancelled %>%
group_by(year,month,day) %>%
summarise(n_early = sum(dep_time<500),
hour_perc=mean(arr_delay>60))
# # A tibble: 365 x 5
# # Groups: year, month [12]
# year month day n_early hour_perc
# <int> <int> <int> <int> <dbl>
# 1 2013 1 1 0 0.0722
# 2 2013 1 2 3 0.0851
# 3 2013 1 3 4 0.0567
# 4 2013 1 4 3 0.0396
# 5 2013 1 5 3 0.0349
# 6 2013 1 6 2 0.0470
# 7 2013 1 7 2 0.0333
# 8 2013 1 8 1 0.0213
# 9 2013 1 9 3 0.0202
# 10 2013 1 10 3 0.0183
# # ... with 355 more rows
2.8 按多個變量分組
當(dāng)使用多個變量進(jìn)行分組時锌唾,每次的摘要統(tǒng)計會用掉一個分組變量,后面的變量會在前面變量的基礎(chǔ)上循序漸進(jìn)
(daily <- group_by(flights,year,month,day))
# # A tibble: 336,776 x 19
# # Groups: year, month, day [365]
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN
# 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL
# 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO
# 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD
# # ... with 336,766 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# # time_hour <dttm>
# 取消分組
daily %>% ungroup()
# # A tibble: 336,776 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN
# 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL
# 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO
# 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD
# # ... with 336,766 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# # time_hour <dttm>
2.9 分組新變量
group()可以與mutate(),filter(),select(),arrange()等函數(shù)一起使用
flights %>%
group_by(year,month,day) %>%
filter(rank(desc(arr_delay))<=10)
# # A tibble: 3,609 x 19
# # Groups: year, month, day [365]
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr>
# 1 2013 1 1 848 1835 853 1001 1950 851 MQ 3944 N942MQ JFK BWI
# 2 2013 1 1 1815 1325 290 2120 1542 338 EV 4417 N17185 EWR OMA
# 3 2013 1 1 1842 1422 260 1958 1535 263 EV 4633 N18120 EWR BTV
# 4 2013 1 1 1938 1703 155 2109 1823 166 EV 4300 N18557 EWR RIC
# 5 2013 1 1 1942 1705 157 2124 1830 174 MQ 4410 N835MQ JFK DCA
# 6 2013 1 1 2006 1630 216 2230 1848 222 EV 4644 N14972 EWR SAV
# 7 2013 1 1 2115 1700 255 2330 1920 250 9E 3347 N924XJ JFK CVG
# 8 2013 1 1 2205 1720 285 46 2040 246 AA 1999 N5DNAA EWR MIA
# 9 2013 1 1 2312 2000 192 21 2110 191 EV 4312 N13958 EWR DCA
# 10 2013 1 1 2343 1724 379 314 1938 456 EV 4321 N21197 EWR MCI
# # ... with 3,599 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
group_by(dest) %>%
filter(n()>365)
# # A tibble: 332,577 x 19
# # Groups: dest [77]
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN
# 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL
# 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO
# 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD
# # ... with 332,567 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# # time_hour <dttm>
Reference
R數(shù)據(jù)科學(xué)夺英,哈德利·威克姆晌涕,加勒特·格羅勒芒德等 著