《R for Data Science》第十六章 Dates and times 啃書知識點積累
參考鏈接:R for Data Science
Dates and times are hard because they have to reconcile two physical phenomena
(the rotation of the Earth and its orbit around the sun)
with a whole raft of geopolitical phenomena including months, time zones, and DST.
目前已經把R更新到4.0，安裝
nycflights13
出了些麻煩，最後用以下命令
# Install nycflights13 1.0.1 straight from its CRAN source tarball.
# repos = NULL tells install.packages() that the first argument is a
# file/URL to install rather than a package name to look up in a repository.
install.packages("https://cran.r-project.org/src/contrib/nycflights13_1.0.1.tar.gz",
repos=NULL, method="libcurl")
Creating date/times
A date-time is a date plus a time: it uniquely identifies an instant in time.
Tibbles print this as <dttm>. Elsewhere in R these are called POSIXct.
library(lubridate)
# Current date.
today()
#> [1] "2020-04-27"
# Current date-time.
now()
#> [1] "2020-04-27 10:11:40 CST"
# The tzone argument of today() controls which time zone is used.
?today()
today("GMT")
today("UTC")
- From strings
# Pick the parser whose name spells the order of the components in the string.
ymd("2020-04-27")
#> [1] "2020-04-27"
# The misspelt ordinal suffix ("27st") does not stop the parse, as the output shows.
mdy("April 27st, 2020")
#> [1] "2020-04-27"
dmy("27-Apr-2020")
#> [1] "2020-04-27"
mdy("Apr-27-2020")
#> [1] "2020-04-27"
# Date-time (dttm) strings can be parsed as well
ymd_hms("2020-04-27 20:11:59")
#> [1] "2020-04-27 20:11:59 UTC"
mdy_hm("04/27/2020 08:01")
#> [1] "2020-04-27 08:01:00 UTC"
# If the vector contains a string that cannot be parsed, that element
# becomes NA and a warning is raised
ymd(c('20200427','XiChen'))
#> [1] "2020-04-27" NA
#> Warning message:
#>  1 failed to parse.
- These functions also take unquoted numbers
# An unquoted number is accepted in place of the equivalent string.
ymd(20200427)
#> [1] "2020-04-27"
# With tz supplied the printed result carries a time zone (note the "UTC").
ymd(20200427, tz = "UTC")
#> [1] "2020-04-27 UTC"
- From individual components
To create a date/time from this sort of input, use
make_date()
for dates, or make_datetime()
for date-times
library(tidyverse)
library(nycflights13)
library(lubridate)
# Combine the separate year/month/day/hour/minute columns of `flights`
# into a single date-time column named `departure`.
flights %>%
select(year, month, day, hour, minute) %>%
mutate(departure = make_datetime(year, month, day, hour, minute))
#> # A tibble: 336,776 x 6
#> year month day hour minute departure
#> <int> <int> <int> <dbl> <dbl> <dttm>
#> 1 2013 1 1 5 15 2013-01-01 05:15:00
#> 2 2013 1 1 5 29 2013-01-01 05:29:00
#> 3 2013 1 1 5 40 2013-01-01 05:40:00
#> 4 2013 1 1 5 45 2013-01-01 05:45:00
#> 5 2013 1 1 6 0 2013-01-01 06:00:00
#> 6 2013 1 1 5 58 2013-01-01 05:58:00
#> # … with 3.368e+05 more rows
# make_date() builds a Date (no time of day) from the same components.
# `day` has to be passed explicitly: make_date() defaults it to 1, so leaving
# it out would place every flight on the first day of its month.
flights %>%
  select(year, month, day, hour, minute) %>%
  mutate(date = make_date(year, month, day))
- 自建函數配合
make_datetime()
解析日期時間
# Build a date-time from a calendar date plus a clock time encoded as an
# HHMM number (e.g. 517 means 05:17).
#
# year, month, day: date components, forwarded unchanged to make_datetime().
# time:             numeric HHMM value; integer division by 100 gives the
#                   hour and the remainder gives the minute.
# Returns whatever make_datetime() returns for those components.
make_datetime_100 <- function(year, month, day, time) {
  hh <- time %/% 100
  mm <- time %% 100
  make_datetime(year, month, day, hh, mm)
}
# flights_dt: flights with real date-time columns.
#   1. keep only rows where both the departure and the arrival time are known;
#   2. replace the four HHMM-encoded time columns with date-times built from
#      the flight's year/month/day;
#   3. keep origin, dest and every column whose name ends in "delay" or "time".
flights_dt <- flights %>%
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    dep_time       = make_datetime_100(year, month, day, dep_time),
    arr_time       = make_datetime_100(year, month, day, arr_time),
    sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
    sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
  ) %>%
  select(
    origin,
    dest,
    ends_with("delay"),
    ends_with("time")
  )
- From other types
-
date
和dttm
互換
# Date -> date-time
as_datetime(today())
#> [1] "2020-04-27 UTC"
# Date-time -> date
as_date(now())
#> [1] "2020-04-27"
- “Unix Epoch” 基于1970-01-01
# A number given to as_datetime() is read as seconds since 1970-01-01
# (here 10 hours' worth).
as_datetime(60 * 60 * 10)
#> [1] "1970-01-01 10:00:00 UTC"
# A number given to as_date() is read as days since 1970-01-01; the "+ 2"
# accounts for the two leap years that fall inside the ten-year span.
as_date(365 * 10 + 2)
#> [1] "1980-01-01"
Date-time components
# Accessor functions pull individual components out of a date-time.
datetime <- ymd_hms("2016-07-08 12:34:56")
year(datetime)
#> [1] 2016
month(datetime)
#> [1] 7
# Day of the month
mday(datetime)
#> [1] 8
# Day of the year
yday(datetime)
#> [1] 190
# Day of the week as a number (this Friday comes out as 6)
wday(datetime)
#> [1] 6
- 可以設置具體參數優化提取
# label = TRUE returns the abbreviated name as an ordered factor instead of a number.
month(datetime, label = TRUE)
#> [1] Jul
#> 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
# abbr = FALSE spells the name out in full.
wday(datetime, label = TRUE, abbr = FALSE)
#> [1] Friday
#> 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
# The label argument of wday() decides what text ends up on the axis:
# p1 maps the numeric weekday, p2 maps the labelled weekday.
p1 <- flights_dt %>%
mutate(wday = wday(dep_time)) %>%
ggplot(aes(x = wday)) +
geom_bar()
p2 <- flights_dt %>%
mutate(wday = wday(dep_time, label = TRUE)) %>%
ggplot(aes(x = wday)) +
geom_bar()
# NOTE(review): adding two ggplot objects is not defined by ggplot2 itself;
# this presumably relies on the patchwork package being attached, which this
# file never loads -- confirm.
p1 + p2
- 一個類似于“幸存者偏差”的案例
# Actual departure: average arrival delay grouped by the minute of the
# actual departure time.
p1 <- flights_dt %>%
mutate(minute = minute(dep_time)) %>%
group_by(minute) %>%
summarise(
avg_delay = mean(arr_delay, na.rm = TRUE),
n = n()) %>%
ggplot() +
geom_line(aes(minute, avg_delay))
# Scheduled departure: the same summary grouped by the minute of the
# scheduled departure time.
p2 <- flights_dt %>%
mutate(minute = minute(sched_dep_time)) %>%
group_by(minute) %>%
summarise(
avg_delay = mean(arr_delay, na.rm = TRUE),
n = n()) %>%
ggplot() +
geom_line(aes(minute, avg_delay))
# NOTE(review): adding two ggplot objects is not defined by ggplot2 itself;
# this presumably relies on the patchwork package being attached, which this
# file never loads -- confirm.
p1 + p2
- Rounding
將時間歸併到近似單元
floor_date()
round_date()
-
ceiling_date()
(需要指定unit)
# First day of every month of the current year: round today's date down to
# the start of the year, then add 0 to 11 months.
floor_date(today(), unit = "year") + months(0:11)
#> [1] "2020-01-01" "2020-02-01" "2020-03-01" "2020-04-01" "2020-05-01"
#> [6] "2020-06-01" "2020-07-01" "2020-08-01" "2020-09-01" "2020-10-01"
#> [11] "2020-11-01" "2020-12-01"
# Flights per week: round each departure down to the start of its week,
# count the rows in each week and draw the counts as a line.
flights_dt %>%
count(week = floor_date(dep_time, "week")) %>%
ggplot(aes(week, n)) +
geom_line()
- Setting components
# Components can be set one at a time with the accessor functions.
(datetime <- ymd_hms("2016-07-08 12:34:56"))
#> [1] "2016-07-08 12:34:56 UTC"
year(datetime) <- 2020
datetime
#> [1] "2020-07-08 12:34:56 UTC"
month(datetime) <- 01
datetime
#> [1] "2020-01-08 12:34:56 UTC"
hour(datetime) <- hour(datetime) + 1
datetime
#> [1] "2020-01-08 13:34:56 UTC"
# update() sets several components in one call and returns the new value;
# `datetime` itself is not reassigned here.
update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
#> [1] "2020-02-02 02:34:56 UTC"
# Values that are too large roll over into the next unit
ymd("2015-02-01") %>%
update(mday = 30)
#> [1] "2015-03-02"
ymd("2015-02-01") %>%
update(hour = 400)
#> [1] "2015-02-17 16:00:00 UTC"
- Q: How does the distribution of flight times within a day change over the course of the year?
# Distribution of departure times within a day, one line per month.
# Setting yday = 1 moves every departure onto January 1st while keeping its
# time of day, so all flights share a single 24-hour x axis; the month is
# taken from the original dep_time and used for colour.
flights_dt %>%
  filter(!is.na(dep_time)) %>%
  mutate(
    dep_hour = update(dep_time, yday = 1),
    month = factor(month(dep_time))
  ) %>%
  ggplot(aes(dep_hour, color = month)) +
  # after_stat(density) replaces the deprecated `..density..` notation.
  # binwidth is 60 * 60; for a date-time axis that is in seconds, i.e. one hour.
  geom_freqpoly(aes(y = after_stat(density)), binwidth = 60 * 60)
Time spans
- Durations
Durations always record the time span in seconds.
# Subtracting one date from another gives a time difference in days.
c_age <- today() - ymd(19941027)
c_age
#> Time difference of 9314 days
as.duration(c_age) # convert to a duration, which is expressed in seconds
#> [1] "804729600s (~25.5 years)"
# Constructors for durations of a given length
dseconds(15)
#> [1] "15s"
dminutes(10)
#> [1] "600s (~10 minutes)"
dhours(c(12, 24))
#> [1] "43200s (~12 hours)" "86400s (~1 days)"
ddays(0:5)
#> [1] "0s" "86400s (~1 days)" "172800s (~2 days)"
#> [4] "259200s (~3 days)" "345600s (~4 days)" "432000s (~5 days)"
dweeks(3)
#> [1] "1814400s (~3 weeks)"
dyears(1)
#> [1] "31536000s (~52.14 weeks)"
# Durations support arithmetic
2 * dyears(1)
#> [1] "63072000s (~2 years)"
dyears(1) + dweeks(12) + dhours(15)
#> [1] "38847600s (~1.23 years)"
tomorrow <- today() + ddays(1);tomorrow
#> [1] "2020-04-28"
last_year <- today() - dyears(1)
# The assignment prints nothing. With the 365-day dyears(1) shown above and
# today() being 2020-04-27, last_year is "2019-04-28" (2020-02-29 lies inside
# the span); the value noted here before, "2020-04-26", could not be right.
- Periods
Periods are time spans but don’t have a fixed length in seconds, instead they work with “human” times, like days and months.
# Durations are a fixed number of seconds, so they do not follow a
# time-zone (DST) change: adding ddays(1) across the EST -> EDT switch
# lands on 14:00 instead of 13:00.
one_pm <- ymd_hms("2016-03-12 13:00:00", tz = "America/New_York")
one_pm
#> [1] "2016-03-12 13:00:00 EST"
one_pm + ddays(1)
#> [1] "2016-03-13 14:00:00 EDT"
# Periods do: adding days(1) keeps the clock time at 13:00.
one_pm
#> [1] "2016-03-12 13:00:00 EST"
one_pm + days(1)
#> [1] "2016-03-13 13:00:00 EDT"
# Another example
# A leap year: a 365-day duration falls one day short of the next 1 January,
# while the period years(1) lands exactly on it.
ymd("2016-01-01") + dyears(1)
#> [1] "2016-12-31"
ymd("2016-01-01") + years(1)
#> [1] "2017-01-01"
- 多數時候periods和durations用法類似
但解析的是“human units”而不是durations中的秒
# Constructors for periods; each unit is kept separately in the printed
# result instead of being converted to seconds.
seconds(15)
#> [1] "15S"
minutes(10)
#> [1] "10M 0S"
hours(c(12, 24))
#> [1] "12H 0M 0S" "24H 0M 0S"
days(7)
#> [1] "7d 0H 0M 0S"
months(1:6)
#> [1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
#> [5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
# weeks() is printed in days
weeks(3)
#> [1] "21d 0H 0M 0S"
years(1)
#> [1] "1y 0m 0d 0H 0M 0S"
# Periods support arithmetic as well; units are not carried over
# (25H stays 25H rather than becoming 1d 1H).
10 * (months(6) + days(1))
#> [1] "60m 10d 0H 0M 0S"
days(50) + hours(25) + minutes(2)
#> [1] "50d 25H 2M 0S"
- Intervals
涉及的符號:
%--%
# Dividing one period by another gives 365.25 here, i.e. an average year
# rather than the length of any particular year.
years(1) / days(1)
#> [1] 365.25
# An interval (start %--% end) is tied to actual dates, so dividing it by a
# one-day duration gives the real number of days in this particular span.
next_year <- today() + years(1)
(today() %--% next_year) / ddays(1)
#> [1] 365
# Integer division by a period gives the number of whole days in the interval.
(today() %--% next_year) %/% days(1)
#> [1] 365
- Summary
If you only care about physical time, use a duration;
if you need to add human times, use a period;
if you need to figure out how long a span is in human units, use an interval.
Time zones
用的少，就放兩個可能用到的代碼
# Name of the time zone the current R session is running in.
Sys.timezone()
#> [1] "Asia/Taipei"
# Parse a date-time string as local time in that zone instead of the default UTC.
ymd_hms("2020-04-27 12:00:00", tz = Sys.timezone())
#> [1] "2020-04-27 12:00:00 CST"