- .加載包
library(dplyr)
library(data.table)
library(lubridate)
library(jsonlite)
library(tidyr)
library(ggplot2)
library(compare)
使用jsonlite包中的fromJSON函數(shù)來下載數(shù)據(jù)集的JSON格式數(shù)據(jù)煎殷。
spending=fromJSON("https://data.medicare.gov/api/views/nrth-mfg3/rows.json?accessType=DOWNLOAD")
names(spending)
- .數(shù)據(jù)處理
meta <- spending$meta
hospital_spending <- data.frame(spending$data)
colnames(hospital_spending) <- make.names(meta$view$columns$name)
## 查看數(shù)據(jù)
glimpse(hospital_spending)
# select列篩選
hospital_spending <- select(hospital_spending,-c(sid:meta))
glimpse(hospital_spending)
導(dǎo)入的所有數(shù)據(jù)列都是因子型數(shù)據(jù)。
下面我們將列中數(shù)據(jù)為數(shù)值的列改為數(shù)值型數(shù)據(jù):
#因子型到數(shù)值型:先as.character再as.numeric
cols = 6:11 #需要改變數(shù)據(jù)類型的列
## 這里操作可以把需要改變的列作為一個向量變量
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.character)
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.numeric)
最后兩列數(shù)據(jù)分別是數(shù)據(jù)收集的起始日期和結(jié)束日期饵撑。
使用lubridate包來糾正這兩列的數(shù)據(jù)類型:
cols = 12:13;
hospital_spending[,cols] <- lapply(hospital_spending[,cols],ymd_hms)
檢查數(shù)據(jù)列是否是我們想要的數(shù)據(jù)類型:
sapply(hospital_spending,class) #sapply的用法
- .創(chuàng)建data.table類型數(shù)據(jù)
使用data.table函數(shù)創(chuàng)建data.table類型數(shù)據(jù):
class(hospital_spending)
hospital_spending_DT <- data.table(hospital_spending)
class(hospital_spending_DT)
- .選取數(shù)據(jù)集的某些列
對于選取數(shù)據(jù)列锨咙,我們可以使用dplyr包中的select函數(shù)酪刀。
另一方面眼滤,我們只需在data.table中指定對應(yīng)的列名即可。
選取一個變量
from_dplyr <- select(hospital_spending,Hospital_Name)
from_data_table <- hospital_spending_DT[,.(Hospital_Name)]
####from_data_table <- hospital_spending_DT[,Hospital_Name]
##直接這樣也是可以的
對比下dplyr和data.table給出的結(jié)果是否相同:
compare(from_dplyr,from_data_table,allowAll = TRUE)
刪除一個變量
from_dplyr <- select(hospital_spending,-Hospital_Name)
from_data_table <- hospital_spending_DT[,Hospital_Name := NULL]
####另一種寫法
#from_data_table = hospital_spending_DT[,!c("Hospital.Name"),with=FALSE]
compare(from_dplyr,from_data_table,allowAll = TRUE)
對copy()函數(shù)所復(fù)制的輸入對象得到的引用執(zhí)行任何操作都不會對原始數(shù)據(jù)對象產(chǎn)生任何影響。如下所示:
DT=copy(hospital_spending_DT)
"Hospital_Name"%in% names(DT)
#刪除其中一列
DT <- DT[,!c("Hospital_Name"),with = FALSE]
"Hospital_Name"%in% names(DT) #驗證是不是在里面
刪除多個變量
DT=copy(hospital_spending_DT)
DT=DT[,c("Hospital_Name","State","Measure.Start.Date","Measure.End.Date"):=NULL]
c("Hospital_Name","State","Measure.Start.Date","Measure.End.Date") %in% names(DT)
選取多個變量
from_dplyr = select(hospital_spending, Hospital.Name,State,Measure.Start.Date,Measure.End.Date)
from_data_table = hospital_spending_DT[,.(Hospital.Name,State,Measure.Start.Date,Measure.End.Date)]
compare(from_dplyr,from_data_table, allowAll=TRUE)
TRUE
dropped attributes
刪除多個變量
現(xiàn)在牵现,我們要刪除hospital_spending數(shù)據(jù)框和data.table類型數(shù)據(jù)hospital_spending_DT中的變量Hospital.Name施籍,State,Measure.Start.Date,Measure.End.Date:
from_dplyr = select(hospital_spending, -c(Hospital.Name,State,Measure.Start.Date,Measure.End.Date))
from_data_table = hospital_spending_DT[,!c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"),with=FALSE]
compare(from_dplyr,from_data_table, allowAll=TRUE)
TRUE
dropped attributes
dplyr包中有contains()照弥,starts_with()这揣,ends_with()三個函數(shù)影斑,它們可以跟select()函數(shù)一起結(jié)合使用机打。對于data.table,我們則可以使用正則表達式残邀。下面我們將選取所有列名包含字符“Date”的列芥挣,示例如下:
from_dplyr = select(hospital_spending,contains("Date"))
from_data_table = subset(hospital_spending_DT,select=grep("Date",names(hospital_spending_DT)))
compare(from_dplyr,from_data_table, allowAll=TRUE)
names(from_dplyr)
重命名列名
setnames(hospital_spending_DT,c("Hospital.Name", "Measure.Start.Date","Measure.End.Date"), c("Hospital","Start_Date","End_Date"))
names(hospital_spending_DT)
"Hospital" "Provider.Number." "State" "Period" "Claim.Type" "Avg.Spending.Per.Episode..Hospital." "Avg.Spending.Per.Episode..State." "Avg.Spending.Per.Episode..Nation." "Percent.of.Spending..Hospital." "Percent.of.Spending..State." "Percent.of.Spending..Nation." "Start_Date" "End_Date"
hospital_spending = rename(hospital_spending,Hospital= Hospital.Name, Start_Date=Measure.Start.Date,End_Date=Measure.End.Date)
compare(hospital_spending,hospital_spending_DT, allowAll=TRUE)
TRUE
dropped attributes
篩選行
對于數(shù)據(jù)集特定行的篩選,我們可以使用dplyr包中的filter函數(shù)空另,它通過可能包含正則表達式的邏輯語句來實現(xiàn)該功能蹋砚。在data.table中都弹,我們只需使用邏輯語句就可以了匙姜。
對單個變量進行篩選
# selecting rows for California
from_dplyr = filter(hospital_spending,State=='CA')
from_data_table = hospital_spending_DT[State=='CA']
compare(from_dplyr,from_data_table, allowAll=TRUE)
TRUE
dropped attributes
對多個變量進行篩選
from_dplyr = filter(hospital_spending,State=='CA' & Claim.Type!="Hospice")
from_data_table = hospital_spending_DT[State=='CA' & Claim.Type!="Hospice"]
compare(from_dplyr,from_data_table, allowAll=TRUE)
TRUE
dropped attributes
from_dplyr = filter(hospital_spending,State %in% c('CA','MA',"TX"))
from_data_table = hospital_spending_DT[State %in% c('CA','MA',"TX")]
unique(from_dplyr$State)
CA MA TX
compare(from_dplyr,from_data_table, allowAll=TRUE)
TRUE
dropped attributes
數(shù)據(jù)排序
我們使用dplyr包中的arrange()函數(shù)對數(shù)據(jù)行進行排序氮昧,可以實現(xiàn)對一個或多個變量的數(shù)據(jù)行進行排序。如果想實現(xiàn)降序咪辱,需使用如下代碼所示的desc()函數(shù)依啰。以下示例演示了如何對數(shù)據(jù)行進行升序和降序排序:
#升序
from_dplyr = arrange(hospital_spending, State)
from_data_table = setorder(hospital_spending_DT, State)
compare(from_dplyr,from_data_table, allowAll=TRUE)
#降序
from_dplyr = arrange(hospital_spending, desc(State))
from_data_table = setorder(hospital_spending_DT, -State)
compare(from_dplyr,from_data_table, allowAll=TRUE)
對多變量進行排序
以下代碼實現(xiàn)了State變量升序企锌,End_Date變量降序排序:
from_dplyr = arrange(hospital_spending, State,desc(End_Date))
from_data_table = setorder(hospital_spending_DT, State,-End_Date)
compare(from_dplyr,from_data_table, allowAll=TRUE)
添加或更新列
在dplyr包中专筷,使用mutate()函數(shù)來添加新列蒸苇。在data.table包中,我們可以使用:=引用來添加或更新列:
from_dplyr = mutate(hospital_spending, diff=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.)
from_data_table = copy(hospital_spending_DT)
from_data_table = from_data_table[,diff := Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.]
compare(from_dplyr,from_data_table, allowAll=TRUE)
from_dplyr = mutate(hospital_spending, diff1=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.,diff2=End_Date-Start_Date)
from_data_table = copy(hospital_spending_DT)
from_data_table = from_data_table[,c("diff1","diff2") := list(Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.,diff2=End_Date-Start_Date)]
compare(from_dplyr,from_data_table, allowAll=TRUE)
數(shù)據(jù)匯總
我們可以使用dplyr包中的summarise()函數(shù)來創(chuàng)建概括性統(tǒng)計量:
summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.))
mean 1820.409
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.))]
mean 1820.409
summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.),
maximum=max(Avg.Spending.Per.Episode..Nation.),
minimum=min(Avg.Spending.Per.Episode..Nation.),
median=median(Avg.Spending.Per.Episode..Nation.))
mean maximum minimum median
1820.409 20025 0 109
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.),
maximum=max(Avg.Spending.Per.Episode..Nation.),
minimum=min(Avg.Spending.Per.Episode..Nation.),
median=median(Avg.Spending.Per.Episode..Nation.))]
mean maximum minimum median
1820.409 20025 0 109
當(dāng)然庇勃,我們也可以對各分組的數(shù)據(jù)塊分別求概述性統(tǒng)計量槽驶。在dplyr中使用group_by()函數(shù),data.table中指定by參數(shù)即可:
head(hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)),by=.(Hospital)])
mygroup= group_by(hospital_spending,Hospital,State)
from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.))
from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital,State)]
compare(from_dplyr,from_data_table, allowAll=TRUE)
鏈?zhǔn)讲僮?/p>
在dplyr和data.table包中再层,我們可以使用鏈?zhǔn)讲僮鱽韺崿F(xiàn)代碼的連續(xù)性堡纬。在dplyr中聂受,使用magrittr包中的%>%管道函數(shù)非晨靖洌酷。%>%的功能是用于實現(xiàn)將一個函數(shù)的輸出傳遞給下一個函數(shù)的第一個參數(shù)炮叶。在data.table中,我們可以使用%>%或[來實現(xiàn)鏈?zhǔn)讲僮鳌?/p>
from_dplyr=hospital_spending %>% group_by(Hospital,State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.))
from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital,State)]
compare(from_dplyr,from_data_table, allowAll=TRUE)
hospital_spending %>% group_by(State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.)) %>%
arrange(desc(mean)) %>% head(10) %>%
mutate(State = factor(State,levels = State[order(mean,decreasing =TRUE)])) %>%
ggplot(aes(x=State,y=mean))+geom_bar(stat='identity',color='darkred',fill='skyblue')+
xlab("")+ggtitle('Average Spending Per Episode by State')+
ylab('Average')+ coord_cartesian(ylim = c(3800, 4000))
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)),
by=.(State)][order(-mean)][1:10] %>%
mutate(State = factor(State,levels = State[order(mean,decreasing =TRUE)])) %>%
ggplot(aes(x=State,y=mean))+geom_bar(stat='identity',color='darkred',fill='skyblue')+
xlab("")+ggtitle('Average Spending Per Episode by State')+
ylab('Average')+ coord_cartesian(ylim = c(3800, 4000))