dplyr和data.table

.加載包

library(dplyr)
library(data.table)
library(lubridate)
library(jsonlite)
library(tidyr)
library(ggplot2)
library(compare)

使用jsonlite包中的fromJSON函數來下載數據集的JSON格式數據。

# Fetch the JSON export from data.medicare.gov and parse it into an R object
# (requires network access).
spending <- fromJSON(
  "https://data.medicare.gov/api/views/nrth-mfg3/rows.json?accessType=DOWNLOAD"
)
# List the top-level elements of the parsed object.
names(spending)

.數據處理

# Separate the metadata from the data rows of the parsed JSON.
meta <- spending$meta
hospital_spending <- data.frame(spending$data)
# Column names are taken from the metadata; make.names() converts them into
# syntactically valid R names.
names(hospital_spending) <- make.names(meta$view$columns$name)

# Inspect the data
glimpse(hospital_spending)

# Column selection with select(): drop the run of columns from `sid` to `meta`.
hospital_spending <- hospital_spending %>% select(-(sid:meta))
glimpse(hospital_spending)

導入的所有數據列都是因子型數據。
下面我們將列中數據為數值的列改為數值型數據：

# Factor -> numeric: convert with as.character() first, then as.numeric().

cols <- 6:11  # positions of the columns whose type must change
# The column positions are kept in one vector so the conversion can index
# the data frame on both sides of the assignment.
hospital_spending[, cols] <- lapply(
  hospital_spending[, cols],
  function(column) as.numeric(as.character(column))
)

最后兩列數據分別是數據收集的起始日期和結束日期。
使用lubridate包來糾正這兩列的數據類型：

# Parse columns 12 and 13 with lubridate's ymd_hms().
cols <- 12:13
hospital_spending[cols] <- lapply(hospital_spending[cols], ymd_hms)

檢查數據列是否是我們想要的數據類型：

# Example use of sapply(): report the class of every column.
sapply(X = hospital_spending, FUN = class)

.創建data.table類型數據

使用data.table函數創建data.table類型數據：

# Class of the object before conversion.
class(hospital_spending)

# Build a data.table from the data frame, then show the class of the result.
hospital_spending_DT <- data.table::data.table(hospital_spending)
class(hospital_spending_DT)

.選取數據集的某些列

對于選取數據列，我們可以使用dplyr包中的select函數。
另一方面，我們只需在data.table中指定對應的列名即可。

選取一個變量

# Select a single column with each package.
# The column names were produced by make.names(), so the column is called
# `Hospital.Name`; there is no `Hospital_Name` column.
from_dplyr <- select(hospital_spending, Hospital.Name)
from_data_table <- hospital_spending_DT[, .(Hospital.Name)]

# Alternative: hospital_spending_DT[, Hospital.Name] also works, but it returns
# a plain vector instead of a one-column data.table.

對比下dplyr和data.table給出的結果是否相同：

# Check that both results match, allowing every transformation compare() offers.
compare(from_dplyr, from_data_table, allowAll = TRUE)

刪除一個變量

# Drop a single column with each package. The column is `Hospital.Name`
# (dotted, as produced by make.names()).
from_dplyr <- select(hospital_spending, -Hospital.Name)
# A negated column selection returns a new data.table. `Hospital.Name := NULL`
# would instead delete the column from hospital_spending_DT itself (by
# reference) and break every later step that still reads that column.
from_data_table <- hospital_spending_DT[, !c("Hospital.Name"), with = FALSE]
# In-place alternative, to be used on a copy only:
# from_data_table <- copy(hospital_spending_DT)[, Hospital.Name := NULL]

compare(from_dplyr, from_data_table, allowAll = TRUE)

對copy()函數所復制的輸入對象得到的引用執行任何操作都不會對原始數據對象產生任何影響。如下所示：

# Work on an independent copy so hospital_spending_DT is not modified.
DT <- copy(hospital_spending_DT)
# The column is `Hospital.Name` (dotted, as produced by make.names()).
"Hospital.Name" %in% names(DT)

# Drop one column
DT <- DT[, !c("Hospital.Name"), with = FALSE]
"Hospital.Name" %in% names(DT)  # verify whether the column is still present

刪除多個變量

# Delete several columns from a copy, leaving hospital_spending_DT intact.
DT <- copy(hospital_spending_DT)

# Column names are dotted (make.names()), so the first one is `Hospital.Name`.
# `:= NULL` removes the columns from DT in place.
DT[, c("Hospital.Name", "State", "Measure.Start.Date", "Measure.End.Date") := NULL]

# Every element should now be FALSE.
c("Hospital.Name", "State", "Measure.Start.Date", "Measure.End.Date") %in% names(DT)

選取多個變量

# Select several columns with each package and compare the results.
from_dplyr <- select(
  hospital_spending,
  Hospital.Name, State, Measure.Start.Date, Measure.End.Date
)
from_data_table <- hospital_spending_DT[, .(
  Hospital.Name, State, Measure.Start.Date, Measure.End.Date
)]
compare(from_dplyr, from_data_table, allowAll = TRUE)
# Console output recorded in the original post (kept as comments so the
# listing stays valid R):
#> TRUE
#>   dropped attributes

刪除多個變量

現在，我們要刪除hospital_spending數據框和data.table類型數據hospital_spending_DT中的變量Hospital.Name，State，Measure.Start.Date，Measure.End.Date：

# Drop several columns with each package; neither call modifies its input.
from_dplyr <- select(
  hospital_spending,
  -c(Hospital.Name, State, Measure.Start.Date, Measure.End.Date)
)
from_data_table <- hospital_spending_DT[
  , !c("Hospital.Name", "State", "Measure.Start.Date", "Measure.End.Date"),
  with = FALSE
]
compare(from_dplyr, from_data_table, allowAll = TRUE)
# Console output recorded in the original post (kept as comments so the
# listing stays valid R):
#> TRUE
#>   dropped attributes

dplyr包中有contains()，starts_with()，ends_with()三個函數，它們可以跟select()函數一起結合使用。對于data.table，我們則可以使用正則表達式。下面我們將選取所有列名包含字符“Date”的列，示例如下：

# Keep every column whose name contains "Date": a select helper on the dplyr
# side, a grep() over the column names on the data.table side.
from_dplyr <- hospital_spending %>% select(contains("Date"))
from_data_table <- subset(
  hospital_spending_DT,
  select = grep("Date", names(hospital_spending_DT))
)
compare(from_dplyr, from_data_table, allowAll = TRUE)
names(from_dplyr)

重命名列名

# Rename three columns of the data.table; setnames() changes the names of
# hospital_spending_DT in place.
setnames(
  hospital_spending_DT,
  old = c("Hospital.Name", "Measure.Start.Date", "Measure.End.Date"),
  new = c("Hospital", "Start_Date", "End_Date")
)

names(hospital_spending_DT)
# Console output recorded in the original post (kept as comments so the
# listing stays valid R):
#> "Hospital" "Provider.Number." "State" "Period" "Claim.Type"
#> "Avg.Spending.Per.Episode..Hospital." "Avg.Spending.Per.Episode..State."
#> "Avg.Spending.Per.Episode..Nation." "Percent.of.Spending..Hospital."
#> "Percent.of.Spending..State." "Percent.of.Spending..Nation."
#> "Start_Date" "End_Date"

# Apply the same renaming to the data frame with dplyr (new_name = old_name).
hospital_spending <- rename(
  hospital_spending,
  Hospital = Hospital.Name,
  Start_Date = Measure.Start.Date,
  End_Date = Measure.End.Date
)

compare(hospital_spending, hospital_spending_DT, allowAll = TRUE)
#> TRUE
#>   dropped attributes

篩選行

對于數據集特定行的篩選，我們可以使用dplyr包中的filter函數，它通過可能包含正則表達式的邏輯語句來實現該功能。在data.table中，我們只需使用邏輯語句就可以了。

對單個變量進行篩選

# Select the rows for California with each package.
from_dplyr <- filter(hospital_spending, State == "CA")

from_data_table <- hospital_spending_DT[State == "CA"]

compare(from_dplyr, from_data_table, allowAll = TRUE)
# Console output recorded in the original post (kept as comments so the
# listing stays valid R):
#> TRUE
#>   dropped attributes

對多個變量進行篩選

# Rows for California whose claim type is not "Hospice".
from_dplyr <- filter(hospital_spending, State == "CA" & Claim.Type != "Hospice")
from_data_table <- hospital_spending_DT[State == "CA" & Claim.Type != "Hospice"]

compare(from_dplyr, from_data_table, allowAll = TRUE)
# Console output recorded in the original post (kept as comments so the
# listing stays valid R):
#> TRUE
#>   dropped attributes

# Rows whose state is one of CA, MA or TX.
from_dplyr <- filter(hospital_spending, State %in% c("CA", "MA", "TX"))
from_data_table <- hospital_spending_DT[State %in% c("CA", "MA", "TX")]

unique(from_dplyr$State)
# Console output recorded in the original post (kept as comments so the
# listing stays valid R):
#> CA MA TX

compare(from_dplyr, from_data_table, allowAll = TRUE)
#> TRUE
#>   dropped attributes

數據排序

我們使用dplyr包中的arrange()函數對數據行進行排序，可以實現對一個或多個變量的數據行進行排序。如果想實現降序，需使用如下代碼所示的desc()函數。以下示例演示了如何對數據行進行升序和降序排序：

# Ascending order
from_dplyr <- hospital_spending %>% arrange(State)
# NOTE(review): setorder() reorders hospital_spending_DT itself, so
# from_data_table refers to that same reordered table.
from_data_table <- setorder(hospital_spending_DT, State)

compare(from_dplyr, from_data_table, allowAll = TRUE)

# Descending order
from_dplyr <- hospital_spending %>% arrange(desc(State))
from_data_table <- setorder(hospital_spending_DT, -State)

compare(from_dplyr, from_data_table, allowAll = TRUE)

對多變量進行排序

以下代碼實現了State變量升序，End_Date變量降序排序：

# Sort by State ascending, then End_Date descending, with each package.
from_dplyr <- hospital_spending %>% arrange(State, desc(End_Date))
from_data_table <- setorder(hospital_spending_DT, State, -End_Date)

compare(from_dplyr, from_data_table, allowAll = TRUE)

添加或更新列

在dplyr包中，使用mutate()函數來添加新列。在data.table包中，我們可以使用:=引用來添加或更新列：

# Add one column: state average minus national average.
from_dplyr <- hospital_spending %>%
  mutate(
    diff = Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.
  )
# `:=` adds the column in place, so it is applied to a copy.
from_data_table <- copy(hospital_spending_DT)[
  , diff := Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.
]
compare(from_dplyr, from_data_table, allowAll = TRUE)


# Add two columns at once: the spending difference and the length of the
# measurement period.
from_dplyr <- hospital_spending %>%
  mutate(
    diff1 = Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.,
    diff2 = End_Date - Start_Date
  )
from_data_table <- copy(hospital_spending_DT)[, `:=`(
  diff1 = Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.,
  diff2 = End_Date - Start_Date
)]
compare(from_dplyr, from_data_table, allowAll = TRUE)

數據匯總

我們可以使用dplyr包中的summarise()函數來創建概括性統計量：

# Overall mean of the national average spending, with dplyr and data.table.
# Console output recorded in the original post is kept as `#>` comments so the
# listing stays valid R.
summarize(hospital_spending, mean = mean(Avg.Spending.Per.Episode..Nation.))
#> mean 1820.409

hospital_spending_DT[, .(mean = mean(Avg.Spending.Per.Episode..Nation.))]
#> mean 1820.409

# Several summary statistics in one call (dplyr).
summarize(
  hospital_spending,
  mean = mean(Avg.Spending.Per.Episode..Nation.),
  maximum = max(Avg.Spending.Per.Episode..Nation.),
  minimum = min(Avg.Spending.Per.Episode..Nation.),
  median = median(Avg.Spending.Per.Episode..Nation.)
)
#> mean     maximum   minimum  median
#> 1820.409  20025       0      109

# The same statistics with data.table.
hospital_spending_DT[, .(
  mean = mean(Avg.Spending.Per.Episode..Nation.),
  maximum = max(Avg.Spending.Per.Episode..Nation.),
  minimum = min(Avg.Spending.Per.Episode..Nation.),
  median = median(Avg.Spending.Per.Episode..Nation.)
)]
#> mean      maximum   minimum  median
#> 1820.409  20025       0      109

當然，我們也可以對各分組的數據塊分別求概述性統計量。在dplyr中使用group_by()函數，data.table中指定by參數即可：

# Mean hospital spending per hospital; show only the first rows.
head(hospital_spending_DT[, .(mean = mean(Avg.Spending.Per.Episode..Hospital.)),
  by = .(Hospital)
])

# Mean hospital spending per (Hospital, State) pair with dplyr ...
mygroup <- hospital_spending %>% group_by(Hospital, State)
from_dplyr <- mygroup %>%
  summarize(mean = mean(Avg.Spending.Per.Episode..Hospital.))

# ... and with data.table's `by` argument.
from_data_table <- hospital_spending_DT[
  , .(mean = mean(Avg.Spending.Per.Episode..Hospital.)),
  by = .(Hospital, State)
]

compare(from_dplyr, from_data_table, allowAll = TRUE)

鏈式操作

在dplyr和data.table包中，我們可以使用鏈式操作來實現代碼的連續性。在dplyr中，使用magrittr包中的%>%管道函數非常酷。%>%的功能是用于實現將一個函數的輸出傳遞給下一個函數的第一個參數。在data.table中，我們可以使用%>%或[來實現鏈式操作。

# Grouped mean written as a pipe chain (dplyr) ...
from_dplyr <- hospital_spending %>%
  group_by(Hospital, State) %>%
  summarize(mean = mean(Avg.Spending.Per.Episode..Hospital.))

# ... and as a single bracket call (data.table).
from_data_table <- hospital_spending_DT[
  , .(mean = mean(Avg.Spending.Per.Episode..Hospital.)),
  by = .(Hospital, State)
]

compare(from_dplyr, from_data_table, allowAll = TRUE)

# Bar chart of the ten states with the highest mean hospital spending,
# built as one dplyr pipe chain.
hospital_spending %>%
  group_by(State) %>%
  summarize(mean = mean(Avg.Spending.Per.Episode..Hospital.)) %>%
  arrange(desc(mean)) %>%
  head(10) %>%
  # Fix the factor levels in descending order of the mean so the bars are
  # drawn in that order.
  mutate(
    State = factor(State, levels = State[order(mean, decreasing = TRUE)])
  ) %>%
  ggplot(aes(x = State, y = mean)) +
  geom_bar(stat = "identity", color = "darkred", fill = "skyblue") +
  labs(x = "", y = "Average", title = "Average Spending Per Episode by State") +
  # Limit the visible y range to 3800-4000.
  coord_cartesian(ylim = c(3800, 4000))

# Same chart, with the aggregation done by chained data.table brackets:
# mean per state, sorted descending, first ten rows.
hospital_spending_DT[
  , .(mean = mean(Avg.Spending.Per.Episode..Hospital.)),
  by = .(State)
][order(-mean)][1:10] %>%
  # Fix the factor levels in descending order of the mean so the bars are
  # drawn in that order.
  mutate(
    State = factor(State, levels = State[order(mean, decreasing = TRUE)])
  ) %>%
  ggplot(aes(x = State, y = mean)) +
  geom_bar(stat = "identity", color = "darkred", fill = "skyblue") +
  labs(x = "", y = "Average", title = "Average Spending Per Episode by State") +
  # Limit the visible y range to 3800-4000.
  coord_cartesian(ylim = c(3800, 4000))

©著作權歸作者所有,轉載或內容合作請聯系作者

人面猴
序言：七十年代末，一起剝皮案震驚了整個濱河市旧困，隨后出現(xiàn)的幾起案子稼锅，更是在濱河造成了極大的恐慌，老刑警劉巖拗盒，帶你破解...
沈念sama閱讀 221,888評論 6贊 515
死咒
序言：濱河連續(xù)發(fā)生了三起死亡事件锥债，死亡現(xiàn)場離奇詭異陡蝇，居然都是意外死亡赞弥，警方通過查閱死者的電腦和手機，發(fā)現(xiàn)死者居然都...
沈念sama閱讀 94,677評論 3贊 399
救了他兩次的神仙讓他今天三更去死
文/潘曉璐我一進店門悼嫉，熙熙樓的掌柜王于貴愁眉苦臉地迎上來拼窥，“玉大人蹋凝，你說我怎么就攤上這事总棵。” “怎么了迄汛？”我有些...
開封第一講書人閱讀 168,386評論 0贊 360
道士緝兇錄：失蹤的賣姜人
文/不壞的土叔我叫張陵骤视，是天一觀的道長。經(jīng)常有香客問我睹逃，道長祷肯，這世上最難降的妖魔是什么？我笑而不...
開封第一講書人閱讀 59,726評論 1贊 297
?港島之戀（遺憾婚禮）
正文為了忘掉前任翼闹，我火速辦了婚禮蒋纬，結(jié)果婚禮上，老公的妹妹穿的比我還像新娘。我一直安慰自己史汗，他們只是感情好停撞，可當(dāng)我...
茶點故事閱讀 68,729評論 6贊 397
惡毒庶女頂嫁案：這布局不是一般人想出來的
文/花漫我一把揭開白布。她就那樣靜靜地躺著艰猬，像睡著了一般埋市。火紅的嫁衣襯著肌膚如雪。梳的紋絲不亂的頭發(fā)上道宅，一...
開封第一講書人閱讀 52,337評論 1贊 310
城市分裂傳說
那天，我揣著相機與錄音樱报，去河邊找鬼。笑死民珍，一個胖子當(dāng)著我的面吹牛盗飒，可吹牛的內(nèi)容都是我干的。我是一名探鬼主播箩兽，決...
沈念sama閱讀 40,902評論 3贊 421
雙鴛鴦連環(huán)套：你想象不到人心有多黑
文/蒼蘭香墨我猛地睜開眼，長吁一口氣：“原來是場噩夢啊……” “哼身坐！你這毒婦竟也來了落包？” 一聲冷哼從身側(cè)響起，我...
開封第一講書人閱讀 39,807評論 0贊 276
萬榮殺人案實錄
序言：老撾萬榮一對情侶失蹤涯鲁，失蹤者是張志新（化名）和其女友劉穎有序，沒想到半個月后，有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體旭寿，經(jīng)...
沈念sama閱讀 46,349評論 1贊 318
?護林員之死
正文獨居荒郊野嶺守林人離奇死亡盅称，尸身上長有42處帶血的膿包…… 初始之章·張勛以下內(nèi)容為張勛視角年9月15日...
茶點故事閱讀 38,439評論 3贊 340
?白月光啟示錄
正文我和宋清朗相戀三年，在試婚紗的時候發(fā)現(xiàn)自己被綠了混狠。大學(xué)時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片疾层。...
茶點故事閱讀 40,567評論 1贊 352
活死人
序言：一個原本活蹦亂跳的男人離奇死亡，死狀恐怖俯逾，靈堂內(nèi)的尸體忽然破棺而出，到底是詐尸還是另有隱情皇筛，我是刑警寧澤坠七，帶...
沈念sama閱讀 36,242評論 5贊 350
?日本核電站爆炸內(nèi)幕
正文年R本政府宣布，位于F島的核電站拄踪，受9級特大地震影響拳魁，放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜潘懊，卻給世界環(huán)境...
茶點故事閱讀 41,933評論 3贊 334
男人毒藥：我在死后第九天來索命
文/蒙蒙一授舟、第九天我趴在偏房一處隱蔽的房頂上張望。院中可真熱鬧释树，春花似錦、人聲如沸秸仙。這莊子的主人今日做“春日...
開封第一講書人閱讀 32,420評論 0贊 24
一樁弒父案，背后竟有這般陰謀
文/蒼蘭香墨我抬頭看了看天上的太陽。三九已至抢腐，卻和暖如春，著一層夾襖步出監(jiān)牢的瞬間伤靠，已是汗流浹背。一陣腳步聲響...
開封第一講書人閱讀 33,531評論 1贊 272
情欲美人皮
我被黑心中介騙來泰國打工焕梅，沒想到剛下飛機就差點兒被人妖公主榨干…… 1. 我叫王不留卦洽，地道東北人。一個月前我還...
沈念sama閱讀 48,995評論 3贊 377
代替公主和親
正文我出身青樓该窗，卻偏偏與公主長得像蚤霞，于是被迫代替她去往敵國和親。傳聞我的和親對象是個殘疾皇子昧绣，可洞房花燭夜當(dāng)晚...
茶點故事閱讀 45,585評論 2贊 359

dplyr和data.table

推薦閱讀更多精彩內(nèi)容