1. Download Data
# Create the local "data" directory on first run. The stray "?" tokens that
# used to precede dir.create() were parsed as R's help operator, so the
# directory was never actually created.
if (!dir.exists("data")) {
  dir.create("data")
}

# Download the Baltimore camera data set as CSV into ./data.
# method = "curl" relies on a curl binary being available on the system.
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/cameras.csv", method = "curl")

# List the contents of ./data to confirm the file arrived.
list.files("./data")
2. Reading Local File (.csv)
# Parse the downloaded comma-separated file; the first row holds column names.
cameraData <- read.table(
  file = "./data/cameras.csv",
  header = TRUE,
  sep = ","
)

# Show the first rows of the imported data.
head(cameraData)
3. Reading Excel File (.xlsx)
# Attach the xlsx package, which supplies read.xlsx().
library(xlsx)

# Import the first worksheet of the workbook, using its first row as headers.
cameraData <- read.xlsx(
  file = "./data/cameras.xlsx",
  sheetIndex = 1,
  header = TRUE
)
head(cameraData)

## Reading specific rows and columns
# Limit the import to columns 2-3 and rows 1-4 of the first worksheet.
colIndex <- c(2L, 3L)
rowIndex <- seq_len(4)
cameraDataSubset <- read.xlsx(
  file = "./data/cameras.xlsx",
  sheetIndex = 1,
  colIndex = colIndex,
  rowIndex = rowIndex
)
cameraDataSubset
3. Reading XML and HTML
# Attach the XML package (xmlTreeParse, xmlRoot, xmlName, xmlSApply, ...).
library(XML)

# Parse the remote XML document into an internal (C-level) node tree and grab
# its root node. The argument is spelled out in full (useInternalNodes) rather
# than relying on partial argument matching.
fileUrl <- "http://www.w3schools.com/xml/simple.xml"
doc <- xmlTreeParse(fileUrl, useInternalNodes = TRUE)
rootNode <- xmlRoot(doc)

# Explore the tree. The stray "?" tokens that used to follow these calls were
# parsed as binary operators and chained the lines into one broken expression.
xmlName(rootNode)              # name of the root element
names(rootNode)                # names of the root's child elements
rootNode[[1]]                  # first child of the root
rootNode[[1]][[1]]             # first element inside that first child
xmlSApply(rootNode, xmlValue)  # xmlValue applied to each child of the root
# XPath:
#   /node                   Top level node
#   //node                  Node at any level
#   node[@attr-name]        Node with an attribute name
#   node[@attr-name='bob']  Node with attribute name attr-name='bob'
# Information from: http://www.stat.berkeley.edu/~statcur/Workshop2/Presentations/XML.pdf
# Extract the text of every <name> and <price> node at any depth in the tree.
# The selector must be the XPath expression "//name"; the previous
# "http://name" was a URL-mangled copy of it and is not valid XPath for this
# purpose.
xpathSApply(rootNode, "//name", xmlValue)
xpathSApply(rootNode, "//price", xmlValue)
# Parse an HTML page and pull values out of it with XPath.
# The statements were previously fused onto a single line (a syntax error) and
# the XPath selectors had their leading "//" corrupted into "http://".
fileUrl <- "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"
doc <- htmlTreeParse(fileUrl, useInternalNodes = TRUE)

# Text of every <li> whose class attribute is "score" / "team-name".
scores <- xpathSApply(doc, "//li[@class='score']", xmlValue)
teams <- xpathSApply(doc, "//li[@class='team-name']", xmlValue)
scores
4. Reading JSON
# Attach jsonlite, which supplies fromJSON() and toJSON().
library(jsonlite)

# Fetch and parse the JSON returned by the GitHub API.
# The original used "<" (a comparison against an undefined jsonData) instead
# of the assignment operator "<-".
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
names(jsonData)
jsonData$name

# "owner" is a nested object; inspect its fields and pull one of them out.
names(jsonData$owner)
jsonData$owner$login

# Writing data frames to JSON
myjson <- toJSON(iris, pretty = TRUE)
cat(myjson)

# Convert back from JSON to a data frame
iris2 <- fromJSON(myjson)
head(iris2)
5. Data Table
# Attach data.table.
library(data.table)

# Build a plain data frame and a data.table with the same layout for comparison.
DF <- data.frame(x = rnorm(9), y = rep(c("a", "b", "c"), each = 3), z = rnorm(9))
head(DF, 3)

# These two statements were previously fused on one line (a syntax error).
DT <- data.table(x = rnorm(9), y = rep(c("a", "b", "c"), each = 3), z = rnorm(9))
head(DT, 3)

# See all data tables in memory
tables()

# Subsetting rows
DT[2, ]
DT[DT$y == "a", ]  # rows where y equals "a"
DT[c(2, 3)]        # a single index subsets rows: this selects rows 2 and 3

# Calculating values for variables with expressions
DT[, list(mean(x), sum(z))]  # returns two values: the mean of x and the sum of z

# Adding new columns
DT[, w := z^2]

# Multiple operations in one expression; tmp is an intermediate variable
DT[, m := {
  tmp <- (x + z)
  log2(tmp + 5)
}]

# plyr-like operations
DT[, a := x > 0]               # add a logical (TRUE/FALSE) column
DT[, b := mean(x + w), by = a] # grouped computation using the by clause
# Special Variable
# .N: an integer, length 1, containing the number of elements of a factor level
# Fix the RNG seed so the sampled letters are reproducible.
set.seed(123)

# One column of 100,000 letters drawn with replacement from "a", "b", "c".
DT <- data.table(
  x = sample(letters[1:3], size = 1e5, replace = TRUE)
)

# Count the rows in each group of x.
DT[, .N, by = x]
# Keys (important)
# Build a 300-row table, then set x as its key.
DT <- data.table(x = rep(c("a", "b", "c"), each = 100), y = rnorm(300))
setkey(DT, x)

# With the key set on x, a character index subsets on the key: rows where
# x equals "a". (A stray trailing "?" previously chained this line into the
# next statement.)
DT["a"]
# fread: fast file reading
# Build a one-million-row data frame to use as a benchmark input.
big_df <- data.frame(x = rnorm(1e6), y = rnorm(1e6))

# Write it to a temporary tab-separated file. The tempfile() and write.table()
# calls were previously fused on one line, which is a syntax error.
file <- tempfile()
write.table(
  big_df,
  file = file,
  row.names = FALSE,
  col.names = TRUE,
  sep = "\t",
  quote = FALSE
)

# Time how long data.table's fread() takes to read the file back.
system.time(fread(file))