css selector手冊:https://www.runoob.com/cssref/css-selectors.html
xpath selector手冊:https://www.runoob.com/xpath/xpath-tutorial.html
xpath查找節(jié)點:https://www.cnblogs.com/txwen/p/7999485.html
學(xué)習(xí)視頻:https://www.bilibili.com/video/av30320885 https://www.bilibili.com/video/av36907341?from=search&seid=12650656636924845868 https://www.bilibili.com/video/av39807071?p=7
參考文檔:https://zhuanlan.zhihu.com/p/22916652
關(guān)于GET和POST的有趣的解釋:https://zhuanlan.zhihu.com/p/22536382
RCurl解析:https://blog.csdn.net/kMD8d5R/article/details/78933384
html文件及http基本知識:https://www.w3school.com.cn/tags/html_ref_byfunc.asp
post/get格式化工具:http://coolaf.com
如何查看post的參數(shù):https://jingyan.baidu.com/article/d45ad1487f057669552b8030.html
一個爬蟲初學(xué)者友好的教程:http://www.reibang.com/p/0c0cb9867b44
多個網(wǎng)站的抓取實操:https://blog.csdn.net/hill_night/article/details/45789655?locationNum=12&fps=1
抓取財經(jīng)網(wǎng)股票信息實操:http://blog.sina.com.cn/s/blog_685d10480102wyn9.html
利用postform模擬登錄抓取新浪微博信息:http://www.dataguru.cn/article-873-1.html
一個通過模擬登錄抓取教務(wù)處信息的案例:https://blog.csdn.net/kMD8d5R/article/details/78737442
模擬登錄需要輸入用戶名密碼的網(wǎng)頁:https://www.zhihu.com/question/65799576
模擬登錄可能需要的cookie的解釋:http://www.reibang.com/p/6fc9cea6daa2
通過替換cookie來模擬登錄:https://www.cnblogs.com/huahuayu/p/8207037.html
通過get方式提交request時如果是漢字會自動變成url編碼,有關(guān)這類編碼的介紹:https://www.cnblogs.com/niuyaomin/p/11788732.html
url編碼的解析器:http://web.chacuo.net/charsetbase64
不錯的爬蟲實例:https://ask.hellobi.com/blog/R_shequ/33920
rvest模擬瀏覽行為:https://blog.csdn.net/weixu22/article/details/79237512
https://blog.csdn.net/Joyliness/article/details/78722317
高級爬蟲教程:http://www.reibang.com/p/1fc6a6817160
rvest模擬點擊網(wǎng)頁:https://www.jb51.cc/html/224799.html
# Scrape product thumbnail images from a Tmall search-results page and save
# them as numbered .jpg files in the working directory.
library(rvest)
url_eye <- "https://list.tmall.com/search_product.htm?q=%D1%DB%BD%DE%C3%AB&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton"
url_eye
html_session(url_eye)
# The page is served in GBK, not UTF-8.
web_eye <- read_html(url_eye, encoding = "GBK")
# Product thumbnails sit inside div.productImg-wrap under the result grid.
# NOTE: the class value "view grid-nosku " ends with a space on purpose —
# the exact-match predicate needs the literal attribute value.
pic <- html_nodes(web_eye, xpath = '//div[@class="view grid-nosku "]//div[@class="productImg-wrap"]//img')
pic
# Lazily-loaded images keep their real URL in "data-ks-lazyload" rather than
# "src"; entries without that attribute come back as NA and are dropped.
pic_dir <- html_attr(pic, "data-ks-lazyload")
pic_dir <- pic_dir[!is.na(pic_dir)]
# The attribute values are protocol-relative ("//img.alicdn.com/...").
pic_dir <- paste0("http:", pic_dir)
# seq_along() handles any result count (the original hard-coded 1:55, which
# produced NA URLs when fewer images were found); mode = "wb" is required
# for binary files on Windows, otherwise the JPEGs are corrupted.
for (i in seq_along(pic_dir)) {
  download.file(pic_dir[i], paste0(i, ".jpg"), mode = "wb")
}
# Scrape sample accessions and titles from a GEO series page (GSE72056).
# Notes: 1) always compare your XPath against the HTML you actually
#    downloaded — what the browser renders can differ from the raw response;
# 2) if an XPath yields NULL, try a trivial path such as '/div' to verify
#    that the parsed document is valid at all.
library(XML)
library(RCurl)
library(rvest)
url <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056"
# Send a desktop-browser User-Agent so the server returns the full page.
myheader <- c("User-Agent" = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36")
myheader
temp <- getURL(url, httpheader = myheader, .encoding = "utf-8")
temp
temp <- htmlParse(temp)
temp
# The sample table is identified by its inline style attribute; each <tr>
# holds one accession/title pair.
nodes <- getNodeSet(temp, path = '//table[@style="position:relative;top:-5px;left:-5px"]/tr')
nodes
# vapply() pins the return type to character, unlike sapply(), whose result
# type silently changes on empty input.
value <- vapply(nodes, xmlValue, character(1))
value
# Scrape shop images from dianping.com with RCurl + XML.
# Notes: 1) Taobao/Tmall require a login before they can be scraped (or it
#    fails for some other reason); 2) if you suspect the downloaded HTML is
#    broken, save it to a file and inspect it in a browser.
library(RCurl)
library(XML)
url <- "http://www.dianping.com"
myheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)
temp <- getURL(url, httpheader = myheader, .encoding = "utf-8")
(temp)
# writeLines() saves the raw document verbatim; write.table() would wrap it
# in quotes and add row/column names, producing an invalid HTML file.
writeLines(temp, "temp.html")
temp <- htmlParse(temp)
temp <- getNodeSet(temp, '//div[@class="shop-item"]//img')
temp
# Extract all attributes of each node ...
temp <- lapply(temp, xmlAttrs)
# ... then keep only the "lazy-src" attribute (nodes without it yield
# character(0), so sapply may return a list here).
temp <- sapply(temp, function(x) { x = x[names(x) == "lazy-src"]; return(x) })
temp
# Same extraction as above, but with rvest: parse the page, select every
# image inside a div.shop-item container, and read its src attribute.
library(rvest)
url <- "http://www.dianping.com"
page <- read_html(url, encoding = "utf-8")
img_nodes <- html_nodes(page, xpath = '//div[@class="shop-item"]//img')
node <- html_attr(img_nodes, name = "src")
node
# Compare rvest against RCurl+XML on the same page (the Baidu homepage),
# contrasting the intermediate object classes and how each stack handles
# non-ASCII attribute values.
url <- "http://www.baidu.com"
# rvest
web <- read_html(url, encoding = "utf-8")
web
# class: html_document
# NOTE: the original XPath had been mangled to "http://img" (a blog
# auto-linker turned the leading "//" into a URL); "//img" is the valid form.
temp1 <- html_nodes(web, xpath = "//img")
temp1
#[1] "xml_nodeset"
attrs <- html_attrs(temp1)
# No garbled characters with rvest.
#rcurl+xml
library(RCurl)
library(XML)
html_session(url)
web <- getURL(url = url, .encoding = "utf-8")
web
# class: character
temp <- htmlParse(web)
class(temp)
#"HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" "XMLAbstractDocument"
#temp<-htmlTreeParse("./1688.html",encoding = "UTF-8")
#temp
#"XMLDocumentContent"
# Same fix as above: "//img", not "http://img".
temp <- getNodeSet(temp, "//img")
temp
#[1] "XMLNodeSet"
attrs2 <- lapply(temp, xmlAttrs)
# RCurl+XML leaves the attribute values garbled ...
iconv(attrs2[[4]], from = "UTF-8")
# ... and iconv() repairs the encoding.
# Parse a locally saved HTML file with both stacks.
# Note: Taobao/Tmall/JD currently cannot be scraped either from the URL or
# from a locally saved copy.
#rvest
web <- read_html("./dazhongdianping.html", encoding = "utf-8")
web
# "//img" — the original had the XPath mangled to "http://img", which is not
# a valid node test and matches nothing.
nodes <- html_nodes(web, xpath = "//img")
nodes
#RCurl+XML
web1 <- htmlParse("./dazhongdianping.html", encoding = "utf-8")
web1
nodes1 <- getNodeSet(web1, "//img")
nodes1
# IMPORTANT: per the getNodeSet()/html_nodes() documentation, only XPath 1.0
# selectors are supported, and an exact-match predicate like
# [@class="xxxxxx xxxxx"] fails when the attribute value contains spaces.
# Use //div[contains(@class, '...')] instead — and mind the single quotes!
url <- "https://sh.lianjia.com/ershoufang/"
web2 <- read_html(url, encoding = "utf-8")
# contains() matches one token of the space-separated class list; single
# quotes inside the XPath avoid clashing with the double-quoted R string.
# (The original line had the leading "//" mangled into "http://".)
web2 <- html_nodes(web2, xpath = "//ul[@class='sellListContent']/li[contains(@class,'clear')]/a")
web2
# Scrape succeeds with the form above.
# Using //ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]/a
# fails, because the exact-match predicate cannot handle the
# space-separated multi-class attribute.