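# Scrape Douban ids and IMDb ids
# Given a movie title, director and cast information, scrape the Douban id and the IMDb id.
# The input must be strictly cleaned: no spaces are allowed, and the director and cast strings must be at most 6 characters long.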
#### Measure the running time of the script
# Capture the start time and print it in one step.
print(timestart <- Sys.time())
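#### Put the code you want to time below
# Setting the request headers is very important: a crawler must disguise itself, and the for loop must pause between iterations.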
library(xlsx)
library(readxl)
library(plyr)
library(sqldf)
library(data.table)
library(RCurl)
library(XML)
library(stringr)
# Browser-like HTTP request headers sent with every request so the crawler
# looks like an ordinary browser.
myheader <- c(
  `User-Agent` = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  `Accept-Language` = "en-us",
  Connection = "keep-alive",
  `Accept-Charset` = "GB2312,utf-8;q=0.7,*;q=0.7"
)
######## Edit-distance based string similarity #############
# Similarity of two strings derived from their Levenshtein edit distance.
#
# Args:
#   x, y: Single strings (anything as.character() can coerce; only the first
#         element of each is used).
#
# Returns:
#   1 - distance / (nchar(x) + nchar(y)), rounded to two decimals.
#   Identical strings score 1; two same-length strings with no character in
#   common score 0.5; an empty string against a non-empty one scores 0.
#   Two empty strings score 1 (they are identical) instead of the NaN that
#   0 / 0 would produce. NA_real_ is returned when either input is missing.
Fun <- function(x, y) {
  x <- as.character(x)[1]
  y <- as.character(y)[1]
  if (is.na(x) || is.na(y)) {
    return(NA_real_)
  }

  total_length <- nchar(x, type = "chars") + nchar(y, type = "chars")
  if (total_length == 0) {
    return(1)
  }

  # utils::adist() computes the unit-cost Levenshtein distance and copes with
  # empty strings, which a hand-written `2:(m + 1)` loop does not (the range
  # counts down when m is 0).
  distance <- utils::adist(x, y)[1, 1]
  round(1 - distance / total_length, 2)
}
#url <- "https://movie.douban.com/"
#text=c("碟中諜","獅子王","魔戒3","星際穿越","火星救援","碟中諜2","職業特工隊2","諜影重重2","碟中諜5")
#text="哈利波特與魔法石"
#text="少年派的奇幻漂流"
#text="哈利波特與死亡圣器(下)"
#text="手機"
#text <- t(c("加勒比海盜1:黑珍珠號的詛咒","馮小剛"," 張國立葛優范"))
#text <- t(c("哈利波特與死亡圣器(下)" , "大飛", "廖智苗皓鈞"))
# Input parameters
#text <- y[1:100,c(1,2,3)]
#i=2
# Sampling test
#text <- text[sample(2901,200,replace = F),]
################## Test ###########################
#text <- as.data.frame(t(z[1,]),stringsAsFactors = F)
################### Match classes ##############
# Every input row ends up in exactly one class:
#   A: exactly one search result matches the title
#   B: several results match, but exactly one of them is among the first five
#   C: the search returned a single result whose title does not match, but
#      the alternative names on its detail page contain the title
#   D: several results match (and not exactly one in the first five);
#      director/cast similarity selected one of them
#   E: nothing matched (ids are NA)
#   F: same situation as D, but no candidate scored above 0.5 (ids are "0")
#
# Input: `text` must already exist as a data frame or matrix whose first
# three columns are title, director and cast. It is not defined in this
# script. Output: the vectors url_douban_id, imdb_id, class and the data
# frame `x` that combines them with the (regex-escaped) titles.

# Without this check a missing `text` silently resolves to graphics::text()
# and fails later with an obscure "closure is not subsettable" error.
if (is.function(text) || length(dim(text)) != 2 || ncol(text) < 3) {
  stop(
    "`text` must be a data frame or matrix with title, director and cast columns",
    call. = FALSE
  )
}

# XPath expressions. They must start with "//"; an "http:" prefix in front of
# them is not valid XPath.
result_link_xpath <- "//div[@class='pl2']//a"
info_xpath <- "//div[@id='info']"
director_xpath <- "//div[@id='info']//span"
actor_xpath <- "//div[@id='info']//span[@class='actor']"
imdb_link_xpath <-
  "//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]"

# Download a page with the disguised headers and parse it as HTML.
fetch_doc <- function(page_url, header = myheader) {
  html <- getURL(
    page_url,
    .encoding = "utf-8",
    followlocation = TRUE,
    httpheader = header
  )
  htmlParse(html, asText = TRUE, encoding = "UTF-8")
}

# IMDb id shown on a Douban detail page, or "000" when the page has none.
extract_imdb_id <- function(detail_doc) {
  ids <- xpathSApply(detail_doc, imdb_link_xpath, xmlValue)
  if (length(ids) == 0) "000" else as.character(ids[[1]])
}

# The Douban id is the fifth "/"-separated piece of a subject URL.
douban_id_from_url <- function(subject_url) {
  str_split(subject_url, pattern = "/")[[1]][5]
}

# Strip the field label, spaces and slashes from a people field and keep at
# most six characters; `fallback` is returned when the field is missing.
clean_people <- function(values, label_pattern, fallback) {
  if (length(values) == 0 || is.null(values[[1]]) || is.na(values[[1]])) {
    return(fallback)
  }
  value <- str_replace_all(values[[1]], pattern = label_pattern, replacement = "")
  value <- str_replace_all(value, pattern = " ", replacement = "")
  value <- str_replace_all(value, pattern = "/", replacement = "")
  if (str_length(value) > 6) {
    value <- substring(value, 1, 6)
  }
  value
}

# The search URLs are built from the raw titles, before regex escaping.
url <- paste0(
  "https://movie.douban.com/subject_search?search_text=",
  text[, 1],
  "&cat=1002"
)
url_douban <- NULL
url_douban_id <- rep(NA_character_, length(url))
imdb_id <- rep(NA_character_, length(url))
class <- rep(NA_character_, length(url))

# Escape every ASCII parenthesis in the titles so they can be embedded in a
# regular expression.
text[, 1] <- str_replace_all(
  str_replace_all(text[, 1], pattern = "\\(", replacement = "\\\\("),
  pattern = "\\)",
  replacement = "\\\\)"
)

for (i in seq_along(url)) {
  # Start from class E (no match); the ids are already NA.
  class[i] <- "E"

  doc <- fetch_doc(url[i])
  # Titles and links come from the same node set so that their indices align.
  title <- as.character(unlist(xpathSApply(doc, result_link_xpath, xmlValue)))
  title <- str_replace_all(title, pattern = "·", replacement = "")
  hrefs <- xpathSApply(doc, result_link_xpath, xmlGetAttr, "href")

  name_pattern <- paste0(
    " ", text[i, 1], " {0,2}(\\(.{2,3}\\))?(:.{0,8})?", "[\n | /]"
  )
  pipei <- str_detect(title, name_pattern)
  matched <- which(pipei)
  top_five_hits <- sum(head(pipei, 5), na.rm = TRUE)

  if (length(matched) == 1 || (length(matched) > 1 && top_five_hits == 1)) {
    # A: unique match. B: several matches but only one in the first five.
    # In both cases the first matching result is taken.
    url_douban <- hrefs[[matched[1]]]
    doc1 <- fetch_doc(url_douban)
    url_douban_id[i] <- douban_id_from_url(url_douban)
    imdb_id[i] <- extract_imdb_id(doc1)
    class[i] <- if (length(matched) == 1) "A" else "B"
  } else if (length(matched) > 1) {
    ######### Second-stage matching on director and cast ###########
    url_pipei <- hrefs[matched]
    imdb_id_temp <- character(length(url_pipei))
    xishu <- numeric(length(url_pipei))
    for (n in seq_along(url_pipei)) {
      doc1 <- fetch_doc(url_pipei[[n]])
      # Douban serves simplified Chinese ("导演"); the traditional form is
      # accepted as well.
      daoyan <- clean_people(
        xpathSApply(doc1, path = director_xpath, xmlValue),
        label_pattern = "[导導]演:",
        fallback = " "
      )
      zhuyan <- clean_people(
        xpathSApply(doc1, path = actor_xpath, xmlValue),
        label_pattern = "主演:",
        fallback = "abcdef"
      )
      imdb_id_temp[n] <- extract_imdb_id(doc1)
      # Weighted score: director similarity 0.6, cast similarity 0.4.
      xishu[n] <- 0.6 * Fun(text[i, 2], daoyan) + 0.4 * Fun(text[i, 3], zhuyan)
    }
    # which.max() skips NA scores and returns integer(0) if all are NA.
    subscript <- which.max(xishu)
    if (length(subscript) == 1 && xishu[subscript] > 0.5) {
      url_douban_id[i] <- douban_id_from_url(url_pipei[[subscript]])
      imdb_id[i] <- imdb_id_temp[subscript]
      class[i] <- "D"
    } else {
      url_douban_id[i] <- "0"
      imdb_id[i] <- "0"
      class[i] <- "F"
    }
  } else if (length(pipei) == 1) {
    # Exactly one search result and its title did not match: look for the
    # title among the alternative names on the detail page.
    url_douban <- hrefs[[1]]
    doc1 <- fetch_doc(url_douban)
    info_text <- xpathSApply(doc1, path = info_xpath, xmlValue)
    text_another_name <- NA_character_
    if (length(info_text) != 0) {
      text_another_name <- str_extract(
        info_text[[1]],
        pattern = "又名:.*IMDb[链鏈]接"
      )
    }
    if (is.na(text_another_name)) {
      # Placeholder used when the page has no alternative-names section.
      text_another_name <- "aaaaaa"
    }
    if (str_detect(text_another_name, text[i, 1])) {
      url_douban_id[i] <- douban_id_from_url(url_douban)
      imdb_id[i] <- extract_imdb_id(doc1)
      class[i] <- "C"
    }
  }

  # Rest for roughly two to three seconds after every search.
  Sys.sleep(2 + runif(1, 0, 1))
}

# Collect the results in a data frame.
x <- data.frame(text[,1], url_douban_id, imdb_id, class)
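# If the first five search results contain several identical matches, id = 0 is returned; consider narrowing the result range further.
# If the search returns exactly one result whose title does not match, take its link and open the detail page.
# Simplify the movie titles taken from the search list; note that transliterated foreign names need a separator, e.g. Harry Potter (哈利·波特), Percy Jackson (珀西·杰克遜).
# The second field of the title does not seem to be matched yet; this needs to be fixed.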
# Capture the end time and print it.
print(timeend <- Sys.time())
# Elapsed time since `timestart`, printed as a difftime.
runningtime <- timeend - timestart
print(runningtime)