不嘮閑嗑，直接來干的。需要從京東搜索關鍵詞，然后將商品列表導出。看了一下市面上的解決方案：一個是解析接口返回的內容，一個是通過 selenium 抓取頁面。兩套方案試了一下都不能滿足需求：前者頻繁請求會觸發風控，后者解析頁面需要人工干預。想了一下，決定嘗試用 chrome 插件的方式來獲取想要的內容。
先大概整理一下思路:
設置關鍵詞 ---> 觸發搜索 ---> 滾動頁面 ---> 解析內容 ---> 數據過濾 ---> 處理分頁
根據頁面元素確定輸入框和點擊按鈕的標識，將搜索功能搞定：
/**
 * Trigger a search from the page's search bar: focus the keyword input,
 * fill in the keyword, then click the form's submit button.
 *
 * @param {string} keyword - text to type into the search box
 */
let search = (keyword) => {
    const $searchBar = $("#search-2014");
    const $keywordInput = $searchBar.find("#key");

    $keywordInput.focus();
    $keywordInput.val(keyword);
    $searchBar.find(".form").find("button").click();
};
拿到結果后將頁面內容進行解析。頁面內容解析有兩段：一個是列表內容提取，一個是每條商品信息提取。由于京東的商品列表不會一次全部展示，需要不斷地滾動頁面才會進行加載，所以需要做一下處理：
/**
 * Scroll the window down in fixed steps so that lazily loaded list items get
 * a chance to render, then parse the page.
 *
 * Relies on `sleep` and `parse_content`, which are defined outside this block.
 *
 * @returns {Promise<void>} settles once `parse_content()` has finished, so a
 *   caller that awaits this function can safely continue afterwards.
 */
let scrollToBottom = async () => {
    const scroll_count = 15;
    // Fixed step per scroll; an alternative would be page height / scroll_count.
    const offset = 400;
    // Passed straight to sleep(); its unit is whatever sleep() expects.
    const timeOut = 2;

    for (let i = 0; i < scroll_count; i++) {
        await sleep(timeOut);
        window.scrollBy(0, offset);
        console.log(`第${i}次滾動(dòng)`);
    }
    console.log("滾動(dòng)完成");

    // Await instead of fire-and-forget so parse errors propagate to the caller.
    await parse_content();
};
/**
 * Parse the goods list on the current result page and send the matching
 * records to the backend.
 *
 * Reads the keyword from the search box, runs `parse_item` on every child of
 * `#J_goodsList ul`, keeps the records whose `goods_name` contains the
 * keyword, and hands them to `post_data` in one call. `sleep`, `parse_item`
 * and `post_data` are defined outside this block.
 *
 * @returns {Promise<void>}
 */
let parse_content = async () => {
    const keyword = $("#search-2014").find("#key").val();
    const items = $("#J_goodsList ul").children();
    await sleep(10);

    const list = [];
    $.each(items, (inx, it) => {
        const data = parse_item(it);
        if (data.goods_name.includes(keyword)) {
            // Collect the record; previously nothing was ever added to `list`,
            // so post_data() below could never be reached.
            list.push(data);
        }
    });

    if (list.length > 0) {
        // await is harmless if post_data is synchronous and keeps the upload
        // from floating if it returns a promise.
        await post_data(list);
    }
};
/**
 * Extract one product record from a single goods-list item.
 *
 * @param {*} it - the list item (anything `$()` accepts, e.g. a DOM element)
 * @returns {{is_ad: boolean, pic_url: (string|undefined), price: string,
 *   deal_num: number, goods_name: string, goods_detail_url: string,
 *   goods_id: *, shop_name: string, shop_link: string, location: string,
 *   keyword: *}} the extracted record; `goods_detail_url` / `shop_link` are
 *   empty strings when the corresponding link is absent.
 */
let parse_item = (it) => {
    // Prefix scheme-less hrefs with "https:". A missing href yields "" rather
    // than the literal "https:undefined"; an href that already carries an
    // http(s) scheme is returned untouched.
    const to_https_url = (href) => {
        if (!href) {
            return "";
        }
        return /^https?:/i.test(href) ? href : `https:${href}`;
    };

    const $item = $(it);
    const $pic = $item.find(".p-img").find("img");
    // Fall back to the lazy-load attribute when `src` is not set.
    const pic_url = $pic.attr("src") || $pic.attr("data-lazy-img");
    const price = $item.find(".p-price").text().trim().replace("¥", "");
    // Strip the marker texts from the title, then re-trim.
    const goods_name = $item.find(".p-name").text().trim()
        .replace("拍拍", "")
        .replace("廣告詞", "")
        .trim();

    return {
        is_ad: false,
        pic_url,
        price,
        deal_num: 0,
        goods_name,
        goods_detail_url: to_https_url($item.find(".p-name a").attr("href")),
        goods_id: $item.data("sku"),
        shop_name: $item.find(".p-shopnum").text().trim(),
        shop_link: to_https_url($item.find(".p-shopnum a").attr("href")),
        location: "",
        keyword: $("#search-2014").find("#key").val(),
    };
};
由于返回的商品信息中不一定會存在我們想要的關鍵字，所以還需要對數據進行一次處理，例如判斷一下標題中是否存在關鍵字等。這里我只是簡單處理一下：
// Keep only records whose title contains the search keyword.
if (data.goods_name.includes(keyword)) {
    // TODO: send the collected record to the backend
}
最后就是處理分頁問題了。由于京東的分頁參數加密比較簡單，就是采用的 2*n-1 這種模式，n 為下一頁的頁數（對應的 s 參數為 (n-1)*60+1）：
/**
 * Advance the crawl to the next result page, or clean up when it is done.
 *
 * Keeps a run counter in localStorage under "page". When the optional outer
 * `limit` is exceeded, or the pager shows no further page, the counter is
 * removed and nothing else happens. Otherwise the next page's URL is built
 * (`page = 2*n - 1`, `s = (n-1)*60 + 1`, n being the next page number) and
 * assigned to `location.href`.
 *
 * `sleep` and `limit` are not defined in this block; they are expected from
 * the enclosing scope.
 *
 * @returns {Promise<void>}
 */
let parse_page = async () => {
    await sleep(10);

    // A missing or unparsable stored value counts as 0, so the first run is 1.
    const page = (Number.parseInt(localStorage.getItem("page") || "0", 10) || 0) + 1;
    console.log(`第${page}次執(zhí)行`);
    localStorage.setItem("page", page);

    // TODO: decide how many leading pages to crawl.
    // Tolerate `limit` not being declared at all (treated as "no limit").
    const page_limit = typeof limit === "undefined" ? 0 : limit;
    if (page_limit && page > page_limit) {
        localStorage.removeItem("page");
        return;
    }

    const $pager = $("#J_bottomPage");
    const current_page = Number.parseInt($pager.find(".curr").text().trim(), 10);
    // The first run of digits in the skip box is taken as the total page count.
    const total_match = $pager.find(".p-skip").text().trim().match(/(\d+)/);
    const total_page = total_match ? Number.parseInt(total_match[1], 10) : NaN;

    // Last page reached, or the pager could not be read (NaN compares false):
    // the crawl is over, drop the counter.
    if (!(current_page < total_page)) {
        localStorage.removeItem("page");
        return;
    }

    let nextId = Number.parseInt($pager.find(".curr").next().text().trim(), 10);
    if (Number.isNaN(nextId)) {
        // Sibling of the current entry is not a number; fall back to the
        // arithmetic successor instead of emitting "page=NaN".
        nextId = current_page + 1;
    }
    const next_page_param = 2 * nextId - 1;
    const next_start = (nextId - 1) * 60 + 1;

    let url = location.href;
    console.log("獲取下一頁(yè)", url);

    if (/&s=\d+/.test(url)) {
        // Replace the parameters themselves. Replacing the bare digit string
        // could hit the same digits earlier in the URL (e.g. in the keyword).
        url = url
            .replace(/&s=\d+/, `&s=${next_start}`)
            .replace(/&page=\d+/, `&page=${next_page_param}`);
        await sleep(30);
        console.log("下一頁(yè)地址", url);
    } else {
        url = `${url}&page=${next_page_param}&s=${next_start}&click=0`;
        await sleep(30);
    }
    location.href = url;
};
由于只是分析測試，所以代碼寫得比較粗糙，大家將就著看吧，后續有時間再整理成完整的工程代碼。