50行代碼抓取一個網站
很多年前寫的代碼了，只是記錄一下
主要使用nodejs + cheerio
cheerio可以采用jquery方式解析dom結構，網上很多不做轉述
如果只是json數據的話，可以不用cheerio
實現數據抓取，數據篩選，頻率控制，斷點續傳
npm install cheerio
npm install superagent-prefix
npm install superagent-charset
npm install line-reader
npm install log4js
貼代碼:
// Module setup for the crawler: HTTP client plugins, file access, logging and
// the crawl settings shared by the functions below.

// superagent-prefix plugin configured with the prefix '/static'; it is passed
// to `.use()` on each request.
var prefix = require('superagent-prefix')('/static');
var request = require('superagent-charset');
// NOTE(review): cheerio is loaded but not referenced in the visible code.
var cheerio = require('cheerio');
var fs = require('fs');
var lineReader = require('line-reader');
// Logging setup.
var log4js = require('log4js');
log4js.configure("./log4js.json"); // NOTE: the log directory named in the config must be created beforehand, otherwise loading the config throws
var logger = log4js.getLogger('log');
var startUrl = 'http://XXXXXXX.com/search/?q=27age&iy431rzm&per_page=20&wfl=1&page=';// Base URL of the listing; the page number is appended to it
var fileName = 'result.txt';// File that the extracted records are appended to
var num=1;// Current page number; overwritten at startup with the value read from num.txt
/**
 * Pulls the JSON text assigned to `app.page["pins"]` out of a page body.
 *
 * The page is expected to contain `app.page["pins"] = <json>;` followed later
 * by an assignment to `app.page["page"]`.
 * NOTE(review): the start marker was restored from a corrupted literal using
 * the original `+19` offset (18 marker characters plus one space) — confirm
 * against a real page.
 *
 * @param {*} text - Response body; anything other than a string yields null.
 * @returns {?string} The JSON text without surrounding whitespace or the
 *     trailing semicolon, or null when either marker is missing.
 */
function extractPinsJson(text) {
    if (typeof text !== 'string') {
        return null;
    }
    var startMarker = 'app.page["pins"] =';
    var endMarker = 'app.page["page"]';
    var start = text.indexOf(startMarker);
    if (start === -1) {
        return null;
    }
    start += startMarker.length;
    var end = text.indexOf(endMarker, start);
    if (end === -1) {
        return null;
    }
    // Drop leading whitespace and the statement terminator before the next assignment.
    return text.substring(start, end).replace(/^\s+/, '').replace(/[\s;]+$/, '');
}

/**
 * Fetches one listing page, appends every matching record to `fileName`,
 * stores the next page number in num.txt and schedules the next page.
 *
 * The crawl stops (nothing further is scheduled) when the request fails, when
 * the embedded pin list cannot be found or parsed, or when the list is empty.
 * Because num.txt is only advanced after a page was processed, a restart
 * retries the page that failed.
 *
 * @param {string} url - Full URL of the page to fetch.
 * @returns {void}
 */
function doUrl(url) {
    request.get(url)
        .use(prefix) // Prefixes *only* this request
        .charset('utf-8')
        .end(function (err, res) {
            if (err) {
                // There may be no response at all here, so do not touch res.text.
                console.log(err);
                return;
            }

            // Read the JSON data embedded in the page.
            var pins = null;
            var json = extractPinsJson(res && res.text);
            if (json !== null) {
                try {
                    pins = JSON.parse(json);
                } catch (parseErr) {
                    pins = null; // reported just below
                }
            }
            if (!Array.isArray(pins)) {
                logger.error('pins data missing or unparsable, url = ' + url);
                return;
            }
            if (pins.length === 0) {
                logger.info('沒有更多數據, url = ' + url);
                return;
            }

            // Keep only records coming from the wanted source.
            for (var i = 0; i < pins.length; i++) {
                var pin = pins[i];
                if (pin != null && pin.source === 'xxxx') {
                    appendToFile(fileName, pin.pin_id + ',' + pin.link + ',http://xxxx.com/pins/' + pin.pin_id + ',' + pin.repin_count + ',' + pin.like_count + ',' + pin.comment_count + '\r\n');
                }
            }

            num++;
            var nextPage = num; // captured so the async callback reports the right page
            var nextUrl = startUrl + nextPage; // link to the next page
            // Persist the page to resume from. fs.writeFile needs string data and a callback.
            fs.writeFile('num.txt', String(nextPage), function (writeErr) {
                if (writeErr) {
                    logger.error('failed to write num.txt for page ' + nextPage + ': ' + writeErr);
                }
            });
            logger.info('do next, ulr = ' + nextUrl);
            // Throttle the crawl; requesting too fast easily gets the client blocked.
            setTimeout(function () { doUrl(nextUrl); }, 600);
        });
}
/**
 * Appends one line of text to a file.
 *
 * The write is asynchronous. A failure is rethrown from the completion
 * callback, so it surfaces as an uncaught exception rather than being ignored.
 *
 * @param {string} fileName - Path of the file to append to.
 * @param {string} line - Text to append; the caller supplies the line ending.
 * @returns {void}
 */
function appendToFile(fileName, line) {
    fs.appendFile(fileName, line, function (err) {
        if (err) {
            throw err;
        }
    });
}
// Program entry: resume from the page number stored on the first line of
// num.txt, or start from the default `num` when that file is missing, empty
// or does not hold a positive integer.
(function () {
    var started = false;

    lineReader.eachLine('num.txt', function (line, last) {
        var savedPage = parseInt(line, 10);
        if (savedPage >= 1) { // false for NaN, so a corrupt file keeps the default
            num = savedPage;
        }
        started = true;
        doUrl(startUrl + num); // start crawling
        return false; // only the first line is needed
    }, function (err) {
        // Completion callback: the iteratee never ran if the file could not be
        // opened or had no lines, so start the crawl here instead.
        if (started) {
            return;
        }
        if (err) {
            logger.info('num.txt not readable, starting from page ' + num + ': ' + err);
        }
        started = true;
        doUrl(startUrl + num);
    });
}());