使用node抓取某個頁面的所有新聞鏈接及相關文章鏈接里面的具體內容
/*
 * News crawler script.
 *
 * 1. Downloads the list page at `cnodeUrl` and collects every link matched
 *    by the `.list01 li a` selector (absolute URL + link text).
 * 2. Downloads each linked article and extracts the inner HTML of the
 *    `#Cnt-Main-Article-QQ` element.
 * 3. Inserts one row per article into the `press` table (Title, Content,
 *    Origin) of the SQL Server database described by `conUrl`.
 *
 * All article requests are started at once; eventproxy's `after` is used to
 * wait until every one of them has reported back.
 */
const eventproxy = require('eventproxy');
const request = require('superagent');
// superagent-charset patches superagent so that `.charset()` is available.
const superagent = require('superagent-charset')(request);
const cheerio = require('cheerio');
const url = require('url');
// Not used by the database flow below; only needed if the title/url list is
// dumped to a text file with fs.writeFile instead of (or in addition to) SQL.
const fs = require('fs');
const cnodeUrl = 'http://gd.qq.com/l/gdfabu/gdfbzwjj/more.htm';
const sql = require('mssql');

// Connection string; can be overridden through the MSSQL_URL environment
// variable so credentials do not have to live in the source file.
const conUrl = process.env.MSSQL_URL || "mssql://sa:123456@localhost:1433/text";

superagent.get(cnodeUrl).charset()
  .end(function (err, res) {
    if (err) {
      return console.error(err);
    }

    // Collect { href, title } for every article link on the list page.
    const topicUrls = [];
    const $ = cheerio.load(res.text, { decodeEntities: false }); // cheerio works like jQuery
    $('.list01 li a').each(function (idx, element) {
      const $element = $(element);
      const rawHref = $element.attr('href');
      if (!rawHref) {
        // An anchor without href cannot be resolved to a URL; skip it.
        return;
      }
      topicUrls.push({
        href: url.resolve(cnodeUrl, rawHref),
        title: $element.text(),
      });
    });

    const ep = new eventproxy();

    // Runs once 'topic_html' has been emitted topicUrls.length times.
    // Each emitted value is a pair: [topicUrl, html-or-null].
    ep.after('topic_html', topicUrls.length, function (topicPairs) {
      const topics = topicPairs
        // A null/undefined body marks a failed download; do not store it.
        .filter(function (topicPair) {
          return typeof topicPair[1] === 'string';
        })
        .map(function (topicPair) {
          const topicUrl = topicPair[0];
          const $article = cheerio.load(topicPair[1], { decodeEntities: false });
          return {
            title: topicUrl.title || "無",
            url: topicUrl.href,
            comment: $article('#Cnt-Main-Article-QQ').html() || "",
          };
        });

      // Insert into the database.
      sql.connect(conUrl, function (connErr) {
        if (connErr) {
          console.log(connErr);
          return;
        }

        // Count outstanding inserts so the connection can be closed once the
        // last one finishes; otherwise the process never exits.
        let pending = topics.length;
        if (pending === 0) {
          sql.close();
          return;
        }

        topics.forEach(function (topic) {
          // One Request per row, with bound parameters: scraped HTML contains
          // quotes, which would break (or inject into) a concatenated query.
          new sql.Request()
            .input('title', sql.NVarChar, topic.title)
            .input('content', sql.NVarChar(sql.MAX), topic.comment)
            .input('origin', sql.NVarChar, topic.url)
            .query(
              'INSERT INTO press (Title, Content, Origin) VALUES (@title, @content, @origin)',
              function (e) {
                if (e) {
                  console.log(e);
                } else {
                  console.log("success");
                }
                pending -= 1;
                if (pending === 0) {
                  sql.close();
                }
              }
            );
        });
      });
    });

    // Fetch every article; always emit exactly once per URL so that the
    // `after` handler above is guaranteed to fire even when a fetch fails.
    topicUrls.forEach(function (topicUrl) {
      superagent.get(topicUrl.href).charset()
        .end(function (fetchErr, topicRes) {
          if (fetchErr || !topicRes) {
            console.error(fetchErr);
            ep.emit('topic_html', [topicUrl, null]);
            return;
          }
          ep.emit('topic_html', [topicUrl, topicRes.text]);
        });
    });
  });
eventproxy這前四個模塊自己install,這個抓取方法不是最終方案,可以采用async去代替eventproxy,eventproxy是一次性抓取,容易被別人發現,async可以控制抓取次數
寫入數據庫的自己模擬個,寫入txt效果如下圖: