有了之前的爬蟲(chóng)知識(shí),相對(duì)比較簡(jiǎn)單。爬取趕集網(wǎng)二手物品信息。
1、先得到主目錄鏈接
from bs4 import BeautifulSoup
#import requests
import urllib2
#import re
start_url="http://sh.ganji.com/wu/"
def get_channel_list(url):
    """Print the URL of every second-hand listing channel on the Ganji index page.

    url -- the index page, e.g. "http://sh.ganji.com/wu/"
    """
    # NOTE(review): the author found requests.get() returned only part of this
    # page, so urllib2 is used instead -- cause unconfirmed.
    web_data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_data, 'lxml')
    # One <a> per category inside the index page's <dl>/<dt> menu.
    # select() returns a list, so iterate before calling get().
    contents = soup.select('#wrapper > div.content > div > div > dl > dt > a')
    for content in contents:
        # href looks like "/jiaju/..." -- the first path segment is the channel slug.
        name = content.get('href').split('/')[1]
        channel_url = 'http://sh.ganji.com/{}/'.format(name)
        print(channel_url)


get_channel_list(start_url)
2、再得到不同類(lèi)的全部鏈接和爬取內(nèi)容
from bs4 import BeautifulSoup
#import requests
import urllib2
#import re
import time
import pymongo
# MongoDB connection to the default local instance.
client=pymongo.MongoClient('localhost',27017)
# Database "ceshi" holds two collections used by the functions below:
ceshi=client['ceshi']
# url_list: item-page URLs harvested from the listing pages
url_list=ceshi['url_list']
# info: scraped item details (name / price / district)
info=ceshi['info']
#t_url="http://zhuanzhuan.ganji.com/detail/788638496047104004z.shtml?from=pc&source=ganji&cate=&cateurl="
def get_url_list(channel, page):
    """Scrape one listing page of *channel* and store every item URL in MongoDB.

    channel -- channel base URL, e.g. "http://sh.ganji.com/jiaju/"
    page    -- 1-based page number; page 1 is the channel URL itself, later
               pages append "o<page>/" (Ganji's paging scheme).
    """
    if page == 1:
        page_url = channel
    else:
        page_url = '{}o{}/'.format(channel, str(page))
    web_data = urllib2.urlopen(page_url).read()
    time.sleep(2)  # throttle so we do not hammer the site
    soup = BeautifulSoup(web_data, 'lxml')
    # Pages past the last real one contain no <td class="t"> cells -- skip them.
    if soup.find('td', 't'):
        links = soup.select('#infolist > div.infocon > table > tbody > tr > td.t > a')
        for link in links:
            # Drop the tracking query string, keep the canonical item URL.
            item_url = link.get('href').split('?')[0]
            # BUG FIX: original used insert_one[...] (square brackets),
            # which raises TypeError instead of inserting.
            url_list.insert_one({'url': item_url})
            print(item_url)
    else:
        pass
def get_info(url):
    """Scrape one item detail page and store name/price/district in MongoDB."""
    web_data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_data, 'lxml')
    name = soup.select('div.box_left_top > h1')[0].text
    price = soup.select('div.price_li > span > i')[0].text
    # BUG FIX: the original stored the raw bs4 Tag list, which pymongo cannot
    # serialise; extract each element's text instead.
    # NOTE(review): the site's markup spells the class "palce_li" -- kept as-is.
    district = [d.text for d in soup.select('div.palce_li > span > i')]
    record = {'name': name, 'price': price, 'district': district}
    # BUG FIX: original used insert_one[...] / print[...] with square
    # brackets; insert_one[...] raises TypeError.
    info.insert_one(record)
    print(record)
3、多進(jìn)程進(jìn)行爬取
from multiprocessing import Pool
from channel_list import channel_list
from get_data_from_url import get_url_list
def get_all_date(channel):
    """Scrape listing pages 1..100 of one channel (the site caps listings at 100 pages)."""
    for num in range(1, 101):
        get_url_list(channel, num)


if __name__ == "__main__":
    # Pool() with no argument sizes itself to the machine's CPU count.
    pool = Pool()
    # map() hands each channel URL to get_all_date in a worker process.
    # NOTE(review): assumes channel_list is a whitespace-separated string of
    # channel URLs -- confirm against the channel_list module.
    pool.map(get_all_date, channel_list.split())
4、計(jì)數(shù)
import time
from get_data_from_url import url_list
# Progress monitor: report every 3 seconds how many item URLs have been
# collected so far by the scraping processes.
while True:
    print(url_list.find().count())
    time.sleep(3)
學(xué)習(xí)總結(jié):
1、創(chuàng)建數(shù)據(jù)庫(kù),數(shù)據(jù)庫(kù)中插入數(shù)據(jù);
2、requests只能讀取大部分代碼,而urllib2可以讀全,不知道為什麼;
3、多進(jìn)程進(jìn)行爬取數(shù)據(jù)。
4、擴(kuò)展庫(kù)安裝pip install pymongo。
2017年第1周