Today is April 13.
Yesterday I finished the conference paper, well enough to call the task done, and submitted it. I still haven't found an internship, so for the next while I'll probably spend whole days in the lab learning Python, especially since it has been pouring rain for more than a week and I can't go anywhere (as if I would actually go out and wander around even if it weren't raining). I had thought about asking Professor Song whether there is a project I could join, but I was worried it would hold back my Python study, so for the next half month I'll just finish this course first. Also, my laptop really can't handle PyCharm, so I'm waiting for the notebook from home to be shipped over; and I have to mention I'm also waiting for news about the submitted paper. Wish there is a good result.
As usual, before pasting the code, here is a summary of what I newly learned in practice and the problems I ran into.
(1) The shortcut Ctrl+/ comments out multiple lines at once; after selecting lines, Tab indents the whole selection and Shift+Tab outdents it.
(2) Note that select('') and split('') both return lists, so you have to index the result with [number] afterwards (see the first sketch after this list).
(3) X.stripped_strings strips the spaces and blank lines from the text inside tag X; note that it returns a generator, so wrap it with list().
(4) When there are several possible cases, it is best to branch with if statements. To test whether a particular string s1 is contained in another string s2, use if s1 in s2.
(5) Pay attention to whether the data you want is embedded in the page itself or comes back as JSON from a separate request; JSON responses are usually dict-like. For JS-loaded data such as view counts, first find the relevant request under Network - JS in the browser's developer tools, then parse it.
The parsing goes like this: pull out the id of the page you are querying, substitute it with format() into the corresponding JS endpoint to build a new URL; then request it with requests.get() just like an ordinary page; finally, since the JS response body is a plain string, take js.text and cut out what you want with split() or a similar method.
One more thing to watch out for: when requesting JS data, remember to add headers including 'Referer' and 'User-Agent' (see the second sketch below).
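A minimal sketch of points (2) to (4), using a tiny made-up HTML snippet rather than a real 58.com page:

from bs4 import BeautifulSoup

html = '<div class="su_con"><span> 朝陽(yáng) - 望京 </span></div>'
soup = BeautifulSoup(html, 'lxml')

# select() returns a list, so index it with [0] before reading the tag
span = soup.select('div.su_con > span')[0]

# stripped_strings is a generator; wrap it in list() to get the cleaned pieces
district = list(span.stripped_strings)

# split() also returns a list and is indexed the same way
url = 'http://bj.58.com/shouji/25683386143296x.shtml?psid=123'
clean_url = url.split('?')[0]

# membership test with `in` to branch between cases
if 'bj.58.com' in clean_url:
    print(clean_url, district)

And a sketch of point (5). The counter endpoint and the infoid parameter below are placeholders I made up to show the shape of the request, not the real 58.com API; the actual URL has to be copied from the Network - JS panel:

import requests

# hypothetical JS counter endpoint (placeholder; copy the real one from Network - JS)
api = 'http://jst1.58.com/counter?infoid={}'

def get_views(detail_url):
    # the numeric id sits in the last path segment, e.g. .../25683386143296x.shtml
    info_id = detail_url.split('/')[-1].split('x.shtml')[0]
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Referer': detail_url,  # some counters refuse requests without a Referer
    }
    js = requests.get(api.format(info_id), headers=headers)
    # the response body is a plain string, so slice out the number with split()
    return js.text.split('total:')[-1].strip(' })')

# print(get_views('http://bj.58.com/shouji/25683386143296x.shtml'))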
Part 1
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
__author__ = 'guohuaiqi'
from bs4 import BeautifulSoup
import requests

url = 'http://bj.58.com/sale.shtml'
host = 'http://bj.58.com'

# Collect the links of all goods categories and save them
def get_cate_link(url):
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    allurl = soup.select('#ymenu-side > ul > li > ul > li > b > a')
    for item in allurl:
        cate_link = host + item.get('href')
        # print(cate_link)

# get_cate_link(url)
cate_list="""
http://bj.58.com/shouji/
http://bj.58.com/tongxunyw/
http://bj.58.com/danche/
http://bj.58.com/fzixingche/
http://bj.58.com/diandongche/
http://bj.58.com/sanlunche/
http://bj.58.com/peijianzhuangbei/
http://bj.58.com/diannao/
http://bj.58.com/bijiben/
http://bj.58.com/pbdn/
http://bj.58.com/diannaopeijian/
http://bj.58.com/zhoubianshebei/
http://bj.58.com/shuma/
http://bj.58.com/shumaxiangji/
http://bj.58.com/mpsanmpsi/
http://bj.58.com/youxiji/
http://bj.58.com/jiadian/
http://bj.58.com/dianshiji/
http://bj.58.com/ershoukongtiao/
http://bj.58.com/xiyiji/
http://bj.58.com/bingxiang/
http://bj.58.com/binggui/
http://bj.58.com/chuang/
http://bj.58.com/ershoujiaju/
http://bj.58.com/bangongshebei/
http://bj.58.com/diannaohaocai/
http://bj.58.com/bangongjiaju/
http://bj.58.com/ershoushebei/
http://bj.58.com/yingyou/
http://bj.58.com/yingeryongpin/
http://bj.58.com/muyingweiyang/
http://bj.58.com/muyingtongchuang/
http://bj.58.com/yunfuyongpin/
http://bj.58.com/fushi/
http://bj.58.com/nanzhuang/
http://bj.58.com/fsxiemao/
http://bj.58.com/xiangbao/
http://bj.58.com/meirong/
http://bj.58.com/yishu/
http://bj.58.com/shufahuihua/
http://bj.58.com/zhubaoshipin/
http://bj.58.com/yuqi/
http://bj.58.com/tushu/
http://bj.58.com/tushubook/
http://bj.58.com/wenti/
http://bj.58.com/yundongfushi/
http://bj.58.com/jianshenqixie/
http://bj.58.com/huju/
http://bj.58.com/qiulei/
http://bj.58.com/yueqi/
http://bj.58.com/tiaozao/
"""
Part 2
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
__author__ = 'guohuaiqi'
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
tongcheng = client['tongcheng']
urllist = tongcheng['urllist']
content = tongcheng['content']

# Crawl all item links under one category and save them; cate_url comes from cate_list
def get_content_links(cate_url, page):
    # e.g. http://bj.58.com/danche/pn2/ -- the paged URL has to be built here,
    # otherwise the incoming category link only covers its first page
    page_list = '{}pn{}/'.format(cate_url, str(page))
    web_data = requests.get(page_list)
    soup = BeautifulSoup(web_data.text, 'lxml')
    time.sleep(1)
    if soup.find('td', 't'):
        allurl = soup.select('td.t a.t')
        for url1 in allurl:
            content_link = url1.get('href').split('?')[0]
            if 'bj.58.com' not in content_link:
                pass
            else:
                urllist.insert_one({'url': content_link})
                # print(content_link)
                get_item_content(content_link)
    else:
        pass

# cate_url='http://bj.58.com/youxiji/'
# get_content_links(cate_url,20)

# Crawl the detail page of each item: title, date, price, district
def get_item_content(content_link):
    # Only keep listings hosted on 58 itself; anything from the featured ("精品") or Zhuanzhuan ("轉(zhuǎn)轉(zhuǎn)") feeds is dropped upstream
    try:
        web_data1 = requests.get(content_link)
        soup = BeautifulSoup(web_data1.text, 'lxml')
        # deleted listings redirect to a page whose script src contains "404"
        page_not_exist = '404' in soup.find('script', type='text/javascript').get('src').split('/')
        if page_not_exist:
            pass
        else:
            if '區(qū)域' in soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_tit')[0].get_text():
                if soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span'):
                    district = list(soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')[0].stripped_strings)
                else:
                    district = list(soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con')[0].stripped_strings)
            elif '區(qū)域' in soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_tit')[0].get_text():
                if soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span'):
                    district = list(soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span')[0].stripped_strings)
                else:
                    district = list(soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con')[0].stripped_strings)
            else:
                district = None
            data = {
                'url': content_link,  # keep the source url so the resume step in Part 3 can diff urllist against content
                'goods_cate': soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')[0].text.strip(),
                'title': soup.select('#content h1')[0].text.strip(),
                'date': soup.select('#content li.time')[0].text.replace('.', '-'),
                'price': soup.select('span.price.c_f50')[0].text.replace('元', '').strip() if '面議' not in soup.select('span.price.c_f50')[0].text else None,
                'district': district
            }
            content.insert_one(data)
            # print(data)
    except requests.ConnectionError as e:
        print(e.response)

# b=['http://bj.58.com/shuma/23190415633187x.shtml','http://bj.58.com/yishu/25471342844357x.shtml','http://bj.58.com/shouji/25683386143296x.shtml','http://bj.58.com/shuma/23425779899550x.shtml']
# get_item_content(b)
# get_content_links('http://bj.58.com/shouji/',20)
Part 3
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
__author__ = 'guohuaiqi'
from multiprocessing import Pool
from get_cate_link import cate_list
from get_all_contents import get_content_links, urllist, content

# Resume-from-breakpoint mechanism: after an interruption, swap rest_list in for the
# category list passed to pool.map() (see the note after this snippet)
db_urllist = [item['url'] for item in urllist.find()]
content_urllist = [item['url'] for item in content.find()]
x = set(db_urllist)
y = set(content_urllist)
rest_list = x - y

def get_all_links(cate_url):
    for page in range(1, 101):
        get_content_links(cate_url, page)

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links, cate_list.split())
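For the resume step itself, the comment above says to swap rest_list into the pool.map() call. Since rest_list holds detail-page URLs rather than category URLs, the worker would have to be get_item_content instead of get_all_links. A minimal sketch of my reading of that comment (assuming get_item_content is importable from get_all_contents and rest_list has been built as above; this is not code from the original run):

# hypothetical resume run after an interruption: feed the not-yet-parsed detail URLs
# straight to get_item_content instead of re-walking every category page
from get_all_contents import get_item_content

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_item_content, list(rest_list))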
Part 4
Finally, add a small counting script that keeps printing the number of items in the database.
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
__author__ = 'guohuaiqi'
import time
from get_all_contents1 import content

while True:
    print(content.find().count())
    time.sleep(3)
One more thing to note: always, always put these two lines at the very top of the file before writing any code:
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
I stopped the program manually after it had crawled 10,745 records; it took roughly 12 minutes in total.