Python Hands-On Plan: Week 2 Study Notes

Filtering listings in MongoDB

import pymongo
from bs4 import BeautifulSoup
import requests
import time

def get_seed(url='http://bj.xiaozhu.com/', page=1):
    return url if page <= 1 else '{}search-duanzufang-p{}-0/'.format(url, page)

def parse_fangzi(url, data=None):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('h4 > em')  # listing title
    address = soup.select('p > span.pr5')  # address
    price = soup.select('#pricePart > div.day_l > span')  # price per night
    head = soup.select('#floatRightBox > div > div > a > img')  # host avatar
    name = soup.select('#floatRightBox > div > div > h6 > a')  # host nickname
    gender = soup.select('#floatRightBox > div > div.member_pic > div')  # host gender
    image = soup.select('#curBigImage')  # first photo of the listing
    # select() returns a list (never None), so guard on emptiness rather than None
    data = {
        'title': title[0].get_text() if title else None
        , 'address': address[0].get_text().strip() if address else None
        , 'price': int(price[0].get_text()) if price else None
        , 'head': head[0].get('src') if head else None
        , 'name': name[0].get_text() if name else None
        , 'gender': gender[0].get('class') if gender else None
        , 'image': image[0].get('src') if image else None
    }
    if data['gender'] is not None:
        data['gender'] = '男' if 'member_ico' in data['gender'] else '女'
    return data

def save(sheet):
    seed = 'http://bj.xiaozhu.com/'
    urls = []
    for page in range(1, 4):
        wb_data = requests.get(get_seed(url=seed, page=page))
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for div in soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname'):
            urls.append(div.get('detailurl'))

    for url in urls:
        sheet.insert_one(parse_fangzi(url))
        time.sleep(2)

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
xiaozhu = walden['xiaozhu']

# save(xiaozhu)
for item in xiaozhu.find({'price': {'$gte': 500}}):
    print(item)
[Screenshot: 2016-05-30 00:37]
  • Learned how to work with a MongoDB database from Python (a small sketch follows this list)
  • Learned Python's conditional ('ternary') expression
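
A minimal, self-contained sketch of those two points, assuming a local MongoDB on the default port; demo_rooms is a throwaway collection name, not part of the crawler above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
demo = client['walden']['demo_rooms']  # hypothetical collection, just for illustration
demo.drop()  # start clean on every run

demo.insert_one({'title': 'room A', 'price': 650})
demo.insert_one({'title': 'room B', 'price': 320})

# query operators such as $gte / $lt drive the price filter used above
for item in demo.find({'price': {'$gte': 500}}):
    print(item)

# Python's conditional ('ternary') expression: <a> if <condition> else <b>
room = demo.find_one({'title': 'room B'})
label = 'expensive' if room['price'] >= 500 else 'cheap'
print(label)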

Scraping phone-number listings

import pymongo
from bs4 import BeautifulSoup
import requests
import time

count = 0
client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
shoujihao = walden['shoujihao']

for page in range(1, 2):  # only the first page here; raise the upper bound to crawl more pages
    url = 'http://bj.58.com/shoujihao/pn{}/'.format(page)
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    infocont = soup.select('#infocont > span')
    infocont = int(infocont[0].get_text()) if len(infocont) != 0 else 0
    # bail out if the result counter is missing (no more result pages)
    if infocont == 0:
        print('inserted {} records in total, stopped at {}'.format(count, url))
        break
    phones = soup.select('ul > div.boxlist > ul > li > a.t')
    for phone in phones:
        data = {
            'href': phone.get('href')
            , 'title': phone.find('strong', 'number').get_text()
        }
        shoujihao.insert_one(data)
    count += len(phones)
    print('{} -> {} : {}/{}'.format(url, len(phones), count, infocont))
    time.sleep(2)
[Screenshot: 2016-05-30 00:43]

Designing a resumable (checkpoint/restart) crawler

channel_extract.py

from bs4 import BeautifulSoup
import requests
import re


# Extract every category under 58.com Beijing's flea market, except the phone-number channel
def get_channel_urls(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    base = re.search(r'^(https?://[^/]+).*$', url).group(1)
    urls = []
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        href = link.get('href')
        if href.startswith('/'):
            href = '{}{}'.format(base, href)
        if href not in urls and 'shoujihao' not in href:
            urls.append(href)
    return urls

counts.py

from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
_58_channels = walden['_58_channels']
_58_urls = walden['_58_urls']
_58_infos = walden['_58_infos']
while True:
    print('{}/{}/{}'.format(_58_channels.count(), _58_urls.count(), _58_infos.count()))
    time.sleep(5)

main.py

from multiprocessing import Pool
from channel_extract import get_channel_urls
from page_parsing import get_item_info
from page_parsing import get_links_from
import time

import pymongo

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
_58_channels = walden['_58_channels']
_58_urls = walden['_58_urls']


def get_all_links_from(channel):
    if _58_channels.count({'channel': channel}) > 0:
        return
    count = 0
    for page in range(1, 101):
        links = get_links_from(channel, page, 0)
        time.sleep(0.1)
        if links <= 0:
            break
        count += links
    print('{} -> {}'.format(channel, count))
    _58_channels.insert_one({'channel': channel, 'count': count})


def get_info(url):
    get_item_info(url)
    time.sleep(0.1)


if __name__ == '__main__':
    channels = get_channel_urls('http://bj.58.com/sale.shtml')
    print(len(channels))
    # first run the category-parsing stage
    pool = Pool()
    pool.map(get_all_links_from, channels)
    pool.close()
    pool.join()
    # optional: reset every flag to 0 to force a full re-crawl
    # _58_urls.update_many({}, {'$set': {'flag': 0}})
    urls = list(map(lambda url: url['url'], _58_urls.find({'flag': {'$eq': 0}})))
    pool = Pool()
    pool.map(get_info, urls)
    pool.close()
    pool.join()

page_parsing.py

from bs4 import BeautifulSoup
import requests
import time
import pymongo
import sys

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
_58_urls = walden['_58_urls']
_58_infos = walden['_58_infos']


def get_links_from(channel, page, who_sells=0):
    # http://bj.58.com/iphonesj/0/pn2/
    list_url = '{}{}/pn{}'.format(channel, who_sells, page)
    wb_data = requests.get(list_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # #infocont > span > b
    infocont = soup.select('#infocont > span > b')
    infocont = int(infocont[0].get_text()) if len(infocont) > 0 else 0
    print('{} -> {}'.format(list_url, infocont))
    if infocont <= 0:
        return 0
    links = soup.select('table > tr > td > a.t')
    for link in links:
        if not link.has_attr('onclick') and not link.has_attr('data-addtype'):
            item_link = link.get('href').split('?')[0]
            data = {
                'url': item_link
                , 'flag': 0
            }
            if _58_urls.find({'url': {'$eq': data['url']}}).count() == 0:
                _58_urls.insert_one(data)
    return len(links)


def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # broken listings load a script whose src contains '404'; skip those pages
    res = list(filter(lambda a: a.get('src') is not None and '404' in a.get('src'), soup.find_all('script')))
    if len(res) > 0:
        return
    title = soup.select('div.col_sub.mainTitle > h1')
    price = soup.select('span.price.c_f50')
    date = soup.select('li.time')
    area = soup.select('li > div.su_con > span.c_25d')
    # drop the '借錢買' (installment-loan promo) element that the same selector matches
    area = list(filter(lambda a: '借錢買' not in a.get_text(), area))
    try:
        data = {
            'url': url
            , 'title': title[0].get_text().strip() if len(title) > 0 else None
            , 'price': price[0].get_text().strip().strip('元').strip() if len(price) > 0 else None
            , 'date': date[0].get_text().strip() if len(date) > 0 else None
            , 'area': ''.join(area[0].stripped_strings) if len(area) > 0 else None
        }
    except:
        print('{} -> exception'.format(url))
        print(sys.exc_info())
    else:
        print('{} -> {}'.format(url, data))
        record = _58_infos.find_one({'url': {'$eq': url}})
        if record is None:
            _58_infos.insert_one(data)
        else:
            _58_infos.replace_one({'_id': record['_id']}, data)
        _58_urls.update_one({'url': url}, {'$inc': {'flag': 1}})

# get_links_from('http://bj.58.com/iphonesj/', 1)
# get_item_info('http://bj.58.com/zixingche/26131404258880x.shtml')
[Screenshot: 2016-05-30 01:09]
  • Core idea of the resumable crawl: while parsing each channel, store every discovered URL in MongoDB with a flag field set to 0; after a detail page is parsed, increment that URL's flag ($inc), so each run only needs to fetch the URLs whose flag is still 0.
  • Also noticed that MongoDB raises a warning when the multiprocessing pool starts (apparently because a MongoClient opened before the fork is shared across child processes); a minimal fix is sketched after the warning text below:
UserWarning: MongoClient opened before fork. Create MongoClient with connect=False, or create client after forking. See PyMongo's documentation for details: http://api.mongodb.org/python/current/faq.html#using-pymongo-with-multiprocessing
  "MongoClient opened before fork. Create MongoClient "

Scraping 100,000 product records

channel_extract.py: channel (category) extraction

from bs4 import BeautifulSoup
import requests
import re


# Collect all second-level categories under Ganji Beijing's used-goods section (bj.ganji.com/wu/)
def get_channel_urls(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    base = re.search(r'^(https?://[^/]+).*$', url).group(1)
    urls = []
    links = soup.select('dl.fenlei dt a')
    for link in links:
        href = link.get('href')
        if href.startswith('/'):
            href = '{}{}'.format(base, href)
        # dedupe (zhuanzhuan links are filtered later, in get_links_from)
        if href not in urls:
            urls.append(href)
    return urls

# #wrapper > div.content > div:nth-child(1) > div:nth-child(1) > dl > dt > a:nth-child(1)
# channels = get_channel_urls('http://bj.ganji.com/wu/')
# print('{}\n{}'.format(len(channels),'\n'.join(channels)))

counts.py: a small monitoring script

import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
_ganji_channels = walden['_ganji_channels']
_ganji_urls = walden['_ganji_urls']
_ganji_infos = walden['_ganji_infos']

while True:
    print('{}/{}/{}'.format(_ganji_channels.count(), _ganji_urls.count(), _ganji_infos.count()))
    time.sleep(5)

main.py: the main program

from multiprocessing import Pool
from channel_extract import get_channel_urls
from page_parsing import get_item_info
from page_parsing import get_links_from

import pymongo

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
_ganji_channels = walden['_ganji_channels']
_ganji_urls = walden['_ganji_urls']


def get_all_links_from(channel):
    if _ganji_channels.count({'channel': channel}) > 0:
        return
    count = 0
    for page in range(1, 201):
        links = get_links_from(channel, page)
        if links == 0:  # no pagination block: past the last page of this channel
            break
        if links < 0:   # request failed (e.g. bad proxy); try the next page
            continue
        count += links
    print('{} -> {}'.format(channel, count))
    _ganji_channels.insert_one({'channel': channel, 'count': count})


def get_info(url):
    get_item_info(url)


# _ganji_urls.drop()
# optional: reset every flag to 0 to force a full re-crawl
# _ganji_urls.update_many({}, {'$set': {'flag': 0}})
if __name__ == '__main__':
    _ganji_channels.drop()
    channels = get_channel_urls('http://bj.ganji.com/wu/')
    print(len(channels))
    # first run the category-parsing stage
    pool = Pool()
    pool.map(get_all_links_from, channels)
    pool.close()
    pool.join()
    urls = list(map(lambda url: url['url'], _ganji_urls.find({'flag': {'$eq': 0}})))
    pool = Pool()
    pool.map(get_info, urls)
    pool.close()
    pool.join()

page_parsing.py: parsing module for the listing pages and the item detail pages

from bs4 import BeautifulSoup
import requests
import pymongo
import sys
import random

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
_ganji_urls = walden['_ganji_urls']
_ganji_infos = walden['_ganji_infos']

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
    'Connection': 'keep-alive'
}
# proxy list source: http://cn-proxy.com/
proxy_list = [
    '101.96.11.47:80'
    , '101.96.11.43:80'
    , '101.96.11.42:80'
    , '101.96.11.44:80'
    , '112.5.220.199:80'
    , '111.13.109.56:8080'
]
# pick one proxy IP at random (chosen once per process, at import time)
proxy_ip = random.choice(proxy_list)
proxies = {'http': proxy_ip}


def get_links_from(channel, page, who_sells=1):
    # http://bj.ganji.com/jiaju/a1o1/
    list_url = '{}a{}o{}/'.format(channel, who_sells, page)
    wb_data = requests.get(list_url, headers=headers, proxies=proxies)
    if wb_data.status_code != 200:  # return -1 if the page does not exist (or the request failed)
        return -1
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if len(soup.select('ul.pageLink')) == 0:  # no pagination block at the bottom: past the last page
        print('{} -> end of channel'.format(list_url))
        return 0
    links = soup.select('dl.list-bigpic > dt > a')
    for link in links:
        data = {
            'url': link.get('href')
            , 'flag': 0
        }
        # skip zhuanzhuan.com listings
        if 'zhuanzhuan' not in data['url'] and _ganji_urls.find({'url': {'$eq': data['url']}}).count() == 0:
            _ganji_urls.insert_one(data)
    return len(links)


def get_item_info(url):
    wb_data = requests.get(url, headers=headers, proxies=proxies)
    # wb_data = requests.get(url, headers=headers)
    if wb_data.status_code != 200:  # skip pages that no longer exist
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('h1.title-name')
    date = soup.select('i.pr-5')
    types = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')
    price = soup.select('ul.det-infor > li > i.f22')
    areas = soup.select('ul.det-infor > li:nth-of-type(3) > a')
    newer = soup.select('ul.second-det-infor > li:nth-of-type(1)')
    try:
        data = {
            'url': url
            , 'title': title[0].get_text().strip()
            , 'date': date[0].get_text().strip().strip('發(fā)布').strip() if len(date) > 0 else None
            , 'type': [t.text.strip() for t in types]
            , 'price': price[0].get_text().strip() if len(price) > 0 else None
            , 'area': [area.text.strip() for area in areas if area.text.strip() != "-"]
            , 'newer': '{}{}'.format(newer[0].find('label').get_text(),
                                     newer[0].contents[1].strip()) if len(newer) > 0 else None
        }
    except:
        print(
            'exception: {} -> title={},date={},types={},price={},areas={},newer={}'.format(url, title, date, types,
                                                                                            price, areas, newer))
        print(sys.exc_info())
    else:
        print('{} -> {}'.format(url, data))
        record = _ganji_infos.find_one({'url': {'$eq': url}})
        if record is None:
            _ganji_infos.insert_one(data)
        else:
            _ganji_infos.replace_one({'_id': record['_id']}, data)
        _ganji_urls.update_one({'url': url}, {'$inc': {'flag': 1}})

# print(get_links_from('http://bj.ganji.com/ershoubijibendiannao/', 1))
# get_item_info('http://bj.ganji.com/bangong/2136884202x.htm')

[Screenshot: 2016-05-30 01:17]
  • Scraped just over 50,000 product records across 20 categories in total
  • Noticed that some pages reachable directly from my machine could not be fetched through the proxies (a fallback sketch follows this list)
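
One workaround is to try a proxy first and fall back to a direct request when it fails. A minimal sketch, not part of the crawler above; fetch_page is a made-up helper and the timeout/retry values are arbitrary:

import random
import requests

proxy_list = ['101.96.11.47:80', '112.5.220.199:80']  # same style of pool as above

def fetch_page(url, headers=None, retries=2):
    # try up to `retries` random proxies, then fall back to a direct request
    for _ in range(retries):
        proxy_ip = random.choice(proxy_list)
        try:
            resp = requests.get(url, headers=headers,
                                proxies={'http': proxy_ip}, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            continue
    return requests.get(url, headers=headers, timeout=10)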