My results
Run output
My code
from bs4 import BeautifulSoup
import requests, random, json

# Rotating proxies; cribbed from an existing write-up: https://mugglecoding.gitbooks.io/qa/content/ru_he_huo_qu_dai_li_ip.html
resp = requests.get("http://tor1024.com/static/proxy_pool.txt")
ips_txt = resp.text.strip().split("\n")
ips = []
for i in ips_txt:
    try:
        # each line should be a JSON proxy mapping, e.g. {"http": "http://ip:port"}
        k = json.loads(i)
        ips.append(k)
    except Exception as e:
        print(e)

# 58.com search-result pages
urls = ['http://bj.58.com/pbdn/0/pn{}/'.format(i) for i in range(1, 4)]
# note: the header key must be 'User-Agent', not 'headers'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
# Scrape the details of one second-hand item from its Zhuanzhuan page
def zhuanzhuan_data(url):
    # list for collecting the item's tags (item vs. tags: one-to-many)
    total_tags = []
    try:
        web_data = requests.get(url, headers=headers, proxies=random.choice(ips), timeout=6)
        soup = BeautifulSoup(web_data.text, 'lxml')
        category = soup.select('div[class="breadCrumb f12"] > '
                               'span:nth-of-type(4) > a')[0].get_text().lstrip()
        # 'info_titile' and 'palce_li' are the page's own (misspelled) class names
        subject = soup.select('h1[class="info_titile"]')[0].get_text()
        page_view = soup.select('span[class="look_time"]')[0].get_text()
        price = soup.select('span[class="price_now"] > i')[0].get_text()
        region = soup.select('div[class="palce_li"] > span > i')[0].get_text()
        tags = soup.select('div[class="biaoqian_li"] > span')
        # the tags have to be collected one by one
        for tag in tags:
            total_tags.append(tag.get_text())
        # gather all the fields into one dict
        data = {
            'category': category,
            'title': subject,
            'view': page_view,
            'price': price,
            'region': region,
            'tags': total_tags
        }
        # print the dict, then hand it to the write-to-txt function
        print(data)
        create_txt(data)
    except Exception as e:
        print(e)
def create_txt(data):
    # append mode ('a') adds each record to the end instead of truncating
    with open(r'c:/users/administrator/desktop/zz.txt', 'a') as f:
        f.write(str(data) + '\n' + '-' * 70 + '\n')
# Collect the Zhuanzhuan item links from one 58.com search-result page
def get_zz(url):
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    zz_urls = soup.select('tr.zzinfo > td.img > a')
    for index, zz_url in enumerate(zz_urls, 1):
        print('Zhuanzhuan item no.', index)
        zhuanzhuan_data(zz_url.get('href'))
# ===== Entry point =====
for index, url in enumerate(urls, 1):
    print('##### Page', index, '#####')
    print('-' * 60)
    get_zz(url)
    print('-' * 60)
My thoughts:
- The assignment is done, but it felt like a grind, and I'm left with a pile of doubts instead of a sense of clarity; maybe the frustration from the hard parts just hasn't worn off yet.
- This assignment took a very long time:
  I actually had most of the scraping code roughly written by noon (about 40 minutes), but after my lunch break I could not scrape the full 3 pages / 150 records no matter what, because it kept throwing errors:
  - (error screenshots)
  From a bare requests.get() with no arguments at all, to trying headers, then proxies... that took a long time, including some zoned-out self-doubt along the way. At least the proxy-rotation code is something I can just copy from now on (a small retry sketch follows at the end of this post).
  All told it must have taken about four and a half hours (terrifying).
- When writing to txt, opening with 'w+' only ever kept one record; the problem went away once I switched to 'a' (see the sketch just below).
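Why 'w+' behaved that way, as a minimal sketch (the filename here is just a placeholder): every open() in 'w'/'w+' mode truncates the file, so a function that reopens the file once per record only ever keeps the last one, while 'a' appends:

# 'w'/'w+' truncate the file on every open; 'a' appends to the end
def write_record(record, mode):
    with open('zz_demo.txt', mode) as f:
        f.write(str(record) + '\n')

for n in range(3):
    write_record({'item': n}, 'w+')  # file ends up holding only {'item': 2}
for n in range(3):
    write_record({'item': n}, 'a')   # all three records survive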
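And the retry idea mentioned above, as a minimal sketch: get_with_retry is a hypothetical helper (not part of my script) that reuses the ips list and headers defined in the code above and tries a different random proxy on each failed attempt:

def get_with_retry(url, tries=3):
    # hypothetical helper: try up to `tries` random proxies before giving up
    for attempt in range(tries):
        try:
            return requests.get(url, headers=headers,
                                proxies=random.choice(ips), timeout=6)
        except requests.RequestException as e:
            print('attempt', attempt + 1, 'failed:', e)
    return None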