程序的主要功能就是獲取一個(gè)天氣網(wǎng)站的數(shù)據(jù)然后存儲(chǔ)在一個(gè)cvs文件
環(huán)境為2.7 需要用到的庫
import requests
import csv
import random
from bs4 import BeautifulSoup
requests 網(wǎng)絡(luò)請求需要自行安裝
csv python自帶的操作文件的庫
random 隨機(jī)數(shù) 模擬真實(shí)請求的timeout
Beaautifulsoup 代替正則表達(dá)式的神器 幫助我們更好獲取html中需要的內(nèi)容
主要業(yè)務(wù)分為三步:
1.獲取當(dāng)前網(wǎng)頁內(nèi)容
2.解析網(wǎng)頁獲取目標(biāo)數(shù)據(jù)內(nèi)容
3.寫入csv文件中
代碼如下
#獲取網(wǎng)頁內(nèi)容
def get_content(url, data = None):
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
}
timeout = random.choice(range(80, 180))
rep = requests.get(url, headers=header, timeout=timeout)
rep.encoding = 'utf-8'
return rep.text
#解析網(wǎng)頁 返回目標(biāo)數(shù)據(jù)
def get_data(html_text):
final = []
bs = BeautifulSoup(html_text, "html.parser") # 創(chuàng)建BeautifulSoup對象
body = bs.body # 獲取body部分
data = body.find('div', {'id': '7d'}) # 找到id為7d的div
ul = data.find('ul') # 獲取ul部分
li = ul.find_all('li') # 獲取所有的li
for day in li: # 對每個(gè)li標(biāo)簽中的內(nèi)容進(jìn)行遍歷
temp = []
date = day.find('h1').string # 找到日期
temp.append(date) # 添加到temp中
inf = day.find_all('p') # 找到li中的所有p標(biāo)簽
temp.append(inf[0].string,) # 第一個(gè)p標(biāo)簽中的內(nèi)容(天氣狀況)加到temp中
if inf[1].find('span') is None:
temperature_highest = None # 天氣預(yù)報(bào)可能沒有當(dāng)天的最高氣溫(到了傍晚梳庆,就是這樣)兔魂,需要加個(gè)判斷語句,來輸出最低氣溫
else:
temperature_highest = inf[1].find('span').string # 找到最高溫
temperature_highest = temperature_highest.replace('', '') # 到了晚上網(wǎng)站會(huì)變烘嘱,最高溫度后面也有個(gè)℃
temperature_lowest = inf[1].find('i').string # 找到最低溫
temperature_lowest = temperature_lowest.replace('', '') # 最低溫度后面有個(gè)℃,去掉這個(gè)符號
temp.append(temperature_highest) # 將最高溫添加到temp中
temp.append(temperature_lowest) #將最低溫添加到temp中
final.append(temp) #將temp加到final中
return final
#寫入文件
def write_data(data, name):
file_name = name
with open(file_name, 'w+r') as f:
myCsv = UnicodeWriter(f)
myCsv.writerow(f)
myCsv.writerows([[u'日期', u'天氣', u'最高溫度', u'最低溫度']])
myCsv.writerows(data)
涉及到unicode字符內(nèi)容導(dǎo)致數(shù)據(jù)寫不進(jìn)去,我查閱資料發(fā)現(xiàn)官方給出了一個(gè)解決辦法-----重寫csv類
import csv, codecs, cStringIO
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
最后的代碼
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
import random
from bs4 import BeautifulSoup
import csv, codecs, cStringIO
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
def write_data(data, name):
file_name = name
with open(file_name, 'w+r') as f:
myCsv = UnicodeWriter(f)
myCsv.writerow(f)
myCsv.writerows([[u'日期', u'天氣', u'最高溫度', u'最低溫度']])
myCsv.writerows(data)
def get_content(url, data = None):
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
}
timeout = random.choice(range(80, 180))
rep = requests.get(url, headers=header, timeout=timeout)
rep.encoding = 'utf-8'
return rep.text
def get_data(html_text):
final = []
bs = BeautifulSoup(html_text, "html.parser") # 創(chuàng)建BeautifulSoup對象
body = bs.body # 獲取body部分
data = body.find('div', {'id': '7d'}) # 找到id為7d的div
ul = data.find('ul') # 獲取ul部分
li = ul.find_all('li') # 獲取所有的li
for day in li: # 對每個(gè)li標(biāo)簽中的內(nèi)容進(jìn)行遍歷
temp = []
date = day.find('h1').string # 找到日期
temp.append(date) # 添加到temp中
inf = day.find_all('p') # 找到li中的所有p標(biāo)簽
temp.append(inf[0].string,) # 第一個(gè)p標(biāo)簽中的內(nèi)容(天氣狀況)加到temp中
if inf[1].find('span') is None:
temperature_highest = None # 天氣預(yù)報(bào)可能沒有當(dāng)天的最高氣溫(到了傍晚饥脑,就是這樣)阿蝶,需要加個(gè)判斷語句,來輸出最低氣溫
else:
temperature_highest = inf[1].find('span').string # 找到最高溫
temperature_highest = temperature_highest.replace('', '') # 到了晚上網(wǎng)站會(huì)變,最高溫度后面也有個(gè)℃
temperature_lowest = inf[1].find('i').string # 找到最低溫
temperature_lowest = temperature_lowest.replace('', '') # 最低溫度后面有個(gè)℃柬甥,去掉這個(gè)符號
temp.append(temperature_highest) # 將最高溫添加到temp中
temp.append(temperature_lowest) #將最低溫添加到temp中
final.append(temp) #將temp加到final中
return final
if __name__ == '__main__':
url ='http://www.weather.com.cn/weather/101190401.shtml'
html = get_content(url)
result = get_data(html)
print(result)
write_data(result, 'weather.csv')
然后
困惑:
程序中對℃
字符不知道怎么處理饮六,會(huì)報(bào)錯(cuò)UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 0: ordinal not in range(128)
希望大牛指點(diǎn)迷津,感激不盡