看了@chaosmind的Python作業 -- 天氣預報爬蟲，知道直接爬取為什麼抓不到數據的原因。順便說一下http://d1.weather.com.cn/calendar_new/2017/101280701_201706.html?_=1495720234075
後面的_參數表示的是一個13位的Unix時間戳。這個數是根據你發起請求的時間來的。Python下可以通過time模塊的time方法得到。附上一個Unix時間戳在線轉換站長工具。
一、作業要求
爬取中國天氣網 你所在城市過去一年的歷史數據 http://www.weather.com.cn/forecast/
二、網址構成
http://d1.weather.com.cn/calendar_new/2017/101280701_201706.html?_=1495720234075
代碼可以表示為
'http://d1.weather.com.cn/calendar_new/{year}/{city_id}_{date}.html?_={time}'.format(year=year, city_id=city_id, date=date, time=time)
三、解題思路
- 獲取今天的日期，得到年、月，如201705
- 構造一個url列表，日期為201605 - 201705
- 遍歷url列表獲取天氣信息
四、數據獲取
從返回結果看，天氣預報的信息存儲在一個json字符串中，賦值給var fc40。
五、參考代碼
import time
import datetime
from datetime import date
import requests
import json
import csv
# URL template for one month of calendar data. The value of the trailing
# `_` query parameter is left off here and appended by the caller.
base_url = (
    'http://d1.weather.com.cn/calendar_new/'
    '{year}/{city_id}_{date}.html?_='
)

# Headers sent with every page request.
headers = {
    'Referer': (
        'http://www.weather.com.cn/weather40d/'
        '101300903.shtml'
    ),
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}
# Request the URL and return the response.
def get_html(url, timeout=10):
    """Fetch ``url`` with the module-level ``headers`` and return the response.

    Args:
        url: Fully built request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The response object from ``requests.get``. The HTTP status code is
        not checked here.
    """
    return requests.get(url, headers=headers, timeout=timeout)
# datas = json.loads(get_html(base_url).content[11:])
# for i in datas:
# print(i)
from datetime import timedelta

# Module-level snapshot of the current year and month, echoed to stdout
# at import time (month zero-padded to two digits, then the year).
year = date.today().year
month = date.today().month
print('{:02d}'.format(month))
print('{}'.format(year))
# Get today's date: year and month.
def get_today():
    """Return the current year and month.

    Returns:
        A dict ``{'year': int, 'month': int}`` taken from ``date.today()``.
    """
    # Read the clock once so that year and month always belong to the same
    # day, even if the call happens right at a month or year rollover.
    current = date.today()
    return {'year': current.year, 'month': current.month}
# Get the date one year ago: year and month.
def get_one_year_ago():
    """Return the dict from ``get_today()`` with the year reduced by one.

    Returns:
        A dict with keys ``'year'`` and ``'month'``; the month is unchanged.
    """
    current = get_today()
    return {
        'year': current.get('year') - 1,
        'month': current.get('month'),
    }
# Build the list of URLs from the year/month information.
def generate_url_list(start_date, end_date, city_id=101300903, url_template=None):
    """Build one URL per calendar month from ``start_date`` to ``end_date``.

    Both ends are inclusive. The previous implementation bounded the final
    year by the *start* month and began every earlier year at the start
    month, so it was only correct when both dates shared the same month.

    Args:
        start_date: Dict with integer ``'year'`` and ``'month'`` (1-12).
        end_date: Dict with integer ``'year'`` and ``'month'`` (1-12).
        city_id: City identifier substituted into the URL.
        url_template: Format string with ``{year}``, ``{city_id}`` and
            ``{date}`` placeholders. Defaults to the module-level
            ``base_url``.

    Returns:
        A list of URL strings in chronological order; empty when
        ``start_date`` is later than ``end_date``.
    """
    template = base_url if url_template is None else url_template

    # Number the months linearly (year * 12 + zero-based month) so that
    # crossing a year boundary needs no special cases.
    first = start_date.get('year') * 12 + start_date.get('month') - 1
    last = end_date.get('year') * 12 + end_date.get('month') - 1

    weather_url_list = []
    for index in range(first, last + 1):
        year_number, month_offset = divmod(index, 12)
        year_text = str(year_number)
        month_text = str(month_offset + 1).zfill(2)
        weather_url_list.append(
            template.format(
                year=year_text,
                date=year_text + month_text,
                city_id=city_id,
            )
        )
    return weather_url_list
# Clean the data and save the cleaned rows to a CSV file.
def parser_weather_data(resp, csv_path='weather_beiliu.csv'):
    """Decode one calendar response and append its entries to a CSV file.

    Args:
        resp: Raw response body. The first 11 characters are skipped before
            JSON decoding (presumably the JavaScript ``var fc40 = ``
            assignment prefix; confirm against a live response).
        csv_path: File to append to. Defaults to the file name used by the
            rest of the script.

    Each entry becomes one row: ``date``, ``hgl``, ``hmax``, ``hmin``,
    ``nlyf`` joined with ``nl``, and ``wk``.
    """
    weather_infos = json.loads(resp[11:])
    # Open the file once for the whole batch instead of once per row.
    # newline='' leaves line endings to the csv module, which avoids blank
    # lines between rows on Windows.
    with open(csv_path, 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for info in weather_infos:
            # A missing part is treated as empty text so one incomplete
            # entry cannot abort the run with a TypeError.
            lunar_date = (info.get('nlyf') or '') + (info.get('nl') or '')
            writer.writerow([
                info.get('date'),
                info.get('hgl'),
                info.get('hmax'),
                info.get('hmin'),
                lunar_date,
                info.get('wk'),
            ])
def main():
    """Create the CSV with its header row, then fetch and append every month.

    The output file is truncated on each run. Pages are requested one at a
    time with a one-second pause between requests.
    """
    # newline='' leaves line endings to the csv module, which avoids a blank
    # line after the header on Windows.
    with open('weather_beiliu.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['日期', '降水概率', '最高溫', '最低溫', '農(nóng)歷', '星期幾'])
    # Walk the URLs and fetch the weather data for each month. The header
    # file is already closed here, so the appended rows land after it.
    for url in generate_url_list(get_one_year_ago(), get_today()):
        # Complete the URL with the current time in milliseconds as the
        # value of the `_` query parameter.
        request_url = url + str(round(time.time()*1000))
        # Progress message
        print('獲取頁(yè)面:{}的數(shù)據(jù)'.format(request_url))
        parser_weather_data(get_html(request_url).content)
        # Pause between requests.
        time.sleep(1)


if __name__ == '__main__':
    main()