這次爬取的國家社科基金項目數據用到的包和上次的【Python實戰】1997-2019年教育統計數據爬取并導出excel
差不多,都用到了pandas.read_html
,而且比之前的爬取要簡單。
爬取網站為:fz.people.com.cn/skygb/sk/index.php/index/index/
具體代碼:
import pandas as pd
from urllib import request
import time,random,re,os
import urllib.request
from lxml import etree
from pandas.core.frame import DataFrame
import datetime
# Pick a random set of request headers
def getheaders():
    """Return a headers dict containing one randomly chosen User-Agent.

    Returns:
        dict: ``{'User-Agent': <string>}`` where the value is drawn with
        ``random.choice`` from the browser User-Agent strings listed below.
    """
    # Every entry must end with a comma: adjacent string literals are joined
    # implicitly, which would silently merge two User-Agents into one value.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36",
    ]
    return {'User-Agent': random.choice(user_agent_list)}
# Fetch the HTML of a page
def get_page(url, max_attempts=10, timeout=30):
    """Download ``url`` and return its body decoded as UTF-8 text.

    The connection sometimes fails, so the request is retried a few times
    to avoid aborting a long crawl halfway through.

    Args:
        url: Address of the page to fetch.
        max_attempts: Maximum number of tries before giving up.
        timeout: Per-request timeout in seconds passed to ``urlopen``, so a
            stalled connection counts as a failed attempt instead of
            hanging forever.

    Returns:
        str: The decoded response body.

    Raises:
        RuntimeError: If every attempt failed; the last underlying error is
            chained as ``__cause__``.
    """
    # Local import keeps this function self-contained; the module's import
    # block does not bring the ``http`` package into scope.
    import http.client

    # One User-Agent is chosen per call and reused across the retries.
    headers = getheaders()
    last_error = None
    for _ in range(max_attempts):
        try:
            req = urllib.request.Request(url=url, headers=headers)
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(req, timeout=timeout) as response:
                return response.read().decode('utf_8')
        except (OSError, http.client.HTTPException, UnicodeDecodeError) as err:
            # URLError/HTTPError and socket timeouts are OSError subclasses.
            last_error = err
    raise RuntimeError(
        'failed to fetch {} after {} attempts'.format(url, max_attempts)
    ) from last_error
# Build the DataFrame for a single result page
def get_df(url, table_index=2):
    """Fetch ``url`` and return one of its HTML tables as a DataFrame.

    Args:
        url: Address of the result page.
        table_index: Position of the wanted table in the list returned by
            ``pandas.read_html``. Defaults to 2, the index this script has
            always used.

    Returns:
        pandas.DataFrame: The table at ``table_index``.

    Raises:
        IndexError: If the page contains fewer than ``table_index + 1`` tables.
    """
    from io import StringIO

    html = get_page(url)
    # Passing a literal HTML string to read_html is deprecated; wrapping it
    # in a file-like object is the supported way to parse in-memory markup.
    tables = pd.read_html(StringIO(html))
    return tables[table_index]
# Get the total number of result pages
def get_page_num(yeeurl):
    """Return the page count advertised by the pagination bar of ``yeeurl``.

    Args:
        yeeurl: Address of the first result page for a query.

    Returns:
        int: Value of the ``data-ci-pagination-page`` attribute on the 12th
        link of the pagination container.

    Raises:
        IndexError: If that link is not present on the page.
    """
    # Use the argument itself rather than whatever module-level ``url``
    # happens to hold at call time.
    html = get_page(yeeurl)
    selector = etree.HTML(html)
    # NOTE(review): a[12] is presumably the "last page" link; result sets
    # with a shorter pagination bar will not have it - confirm against the site.
    matches = selector.xpath('/html/body/div[3]/div/a[12]/@data-ci-pagination-page')
    if not matches:
        raise IndexError('pagination link a[12] not found on {}'.format(yeeurl))
    return int(matches[0])
if __name__ == '__main__':
    # Crawl every result page for each approval year 1991-2020 and write all
    # rows to a single Excel workbook.
    #
    # Expected column labels of the result table. Two labels had stray pinyin
    # annotations embedded in them ('(yè)', '(qū)'), which are removed here.
    # NOTE(review): these labels must match the header text that read_html
    # extracts from the site; otherwise extra columns appear - verify.
    columnnames = ['項目批準號', '項目類別', '學科分類', '項目名稱', '立項時間', '項目負責人', '專業職務', '工作單位', '單位類別', '所在省區市', '所屬系統', '成果名稱', '成果形式', '成果等級', '結項時間', '結項證書號', '出版社', '出版時間', '作者', '獲獎情況']
    base_url = 'http://fz.people.com.cn/skygb/sk/index.php/index/seach/'
    # Search query shared by every request; only the approval year varies.
    query_template = '?pznum=&xmtype=0&xktype=0&xmname=&lxtime={}&xmleader=&zyzw=0&gzdw=&dwtype=0&szdq=0&ssxt=0&cgname=&cgxs=0&cglevel=0&jxdata=0&jxnum=&cbs=&cbdate=0&zz=&hj='

    # Collect the per-page frames and concatenate once at the end:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas 2.x.
    frames = []
    for i in range(1991, 2021):
        print('****第{}年****'.format(i))
        query = query_template.format(i)
        url = base_url + query
        if i == 1995:
            # Page count for 1995 is hard-coded instead of being read from the page.
            pagenums = 9
        else:
            pagenums = get_page_num(url)
        print('**共{}頁**'.format(pagenums))
        for n in range(1, pagenums + 1):
            new_url = base_url + str(n) + query
            print(new_url)
            frames.append(get_df(new_url))
            print('---第{}頁已獲取---'.format(n))

    if frames:
        df = pd.concat(frames, ignore_index=True)
        # Keep the expected columns first, followed by any unexpected ones,
        # which is the column order the append-based version produced.
        extra_columns = [c for c in df.columns if c not in columnnames]
        df = df.reindex(columns=columnnames + extra_columns)
    else:
        df = DataFrame(columns=columnnames)
    df.to_excel('sheke_fund_1028.xlsx', index=False)
最后用時差不多30分鐘,獲得的excel數據如下:
GZ號:amazingdata (數(shù)據(jù)格子鋪)
后臺回復:國家社科,可下載excel數據