1 獲取的數(shù)據(jù)展示
2 實現(xiàn)原理
網(wǎng)上查詢了一圈榄檬,要么是付費接口,要么是殘缺文章衔统,一氣之下自己開發(fā)實現(xiàn)鹿榜,現(xiàn)將核心原理和代碼發(fā)出來,方便需要的朋友锦爵。
原理
通過python爬取遍歷爬取百度獎牌榜舱殿,xpath解析網(wǎng)頁數(shù)據(jù),并將獲取的數(shù)據(jù)清洗入庫到mysql险掀。后面就可以隨心統(tǒng)計展示了沪袭。
3核心代碼實現(xiàn)
3.1獲取金牌榜與獲取國家ID數(shù)據(jù)列表
數(shù)據(jù)示例:
{'get_time': '2024-08-09 09:28:04', 'countryAbbr': 'ZAM', 'countryName': '贊比亞', 'delegationId': '151', 'gold': 0, 'silver': 0, 'total': 1, 'rank': 80}
數(shù)據(jù)說明:其中delegationId 為國家id號,gold 金牌數(shù)樟氢,silver 銀牌冈绊,total 獎牌總數(shù)侠鳄,rank 獎牌排名。
代碼實現(xiàn)
def get_current_medalList():
# 爬取的百度接口URL
url = 'https://tiyu.baidu.com/al/major/home?page=home&match=2024%E5%B9%B4%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&tab=%E5%A5%96%E7%89%8C%E6%A6%9C'
# 調(diào)用GET接口
response = requests.get(url)
# 調(diào)用POST接口
# response = requests.post(url, data=data)
html = etree.HTML(response.text)
script_json = html.xpath('//script[@type="application/json"]')
# 金牌數(shù)
format_medalList = []
for per in script_json[0:1]:
json_str = json.loads(per.text)
medalList = json_str['data']['data']['tabsList'][1]['data']['medalList'][0]
# 獲取時間
current_time = json_str['data']['common']['requestStart']
dt = datetime.datetime.fromtimestamp(current_time / 1000)
new_dt = dt.strftime('%Y-%m-%d %H:%M:%S')
# 當前獲取獎牌數(shù)
for list in medalList:
format_list = {
'get_time': new_dt,
'countryAbbr': list['countryAbbr'],
'countryName': list['countryName'],
'delegationId': list['delegationId'],
'gold': list['gold'],
'silver': list['silver'],
'total': list['total'],
'rank': list['rank']
}
print(format_list)
format_medalList.append(format_list)
# medalList-seasonList
# "本屆獎牌榜",
# "2020東京",
# "2016里約熱內(nèi)盧",
# "2012倫敦",
# "2008北京",
# "2004雅典",
# "2000悉尼",
# "1996亞特蘭大",
# "1992巴塞羅那"
return format_medalList
3.2 獲取單個國家(例如:中國 id 為 29)獎牌數(shù)據(jù)明細列表
數(shù)據(jù)示例:
{'id': '1a9a2bfdb9a015662622c4e674ca3d65', 'get_time': '08月08日', 'country': '中國', 'medal': '第28金', 'medalType': 'gold', 'playerName': '羅詩芳', 'smallMatch': '女子59公斤級', 'time': '23:05', 'bigMatch': '舉重'}
數(shù)據(jù)說明: get_time 獲獎日期死宣,time 獲獎時間伟恶,country 國家名字,playerName 運動員名稱
代碼實現(xiàn)
def get_history_medalList(delegationId):
# 接口URL
url = 'https://tiyu.baidu.com/al/major/delegation?page=delegation&match=2024%E5%B9%B4%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&tab=%E8%8E%B7%E5%A5%96%E5%90%8D%E5%8D%95&id=' + delegationId
# 調(diào)用GET接口
response = requests.get(url)
# 調(diào)用POST接口
# response = requests.post(url, data=data)
html = etree.HTML(response.text)
script_json = html.xpath('//script[@type="application/json"]')
output_data = []
for per in script_json[0:1]:
json_str = json.loads(per.text)
# 0 賽程 1 獲獎名單 2 運動員
tabData = json_str['data']['data']['tabsList'][1]['data'][0]['tabData']
# print(tabData)
# 當前獲取獎牌數(shù)
for perdata in tabData:
date_str = perdata['date']
for data in perdata['dateList']:
format_data = {
"id": hashlib.md5(str(date_str + data["country"] + data["playerName"]).encode('utf-8')).hexdigest(),
"get_time": date_str,
"country": data["country"],
"medal": data["medal"],
"medalType": data["medalType"],
"playerName": data["playerName"],
"smallMatch": data["smallMatch"],
"time": data["time"],
"bigMatch": data["bigMatch"]
}
print(format_data)
output_data.append(format_data)
return output_data
放回的html數(shù)據(jù)相對比較復(fù)雜毅该,結(jié)構(gòu)嵌套層數(shù)較多知押。程序代碼中已經(jīng)實現(xiàn)了解析過程,可忽略解析過程鹃骂,直接摘取果實享用台盯。
3.3 獎牌數(shù)據(jù)明細寫入關(guān)系型數(shù)據(jù)庫,mysql
基于明細數(shù)據(jù)創(chuàng)建mysql表單
明細數(shù)據(jù)示例:{'id': '1a9a2bfdb9a015662622c4e674ca3d65', 'get_time': '08月08日', 'country': '中國', 'medal': '第28金', 'medalType': 'gold', 'playerName': '羅詩芳', 'smallMatch': '女子59公斤級', 'time': '23:05', 'bigMatch': '舉重'}
mysql DDL參考:
-- colin.paris_2024 definition
CREATE TABLE `paris_2024` (
`id` varchar(100) NOT NULL,
`get_time` varchar(100) DEFAULT NULL,
`country` varchar(100) DEFAULT NULL,
`medal` varchar(100) DEFAULT NULL,
`medalType` varchar(100) DEFAULT NULL,
`playerName` varchar(100) DEFAULT NULL,
`smallMatch` varchar(100) DEFAULT NULL,
`time` varchar(100) DEFAULT NULL,
`bigMatch` varchar(100) DEFAULT NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
代碼實現(xiàn)
def import_mysql(data_list):
host = 'your ip addr'
port = 3306
dbName = 'databasesName'
user = 'user'
password = 'yourPassword'
db = pymysql.connect(host=host, port=port, user=user, passwd=password, db=dbName, charset='utf8')
# 創(chuàng)建一個游標對象畏线,通過游標對象來進行數(shù)據(jù)的增刪改查静盅。
cursor = db.cursor()
num = 0
for data in data_list:
id = data['id']
get_time = data['get_time']
country = data['country']
medal = data['medal']
medalType = data['medalType']
playerName = data['playerName']
smallMatch = data['smallMatch']
time_data = data['time']
bigMatch = data['bigMatch']
# 構(gòu)造insert into 語句,使用到了format 占位符
sql = "replace INTO colin.paris_2024(id, get_time, country, medal, medalType, playerName, smallMatch, `time`, bigMatch)VALUES('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}')".format(id, get_time, country, medal, medalType, playerName, smallMatch, time_data, bigMatch)
# print("當前時間戳===", sql)
cursor.execute(sql)
num=num+1
#提交事務(wù)
db.commit()
cursor.close()
db.close()
print("輸入條數(shù)",num)
4 各個代碼塊調(diào)用main
需要python包
import json
import requests
from lxml import etree
import datetime
import hashlib
import pymysql
main 函數(shù)
import json
import requests
from lxml import etree
import datetime
import hashlib
import pymysql
def 3.1 核心函數(shù)(copy到此)
def 3.2 核心函數(shù)(copy到此)
def 3.3 核心函數(shù)(copy到此)
if __name__ == '__main__':
# 獲取當天數(shù)據(jù)寝殴,包含delegationId
format_medalList = get_current_medalList()
all_data=[]
# 遍歷獲取歷史數(shù)據(jù)
for medalList in format_medalList:
delegationId = medalList['delegationId']
# print(delegationId)
# 遍歷國家 獲獎信息
output_data = get_history_medalList(delegationId)
all_data.extend(output_data)
# 寫入數(shù)據(jù)庫
import_mysql(all_data)
print("寫入完成")
以上是整個巴黎奧運會獎牌數(shù)據(jù)獲取實現(xiàn)原理和明細代碼蒿叠。
感謝給個關(guān)注,點個贊蚣常!