即將進入知識付費行業，所以想爬取知乎Live課程的相關數據，做一個簡單的數據分析，主要分析目標都列出來了。於是自己寫了一個爬蟲，但遺憾的是PC端API開放的數據不夠全，而且部分課程可用優惠券購買，所以課程的收入也不是很準確。
最終獲取的數據存儲到MySQL中，分別記錄了每個課程的id、名稱、主講人、評分、評價數、參與人數、單價、收入等相關數據。
以下為源代碼，寫得不好，多多見諒。
# coding:utf-8
# 分析目標
# 1、課程總收入 = 單價 * 學習人數
# 2、每天產生的課程數，形成一個趨勢圖（日期 * 課程數）
# 3、參與人數最多的課程排序
# 4、收入最多的課程排序
# 5、獲得所有課程的tag，以及對應tag的課程數量，按課程數量對tag進行排序
# 6、每個課程的相關數據：標題、主講人、評分、評價數、語音條數、問答數、文件數、參與數、單價
# 7、主講人和他的所有課程，以及課程數量
# 8、主講人獲得的總收入，每個課程的收入
# 涉及到的URL：
# 知乎Live列表：https://api.zhihu.com/lives/homefeed?includes=live
# 知乎Live詳情：https://api.zhihu.com/lives/868793703320408064
import requests
import json
import MySQLdb
# Fetch the Zhihu Live homefeed and build one detail-page API URL per course.
def get_course_url():
    """Fetch the Zhihu Live homefeed listing and return the detail-API URLs.

    Returns:
        list[str]: URLs of the form ``https://api.zhihu.com/lives/<id>``,
        one per course in the homefeed (``limit=88`` items requested).

    Raises:
        requests.HTTPError: if the homefeed request fails.
    """
    url = 'https://api.zhihu.com/lives/homefeed?includes=live&limit=88'
    headers = {
        'referer': 'https://www.zhihu.com/lives',
        'host': 'api.zhihu.com',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # Fail fast on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    # The feed wraps each course under key 'live'; its 'id' keys the detail API.
    courses = response.json()['data']
    base_url = 'https://api.zhihu.com/lives/'
    courseUrlList = [base_url + str(course['live']['id']) for course in courses]
    print('獲取詳情頁URL成功')
    return courseUrlList
# Fetch each course detail page and extract the per-course metrics.
def get_course_info(urlList):
    """Fetch every detail URL and collect per-course data dicts.

    Args:
        urlList: list of Zhihu Live detail-API URLs (from ``get_course_url``).

    Returns:
        list[dict]: one dict per course with keys id, name, speakName, rank,
        commentNum, answerNum, seatNum, fee and the derived income.

    Raises:
        requests.HTTPError: if any detail request fails.
    """
    # The headers are identical for every request — build them once, not per URL.
    headers = {
        'referer': 'https://www.zhihu.com/lives',
        'host': 'api.zhihu.com',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    courseList = []
    for url in urlList:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        detail = response.json()
        taken = float(detail['seats']['taken'])
        # 'amount' appears to be in cents, hence the /100 below — TODO confirm.
        amount = float(detail['fee']['amount'])
        courseList.append({
            'id': detail['id'],
            'name': detail['subject'],
            'speakName': detail['speaker']['member']['name'],
            'rank': detail['feedback_score'],
            'commentNum': detail['review']['count'],
            'answerNum': detail['reply_message_count'],
            'seatNum': detail['seats']['taken'],
            'fee': detail['fee']['amount'],
            # Gross income estimate: participants * unit price.
            'income': taken * (amount / 100)
        })
    print('獲取課程列表成功')
    return courseList
# Persist the scraped course dicts into the MySQL table `liveCourse`.
def insert_data(list):
    """Create the ``liveCourse`` table if needed and insert one row per course.

    Args:
        list: course dicts as produced by ``get_course_info``.

    Side effects: connects to a local MySQL database ``zhihu`` and commits
    one row per course; rolls back and reports on any MySQL error.
    """
    courses = list  # alias so the shadowed builtin name is not used below
    db = MySQLdb.connect(host='localhost', user='root', passwd='mark2227_', db='zhihu', charset='utf8')
    cursor = db.cursor()
    print('連接數據庫成功')
    try:
        # IF NOT EXISTS: re-runs previously aborted here (duplicate-table error
        # fell into the bare except and skipped every insert).
        # `rank` is a reserved word in MySQL 8+, so it must be backtick-quoted.
        createSql = '''create table if not exists liveCourse(
        id bigint not null,
        name varchar(25) not null,
        speakName varchar(25) not null,
        `rank` float,
        commentNum int,
        answerNum int,
        seatNum int,
        fee int,
        income float
        )
        '''
        cursor.execute(createSql)
        print('成功創建表')
        db.commit()
        insertSql = ('insert into liveCourse'
                     '(id,name,speakName,`rank`,commentNum,answerNum,seatNum,fee,income)'
                     ' Values(%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        for course in courses:
            param = [
                int(course['id']),
                course['name'],
                course['speakName'],
                float(course['rank']),
                int(course['commentNum']),
                int(course['answerNum']),
                int(course['seatNum']),
                int(course['fee']),
                float(course['income']),
            ]
            cursor.execute(insertSql, param)
            db.commit()
            print('成功插入一條數據')
    except MySQLdb.MySQLError as e:
        # Narrow except instead of a bare one: report the actual error
        # instead of silently swallowing bugs (KeyError, etc.).
        print('插入數據失敗')
        print(e)
        db.rollback()
    finally:
        # Always release the connection, whichever path we took.
        db.close()
if __name__ == '__main__':
    # Pipeline: collect detail URLs, scrape each course, persist to MySQL.
    detail_urls = get_course_url()
    scraped_courses = get_course_info(detail_urls)
    insert_data(scraped_courses)