image.png
這裡主要是對年度票房信息進行操作,url構造、數據解析方面都是比較簡單的了,這裡就只是簡單說一下。
爬蟲
1. 請求網站
用 requests 請求網站,返回源碼信息
def get_Html(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` mapping. The
    response encoding is replaced by the one detected from the content
    (``apparent_encoding``) before the body is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.RequestException: On connection failure, or when
            the server does not answer within the timeout.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    r = requests.get(url, headers=headers, timeout=10)
    r.encoding = r.apparent_encoding
    return r.text
2. 獲取電影數據保存至字典
因為數據不多,我們就對頁面可視的所有數據進行抓取,這裡用到了lxml裡面的etree解析網頁,用xpath獲取對應的數據項然後保存,代碼比較簡單,過程數據項英文翻譯過來就懂了,就不做太多注釋了。
def get_Info(text):
    """Extract the box-office table rows from a page's HTML source.

    Every ``<tr>`` of the table with id ``tbContent`` except the first one
    (presumably the header row) yields one entry in each output list. A cell
    with no text is recorded as ``""`` so that all lists stay the same
    length and no value leaks over from the previous row.

    Args:
        text: HTML source of the page.

    Returns:
        dict mapping each column key (``movie_name``, ``movie_type``,
        ``total``, ``price_average``, ``session_average``, ``origin``,
        ``time``) to a list of strings, one per table row.
    """
    # (output key, XPath relative to the row), in table column order.
    columns = (
        ('movie_name', './td[1]/a/p/text()'),
        ('movie_type', './td[2]/text()'),
        ('total', './td[3]/text()'),
        ('price_average', './td[4]/text()'),
        ('session_average', './td[5]/text()'),
        ('origin', './td[6]/text()'),
        ('time', './td[7]/text()'),
    )
    info = {key: [] for key, _ in columns}

    tree = etree.HTML(text)
    if tree is None:
        # NOTE(review): guards against a parse result of None for input
        # without markup - confirm against the lxml version in use.
        return info

    for movie in tree.xpath('//table[@id="tbContent"]//tr')[1:]:
        for key, path in columns:
            cells = movie.xpath(path)
            # First text node of the cell, or "" when the cell is empty.
            info[key].append(cells[0] if cells else "")
    return info
3. url構造,獲取2008-2019所有榜上的電影信息
# One yearly box-office page URL per year from 2008 through 2019.
urls = list(map("http://www.cbooo.cn/year?year={}".format, range(2008, 2020)))
4. 保存至csv
用到pandas庫,先將字典轉成DataFrame,然後直接寫入csv即可,可參考我之前的可視化相關的內容。(這裡為了顯示中文可以在編碼方面稍做處理)
def write2csv(dict, year):
    """Append one year's rows to ``box_office.csv`` (GBK encoded).

    The column header line is written only for the year 2008; every other
    year appends data rows only.

    Args:
        dict: Mapping of column name to list of values, as accepted by
            ``pd.DataFrame``. (The name shadows the builtin; it is kept so
            existing keyword callers keep working.)
        year: The year being written, as ``'2008'`` or ``2008``.
    """
    # str() so that an int year selects the header branch as well; the
    # plain ``== '2008'`` comparison silently dropped the header for ints.
    write_header = str(year) == '2008'
    df = pd.DataFrame(data=dict, index=None)
    df.to_csv('box_office.csv', index=False, header=write_header,
              encoding='gbk', mode='a')
5. csv文件
image.png
可視化
1. 各類型電影總票房數(柱狀圖)
def draw_bar(filename):
    """Show a bar chart of the summed ``total`` column per ``movie_type``.

    Args:
        filename: Path of the GBK-encoded CSV file to read.
    """
    frame = pd.read_csv(filename, encoding='gbk')
    per_type = frame.groupby(frame['movie_type'])['total']
    box_office = per_type.sum()
    box_office.plot.bar()
    plt.legend()
    # Dashed, half-transparent grid lines.
    plt.grid(linestyle='--', alpha=0.5)
    plt.xlabel("電影類別")
    plt.ylabel("總票房數(shù)量")
    plt.title("各類型電影總票房數(shù)")
    plt.show()
image.png
2. 總票房和平均票價的關係(散點圖)
def draw_scatter(filename):
    """Show a scatter plot of ``total`` against ``price_average``.

    Args:
        filename: Path of the GBK-encoded CSV file to read.
    """
    frame = pd.read_csv(filename, encoding='gbk')
    ticket_price = frame['price_average']
    box_office = frame['total']
    plt.title('總票房和平均票價(jià)的關(guān)系')
    plt.xlabel('平均票價(jià)')
    plt.ylabel('總票房(萬(wàn))')
    plt.scatter(ticket_price, box_office, color='b', linestyle='--', label='上海')
    plt.show()
image.png
3. 劇情類型電影前五票房曲線(折線圖)
def draw_plot(filename):
    """Show a line chart of the five highest-grossing films of one genre.

    Rows whose ``movie_type`` equals the genre literal in the query below
    are summed per ``movie_name``, and the five largest sums are plotted in
    descending order.

    Args:
        filename: Path of the GBK-encoded CSV file to read.
    """
    data = pd.read_csv(filename, encoding='gbk')
    genre_rows = data.query('movie_type == "劇情"')
    per_movie = genre_rows.groupby('movie_name')['total'].sum()
    # Rank after aggregating: taking head(5) first only kept the first five
    # rows in file order, not the five largest totals.
    total = per_movie.sort_values(ascending=False).head(5)
    total.plot()
    plt.legend()
    # Dashed, half-transparent grid lines.
    plt.grid(linestyle='--', alpha=0.5)
    plt.xlabel("電影")
    plt.ylabel("總票房數(shù)量")
    plt.title("劇情類型電影前五票房曲線")
    plt.show()
image.png
4. 電影票房前五的類型分布(餅圖)
def draw_pie(filename):
    """Show a pie chart of the five ``movie_type`` values with most rows.

    The selected counts and their index are also printed to stdout.

    Args:
        filename: Path of the GBK-encoded CSV file to read.
    """
    frame = pd.read_csv(filename, encoding='gbk')
    rows_per_type = frame.groupby(frame['movie_type']).size()
    top_types = rows_per_type.sort_values(ascending=False).head(5)
    print(top_types)
    print(top_types.index)
    plt.title("電影票房前五的類型分布")
    plt.pie(top_types, autopct='%.2f%%', labels=top_types.index)
    # Equal axis scaling.
    plt.axis('equal')
    plt.legend()
    plt.show()
image.png
5. 中文處理
# Use the SimHei font for sans-serif text (the article's "Chinese handling" step).
plt.rcParams.update({'font.sans-serif': ['Simhei']})
- 更多爬蟲代碼詳情查看Github