本次實戰目的是為了更好地學習 MySQL 和 MongoDB 操作，需要從貓眼電影上爬取電影信息，加上簡單的 Python GUI 界面操作，效果圖如下：
picture.png
廢話不多說，首先導入本次項目需要導入的模塊，以及主程序入口代碼如下：
import requests
from bs4 import BeautifulSoup
import re
import pymysql
from pymongo import MongoClient
import time
import wx
def create_table_mysql():
    """Drop and recreate the ``movie`` table in the ``Movies_mao`` database.

    Called before every crawl so that each run starts from an empty table.
    The connection is closed even when one of the statements fails; any
    pymysql error propagates to the caller.
    """
    # NOTE(review): credentials are hard-coded here and in the other MySQL
    # helpers of this file.
    db = pymysql.connect(host="localhost", user='root', password='Lizzie94', port=3306, db='Movies_mao')
    try:
        with db.cursor() as cursor:
            cursor.execute("drop table if EXISTS movie ")
            sql = """create table movie(
id int UNSIGNED not null auto_increment,
name char(50) not null,
actor varchar(400) not null,
time DATE not null,
score FLOAT,
PRIMARY KEY (id))ENGINE = InnoDB Default charset=UTF8MB4;
"""
            cursor.execute(sql)
    finally:
        # Release the connection on both the success and the failure path.
        db.close()
"""下載網(wǎng)頁"""
def crawurl(url, timeout=10):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``. A non-200 status or a request error is printed.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Connection problems, timeouts and malformed URLs all end up here.
        print(e)
        return None
    if r.status_code == 200:
        return r.text
    print('request failed, status is {}'.format(r.status_code))
    return None
"""解析網(wǎng)頁"""
def parse(html):
    """Extract movie records from the markup of one board page.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) produces no records instead
            of raising.

    Yields:
        dict: ``name``, ``actor``, ``time`` and ``score`` (all strings) for
        every ``div`` with class ``board-item-content`` found in the page.
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', "board-item-content"):
        movie_name = item.find("p", class_="name").a.text
        # Collapse all whitespace first, then drop the leading label.
        # NOTE(review): the label literals here and below must match the page
        # text exactly (including the colon character) - confirm against the
        # live markup.
        star_text = re.sub(r'\s+', '', item.find("p", class_="star").text)
        movie_star = star_text.replace("主演:", "")
        # Drop the label, then any parenthesised suffix after the date.
        release_text = item.find("p", class_="releasetime").text.replace("上映時間:", "")
        movie_release_time = re.sub(r'\((.*)\)', '', release_text)
        # The score is rendered as two <i> elements: integer part and fraction.
        movie_score = ''.join([item.find('i', class_="integer").text, item.find('i', class_="fraction").text])
        yield {
            'name': movie_name,
            'actor': movie_star,
            'time': movie_release_time,
            'score': movie_score,
        }
"""寫入Mysql數(shù)據(jù)庫"""
def load_to_mysql(data):
    """Insert one record into the MySQL ``movie`` table.

    Args:
        data: Mapping of column name to value. Keys are formatted into the
            statement as column names, so they must come from trusted
            code; values are passed to the driver as query parameters.

    An empty mapping is ignored. Errors raised while inserting are printed
    and the transaction is rolled back instead of propagating; the
    connection is closed in every case.
    """
    if not data:
        # Nothing to insert; avoid issuing "insert into movie() values ()".
        return
    columns = ",".join(data.keys())
    placeholders = ",".join(["%s"] * len(data))
    sql_insert = "insert into movie({keys}) values ({values})".format(keys=columns, values=placeholders)
    db = pymysql.connect(host="localhost", user='root', password='Lizzie94', port=3306, db='Movies_mao')
    try:
        cursor = db.cursor()
        try:
            if cursor.execute(sql_insert, tuple(data.values())):
                print('insert successfully')
            db.commit()
        except Exception as e:
            print("failed", e.args)
            db.rollback()
    finally:
        # Runs even if rollback itself fails, so the connection never leaks.
        db.close()
"""寫入MangoDB"""
def load_to_mangoDB(data):
    """Insert one record into the MongoDB collection ``Mao_Movie.Movie``.

    Args:
        data: Record to store, as a dict. A shallow copy is inserted so
            that the ``_id`` field PyMongo adds to inserted documents does
            not appear in the caller's dict.

    Errors are printed instead of raised; the client is always closed.
    """
    client = MongoClient('localhost')
    try:
        # Attribute access selects the database and collection; per the
        # original notes they are created if they do not exist yet.
        post = client.Mao_Movie.Movie
        # Collection.insert was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one is its single-document replacement.
        if post.insert_one(dict(data)).acknowledged:
            print('insert MongoDB successfully')
    except Exception as e:
        print('insert MongoDB failed', e.args)
    finally:
        client.close()
"""點擊 crawl_button 按鈕觸發(fā)主調用函數(shù)開始爬取電影"""
def main(event):
    """Crawl the board pages and store every movie in MySQL and MongoDB.

    Used as the click handler of ``crawl_button``; ``event`` is the wx
    event object and is not used.

    The ``movie`` table is recreated first, then ten pages (offsets
    0, 10, ..., 90) are downloaded and parsed. A page that cannot be
    downloaded is skipped instead of aborting the handler, and the final
    message box lists the offsets that failed.
    """
    base_url = 'https://maoyan.com/board/4?offset='
    page_count = 10
    # Recreate the table before crawling.
    create_table_mysql()
    failed_offsets = []
    for page_index in range(page_count):
        offset = page_index * 10
        html = crawurl(base_url + str(offset))
        if html is None:
            # crawurl already printed the reason; remember the page and go on.
            failed_offsets.append(offset)
            continue
        for item in parse(html):
            print(item)
            load_to_mysql(item)
            load_to_mangoDB(item)
    if failed_offsets:
        wx.MessageBox("crawl finished, failed page offsets: {}".format(failed_offsets),
                      "Message", wx.OK | wx.ICON_WARNING)
    else:
        wx.MessageBox("crawl movies successfully", "Message", wx.OK | wx.ICON_INFORMATION)
以下代碼是整個項目的主入口程序，需要用到 wx 建立一個 GUI 界面，按鈕 crawl_button 綁定 main() 事件，點擊後觸發爬取電影事件。
if __name__ == '__main__':
    # Script entry point: build the wx window, wire the two buttons to their
    # handlers and run the event loop. The widgets are module-level names
    # because hit_me reads content_text, type_combox and srch_content.
    t1 = time.time()
    app = wx.App()
    frame = wx.Frame(None, title="Spide movie", pos=(1000, 200), size=(500, 400))
    panel = wx.Panel(frame)

    # Row 1: two static labels in a horizontal sizer, sharing the width 1:1.
    lb_box = wx.BoxSizer(wx.HORIZONTAL)
    lb_srch_cont = wx.StaticText(panel, -1, '搜索內容:')
    lb_srch_type = wx.StaticText(panel, -1, '搜索類別:')
    lb_box.Add(lb_srch_cont, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)
    lb_box.Add(lb_srch_type, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)

    # Row 2: search text box, search-type drop-down and search button (3:2:1).
    text_box = wx.BoxSizer(wx.HORIZONTAL)
    content_text = wx.TextCtrl(panel, -1)
    list_type = ['演員名字', '電影名字']
    type_combox = wx.ComboBox(panel, -1, choices=list_type)
    srch_button = wx.Button(panel, label="搜索")
    # NOTE(review): hit_me must already be defined when this line runs; in
    # this file its def appears after this block - confirm the ordering.
    srch_button.Bind(wx.EVT_BUTTON, hit_me)
    text_box.Add(content_text, proportion=3, flag=wx.EXPAND | wx.ALL, border=3)
    text_box.Add(type_combox, proportion=2, flag=wx.EXPAND | wx.ALL, border=3)
    text_box.Add(srch_button, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)

    # Multi-line box that shows the search results.
    srch_content = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.HSCROLL)

    # Button that starts the crawl; a wx handler takes exactly one argument,
    # the event.
    crawl_button = wx.Button(panel, label="開始爬取貓眼電影TOP100")
    crawl_button.Bind(wx.EVT_BUTTON, main)

    # Stack the rows vertically; the result box gets most of the height.
    v_box = wx.BoxSizer(wx.VERTICAL)
    v_box.Add(lb_box, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)
    v_box.Add(text_box, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)
    v_box.Add(srch_content, proportion=10, flag=wx.EXPAND | wx.ALL, border=3)
    v_box.Add(crawl_button, proportion=1, flag=wx.ALIGN_CENTER_HORIZONTAL | wx.ALL, border=3)
    panel.SetSizer(v_box)

    frame.Show()
    app.MainLoop()
    # Elapsed time is "now minus start"; the reverse order printed a negative
    # number.
    print('Total time:', time.time() - t1)
點擊搜索 button，獲取輸入文本框和下拉菜單的值，進行數據庫查詢操作。（注意：hit_me 必須定義在上面的主入口程序之前，否則綁定事件時會找不到該函數。）
def hit_me(event):
    """Search the ``movie`` table and show the matching rows.

    Click handler of the search button; ``event`` is the wx event object
    and is not used. Reads the module-level widgets ``content_text`` (text
    to look for) and ``type_combox`` (which column to search) and writes
    the result, one row per line, into ``srch_content``.

    A message box is shown when either input is empty or when the query
    fails; the database connection is closed in every case.
    """
    content = content_text.GetValue()
    search_type = type_combox.GetValue()
    if not (content and search_type):
        wx.MessageBox("please input some values", "Message", wx.OK | wx.ICON_INFORMATION)
        return
    # The column name is chosen from these two fixed values, never from user
    # text, so formatting it into the statement is safe.
    field = 'actor' if search_type == "演員名字" else 'name'
    value = '%' + content + '%'
    db = pymysql.connect(host="localhost", user='root', password='Lizzie94', port=3306, db='Movies_mao')
    try:
        with db.cursor() as cursor:
            # The user-supplied pattern is sent as a query parameter so the
            # driver escapes it (the original interpolated it into the SQL).
            cursor.execute("Select * from movie where {field} like %s ".format(field=field), (value,))
            results = cursor.fetchall()
        all_row = ''.join(' '.join(str(i) for i in each) + '\n' for each in results)
        srch_content.SetValue(all_row)
    except Exception as e:
        wx.MessageBox("selection from database error", "Message", wx.OK | wx.ICON_INFORMATION)
        print(e)
    finally:
        db.close()
最後的效果圖如下：
picture2.png
picture3.png
picture4.png
未完待續，下次需要用異步方法來爬蟲，本人基礎不是很扎實，只能用同步方法來爬取數據。
最後附上 wxPython 的中文學習資料：https://www.ctolib.com/docs/sfile/wxpy-in-action/12.html
英文學習地址:http://zetcode.com/wxpython/
https://www.wxpython.org/