最近有人問我Python爬蟲相關的東西,所以打算把我之前寫的爬豆瓣TOP250的影片信息并存入數據庫的內容寫出來。
爬取豆瓣TOP250的影片信息
# coding:utf-8
import uuid
import requests
import unicodedata
from lxml import html
import db_douban
def _clean_info_line(text):
    """Return *text* with every space and newline removed."""
    return text.replace(" ", "").replace("\n", "")


def _to_utf8(text):
    """NFKD-normalize *text* and return it as UTF-8 encoded bytes."""
    return unicodedata.normalize('NFKD', text).encode('utf-8', 'ignore')


def list_douban_top250():
    """Scrape the Douban Top 250 movie list and store it in the database.

    Fetches 10 list pages (25 entries per page), extracts one record per
    ``<div class="info">`` element, and passes all records to
    ``db_douban.insert_movies`` in a single call after the last page.

    Each record is a tuple in this order:
    ``(id, name, actor, release_date, country, rate, comment_count, rank)``.
    Every text field is NFKD-normalized and UTF-8 encoded; ``rank`` is an
    ``int`` counting from 1 in page order.

    Raises:
        requests.HTTPError: if a list page answers with an error status.
        requests.Timeout: if a list page does not answer in time.
        IndexError: if an entry lacks one of the expected nodes or fields.
    """
    print('正在獲取豆瓣TOP250影片信息并存入數(shù)據(jù)庫...')
    movies = []
    index = 1
    page_count = 10
    for i in range(page_count):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(i * 25)
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=10)
        # Fail loudly on an error page instead of parsing it into zero rows
        # and reporting success.
        response.raise_for_status()
        # Parsed document root
        doc = html.fromstring(response.content)
        for y in doc.xpath('//div[@class="info"]'):
            # Movie title
            name = y.xpath('div[@class="hd"]/a/span[@class="title"]/text()')[0]
            # Text lines of the detail paragraph
            move_content = y.xpath('div[@class="bd"]/p[1]/text()')
            # Director / cast line
            actor = _clean_info_line(move_content[0])
            # Second line is split on "/" once; field 0 is used as the release
            # date and field 1 as the country.
            # NOTE(review): assumes the line is "date / country / genre"; an
            # entry with extra leading fields would shift the country - confirm.
            facts = _clean_info_line(move_content[1]).split("/")
            date = facts[0]
            country = facts[1]
            # Rating
            rate = y.xpath('div[@class="bd"]/div[@class="star"]/span[2]/text()')[0]
            # Number of ratings. Keep only the digits: stripping one fixed
            # label literal leaves the label in the stored value whenever the
            # page's wording differs from that literal.
            com_count = y.xpath('div[@class="bd"]/div[@class="star"]/span[4]/text()')[0]
            rating_count = ''.join(ch for ch in com_count if ch.isdigit())
            # Row id
            move_id = uuid.uuid1().hex
            # Progress log
            print('TOP%s--%s--評分%s--人數(shù)%s' % (str(index), name, rate, rating_count))
            # Build the row in the column order expected by the INSERT
            movie = (
                _to_utf8(move_id),
                _to_utf8(name),
                _to_utf8(actor),
                _to_utf8(date),
                _to_utf8(country),
                _to_utf8(rate),
                _to_utf8(rating_count),
                index,
            )
            movies.append(movie)
            index += 1
    # Store every collected row in one batch
    db_douban.insert_movies(movies)
    print('任務(wù)執(zhí)行完成揉阎!')
# Unguarded top-level call: the crawl starts as soon as this module is run.
list_douban_top250()
存入數據庫
import pymysql
# Insert multiple rows
def insert_movies(movies):
    """Insert several movie rows into ``original_douban`` in one transaction.

    Args:
        movies: sequence of 8-item sequences, each ordered as
            ``(id, name, actor, release_date, country, rate,
            comment_count, rank)``.

    A ``pymysql.Error`` raised by the insert or commit is not propagated:
    the transaction is rolled back and the error is printed. The cursor and
    connection are always closed.
    """
    db = pymysql.connect(host='localhost', port=3306, user='root', passwd='', db='movie', charset='utf8')
    cursor = db.cursor()
    # `rank` is backtick-quoted because RANK is a reserved word from
    # MySQL 8.0.2 on; unquoted it makes the statement a syntax error there.
    sql = (
        "INSERT INTO original_douban"
        "(id,name,actor,release_date,country,rate,comment_count,`rank`) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    try:
        cursor.executemany(sql, movies)
        db.commit()
    except pymysql.Error as exc:
        db.rollback()
        # Report the failure instead of discarding it silently.
        print('insert_movies: insert failed, transaction rolled back: %s' % exc)
    finally:
        cursor.close()
        db.close()
# Insert a single row
def insert_movie(movie):
    """Insert one movie row into ``original_douban``.

    Args:
        movie: 8-item sequence ordered as ``(id, name, actor, release_date,
            country, rate, comment_count, rank)``.

    A ``pymysql.Error`` raised by the insert or commit is not propagated:
    the transaction is rolled back and the error is printed. The cursor and
    connection are always closed.
    """
    db = pymysql.connect(host='localhost', port=3306, user='root', passwd='', db='movie', charset='utf8')
    cursor = db.cursor()
    # `rank` is backtick-quoted because RANK is a reserved word from
    # MySQL 8.0.2 on; unquoted it makes the statement a syntax error there.
    sql = (
        "INSERT INTO original_douban"
        "(id,name,actor,release_date,country,rate,comment_count,`rank`) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    try:
        cursor.execute(sql, movie)
        db.commit()
    except pymysql.Error as exc:
        db.rollback()
        # Report the failure instead of discarding it silently.
        print('insert_movie: insert failed, transaction rolled back: %s' % exc)
    finally:
        cursor.close()
        db.close()
結果
log.png
movie_db.png