Fetching an image and saving it to a folder
import urllib.request
# Request the image and read the raw bytes
response = urllib.request.urlopen('http://placekitten.com/1920/1280')
cat_img = response.read()
# Write the bytes to a local file in binary mode
with open('cat_1920_1280.jpg', 'wb') as f:
    f.write(cat_img)
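The heading mentions saving into a folder, but the snippet above writes to the current directory. A minimal sketch of the folder variant (the folder name cats is an assumption):

import os
import urllib.request

folder = 'cats'  # hypothetical folder name
os.makedirs(folder, exist_ok=True)  # create the folder if it does not already exist
response = urllib.request.urlopen('http://placekitten.com/1920/1280')
with open(os.path.join(folder, 'cat_1920_1280.jpg'), 'wb') as f:
    f.write(response.read())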
Using Youdao Translate
# -*- coding:utf-8 -*-
import urllib.request
import urllib.parse
import json
content = input("Enter the text to translate: ")
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'
# Fake a browser User-Agent so the server does not reject the script
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
# Form fields expected by the (older) Youdao web translation API
data = {}
data['type'] = 'AUTO'
data['i'] = content
data['doctype'] = 'json'
data['xmlVersion'] = '1.8'
data['keyfrom'] = 'fanyi.web'
data['ue'] = 'UTF-8'
data['action'] = 'FY_BY_CLICKBUTTON'
data['typoResult'] = 'true'
# urlencode the form and encode it to bytes; supplying data makes this a POST request
data = urllib.parse.urlencode(data).encode('utf-8')
req = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')
target = json.loads(html)
print("Translation: %s" % target['translateResult'][0][0]['tgt'])
A Python crawler that downloads all the images on jandan.net to the local disk
# -*- coding:utf-8 -*-
import urllib.request
import os

def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
    response = urllib.request.urlopen(req)  # open req, not url, so the User-Agent header is actually sent
    html = response.read()
    return html

def get_page(url):
    html = url_open(url).decode('utf-8')
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)  # search for ']' starting from position a
    return html[a:b]  # the current page number

def find_imgs(url):
    html = url_open(url).decode('utf-8')
    img_address = []
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg', a, a+255)  # search from a, limited to a+255
        if b != -1:
            img_address.append('http:' + html[a+9:b+4])
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_address

def save_imgs(folder, img_address):
    for each in img_address:
        filename = each.split('/')[-1]  # the last path component is the image name
        with open(filename, 'wb') as f:
            img = url_open(each)
            f.write(img)

def download_mm(folder='ooxx', pages=10):
    os.mkdir(folder)
    os.chdir(folder)
    url = "http://jandan.net/ooxx"
    page_num = int(get_page(url))
    for i in range(pages):
        page_url = url + '/page-' + str(page_num - i) + '#comments'  # walk backwards one page per iteration
        img_address = find_imgs(page_url)
        save_imgs(folder, img_address)

if __name__ == '__main__':
    download_mm()
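Fetching ten pages back to back can look abusive to the server. A hypothetical variant, download_mm_politely, that reuses the helpers above and sleeps between pages:

import time

def download_mm_politely(folder='ooxx', pages=10, delay=1.0):
    os.makedirs(folder, exist_ok=True)  # unlike os.mkdir, does not fail if the folder exists
    os.chdir(folder)
    url = "http://jandan.net/ooxx"
    page_num = int(get_page(url))
    for i in range(pages):
        page_url = url + '/page-' + str(page_num - i) + '#comments'
        save_imgs(folder, find_imgs(page_url))
        time.sleep(delay)  # pause between pages to reduce server load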
A Python crawler that downloads all the images in a Tieba thread to the local disk
# -*- coding:utf-8 -*-
import urllib.request
import re

def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
    response = urllib.request.urlopen(req)  # open req, not url, so the User-Agent header is actually sent
    html = response.read()
    return html

def get_img(html):
    # Match the src of every in-post image (class BDE_Image) ending in .jpg
    p = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
    imglist = re.findall(p, html.decode('utf-8', 'ignore'))
    for each in imglist:
        filename = each.split("/")[-1]  # the last path component is the image name
        urllib.request.urlretrieve(each, filename, None)

if __name__ == '__main__':
    url = 'http://tieba.baidu.com/p/3563409202'
    get_img(url_open(url))
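A thread often spans several pages, and Tieba paginates with a pn query parameter. A sketch that walks the first few pages (the fixed page count of 3 is an assumption; a real crawler would parse the count from the thread):

for pn in range(1, 4):  # assumed page count
    page_url = 'http://tieba.baidu.com/p/3563409202?pn=' + str(pn)
    get_img(url_open(page_url))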
Crawling the Douban Movie Top 250 (for reference)
import pymysql
import requests
from bs4 import BeautifulSoup

# %d is a placeholder for the page offset
baseUrl = "https://movie.douban.com/top250?start=%d&filter="

def get_movies(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ol", "grid_view").find_all("li")  # every movie entry on the page
    for i in items:
        movie = {}  # holds one movie's data
        movie["rank"] = i.find("em").text  # ranking
        movie["link"] = i.find("div", "pic").find("a").get("href")  # detail-page link
        movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster URL
        movie["name"] = i.find("span", "title").text  # title
        movie["score"] = i.find("span", "rating_num").text  # rating
        movie["other"] = i.find("span", "other").text.replace('/', '').replace(' ', '/')  # alternative titles
        movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # some movies have no tagline; default to empty
        movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
        movie["detail"] = i.find("div", "bd").find("p", "").text  # details (director, year, country, genre)
        lists.append(movie)  # collect into the result list
    return lists

if __name__ == "__main__":
    # Connect to the database; specify charset or inserts may fail
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
    # CREATE TABLE statement; `rank` is backquoted because it is a reserved word in newer MySQL versions
    createTab = """CREATE TABLE movies(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(20) NOT NULL,
        `rank` VARCHAR(4) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL,
        score VARCHAR(4) NOT NULL,
        other VARCHAR(100) NOT NULL,
        quote VARCHAR(50),
        detail VARCHAR(300) NOT NULL,
        comment_num VARCHAR(100) NOT NULL
        )"""
    cursor.execute(createTab)
    for start in range(0, 250, 25):
        lists = get_movies(start)  # fetch and parse one page
        for i in lists:
            # Insert one row; the %s placeholders are filled in by pymysql
            sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["comment_num"]))
                db.commit()
                print(i["name"] + " inserted successfully")
            except Exception:
                db.rollback()  # undo the failed insert
    db.close()
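To verify the inserts, a minimal read-back sketch using the same connection parameters as above:

import pymysql

db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("SELECT `rank`, name, score FROM movies ORDER BY CAST(`rank` AS UNSIGNED) LIMIT 10")
for rank, name, score in cursor.fetchall():
    print(rank, name, score)
db.close()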
Splitting the crawled Douban movie details into year, country/region, genre, etc., and writing them to MySQL
import pymysql
import requests
from bs4 import BeautifulSoup
import re

# %d is a placeholder for the page offset
baseUrl = "https://movie.douban.com/top250?start=%d&filter="

def get_movies(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ol", "grid_view").find_all("li")  # every movie entry on the page
    for i in items:
        movie = {}  # holds one movie's data
        movie["rank"] = i.find("em").text  # ranking
        movie["link"] = i.find("div", "pic").find("a").get("href")  # detail-page link
        movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster URL
        movie["name"] = i.find("span", "title").text  # title
        movie["score"] = i.find("span", "rating_num").text  # rating
        movie["other"] = i.find("span", "other").text.replace('/', '').replace(' ', '/')  # alternative titles
        movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # some movies have no tagline; default to empty
        movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
        movie["detail"] = i.find("div", "bd").find("p", "").text  # details (director, year, country, genre)
        lists.append(movie)  # collect into the result list
    return lists

if __name__ == "__main__":
    # Connect to the database; specify charset or inserts may fail
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
    # CREATE TABLE statement; `rank` is backquoted because it is a reserved word in newer MySQL versions
    createTab = """CREATE TABLE movies(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(20) NOT NULL,
        `rank` VARCHAR(4) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL,
        score VARCHAR(4) NOT NULL,
        other VARCHAR(100) NOT NULL,
        quote VARCHAR(50),
        detail VARCHAR(300) NOT NULL,
        time VARCHAR(300) NOT NULL,
        country VARCHAR(300) NOT NULL,
        type VARCHAR(300) NOT NULL,
        director_artist VARCHAR(300) NOT NULL,
        comment_num VARCHAR(100) NOT NULL
        )"""
    cursor.execute(createTab)
    for start in range(0, 250, 25):
        lists = get_movies(start)  # fetch and parse one page
        for i in lists:
            data = []  # reset per movie, otherwise fields from earlier movies leak into data[0]
            action = i["detail"]
            remove = re.compile(r' |\n|</br>|\.')  # strip spaces, newlines, stray tags and dots
            bd = re.sub(remove, "", action)
            bd = re.sub('<br>', " ", bd)  # replace <br> with a space
            bd = re.sub('/', " ", bd)  # replace / with a space
            words = bd.split(" ")
            for s in words:
                if len(s) != 0 and s != ' ':  # skip blank fragments
                    data.append(s)
            i["time"] = data[-3][-5:]  # release year
            i["country"] = data[-2]  # country or region
            i["type"] = data[-1]  # genre
            i["director_artist"] = data[0]  # director and cast
            # Insert one row; the %s placeholders are filled in by pymysql
            sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`time`,`country`,`type`,`director_artist`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["time"], i["country"], i["type"], i["director_artist"], i["comment_num"]))
                db.commit()
                print(i["name"] + " inserted successfully")
            except Exception:
                db.rollback()  # undo the failed insert
    db.close()
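To see what the detail-string munging produces, a standalone trace on a made-up detail string (the sample text is illustrative, not real Douban output):

import re

detail = "\n 导演: 弗兰克·德拉邦特   主演: 蒂姆·罗宾斯<br>\n 1994 / 美国 / 犯罪 剧情\n"
bd = re.sub(r' |\n|</br>|\.', "", detail)  # strip spaces, newlines and stray tags
bd = re.sub('<br>', " ", bd)               # <br> becomes a field separator
bd = re.sub('/', " ", bd)                  # so does /
data = [s for s in bd.split(" ") if s]
print(data)                                # ['导演:弗兰克·德拉邦特主演:蒂姆·罗宾斯', '1994', '美国', '犯罪剧情']
print(data[-3][-5:], data[-2], data[-1])   # 1994 美国 犯罪剧情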
The release years of the Top 250 movies can then be plotted:
(Figure: distribution of the Douban Top 250 movies by release year)
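A minimal plotting sketch that could produce such a chart, assuming the time column from the table above holds the release year and that matplotlib is installed:

import pymysql
from collections import Counter
import matplotlib.pyplot as plt

db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("SELECT time FROM movies")
years = [int(row[0]) for row in cursor.fetchall() if row[0].isdigit()]
db.close()

counts = Counter(years)  # year -> number of movies
plt.bar(list(counts.keys()), list(counts.values()))
plt.xlabel("Year")
plt.ylabel("Number of movies")
plt.title("Douban Top 250 by release year")
plt.show()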