A very simple crawler program, suitable for beginners.
Source code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

html = urlopen("http://www.reibang.com")
bsobj = BeautifulSoup(html, "html.parser")
# print(bsobj.findAll("h4", {"class": "title"}))  # print the scraped elements

SqlConnect = pymysql.connect(host='localhost', user='root', password='123456',
                             db='liusenTestURL', charset='utf8mb4')
cur = SqlConnect.cursor()  # get a cursor

# write one article into the database
def writeDataBase(title, content, textURL):
    cur.execute("INSERT INTO jianshuTEXT (title,content,URL) VALUES (%s,%s,%s)",
                (title, content, textURL))
    cur.connection.commit()

# fetch the title and body text of one article page
def gainContent(contentHtml):
    contenthtml = urlopen(contentHtml)
    contentBsObj = BeautifulSoup(contenthtml, "html.parser")
    textTitle = contentBsObj.find('title').get_text()
    print('title : ' + textTitle)
    print('----------------------')
    textContent = contentBsObj.find("div", {"class": "show-content"}).get_text()
    # print(textContent)
    writeDataBase(textTitle, textContent, contentHtml)

try:
    for title in bsobj.find("ul", {"class": "article-list thumbnails"}).findAll("h4", {"class": "title"}):
        # print(title.find("a"))
        if 'href' in title.find("a").attrs:
            contenthtml = 'http://www.reibang.com' + title.find("a").attrs['href']
            print(contenthtml)
            gainContent(contenthtml)
finally:
    cur.close()
    SqlConnect.close()
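Note that the script assumes a jianshuTEXT table already exists in the liusenTestURL database. A minimal sketch of a schema that would satisfy the INSERT statement above (the column types and sizes are my assumptions, not from the original script):

import pymysql

# hypothetical schema matching the INSERT above; column types/sizes are guesses
conn = pymysql.connect(host='localhost', user='root', password='123456',
                       db='liusenTestURL', charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS jianshuTEXT (
            id      INT AUTO_INCREMENT PRIMARY KEY,
            title   VARCHAR(255),
            content TEXT,
            URL     VARCHAR(512)
        ) CHARACTER SET utf8mb4
    """)
conn.commit()
conn.close()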
Feel free to discuss and learn together.
有時(shí)候網(wǎng)頁編碼不是utf-8,這就不太好弄了.假如現(xiàn)在第三方請求庫用的是requests,那么請求下來的數(shù)據(jù)要做一個(gè)轉(zhuǎn)化過程,針對gb2312網(wǎng)頁編碼,現(xiàn)在要做如下處理,否則會(huì)中文亂碼
import requests

detailURL = "http://xxx.xxx.xxxxxx.com/"
headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder; any ordinary browser UA
html = requests.session().get(detailURL, headers=headers)
# requests falls back to ISO-8859-1 when the server sends no charset, so
# round-trip back to bytes, then decode with the charset declared in the page itself
jieshouText = html.text.encode('ISO-8859-1', "ignore").decode(
    requests.utils.get_encodings_from_content(html.text)[0], "ignore")
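If you would rather not pull the charset out of the page yourself, requests can also guess it from the response body. A sketch of that alternative (this swaps in requests' apparent_encoding, which is not what the snippet above used):

import requests

detailURL = "http://xxx.xxx.xxxxxx.com/"  # same placeholder URL as above
resp = requests.get(detailURL)
# apparent_encoding detects the charset from the raw bytes instead of
# trusting the (often missing) Content-Type header
resp.encoding = resp.apparent_encoding
jieshouText = resp.text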
Reference: "python的requests类抓取中文页面出现乱码" (garbled Chinese pages when scraping with Python's requests library), which explains this in detail:
http://www.zhetenga.com/view/python的requests类抓取中文页面出现乱码-0abbaa140.html