#from urllib import request
from urllib.request import urlopen
#from urllib.request import Request
from urllib import parse
from bs4 import BeautifulSoup
import re
import pymysql.cursors
# Scrape every /wiki/ link from the Wikipedia main page, print the ones that
# are not .jpg files, and store the link text and absolute URL in the MySQL
# table `urls` (columns `urlname`, `urlhref`).

# Download the page; the context manager closes the HTTP response promptly.
with urlopen("https://en.wikipedia.org/wiki/Main_Page") as page:
    resp = page.read().decode("utf-8")
soup = BeautifulSoup(resp, "html.parser")

# Key step of the crawl: collect every <a> whose href starts with /wiki/.
listUrls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(listUrls)

# Raw string so that "\." is a regex escape, not an invalid string escape.
jpg_suffix = re.compile(r"\.(jpg|JPG)$")

# The statement is constant, so build it once outside the loop.
sql = "insert into `urls`(`urlname`,`urlhref`)values(%s,%s)"

# Connect to the database once and reuse the connection for every link,
# instead of opening and closing a connection per link.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='wikiurl',
    charset='utf8mb4',  # utf8mb4 is the extended superset of utf-8
)
try:
    for url in listUrls:
        link = "https://en.wikipedia.org" + url["href"]
        if not jpg_suffix.search(url["href"]):
            print(url.get_text(), "-------", link)
        # NOTE: in this version the .jpg filter only affects printing;
        # every link, including .jpg ones, is still inserted below.
        with connection.cursor() as cursor:  # obtain a session cursor
            # Execute the parameterized statement.
            cursor.execute(sql, (url.get_text(), link))
        # Commit after each insert so earlier rows survive a later failure.
        connection.commit()
finally:
    connection.close()
# 創建表的時候注意 urlhref 的設置:255 字符長度可能不夠,建議設置為 1000。
# 同時注意主鍵和自增的設置。
# 修改後:不會把 .JPG 的條目存入數據庫中。
#from urllib import request
#from urllib.request import Request
import re
from urllib import parse
from urllib.request import urlopen

import pymysql.cursors
from bs4 import BeautifulSoup

# Revised scraper: collect every /wiki/ link from the Wikipedia main page and,
# for each link that is NOT a .jpg file, print it and store its text and
# absolute URL in the MySQL table `urls` (columns `urlname`, `urlhref`).
# Links ending in .jpg/.JPG are neither printed nor stored.

# Download the page; the context manager closes the HTTP response promptly.
with urlopen("https://en.wikipedia.org/wiki/Main_Page") as page:
    resp = page.read().decode("utf-8")
soup = BeautifulSoup(resp, "html.parser")
listUrls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(listUrls)

# Raw string so that "\." is a regex escape, not an invalid string escape.
jpg_suffix = re.compile(r"\.(jpg|JPG)$")

# The statement is constant, so build it once outside the loop.
sql = "insert into `urls`(`urlname`,`urlhref`)values(%s,%s)"

# Connect to the database once and reuse the connection for every link,
# instead of opening and closing a connection per link.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='wikiurl',
    charset='utf8mb4',  # utf8mb4 is the extended superset of utf-8
)
try:
    for url in listUrls:
        # The change in this version: .jpg entries are skipped entirely,
        # so they are never written to the database.
        if jpg_suffix.search(url["href"]):
            continue
        link = "https://en.wikipedia.org" + url["href"]
        print(url.get_text(), "-------", link)
        with connection.cursor() as cursor:  # obtain a session cursor
            # Execute the parameterized statement.
            cursor.execute(sql, (url.get_text(), link))
        # Commit after each insert so earlier rows survive a later failure.
        connection.commit()
finally:
    connection.close()