import pymysql
import requests as re
from bs4 import BeautifulSoup
# NOTE(review): this top-level loop executes at import time, BEFORE the
# getUrl/saverec definitions below it have run, so the first getUrl(...) call
# raises NameError. Move this block below the function definitions (or into a
# main() guarded by `if __name__ == "__main__":`) for the crawler to work.
try:
    # Walk every listing page of the "zcyw" news section (pages 1..388).
    for page in range(1, 389):  # 389
        listing_url = "http://by.cuc.edu.cn/zcyw/" + str(page)
        r = re.get(listing_url)  # `re` is the requests alias from the imports
        # print(r.text)
        soup = BeautifulSoup(r.text, 'html.parser')
        # Each headline sits in <h3 class="tit">.
        # BUGFIX: attrs was the set {'class','tit'}; bs4 expects a dict.
        titles = soup.find_all('h3', attrs={'class': 'tit'})
        print(page)  # progress indicator
        for t in titles:
            anchors = t.find_all('a')
            # Slice the href out of the raw tag markup:
            # '<a href="...URL..." target=...' — chars 9..(pos of 'target')-2.
            href_end = str(anchors[0]).find('target')
            news = str(anchors[0])[9:href_end - 2]
            getUrl(news)
            print(news)
            print(t.get_text())
except Exception as exc:
    # Was a bare `except:` that printed only "error"; include the cause so
    # failures (like the NameError noted above) are diagnosable.
    print("error", exc)
def getUrl(url):
    """Fetch one news article page and persist its fields via saverec().

    url: absolute URL of the article page
         (e.g. "http://www.cuc.edu.cn/zcyw/11584.html").
    Any failure (network, parsing, missing elements) is printed and swallowed
    so the caller's crawl loop keeps going.
    """
    try:
        r = re.get(url)  # `re` is the requests alias from the module imports
        soup = BeautifulSoup(r.text, 'html.parser')
        title = soup.find_all('h1')
        # BUGFIX: tag name was misspelled 'sapn', so find_all returned [] and
        # this function always failed with IndexError.  One query also replaces
        # the two identical find_all calls the original made.
        spans = soup.find_all('span')
        viewcount = soup.find_all('span', attrs={'id': 'hits'})
        # BUGFIX: attrs was the set {'class', 'con-area'}; bs4 expects a dict.
        newscontent = soup.find_all('article', attrs={'class': 'con-area'})
        ntitle = title[0].get_text()
        meta = spans[0].get_text()
        # The date is the first substring starting with '20'; the source name
        # sits between fixed offset 27 and the date.  Offsets are specific to
        # this site's markup — TODO confirm against the live pages.
        datestart = meta.find('20')
        nfrom = meta[27:datestart].strip()
        ndate = meta[datestart:datestart + 10]  # 'YYYY-MM-DD'
        ncount = viewcount[0].get_text()
        ncontent = newscontent[0].get_text()
        saverec(url, ntitle, nfrom, ndate, ncount, ncontent)
    except Exception as exc:
        # Was a bare `except:` that hid the actual failure.
        print("error", exc)
def saverec(url, ntitle, nfrom, ndate, ncount, ncontent):
    """Insert one crawled news record into the MySQL `cucnews` table.

    Parameters mirror the table columns.  Note the column/value order is
    (newsurl, title, newsfrom, newsdate, contents, newscount) — content
    comes before the view count in the INSERT.
    """
    # BUGFIX: pymysql.connect() positional arguments were removed in
    # PyMySQL 1.0 — pass connection parameters by keyword.
    db = pymysql.connect(host="localhost", user="root", password="2017",
                         database="engword", charset='utf8')
    try:
        # Parameterized query: the driver escapes every value (no SQL injection).
        with db.cursor() as cursor:
            cursor.execute(
                "INSERT INTO cucnews(newsurl,title,newsfrom,newsdate,contents,newscount)"
                " VALUES(%s,%s,%s,%s,%s,%s)",
                (url, ntitle, nfrom, ndate, ncontent, ncount))
        db.commit()
    except Exception as exc:
        # BUGFIX: pymysql connections have no .error() method — the old
        # handler itself raised AttributeError.  Report the real exception.
        print("error:", exc)
        db.rollback()
    finally:
        db.close()  # always release the connection, even on failure