from urllib import request,parse
import ssl,json,pymysql
class GaoKaoPaiSpider(object):
    """Crawl the gaokaopai.com ranking endpoint and store each row in MySQL.

    The spider POSTs a paging form to ``RANK_URL``, parses the JSON reply,
    assigns every returned entry a running rank number and inserts it into
    the ``dsa`` table of the ``gaokaopai`` database.
    """

    # Endpoint that answers the ranking form.
    RANK_URL = 'http://www.gaokaopai.com/rank-index.html'
    # Browser-like User-Agent used when the caller supplies no headers.
    USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    )
    # Amount added to form['start'] to request the following page.
    PAGE_STEP = 25

    def __init__(self):
        """Open the MySQL connection and initialise the rank counter."""
        # Database setup.
        # NOTE(review): credentials are hard-coded; consider loading them
        # from configuration or the environment.
        self.client = pymysql.Connect(
            host='127.0.0.1', user='root',
            password='ljh1314', database='gaokaopai',
            port=3306, charset='utf8'
        )
        # Cursor used for every INSERT.
        self.cursor = self.client.cursor()
        # Running rank assigned to each stored entry, starting at 1.
        self.rank = 1

    def send_request(self, form=None, headers=None):
        """Request ranking pages one after another until no data is left.

        :param form: mapping of POST fields; must contain a numeric
            ``'start'`` offset, which is advanced by ``PAGE_STEP`` per page.
            The caller's mapping is not modified.
        :param headers: optional request headers, applied to every page;
            a browser-like User-Agent is used when omitted or empty.
        :raises TypeError: if ``form`` is not supplied.
        """
        if form is None:
            raise TypeError(
                "form must be a mapping of POST fields including 'start'"
            )
        # Page on a private copy so the caller's dict is left untouched.
        page_form = dict(form)
        if not headers:
            headers = {'User-Agent': self.USER_AGENT}
        # NOTE(review): certificate verification is disabled here. It has no
        # effect while RANK_URL is plain http, but would be unsafe for https.
        ssl_context = ssl._create_unverified_context()

        # Iterate instead of recursing: one stack frame per page would hit
        # the recursion limit on a long ranking, and the recursive call used
        # to drop any custom headers after the first page.
        while True:
            form_data = parse.urlencode(page_form).encode('utf-8')
            req = request.Request(
                url=self.RANK_URL, headers=headers, data=form_data
            )
            # The context manager closes the HTTP response promptly.
            with request.urlopen(req, context=ssl_context) as response:
                html = response.read().decode('utf-8')
            # parse_response reports whether another page should be fetched.
            if not self.parse_response(html):
                break
            page_form['start'] = page_form['start'] + self.PAGE_STEP
        print('數據獲取完畢')

    def parse_response(self, html):
        """Store every entry of one response and report whether to continue.

        :param html: decoded response body; either a JSON string or an HTML
            document.
        :return: ``True`` if the page contained at least one entry (so a
            further page may exist), ``False`` otherwise.
        """
        # An HTML document instead of JSON means there is no more data.
        if '!DOCTYPE HTML' in html:
            return False
        # json.loads requires a JSON string.
        json_data = json.loads(html)
        jobs = json_data['data']['ranks']
        for job in jobs:
            # Keys double as the column names of the INSERT statement.
            job_info = {
                'top': self.rank,
                'title': job['uni_name'],
                'zongfen': self.get_default_num(
                    data=job['xiao_total'], isFloat=True
                ),
                'class': job['uni_type'],
                'dizhi': job['city_code'],
                'pici': '本科一批',
            }
            self.rank += 1
            print(job_info)
            self.save_data_to_mysql(job_info)
        # A non-empty page means a following page may exist.
        return len(jobs) > 0

    def get_default_num(self, data=None, defalut=0, isFloat=False):
        """Convert ``data`` to a number, falling back to a default.

        :param data: value to convert; falsy values (``None``, ``''``, ``0``)
            yield the default.
        :param defalut: fallback value (parameter name kept as-is for
            backward compatibility).
        :param isFloat: convert to ``float`` when true, otherwise to ``int``.
        :return: the converted number, or the default (as ``float`` when
            ``isFloat`` is true) if ``data`` is falsy or cannot be converted.
        """
        if data:
            try:
                return float(data) if isFloat else int(data)
            except (TypeError, ValueError):
                # Non-numeric placeholders in scraped data fall through to
                # the default instead of aborting the whole crawl.
                pass
        return float(defalut) if isFloat else defalut

    def save_data_to_mysql(self, job_info):
        """Insert one record into the ``dsa`` table.

        :param job_info: mapping of column name to value. The keys are
            interpolated into the SQL text as column names, so they must be
            trusted identifiers; the values are passed as query parameters.
        """
        columns = ','.join(job_info.keys())
        placeholders = ','.join(['%s'] * len(job_info))
        insert_sql = 'INSERT INTO dsa(%s) VALUES (%s)' % (columns, placeholders)
        try:
            self.cursor.execute(insert_sql, list(job_info.values()))
            self.client.commit()
            print('插入成功')
        except Exception as err:
            # Best effort: undo the failed insert, report it, and carry on
            # with the remaining records.
            self.client.rollback()
            print(err)
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == '__main__':
    spider = GaoKaoPaiSpider()
    # Form fields of the ranking request, as observed for this endpoint:
    #   otype: 2
    #   city: 0
    #   cate: 0
    #   batch_type: 0
    #   start: 25
    #   amount: 25
    # NOTE(review): the crawl begins at start=25 while the spider numbers
    # ranks from 1 -- confirm whether the first page should be start=0.
    form = {
        'otype': 2,
        'city': 0,
        'cate': 0,
        'batch_type': 0,
        'start': 25,
        'amount': 25,
    }
    spider.send_request(form=form)