一、數據存儲
#1)存入json
import json
def write_to_json(data):
    """Flatten the scraped pages and save all house records to ``lianjian.json``.

    Args:
        data: An iterable of pages; each page is an iterable of
            JSON-serializable house records.

    The file is written to the current working directory and overwritten
    on every call.
    """
    # Merge the nested pages -> houses structure into one flat list that
    # json can serialize directly.
    json_list = [house for houses in data for house in houses]

    # ensure_ascii=False keeps Chinese text readable instead of emitting
    # \uXXXX escapes, so the file must be opened as UTF-8 explicitly rather
    # than relying on the platform default encoding.
    with open('lianjian.json', 'w', encoding='utf-8') as fp:
        # json.dumps() turns the object into a string;
        # indent=2 pretty-prints it with two-space indentation.
        fp.write(json.dumps(json_list, indent=2, ensure_ascii=False))
import csv
#2) 存入csv
def write_to_csv(data):
    """Flatten the scraped pages and save all house records to ``lianjjia.csv``.

    Args:
        data: An iterable of pages; each page is an iterable of dicts, one
            per house.

    A fixed header row is written first, then one row per house holding the
    dict's values in insertion order.
    """
    # csv.writer.writerows() needs a two-dimensional list: one inner list
    # per house, containing only the values of the record.
    # NOTE(review): the column order relies on each dict's insertion order
    # matching the header below -- confirm against the code that builds them.
    csv_items = [list(house.values()) for houses in data for house in houses]

    # newline='' lets the csv module control line endings (otherwise blank
    # rows appear on Windows); UTF-8 is set explicitly so Chinese text does
    # not depend on the platform default encoding.
    with open('lianjjia.csv', 'w', newline='', encoding='utf-8') as fp:
        w = csv.writer(fp)
        w.writerow(['title', 'house', 'position', 'totalPrice', 'unitPrice', 'img'])
        # writerows() writes many rows at once from the 2-D list.
        w.writerows(csv_items)
import pymysql
#3) 存入數據庫
def write_to_mysql(data):
    """Insert every scraped house record into the MySQL table ``ershoufang``.

    Args:
        data: An iterable of pages; each page is an iterable of dicts with
            the keys read below.

    Raises:
        KeyError: If a house record lacks one of the keys read below.
        pymysql.MySQLError: On connection or statement failure.
    """
    # Placeholders are filled in by the driver, which escapes the values.
    # Quotes inside a title can no longer break the statement or inject SQL.
    # The first column is NULL so the table can assign its own id.
    sql = 'INSERT INTO ershoufang VALUES (NULL, %s, %s, %s, %s, %s, %s)'

    # Open a connection to the local MySQL server.
    # NOTE(review): credentials are hard-coded; move them to configuration.
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                         password='123456', db='lianjia', charset='utf8')
    try:
        # The cursor is what actually sends SQL statements to the server.
        cursor = db.cursor()
        try:
            for houses in data:
                for house in houses:
                    # NOTE(review): 'nuitPrince' looks like a typo for
                    # 'unitPrice'; kept because the dict schema is built
                    # elsewhere -- confirm against the producer.
                    params = (
                        house['title'],
                        house['house'],
                        house['position'],
                        house['totalPrice'],
                        house['nuitPrince'],
                        house['img'],
                    )
                    # Execute and commit each row.
                    cursor.execute(sql, params)
                    db.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when an insert fails.
        db.close()
二、ip代理
from urllib import request
import requests
# Fetch a Baidu "ip" search result page through an HTTPS proxy and save the
# raw response body to ./ip.html so the reported IP can be inspected.
url = 'https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=baidu&wd=ip&oq=ip&rsv_pq=b254df33000238fe&rsv_t=caee8Radj5kHT5OB1roVV9axqOakQtWZzVH9BYWRWLXkJtyQBHfRRlcDylg&rqlang=cn&rsv_enter=0&rsv_sug3=1&rsv_sug1=1&rsv_sug7=100&prefixsug=ip&rsp=0&rsv_sug4=1224'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
##########################################################
# Alternative using urllib (kept for reference):
# Create a request object
# request_obj = request.Request(url=url,headers=headers)
# Configure the proxy
# handler = request.ProxyHandler({"https":"113.200.56.13:8010"})
# Build an opener that carries the proxy handler
# opener = request.build_opener(handler)
# res = opener.open(request_obj)
####################################################################
# A timeout keeps the script from hanging forever when the proxy is dead.
res = requests.get(
    url=url,
    headers=headers,
    proxies={"https": "113.200.56.13:8010"},
    timeout=10,
)
# The with-statement closes the file on exit; no explicit close() is needed.
with open('./ip.html', 'wb') as fp:
    fp.write(res.content)
三、模擬登陸
- 模擬古詩文網登陸
<br>
使用Session狀態保持
import requests
from lxml import etree
# Simulated login to so.gushiwen.org using a requests.Session so that cookies
# set while loading the login page are sent again with the login POST.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
# URL of the login page.
page_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
# The login form needs: user name, password, static fields, two tokens and a
# captcha. The two tokens and the captcha are generated when the login page
# is requested, so that page has to be fetched first.
s = requests.Session()  # keeps cookies/state across requests
# Send the same User-Agent on every request made through this session
# (login page, captcha image and login POST), not only the first one.
s.headers.update(headers)
# The login page must be fetched through the same session that later posts
# the form, so the state set here is reused.
log_html = s.get(page_url, headers=headers, timeout=10)
login_tree = etree.HTML(log_html.text)
# xpath() returns a list; take the single attribute value, the same way the
# captcha src is extracted below.
token_a = login_tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
token_b = login_tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
# Build the absolute URL of the captcha image.
code_url = 'https://so.gushiwen.org' + login_tree.xpath('//*[@id="imgCode"]/@src')[0]
print(code_url)
# Download the captcha through the same session.
code_info = s.get(code_url, timeout=10)
# The with-statement closes the file on exit; no explicit close() is needed.
with open('./code.png', 'wb') as fp:
    fp.write(code_info.content)
# Captchas can be solved by a third-party recognition API, a self-trained
# model, or a human; here a human types in what the saved image shows.
code = input('請輸入你看到的驗證碼')
# URL the login form is submitted to.
log_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
# Request body of the login POST.
data = {
    '__VIEWSTATE': token_a,
    '__VIEWSTATEGENERATOR': token_b,
    'from': 'http://so.gushiwen.org/user/collect.aspx',
    'email': 'fanjianbo666@163.com',
    'pwd': '12345678',
    'code': code,
    'denglu': '登錄'
}
result = s.post(url=log_url, data=data, timeout=10)
print(result.text)
import requests
from bs4 import BeautifulSoup
# Simulated login to bbs.chinaunix.net: load the login page with a session,
# read the dynamically generated form action and formhash, then post the form.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
# URL of the login page.
login_page = 'http://bbs.chinaunix.net/member.php?mod=logging&action=login&logsubmit=yes'
# Fetch the login page through a session so its cookies are reused by the POST.
s = requests.Session()
page_html = s.get(login_page, headers=headers, timeout=10)
soup = BeautifulSoup(page_html.text, 'lxml')
# formhash and the form's action are generated per page load, so both are
# read from the page that was just fetched.
action = soup.select('form.cl')[0].get('action')
formhash = soup.select('[name="formhash"]')[0].get('value')
# The login endpoint is the site root plus the form's action.
login_url = 'http://bbs.chinaunix.net/' + action
data = {
    'formhash': formhash,
    # Fixed host typo ("chinanuix" -> "chinaunix").
    'referer': 'http://bbs.chinaunix.net/./',
    'username': 'Mrfan666',
    'password': 'f12345678',
    'loginsubmit': 'true',
    # Fixed field-name typo ("returen_type" -> "return_type").
    'return_type': ''
}
r = s.post(url=login_url, headers=headers, data=data, timeout=10)
print(r.text)