1. requests syntax
"""__author__= 雍新有"""
"""
解析網(wǎng)站乍炉,獲取源碼
"""
import requests
# 1. Syntax
# requests.request(method, url, headers=None, proxies=None, ...)
# requests.get(url) is equivalent to requests.request('get', url)
# Pass in the url parameter and fetch the source
url = 'http://httpbin.org/get'
response = requests.get(url)
# response = requests.request('get', url)
# print(response.text)
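# httpbin.org echoes the request back as JSON, so response.json() is a handy
# way to inspect what was actually sent (a minimal sketch):
data = response.json()      # parse the JSON body into a dict
print(data['url'])          # the URL httpbin received
print(data['headers'])      # the headers requests sent by default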
# Fetch the source of the Boss Zhipin listing page
# Pass in the url and headers parameters
# Build a User-Agent into headers; the default User-Agent is python-requests/<version>
url = 'https://www.zhipin.com/c101270100/?query=python&page=%s&ka=page-%s'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
response = requests.get(url, headers=headers)
# print(response.text)
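# To see the effect of the header, http://httpbin.org/get echoes back the
# User-Agent it received; a minimal sketch comparing default vs. custom:
echo = requests.get('http://httpbin.org/get')
print(echo.json()['headers']['User-Agent'])    # python-requests/<version>
echo = requests.get('http://httpbin.org/get', headers=headers)
print(echo.json()['headers']['User-Agent'])    # the Chrome UA above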
# Pass in the url, headers, and proxies parameters
# url = 'https://www.zhipin.com/c101270100/?query=python&page=%s&ka=page-%s'
# url = 'http://httpbin.org/get'
# proxies = {
# 'http': 'http://121.233.206.189:9999'
# }
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
# }
# response = requests.get(url, headers=headers, proxies=proxies)
# print(response.text)
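# When a proxy is configured, http://httpbin.org/ip reports the origin IP the
# server saw, which confirms the proxy is really in use (a sketch, commented
# out like the block above since the sample proxy may no longer be alive):
# resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
# print(resp.json()['origin'])    # should be the proxy's IP, not yours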
url = 'http://www.baidu.com'
response = requests.get(url)
# content is the response body as bytes
print(response.content)
# text is the response body decoded to str
print(response.text)
# status_code is the HTTP response status code
print(response.status_code)
# cookies returns the cookies set by the server
print(response.cookies)
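# Caveat: requests guesses the encoding from the response headers, and for
# pages like Baidu's the guess can yield mojibake in .text; setting
# response.encoding explicitly fixes it (a minimal sketch):
response.encoding = 'utf-8'    # Baidu's homepage is UTF-8 encoded
print(response.text[:200])     # now decodes cleanly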
# Pass in the url plus the query-string parameters (params)
url = 'http://www.baidu.com/s'
params = {
'wd': 'python'
}
response = requests.get(url, params=params)
print(response.text)
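# The params dict is URL-encoded into the query string; response.url shows
# the final URL that was actually requested:
print(response.url)    # e.g. http://www.baidu.com/s?wd=python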
# Other request methods; the difference from get is that they carry a request
# body (data for form fields, json for a JSON body) instead of query params:
# requests.post(url, data=None, json=None, headers=None, proxies=None)
# requests.put(url, data=None, json=None, headers=None, proxies=None)
# requests.patch(url, data=None, json=None, headers=None, proxies=None)
# requests.delete(url, data=None, json=None, headers=None, proxies=None)
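# The practical difference between data and json is the body encoding: data
# sends form fields, json serializes a JSON body and sets the Content-Type
# header. A minimal sketch against httpbin, which echoes both back:
r1 = requests.post('http://httpbin.org/post', data={'wd': 'python'})
print(r1.json()['form'])    # {'wd': 'python'} - form-encoded body
r2 = requests.post('http://httpbin.org/post', json={'wd': 'python'})
print(r2.json()['json'])    # {'wd': 'python'} - JSON body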
1.1 requests: scrape the Maoyan top-100 chart and save it to a database
"""__author__= 雍新有"""
from db_helper import get_conn, execute_sql, close_conn
"""
爬取貓眼排行榜前100的電影信息
"""
import requests
import re
def parse_html(html):
    new_data = []
    # Groups captured in order: ranking index, title attribute, image data-src,
    # starring actors, release time, integer and fraction parts of the score
    patterns = re.compile('<dd>.*?<i class="board-index board-index-.*?">(.*?)</i>.*?<a .*? title="(.*?)" .*?>.*?<img .*?>.*?<img data-src="(.*?)" .*?>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>.*?<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>.*?</dd>', re.S)
    result = patterns.findall(html)
    # str.strip() removes the given characters (whitespace/newlines by default)
    # from both ends of a string only; it never touches the middle.
    for item in result:
        data = {
            'index': item[0].strip(),
            'title': item[1].strip(),   # the <a> tag's title attribute
            'img': item[2].strip(),     # the <img> tag's data-src URL
            'actors': item[3].strip(),
            'time': item[4].strip(),
            'score': item[5] + item[6],
        }
        new_data.append(data)
    return new_data
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text
def save_result(result):
    # Persist each movie to MySQL; a parameterized variant follows below
    conn = get_conn()
    for item in result:
        sql = f'insert into movies ' \
              f'(paiming, name, src, actor, time, score) ' \
              f'values ("{item["index"]}", "{item["title"]}", "{item["img"]}", ' \
              f'"{item["actors"]}", "{item["time"]}", {item["score"]});'
        print(sql)
        execute_sql(sql, conn)
    close_conn(conn)
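# Interpolating scraped values straight into SQL breaks on embedded quotes and
# is an injection risk; pymysql supports parameterized queries via
# cursor.execute(sql, args). A sketch (save_result_safe is a hypothetical
# name, assuming the same movies table):
def save_result_safe(result):
    conn = get_conn()
    cursor = conn.cursor()
    sql = ('insert into movies (paiming, name, src, actor, time, score) '
           'values (%s, %s, %s, %s, %s, %s)')
    for item in result:
        cursor.execute(sql, (item['index'], item['title'], item['img'],
                             item['actors'], item['time'], item['score']))
    conn.commit()    # commit once after all inserts
    close_conn(conn)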
def main():
    url = 'https://maoyan.com/board/4?offset={}'
    for i in range(10):
        real_url = url.format(i * 10)
        # Fetch and parse the source of this page of the chart
        html = get_html(real_url)
        # print(html)
        result = parse_html(html)
        # Store the data (MySQL, MongoDB, Redis, ...)
        save_result(result)
        # print(result)


if __name__ == '__main__':
    main()
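# The script assumes a movies table already exists in the 1902spider database;
# a sketch that creates one matching the insert above (the column types are
# assumptions inferred from the scraped data):
ddl = ('create table if not exists movies ('
       'paiming varchar(10), name varchar(100), src varchar(255), '
       'actor varchar(255), time varchar(50), score float)')
conn = get_conn()
execute_sql(ddl, conn)
close_conn(conn)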
2. bs4 syntax
"""__author__= 雍新有"""
from bs4 import BeautifulSoup
html = """
<html><head><title>學(xué)習(xí)爬蟲好開心</title></head>
<body>
<p class="title" name="dromouse"><b>( ̄TT ̄)筆芯</b></p>
<p class="story">喵了個(gè)貓
<a class="sister" id="link1">汪汪汪晰甚,汪星人</a> and
<a class="sister" id="link2">喵喵喵衙传,喵星人</a>
最后變成一鍋湯</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# Tag selectors, standard selectors (find/find_all), CSS selectors
# 1. Tag selectors
# soup.<tag> returns the tag itself (the tag plus everything inside it)
print(soup.title)
print(soup.title.string)
print(soup.p)
print(soup.p.string)
# soup.<tag> returns only the first matching tag
print(soup.a)
print(soup.a.string)
# soup.<tag>['attr'] and soup.<tag>.attrs['attr'] both read an attribute
# (these anchors have no href, so read id instead)
print(soup.a['id'])
print(soup.a.attrs['id'])
# Sibling nodes
# next_sibling: the first sibling after the current node
# next_siblings: all siblings after the current node
siblings = soup.p.next_siblings
for item in siblings:
    print(item)
# previous_sibling: the first sibling before the current node
siblings = soup.a.previous_sibling
print(siblings)
# previous_siblings: all siblings before the current node
siblings = soup.a.previous_siblings
for item in siblings:
    print(item)
# Parent and ancestor nodes
# print('=========')
# print(soup.a.parents)
# for item in soup.a.parents:
#     print(item)
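# For contrast, a quick sketch of .parent vs. .parents on the same soup:
print(soup.a.parent.name)                  # p - the immediate parent tag
print([p.name for p in soup.a.parents])    # ['p', 'body', 'html', '[document]']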
# 2. Standard selectors: find / find_all
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
print('/**************************')
soup = BeautifulSoup(html, 'lxml')
# Syntax: soup.find(name, attrs, text)
# name: query by tag name
print(soup.find('ul'))
print(soup.find_all('ul'))
# attrs: query by attribute
print(soup.find_all(attrs={'id': 'list-2'}))
print(soup.find_all(attrs={'class': 'list'}))
print(soup.find_all(id='list-2'))
print(soup.find_all(class_='list'))
# text: query by text content
print('===========')
print(soup.find_all(text='Foo'))
# 3. CSS selectors
print('======')
print(soup.select('ul'))
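# select() accepts any CSS selector; a few more common forms on the same soup
# (a minimal sketch):
print(soup.select('.panel-heading'))    # by class
print(soup.select('#list-2'))           # by id
print(soup.select('ul li'))             # descendant selector
for li in soup.select('li'):
    print(li['class'], li.get_text())   # attributes and text of each result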
3. MySQL connection (the db_helper module imported in section 1.1)
"""__author__= 雍新有"""
import pymysql
def get_conn():
    # Connect to the database
    conn = pymysql.connect(host='47.100.164.252', user='root',
                           password='123456', database='1902spider',
                           port=3306, charset='utf8')
    return conn


def execute_sql(sql, conn):
    # Get a cursor and run the statement
    cursor = conn.cursor()
    cursor.execute(sql)
    # Commit, otherwise nothing is written to the database
    conn.commit()


def close_conn(conn):
    # Close the connection
    conn.close()
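# A quick usage sketch of the three helpers together (the movies table is the
# one assumed in section 1.1):
conn = get_conn()
execute_sql('insert into movies (paiming, name) values ("1", "test")', conn)
close_conn(conn)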
4. Scraping Toutiao
"""__author__= 雍新有"""
import requests
import re
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    # The returned source can be parsed with regex or bs4
    response = requests.get(url, headers=headers)
    return response.text


def parse_html(html):
    # Raw string so the \. escape reaches the regex engine unchanged
    patterns = re.compile(r'url_list.*?(http.*?p[0-9]\.pstatp\.com.*?origin.*?pgc-image.*?)}', re.S)
    result = patterns.findall(html)
    # Deduplicate: the same image URL appears in several url_list entries
    result = set(result)
    return result
def main():
    url = 'https://www.toutiao.com/a6714556525513277964/'
    html = get_html(url)
    # print(html)
    result = parse_html(html)
    for item in result:
        # The URLs are JSON-escaped in the page source; undo the escaping
        item1 = item.replace('\\\\\\u002F', '/')
        item2 = item1.replace('\\"', '')
        print(item2)


if __name__ == '__main__':
    main()
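# To actually save the images, each cleaned URL can be fetched and written as
# bytes; a minimal sketch (the filename scheme is an assumption):
def save_images(urls):
    for i, url in enumerate(urls):
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        with open(f'image_{i}.jpg', 'wb') as f:
            f.write(resp.content)    # response.content is raw bytes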