這是個python簡易爬蟲,主要使用了requests和re模塊,適合入門。
出處:https://github.com/jingsupo/python-spider/blob/master/day03/04neihanba.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import io
import re
import time

import requests
class Neihanspider(object):
    """Scrape joke list pages from neihan8.com and append the text to a file.

    Each list page is downloaded with ``requests``, decoded from GBK and parsed
    in two regex passes: the first extracts every ``<div class="f18 mb20">``
    fragment, the second strips tags, character entities and whitespace from
    each fragment. Text is kept as unicode throughout and only encoded to
    UTF-8 when it is written, so the class runs on both Python 2 and Python 3.
    """

    def __init__(self, output_file='04neihanba.txt'):
        """Set up the URL prefix, request headers and the two parsing patterns.

        Args:
            output_file: Path of the text file that ``write_file`` appends to.
        """
        self.base_url = 'http://www.neihan8.com/article/list_5_'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.output_file = output_file
        # First pass: grab each joke container. The markup must be matched
        # literally; re.S lets ``.`` cross the line breaks inside the div.
        self.first_pattern = re.compile(u'<div class="f18 mb20">.*?</div>', re.S)
        # Second pass: drop every tag, character entity and whitespace
        # character. The full-width space (U+3000) is listed explicitly
        # because ``\s`` does not cover it for Python 2 patterns compiled
        # without re.UNICODE.
        self.second_pattern = re.compile(u'<.*?>|&.*?;|\\s|\u3000')

    def send_request(self, url):
        """Download *url* and return the raw response body.

        Sleeps two seconds before every request to throttle the crawl.

        Args:
            url: Address of the page to fetch.

        Returns:
            The undecoded response bytes, or ``None`` if the request failed
            (the error is printed).
        """
        time.sleep(2)
        try:
            # The timeout keeps one unresponsive connection from hanging the run.
            response = requests.get(url, headers=self.headers, timeout=10)
        except requests.RequestException as e:
            print(e)
            return None
        return response.content

    @staticmethod
    def _to_text(fragment):
        """Return *fragment* as unicode text, decoding UTF-8 bytes if needed."""
        if isinstance(fragment, bytes):
            return fragment.decode('utf-8')
        return fragment

    def write_file(self, data, page):
        """Append one page worth of jokes to ``self.output_file``.

        A header with the page number is written between two dashed rules,
        followed by every fragment after the second-pass clean-up, each one
        terminated by a blank line. The header is also printed.

        Args:
            data: Iterable of raw HTML fragments (unicode text, or UTF-8
                encoded bytes) as produced by the first-pass pattern.
            page: Page number shown in the header.
        """
        title = u'第' + str(page) + u'頁(yè)的段子\n'
        print(title)
        rule = u'-' * 10 + u'\n'
        chunks = [rule, title, rule]
        for fragment in data:
            # Second pass: keep only the bare joke text.
            chunks.append(self.second_pattern.sub(u'', self._to_text(fragment)))
            # Blank line after every joke.
            chunks.append(u'\n\n')
        with io.open(self.output_file, 'a', encoding='utf-8') as f:
            f.write(u''.join(chunks))

    def start_work(self, first_page=1, last_page=4):
        """Crawl the list pages and write each one to the output file.

        Pages whose download failed are skipped instead of aborting the run.

        Args:
            first_page: Number of the first list page to fetch.
            last_page: Number of the last list page to fetch (inclusive).
        """
        for page in range(first_page, last_page + 1):
            url = self.base_url + str(page) + '.html'
            raw = self.send_request(url)
            if raw is None:
                # send_request already reported the failure.
                continue
            # The site serves GBK; undecodable bytes are replaced so a single
            # bad byte does not abort the crawl.
            html = raw.decode('gbk', 'replace')
            fragments = self.first_pattern.findall(html)
            self.write_file(fragments, page)
# Script entry point: crawl the default page range when run directly.
if __name__ == '__main__':
    Neihanspider().start_work()