# 簡易爬蟲代碼實現——基於 Python 2.7
# -*- coding:utf-8 -*-
import contextlib
import os
import time
import urllib
import urllib2
class Tiebaspider(object):
    """Download consecutive list pages of one Baidu Tieba forum to disk.

    Every page is requested with browser-like headers and stored as
    ``<output_dir>/<page>頁.html``. Pages that cannot be fetched are
    reported and skipped instead of aborting the whole run.
    """

    # The ``pn`` query parameter advances by this much per page
    # (presumably the number of threads Tieba lists per page -- confirm).
    PAGE_SIZE = 50

    def __init__(self, tieba_name, start_page, end_page,
                 output_dir='tieba', delay=2):
        """Store the crawl parameters.

        Args:
            tieba_name: Forum name, sent as the ``kw`` query parameter.
            start_page: First page number to fetch (page 1 maps to ``pn=0``).
            end_page: Last page number to fetch, inclusive.
            output_dir: Directory the pages are written to; it is created
                on demand.
            delay: Seconds to sleep before each request.
        """
        self.base_url = 'https://tieba.baidu.com/f?'
        self.name = tieba_name
        self.start = start_page
        self.end = end_page
        self.output_dir = output_dir
        self.delay = delay
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/59.0.3071.86 Safari/537.36'
            ),
            'Connection': 'keep-alive',
        }

    def send_request(self, url):
        """Fetch ``url`` and return the response body.

        Returns:
            The raw body when the response code is 200; ``None`` when the
            request raised or answered with any other code.
        """
        time.sleep(self.delay)
        try:
            request = urllib2.Request(url, headers=self.headers)
            # closing() releases the connection even if read() raises.
            with contextlib.closing(urllib2.urlopen(request)) as response:
                if response.code == 200:
                    return response.read()
        except Exception as e:
            # Best effort: report the failure and let the caller skip the page.
            # print(...) with a single argument behaves the same on Python 2.
            print(e)
        return None

    def write_data(self, data, page):
        """Save the body of page number ``page`` below ``output_dir``.

        Args:
            data: Raw response body to store.
            page: Page number, used as the file name prefix.
        """
        self._ensure_output_dir()
        filename = os.path.join(self.output_dir, str(page) + '頁.html')
        print('%s正在下載...' % filename)
        # Binary mode keeps the downloaded bytes untouched on every platform.
        with open(filename, 'wb') as f:
            f.write(data)

    def start_work(self):
        """Fetch and store every page from ``start`` to ``end`` inclusive."""
        for page in range(self.start, self.end + 1):
            data = self.send_request(self._build_url(page))
            if data is None:
                # Nothing was downloaded; writing None would raise TypeError.
                print('第%d頁下載失敗,已跳過' % page)
                continue
            self.write_data(data, page)

    def _build_url(self, page):
        """Return the list URL for ``page`` (URL-encoded query on base_url)."""
        # A list of pairs keeps the parameter order stable across runs.
        params = [('kw', self.name), ('pn', (page - 1) * self.PAGE_SIZE)]
        return self.base_url + urllib.urlencode(params)

    def _ensure_output_dir(self):
        """Create ``output_dir`` (including parents) unless it already exists."""
        try:
            os.makedirs(self.output_dir)
        except OSError:
            # An existing directory is fine; anything else is a real error.
            if not os.path.isdir(self.output_dir):
                raise
if __name__ == '__main__':
    # Ask for the forum name and the inclusive page range, then crawl it.
    forum = raw_input('請輸入貼吧名字:')
    first_page = int(raw_input('開始頁:'))
    last_page = int(raw_input('結束頁:'))
    Tiebaspider(forum, first_page, last_page).start_work()
# 哈哈哈