# Python version: 3.5
# File: douban_self_info.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request,FormRequest
import urllib.request
class DbSpider(scrapy.Spider):
    """Log in to Douban and print the title of the page reached after login.

    Flow: fetch the login page, solve the captcha by hand when one is
    shown, submit the login form, then print the ``<title>`` of the page
    the site redirects to.
    """

    name = "db"
    allowed_domains = ["douban.com"]
    # start_urls = ['http://douban.com/']
    # Browser-like User-Agent sent with the login form submission.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}

    def start_requests(self):
        """Return the initial request for the login page.

        The ``cookiejar`` meta key names the cookie session; ``parse``
        forwards the same value so the login cookies are reused.
        """
        return [
            Request(
                "https://accounts.douban.com/login",
                callback=self.parse,
                meta={"cookiejar": 1},
            )
        ]

    def parse(self, response):
        """Build and submit the login form found in ``response``.

        If the page contains a captcha image, the image is downloaded to a
        hard-coded local path and the solution is read from standard input
        (this blocks until the user answers).

        Returns:
            A one-element list holding the login ``FormRequest``; its
            response is handled by ``next``.
        """
        data = {
            "form_email": "xxxxx",
            "form_password": "xxxxx",
        }

        # "//img" (descendant axis) locates the captcha image anywhere in the page.
        captcha = response.xpath("//img[@id='captcha_image']/@src").extract()
        if captcha:
            print("此時有驗證碼")
            # NOTE: urlretrieve does not create directories; E:/pictest must exist.
            localpath = "E:/pictest/captchar.jpg"
            urllib.request.urlretrieve(captcha[0], filename=localpath)
            print("請查看本地驗證碼圖片并輸入驗證碼")
            # input() already returns str, so no conversion is needed.
            data["captcha-solution"] = input()
        else:
            print("此時沒有驗證碼")

        # Page to return to after a successful login.
        data["redir"] = "https://www.douban.com/people/82984134/"

        print("登錄中。。。。。。")
        return [
            FormRequest.from_response(
                response,
                # Keep using the cookie session started in start_requests.
                meta={"cookiejar": response.meta["cookiejar"]},
                headers=self.headers,
                formdata=data,
                callback=self.next,
            )
        ]

    def next(self, response):
        """Print the title of the page returned after the login form is submitted."""
        print("此時已經登錄完成并爬取了個人中心的數據")
        title = response.xpath("/html/head/title/text()").extract()
        # Guard against pages without a <title> instead of raising IndexError.
        if title:
            print(title[0])
        # The original also sketched (disabled) reading a "db_id" value via:
        #   //div[@class='infobox']/div[@class='p1']/text()