聲明:
1、本博客所涉及爬蟲及其爬取的數(shù)據(jù)僅用于爬蟲技術(shù)研究,請(qǐng)勿用于商業(yè)用途。
2褪迟、本博文中涉及的網(wǎng)址均已隱去關(guān)鍵字段冗恨。
基于python3.5版本
源代碼
spidertest.py
# -*- coding: utf-8 -*-
import scrapy
from testscrapy01.items import QIBEBT_Zhuanli_Item
from scrapy.selector import Selector
import re
class QibebtZhuanliSpider(scrapy.Spider):
    """Scrape the institute's patent listing pages.

    Crawls the paginated patent index (index.html plus
    index_1.html .. index_14.html) and yields one item per table row.
    """

    name = "xxxx_zhuanli_01"
    allowed_domains = ["qibebt.cas.cn"]
    # First page plus the 14 numbered continuation pages.
    start_urls = ['http://www.xxxx.cas.cn/kycg/zl/index.html']
    for i in range(1, 15):
        start_urls.append("http://www.xxxx.cas.cn/kycg/zl/index_" + str(i) + ".html")

    def parse(self, response):
        print("目標(tāo)網(wǎng)頁(yè)為:" + response.url)
        hxs = Selector(response)
        # Every data row of the patent table carries this background colour.
        # NOTE: the pasted source read "http://tr[...]" — the blog engine
        # auto-linkified the XPath's leading "//"; restored here.
        rows = hxs.xpath("//tr[@bgcolor='#f2f7f1']")
        for row in rows:
            # Fresh item per row; reusing a single item across yields would
            # make every yielded reference alias the same object.
            qitem = QIBEBT_Zhuanli_Item()
            # Patent name (anchor text in the fixed-height cell).
            patent_name = row.xpath("td[@height='26']/a[@target='_blank']/text()").extract()[0]
            # Patent category.
            patent_type = row.xpath("td[@align='center']/text()").extract()[0]
            # Application number.
            number = row.xpath("td[@align='center']/text()").extract()[1]
            # Application date.
            apply_date = row.xpath("td[@align='center']/text()").extract()[2]
            # Grant date — missing for rows whose patent is not yet granted,
            # so only IndexError (short extract list) is handled.
            try:
                sq_date = row.xpath("td[@align='center']/text()").extract()[3]
            except IndexError:
                sq_date = "無(wú)"
            # Inventor cell contains a "\xa0" (non-breaking space) separating
            # the first inventor from the rest; repr() turns it into the
            # literal backslash escape so the string can be split on it.
            inventor = repr(row.xpath("td[@align='left']/text()").extract()[0])
            inventor = re.sub(r"'", "", inventor)
            main_inventor = inventor.split('\\xa0')[0]
            other_inventors = inventor.split('\\xa0')[1]
            # NOTE(review): the extracted values (patent_name, patent_type,
            # number, apply_date, sq_date, main_inventor, other_inventors)
            # are never stored into qitem, so an empty item is yielded.
            # The field names live in testscrapy01.items — populate the item
            # here once those names are confirmed.
            yield qitem