1.新建項目
scrapy startproject cnblog
2.pycharm 打開項目
image.png
3.新建spider
image.png
新建main.py
# main.py — launch the spider from inside the IDE instead of the shell.
from scrapy import cmdline

if __name__ == "__main__":
    # Equivalent to typing "scrapy crawl cnblog" on the command line;
    # the guard keeps the crawl from starting on an accidental import.
    cmdline.execute("scrapy crawl cnblog".split())
爬蟲代碼
import scrapy
from cnblog.items import CnblogItem


class Cnblog_Spider(scrapy.Spider):
    """Crawl the cnblogs.com front page and extract post titles and links."""

    name = "cnblog"
    # Fixed: the site is cnblogs.com (with an "s"). The original value
    # "cnblog.com" did not match start_urls, so any request the spider
    # tried to follow would be dropped by the offsite filter.
    allowed_domains = ["cnblogs.com"]
    start_urls = [
        'https://www.cnblogs.com/',
    ]

    def parse(self, response):
        """Yield one item holding parallel lists of all post titles/links.

        title[i] is the anchor text of post i, link[i] its href.
        """
        item = CnblogItem()
        item['title'] = response.xpath('//a[@class="titlelnk"]/text()').extract()
        item['link'] = response.xpath('//a[@class="titlelnk"]/@href').extract()
        yield item
item代碼
import scrapy


class CnblogItem(scrapy.Item):
    """Container for one batch of scraped front-page posts.

    The spider fills both fields with parallel lists produced by
    ``extract()``: ``title[i]`` is the text of post *i* and ``link[i]``
    its URL.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
setting
# Scrapy settings for the cnblog project.
BOT_NAME = 'cnblog'

SPIDER_MODULES = ['cnblog.spiders']
NEWSPIDER_MODULE = 'cnblog.spiders'

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Real-browser User-Agent so the site does not reject the default
    # scrapy agent string.
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
}

# Enabled item pipelines (lower number runs first).
# Fixed: both pipelines originally had priority 300, which leaves their
# relative execution order undefined; distinct values make it deterministic.
ITEM_PIPELINES = {
    'cnblog.pipelines.FilePipeline': 300,   # save items to a txt file
    'cnblog.pipelines.mysqlPipeline': 400,  # save items to MySQL
}
4.存儲成text
class FilePipeline(object):
    """Persist scraped titles and links to a local text file.

    Fixed: the original opened ``cnblog.txt`` with mode ``'w'`` inside
    ``process_item``, so every call truncated the file and only the last
    item batch survived (it also called ``f.close()`` redundantly inside
    the ``with`` block). The file is now opened once per crawl via the
    standard scrapy ``open_spider``/``close_spider`` hooks.
    """

    def open_spider(self, spider):
        # One handle for the whole crawl; UTF-8 for the Chinese titles.
        self.file = open('cnblog.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        """Append one "title:link" line per scraped post; return the item."""
        lines = (title + ':' + link + '\n'
                 for title, link in zip(item['title'], item['link']))
        self.file.write(''.join(lines))
        return item
image.png