The goal is to crawl the articles under the Weight Loss section of womenshealthmag.com. (Written for my own report; not recommended as study material.)
The fields collected are {title, subtitle, author, profile photo, date, image, body text}.
Without further ado, here is the code. Any Scrapy project follows roughly the same structure.
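For reference, the listings below assume the standard layout that scrapy startproject demo generates (the project name demo comes from the imports and the pipeline path; only the files touched here are shown):

demo/
    scrapy.cfg
    demo/
        items.py
        pipelines.py
        settings.py
        spiders/
            mySpider.py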
mySpider.py
import scrapy
from bs4 import UnicodeDammit
from demo.items import WeightItem


class MySpider(scrapy.Spider):
    name = "mySpider"
    source_url = 'https://www.womenshealthmag.com/weight-loss/'

    def start_requests(self):
        # request the Weight Loss index page
        url = MySpider.source_url
        yield scrapy.Request(url=url, callback=self.parse)
        print("request sent")

    def parse(self, response):
        try:
            # guess the encoding and decode the page body
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            print("page source received")
            # links to the individual articles on the index page
            links = selector.xpath(
                "//div[position()>2][starts-with(@class,'simple-item grid-simple-item ')]"
                "/a[@class='simple-item-image item-image']")
            print("index page links extracted")
            for link in links:
                newslink = link.xpath("./@href").extract_first()
                # urljoin handles both relative and absolute hrefs
                yield scrapy.Request(url=response.urljoin(newslink), callback=self.parse1)
        except Exception as err:
            print(err)

    def parse1(self, response):
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        # body text: join all paragraphs into one string
        text = selector.xpath("//p[@class='body-text']/text()").extract()
        text = "\n".join(text)
        # alternative kept from earlier experiments:
        # text = selector.xpath("//p[@class='body-text']/text()")[0]
        # text = text.xpath("string(.)")
        pic = selector.xpath("/html/body/div[2]/div[4]/div[1]/div[1]/div/img/@data-src").extract_first()
        header = selector.xpath("//header[@class='content-header standard-header']/div[@class='content-header-inner']")
        title = header.xpath(".//h1/text()").extract_first()
        subtitle = header.xpath(".//p/text()").extract_first()
        profilephoto = header.xpath(".//img/@data-src").extract_first()
        author = header.xpath(".//span[@class='byline-name']/text()").extract_first()
        date = header.xpath(".//time[@class='content-info-date']/text()").extract_first()
        item = WeightItem()
        item["title"] = title.strip() if title else ""
        item["subtitle"] = subtitle.strip() if subtitle else ""
        item["author"] = author.strip() if author else ""
        item["date"] = date.strip() if date else ""
        item["profilephoto"] = profilephoto.strip() if profilephoto else ""
        item["text"] = text.strip() if text else ""
        item["pic"] = pic.strip() if pic else ""
        yield item
item.py
import scrapy


class WeightItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    subtitle = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    profilephoto = scrapy.Field()
    text = scrapy.Field()
    pic = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class WeightPipeline(object):
    def open_spider(self, spider):
        print("opening spider")
        try:
            self.con = pymysql.connect(host="127.0.0.1",
                                       port=3306,
                                       user="root",
                                       passwd="password",  # your MySQL password
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            print("connected to MySQL")
            try:
                self.cursor.execute("create database mydb")
                print("created database mydb")
            except:
                pass
            self.con.select_db("mydb")
            try:
                self.cursor.execute("drop table WomansHealth")
                print("dropped the old table")
            except:
                pass
            try:
                sql = """
                create table WomansHealth(
                    Id varchar(8) primary key,
                    Title varchar(512),
                    Subtitle varchar(256),
                    Profilephoto varchar(256),
                    Author varchar(64),
                    Date varchar(16),
                    Text text,
                    Pic varchar(256))
                """
                self.cursor.execute(sql)
                print("created table WomansHealth")
            except:
                self.cursor.execute("delete from WomansHealth")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
        print(self.count)  # note: this never printed for me

    def process_item(self, item, spider):
        try:
            print("----------------------")
            print("title: " + item["title"])
            print("subtitle: " + item["subtitle"])
            print("author: " + item["author"])
            print("date: " + item["date"])
            print("profile photo: " + item["profilephoto"])
            print("text: " + item["text"])
            print("picture: " + item["pic"])
            print("---------------------")
            if self.opened:
                self.count += 1
                print(self.count)
                # left-pad the counter to an 8-character Id
                ID = str(self.count)
                while len(ID) < 8:
                    ID = "0" + ID
                self.cursor.execute(
                    "insert into WomansHealth(Id,Title,Subtitle,Profilephoto,Author,Date,Text,Pic) "
                    "values (%s,%s,%s,%s,%s,%s,%s,%s)",
                    (ID, item["title"], item["subtitle"], item["profilephoto"],
                     item["author"], item["date"], item["text"], item["pic"]))
        except Exception as err:
            print(err)
        return item
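A side note on the Id column: the while loop left-pads the row counter to eight characters so it fits the varchar(8) primary key. The built-in zfill does the same thing in one line:

ID = str(self.count).zfill(8)  # equivalent to the while loop above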
run.py
from scrapy import cmdline
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())
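run.py just launches the crawler from a script, which is convenient inside an IDE. Running it is equivalent to typing the command in a terminal from the project root, and -s LOG_ENABLED=False switches off Scrapy's log so only the print output remains:

python run.py
# or, equivalently:
scrapy crawl mySpider -s LOG_ENABLED=False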
Add the following to settings.py:
ITEM_PIPELINES = {
    'demo.pipelines.WeightPipeline': 300,
}
然后就存到數(shù)據(jù)庫(kù)了