# -*- coding: utf-8 -*-
import scrapy
class TestSpider(scrapy.Spider):
    """Crawl yeves.cn: collect article titles/links from the index page,
    then follow each link and yield a detail dict per article."""

    name = 'test'
    allowed_domains = ['yeves.cn']
    start_urls = ['https://yeves.cn/']
    base_domain = 'https://yeves.cn{}'  # template for building absolute article URLs

    def parse(self, response):
        """Parse the index page: for each article extract the title and
        href, then request the detail page, carrying the title in meta."""
        articles = response.xpath('//*[@id="article"]//div')
        for article in articles:
            title = article.xpath('./div/article/div/header/h2/a/text()').extract_first()
            href = article.xpath('./div/article/div/header/h2/a/@href').extract_first()
            # Skip container divs that matched the loose //div selector
            # but hold no article header.
            if title is not None and href is not None:
                # hrefs on the index page are site-relative; build the
                # absolute URL before issuing the request.
                url = self.base_domain.format(href)
                yield scrapy.Request(url, callback=self.parse_detail,
                                     meta={"title": title})

    def parse_detail(self, response):
        """Parse an article detail page and yield a dict with the title
        (carried over via request meta), creation date, category and content."""
        # NOTE(review): absolute /html/body/... XPaths are brittle — any
        # template change breaks them; consider anchoring on ids/classes.
        detail = {}
        detail['title'] = response.meta.get('title')
        detail['created_at'] = response.xpath(
            '/html/body/section/div/div/header/div/span[1]/time/text()').extract_first()
        detail['category'] = response.xpath(
            '/html/body/section/div/div/header/div/span[2]/a/text()').extract_first()
        # Bug fix: content was previously extracted into a local variable
        # but never added to the yielded item.
        detail['content'] = response.xpath(
            '/html/body/section/div/div/article//text()').extract_first()
        yield detail
# Stray web-page boilerplate carried over when this code was copied from a
# rendered blog post ("手机扫一扫 / 移动阅读更方便 / 你可能感兴趣的文章" —
# "scan with your phone / easier to read on mobile / articles you may like").
# Not part of the spider; kept as a comment so the file parses.