scrapy Request 方法示例（示范如何用 scrapy.Request 跟进链接并通过 meta 传递数据）
原文发布时间：2023年07月08日

# -*- coding: utf-8 -*-
import scrapy

class TestSpider(scrapy.Spider):
    """Spider for yeves.cn: scrapes article titles/links from the home page,
    then follows each link and yields a detail dict per article.

    Yielded items have keys: 'title', 'created_at', 'category', 'content'.
    """

    name = 'test'
    allowed_domains = ['yeves.cn']
    start_urls = ['https://yeves.cn/']
    # Template used to turn relative hrefs from the listing page into absolute URLs.
    base_domain = 'https://yeves.cn{}'

    def parse(self, response):
        """Parse the home page: extract title + href of each article card and
        schedule a request to the article's detail page."""
        # Article cards live under the element with id="article".
        articles = response.xpath('//*[@id="article"]//div')

        for article in articles:
            title = article.xpath('./div/article/div/header/h2/a/text()').extract_first()
            href = article.xpath('./div/article/div/header/h2/a/@href').extract_first()
            # Skip wrapper divs that don't contain a headline link.
            if title is not None and href is not None:
                href = self.base_domain.format(href)
                # Carry the title over to the detail callback via meta.
                yield scrapy.Request(href, callback=self.parse_detail, meta={"title": title})

    def parse_detail(self, response):
        """Parse one article detail page and yield the assembled item dict."""
        self.logger.debug('detail page: %s (title=%s)', response.url, response.meta.get('title'))

        detail = {}
        detail['title'] = response.meta.get('title')

        created_at = response.xpath('/html/body/section/div/div/header/div/span[1]/time/text()').extract_first()
        category = response.xpath('/html/body/section/div/div/header/div/span[2]/a/text()').extract_first()
        # NOTE(review): extract_first() returns only the FIRST text node of the
        # article body; consider ''.join(...extract()) if the full text is needed.
        content = response.xpath('/html/body/section/div/div/article//text()').extract_first()

        detail['created_at'] = created_at
        detail['category'] = category
        # Bug fix: 'content' was extracted but never added to the item before.
        detail['content'] = content
        yield detail