Scrapy practice: crawling xinpianchang (xpc) end to end

# Spider file

# -*- coding: utf-8 -*-

import scrapy
import re
from scrapy import Request
import json
import string
import random
from xpc.items import PostItem, CommentItem, CopyItem  # several item classes

def strip(s):
    # return s stripped of whitespace if present, else an empty string
    if s:
        return s.strip()
    return ""

When sending requests with scrapy.Request or scrapy.FormRequest, Scrapy saves the returned cookies by default.

For the login step itself, don't use the Scrapy framework; use the requests module directly.

cookies = dict(
    Authorization='4F635191B0602B5D3B06024483B0602AAF8B06023C2F6259656D'
)

The cookie above was returned by the site; log in once first and find this cookie (e.g. in the browser's devtools).
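For reference, a minimal sketch of grabbing that cookie with requests; the login endpoint and form field names below are assumptions, so check the real ones in the browser's network panel:

import requests

session = requests.Session()
# hypothetical endpoint and form fields -- verify them in devtools
session.post(
    'https://www.xinpianchang.com/login',
    data={'account': 'your_account', 'password': 'your_password'},
)
# the Session object keeps whatever cookies the server set
print(session.cookies.get_dict())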

Generate a 26-character session id from lowercase letters and digits:

def gen_sessionid():
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=26))

class XinpianchangSpider(scrapy.Spider):
    name = 'XinPianChang'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=tabArticle']
    # To start from, say, page 21, the request has to carry cookies; the cookie
    # captured at the top no longer works on its own (the site returns four
    # cookies), so start_requests has to be overridden as below.
    # start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-21']
    page_count = 0

    # Override start_requests from the parent class; by default it sends a GET
    # request to every url in start_urls.
    # def start_requests(self):
    #     for url in self.start_urls:
    #         # data = {
    #         #     "kw": "cat"
    #         # }
    #         # to send a POST instead, use FormRequest:
    #         # yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)
    #
    #         c = cookies.copy()
    #         c.update(PHPSESSID=gen_sessionid(),
    #                  SERVER_ID='b52601c8-285bdd26',
    #                  channel_page='apU%3D')
    #         yield Request(url, cookies=c, dont_filter=True)

    def parse(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        self.page_count += 1
        if self.page_count >= 100:
            # rotate the fake session id every 100 list pages
            cookies.update(PHPSESSID=gen_sessionid())
            self.page_count = 0

        url_list = response.xpath('//ul[@class="video-list"]/li/@data-articleid').extract()
        for pid in url_list:
            detail_url = 'https://www.xinpianchang.com/a{}?from=ArticleList'.format(pid)
            # print(detail_url)
            request = response.follow(detail_url, callback=self.parse_post)
            request.meta['pid'] = pid
            yield request  # request the post's detail page

        pages = response.xpath('//div[@class="page"]/a/@href').extract()
        for page_url in pages:
            # print("pagination url", page_url)  # page_url is a relative, incomplete path
            yield response.follow(page_url, self.parse, cookies=cookies)

    def parse_post(self, response):
        pid = response.meta['pid']
        post = PostItem()
        post['pid'] = pid
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        # e.g. video_url = 'https://openapi-vtom.vmovier.com/v3/video/5E34203E92450?expand=resource&usage=xpc_web'
        # response.text is the raw page source
        vid = re.findall(r'vid: "(.*?)",', response.text)[0]
        # print(vid)
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/{}?expand=resource&usage=xpc_web'.format(vid)
        cates = response.xpath('//span[@class="cate v-center"]/a/text()').extract()
        post['category'] = ''.join([cate.strip() for cate in cates])
        post['create_time'] = response.xpath('//span[contains(@class,"update-time")]/i/text()').get()
        post['play_count'] = response.xpath('//i[contains(@class,"play-counts")]/text()').get()
        desc_lst = response.xpath('//p[contains(@class,"desc")]//text()').extract()
        post['desc'] = ' '.join([i.strip() for i in desc_lst])

        # extra hop, note this: the playable address has to be fetched from video_url
        request = Request(video_url, callback=self.parse_video)
        # pass the partially filled post to the next callback via meta
        request.meta['post'] = post
        yield request

        # comments api, e.g. https://app.xinpianchang.com/comments?resource_id=10664352&type=article&page=1&per_page=24
        comment_url = "https://app.xinpianchang.com/comments?resource_id={}&type=article&page=1&per_page=24".format(
            pid)
        request = Request(comment_url, callback=self.parse_comment)
        # pass the post id to the next callback via meta
        request.meta['pid'] = pid
        yield request

        # author pages
        creator_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
        # cid = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li/a/@data-userid')
        for creator in creator_list:
            cid = creator.xpath('./a/@data-userid').get()
            composer_url = 'https://www.xinpianchang.com/u{}?from=articleList'.format(cid)
            request = response.follow(composer_url, self.parse_composer)
            request.meta['cid'] = cid
            # don't merge session cookies into this request, so the Cookie
            # header doesn't keep growing after each cookies update
            request.meta['dont_merge_cookies'] = True
            yield request

            # post <-> author relation
            cr = CopyItem()
            cr['pid'] = pid
            cr['cid'] = cid
            cr['pcid'] = pid + cid
            cr['role'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
            # print("cr", cr)
            yield cr
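Passing data through request.meta works on any Scrapy version; on Scrapy 1.7+ the cb_kwargs argument does the same job more explicitly. A small alternative sketch (not what the code above uses):

request = Request(video_url, callback=self.parse_video, cb_kwargs={'post': post})

# the callback then receives it as a keyword argument:
# def parse_video(self, response, post): ...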

    def parse_video(self, response):  # this response body is JSON
        post = response.meta['post']
        # decode the returned JSON first, note this step
        result = json.loads(response.text)
        post['video_url'] = result['data']['resource']['default']['url']
        # the completed item goes straight to the pipelines
        yield post
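On Scrapy 2.2+ the manual json.loads step can be replaced with the response's built-in helper:

result = response.json()  # equivalent to json.loads(response.text) for JSON responses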

    def parse_comment(self, response):
        result = json.loads(response.text)
        for c in result['data']['list']:
            comment = CommentItem()
            comment['uname'] = c['userInfo']['username']
            comment['user_id'] = c['userInfo']['id']
            # comment['user_page'] = c['userInfo']['web_url']
            comment['content'] = c['content']
            comment['content_id'] = c['id']
            print(comment)
            yield comment

        # follow the next page if there is one
        if result['data']['next_page_url']:
            next_page = 'https://app.xinpianchang.com' + result['data']['next_page_url']
            # print("next_page", next_page)
            yield response.follow(next_page, self.parse_comment)

    def parse_composer(self, response):
        pass
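parse_composer is left as a stub. A sketch of what it might collect; ComposerItem and the selector are assumptions, neither exists in this project:

    # def parse_composer(self, response):
    #     cid = response.meta['cid']
    #     composer = ComposerItem()  # hypothetical extra item class
    #     composer['cid'] = cid
    #     # selector is a guess at the author-page markup
    #     composer['name'] = response.xpath('//p[contains(@class,"creator-name")]/text()').get()
    #     yield composer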

# Settings file

# -*- coding: utf-8 -*-

# Scrapy settings for xpc project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xpc'

SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'xpc (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# Set COOKIES_ENABLED = True when passing custom cookies on each Request;
# set it to False to rely only on a cookie fixed in the settings/headers.
COOKIES_ENABLED = True
COOKIES_DEBUG = True  # log detailed cookie information for every request/response
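The switch matters because the cookies= argument on a Request is handled by the cookie middleware: with COOKIES_ENABLED = True it is sent and Set-Cookie responses are remembered; with False it is silently ignored and only a hard-coded Cookie header goes out. A minimal illustration:

# COOKIES_ENABLED = True: per-request cookies work and the session is kept
yield scrapy.Request(url, cookies={'Authorization': '...'})

# COOKIES_ENABLED = False: the cookies= argument above is ignored; only a
# fixed header would be sent, e.g.
# DEFAULT_REQUEST_HEADERS = {'Cookie': 'Authorization=...'}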

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'xpc.middlewares.XpcSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'xpc.middlewares.XpcDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'xpc.pipelines.XpcPipeline': 300,  # lowest number runs first
    # 'xpc.pipelines.MysqlPipeline': 301,
    # 'xpc.pipelines.RedisPipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# When enabled, pages already fetched are served from the local cache instead
# of being requested again.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Log level: DEBUG, INFO, WARNING, ERROR or CRITICAL
LOG_LEVEL = 'DEBUG'

# Items file

# -*- coding: utf-8 -*-

import scrapy

class PostItem(scrapy.Item):
    # video post metadata
    # custom class attribute: with several tables, each item carries its table_name
    table_name = 'posts'

    # data fields
    pid = scrapy.Field()
    title = scrapy.Field()
    category = scrapy.Field()
    create_time = scrapy.Field()
    play_count = scrapy.Field()
    desc = scrapy.Field()
    video_url = scrapy.Field()

class CommentItem(scrapy.Item):
    # comment data
    table_name = 'comments'
    content_id = scrapy.Field()
    pid = scrapy.Field()
    cid = scrapy.Field()
    uname = scrapy.Field()
    user_id = scrapy.Field()
    content = scrapy.Field()
    user_page = scrapy.Field()

class CopyItem(scrapy.Item):
    table_name = 'copyrights'
    pcid = scrapy.Field()  # primary key: pid + cid
    pid = scrapy.Field()
    cid = scrapy.Field()
    role = scrapy.Field()

# Pipelines file

# -*- coding: utf-8 -*-

import csv
import json  # needed to serialize items for Redis below
import os

import pymysql
from redis import Redis

from xpc.items import PostItem, CommentItem, CopyItem

class XpcPipeline(object):
    def __init__(self):
        # xpc.csv goes into the directory containing this file
        store_file = os.path.dirname(__file__) + '/xpc.csv'
        # open the output file
        self.file = open(store_file, 'w', newline="")
        # csv writer
        self.writer = csv.writer(self.file)

    def open_spider(self, spider):
        print("pipeline: spider opened...")

    # dispatch on item type when several different item classes flow through
    def process_item(self, item, spider):
        if isinstance(item, PostItem):
            print("post item:", item)
        elif isinstance(item, CommentItem):
            print("comment item:", item)
        elif isinstance(item, CopyItem):
            print("copyright item:", item)
        return item  # pass the item on to the next pipeline class

    def close_spider(self, spider):
        self.file.close()  # close the csv file opened in __init__
        print("pipeline: spider closed...")

Connecting to MySQL:

class MysqlPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='',
            db='test_db',
            charset='utf8'
        )
        print("MySQL connection established")

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # parameterized query instead of string formatting, which avoids
            # quoting bugs and SQL injection
            self.cursor.execute('insert into test_db values(%s, %s)',
                                (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print("insert failed:", e)
            print("rolling back")
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print("closing MySQL connection")
        if self.cursor:  # cursor may never have been created
            self.cursor.close()
        self.conn.close()
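The author/content fields above come from a simpler single-table demo and don't exist on the items defined earlier. Since every item class carries a table_name, a more generic insert can be built from the item itself; a minimal sketch, assuming each table's columns are named after the item's fields and reusing the same open_spider connection as above:

class GenericMysqlPipeline(object):
    # hypothetical variant; open_spider/close_spider same as MysqlPipeline

    def process_item(self, item, spider):
        keys = list(item.keys())
        # build e.g. "insert into posts (pid, title, ...) values (%s, %s, ...)"
        sql = 'insert into {} ({}) values ({})'.format(
            item.table_name,
            ', '.join(keys),
            ', '.join(['%s'] * len(keys)),
        )
        with self.conn.cursor() as cursor:
            cursor.execute(sql, [item[k] for k in keys])
        self.conn.commit()
        return item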

Connecting to Redis:

class RedisPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(
            host='127.0.0.1',
            port=6379
        )
        print("Redis connection established")

    def process_item(self, item, spider):
        dic = {
            "author": item["author"],
            "content": item["content"]
        }
        # redis-py cannot push a dict directly; serialize it first
        self.conn.lpush("queue_name", json.dumps(dic))
        return item

    def close_spider(self, spider):
        print("closing Redis connection")
        self.conn.close()
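On the consuming side, the serialized items can be popped back off the list and deserialized; a small sketch using the same queue name as above:

import json
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
# blocking pop: returns a (key, payload) tuple, or None after the timeout
raw = conn.brpop('queue_name', timeout=5)
if raw:
    _, payload = raw
    print(json.loads(payload))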
