就是一个具有很强通用性且集成了很多功能的项目模板(可以被应用在各种需求中)
scrapy集成好的功能:
extract():返回一个列表,用于选择器列表中有多个元素的情况
extract_first():返回列表中的第一个元素,用于列表中只有单个元素的情况
scrapy的持久化存储
基于终端指令:
基于管道:pipelines.py
编码流程:
1.数据解析
2.在item的类中定义相关的属性
3.将解析的数据存储封装到item类型的对象中.item['p']
4.将item对象提交给管道
5.在管道类中的process_item方法负责接收item对象,然后对item进行任意形式的持久化存储
6.在配置文件中开启管道
-scrapy crawl first
细节补充:
import scrapy
from huyaPro.items import HuyaproItem
class HuyaSpider(scrapy.Spider):
    """Spider for the Huya listing page; yields one item per list entry."""

    name = 'huya'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.huya.com/g/wzry']

    # Variant that persists via a terminal command: parse() returns a list
    # of plain dicts instead of yielding items.
    # def parse(self, response):
    #     li_list = response.xpath('//*[@id="js-live-list"]/li')
    #     all_data = []
    #     for li in li_list:
    #         title = li.xpath('./a[2]/text()').extract_first()
    #         author = li.xpath('./span/span[1]/i/text()').extract_first()
    #         hot = li.xpath('./span/span[2]/i[2]/text()').extract_first()
    #         dic = {
    #             'title': title,
    #             'author': author,
    #             'hot': hot
    #         }
    #         all_data.append(dic)
    #     return all_data

    # Variant that persists through the item pipeline.
    def parse(self, response):
        """Yield a HuyaproItem (title, author, hot) for every list entry."""
        # (item field, XPath relative to one <li>) pairs, in assignment order.
        field_queries = (
            ('title', './a[2]/text()'),
            ('author', './span/span[1]/i/text()'),
            ('hot', './span/span[2]/i[2]/text()'),
        )
        for entry in response.xpath('//*[@id="js-live-list"]/li'):
            item = HuyaproItem()
            for field, query in field_queries:
                item[field] = entry.xpath(query).extract_first()
            yield item
import scrapy
class HuyaproItem(scrapy.Item):
    """Container for one scraped list entry: title, author and hot value."""

    # One Field per attribute produced by the spider's parsing step.
    title = scrapy.Field()
    author = scrapy.Field()
    hot = scrapy.Field()
import pymysql
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class HuyaproPipeline:
    """Pipeline that appends every item to ``huyazhibo.txt`` as one line."""

    # Open text file handle; set in open_spider, cleared in close_spider.
    fp = None

    def open_spider(self, spider):
        """Open the output file (runs once, when the spider starts)."""
        print('open')
        self.fp = open('huyazhibo.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``title:author:hot`` for *item* and hand the item on.

        Fields whose value is ``None`` (nothing was extracted) are written
        as empty strings instead of raising ``TypeError`` on concatenation.

        Returns:
            The same item, so that the next pipeline class receives it.
        """
        title, author, hot = (
            '' if item[key] is None else str(item[key])
            for key in ('title', 'author', 'hot')
        )
        self.fp.write(title + ':' + author + ':' + hot + '\n')
        print(title + '写入成功')
        return item

    def close_spider(self, spider):
        """Close the output file (runs once, when the spider finishes).

        The hook must be named ``close_spider``; under the previous name
        ``close_item`` Scrapy never invoked it and the file stayed open.
        """
        if self.fp is not None:
            self.fp.close()
            self.fp = None
        print('close')

    # Backward-compatible alias for code that called the old method name.
    close_item = close_spider
class mysqlPopeLine:
    """Pipeline that inserts every item into the MySQL table ``huya``."""

    conn = None  # database connection, created in open_spider
    cursor = None  # cursor created on first use and reused for all items

    def open_spider(self, spider):
        """Connect to the local MySQL server (runs once at spider start)."""
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        """Insert title/author/hot of *item* as one row and return the item.

        The values are passed as query parameters, so the driver escapes
        them; quotes inside scraped text can no longer break or inject SQL.
        A failed insert is printed and rolled back, then the item is still
        returned so later pipeline classes keep receiving it.
        """
        sql = 'insert into huya values(%s,%s,%s)'
        params = (item['title'], item['author'], item['hot'])
        if self.cursor is None:
            self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release cursor and connection (runs once at spider close).

        Both are checked first: no cursor exists when no item was processed.
        """
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None
# Settings for the huyaPro project.
BOT_NAME = 'huyaPro'

SPIDER_MODULES = ['huyaPro.spiders']
NEWSPIDER_MODULE = 'huyaPro.spiders'

# Browser User-Agent string used instead of the default one.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
)

LOG_LEVEL = 'ERROR'
ROBOTSTXT_OBEY = False

# Enabled pipeline classes; the smaller the number, the higher the priority.
ITEM_PIPELINES = {
    'huyaPro.pipelines.HuyaproPipeline': 300,
    'huyaPro.pipelines.mysqlPopeLine': 301,
}
基于Spider父类进行全站数据的爬取
scrapy五大核心组件
引擎(Scrapy)
调度器(Scheduler)
下载器(Downloader)
爬虫(Spiders)
项目管道(Pipeline)
scrapy的请求传参
提升scrapy爬取数据的效率
在配置文件中进行相关的配置即可
增加并发:
降低日志级别:
禁止cookie:
禁止重试:
减少下载超时:
scrapy的中间件
爬虫中间件
下载中间件(***):处于引擎和下载器之间
爬取网易新闻的新闻标题和内容
分析
selenium在scrapy中的使用流程
import scrapy
class MiddleSpider(scrapy.Spider):
    """Spider that requests the start page and saves its HTML to disk."""

    name = 'middle'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://ip.chinaz.com/']

    def parse(self, response):
        """Write the response text to ``iip.html`` (UTF-8), overwriting it."""
        with open('iip.html', mode='w', encoding='utf-8') as out_file:
            out_file.write(response.text)
from scrapy import signals
import random
# Pool of User-Agent strings; each entry is one complete header value.
user_agent_list = [
    (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
    ),
    (
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
    ),
    (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6"
    ),
]
from itemadapter import is_item, ItemAdapter
class MiddleproDownloaderMiddleware:
    """Downloader middleware that disguises the User-Agent and sets a proxy."""

    def process_request(self, request, spider):
        """Intercept an outgoing request: random User-Agent plus fixed proxy."""
        print('!!!!!!!!!!!!!!!!!!!!!')
        # User-Agent disguise: pick one entry from the module-level pool.
        headers = request.headers
        headers['User-Agent'] = random.choice(user_agent_list)
        print(headers['User-Agent'])
        # Route the request through the hard-coded proxy.
        meta = request.meta
        meta['proxy'] = 'http://123.55.114.25:9999'
        print(meta['proxy'])
        return None

    def process_response(self, request, response, spider):
        """Intercept a response; it is passed through unchanged."""
        print('??????????????????')
        return response

    def process_exception(self, request, exception, spider):
        """Intercept a request that raised; return it so it is sent again.

        The request is returned as received; no correction is applied here.
        """
        # print(request)
        return request

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
import scrapy
from moviePro.items import MovieproItem
class MovieSpider(scrapy.Spider):
    """Crawl the first five list pages and each movie's detail page."""

    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/index.php/vod/show/class/喜剧/id/6/page/1.html']
    # URL template for the following list pages; %d is the page number.
    url = 'http://www.4567kan.com/index.php/vod/show/class/喜剧/id/6/page/%d.html'
    # Number of the list page requested last; used to stop after page 5.
    page = 1

    def parse(self, response):
        """Parse one list page.

        Yields one detail-page request per movie, carrying the half-filled
        item in ``meta``, and then at most one request for the next list
        page.
        """
        print(f'正在爬取第{self.page}页的数据')
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            href = li.xpath('./div/a/@href').extract_first()
            if not href:
                # No link to follow; concatenating None would raise TypeError.
                continue
            item = MovieproItem()
            item['name'] = li.xpath('./div/a/@title').extract_first()
            detail_url = 'http://www.4567kan.com' + href
            # Request passing: meta hands the item over to the callback.
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        if self.page < 5:
            self.page += 1
            new_url = self.url % self.page
            # parse() is its own callback, so the remaining pages follow.
            yield scrapy.Request(new_url, callback=self.parse)

    def parse_detail(self, response):
        """Add the description from the detail page and yield the item."""
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
import scrapy
class MovieproItem(scrapy.Item):
    """One movie: its name and its description."""

    name = scrapy.Field()  # assigned while parsing the list page
    desc = scrapy.Field()  # assigned while parsing the detail page
import scrapy
from wangyiPro.items import WangyiproItem
from selenium import webdriver
class WangyiSpider(scrapy.Spider):
    """Scrape news titles and article text from five section pages."""

    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    # URLs of the section pages; the downloader middleware compares request
    # URLs against this list.
    model_urls = []
    # Shared Chrome driver, read by the middleware as ``spider.bro``.
    bro = webdriver.Chrome(executable_path=r'D:\老男孩python22期代码及笔记\day95\chromedriver.exe')

    def parse(self, response):
        """Extract the five section URLs and request each of them."""
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        model_index = [3, 4, 6, 7, 8]
        for index in model_index:
            if index >= len(li_list):
                # Navigation is shorter than expected; skip, don't crash.
                continue
            model_url = li_list[index].xpath('./a/@href').extract_first()
            if not model_url:
                continue
            self.model_urls.append(model_url)
            # Manually request every section page.
            yield scrapy.Request(model_url, callback=self.parse_model)

    def parse_model(self, response):
        """Parse a section page: news titles and their detail-page URLs."""
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            # The attribute was misspelled '@herf', which never matches and
            # made every detail URL None.
            detail_url = div.xpath('./a/@href').extract_first()
            if not detail_url:
                continue
            item = WangyiproItem()
            item['title'] = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            yield scrapy.Request(detail_url, callback=self.parse_new_detail, meta={'item': item})

    def parse_new_detail(self, response):
        """Join all text nodes of the article body into ``item['content']``."""
        item = response.meta['item']
        content = response.xpath('//*[@id="content"]/div[2]//text()').extract()
        item['content'] = ''.join(content)
        yield item

    def closed(self, reason):
        """Quit the browser; runs once when the whole crawl ends."""
        self.bro.quit()
import scrapy
class WangyiproItem(scrapy.Item):
    """One news article: headline and body text."""

    title = scrapy.Field()  # headline
    content = scrapy.Field()  # article text
from scrapy import signals
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
from time import sleep
class WangyiproDownloaderMiddleware:
    """Replace the responses of the section pages with browser-rendered ones."""

    def process_response(self, request, response, spider):
        """Intercept every response.

        Args:
            request: the request that produced ``response``.
            response: the intercepted response.
            spider: the running spider instance; gives access to its
                ``model_urls`` list and its ``bro`` browser driver.

        Returns:
            ``response`` unchanged unless the request URL is listed in
            ``spider.model_urls``; for those, a new ``HtmlResponse`` built
            from the page source the browser reports after loading the URL.
        """
        # Everything except the section pages passes straight through.
        if request.url not in spider.model_urls:
            return response

        browser = spider.bro
        browser.get(request.url)
        sleep(3)
        # page_source includes the dynamically loaded news data.
        rendered_html = browser.page_source
        return HtmlResponse(url=request.url, body=rendered_html, encoding='utf-8', request=request)
import pymysql
class WangyiproPipeline:
    """Pipeline that inserts every news item into the MySQL table ``wangyi``."""

    conn = None  # database connection, created in open_spider
    cur = None  # cursor created on first use and reused for all items

    def open_spider(self, spider):
        """Connect to the local MySQL server when the spider starts."""
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='', database='Spider', charset='utf8')

    def process_item(self, item, spider):
        """Insert title and content of *item* as one row and return the item.

        The values are passed as query parameters, so the driver escapes
        them; article text containing quotes can no longer break or inject
        SQL. A failed insert is printed and rolled back.
        """
        print(item)
        sql = 'insert into wangyi values(%s,%s)'
        params = (item['title'], item['content'])
        if self.cur is None:
            self.cur = self.conn.cursor()
        try:
            self.cur.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release cursor and connection when the spider closes.

        Both are checked first: no cursor exists when no item was processed.
        """
        if self.cur is not None:
            self.cur.close()
            self.cur = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None
图片懒加载
ImagesPipeline: 专门用于二进制数据下载和持久化存储的管道类
CrawlSpider
分布式
概念:需要搭建一个分布式的集群,然后在集群的每一台电脑中执行同一组程序,让其对某一个网站的数据进行联合分布爬取
原生的scrapy框架是不可以实现分布式?
如何实现分布式?
scrapy-redis组件的作用是什么?
分布式的实现流程:
1.pip install scrapy-redis
2.创建工程
3.cd 工程目录中
4.创建爬虫文件(a.创建基于Spider的爬虫文件b.创建CrawlSpider的爬虫文件)
5.修改爬虫类
6.settings配置文件的配置
UA伪装
Robots
管道的指定
ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 400
}
指定调度器:
增加了一个去重容器类的配置, 作用使用Redis的set集合来存储请求的指纹数据, 从而实现请求去重的持久化
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
使用scrapy-redis组件自己的调度器
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
配置调度器是否要持久化, 也就是当爬虫结束了, 要不要清空Redis中请求队列和去重指纹的set。如果是True, 就表示要持久化存储, 就不清空数据, 否则清空数据
SCHEDULER_PERSIST = True
指定redis数据库
REDIS_HOST = 'redis服务的ip地址'
REDIS_PORT = 6379
redis的配置文件进行配置redis.windows.conf:
启动redis的服务端和客户端:
启动程序:
scrapy runspider xxx.py
向调度器的队列中扔入一个起始的url:
增量式
总结反爬机制:
import scrapy
from imgPro.items import ImgproItem
class ImgSpider(scrapy.Spider):
    """Collect image URLs from the listing page; one item per image."""

    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/tupian/meinvtupian.html']

    def parse(self, response):
        """Yield an ImgproItem for every entry that has a ``src2`` value."""
        for div in response.xpath('//*[@id="container"]/div'):
            src = div.xpath('./div/a/img/@src2').extract_first()
            if not src:
                # No src2 on this entry; 'https:' + None would raise TypeError.
                continue
            item = ImgproItem()
            item['img_src'] = 'https:' + src
            yield item
import scrapy
class ImgproItem(scrapy.Item):
    """One image to download."""

    img_src = scrapy.Field()  # URL of the image, as built by the spider
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class ImgproPipeline(ImagesPipeline):
    """Images pipeline that downloads ``item['img_src']`` under its own name."""

    def get_media_requests(self, item, info):
        """Request the media resource; *item* is the item the spider yielded."""
        yield scrapy.Request(item['img_src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path: the last path segment of the request URL.

        The storage directory itself has to be configured in settings.py.
        ``item`` is accepted as an optional keyword-only argument so the
        override also works when the caller passes it; it is not used.
        """
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Pass the item on unchanged once its downloads have finished."""
        return item
# Name and path of the folder in which the images are stored
IMAGES_STORE = './imgLibs'
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunCrawlPro.items import SuncrawlproItem, Detail_item
class SuncrawlSpider(CrawlSpider):
    """CrawlSpider that follows page-number links and detail-page links."""

    name = 'sunCrawl'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest']

    # Link extractors only extract links; ``allow`` is a regular expression
    # that selects which links are taken.
    link = LinkExtractor(allow=r'id=1&page=\d+')  # page-number links
    # Detail-page links. The '?' is escaped: unescaped it is a quantifier
    # (making the preceding 'x' optional) and never matches a literal '?'.
    link_detail = LinkExtractor(allow=r'/political/politics/index\?id=\d+')

    rules = (
        # A Rule requests every link its extractor finds and passes the
        # response to the named callback.
        Rule(link, callback='parse_item', follow=False),
        # follow=True would apply the extractors again to the pages reached
        # through the extracted links.
        Rule(link_detail, callback='parse_detail', follow=False),
    )

    def parse_item(self, response):
        """Yield a SuncrawlproItem (title, num) per entry of a list page."""
        # tbody must not appear in the XPath expressions.
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            # NOTE(review): li is already an <li>; './li/...' looks for a
            # nested <li>. Possibly './span[3]/a/text()' and
            # './span[1]/text()' were intended - confirm against the page.
            title = li.xpath('./li/span[3]/a/text()').extract_first()
            num = li.xpath('./li/span[1]/text()').extract_first()
            item = SuncrawlproItem()
            item['title'] = title
            item['num'] = num
            yield item

    def parse_detail(self, response):
        """Yield a Detail_item (num, content) for one detail page."""
        num = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        if num is not None:
            # Keep only the part after the last ':'; with no match num stays
            # None instead of raising AttributeError on .split().
            num = num.split(':')[-1]
        content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        item = Detail_item()
        item['num'] = num
        item['content'] = content
        yield item
import scrapy
class SuncrawlproItem(scrapy.Item):
    """List-page record: title and number of one entry."""

    title = scrapy.Field()
    num = scrapy.Field()
class Detail_item(scrapy.Item):
    """Detail-page record: content and number of one entry."""

    content = scrapy.Field()
    num = scrapy.Field()
class SuncrawlproPipeline:
    """Pipeline that tells the two item types apart and prints each item."""

    def process_item(self, item, spider):
        """Read the fields expected for the item's type, print it, return it.

        The type is recognised by class name. Detail items are printed with
        a '!' marker, all other items with a '?' marker.
        """
        if item.__class__.__name__ == 'Detail_item':
            expected_fields, marker = ('content', 'num'), '!!!!!!!!!!'
        else:
            expected_fields, marker = ('title', 'num'), '?????????'
        for field in expected_fields:
            item[field]  # lookup only; raises KeyError when the field is unset
        print(item, marker)
        return item
手机扫一扫
移动阅读更方便
你可能感兴趣的文章