1. Create the project
scrapy startproject test_scrapy
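A minimal sketch of the directory layout that startproject generates (assuming the project is named test_scrapy; exact file names may vary slightly between Scrapy versions):

test_scrapy/
    scrapy.cfg            # deploy configuration
    test_scrapy/          # the project's Python package
        __init__.py
        items.py          # item definitions (step 2)
        middlewares.py
        pipelines.py      # item pipelines
        settings.py       # project settings (step 3)
        spiders/          # spider code goes here (step 4)
            __init__.py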
2. Write the items file
# -*- coding: utf-8 -*-
import scrapy


class TestScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()
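A scrapy.Item behaves like a dict, but only the declared fields are allowed. A quick hedged example (not from the original post) of how the item above is used:

from test_scrapy.items import TestScrapyItem

item = TestScrapyItem()
item['name'] = 'Alice'      # declared field, works
print(item['name'])         # -> Alice
# item['age'] = 30          # undeclared field, raises KeyError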
3. Write the settings file
# -*- coding: utf-8 -*-
BOT_NAME = 'test_scrapy'
SPIDER_MODULES = ['test_scrapy.spiders']
NEWSPIDER_MODULE = 'test_scrapy.spiders'
#USER_AGENT = 'test_scrapy (+http://www.yourdomain.com)'
#CONCURRENT_REQUESTS = 32
#DOWNLOAD_DELAY = 3
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
#COOKIES_ENABLED = False
#TELNETCONSOLE_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
#SPIDER_MIDDLEWARES = {
#}
#DOWNLOADER_MIDDLEWARES = {
#}
#EXTENSIONS = {
#}
ITEM_PIPELINES = {
    'test_scrapy.pipelines.TestScrapyPipeline': 300,
}
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
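ITEM_PIPELINES above enables test_scrapy.pipelines.TestScrapyPipeline, but the post does not show pipelines.py. A minimal sketch of what it could look like, assuming we simply append every item to a JSON-lines file (the file name teachers.jsonl is an illustrative choice, not the author's):

# -*- coding: utf-8 -*-
import json


class TestScrapyPipeline(object):

    def open_spider(self, spider):
        # called once when the spider is opened
        self.file = open('teachers.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # called for every item the spider yields
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        # called once when the spider is closed
        self.file.close()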
4. Create the spider (generate a spider file in the spiders directory from the command line)
scrapy genspider itcast www.xxxx.cn
Write the spider code:
import scrapy
from test_scrapy.items import TestScrapyItem


class ItcastSpider(scrapy.Spider):
    # spider name
    name = "itcast"
    # domains the spider is allowed to crawl
    allowed_domains = ['www.xxxx.cn']
    # starting URL(s) for the crawl
    start_urls = ["http://www.xxxx.cn/channel/teacher.shtml#ajavaee"]

    def parse(self, response):
        # with open("teacher.html", "wb") as f:
        #     f.write(response.body)
        # use the built-in XPath support to select the root node of every teacher entry
        teacher_list = response.xpath('//div[@class="li_txt"]')
        # iterate over the list of root nodes
        for each in teacher_list:
            # build an item to hold the data
            item = TestScrapyItem()
            # extract() converts the matched results into unicode strings;
            # without extract() you get selector objects instead
            name = each.xpath('./h3/text()').extract()
            title = each.xpath('./h4/text()').extract()
            info = each.xpath('./p/text()').extract()
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            yield item
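Note that indexing with [0] raises an IndexError whenever an XPath matches nothing. If that is a concern, extract_first() (or get() in newer Scrapy versions) returns a default value instead; a hedged alternative for the three assignments above:

item['name'] = each.xpath('./h3/text()').extract_first(default='')
item['title'] = each.xpath('./h4/text()').extract_first(default='')
item['info'] = each.xpath('./p/text()').extract_first(default='')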
5. Run
scrapy crawl itcast
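To dump the scraped items straight to a file without writing a pipeline, Scrapy's built-in feed export can be used from the command line (the output file name here is just an example):

scrapy crawl itcast -o teachers.json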
OVER!