What is JSON?
A lightweight text format for exchanging data, organized as {key: value} pairs
Getting a value from an object: object.key
Getting a value from an array: array[index]
Common methods: json.loads() and json.dumps() from Python's json module (sketched below)
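A quick sketch of the two json-module calls used most often when scraping: json.loads() turns a JSON string into a Python object, and json.dumps() goes the other way (the sample record simply mirrors the Douban structure used later in these notes):

import json

# JSON string -> Python object (dict / list)
s = '{"title": "Example Film", "rating": ["9.6", "50"]}'
film = json.loads(s)
print(film["title"], film["rating"][0])    # object["key"], array[index]

# Python object -> JSON string
print(json.dumps(film, ensure_ascii=False))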
Scraping dynamic website data - Ajax
Characteristic: more content is loaded as you scroll the mouse wheel, fetched in the background via Ajax rather than by a full page load
Packet capture: the query parameters show up under WebForms -> QueryString (e.g. in Fiddler's inspector)
Case study: scraping the Douban movie chart
Scraping target: movie title and rating
Code implementation:
import requests
import csv
import json

url = "https://movie.douban.com/j/chart/top_list?"
headers = {"User-Agent": "Mozilla/5.0"}
num = input("How many movies to scrape: ")
params = {
    "type": "11",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": num
}
res = requests.get(url, params=params, headers=headers)
# print(res.text)  # debug: raw JSON response
films = json.loads(res.text)

# Open the CSV once, then append one row per film
with open("douban.csv", "a", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    for film in films:
        name = film['title']
        # "rating" is a list, e.g. {"rating": ["9.6", "50"]}; index 0 is the score
        score = film['rating'][0]
        writer.writerow([name, score])
PhantomJS
Definition: a browser with no graphical interface (a "headless" browser)
Characteristics: loads pages and runs their JavaScript entirely in memory, never rendering a visible window
Installation:
Windows: copy the phantomjs.exe executable into a directory already on PATH, e.g. C:\Python36\Scripts
Ubuntu: add environment variables by editing ~/.bashrc (vi .bashrc) and appending:
export PHANTOM_JS=/home/.../phantomjs-2.1.1-...
export PATH=$PHANTOM_JS/bin:$PATH
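Once PhantomJS is on PATH, a minimal smoke test looks like this (a sketch assuming an older Selenium 3.x, which still ships the PhantomJS driver):

from selenium import webdriver

# Selenium locates the phantomjs executable via PATH
driver = webdriver.PhantomJS()
driver.get("http://www.baidu.com/")
print(driver.page_source[:200])    # no window is ever opened
driver.quit()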
Go to the ChromeDriver - WebDriver for Chrome site and download the version that matches your installed Chrome browser.
Add the unzipped chromedriver to an environment-variable path, or drop it into miniconda/bin (I use conda as my environment manager).
from selenium import webdriver
import time

def test1():
    # Create the browser object
    driver = webdriver.Chrome()
    # Send a GET request
    driver.get("http://www.baidu.com/")
    # Save a screenshot of the page
    driver.save_screenshot("baidu.png")
    # print("Screenshot saved!")
    # Close the browser
    driver.quit()

def test2():
    # Create the browser object
    driver = webdriver.Chrome()
    # Open the page
    driver.get("http://www.baidu.com/")
    # Type text into the search box
    kw = driver.find_element_by_id("kw")
    kw.send_keys("美女")
    # Click the "百度一下" (search) button
    su = driver.find_element_by_id("su")
    su.click()
    time.sleep(1)
    # Take a screenshot
    # driver.save_screenshot("baidu.png")
    # Close the browser
    # driver.quit()

if __name__ == "__main__":
    test1()
    test2()
driver.get(url): open the page
driver.page_source: get the HTML source of the response
driver.page_source.find("string"): search the HTML source for the given substring; returns -1 if it is not found (used below to detect the last page)
Single-element lookup:
driver.find_element_by_id("id").text
driver.find_element_by_class_name("class name")
driver.find_element_by_xpath('xpath expression')
Multi-element lookup (returns a list):
driver.find_elements_by_...
element.send_keys("text"): type text into an element
element.click(): click an element
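A minimal sketch tying these calls together (the URL and XPath here are placeholders for illustration):

from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://www.baidu.com/")

# page_source.find() returns -1 when the substring is absent
if driver.page_source.find("baidu") != -1:
    print("substring found in the HTML source")

# find_elements_* (plural) returns a list; read each element's .text
for link in driver.find_elements_by_xpath('//a')[:5]:
    print(link.text)

driver.quit()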
from selenium import webdriver
import time

# Create the browser object and open the page
driver = webdriver.Chrome()
driver.get("https://www.douban.com/")
time.sleep(1)
# Screenshot (e.g. to inspect a CAPTCHA)
# driver.save_screenshot("captcha.png")

# Locate the password-login tab, the username/password fields and the login button
mmdl = driver.find_element_by_class_name("account-tab-account")
mmdl.click()
uname = driver.find_element_by_name("username")
uname.send_keys("haoen110@163.com")
pwd = driver.find_element_by_name("password")
pwd.send_keys("Howie1996925")
# "btn btn-account" is a compound class name, which find_element_by_class_name
# cannot match; locate the button with a CSS selector instead
button = driver.find_element_by_css_selector(".btn.btn-account")
button.click()
# Close the browser
driver.quit()
Import the modules:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome()
driver.get("http://www.baidu.com/")

# Type a query into the search box
kw = driver.find_element_by_id("kw")
kw.send_keys("Python")
time.sleep(0.8)

# Select all (Keys.COMMAND is the macOS modifier; use Keys.CONTROL on Windows/Linux)
kw = driver.find_element_by_id("kw")
kw.send_keys(Keys.COMMAND, 'a')
time.sleep(0.8)

# Cut
kw = driver.find_element_by_id("kw")
kw.send_keys(Keys.COMMAND, 'x')
time.sleep(0.8)

# Paste
kw = driver.find_element_by_id("kw")
kw.send_keys(Keys.COMMAND, 'v')
time.sleep(0.8)

# Clear the search box
kw = driver.find_element_by_id("kw")
kw.clear()
time.sleep(0.8)

# Type a new query and click "百度一下" (search)
kw = driver.find_element_by_id("kw")
kw.send_keys("Spider")
su = driver.find_element_by_id("su")
su.click()
Scraping target: streamer names and viewer counts
'''11_douyu_live_scraping_case.py'''
from selenium import webdriver
from lxml import etree
import time

# Run Chrome headless (no visible window)
opt = webdriver.ChromeOptions()
opt.add_argument('--headless')
driver = webdriver.Chrome(options=opt)
driver.get("https://www.douyu.com/directory/all")

i = 1
while True:
    # Parse the rendered page (driver.page_source) to get
    # streamer names and viewer counts
    parseHtml = etree.HTML(driver.page_source)
    names = parseHtml.xpath('//div[@id="live-list-content"]//span[@class="dy-name ellipsis fl"]')
    numbers = parseHtml.xpath('//div[@id="live-list-content"]//span[@class="dy-num fr"]')
    # zip() pairs them up, e.g. [("streamer1", "200k"), ("streamer2", "150k")]
    for name, number in zip(names, numbers):
        print("\tStreamer: %s \tViewers: %s" %
              (name.text.strip(), number.text.strip()))
    print("Page %d scraped" % i)
    i += 1
    # If the "next page" button is not disabled, click it and keep looping;
    # otherwise this is the last page, so stop
    if driver.page_source.find("shark-pager-disable-next") == -1:
        driver.find_element_by_class_name("shark-pager-next").click()
        time.sleep(1)
    else:
        break

print("Scraped %d pages in total" % (i - 1))