python爬取快手小姐姐视频
阅读原文时间:2022年04月30日阅读:2

一、导入需要的三方库

import re #正则表表达式文字匹配
import requests #指定url,获取网页数据
import json #转化json格式

import os #创建文件

二、确定目标地址

快手的目标地址需要在 https://www.kuaishou.com 网站截取请求:在控制台的 network 面板中抓包,找到真正的请求地址 https://www.kuaishou.com/graphql。

url = "https://www.kuaishou.com/graphql" #目标网站url

三、确定数据位置

通过在network中搜索网页中显示的特定数据来确定data数据包,可以发现数据在data中。

四、伪装请求头、复制data数据

在控制台找到headers,复制里面的信息并手动转换成键值对的形式

在负载中找到以下信息,并封装到data里用于post请求

# POST payload for the "visionSearchPhoto" GraphQL query.
# NOTE(review): `keyword` and `page` are defined later in the full script
# (user input and the page-loop counter, respectively).
# FIX: the fragment spreads were garbled to a Unicode ellipsis ("…photoContent");
# GraphQL requires three ASCII dots ("...photoContent").
data = {
    'operationName': "visionSearchPhoto",
    'query': "fragment photoContent on PhotoEntity {\n id\n duration\n caption\n likeCount\n viewCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n __typename\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n __typename\n}\n\nquery visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n searchSessionId\n pcursor\n aladdinBanner {\n imgUrl\n link\n __typename\n }\n __typename\n }\n}\n",
    'variables': {'keyword': keyword, 'pcursor': str(page), 'page': "search",
                  'searchSessionId': "MTRfNTU2NDg2NDBfMTY1MTE0Njk1MTk5OV_mhaLmkYdfNjA0NQ"}
}

data = json.dumps(data)  # the endpoint expects a JSON string, not form data

# Request headers copied from the browser's network panel so the POST looks
# like a normal browser request.
# FIX: markdown escaping had corrupted the literal values ("\*/\*", "web\_...");
# the backslashes are not part of the real header values and are removed here.
headers = {
    'accept': "*/*",
    'Accept-Encoding': "gzip,deflate,br",
    'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    'Connection': "keep-alive",
    # NOTE(review): hard-coding Content-Length is fragile — requests computes
    # it from the body; a mismatch can make the server reject the request.
    'Content-Length': "1279",
    'content-type': "application/json",
    # Session cookie captured from a logged-in browser session.
    'Cookie': "clientid=3; did=web_3a9d9e97c96c8d33eddd663ef9362703; client_key=65890b29; kpf=PC_WEB; kpn=KUAISHOU_VISION; userId=55648640; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABVSIs0T2ccYTa7qgMDgpAdRiErWNJyU1r87iKEYuIWEo_oJSNGq4aTBB5sq0bA7iLeCMOoX0grrEbBkpPmehOVWEs_tC-cDytf6dxSLnrE9-tRQaVcHziopazhh5rroA2XZmHEjHAe6z9-AHD0ZTxV9nJPHeI0-k0wfn9DHvDcj8ZUgQexbGwXDeH2wBV_WSuzutsd3d5oLNmN-90a33TXhoS6uws2LN-siMyPVYdMaXTUH7FIiBd27-7kiEEP6BHgx67BTXhdNPtfA88SZZL4Q_uQ09AuCgFMAE; kuaishou.server.web_ph=b0d04539ad2d77e13d866a70f9e56371c4a6",
    'Host': "www.kuaishou.com",
    'Origin': "https://www.kuaishou.com",
    'Referer': "https://www.kuaishou.com/search/video?searchKey=%E6%85%A2%E6%91%87",
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Microsoft Edge";v="100"',
    'sec-ch-ua-mobile': "?0",
    # NOTE(review): browsers normally send '"Windows"' here — the extra quotes
    # look like copy damage; confirm against the original capture.
    'sec-ch-ua-platform': '" "Windows" "',
    'Sec-Fetch-Dest': "empty",
    'Sec-Fetch-Mode': "cors",
    'Sec-Fetch-Site': "same-origin",
    "User-Agent": "Mozilla / 5.0(Linux;Android6.0;Nexus5 Build / MRA58N) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 99.0.4844.51Mobile Safari / 537.36"
}

五、发出请求

向网站发起请求,需要用到第三方requests库,进行访问。

# POST the JSON payload with the spoofed headers to the GraphQL endpoint.
response = requests.post(url=url, data=data, headers=headers)

六、数据解析

通过re库和xpath进行数据解析,解析到需要的数据封装到列表中。

# Parse the response and download each video found on this result page.
# FIX: markdown escaping had turned subscripts into "\['photo'\]" and removed
# the loop-body indentation, making the snippet a SyntaxError; restored here.
json_data = response.json()  # decode the JSON response body
# print(json_data)
feed_list = json_data['data']['visionSearchPhoto']['feeds']  # one entry per video
# print(feed_list)

for feeds in feed_list:  # iterate over every result item
    title = feeds['photo']['caption']  # video caption used as the file name
    new_title = re.sub(r'[\/:*?"<>\n]', '_', title)  # replace filename-illegal chars
    photoUrl = feeds['photo']['photoUrl']  # direct URL of the mp4 file
    mp4_data = requests.get(photoUrl).content  # download the raw video bytes

七、保存数据

创建文件夹,并将数据保存到文件夹中。需要用到os库。

def mkdir(path):
    """Create the directory *path* if it does not already exist.

    Returns True when the directory was created, False when it already
    existed. (FIX: the published snippet had lost all body indentation and
    escaped the backslashes, making it invalid Python; restored here.)
    """
    import os  # local import keeps the snippet self-contained

    # Normalize the path: strip surrounding whitespace and a trailing "\"
    # so "d:\work\" and "d:\work" refer to the same directory.
    path = path.strip()
    path = path.rstrip("\\")

    isExists = os.path.exists(path)
    if not isExists:
        # makedirs also creates any missing intermediate directories
        os.makedirs(path)
        print(path + ' 创建成功')
        return True
    else:
        # Directory is already there — report and do nothing.
        print(path + ' 目录已存在')
        return False

# Save the downloaded bytes as "<keyword>/<title>.mp4".
# FIX: the published snippet had lost the indentation under `with`.
# NOTE(review): this path is relative to the CWD, while the folder created by
# mkdir() is an absolute path — only works if the CWD matches; verify.
with open(keyword + '/' + new_title + '.mp4', mode="wb") as f:  # binary write
    f.write(mp4_data)  # persist the video bytes
    print("保存成功!", new_title)

# -*- coding: utf-8 -*-
# @Time : 2022/4/28 10:48
# @Author : 王敬博
# @File : 快手小姐姐.py
# @Software: PyCharm

import re #正则表表达式文字匹配
import requests #指定url,获取网页数据
import json #转化json格式

def mkdir(path):
    """Create the directory *path* if it does not already exist.

    Returns True when the directory was created, False when it already
    existed. (FIX: the published listing had lost all body indentation and
    escaped the backslashes, making it invalid Python; restored here.)
    """
    import os  # local import keeps the function self-contained

    # Normalize the path: strip surrounding whitespace and a trailing "\"
    # so "d:\work\" and "d:\work" refer to the same directory.
    path = path.strip()
    path = path.rstrip("\\")

    isExists = os.path.exists(path)
    if not isExists:
        # makedirs also creates any missing intermediate directories
        os.makedirs(path)
        print(path + ' 创建成功')
        return True
    else:
        # Directory is already there — report and do nothing.
        print(path + ' 目录已存在')
        return False

# Main script: ask for a keyword, create a download folder, then scrape
# result pages 1-4 of Kuaishou's visionSearchPhoto GraphQL endpoint and
# save every video found.
# FIX: restored the indentation and de-escaped the string literals that the
# published listing had mangled (\_, \[, \*, and "…" for "...").
url = "https://www.kuaishou.com/graphql"  # GraphQL endpoint serving search results

keyword = input("请输入想要搜索的关键词")  # search keyword typed by the user
mkpath = "d:\\pywork\\demo02\\" + keyword  # one download folder per keyword
mkdir(mkpath)  # create the folder (no-op if it already exists)

for page in range(1, 5):  # scrape result pages 1-4
    print("==================第%d页=================" % page)
    data = {  # POST payload for the visionSearchPhoto GraphQL query
        'operationName': "visionSearchPhoto",
        'query': "fragment photoContent on PhotoEntity {\n id\n duration\n caption\n likeCount\n viewCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n __typename\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n __typename\n}\n\nquery visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n searchSessionId\n pcursor\n aladdinBanner {\n imgUrl\n link\n __typename\n }\n __typename\n }\n}\n",
        'variables': {'keyword': keyword, 'pcursor': str(page), 'page': "search",
                      'searchSessionId': "MTRfNTU2NDg2NDBfMTY1MTE0Njk1MTk5OV_mhaLmkYdfNjA0NQ"}
    }
    data = json.dumps(data)  # the endpoint expects a JSON string body

    headers = {  # spoofed browser headers copied from the network panel
        'accept': "*/*",
        'Accept-Encoding': "gzip,deflate,br",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        'Connection': "keep-alive",
        # NOTE(review): hard-coded Content-Length is fragile; requests computes
        # it from the body and a mismatch can break the request.
        'Content-Length': "1279",
        'content-type': "application/json",
        'Cookie': "clientid=3; did=web_3a9d9e97c96c8d33eddd663ef9362703; client_key=65890b29; kpf=PC_WEB; kpn=KUAISHOU_VISION; userId=55648640; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABVSIs0T2ccYTa7qgMDgpAdRiErWNJyU1r87iKEYuIWEo_oJSNGq4aTBB5sq0bA7iLeCMOoX0grrEbBkpPmehOVWEs_tC-cDytf6dxSLnrE9-tRQaVcHziopazhh5rroA2XZmHEjHAe6z9-AHD0ZTxV9nJPHeI0-k0wfn9DHvDcj8ZUgQexbGwXDeH2wBV_WSuzutsd3d5oLNmN-90a33TXhoS6uws2LN-siMyPVYdMaXTUH7FIiBd27-7kiEEP6BHgx67BTXhdNPtfA88SZZL4Q_uQ09AuCgFMAE; kuaishou.server.web_ph=b0d04539ad2d77e13d866a70f9e56371c4a6",
        'Host': "www.kuaishou.com",
        'Origin': "https://www.kuaishou.com",
        'Referer': "https://www.kuaishou.com/search/video?searchKey=%E6%85%A2%E6%91%87",
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Microsoft Edge";v="100"',
        'sec-ch-ua-mobile': "?0",
        'sec-ch-ua-platform': '" "Windows" "',
        'Sec-Fetch-Dest': "empty",
        'Sec-Fetch-Mode': "cors",
        'Sec-Fetch-Site': "same-origin",
        "User-Agent": "Mozilla / 5.0(Linux;Android6.0;Nexus5 Build / MRA58N) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 99.0.4844.51Mobile Safari / 537.36"
    }

    response = requests.post(url=url, headers=headers, data=data)  # fire the request
    json_data = response.json()  # decode the JSON response body
    # print(json_data)
    feed_list = json_data['data']['visionSearchPhoto']['feeds']  # one entry per video
    # print(feed_list)

    for feeds in feed_list:  # iterate over every result item
        title = feeds['photo']['caption']  # video caption used as the file name
        new_title = re.sub(r'[\/:*?"<>\n]', '_', title)  # replace filename-illegal chars
        photoUrl = feeds['photo']['photoUrl']  # direct URL of the mp4 file
        mp4_data = requests.get(photoUrl).content  # download the raw video bytes
        # BUG FIX: the original opened "<keyword>/..." relative to the CWD, but
        # the directory actually created above is mkpath — save into mkpath so
        # the script works regardless of the working directory.
        with open(mkpath + '/' + new_title + '.mp4', mode="wb") as f:
            f.write(mp4_data)  # persist the video bytes
            print("保存成功!", new_title)

这次爬取的流程以及源代码是从bili的课程中学到的,大概用了3个小时左右,个人感觉第一次爬取动态网页还是找到数据是比较难的。通过这次实践我终于体会到了程序员的快乐(谁不喜欢看小姐姐呢),嘿嘿嘿!

如果有问题可以私聊我,如果对你有帮助的话,记得个赞哦!