A Python 3 Lagou crawler: getting past "您操作太频繁,请稍后访问" ("You're operating too frequently, please try again later")

Have you ever run into this error?

If so, that's to be expected~
The data endpoint has to be requested with a POST, carrying the cookies that go with it~
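In short, the whole trick is a two-step request sequence inside one requests.Session: first GET the list page so Lagou hands the session its anti-crawler cookies, then POST to positionAjax.json from that same session. Here is a minimal sketch of just that sequence (URLs and form fields are taken from the full script below; headers are trimmed for brevity):

    import requests

    s = requests.Session()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    # Step 1: GET the list page; the session stores the cookies Lagou sets here
    s.get("https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=",
          headers=headers, timeout=3)
    # Step 2: POST to the JSON endpoint from the same session, so those cookies are sent along
    resp = s.post("https://www.lagou.com/jobs/positionAjax.json?city=成都&needAddtionalResult=false",
                  data={'first': 'true', 'pn': '1', 'kd': '运维'},
                  headers=headers, timeout=3)
    jobs = resp.json()["content"]["positionResult"]["result"]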
So here is the full, simple script that crawls Lagou~~~

import requests
import time
import json


def main():
    # List page for "运维" (ops) jobs in Chengdu; requesting it first is what gets us the anti-crawler cookies
    url_start = "https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput="
    # AJAX endpoint that returns the actual job listings as JSON
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?city=成都&needAddtionalResult=false"
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_%E8%BF%90%E7%BB%B4?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    for x in range(1, 5):
        data = {
            'first': 'true',
            'pn': str(x),   # page number
            'kd': '运维'     # search keyword
        }
        s = requests.Session()                        # create a session object
        s.get(url_start, headers=headers, timeout=3)  # GET the list page first so the session receives the cookies
        cookie = s.cookies                            # cookies obtained from that request
        response = s.post(url_parse, data=data, headers=headers, cookies=cookie, timeout=3)  # POST for this page's data
        time.sleep(5)                                 # pause between pages to avoid being blocked again
        response.encoding = response.apparent_encoding
        text = json.loads(response.text)
        info = text["content"]["positionResult"]["result"]
        for i in info:
            print(i["companyFullName"])
            companyFullName = i["companyFullName"]
            print(i["positionName"])
            positionName = i["positionName"]
            print(i["salary"])
            salary = i["salary"]
            print(i["companySize"])
            companySize = i["companySize"]
            print(i["skillLables"])
            skillLables = i["skillLables"]
            print(i["createTime"])
            createTime = i["createTime"]
            print(i["district"])
            district = i["district"]
            print(i["stationname"])
            stationname = i["stationname"]


if __name__ == '__main__':
    main()
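The inner loop above only prints each field and leaves it in a throwaway local variable. If you want to keep the results, one option (not part of the original script; the file name below is just a placeholder) is to dump each page's result list to a CSV with csv.DictWriter:

    import csv

    FIELDS = ["companyFullName", "positionName", "salary", "companySize",
              "skillLables", "createTime", "district", "stationname"]

    def save_rows(info, path="lagou_运维.csv"):
        # info is the text["content"]["positionResult"]["result"] list from the script above
        with open(path, "a", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDS, extrasaction="ignore")
            if f.tell() == 0:      # write the header only when the file is new/empty
                writer.writeheader()
            # skillLables is a list; DictWriter writes its repr, which is fine for a quick dump
            writer.writerows(info)

Calling save_rows(info) inside the page loop, right after info is extracted, appends each page of results to the same file.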
