import os
import re
from time import sleep
from urllib import request,parse
def request_by(url, page):
    """Build the HTTP request for one page of a paginated listing.

    Page 1 is addressed as ``<url>.html``; any other page is addressed as
    ``<url>_<page>.html``. The resulting URL is printed before returning.

    Args:
        url: Base URL of the listing, without the ``.html`` suffix.
        page: Page number; the value ``1`` selects the un-suffixed first page.

    Returns:
        A ``urllib.request.Request`` for the page URL carrying a desktop
        Chrome ``User-Agent`` header.
    """
    # Browser-like User-Agent (presumably so the site does not reject
    # urllib's default one -- not verifiable from this file).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'}
    suffix = ".html" if page == 1 else "_" + str(page) + ".html"
    page_url = url + suffix
    print("正在访问:",page_url)
    return request.Request(url=page_url, headers=headers)
def get_html_from(req, delay=1):
    """Fetch *req* and return the response body decoded as UTF-8.

    Args:
        req: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.
        delay: Seconds to sleep after the request completes, to throttle
            successive requests. Defaults to 1.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if read/decode raises.
    with request.urlopen(req) as res:
        html = res.read().decode("utf-8")
    # Pause after every request.
    sleep(delay)
    return html
# Captures the ``src2`` attribute of the first ``<img`` that follows each
# ``<div class="box picblock`` opener; re.S lets ``.`` match across newlines.
_PICBLOCK_IMG_PATTERN = re.compile(r'<div class="box picblock.*?<img src2="(.*?)"',re.S)


def anylasis_data(html):
    """Extract image URLs from the listing page markup.

    Args:
        html: The page HTML as a string.

    Returns:
        A list with the ``src2`` attribute value found after each
        ``<div class="box picblock`` opener, in document order; an empty
        list when nothing matches.
    """
    return _PICBLOCK_IMG_PATTERN.findall(html)
def download_imgs(imgs, save_dir="./meinv/", delay=1):
    """Download every image URL in *imgs* into *save_dir*.

    Each file is named after the last ``/``-separated segment of its URL.

    Args:
        imgs: Iterable of image URL strings.
        save_dir: Target directory; created if it does not exist.
            Defaults to ``"./meinv/"``.
        delay: Seconds to sleep after each download. Defaults to 1.

    Raises:
        urllib.error.URLError: If a download fails.
    """
    # urlretrieve() does not create missing directories, so make sure the
    # target exists before the first download.
    os.makedirs(save_dir, exist_ok=True)
    for img in imgs:
        # e.g. http://pic1.sc.chinaz.com/Files/pic/pic9/201904/zzpic17564_s.jpg
        # Derive the local file name from the URL's last path segment.
        img_name = img.split("/")[-1]
        print("正在下载图片:",img)
        request.urlretrieve(url=img, filename=os.path.join(save_dir, img_name))
        sleep(delay)
if __name__ == '__main__':
    # Base URL of the listing; request_by() appends the page suffix.
    page_url = "http://sc.chinaz.com/tupian/meinvxiezhen"
    # range(1, 2) yields only page 1; widen the range to crawl more pages.
    for i in range(1, 2):
        req = request_by(url=page_url, page=i)
        res = get_html_from(req)
        imgs = anylasis_data(res)
        download_imgs(imgs)
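# NOTE: the lines below are footer text scraped from the hosting web page, not part of the script.
# 手机扫一扫
# 移动阅读更方便
# 你可能感兴趣的文章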