Python爬虫小结
阅读原文时间:2023年07月10日阅读:3

有些数据是没有专门的数据集的,为了找到神经网络训练所需的数据,自然而然地想到了用爬虫的方法来采集数据。一开始采用了网上的一个动态爬虫的代码,发现爬取的图片大多是重复的,有效图片很少。

动态爬虫:

from lxml import etree
import requests
import re
import urllib
import json
import time
import os

# Directory that downloaded images are written to; created if missing.
local_path = '/home/path/'
os.makedirs(local_path, exist_ok=True)

# Search keyword typed by the user; it is substituted into the `word=`
# query parameter of the first result page.
keyword = input('请输入想要搜索图片的关键字:')
first_url = 'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1530850407660_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1530850407660%5E00_1651X792&word={}'.format(keyword)

# Number of images to download, kept as the raw input string;
# download_img() converts it with int() when it checks the total.
want_download = input('请输入想要下载图片的张数:')

# Module-level counters that get_next_page() and download_img() rebind via
# their own `global` declarations. A `global` statement at module level has
# no effect, so none is needed here.
page_num = 1
download_num = 0

# Pick the file extension used when saving the image found at a URL.
def get_format(pic_url):
    """Return the extension to save ``pic_url`` under.

    The text after the last ``.`` in the URL is taken as the extension.
    If it is one of bmp/gif/jpg/png (compared case-insensitively) it is
    returned exactly as it appears in the URL; anything else, including a
    URL with no ``.`` at all, falls back to ``'jpg'``.
    """
    suffix = pic_url.split('.')[-1]
    if suffix.lower() in ('bmp', 'gif', 'jpg', 'png'):
        return suffix
    return 'jpg'

# Fetch a result page and work out the URL of the page that follows it.
def get_next_page(page_url):
    """Return the URL of the result page after ``page_url``, or ``None``.

    The page is fetched, its HTML is written to ``html_info.txt``
    (overwritten on every call), and the first ``<a class="n">`` link is
    taken as the "next page" link. The module-level ``page_num`` counter is
    incremented on every call, whether or not a next page was found.
    """
    global page_num
    # Same timeout as download_img() so a stalled connection cannot hang the run.
    html = requests.get(page_url, timeout=15).text
    with open('html_info.txt', 'w', encoding='utf-8') as h:
        h.write(html)
    selector = etree.HTML(html)
    try:
        msg = selector.xpath('//a[@class="n"]/@href')
        print(msg[0])
        next_page = 'http://image.baidu.com/' + msg[0]
        print('现在是第%d页' % (page_num + 1))
    except Exception as e:
        # Best effort: a missing link (empty xpath result) or an unparsable
        # page both mean there is nothing further to crawl.
        print('已经没有下一页了')
        print(e)
        next_page = None
    page_num = page_num + 1
    return next_page

# Download and save every image URL of one result page.
def download_img(pic_urls):
    """Download each URL in ``pic_urls`` into ``local_path``.

    Files are named ``page<page_num>_<count>.<ext>``, where ``count`` is the
    1-based position of the URL within ``pic_urls`` (failed downloads still
    consume a number) and ``ext`` comes from ``get_format``. The function
    sleeps one second before each request.

    Returns ``0`` as soon as the module-level ``download_num`` has reached
    ``want_download``; returns ``None`` when the list is exhausted first.
    Failures for a single URL are printed and skipped.
    """
    global download_num
    count = 1
    for pic_url in pic_urls:
        time.sleep(1)
        try:
            pic_format = get_format(pic_url)
            pic = requests.get(pic_url, timeout=15)
            # Do not save an HTTP error page under an image file name.
            pic.raise_for_status()
            with open(local_path + 'page%d_%d.%s' % (page_num, count, pic_format), 'wb') as f:
                f.write(pic.content)
            download_num = download_num + 1
        except Exception as e:
            print(e)
        count = count + 1
        # `>=` rather than `==` so the loop still stops if the counter is
        # already at or past the target when this check runs.
        if download_num >= int(want_download):
            return 0
    return None

# Extract the image URLs embedded in one result page.
def get_pic_urls(web_url):
    """Return the list of image URLs found in the page at ``web_url``.

    The page source is searched for every ``"objURL":"...",`` entry and the
    captured URL strings are returned in document order (an empty list when
    nothing matches).
    """
    # Same timeout as download_img() so a stalled connection cannot hang the run.
    html = requests.get(web_url, timeout=15).text
    return re.findall(r'"objURL":"(.*?)",', html, re.S)

if __name__ == "__main__":
    # Walk the result pages one at a time, downloading each page exactly
    # once. Fetching the current page and its successor in the same
    # iteration and then restarting from the successor would download every
    # page after the first twice and count those images twice in
    # download_num.
    current_url = first_url
    while True:
        pic_urls = get_pic_urls(current_url)
        # download_img() returns 0 once the requested number of images
        # has been reached.
        if download_img(pic_urls) == 0:
            break
        current_url = get_next_page(current_url)
        if current_url is None:
            print('已经没有更多图片')
            break
    #print('已经成功下载%d张图片' %download_num)

为了筛选出重复的图片,又采用了哈希算法进行去重:

# -*- coding: utf-8 -*-

import sys
# Python 2 only: force the interpreter's default string encoding to UTF-8.
# Python 3 has no builtin `reload` and no `sys.setdefaultencoding`, so the
# calls are skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

"""
用dhash判断是否相同照片
基于渐变比较的hash
hash可以省略(本文省略)
By Guanpx
"""
import os
from PIL import Image
from os import listdir

def picPostfix():
    """Return a new set of file extensions that are treated as images.

    Entries are lower-case except for ``'JPG'``, which is listed in both
    cases.
    """
    return {
        'bmp', 'jpg', 'png', 'tiff', 'gif', 'pcx', 'tga', 'exif',
        'fpx', 'svg', 'psd', 'cdr', 'pcd', 'dxf', 'ufo', 'eps', 'JPG',
        'raw', 'jpeg',
    }

def getDiff(width, high, image):
    """Return the left-to-right brightness gradient of ``image`` as booleans.

    ``image`` is resized to ``width`` x ``high`` and converted to mode
    ``'L'`` (grayscale). For every row, each pixel is compared with its
    right-hand neighbour; the result holds ``True`` where the left pixel is
    brighter. The returned list therefore has ``high * (width - 1)``
    entries, in row-major order, and can be compared with another image's
    list (or packed into a hash).

    ``image`` only needs ``resize``; the resized object needs ``convert``
    and the converted one ``getdata`` - the calls made below.
    """
    im = image.resize((width, high))
    imgray = im.convert('L')  # grayscale makes pixels directly comparable
    pixels = list(imgray.getdata())  # flat, row-major pixel values

    diff = []
    for row in range(high):
        rowStart = row * width  # index of the first pixel in this row
        for index in range(width - 1):
            leftIndex = rowStart + index
            rightIndex = leftIndex + 1
            diff.append(pixels[leftIndex] > pixels[rightIndex])
    return diff

def getHamming(diff=(), diff2=()):
    """Return the Hamming distance between two equal-length sequences.

    The distance is the number of positions at which ``diff`` and ``diff2``
    hold different values. Both arguments default to an empty (immutable)
    sequence, giving a distance of 0.

    Raises:
        ValueError: if the two sequences have different lengths, since the
            distance is not defined in that case.
    """
    if len(diff) != len(diff2):
        raise ValueError(
            'sequences must have the same length: {} != {}'.format(
                len(diff), len(diff2)))
    return sum(1 for left, right in zip(diff, diff2) if left != right)

if __name__ == '__main__':

    width = 32
    high = 32  # size every image is shrunk to before comparison
    dirName = "/home/yourpath"  # album directory
    allDiff = []  # (file name, gradient list) for every readable image
    postFix = picPostfix()  # known image suffixes

    for fileName in os.listdir(dirName):
        # Compare the suffix in lower case so e.g. "a.PNG" is not skipped.
        if str(fileName).split('.')[-1].lower() not in postFix:
            continue
        filePath = '%s/%s' % (dirName, fileName)
        try:
            # The context manager closes the file handle again; computing
            # the gradient inside the try also catches images that open but
            # fail while their pixel data is being read.
            with Image.open(filePath) as im:
                diff = getDiff(width, high, im)
        except (OSError, IOError) as err:
            os.remove(filePath)  # unreadable image: delete it
            print('OS error : {}'.format(err))
        except IndexError as err:
            os.remove(filePath)
            print('Index Error: {}'.format(err))
        else:
            allDiff.append((str(fileName), diff))

    # Indexes into allDiff whose files were already deleted as duplicates.
    # They are skipped both as anchor (i) and as candidate (j); otherwise a
    # deleted image keeps pulling further images into the deletion chain
    # (A~B and B~C would delete C even when A and C differ).
    removed = set()
    for i in range(len(allDiff)):
        if i in removed:
            continue
        for j in range(i + 1, len(allDiff)):
            if j in removed:
                continue
            ans = getHamming(allDiff[i][1], allDiff[j][1])
            if ans <= 5:  # Hamming-distance threshold; tune for your data
                print(allDiff[i][0], "and", allDiff[j][0], "maybe same photo...")
                result = dirName + "/" + allDiff[j][0]
                if os.path.exists(result):
                    os.remove(result)
                removed.add(j)

用哈希算法筛选后又发现筛除得太多了,阈值不好控制。于是又尝试了静态爬虫的方法,发现结果还不错,重复的图片也不多,也就省去了筛除的步骤。

静态爬虫:

# -*- coding: utf-8 -*-
import sys
# Python 2 only: force the interpreter's default string encoding to UTF-8.
# Python 3 has no builtin `reload` and no `sys.setdefaultencoding`, so the
# calls are skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')
import time
# 导入需要的库
import requests
# import os
import json
import time

# Query the Baidu image-search JSON endpoint page by page.
def getManyPages(keyword, pages):
    """Return the ``data`` payload of ``pages`` result pages for ``keyword``.

    Args:
        keyword: search term, sent as both ``queryWord`` and ``word``.
        pages: number of result pages to request; each request asks for 30
            results (``rn``) and the ``pn`` offset runs 30, 60, ...,
            ``30 * pages``.

    Returns:
        A list with one entry per successful request: the value of the
        ``data`` key of the decoded JSON response. Requests that fail, or
        whose response has no ``data`` key, are reported/skipped so that the
        result never contains ``None``. ``pages <= 0`` yields ``[]`` without
        any network access.
    """
    url = 'https://image.baidu.com/search/acjson'
    results = []
    for offset in range(30, 30 * pages + 30, 30):
        query = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '': '',
        }
        try:
            # The timeout keeps one stalled request from hanging the crawl.
            data = requests.get(url, params=query, timeout=15).json().get('data')
        except OSError as err:
            print('OS error : {}'.format(err))
        except IndexError as err:
            print('Index Error: {}'.format(err))
        except IOError as err:
            # On Python 3 IOError is an alias of OSError, so this branch can
            # only fire on Python 2.
            print('IOError : {}'.format(err))
        except Exception:
            # e.g. a response body that is not valid JSON
            print('Other error')
        else:
            if data is not None:
                results.append(data)
    return results

# Download the thumbnails of the collected result pages and save them.
def getImg(dataList, localPath):
    """Save every thumbnail referenced in ``dataList`` under ``localPath``.

    Args:
        dataList: iterable of result pages; each page is an iterable of
            dict-like entries whose ``thumbURL`` value (if present) is the
            image address. Empty or ``None`` pages are skipped.
        localPath: target directory; created (including missing parents)
            when it does not exist.

    Files are written as ``0.jpg``, ``1.jpg``, ... in download order; the
    counter restarts at 0 on every call. Entries without a ``thumbURL``
    are reported and skipped.
    """
    # Imported locally because this script only imports os inside its
    # __main__ block, which would leave the name undefined when the
    # function is used from another module.
    import os

    if not os.path.exists(localPath):
        os.makedirs(localPath)
    x = 0
    for page in dataList:
        if not page:
            continue
        for item in page:
            thumb_url = item.get('thumbURL')
            if thumb_url is None:
                print('图片链接不存在')
                continue
            ir = requests.get(thumb_url)
            # `with` guarantees the file is flushed and closed.
            with open(localPath + '/' + '%d.jpg' % x, 'wb') as out:
                out.write(ir.content)
            x += 1

# Download images for every keyword directory found below father_path.
# Expected layout: father_path/<group>/<keyword>/ ; the name of each
# innermost directory is used as the search keyword, and only directories
# that are still empty are filled.
if __name__ == '__main__':
    import os
    father_path = "/home/yourpath/"
    t0 = time.time()
    for init in os.listdir(father_path):
        print('init is{}'.format(str(init)))
        # List the group directory by its full path; the bare name would be
        # resolved against the current working directory instead.
        for name in os.listdir(os.path.join(father_path, init)):
            print('name is{}'.format(str(name)))
            t1 = time.time()
            target_dir = os.path.join(father_path, init, name)
            if not os.listdir(target_dir):
                dataList = getManyPages(name, 30)
                getImg(dataList, target_dir)
            t2 = time.time()
            print('cost time is', t2 - t1)
    t3 = time.time()
    print('total time is', t3 - t0)
# t1 = time.time()
# dataList = getManyPages('keyword', page_number) # 参数1:关键字,参数2:要下载的页数
# getImg(dataList, './file_path/') # 参数2:指定保存的路径
# t2 = time.time()
# print('cost time is', t2 - t1)
#
# parent_name = "/home/path" # 相册路径
# dirList = os.listdir(parent_name) # 所有文件夹的列表
# for one_file in dirList: # 其中的一个文件夹
# # son_list = os.listdir(one_file)
# son_list = os.path.join(parent_name, one_file)
# son_file = os.listdir(son_list)
# t1 = time.time()