Images crawled successfully with urllib2 won't open

After eight hours of trial and error, I finally decided to write this post!

I'm just getting started with web scraping, and I set out to crawl the images from Baidu's 美女吧 (a Tieba forum). The images downloaded fine, but afterwards they wouldn't open. Here's the code:

import urllib
import urllib2
from lxml import etree

def loadPage(url):
    """
    Purpose: request the given url and extract the thread links from the response
    url: the list-page url to crawl
    """
    print('Downloading page...')
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"
    }
    request = urllib2.Request(url, headers=ua_headers)
    html = urllib2.urlopen(request).read()
    # print html
    content = etree.HTML(html)
    # each thread on the list page sits in a div with class "t_con cleafix"
    link_list = content.xpath('//div[@class="t_con cleafix"]/div[2]/div[1]/div[1]/a/@href')
    for link in link_list:
        fulurl = 'http://tieba.baidu.com' + link
        loadImage(fulurl)

def loadImage(url):
    """
    Purpose: request a thread page and extract the url of every image in it
    url: the thread url
    """
    print('Downloading images...')
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"
    }
    request = urllib2.Request(url, headers=ua_headers)
    html = urllib2.urlopen(request).read()
    content = etree.HTML(html)
    # images posted in a thread carry the BDE_Image class
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    for link in link_list:
        print(link)
        writeImage(link)

def writeImage(url):
    """
    Purpose: download one image and write it to a local file
    url: the image url
    """
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/35.0.1916.114 Safari/537.36',
        'Cookie': 'AspxAutoDetectCookieSupport=1'
    }
    request = urllib2.Request(url, headers=ua_headers)
    response = urllib2.urlopen(request)
    image = response.read()
    # use the last 10 characters of the url as the file name
    filename = url[-10:]
    print('Saving ' + filename)
    with open(filename, "wb") as f:
        f.write(image)
    print(filename + ' saved')

def tiebaSpider(url, beginPage, endPage):
    """
    Purpose: spider scheduler; builds and dispatches the url of every list page
    url: the fixed prefix of the tieba url
    beginPage: first page to crawl
    endPage: last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        # each list page holds 50 threads, so the pn offset grows in steps of 50
        pn = (page - 1) * 50
        fulurl = url + "&pn=" + str(pn)
        loadPage(fulurl)
    print('Thanks for using!')

if __name__ == '__main__':
    kw = raw_input('Enter the name of the tieba to crawl: ')
    beginPage = int(raw_input('Enter the start page: '))
    endPage = int(raw_input('Enter the end page: '))

    url = 'http://tieba.baidu.com/f?'
    key = urllib.urlencode({"kw": kw})
    fulurl = url + key
    tiebaSpider(fulurl, beginPage, endPage)
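
As a side note, urllib.urlencode is what turns the keyword into the percent-encoded kw= query string appended to the base url. A quick interactive check (assuming Python 2 and a UTF-8 terminal, where raw_input returns UTF-8 bytes; the keyword below is just an example):

>>> import urllib
>>> urllib.urlencode({"kw": "美女"})
'kw=%E7%BE%8E%E5%A5%B3'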

It turned out the culprit was that writeImage()'s parameter name didn't match the name used inside the function body. My original version looked like this:

def writeImage(link):
    """
    Purpose: download one image and write it to a local file
    """
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/35.0.1916.114 Safari/537.36',
        'Cookie': 'AspxAutoDetectCookieSupport=1'
    }
    # BUG: the parameter is named `link`, but every line below uses `url`
    request = urllib2.Request(url, headers=ua_headers)
    response = urllib2.urlopen(request)
    image = response.read()
    filename = url[-10:]
    print('Saving ' + filename)
    with open(filename, "wb") as f:
        f.write(image)
    print(filename + ' saved')
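
The sneaky part is that this never raises a NameError: since `url` is also assigned at module level (inside the `if __name__ == '__main__':` block), the unbound name in the function body silently resolves to that global. So every call ignored its argument, requested the wrong page, and wrote bytes that weren't image data, which is presumably why the saved files wouldn't open. A minimal sketch of the pitfall (the image url here is just an illustrative example):

url = 'http://tieba.baidu.com/f?'   # module-level name, as in the spider

def writeImage(link):
    # `link` is accepted but never used; the unbound name `url`
    # falls back to the global above instead of raising NameError
    print('fetching ' + url)

writeImage('http://example.com/pic/abc.jpg')   # prints: fetching http://tieba.baidu.com/f?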

After renaming the parameter so it matches the body (the def writeImage(url) version shown in the full listing above), the crawled images could finally be viewed normally! Here are the results:
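
Incidentally, filename = url[-10:] is a fragile way to name the files: the slice can land mid-path and pick up characters that aren't valid in file names, or collide across images. A slightly safer sketch, assuming the image urls end in a regular basename (filenameFromUrl is a hypothetical helper, not part of the original code):

import posixpath
import urlparse

def filenameFromUrl(url):
    # take the basename of the url path, e.g. '.../item/abc123.jpg' -> 'abc123.jpg'
    return posixpath.basename(urlparse.urlparse(url).path)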