经过8个小时的摸索,终于决定写下此随笔!
初学爬虫,准备爬取百度美女吧的图片,爬取图片之后发现打不开,上代码:
import urllib
import urllib2
from lxml import etree
def loadPage(url):
    """Fetch one tieba listing page and crawl every thread linked from it.

    url: full URL of the listing page to download.
    """
    print('正在下载')
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"
    }
    request = urllib2.Request(url, headers=ua_headers)
    html = urllib2.urlopen(request).read()
    content = etree.HTML(html)
    # Each row of the listing table holds the thread link in its title cell.
    # NOTE(review): "cleafix" looks like a typo for "clearfix" but it must
    # match the page's actual class attribute — confirm against live markup.
    link_list = content.xpath('//div[@class="t_con cleafix"]/div[2]/div[1]/div[1]/a/@href')
    for link in link_list:
        # hrefs are site-relative (e.g. "/p/123456"); prepend the host.
        fulurl = 'http://tieba.baidu.com' + link
        loadImage(fulurl)
def loadImage(url):
    """Fetch one thread page and save every image embedded in its posts.

    url: full URL of the thread page to download.
    """
    print('正在下载图片')
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"
    }
    request = urllib2.Request(url, headers=ua_headers)
    html = urllib2.urlopen(request).read()
    content = etree.HTML(html)
    # BDE_Image is the class tieba puts on images embedded in posts;
    # @src gives the direct image URL.
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    for link in link_list:
        print(link)
        writeImage(link)
def writeImage(url):
    """Download one image and write it to the current directory.

    url: direct URL of the image to download.
    """
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
        'Cookie': 'AspxAutoDetectCookieSupport=1'
    }
    request = urllib2.Request(url, headers=ua_headers)
    response = urllib2.urlopen(request)
    image = response.read()
    # Name the file after the tail of the URL. Take the basename FIRST so
    # that a short final path segment can never leave a '/' inside the
    # filename (url[-10:] alone could, and open() would then fail).
    filename = url.split('/')[-1][-10:]
    print('正在保存' + filename)
    # Binary mode is required: the payload is raw image bytes.
    with open(filename, "wb") as f:
        f.write(image)
    print(filename + '已保存')
def tiebaSpider(url, beginPage, endPage):
    """Crawl scheduler: build and fetch the URL of each listing page.

    url: tieba URL prefix (already carries the kw= query string).
    beginPage: first page number, 1-based.
    endPage: last page number, inclusive.
    """
    for page in range(beginPage, endPage + 1):
        # Tieba paginates with pn = 50 * (page - 1): 50 threads per page.
        pn = (page - 1) * 50
        fulurl = url + "&pn=" + str(pn)
        loadPage(fulurl)
    print('谢谢使用!')
if __name__ == '__main__':
    # Interactive entry point: ask which tieba to crawl and the page range.
    kw = raw_input('请输入需要爬取的贴吧名:')
    beginPage = int(raw_input('请输入起始页:'))
    endPage = int(raw_input('请输入结束页:'))
    url = 'http://tieba.baidu.com/f?'
    # urlencode percent-encodes the (typically Chinese) tieba name so it is
    # safe to embed in the query string.
    key = urllib.urlencode({"kw": kw})
    fulurl = url + key
    tiebaSpider(fulurl, beginPage, endPage)
后来发现问题出在 writeImage() 的形参名与函数体中实际使用的变量名不一致。修正后的函数如下:
def writeImage(link):
    """Download one image and write it to the current directory.

    link: direct URL of the image to download.
    """
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
        'Cookie': 'AspxAutoDetectCookieSupport=1'
    }
    # Use the parameter consistently: the previous body referenced an
    # undefined name `url` while the parameter was `link`, which raised
    # NameError as soon as the function was called.
    request = urllib2.Request(link, headers=ua_headers)
    response = urllib2.urlopen(request)
    image = response.read()
    filename = link[-10:]
    print('正在保存' + filename)
    # Binary mode is required: the payload is raw image bytes.
    with open(filename, "wb") as f:
        f.write(image)
    print(filename + '已保存')
将参数改成跟函数体内一致后,爬取的图片总算可以正常查看了!下面看看成果吧:
手机扫一扫
移动阅读更方便
你可能感兴趣的文章