原文地址: http://www.30daydo.com/article/56
由于平时爱好摄影,所以喜欢看色影无忌论坛的获奖摄影作品,于是写了个小脚本用来抓取上面的获奖图片,亲自测试可以使用。
脚本会自动抓取全部的获奖图片。
完整代码:
#-*-coding=utf-8-*-
__author__ = 'rocchen'
from bs4 import BeautifulSoup
import urllib2,sys,StringIO,gzip,time,random,re,urllib,os
reload(sys)
sys.setdefaultencoding('utf-8')
class Xitek():
def __init__(self):
self.url="http://photo.xitek.com/"
user_agent="Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
self.headers={"User-Agent":user_agent}
self.last_page=self.__get_last_page()
def \_\_get\_last\_page(self):
html=self.\_\_getContentAuto(self.url)
bs=BeautifulSoup(html,"html.parser")
page=bs.find\_all('a',class\_="blast")
last\_page=page\[0\]\['href'\].split('/')\[-1\]
return int(last\_page)
def \_\_getContentAuto(self,url):
req=urllib2.Request(url,headers=self.headers)
resp=urllib2.urlopen(req)
#time.sleep(2\*random.random())
content=resp.read()
info=resp.info().get("Content-Encoding")
if info==None:
return content
else:
t=StringIO.StringIO(content)
gziper=gzip.GzipFile(fileobj=t)
html = gziper.read()
return html
#def \_\_getFileName(self,stream):
def \_\_download(self,url):
p=re.compile(r'href="(/photoid/\\d+)"')
#html=self.\_\_getContentNoZip(url)
html=self.\_\_getContentAuto(url)
content = p.findall(html)
for i in content:
print i
photoid=self.\_\_getContentAuto(self.url+i)
bs=BeautifulSoup(photoid,"html.parser")
final\_link=bs.find('img',class\_="mimg")\['src'\]
print final\_link
#pic\_stream=self.\_\_getContentAuto(final\_link)
title=bs.title.string.strip()
filename = re.sub('\[\\/:\*?"<>|\]', '-', title)
filename=filename+'.jpg'
urllib.urlretrieve(final\_link,filename)
#f=open(filename,'w')
#f.write(pic\_stream)
#f.close()
#print html
#bs=BeautifulSoup(html,"html.parser")
#content=bs.find\_all(p)
#for i in content:
# print i
'''
print bs.title
element\_link=bs.find\_all('div',class\_="element")
print len(element\_link)
k=1
for href in element\_link:
#print type(href)
#print href.tag
'''
'''
if href.children\[0\]:
print href.children\[0\]
'''
'''
t=0
for i in href.children:
#if i.a:
if t==0:
#print k
if i\['href'\]
print link
if p.findall(link):
full\_path=self.url\[0:len(self.url)-1\]+link
sub\_html=self.\_\_getContent(full\_path)
bs=BeautifulSoup(sub\_html,"html.parser")
final\_link=bs.find('img',class\_="mimg")\['src'\]
#time.sleep(2\*random.random())
print final\_link
#k=k+1
#print type(i)
#print i.tag
#if hasattr(i,"href"):
#print i\['href'\]
#print i.tag
t=t+1
#print "\*"
'''
'''
if href:
if href.children:
print href.children\[0\]
'''
#print "one element link"
def getPhoto(self):
start=0
#use style/0
photo\_url="http://photo.xitek.com/style/0/p/"
for i in range(start,self.last\_page+1):
url=photo\_url+str(i)
print url
#time.sleep(1)
self.\_\_download(url)
'''
url="http://photo.xitek.com/style/0/p/10"
self.\_\_download(url)
'''
#url="http://photo.xitek.com/style/0/p/0"
#html=self.\_\_getContent(url)
#url="http://photo.xitek.com/"
#html=self.\_\_getContentNoZip(url)
#print html
#'''
def main():
    """Create a ./content working directory and run the scraper in it."""
    target_dir = os.path.join(os.getcwd(), "content")
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    # Downloaded images are saved to the current directory, so move there.
    os.chdir(target_dir)
    scraper = Xitek()
    scraper.getPhoto()


if __name__ == "__main__":
    main()
具体解说请移步原文: http://www.30daydo.com/article/56
手机扫一扫
移动阅读更方便
你可能感兴趣的文章