python爬虫(一)抓取 色影无忌图片
阅读原文时间:2023年07月12日阅读:1

原文地址: http://www.30daydo.com/article/56

由于平时爱好摄影,所以喜欢看看色影无忌论坛的获奖摄影作品,于是写了个小脚本(script)用来抓取上面的获奖图片,亲自测试可以使用。

自动抓取全部的获奖图片

完整代码:

#-*-coding=utf-8-*-
__author__ = 'rocchen'
from bs4 import BeautifulSoup
import urllib2,sys,StringIO,gzip,time,random,re,urllib,os
reload(sys)
sys.setdefaultencoding('utf-8')
class Xitek(object):
    """Crawler that saves the photos listed under photo.xitek.com/style/0.

    Constructing an instance fetches the site's front page to find the
    number of the last listing page. ``getPhoto`` then walks the listing
    pages and stores every photo in the current working directory.

    The code targets Python 2 (``urllib2``, ``StringIO``); ``print`` is
    always called with a single parenthesised argument so the statement
    form prints exactly that value.
    """

    def __init__(self):
        self.url = "http://photo.xitek.com/"
        user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        self.headers = {"User-Agent": user_agent}
        # Performs a network request: the front page is downloaded here.
        self.last_page = self.__get_last_page()

    def __get_last_page(self):
        """Return the number of the last listing page as an int.

        The value is the final path segment of the ``href`` of the first
        ``<a class="blast">`` element on the front page. Raises IndexError
        when the page contains no such anchor.
        """
        html = self.__getContentAuto(self.url)
        bs = BeautifulSoup(html, "html.parser")
        page = bs.find_all('a', class_="blast")
        last_page = page[0]['href'].split('/')[-1]
        return int(last_page)

    def __getContentAuto(self, url):
        """Fetch ``url`` with the configured headers and return the body.

        The body is gunzipped when the response declares a gzip
        ``Content-Encoding``; otherwise the raw bytes are returned.
        """
        req = urllib2.Request(url, headers=self.headers)
        resp = urllib2.urlopen(req)
        try:
            content = resp.read()
            encoding = resp.info().get("Content-Encoding")
        finally:
            # Release the connection even if reading fails.
            resp.close()
        # Only attempt gzip decoding for gzip payloads; any other encoding
        # value would make GzipFile fail on perfectly valid data.
        if encoding is None or 'gzip' not in encoding.lower():
            return content
        buf = StringIO.StringIO(content)
        return gzip.GzipFile(fileobj=buf).read()

    def __download(self, url):
        """Download every photo linked from the listing page at ``url``.

        Each ``/photoid/<digits>`` link on the listing page is opened, the
        ``src`` of its ``<img class="mimg">`` is retrieved and saved as
        ``<page title>.jpg`` in the current working directory.
        """
        link_pattern = re.compile(r'href="(/photoid/\d+)"')
        # Characters that are not allowed in Windows file names, including
        # the backslash itself.
        unsafe_chars = re.compile(r'[\\/:*?"<>|]')
        # self.url ends with '/', and every link starts with '/'; strip one
        # so the joined URL does not contain a double slash.
        base = self.url.rstrip('/')

        html = self.__getContentAuto(url)
        for link in link_pattern.findall(html):
            print(link)

            photo_page = self.__getContentAuto(base + link)
            bs = BeautifulSoup(photo_page, "html.parser")
            img = bs.find('img', class_="mimg")
            if img is None or not img.get('src'):
                # One malformed photo page should not abort the whole crawl.
                print("no image found on %s, skipped" % link)
                continue
            final_link = img['src']
            print(final_link)

            title = bs.title.string if bs.title is not None else None
            # Fall back to the numeric photo id when the page has no title.
            name = title.strip() if title else link.split('/')[-1]
            filename = unsafe_chars.sub('-', name) + '.jpg'
            urllib.urlretrieve(final_link, filename)

    def getPhoto(self, start=0):
        """Download the photos of listing pages ``start`` .. ``last_page``.

        ``start`` defaults to 0 (the first page); pass a larger value to
        resume an interrupted run.
        """
        # use style/0
        photo_url = self.url + "style/0/p/"
        for i in range(start, self.last_page + 1):
            url = photo_url + str(i)
            print(url)
            self.__download(url)

def main():
    """Create ./content if needed, enter it, and download all photos there."""
    sub_folder = os.path.join(os.getcwd(), "content")
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)
    # Xitek saves each file under a bare file name, so the target folder
    # has to be the current working directory.
    os.chdir(sub_folder)
    obj = Xitek()
    obj.getPhoto()

# Run the crawler only when the file is executed as a script.
if __name__ == "__main__":
    main()

具体解说请移步: 

http://www.30daydo.com/article/56

手机扫一扫

移动阅读更方便

阿里云服务器
腾讯云服务器
七牛云服务器

你可能感兴趣的文章