线程池抓取lianjia
阅读原文时间:2023年07月08日阅读:5

1. 线程池的应用

# Standard library.
# multiprocessing.dummy.Pool exposes the Pool API backed by THREADS,
# appropriate here because the workload is network-I/O-bound.
from multiprocessing.dummy import Pool

# Third party.
import pymongo
import requests
from lxml import etree

# NOTE(review): the original pasted the Pool import twice, bound two
# throwaway module-level `url` variables, and issued a live HTTP request
# at import time (`res = requests.get(...)`) whose result was never used.
# None of that state was read by the Ljia class below, so the dead
# module-level side effects were removed and the imports de-duplicated.

class Ljia():
    """Scrape second-hand housing listings from Lianjia (Shenzhen) with a
    thread pool and store them in MongoDB (database ``Ljia``, collection
    ``lianjia``).

    Usage: ``Ljia().run()`` — fetches pages 1-100 concurrently, extracts
    every listing on each page, and bulk-inserts the results.
    """

    def __init__(self):
        # First results page; pages 2+ use the pg<N> URL pattern
        # produced by get_page_url().
        self.start_url = "https://sz.lianjia.com/ershoufang/co32/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        }
        # Default connection: localhost:27017.
        self.client = pymongo.MongoClient()
        self.collection = self.client.Ljia.lianjia
        # Backward-compatible alias: the original attribute name was the
        # misspelled `collention`; keep it pointing at the same object.
        self.collention = self.collection

    def get_content(self, url):
        """Fetch one listings page and return a list of item dicts.

        Bug fix: the original ``return item`` sat INSIDE the for-loop, so
        only the first listing of each page was ever extracted; all
        listings are now collected and returned.
        """
        html = requests.get(url, headers=self.headers)
        tree = etree.HTML(html.text)
        li_list = tree.xpath('//ul[@class="sellListContent"]/li')
        items = []
        for li in li_list:
            item = {}
            item['title'] = li.xpath(".//div[@class='title']/a/text()")[0]
            item['detail_url'] = li.xpath(".//div[@class='houseInfo']/a/@href")[0]
            # For the multi-node text fields, strip whitespace-only
            # fragments and join the remainder into one string.
            item['houseInfo'] = ''.join(
                t.strip() for t in li.xpath(".//div[@class='houseInfo']//text()") if t.strip())
            item['totalPrice'] = ''.join(
                t.strip() for t in li.xpath(".//div[@class='totalPrice']//text()") if t.strip())
            item['Price'] = li.xpath(".//div[@class='unitPrice']/span/text()")[0]
            item['followInfo'] = ''.join(
                t.strip() for t in li.xpath(".//div[@class='followInfo']//text()") if t.strip())
            print(item)
            items.append(item)
        return items

    def get_page_url(self):
        """Yield the 100 listing-page URLs to scrape (page 1 first)."""
        yield self.start_url
        for i in range(2, 101):
            url = "https://sz.lianjia.com/ershoufang/pg%sco32/" % i
            print('正在抓取:=============%s' % url)
            yield url

    def save_data(self, items):
        """Persist scraped data; accepts a list of dicts or a single dict.

        Uses insert_many/insert_one in place of the deprecated
        ``Collection.insert`` (removed in modern pymongo).
        """
        if not items:
            print('数据不存在===========')
        elif isinstance(items, list):
            self.collection.insert_many(items)
        else:
            self.collection.insert_one(items)

    def run(self):
        """Scrape all pages with a 5-thread pool, then store the results."""
        pool = Pool(5)
        try:
            # Fan the page URLs out across the pool; each worker returns
            # the list of items for its page.
            data = pool.map(self.get_content, self.get_page_url())
            pool.map(self.save_data, data)
        finally:
            # The original leaked the pool; always release its threads.
            pool.close()
            pool.join()


if __name__ == '__main__':
    lian_jia = Ljia()
    lian_jia.run()