# 1. 线程池的应用
# Standard library: Pool from multiprocessing.dummy is backed by threads.
from multiprocessing.dummy import Pool

# Third-party dependencies.
import pymongo
import requests
from lxml import etree

# Listing page requested by the module-level call below.
url = "https://sz.lianjia.com/ershoufang/co32/"

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# NOTE(review): this request is issued at import time and `res` is not used
# anywhere in the visible code -- confirm whether it can be dropped.
res = requests.get(url=url, headers=headers)
class Ljia():
    """Scrape Lianjia listing pages with a thread pool and store them in MongoDB.

    Workflow (see ``run``): generate the page URLs, fetch and parse every page
    concurrently, then write the parsed listings to the ``Ljia.lianjia``
    collection.
    """

    def __init__(self):
        """Set the start URL and request headers and open the MongoDB client."""
        self.start_url = "https://sz.lianjia.com/ershoufang/co32/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        }
        # MongoClient() without arguments uses the driver's default host/port.
        self.client = pymongo.MongoClient()
        # NOTE: "collention" is a misspelling of "collection"; the attribute
        # name is kept so that existing references to it keep working.
        self.collention = self.client.Ljia.lianjia

    @staticmethod
    def _first(values):
        """Return the first XPath match, or '' when nothing matched."""
        return values[0] if values else ''

    @staticmethod
    def _joined_text(node, path):
        """Join the stripped, non-blank text fragments selected by ``path``."""
        return ''.join(text.strip() for text in node.xpath(path) if text.strip())

    def get_content(self, url):
        """Fetch one listing page and extract every listing on it.

        Args:
            url: Address of the listing page to download.

        Returns:
            A list with one dict per listing (keys: ``title``, ``detail_url``,
            ``houseInfo``, ``totalPrice``, ``Price``, ``followInfo``). The list
            is empty when the request fails or the page contains no listings.
        """
        try:
            # A timeout keeps a stalled connection from blocking a worker forever.
            html = requests.get(url, headers=self.headers, timeout=10)
        except requests.RequestException as exc:
            # One bad page must not abort the whole pool.map() run.
            print('抓取失败===========%s: %s' % (url, exc))
            return []

        tree = etree.HTML(html.text)
        if tree is None:
            # Defensive: nothing to query if the body did not parse into a tree
            # (TODO confirm lxml's behaviour for empty responses).
            return []

        items = []
        for li in tree.xpath('//ul[@class="sellListContent"]/li'):
            item = {
                'title': self._first(li.xpath(".//div[@class='title']/a/text()")),
                'detail_url': self._first(li.xpath(".//div[@class='houseInfo']/a/@href")),
                'houseInfo': self._joined_text(li, ".//div[@class='houseInfo']//text()"),
                'totalPrice': self._joined_text(li, ".//div[@class='totalPrice']//text()"),
                'Price': self._first(li.xpath(".//div[@class='unitPrice']/span/text()")),
                'followInfo': self._joined_text(li, ".//div[@class='followInfo']//text()"),
            }
            if not any(item.values()):
                # Nothing was extracted from this <li> (presumably not a real
                # listing entry), so there is nothing worth storing.
                continue
            print(item)
            items.append(item)
        return items

    def get_page_url(self, last_page=100):
        """Yield the URLs of the listing pages to scrape.

        Args:
            last_page: Number of the last page to generate (inclusive).

        Yields:
            The start URL first, then the URLs of pages 2..``last_page``.
        """
        yield self.start_url
        for i in range(2, last_page + 1):
            url = "https://sz.lianjia.com/ershoufang/pg%sco32/" % i
            print('正在抓取:=============%s' % url)
            yield url

    def save_data(self, item):
        """Store scraped data in MongoDB.

        Args:
            item: Either a single listing dict or a list of listing dicts
                (as returned by ``get_content``). Empty or ``None`` input is
                reported and skipped.
        """
        if not item:
            print('数据不存在===========')
            return
        # Collection.insert() was removed in PyMongo 4; use the explicit
        # single-document / multi-document calls instead.
        if isinstance(item, dict):
            self.collention.insert_one(item)
        else:
            self.collention.insert_many(list(item))

    def run(self):
        """Scrape all pages and store the results, using a pool of 5 threads."""
        pool = Pool(5)
        try:
            # Fetch and parse the pages concurrently: one list of items per page.
            data = pool.map(self.get_content, self.get_page_url())
            # Store each page's items concurrently.
            pool.map(self.save_data, data)
        finally:
            # Always release the worker threads, even if a task raised.
            pool.close()
            pool.join()


if __name__ == '__main__':
    lian_jia = Ljia()
    lian_jia.run()
# 手机扫一扫
# 移动阅读更方便
# 你可能感兴趣的文章