import math
import re
import requests
from lxml import etree
# Template for the 7 category index pages (park1 … park7).
# Renamed from `type`, which shadowed the builtin of the same name.
URL_TEMPLATE = "https://www.cnrepark.com/gyy/park{}/"
# NOTE(review): urlList is not referenced again in this file; kept in case
# an external consumer imports it.
urlList = [URL_TEMPLATE.format(i) for i in range(1, 8)]
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Run Chrome headless for unattended crawling.
options = Options()
options.add_argument('--headless')
# FIX: `chrome_options=` was deprecated and removed in Selenium 4;
# the supported keyword is `options=`.
br = webdriver.Chrome(options=options)
class ChanyeList(object):
    """Scraper for cnrepark.com industrial-park listing and detail pages.

    All page downloads go through a selenium WebDriver supplied at
    construction time; parsing is done with lxml XPath.
    """

    # Kept for reference; the requests-based download path was replaced
    # by selenium, but external code may still read these attributes.
    User_Agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    Refer = "https://www.cnrepark.com"

    def __init__(self, br):
        """Store the selenium WebDriver used for all downloads."""
        self.br = br

    def selenium_download(self, url):
        """Fetch *url* with the driver and return the rendered page source.

        FIX: the original called the module-level ``br`` instead of
        ``self.br``, defeating the injected driver.
        """
        self.br.get(url)
        return self.br.page_source

    def get_total_numbers(self, url):
        """Return the total item count (string of digits) shown in div.nw_num."""
        content = self.selenium_download(url)
        html = etree.HTML(content)
        total_txt = html.xpath('.//div[@class="nw_num"]/text()')
        # First run of digits in the text is the total count.
        # (Removed a redundant method-local `import re`; re is imported
        # at module level.)
        return re.findall(r'\d+', total_txt[0])[0]

    def html_parse_list(self, content):
        """Parse a category listing page into [{'title','href','src'}, ...]."""
        html = etree.HTML(content)
        # Renamed local from `list`, which shadowed the builtin.
        items = []
        for node in html.xpath('.//div[@class="area"]//div[@class="con_lst"]'):
            src = node.xpath('./div/a/img/@src')
            href = node.xpath('./div//h2/a/@href')
            title = node.xpath('./div//h2/a/text()')
            items.append({'title': title[0], 'href': href[0], "src": src[0]})
        return items

    def optimizeContent(self, res):
        """Strip ``str(bytes)`` artifacts (leading b', literal \\n, quotes)."""
        res = res.replace("b'", '')
        res = res.replace('\\n', '')
        res = res.replace("'", '')
        return res

    def _section_html(self, html, xpath_expr):
        """Serialize the first node matching *xpath_expr* as cleaned HTML.

        Raises IndexError if the section is absent; callers wrap in
        try/except at a higher level (see save_list).
        """
        node = html.xpath(xpath_expr)
        return self.optimizeContent(str(etree.tostring(node[0])))

    def html_parse_detail(self, content):
        """Extract the detail sections of a park page as cleaned HTML strings."""
        html = etree.HTML(content)
        # The five sections differ only in their XPath; factored through
        # _section_html to remove the copy-paste blocks.
        return {
            'detail': self._section_html(
                html, './/div[@class="right_nr"]/div[1]//div[@class="kfq_box"]/ul'),
            "regionalAdvantages": self._section_html(html, './/div[@id="tbc_81"]'),  # 区域优势
            "basicConfiguration": self._section_html(html, './/div[@id="tbc_82"]'),  # 基础配套
            "preferentialPolicy": self._section_html(html, './/div[@id="tbc_83"]'),  # 优惠政策
            "planningInformation": self._section_html(html, './/div[@id="tbc_84"]'),  # 规划建设
        }

    def crawl_url(self, url):
        """Download one listing page and return its parsed item list."""
        print("crawl page {}".format(url))
        listContent = self.selenium_download(url)
        return self.html_parse_list(listContent)

    def get_name(self, index):
        """Map a 0-based category index (0-6) to its Chinese category name."""
        nameList = [
            "特色园区",
            "创意园",
            "孵化基地",
            "商务园区",
            "生态园区",
            "综合乐园",
            "产业园转移区",
        ]
        return nameList[index]

    def save_list(self, items, type_index):
        """Download each item's detail page and save it as ./txt/<type>-<title>.html.

        Best-effort: any failure is printed and aborts the remaining items
        (behavior inherited from the original; the try wraps the loop).
        """
        try:
            for item in items:
                url = item['href']
                print("crawl url :" + url)
                content = self.selenium_download(url)
                detailList = self.html_parse_detail(content)
                item['title'] = self.validateTitle(item['title'])
                type_name = self.get_name(type_index)
                # Explicit UTF-8 so Chinese titles/content round-trip on
                # any platform (Windows default codec would fail here).
                path = "./txt/" + type_name + "-" + item['title'] + ".html"
                with open(path, "w", encoding="utf-8") as f:
                    f.write("<h2>{}</h2>".format(item['title']))
                    f.write("<div> <a href='{}'><img style='height:80px;height:80px;' src={} /></a></div>".format(item['href'], item['src']))
                    f.write("<p>{}</p>".format(detailList['detail']))
                    f.write("<p>{}</p><h3>区位优势:</h3>{}".format(detailList['detail'], detailList['regionalAdvantages']))
                    f.write("<p>{}</p><h3>基础配套:</h3>{}".format(detailList['detail'], detailList['basicConfiguration']))
                    f.write("<p>{}</p><h3>优惠政策:</h3>{}".format(detailList['detail'], detailList['preferentialPolicy']))
                    f.write("<p>{}</p><h3>规划建设:</h3>{}".format(detailList['detail'], detailList['planningInformation']))
                    f.write("<br>")
                # Removed redundant f.close(); the with-block closes the file.
        except Exception as e:
            print("Exception:" + str(e))

    def validateTitle(self, title):
        """Replace characters illegal in filenames ( / \\ : * ? " < > | ( ) ) with '_'."""
        rstr = r'[/\\:*?"<>|()]'
        return re.sub(rstr, "_", title)
if __name__ == "__main__":
    try:
        # One pass per category index (1-7); each category is paginated.
        for k in range(1, 8):
            chanyeList = ChanyeList(br)
            baseUrl = "https://www.cnrepark.com/gyy/park" + str(k) + "/?publishtime=desc&page={}"
            pageUrl = "https://www.cnrepark.com/gyy/park" + str(k) + "/"
            # Total item count -> page count; the site lists 13 items per page.
            totalNumbers = chanyeList.get_total_numbers(pageUrl)
            totalPage = math.ceil(int(totalNumbers) / 13)
            result = []
            for page in range(1, totalPage + 1):
                result.extend(chanyeList.crawl_url(baseUrl.format(page)))
            chanyeList.save_list(result, k - 1)
    except Exception as e:
        print(str(e))
    finally:
        # Always release the browser, even when a category fails;
        # the original quit the driver inside the try and could leak it.
        br.quit()
# (Stray blog-footer text removed so the file parses:
#  "手机扫一扫 / 移动阅读更方便 / 你可能感兴趣的文章" — not part of the program.)