import math
import re
import requests
from lxml import etree
# Template for the 7 category index pages (park1 … park7).
# Renamed from `type`, which shadowed the builtin of the same name.
URL_TEMPLATE = "https://www.cnrepark.com/gyy/park{}/"
# NOTE(review): urlList is not referenced again in this file; kept in case
# an external consumer imports it.
urlList = [URL_TEMPLATE.format(i) for i in range(1, 8)]
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Run Chrome headless for unattended crawling.
options = Options()
options.add_argument('--headless')
# FIX: `chrome_options=` was deprecated and removed in Selenium 4;
# the supported keyword is `options=`.
br = webdriver.Chrome(options=options)
class ChanyeList(object):
    """Scraper for cnrepark.com industrial-park listing and detail pages.

    All page downloads go through a selenium WebDriver supplied at
    construction time; parsing is done with lxml XPath.
    """

    # Kept for reference; the requests-based download path was replaced
    # by selenium, but external code may still read these attributes.
    User_Agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    Refer = "https://www.cnrepark.com"

    def __init__(self, br):
        """Store the selenium WebDriver used for all downloads."""
        self.br = br

    def selenium_download(self, url):
        """Fetch *url* with the driver and return the rendered page source.

        FIX: the original called the module-level ``br`` instead of
        ``self.br``, defeating the injected driver.
        """
        self.br.get(url)
        return self.br.page_source

    def get_total_numbers(self, url):
        """Return the total item count (string of digits) shown in div.nw_num."""
        content = self.selenium_download(url)
        html = etree.HTML(content)
        total_txt = html.xpath('.//div[@class="nw_num"]/text()')
        # First run of digits in the text is the total count.
        # (Removed a redundant method-local `import re`; re is imported
        # at module level.)
        return re.findall(r'\d+', total_txt[0])[0]

    def html_parse_list(self, content):
        """Parse a category listing page into [{'title','href','src'}, ...]."""
        html = etree.HTML(content)
        # Renamed local from `list`, which shadowed the builtin.
        items = []
        for node in html.xpath('.//div[@class="area"]//div[@class="con_lst"]'):
            src = node.xpath('./div/a/img/@src')
            href = node.xpath('./div//h2/a/@href')
            title = node.xpath('./div//h2/a/text()')
            items.append({'title': title[0], 'href': href[0], "src": src[0]})
        return items

    def optimizeContent(self, res):
        """Strip ``str(bytes)`` artifacts (leading b', literal \\n, quotes)."""
        res = res.replace("b'", '')
        res = res.replace('\\n', '')
        res = res.replace("'", '')
        return res

    def _section_html(self, html, xpath_expr):
        """Serialize the first node matching *xpath_expr* as cleaned HTML.

        Raises IndexError if the section is absent; callers wrap in
        try/except at a higher level (see save_list).
        """
        node = html.xpath(xpath_expr)
        return self.optimizeContent(str(etree.tostring(node[0])))

    def html_parse_detail(self, content):
        """Extract the detail sections of a park page as cleaned HTML strings."""
        html = etree.HTML(content)
        # The five sections differ only in their XPath; factored through
        # _section_html to remove the copy-paste blocks.
        return {
            'detail': self._section_html(
                html, './/div[@class="right_nr"]/div[1]//div[@class="kfq_box"]/ul'),
            "regionalAdvantages": self._section_html(html, './/div[@id="tbc_81"]'),  # 区域优势
            "basicConfiguration": self._section_html(html, './/div[@id="tbc_82"]'),  # 基础配套
            "preferentialPolicy": self._section_html(html, './/div[@id="tbc_83"]'),  # 优惠政策
            "planningInformation": self._section_html(html, './/div[@id="tbc_84"]'),  # 规划建设
        }

    def crawl_url(self, url):
        """Download one listing page and return its parsed item list."""
        print("crawl page {}".format(url))
        listContent = self.selenium_download(url)
        return self.html_parse_list(listContent)

    def get_name(self, index):
        """Map a 0-based category index (0-6) to its Chinese category name."""
        nameList = [
            "特色园区",
            "创意园",
            "孵化基地",
            "商务园区",
            "生态园区",
            "综合乐园",
            "产业园转移区",
        ]
        return nameList[index]

    def save_list(self, items, type_index):
        """Download each item's detail page and save it as ./txt/<type>-<title>.html.

        Best-effort: any failure is printed and aborts the remaining items
        (behavior inherited from the original; the try wraps the loop).
        """
        try:
            for item in items:
                url = item['href']
                print("crawl url :" + url)
                content = self.selenium_download(url)
                detailList = self.html_parse_detail(content)
                item['title'] = self.validateTitle(item['title'])
                type_name = self.get_name(type_index)
                # Explicit UTF-8 so Chinese titles/content round-trip on
                # any platform (Windows default codec would fail here).
                path = "./txt/" + type_name + "-" + item['title'] + ".html"
                with open(path, "w", encoding="utf-8") as f:
                    f.write("<h2>{}</h2>".format(item['title']))
                    f.write("<div> <a href='{}'><img style='height:80px;height:80px;' src={} /></a></div>".format(item['href'], item['src']))
                    f.write("<p>{}</p>".format(detailList['detail']))
                    f.write("<p>{}</p><h3>区位优势:</h3>{}".format(detailList['detail'], detailList['regionalAdvantages']))
                    f.write("<p>{}</p><h3>基础配套:</h3>{}".format(detailList['detail'], detailList['basicConfiguration']))
                    f.write("<p>{}</p><h3>优惠政策:</h3>{}".format(detailList['detail'], detailList['preferentialPolicy']))
                    f.write("<p>{}</p><h3>规划建设:</h3>{}".format(detailList['detail'], detailList['planningInformation']))
                    f.write("<br>")
                # Removed redundant f.close(); the with-block closes the file.
        except Exception as e:
            print("Exception:" + str(e))

    def validateTitle(self, title):
        """Replace characters illegal in filenames ( / \\ : * ? " < > | ( ) ) with '_'."""
        rstr = r'[/\\:*?"<>|()]'
        return re.sub(rstr, "_", title)
if __name__ == "__main__":
    try:
        # One pass per category index (1-7); each category is paginated.
        for k in range(1, 8):
            chanyeList = ChanyeList(br)
            baseUrl = "https://www.cnrepark.com/gyy/park" + str(k) + "/?publishtime=desc&page={}"
            pageUrl = "https://www.cnrepark.com/gyy/park" + str(k) + "/"
            # Total item count -> page count; the site lists 13 items per page.
            totalNumbers = chanyeList.get_total_numbers(pageUrl)
            totalPage = math.ceil(int(totalNumbers) / 13)
            result = []
            for page in range(1, totalPage + 1):
                result.extend(chanyeList.crawl_url(baseUrl.format(page)))
            chanyeList.save_list(result, k - 1)
    except Exception as e:
        print(str(e))
    finally:
        # Always release the browser, even when a category fails;
        # the original quit the driver inside the try and could leak it.
        br.quit()
# (Stray blog-footer text removed so the file parses:
#  "手机扫一扫 / 移动阅读更方便 / 你可能感兴趣的文章" — not part of the program.)