Python爬虫练习(多线程,进程,协程抓取网页)
阅读原文时间:2023年07月10日阅读:1

关注公众号“轻松学编程”了解更多。

一、多线程抓取网页

流程:a.设置种子url b.获取区域列表 c.循环区域列表 d.创建线程获取页面数据

e.启动线程

import csv
import threading
import time
import requests
import lxml
from lxml import etree
import json

# Re-entrant lock that serializes CSV writes across worker threads.
rlock = threading.RLock()
# Request headers: spoof a desktop Chrome user agent to avoid trivial bot blocking.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getAreaList(url):
    '''
    Fetch the seed listing page and build a district-name -> district-URL map.
    :param url: seed listing URL
    :return: dict {"cityName":"cityUrl"}
    '''
    # Download the page body.
    html = requests.get(url, headers=headers).text
    # Parse into an element tree.
    tree = lxml.etree.HTML(html)
    # One anchor per district under the ershoufang filter bar; hrefs are
    # site-relative, so prefix the host.
    return {
        anchor.xpath('./text()')[0]:
            "https://gz.lianjia.com" + anchor.xpath('./@href')[0]
        for anchor in tree.xpath('//div[@data-role="ershoufang"]/div/a')
    }

def getPageTotal(url):
    '''
    Fetch a district listing page and return its total page count.
    :param url: district listing URL
    :return: int total pages (0 when the pagination widget is absent)
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    # The pagination widget stores its state as JSON in the page-data
    # attribute, e.g. '{"totalPage":100,"curPage":1}'.
    pageData = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')
    if not pageData:
        # No pagination widget (empty result set or anti-crawler page):
        # report zero pages instead of raising IndexError on [0].
        return 0
    return int(json.loads(pageData[0])["totalPage"])

def getHouseInfo(area, url):
    '''
    Crawl every listing page of one district and append rows to ./data/<area>.csv.
    :param area: district name (used as the CSV file name)
    :param url: district listing URL
    :return: None
    '''
    import os
    # The original crashed with FileNotFoundError when ./data did not exist.
    os.makedirs('./data', exist_ok=True)
    pageTotal = getPageTotal(url)
    for page in range(1, pageTotal + 1):
        # Listing pages are addressed as <district-url>pg<N>/.
        newurl = url + "pg%d/" % page
        response = requests.get(newurl, headers=headers).text
        mytree = lxml.etree.HTML(response)
        houseList = mytree.xpath('//li[@class="clear"]')
        for house in houseList:
            # Listing title and detail-page URL.
            houseTitle = house.xpath('.//div[@class="title"]/a/text()')[0]
            houseUrl = house.xpath('.//div[@class="title"]/a/@href')[0]
            # House info / address, joined from mixed text nodes.
            houseAddr = ''.join(house.xpath('.//div[@class="houseInfo"]//text()'))
            # Position (floor / neighbourhood) info.
            positionInfo = ''.join(house.xpath('.//div[@class="positionInfo"]//text()'))
            # Total price; the site gives the number in units of 10k CNY.
            priceInfo = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
            # Price per square metre.
            unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
            print(houseTitle, houseUrl, houseAddr, positionInfo, priceInfo, unitPrice)
            # Serialize writes across workers. newline='' is required by the
            # csv module; without it every row is followed by a blank line
            # on Windows.
            with rlock:
                with open('./data/' + area + '.csv', 'a+', encoding='utf-8',
                          newline='', errors='ignore') as f:
                    writer = csv.writer(f)
                    writer.writerow([houseTitle, houseUrl, houseAddr,
                                     positionInfo, priceInfo, unitPrice])

if __name__ == '__main__':
    # Seed URL: Guangzhou second-hand housing listings.
    cityUrl = "https://gz.lianjia.com/ershoufang/"
    # Map district name -> district URL.
    areaDict = getAreaList(cityUrl)

    threadList = []
    # time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the recommended timer for elapsed-time measurement.
    start = time.perf_counter()
    for areaName, areaUrl in areaDict.items():
        # One worker thread per district.
        t = threading.Thread(target=getHouseInfo, args=(areaName, areaUrl))
        t.start()
        threadList.append(t)

    # Wait for every worker so the process does not exit early.
    for t in threadList:
        t.join()
    print(time.perf_counter() - start)

二、协程抓取网页

import csv
import threading
import time
import requests
import lxml
from lxml import etree
import json

import gevent
from gevent import monkey

# Monkey-patch blocking stdlib calls (sockets, etc.) so greenlets can
# cooperatively switch instead of blocking the whole process.
gevent.monkey.patch_all()
# Re-entrant lock that serializes CSV writes.
rlock = threading.RLock()
# Request headers: spoof a desktop Chrome user agent to avoid trivial bot blocking.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getAreaList(url):
    '''
    Fetch the seed listing page and map each district name to its listing URL.
    :param url: seed listing URL
    :return: dict {"cityName":"cityUrl"}
    '''
    page = requests.get(url, headers=headers).text
    doc = lxml.etree.HTML(page)
    districts = {}
    # One anchor per district under the ershoufang filter bar.
    for node in doc.xpath('//div[@data-role="ershoufang"]/div/a'):
        name = node.xpath('./text()')[0]
        # Hrefs are site-relative; prefix the host.
        districts[name] = "https://gz.lianjia.com" + node.xpath('./@href')[0]
    return districts

def getPageTotal(url):
    '''
    Fetch a district listing page and return its total page count.
    :param url: district listing URL
    :return: int total pages (0 when the pagination widget is absent)
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    # The pagination widget stores its state as JSON in the page-data
    # attribute, e.g. '{"totalPage":100,"curPage":1}'.
    pageData = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')
    if not pageData:
        # No pagination widget (empty result set or anti-crawler page):
        # report zero pages instead of raising IndexError on [0].
        return 0
    return int(json.loads(pageData[0])["totalPage"])

def getHouseInfo(area, url):
    '''
    Crawl every listing page of one district and append rows to ./data/<area>.csv.
    :param area: district name (used as the CSV file name)
    :param url: district listing URL
    :return: None
    '''
    import os
    # The original crashed with FileNotFoundError when ./data did not exist.
    os.makedirs('./data', exist_ok=True)
    pageTotal = getPageTotal(url)
    for page in range(1, pageTotal + 1):
        # Listing pages are addressed as <district-url>pg<N>/.
        newurl = url + "pg%d/" % page
        response = requests.get(newurl, headers=headers).text
        mytree = lxml.etree.HTML(response)
        houseList = mytree.xpath('//li[@class="clear"]')
        for house in houseList:
            # Listing title and detail-page URL.
            houseTitle = house.xpath('.//div[@class="title"]/a/text()')[0]
            houseUrl = house.xpath('.//div[@class="title"]/a/@href')[0]
            # House info / address, joined from mixed text nodes.
            houseAddr = ''.join(house.xpath('.//div[@class="houseInfo"]//text()'))
            # Position (floor / neighbourhood) info.
            positionInfo = ''.join(house.xpath('.//div[@class="positionInfo"]//text()'))
            # Total price; the site gives the number in units of 10k CNY.
            priceInfo = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
            # Price per square metre.
            unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
            print(houseTitle, houseUrl, houseAddr, positionInfo, priceInfo, unitPrice)
            # Serialize writes across workers. newline='' is required by the
            # csv module; without it every row is followed by a blank line
            # on Windows.
            with rlock:
                with open('./data/' + area + '.csv', 'a+', encoding='utf-8',
                          newline='', errors='ignore') as f:
                    writer = csv.writer(f)
                    writer.writerow([houseTitle, houseUrl, houseAddr,
                                     positionInfo, priceInfo, unitPrice])

if __name__ == '__main__':
    # Seed URL: Guangzhou second-hand housing listings.
    cityUrl = "https://gz.lianjia.com/ershoufang/"
    # Map district name -> district URL.
    areaDict = getAreaList(cityUrl)

    geventList = []
    # time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the recommended timer for elapsed-time measurement.
    start = time.perf_counter()
    for areaName, areaUrl in areaDict.items():
        # One greenlet per district.
        g = gevent.spawn(getHouseInfo, areaName, areaUrl)
        geventList.append(g)
    # Block until every greenlet has finished.
    gevent.joinall(geventList)
    print(time.perf_counter() - start)

三、协程与进程结合抓取网页

import csv
import threading
import time
import requests
import lxml
from lxml import etree
import json
import multiprocessing
import gevent
from gevent import monkey

# Monkey-patch blocking stdlib calls (sockets, etc.) so greenlets can
# cooperatively switch instead of blocking the whole process.
gevent.monkey.patch_all()
# Re-entrant lock that serializes CSV writes.
rlock = threading.RLock()
# Request headers: spoof a desktop Chrome user agent to avoid trivial bot blocking.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def getAreaList(url):
    '''
    Fetch the seed listing page and build a district-name -> district-URL map.
    :param url: seed listing URL
    :return: dict {"cityName":"cityUrl"}
    '''
    body = requests.get(url, headers=headers).text
    root = lxml.etree.HTML(body)
    # Anchors for every district under the ershoufang filter bar; hrefs
    # are site-relative, so prefix the host.
    anchors = root.xpath('//div[@data-role="ershoufang"]/div/a')
    pairs = (
        (a.xpath('./text()')[0],
         "https://gz.lianjia.com" + a.xpath('./@href')[0])
        for a in anchors
    )
    return dict(pairs)

def getPageTotal(url):
    '''
    Fetch a district listing page and return its total page count.
    :param url: district listing URL
    :return: int total pages (0 when the pagination widget is absent)
    '''
    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    # The pagination widget stores its state as JSON in the page-data
    # attribute, e.g. '{"totalPage":100,"curPage":1}'.
    pageData = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')
    if not pageData:
        # No pagination widget (empty result set or anti-crawler page):
        # report zero pages instead of raising IndexError on [0].
        return 0
    return int(json.loads(pageData[0])["totalPage"])

def getHouseInfo(area, url):
    '''
    Crawl every listing page of one district and append rows to ./data/<area>.csv.
    Runs the crawl inside a single greenlet within the current (child) process.
    :param area: district name (used as the CSV file name)
    :param url: district listing URL
    :return: None
    '''
    import os
    # The original crashed with FileNotFoundError when ./data did not exist.
    os.makedirs('./data', exist_ok=True)

    # Renamed from the original misspelling "houesInfo".
    def _houseInfo(area, url, pageTotal):
        # Walk pages pg1/ .. pgN/ and persist every listing row.
        for page in range(1, pageTotal + 1):
            newurl = url + "pg%d/" % page
            response = requests.get(newurl, headers=headers).text
            mytree = lxml.etree.HTML(response)
            houseList = mytree.xpath('//li[@class="clear"]')
            for house in houseList:
                # Listing title and detail-page URL.
                houseTitle = house.xpath('.//div[@class="title"]/a/text()')[0]
                houseUrl = house.xpath('.//div[@class="title"]/a/@href')[0]
                # House info / address, joined from mixed text nodes.
                houseAddr = ''.join(house.xpath('.//div[@class="houseInfo"]//text()'))
                # Position (floor / neighbourhood) info.
                positionInfo = ''.join(house.xpath('.//div[@class="positionInfo"]//text()'))
                # Total price; the site gives the number in units of 10k CNY.
                priceInfo = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
                # Price per square metre.
                unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
                print(houseTitle, houseUrl, houseAddr, positionInfo, priceInfo, unitPrice)
                # Serialize writes. newline='' is required by the csv module;
                # without it every row is followed by a blank line on Windows.
                with rlock:
                    with open('./data/' + area + '.csv', 'a+', encoding='utf-8',
                              newline='', errors='ignore') as f:
                        writer = csv.writer(f)
                        writer.writerow([houseTitle, houseUrl, houseAddr,
                                         positionInfo, priceInfo, unitPrice])

    # Total page count for this district.
    pageTotal = getPageTotal(url)
    # Spawn the crawl as a greenlet and wait for it to finish.
    g = gevent.spawn(_houseInfo, area, url, pageTotal)
    gevent.joinall([g])

if __name__ == '__main__':
    # Seed URL: Guangzhou second-hand housing listings.
    cityUrl = "https://gz.lianjia.com/ershoufang/"
    # Map district name -> district URL.
    areaDict = getAreaList(cityUrl)

    processList = []
    # time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the recommended timer for elapsed-time measurement.
    start = time.perf_counter()
    for areaName, areaUrl in areaDict.items():
        # One worker process per district.
        p = multiprocessing.Process(target=getHouseInfo, args=(areaName, areaUrl))
        p.start()
        processList.append(p)

    # Wait for every child so the parent does not exit early.
    for p in processList:
        p.join()
    print(time.perf_counter() - start)

【后记】为了让大家能够轻松学编程,我创建了一个公众号【轻松学编程】,里面有让你快速学会编程的文章,当然也有一些干货提高你的编程水平,也有一些编程项目适合做一些课程设计等课题。

也可加我微信【1257309054】,拉你进群,大家一起交流学习。
如果文章对您有帮助,请我喝杯咖啡吧!

公众号

关注我,我们一起成长~~