Python web scraping: Autohome (汽车之家) data

Many prospective car buyers start by doing research online, comparing models and prices, and the first stop is usually Autohome (汽车之家). So today I'll dig into some of that car data for you:

The first data to fetch is each model's name, price range, and minimum guide price:

    def get_oa_price(self):
        # Requires: import re (plus the self.sc helper that provides get_html / collect_error)
        try:
            oa_price_data_list = []
            for page in range(1, 27):
                oa_price_api = f"https://price.16888.com/gz/search-0-0-0-0-0-0-0-0-0-1-0-0-0-0-{page}.html"
                response = self.sc.get_html(oa_price_api)
                if not response:
                    print('city page request failed')
                    return 0
                # block of fuel-car data
                oa_data_ = re.findall(r'<div class="style-box ">\s+<ul class="clearfix">([\s\S]*?)</ul>', response.text)[0]
                # list of (id, name) pairs for fuel cars
                car_id_name_list = re.findall(r'data-sid="(\d+)" data-name="(.*?)">', oa_data_)
                # list of price ranges
                price_range_list = re.findall(r'<p>(.*?)\s+<span class="', response.text)
                if len(car_id_name_list) == len(price_range_list):
                    for index, car_list in enumerate(car_id_name_list):
                        car_id, car_name = car_list
                        # price range string for this model
                        price_range = price_range_list[index]
                        # convert the lower bound (listed in units of 10,000) to yuan
                        price_min = int(float(price_range.split('-')[0]) * 10000)
                        oa_price_data_list.append((int(car_id), car_name, price_range, price_min))
                if not oa_price_data_list:
                    return 0
                print(oa_price_data_list)

            print("Fuel-car prices have been scraped.")
            return 1
        except Exception:
            self.sc.collect_error()

The output looks like this:
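
A quick note on the `self.sc` helper these methods call: the post doesn't show its implementation, so here is a minimal sketch of what it might look like, assuming it only needs `get_html` (return a response object, or `None` on failure, which is how the methods test it) and `collect_error` (log the current exception). The class name, headers, and retry/timeout values are my own assumptions, not the original code:

    import traceback

    import requests


    class SimpleCrawler:
        """Hypothetical stand-in for the `sc` helper used by the scraper methods."""

        def __init__(self, timeout=10, retries=3):
            self.session = requests.Session()
            self.session.headers.update({"User-Agent": "Mozilla/5.0"})
            self.timeout = timeout
            self.retries = retries

        def get_html(self, url):
            # Return the response on HTTP 200, or None once retries are exhausted,
            # matching the `if not response:` checks in the scraper methods.
            for _ in range(self.retries):
                try:
                    resp = self.session.get(url, timeout=self.timeout)
                    if resp.status_code == 200:
                        return resp
                except requests.RequestException:
                    pass
            return None

        def collect_error(self):
            # Called from the except blocks in the scraper methods; just dump the traceback.
            traceback.print_exc()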

When people shop, especially online, they look first at price and then at sales volume; whether a model sells well can decide whether to buy it at all:

    def get_ea_sale(self):
        try:
            ea_sale_data_list = []
            for year in range(2018, 2021):
                for month in range(1, 13):
                    # build a YYYYMM string, zero-padding the month
                    date_ = f"{year}{month:02d}"
                    for i in range(1, 3):
                        ea_sale_api = f"https://xl.16888.com/ev-{date_}-{date_}-{i}.html"
                        print(ea_sale_api)
                        response = self.sc.get_html(ea_sale_api)
                        if not response:
                            print('sales page request failed')
                            return 0
                        # the page reports no EV sales data for this month -> no second page
                        re_no = re.findall(r'<p>暂时没有 <em>电动车</em>&nbsp;<em>\d+.\d+</em>&nbsp;的销量数据</p>', response.text)
                        if re_no:
                            print("no second page")
                            break
                        # block of sales data
                        ea_sale_data_ = re.findall(r'<th width="\w+">车型相关</th>([\s\S]*?)<div class="xl-data-pageing lbBox">',
                                                   response.text)[0]
                        # list of (id, name) pairs for EVs
                        car_id_name_list = re.findall(r'<td class="xl-td-\w+"><a href="/s/(\d+)/" target="_blank">(.*?)</a></td>', ea_sale_data_)
                        # list of sales figures
                        sale_list = re.findall(r'<td class="xl-td-t3">(\d+)</td>', ea_sale_data_)
                        if len(car_id_name_list) == len(sale_list):
                            for index, car_list in enumerate(car_id_name_list):
                                car_id, car_name = car_list
                                # sales volume for this model in this month
                                sale_num = int(sale_list[index])
                                ea_sale_data_list.append((int(car_id), car_name, sale_num, date_))
                        # nothing collected at all
                        if not ea_sale_data_list:
                            return 0

                    print(ea_sale_data_list)
            print("EV sales have been scraped.")
            return 1
        except Exception:
            self.sc.collect_error()

The output looks like this:
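
Both methods so far only print the tuples they collect. To keep the data you would persist it somewhere; here is a small sketch using SQLite. The two tuple layouts, `(car_id, car_name, price_range, price_min)` and `(car_id, car_name, sale_num, date_)`, come from the code above; the helper name, database file, and table/column names are my own assumptions:

    import sqlite3


    def save_rows(db_path, table, columns, rows):
        """Insert a list of tuples into `table`, creating the table if needed."""
        con = sqlite3.connect(db_path)
        try:
            col_defs = ", ".join(columns)
            placeholders = ", ".join("?" for _ in columns)
            con.execute(f"CREATE TABLE IF NOT EXISTS {table} ({col_defs})")
            con.executemany(f"INSERT INTO {table} VALUES ({placeholders})", rows)
            con.commit()
        finally:
            con.close()


    # Example usage with the lists built by get_oa_price / get_ea_sale:
    # save_rows("autohome.db", "oa_price",
    #           ["car_id", "car_name", "price_range", "price_min"], oa_price_data_list)
    # save_rows("autohome.db", "ea_sale",
    #           ["car_id", "car_name", "sale_num", "sale_month"], ea_sale_data_list)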

As the saying goes, check three things before you buy: first the price, then the sales figures, and finally the reviews.

    def car_comment(self):
        try:
            # fetch the model list (filtered by price range and fuel type)
            ea_com_api = "https://k.autohome.com.cn/ajax/getSceneSelectCar?minprice=2&maxprice=110&_appid=koubei&fueltype=4"
            response = self.sc.get_html(ea_com_api)
            if not response:
                print('model list request failed')
                return 0
            ea_com_json = json.loads(response.text)
            result_list = ea_com_json['result']
            for result in result_list:
                ea_com_data_list = []
                car_id = int(result['SeriesId'])
                car_name = result['SeriesName']
                print(car_name)
                com_api = f"https://k.autohome.com.cn/{car_id}/index_1.html"
                com_resp = self.sc.get_html(com_api)
                if not com_resp:
                    print('review list request failed')
                    continue
                # total number of reviews for this model
                com_num_list = re.findall(r'<span class="fn-right \w+">共有(\d+)条口碑</span>', com_resp.text)
                if not com_num_list:
                    print("no reviews found, switching IP and retrying")
                    api_ip = 'http://ip.dobel.cn/switch-ip'
                    self.sc.get_html(api_ip)
                    time.sleep(1)
                    com_resp = self.sc.get_html(com_api)
                    if not com_resp:
                        print('review list request failed after IP switch')
                        continue
                    # check the review count again
                    com_num_list = re.findall(r'<span class="fn-right \w+">共有(\d+)条口碑</span>', com_resp.text)
                    if not com_num_list:
                        print("still no reviews")
                        continue
                com_num = int(com_num_list[0])
                if com_num > 15:
                    # more than one page of reviews: read the total page count
                    page_num_list = re.findall(r"<span class='page-item-info'>共(\d+)页</span>", com_resp.text)
                    if not page_num_list:
                        print("page count not found")
                        page_num = 1
                    else:
                        page_num = int(page_num_list[0])
                else:
                    page_num = 1
                for page in range(1, page_num + 1):
                    com_api2 = f"https://k.autohome.com.cn/{car_id}/index_{page}.html"
                    print(com_api2)
                    com_resp2 = self.sc.get_html(com_api2)
                    if not com_resp2:
                        print('review page request failed, switching IP and retrying')
                        api_ip = 'http://ip.dobel.cn/switch-ip'
                        self.sc.get_html(api_ip)
                        time.sleep(1)
                        com_resp2 = self.sc.get_html(com_api2)
                        if not com_resp2:
                            print('review page request failed again')
                            continue
                    # review links
                    com_id_url_list = re.findall(r'发表了口碑\s+<a href="(.*?)"', com_resp2.text)
                    if not com_id_url_list:
                        print("no review links found, switching IP and retrying")
                        api_ip = 'http://ip.dobel.cn/switch-ip'
                        self.sc.get_html(api_ip)
                        time.sleep(1)
                        com_resp3 = self.sc.get_html(com_api2)
                        if not com_resp3:
                            print('review page request failed again')
                            continue
                        # review links, second attempt
                        com_id_url_list = re.findall(r'发表了口碑\s+<a href="(.*?)"', com_resp3.text)
                        if not com_id_url_list:
                            print("still no review links")
                            continue
                    for com_url in com_id_url_list:
                        # use a UUID as the review id
                        com_id = str(uuid.uuid4())
                        ea_com_data_list.append((car_id, car_name, com_id, com_url))

                # nothing collected for this model
                if not ea_com_data_list:
                    return 0
                print(f"Autohome review links for {car_name} have been scraped")
            return 1
        except Exception:
            self.sc.collect_error()
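
The switch-IP-and-retry pattern shows up three times inside `car_comment` (both when a request fails and when a page comes back without the expected content). The fetch-and-retry part could be factored into a single helper; here is a rough sketch under the same assumptions as above (the `self.sc.get_html` interface, the module-level `import time`, and the `ip.dobel.cn` switch endpoint the original code already uses; the method name `get_with_ip_switch` is mine):

    def get_with_ip_switch(self, url, max_tries=2):
        """Fetch `url`; if it fails, ask the proxy pool to switch IP and try again."""
        for attempt in range(max_tries):
            resp = self.sc.get_html(url)
            if resp:
                return resp
            if attempt < max_tries - 1:
                # Same switch endpoint the original code calls before retrying.
                self.sc.get_html('http://ip.dobel.cn/switch-ip')
                time.sleep(1)
        return None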

That's all I have to share. If anything is lacking, please point it out; I'm always happy to discuss. Thanks!

If you like this post, please follow my blog: https://www.cnblogs.com/qiuwuzhidi/

If you'd like more data or a custom-built scraper, see my professional Python scraper customization service.