A Comprehensive Collection of Python Web-Scraping Practice Projects


1. Scraping Amap (高德地图) weather data

import requests

'''
Beijing weather endpoint: https://www.amap.com/service/weather?adcode=110000
Tianjin weather endpoint: https://www.amap.com/service/weather?adcode=120000

adcode endpoint: https://www.amap.com/service/cityList?version=201951410
'''
class Gao(object):

    def __init__(self):
        self.run()

    def run(self):

        # Base URL for the weather endpoint
        base_url = "https://www.amap.com/service/weather?adcode="

        # Get the adcode for every city
        adcode_list = self.get_adcode()
        # print(adcode_list)

        # Loop over the adcodes and fetch each city's weather
        for c, adcode_dict in enumerate(adcode_list, 1):
            # Pull out the adcode, used to build the full URL
            adcode = adcode_dict["adcode"]

            # Build the full URL
            full_url = base_url + adcode
            # print(full_url)

            # Send the request and get the weather JSON
            response = requests.get(full_url)
            json_data = response.json()
            # print(json_data)
            # Option 1: check the response message before extracting
            # msg = json_data.get("data").get("message")
            # if msg == "Successful.":
            #     # Get the weather description
            #     weather_name = json_data.get("data").get("data")[0].get("live").get("weather_name")
            #     # print(weather_name)
            #
            #     # Add the weather description to adcode_dict
            #     adcode_dict["weather_name"] = weather_name
            #     print(c, adcode_dict)
            # else:
            #     print(msg)

            # Option 2: wrap the extraction in try/except

            try:
                # Get the weather description
                weather_name = json_data.get("data").get("data")[0].get("live").get("weather_name")
                # print(weather_name)

                # Add the weather description to adcode_dict
                adcode_dict["weather_name"] = weather_name
                print(c, adcode_dict)
            except Exception as e:
                print(e)

    # Get the adcode for every city
    def get_adcode(self):
        # adcode endpoint
        base_url = "https://www.amap.com/service/cityList?version=201951410"
        # Send the request
        response = requests.get(base_url)
        # print(response.json())
        # print(response.text)

        # Get the JSON payload
        json_data = response.json()

        # Get the city lists grouped by letter
        city_by_letter = json_data.get("data").get("cityByLetter")
        # print(city_by_letter)

        # List that will hold every city dict
        city_list = []
        # Loop over the values of the dict

        # for city_list1 in city_by_letter.values():
        #     # print(city_list1)
        #     # Option 1: append the city dicts one by one
        #     for city_dict in city_list1:
        #         print(self.count, city_dict)
        #         city_list.append(city_dict)
        #         self.count += 1
        # Loop over the values of the dict
        for city_list1 in city_by_letter.values():
            # Option 2: concatenate the lists with +=
            city_list += city_list1
        print(city_list)
        # print(len(city_list))
        # Return the list of all city dicts
        return city_list


if __name__ == '__main__':
    Gao()

'''
Summary:
Lists can be concatenated with +=, e.g. city_list += city_list1.
The API response tells you whether the data was fetched successfully; only read
the nested fields after confirming success, otherwise the code will raise an
error and stop running. If you skip that check, wrap the extraction in a
try/except instead.
'''
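
Both points in a minimal standalone sketch, run against a made-up response shaped like the Amap payload rather than a live request:

# Hypothetical response shaped like the weather endpoint's payload (not a live call)
json_data = {"data": {"message": "Successful.",
                      "data": [{"live": {"weather_name": "晴"}}]}}

city_list = []
city_list += [{"name": "北京", "adcode": "110000"}]  # list concatenation with +=

# Check the message (or wrap the access in try/except) before reading nested fields
if json_data.get("data", {}).get("message") == "Successful.":
    print(json_data["data"]["data"][0]["live"]["weather_name"])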

2. Youdao Translate (JS-encrypted sign parameter)

import requests
import time
import random
import hashlib
def md5(value):
    # Create an MD5 object
    md5_obj = hashlib.md5()
    # Feed in the string as UTF-8 bytes
    md5_obj.update(bytes(value, encoding="utf-8"))
    # Take the 32-character hex digest
    sign = md5_obj.hexdigest()
    return sign
def youdao(i):
    # Build the salt: millisecond timestamp plus one random digit
    salt = str(int(time.time() * 1000)) + str(random.randint(0, 9))
    # print(salt)
    # Build the sign
    sign1 = "fanyideskweb" + i + salt + "@6f#X3=cCuncYssPsuRUE"
    sign = md5(sign1)
    # Form data
    data = {
        "i": i,
        # "from": "AUTO",
        # "to": "AUTO",
        # "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": salt,
        "sign": sign,
        # "ts": "1558514897639",
        # "bv": "cf156b581152bd0b259b90070b1120e6",
        # "doctype": "json",
        # "version": "2.1",
        "keyfrom": "fanyi.web",
        # "action": "FY_BY_REALTlME"
    }

    # Add request headers (browser identity)
    headers = {
        # "Accept": "application/json, text/javascript, */*; q=0.01",
        # "Accept-Encoding": "gzip, deflate",
        # "Accept-Language": "zh-CN,zh;q=0.9",
        # "Connection": "keep-alive",
        # "Content-Length": "238",
        # "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie": "OUTFOX_SEARCH_USER_ID=1007772075@10.168.8.76; OUTFOX_SEARCH_USER_ID_NCOO=1844201936.6123636; _ga=GA1.2.1939912746.1552966532; JSESSIONID=aaaB9UfpkFL02gnEynoRw; ___rl__test__cookies=1558514897636",
        # "Host": "fanyi.youdao.com",
        # "Origin": "http://fanyi.youdao.com",
        "Referer": "http://fanyi.youdao.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
        # "X-Requested-With": "XMLHttpRequest"
    }

    # Base URL
    base_url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"

    # Send the POST request
    response = requests.post(base_url, data=data, headers=headers)
    # Get the JSON payload from the response
    json_data = response.json()
    print(json_data)
    print(type(json_data))


if __name__ == '__main__':
    i = input("Enter the text to translate: ")
    # i = "banana"
    youdao(i)

"""
遇到的问题1:
""只携带参数data 发起请求的时候,请求不到数据,出现{'errorCode'"": ""50},",",
此时的解决方案是:加上请求头浏览器信息 再次发起请求 
""问题2:还是获取不到信息 {'errorCode'"": ""50}",",
解决的方案是:把所有的请求头信息添加到headers中
"""

# i: banana
# client: fanyideskweb
# salt: 15585168560444
# sign: da50e3193cda496e1455ff28c1bb21b1
# keyfrom: fanyi.web
#
# i: apple
# "client": "fanyideskweb",
# "salt": "15585148976393",
# "sign": "147950af9758d1e79aeaacd4ff27d14d",
# "keyfrom": "fanyi.web",
#
#
# salt: check whether it has to be generated/hashed
# sign: check whether it has to be generated/hashed

# The first thing to work out is how salt and sign are generated
'''
salt = o.salt = i = r + parseInt(10 * Math.random(), 10)
JS:     "" + (new Date).getTime() + parseInt(10 * Math.random(), 10)
Python: str(int(time.time() * 1000)) + str(random.randint(0, 9))

o = r.generateSaltSign(t) = r(t)
r.generateSaltSign(t) = t.generateSaltSign(t) = r(t)
{
    ts: r,
    bv: t,
    salt: i,
    sign: n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
}


e = t = "apple"  (the text to translate)

var r = function(e)
    {
            var
        t = n.md5(navigator.appVersion),
            r = "" + (new Date).getTime(),
                i = r + parseInt(10 * Math.random(), 10);
        return {
            ts: r,
            bv: t,
            salt: i,
            sign: n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
    }
};


Salt length comparison:
15585816225096   Python
15585148976393   JS (captured)
15585822104216


sign = o.sign
= n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
= md5("fanyideskweb" + "apple" + salt + "@6f#X3=cCuncYssPsuRUE")
'''
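
The derivation above can be reproduced with a few lines of standalone Python. Note that the client string "fanyideskweb" and the secret suffix "@6f#X3=cCuncYssPsuRUE" were captured at the time of writing and may have changed since:

import hashlib
import random
import time

def youdao_salt_sign(word):
    # salt mirrors the JS: "" + (new Date).getTime() + parseInt(10 * Math.random(), 10)
    salt = str(int(time.time() * 1000)) + str(random.randint(0, 9))
    # sign mirrors: n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
    raw = "fanyideskweb" + word + salt + "@6f#X3=cCuncYssPsuRUE"
    return salt, hashlib.md5(raw.encode("utf-8")).hexdigest()

print(youdao_salt_sign("apple"))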

3. langlang2017.com (朗朗渔家)

import re
import requests


class Lang:

    def __init__(self):
        self.run()

    def run(self):
        # 获取页面信息
        base_url = "http://www.langlang2017.com/"
        response = requests.get(base_url)
        html = response.content.decode("utf-8")
        print(html)
        self.get_data(html)

    def get_data(self, html):
        # 缩小范围
        pattern1 = re.compile('<div class="banner_box">[\w\W]*?</div>')
        result1 = pattern1.search(html).group()
        # print(result1)

        # Get the alt text; findall returns only the capture-group content, not the rest of the match
        alt_list = re.findall('alt="(.*?)"', result1)
        # print(alt)

        # 获取src信息
        src_list1 = re.findall('src="(.*?)"', result1)
        # print(src_list1)
        # 获取完整的src图片链接
        src_list = []
        for s in src_list1:
            src = "http://www.langlang2017.com/" + s
            # print(src)
            src_list.append(src)
        # print(src_list)

        # Tip: extract these from the full html, not from result1
        # Get the phone number
        phone1 = re.findall('<div class="dianhua">联系电话:(\d{11})</div>', html)[0]
        # phone1 = re.search('1\d{10}', html)
        # group(1) returns the content of the first capture group
        # phone = phone1.group()
        print(phone1)

        # 获取所有的http连接 有两个
        http_list = re.findall('"(http.*?)"', html)
        # print(http_list)

        # 获取地址
        address = re.search('<div class="dizhi">地址:(.*?)</div>', html)
        address = address.group(1)

        lang_dict = {
            "alt_list": alt_list,
            "src_list": src_list,
            # "phone": phone,
            "http_list": http_list,
            "address": address
        }

        print(lang_dict)
        import json
        print(json.dumps(lang_dict))


if __name__ == '__main__':
    Lang()
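
A quick illustration of the capture-group behaviour that get_data relies on: re.findall returns only what is inside the parentheses, not the whole match.

import re

snippet = '<img src="images/banner1.png" alt="首页横幅">'
print(re.findall(r'alt="(.*?)"', snippet))   # ['首页横幅'], only the group content
print(re.findall(r'src="(.*?)"', snippet))   # ['images/banner1.png']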

4. Maoyan movies (猫眼电影)

import requests
import re


class Mao:
    def __init__(self):
        '''
        __init__ only sets up initial state; it does not call other methods.
        It runs when the class is instantiated.
        '''
        self.count = 1
        self.spider_name = "万能爬虫"
        # self.run()

    def __call__(self, *args, **kwargs):
        '''
        Runs when an instance of this class is called like a function
        '''
        self.run()

    def run(self):
        # Fetch the Maoyan board page as an HTML string
        base_url = "https://maoyan.com/board"
        response = requests.get(base_url)
        html = response.text
        # print(html)
        self.get_data(html)

    def get_data(self, html):
        # Narrow the scope: grab each movie's <dd> block
        # By default .*? does not cross newlines; either match with [\s\S] or pass
        # re.S so that . also matches \n
        dd_list = re.findall('<dd>.*?</dd>', html, re.S)
        # print(dd_list)
        # print(dd_list[0])
        # print(len(dd_list))
        # import json
        # print(json.dumps(dd_list))

        # 循环获取dd中的电影信息
        for dd in dd_list:
            # print(dd)

            # 获取排名
            rank = re.findall('<i class="board-index board-index-\d{1,2}">(\d+)</i>', dd)[0]
            # print(rank)

            # 获取电影名称
            name = re.findall('title="(.*?)" class', dd)[0]
            # print(name)

            # 获取主演信息
            actor = re.findall('<p class="star">([\d\D]*?)</p>', dd)[0]
            if "主演" not in actor:
                actor = ""
            else:
                # 去掉前面和后面的空白
                actor = actor.strip()
            # print(actor)

            # 上映日期
            publish_date = re.findall('<p class="releasetime">上映时间:(.*?)</p>', dd)[0]
            # print(publish_date)

            # 评分信息
            score_match = re.search('<p class="score"><i class="integer">(.*?)</i><i class="fraction">(\d)</i></p>', dd)
            # print(score.group(1))
            # print(score.group(2))
            score = score_match.group(1) + score_match.group(2)
            # print(score)

            # Get the poster image
            # The HTML the browser shows and the HTML fetched in code can differ;
            # write the regex against the string the code actually receives.
            # Prefer the large image when extracting.
            pic = re.findall('data-src="(.*?)@160w_220h_1e_1c"', dd)[0]
            # print(pic)

            # 将电影信息 存入字典中
            movie_dict = {
                "rank": rank,
                "name": name,
                "actor": actor,
                "publish_date": publish_date,
                "score": score,
                "pic": pic
            }

            print(movie_dict)


if __name__ == '__main__':
    mao = Mao()
    mao()
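
The re.S note in get_data can be checked with a tiny standalone snippet:

import re

text = "<dd>line one\nline two</dd>"
print(re.findall(r'<dd>.*?</dd>', text))         # [] because . does not match the newline
print(re.findall(r'<dd>.*?</dd>', text, re.S))   # ['<dd>line one\nline two</dd>']
print(re.findall(r'<dd>[\s\S]*?</dd>', text))    # same match without re.S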

5. Scraping new-home listings from ihk.cn (房王网)

import re
import requests

class Fang:

    def __init__(self):
        self.count = 1

    def __call__(self, *args, **kwargs):
        self.get_max_page()

    # 获取最大页码
    def get_max_page(self):
        base_url = "http://gz.ihk.cn/myxf/houselist/?mark=gzxf089"
        html = self.get_html(base_url)

        max_page_list = re.findall('<a href="javascript:SetCondition.*?>(\d+)</a>', html)
        # if max_page_list:
        #     max_page = int(max_page_list[-1])
        # else:
        #     max_page = 1
        # Conditional expression, equivalent to the four commented-out lines above
        max_page = int(max_page_list[-1]) if max_page_list else 1
        # print(max_page)

        self.get_page_html(max_page)

    # 获取每一页的html
    def get_page_html(self, max_page):
        # print(max_page)
        for page in range(1, max_page + 1):
            print("================第{}页开始下载=================".format(page))
            page_url = "http://gz.ihk.cn/myxf/houselist/p{}/?mark=gzxf089".format(page)
            # print(page_url)
            html = self.get_html(page_url)
            # print(html)

            self.get_data(html)
            # break

    def get_data(self, html):
        # Narrow the scope so each listing's data ends up in its own dict
        div_list = re.findall('<div class="n_conlist"[\s\S]*?n_conlistrliioc[\w\W]*?</div>', html)
        # print(div_list)
        # print(div_list[0])
        # print(len(div_list))
        for div in div_list:
            # 获取图片
            pic = re.findall('data-original="(.*?)"', div)[0]
            # print(pic)

            # 获取新房名称
            name = re.findall('<a><strong>(.*?)</strong></a>', div)[0]
            # print(name)

            # 描述信息
            desc = re.findall('<div class="n_conlistrbrief">[\s\S]*?<span>([\w\W]*?)</span>', div)[0]
            # print(desc)

            # 主力户型
            house_type = re.findall('<div class="n_conlistradd"><span>(.*?)</span>', div)[0]
            # print(house_type)

            # 地址
            address = re.findall('</em><span>(.*?)</span>', div)[0].strip()
            # print(address)

            # 标签
            sign = re.findall('<i><span>(.*?)</span></i>', div)
            # print(sign)

            # 价格
            price = re.findall('<li><strong>(.*?)</strong>', div)[0]
            # print(price)

            fang_dict = {
                "pic": pic,
                "name": name,
                "house_type": house_type,
                "desc": desc,
                "address": address,
                "sign": sign,
                "price": price
            }

            print(self.count, fang_dict)
            self.count += 1

    # 获取指定url的页面
    def get_html(self, base_url):
        response = requests.get(base_url)
        html = response.text
        # print(html)
        return html


if __name__ == '__main__':
    fang = Fang()
    fang()

6. ChinaAMC (华夏基金) fund data

import requests
import re

class Hua:

    def __init__(self):
        pass

    def __call__(self, *args, **kwargs):
        # self.get_html()
        self.get_data()

    def get_html(self):
        base_url = "http://fund.chinaamc.com/portal/cn/include/newproducthome.jsp"
        response = requests.get(base_url)
        html = response.text
        # print(html)

        with open("hua.html", "w", encoding="utf-8") as f:
            f.write(html)

    def get_data(self):
        with open("hua.html", "r", encoding="utf-8") as f:
            html = f.read()
        # print(html)
        # print(type(html))
        table_list = re.findall('<table.*?id="tb\d*?">[\s\S]*?</table>', html)
        # print(table_list)
        # print(table_list[0])
        # print(len(table_list))
        #
        # import json
        # print(json.dumps(table_list))
        for c, table in enumerate(table_list):
            if c == 0:
                print("=====<!-- 股票型、指数型、混合型、债券型、ETF -->=====")
                # Narrow the scope and get the tr list
                self.tr_list = self.table_handler(table)

                # print(tr_list)
                # print(tr_list[0])
                # print(len(tr_list))
                for co, tr in enumerate(self.tr_list, 1):
                    # 获取基金名称
                    name_fund_list = self.tr_hander(tr)
                    fund_list = name_fund_list[1]

                    fund_dict = {
                        "name": name_fund_list[0],
                        "code": fund_list[1].strip(),
                        "nw_date": fund_list[2],
                        "net_worth": fund_list[3],
                        "cum_worth": fund_list[4],
                        "price_limit": "" if fund_list[5] == "---" else fund_list[5],
                        "set_up_date": fund_list[6],
                        "purchase_status": fund_list[7],
                        "redemption_status": fund_list[8],
                        "cast_surely_status": fund_list[9],
                    }
                    print(co, fund_dict)

                    # break

            elif c == 1:
                print("=============<!-- 货币型 -->===============")
                # 缩小范围 获取tr
                self.tr_list = self.table_handler(table)
                # print(tr_list)
                # print(tr_list[0])
                # print(len(tr_list))

                for co, tr in enumerate(self.tr_list, 1):
                    # 获取基金名称
                    name_fund_list = self.tr_hander(tr)
                    fund_list = name_fund_list[1]

                    fund_dict = {
                        "name": name_fund_list[0],
                        "code": fund_list[2].strip(),
                        "nw_date": fund_list[3],
                        "million_return": fund_list[4],
                        "seven_day_annualized_yield": fund_list[5],
                        "aror30": fund_list[6],
                        "the_year_aror": fund_list[7],
                        "set_up_date": fund_list[8],
                        "purchase_status": fund_list[9],
                        "redemption_status": fund_list[10],
                        "cast_surely_status": fund_list[11]
                    }
                    print(co, fund_dict)

            elif c == 2:
                print("=================<!--理财型开始-->==================")
                # 缩小范围 获取tr
                self.tr_list = self.table_handler(table)
                # print(len(tr_list))
                # print(tr_list)
                for co, tr in enumerate(self.tr_list, 1):
                    # 获取基金名称
                    name_fund_list = self.tr_hander(tr)
                    fund_list = name_fund_list[1]

                    fund_dict = {
                        "name": name_fund_list[0],
                        "code": fund_list[2].strip(),
                        "nw_date": fund_list[3],
                        "thousands_return": fund_list[4],
                        "seven_day_annualized_yield": fund_list[5],
                        "operation_period": "",
                        "set_up_date": fund_list[6],
                        "purchase_status": fund_list[7],
                        "redemption_status": fund_list[8],
                        "cast_surely_status": fund_list[9]
                    }
                    print(co, fund_dict)

            else:
                print("==============<!-- 封闭型 -->===============")
                # 缩小范围 获取tr
                self.tr_list = self.table_handler(table)
                # print(tr_list)
                # print(len(tr_list))

                for co, tr in enumerate(self.tr_list, 1):

                    # 获取name
                    name_fund_list = self.tr_hander(tr)
                    fund_list = name_fund_list[1]

                    fund_dict = {
                        "name": name_fund_list[0],
                        "code": fund_list[2].strip(),
                        "nw_date": fund_list[3],
                        "net_worth": "" if fund_list[4] == "--" else fund_list[4],
                        "cum_worth": "" if fund_list[5] == "--" else fund_list[5],
                        "set_up_date": fund_list[6],
                        "due_date": fund_list[7],
                        "cast_surely_status": "" if fund_list[8] == "---" else fund_list[8],
                        "trade_status": fund_list[9],
                    }
                    print(co, fund_dict)

    # Narrow the scope: pull the tr rows out of a table
    def table_handler(self, table):
        tr_list = re.findall('<tr.*?id="tr\d+".*?>[\s\S]*?</tr>', table)
        del tr_list[0]
        return tr_list

    def tr_hander(self, tr):
        name = re.search('title="(.*?)"', tr).group(1)
        fund_list = re.findall('<td height="30">(.*?)</td>', tr)
        return name, fund_list


if __name__ == '__main__':
    hua = Hua()
    hua()

7. Qiushibaike (糗事百科)

import requests
from lxml import etree
from fake_useragent import UserAgent


class Qiu:

    def __init__(self):
        self.count = 1

    def __call__(self, *args, **kwargs):
        self.get_max_page()

    def get_max_page(self):
        base_url = "https://www.qiushibaike.com/8hr/page/2/"
        html_xml = self.get_html(base_url)
        # 获取最大页码
        max_page = int(html_xml.xpath("//a/span[@class='page-numbers']/text()")[-1].strip())
        # print(max_page)
        self.get_data(max_page)

    def get_data(self, max_page):

        for page in range(1, max_page + 1):
            print("===================第{}页开始下载=========================".format(page))
            page_url = "https://www.qiushibaike.com/8hr/page/{}/".format(page)
            # print(page_url)
            html_xml = self.get_html(page_url)
            # 缩小范围
            li_list = html_xml.xpath("//li[contains(@id, 'qiushi_tag_')]")
            # print(len(li_list))

            for li in li_list:
                # 获取图片
                pic = li.xpath(".//a[contains(@class, 'recmd-left')]/img/@src")[0]
                # if "/w/150/h/112" in pic:
                #     pic = "https:" + pic[:-12]
                # else:
                #     pic = ""

                # Conditional expression equivalent to the commented-out lines above
                pic = "https:" + pic[:-12] if "/w/150/h/112" in pic else ""
                # print(pic)

                # Get the nickname
                nick_name = li.xpath(".//span[@class='recmd-name']/text()")[0]
                # print(nick_name)

                # 获取内容
                content = li.xpath(".//a[@class='recmd-content']/text()")
                content = content[0] if content else ""
                # print(content)

                # 获取好笑数量
                laught_num = li.xpath(".//div[@class='recmd-num']/span[1]/text()")[0]
                # if "万" in laught_num:
                #     laught_num = int(float(laught_num[:-1]) * 10000)
                # else:
                #     laught_num = int(laught_num)
                laught_num = int(float(laught_num[:-1]) * 10000) if "万" in laught_num else int(laught_num)
                # print(laught_num)

                # 评论数量
                comment_num = li.xpath(".//div[@class='recmd-num']/span[4]/text()")
                comment_num = int(comment_num[0]) if comment_num else 0
                # print(comment_num)

                qiu_dict = {
                    "pic": pic,
                    "nike_name": nike_name,
                    "content": content,
                    "laught_num": laught_num,
                    "comment_num": comment_num,

                }
                print(self.count, qiu_dict)
                self.count += 1


    def get_html(self, base_url):
        # Generate a random browser User-Agent
        headers = {"User-Agent": UserAgent().random}
        response = requests.get(base_url, headers=headers)
        html = response.text
        # print(html)
        html_xml = etree.HTML(html)
        return html_xml


if __name__ == '__main__':
    qiu = Qiu()
    qiu()

8. Scraping Baixing (百姓网) listings

import requests
from lxml import etree


class Bai:

    def __init__(self):
        self.count = 1

    def __call__(self, *args, **kwargs):
        self.get_max_page()

    def get_max_page(self):
        base_url = "http://beijing.baixing.com/chongwujiaoyi/m177986/?entities=%E6%80%A7%E5%88%AB_%E5%85%AC&%E4%BB%B7%E6%A0%BC%5B0%5D=1000&%E4%BB%B7%E6%A0%BC%5B1%5D=1100&%E5%B9%B4%E9%BE%84%5B0%5D=0&%E5%B9%B4%E9%BE%84%5B1%5D=3"
        html_xml = self.get_html(base_url)

        # 获取最大页码
        max_page = int(html_xml.xpath("//ul[@class='list-pagination']/li[last()-1]/a/text()")[0])
        # print(max_page)

        # 获取数据
        self.get_data(max_page)

    def get_data(self, max_page):
        # 循环获取每一页的xml对象 并获取其中的指定的数据
        for page in range(1, max_page + 1):
            print("================第{}页开始下载======================".format(page))
            base_url = "http://beijing.baixing.com/chongwujiaoyi/m177986/?entities=%E6%80%A7%E5%88%AB_%E5%85%AC&page={}&%E4%BB%B7%E6%A0%BC%5B0%5D=1000&%E4%BB%B7%E6%A0%BC%5B1%5D=1100&%E5%B9%B4%E9%BE%84%5B0%5D=0&%E5%B9%B4%E9%BE%84%5B1%5D=3".format(page)
            # print(base_url)
            html_xml = self.get_html(base_url)

            # 缩小范围
            li_list = html_xml.xpath("//ul[@class='list-ad-items']/li[@data-aid]")
            # print(len(li_list))

            # 遍历获取每条狗的信息
            for co, li in enumerate(li_list, 1):
                # 图片
                pic = li.xpath(".//img/@src")[0]
                if "http" not in pic:
                    pic = li.xpath(".//img/@data-originsource")
                    pic = pic[0] if pic else ""
                # print(co, pic)

                # 获取描述信息
                desc = li.xpath(".//a[@class='ad-title']/text()")[0]
                # print(co, desc)

                # 获取地址信息
                address = li.xpath(".//div/div[@class='ad-item-detail'][1]/text()")[0]
                # print(address)

                # 类型
                dog_type = li.xpath(".//div/div[@class='ad-item-detail'][2]/text()")[0].strip()
                dog_type = dog_type.replace(" ", "")
                # print(dog_type)

                # 获取价格
                price = li.xpath(".//div/span/text()")[0]
                # print(price)

                dog_dict = {
                    "pic": pic,
                    "desc": desc,
                    "address": address,
                    "dog_type": dog_type,
                    "price": price,
                }
                print(self.count, dog_dict)
                self.count += 1

    # 获取指定url对应的xml对象
    def get_html(self, url):
        response = requests.get(url)
        html = response.text
        # print(html)
        return etree.HTML(html)


if __name__ == '__main__':
    bai = Bai()
    bai()

9. Scraping rental listings from Fang.com (房天下)

import requests
from lxml import etree
import re

class Fang:

    def __init__(self):
        self.count = 1

    def __call__(self, *args, **kwargs):
        self.get_max_page()

    def get_max_page(self):
        base_url = "https://zu.fang.com/house/i3100/"
        html, html_xml = self.get_html(base_url)
        max_page = int(re.search('共(\d+)页', html).group(1))
        # print(max_page)

        # Fetch the data for each page URL
        self.get_data(max_page)

    def get_data(self, max_page):

        for page in range(1, max_page+1):
            print("=================第{}页开始下载======================".format(page))
            page_url = "https://zu.fang.com/house/i3{}/".format(page)

            # Fetch the page for this paging URL
            html, html_xml = self.get_html(page_url)

            # 缩小范围
            dl_list = html_xml.xpath("//div[@class='houseList']/dl[dt]")
            # print(len(dl_list))

            for co, dl in enumerate(dl_list, 1):
                # 获取图片
                pic = "https:" + dl.xpath(".//img/@data-src")[0]
                pic = pic.replace('275x207', "1000x1000")
                # print(co, pic)

                # 标题
                title = dl.xpath(".//a[@title]/@title")[0]
                # print(co, title)

                # 租房类型
                rent_type = dl.xpath(".//dd/p[2]/text()[1]")[0].strip()
                # print(rent_type)

                # 室
                fang_info = dl.xpath(".//dd/p[2]/text()[2]")[0]
                # print(fang_info)
                if "室" in fang_info:
                    room = re.findall('(\d+)室', fang_info)[0]
                else:
                    room = ""

                if "厅" in fang_info:
                    ting = re.findall("(\d+)厅",fang_info)[0]
                else:
                    ting = ""
                # print(co, room, ting)

                # 面积
                area = dl.xpath(".//dd/p[2]/text()[3]")[0]
                area = area[:-2]
                # print(area)

                # 朝向
                toward = dl.xpath(".//dd/p[2]/text()[4]")[0].strip()
                # print(toward)

                # 城区
                city_area = dl.xpath(".//dd/p[3]/a[1]/span/text()")[0]
                # print(city_area)

                # 商圈
                business_circle = dl.xpath(".//dd/p[3]/a[2]/span/text()")[0]
                # print(business_circle)

                # 小区
                community = dl.xpath(".//dd/p[3]/a[3]/span/text()")
                community = community[0] if community else ""
                # print(community)

                # 地址
                address_list = dl.xpath(".//span[@class='note subInfor']//text()")
                # print(address)
                # Join the list elements into a single string
                address = "".join(address_list)
                # print(address)

                # 标签
                sign_list = dl.xpath(".//dd/p[@class='mt12']/span/text()")
                # print(sign_list)

                # 价格
                price = dl.xpath(".//span[@class='price']/text()")[0]
                # print(price)

                fang_dict = {
                    "pic": pic,
                    "title": title,
                    "rent_type": rent_type,
                    "room": room,
                    "ting": ting,
                    "area": area,
                    "toward": toward,
                    "city_area": city_area,
                    "business_circle": business_circle,
                    "community": community,
                    "address": address,
                    "sign_list": sign_list,
                    "price": price,
                }

                print(self.count, fang_dict)
                self.count += 1


            # break

    # 获取指定url对应的xml对象
    def get_html(self, url):
        response = requests.get(url)
        html = response.text
        # print(html)
        # with open("2.html", "r", encoding="utf-8") as f:
        #     html = f.read()
        return html, etree.HTML(html)


if __name__ == '__main__':
    fang = Fang()
    fang()

10. Scraping Douban book search results

import requests
from fake_useragent import UserAgent
from selenium import webdriver
from lxml import etree
import time
import re


class Dou:

    def __init__(self):
        self.count = 1

    def __call__(self, *args, **kwargs):
        self.get_data()

    # Loop over result pages until Douban reports a query error
    def get_data(self):

        page = 42  # starting offset, presumably left over from an earlier run; set to 0 to start at the first page
        while True:
            print(f"=================第{page+1}页开始下载===================")
            base_url = "https://book.douban.com/subject_search?search_text=python&cat=1001&start={}".format(page*15)
            html, html_xml = self.get_html(base_url)
            if "查询错误" in html:
                break
            # print(base_url)

            # 缩小范围 获取每本书的大div
            div_list = html_xml.xpath("//div[@class='item-root']")
            # print(div_list)
            # print(len(div_list))

            # 循环获取每本书的详细信息
            for co, div in enumerate(div_list, 1):
                # 获取图片
                pic = div.xpath(".//img/@src")[0]
                # print(co, pic)

                # 获取书名
                name = div.xpath(".//a[@class='title-text']/text()")[0]
                # print(co, name)

                # 评分
                score = div.xpath(".//span[@class='rating_nums']/text()")
                score = score[0] if score else ""
                # print(score)

                # 评价人数
                comment_nums_str = div.xpath(".//span[@class='pl']/text()")
                comment_nums_str = comment_nums_str[0] if comment_nums_str else ""
                comment_nums = re.findall("\d+", comment_nums_str)
                comment_nums = int(comment_nums[0]) if comment_nums else 0
                # print(comment_nums)

                # 获取出版社信息
                desc_info = div.xpath(".//div[@class='meta abstract']/text()")
                if desc_info:
                    desc_info = desc_info[0]
                    desc_info = desc_info.replace(" ", "")
                else:
                    desc_info = ""
                # print(desc_info)

                book_dict = {
                    "pic": pic,
                    "name": name,
                    "score": score,
                    "comment_nums": comment_nums,
                    "desc_info": desc_info,
                }

                print(self.count, book_dict)
                self.count += 1

            page += 1
            # break

    # 获取指定url对应的网页信息
    def get_html(self, url):
        # requests alone does not return the rendered page here, so selenium is used instead
        # headers = {"User-Agent": UserAgent().random}
        # response = requests.get(url, headers=headers)
        # html = response.text
        # print(html)

        # 创建浏览器对象
        self.driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1-windows\phantomjs"
                                                     r"-2.1.1-windows\bin\phantomjs.exe")

        # 使用无界面浏览器 发起请求
        self.driver.get(url)
        time.sleep(1)
        # 获取页面信息
        html = self.driver.page_source
        # print(html)
        # 将页面存入文件中 便于开发
        # with open("3.html", "r", encoding="utf-8") as f:
        #     html = f.read()
        # 返回一个xml对象
        return html, etree.HTML(html)

    def __del__(self):
        """
        触发条件: 当所有代码执行完成 执行此函数
        """
        # print(self.driver)
        # print(type(self.driver))
        self.driver.close()  # close the page
        self.driver.quit()  # quit the browser
        print("------browser closed-------")


if __name__ == '__main__':
    dou = Dou()
    dou()
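
Note: PhantomJS support has been removed from recent Selenium releases. A rough headless-Chrome equivalent of the browser setup in get_html (assuming chromedriver is installed and on PATH) might look like this:

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")           # no visible browser window
driver = webdriver.Chrome(options=options)   # assumes chromedriver is on PATH
driver.get("https://book.douban.com/subject_search?search_text=python&cat=1001&start=0")
html = driver.page_source
driver.quit()
html_xml = etree.HTML(html)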

Taobao homepage floor API (淘宝接口)

import requests
import json
from fake_useragent import UserAgent
'''
Analysis:
Floor 1: https://tce.taobao.com/api/mget.htm?callback=jsonp1606&tce_sid=1870316,1871653&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online
tce_sid per floor:
1870316  floor 1
1870321  floor 2
1870333  floor 3
1870340  floor 4
1870341  floor 5
1870342  floor 6
1870343  floor 7
'''
class Tao:
    def __init__(self):
        pass
    def __call__(self, *args, **kwargs):
        self.get_data()
    def get_data(self):
        base_url = "https://tce.taobao.com/api/mget.htm?tce_sid=1870316,1870321,1870333,1870340,1870341,1870342,1870343&tce_vid=2,2,2,2,2,2,2"
        headers = {"User-Agent": UserAgent().random}

        # 对接口发起请求
        response = requests.get(base_url, headers=headers)
        # 获取字符串数据
        str_data = response.text.strip()
        # print(str_data)
        # 获取json数据
        json_data = json.loads(str_data)
        # print(json_data)
        count = 1
        # result is a dict keyed by tce_sid
        data_dict = json_data.get('result')
        for i in data_dict.values():
            data_list = i.get("result")
            for data in data_list:
                data["item_pic"] = "https:" + data["item_pic"]
                print(count, data)
                count += 1


if __name__ == '__main__':
    tao = Tao()
    tao()
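
For reference, the floor-1 URL in the analysis above carries a callback parameter; in that case the response comes back wrapped as jsonp1606({...}) and the wrapper has to be stripped before json.loads. A rough sketch (the endpoint and tce_sid values were captured at the time of writing and may have changed):

import json
import re
import requests
from fake_useragent import UserAgent

url = ("https://tce.taobao.com/api/mget.htm?callback=jsonp1606"
       "&tce_sid=1870316&tce_vid=2")
text = requests.get(url, headers={"User-Agent": UserAgent().random}).text.strip()
match = re.search(r"^\w+\((.*)\)$", text, re.S)   # strip the jsonp1606(...) wrapper if present
data = json.loads(match.group(1) if match else text)
print(list(data.get("result", {}).keys()))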

11. Scraping NetEase Cloud Music artist data

import requests
from lxml import etree
from fake_useragent import UserAgent

class Music:

    def __init__(self):
        self.count = 1

    def __call__(self, *args, **kwargs):
        self.get_class_url_list()

    # 获取分类url列表
    def get_class_url_list(self):
        # 发起请求 获取指定页面
        base_url = "https://music.163.com/discover/artist"
        html_xml = self.get_html(base_url, 1)

        # 获取分类url
        class_url_list = html_xml.xpath("//a[@class='cat-flag']/@href")
        class_name_list = html_xml.xpath("//a[@class='cat-flag']/text()")
        del class_name_list[0]
        del class_url_list[0]
        # print(class_url_list)
        # print(class_name_list)
        # print(len(class_url_list))
        # print(len(class_name_list))
        for index in range(len(class_url_list)):
            # index += 1
            print("==============={}开始下载================".format(class_name_list[index]))
            # 拼接完整的分类url
            class_url = "https://music.163.com" + class_url_list[index]
            # print(class_url)

            # 通过分类url获取字母的url
            self.get_alphabet_url(class_url)

            # break

    def get_alphabet_url(self, class_url):
        # 获取分类url的页面 xml对象
        html_xml = self.get_html(class_url, 1)

        # 获取字母url列表
        alphabet_url_list = html_xml.xpath("//ul[@class='n-ltlst f-cb']/li[position()>1]/a/@href")
        # print(alphabet_url_list)

        # 循环获取每个字母url对应歌手信息
        for alphabet_url in alphabet_url_list:
            # 拼接完整的字母url
            alphabet_url = "https://music.163.com" + alphabet_url

            self.get_singer_info(alphabet_url)
            # break

    def get_singer_info(self, alphabet_url):

        # 根据字母url获取每个歌手的名称和对应的详情url
        html_xml = self.get_html(alphabet_url, 1)

        singer_name_list = html_xml.xpath("//a[@class='nm nm-icn f-thide s-fc0']/text()")
        singer_url_list = html_xml.xpath("//a[@class='nm nm-icn f-thide s-fc0']/@href")
        # print(singer_name_list)
        # print(singer_url_list)
        # print(len(singer_name_list))
        # print(len(singer_url_list))
        for index in range(len(singer_name_list)):
            # 声明一个存放歌手信息的字典
            singer_url = "https://music.163.com" + singer_url_list[index].strip()



            # import json
            # singer_dict = json.dumps(singer_dict)
            # with open("singer.txt", "w", encoding="utf-8") as f:
            #     f.write(singer_dict + "\n")

            html_xml = self.get_html(singer_url, 0)
            # tbody appears in the browser's rendered DOM but usually not in the HTML fetched by code
            hot_song = html_xml.xpath("//ul[@class='f-hide']/li/a/text()")
            # print(hot_song)
            singer_dict = {
                "singer_name": singer_name_list[index],
                "singer_url": singer_url,
                "hot_song": hot_song
            }
            print(self.count, singer_dict)
            self.count += 1
            # break

    # 获取指定url对应的页面信息
    def get_html(self, url, sign):
        '''
        :param url: the URL to fetch
        :param sign: originally meant to pick between two header sets (1 for the plain
                     headers, anything else for the cookie headers); in the current code
                     it only controls whether the singer page gets printed
        :return: an lxml HTML element for the page
        '''
        # headers = {"User-Agent": UserAgent().random}
        # if sign == 0:
        headers = {
            "cookie": "nts_mail_user=13349949963@163.com:-1:1; mail_psc_fingerprint=7fb6c5032f50ce8c1a07fdb15fd2251d; _iuqxldmzr_=32; _ntes_nnid=ec024cec32803d4dfd5c42e4e40cba08,1552969997617; _ntes_nuid=ec024cec32803d4dfd5c42e4e40cba08; WM_TID=eZJB4FRfmstFBVFRVFZ508IkS9OSa6K6; usertrack=CrHtiVyQhXO2rmpiAwOpAg==; UM_distinctid=16a307022e2b3-0b705b12e3ccd3-414f0c2a-100200-16a307022e3361; NTES_CMT_USER_INFO=72051947%7Cm13349949963_1%40163.com%7Chttp%3A%2F%2Fcms-bucket.nosdn.127.net%2F2018%2F08%2F13%2F078ea9f65d954410b62a52ac773875a1.jpeg%7Cfalse%7CbTEzMzQ5OTQ5OTYzXzFAMTYzLmNvbQ%3D%3D; vinfo_n_f_l_n3=dd7e8b71253298e9.1.0.1555590818606.0.1555590912731; P_INFO=m13349949963_1@163.com|1558093033|0|mail163|00&99|gud&1557298197&urs#bej&null#10#0#0|133963&1||13349949963@163.com; WM_NI=ROVoQSBgJquFTl4wFtlT0uStCW6f1tfWf3lX6czDHARSzgJQQaXu0QDk3vv%2BGl8GXFZhvOKF0OdWlzFB5MvSmfqUF%2B2c8YDTYjUbcM1JWQMmcQImmDpluWXxtf50voINRkI%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eeb4ae3fbbed98abef7d9a9a8bb2d85a939f9aaff763ac9a8c96ae79b5989da6f52af0fea7c3b92a92919a90d45982b98692f84e98b4fc98c580b08c0096d2808189fa87b480a689aad4ef54f6bdb6a5cb4b928db688c95b93bf9896b35b88b5fd97f52185b4f8a8db4e9ab8bab0ca4ef491acb8ef72869efbaef559afbabfb6c521f2bdf8bac7609bb69b83e247f39699b2d067a18f878ef050b4b4bbb8db74b8bafbd1f5658b929e8ccc37e2a3; __remember_me=true; gdxidpyhxdE=YoWfxdQEE%2BgYxhtnKi5zVBa4eaecS1%2F%2BR48h%2FgaKUjHCIj9OPH8QnoJuU4VE%2BYq4zYxRiKjDWw%2BR%2Bey3b9tDY4PDQSfKUjPQkuqfkPZY6oDRPPZouWGNpQMKNdSy8lpSY7W7Syf90lWTaOUXDzSavZz%5Cw4A1LcvEXNtkeBjksCD5L%2F7O%3A1559116416164; _9755xjdesxxd_=32; NETEASE_WDA_UID=1866933109#|#1559115550866; MUSIC_U=065d91e631703dfb7280fe33a565a5643bafb378927678189c0459a4967381afd261a8a054abc7f1c2a0cd2f9ccbfca9b9370d24fa62f9d6c26e43e3ad55584d850eee1fae4e41b77955a739ab43dce1; __csrf=b8c227a578ab1044087e44fe79d5b402; JSESSIONID-WYYY=blMRzR0VnxMzQI3YWDAisc30pDmUBmsJPcTiRP5bRK0eGtlnRzQnG4Ee963zZ9jzGlA1pX1VyCx8kOkqhCRWwDpAw84JQ4RetEJunCyMYUjgW5d5l4gPYKBTMPkBPiDD8pM9JGynKZei2c338XnVcZBC939OsBPXQR5UlDjc5pZf%2FCew%3A1559119405744"
        }
        response = requests.get(url, headers=headers)
        html = response.text
        # 只打印 歌手信息的页面
        if sign == 0:
            # print(html)
            pass
        return etree.HTML(html)


if __name__ == '__main__':
    music = Music()
    music()


'''
Output formats considered:
index {"singer": "ljj", "hot_song": ["...", "..."]}
or simply {"ljj": ["...", "..."]}
'''

12. Scraping Lianjia rental listings (Redis + MySQL)

import redis
import requests
from lxml import etree
from fake_useragent import UserAgent
import re
import pymysql


class CityArea:

    def __init__(self):
        # 初始化redis连接
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        self.get_city_area()

    # redis数据库连接
    def get_redis(self):
        return redis.Redis(host="127.0.0.1", port=6379, db=1)

    def get_city_area(self):
        # 获取城区信息
        base_url = "https://bj.lianjia.com/zufang/"
        html_xml = self.get_html(base_url)

        city_area_list = html_xml.xpath("//ul[@data-target='area']/li[position()>1]/a/@href | "
                       "//ul[@data-target='area']/li[position()>1]/a/text()")
        print(city_area_list)
        print(len(city_area_list))

        for city_area in city_area_list:
            if "zufang" in city_area:
                city_area = "https://bj.lianjia.com" + city_area
            print(city_area)
            # 将城区信息插入数据库
            self.r.rpush("city_area_list", city_area)

    # 获取指定url对应xml页面
    def get_html(self, url):
        headers = {"User-Agent": UserAgent().random}
        response = requests.get(url, headers=headers)
        html = response.text
        # print(html)
        return etree.HTML(html)


class BusinessCircle(CityArea):

    def __call__(self, *args, **kwargs):
        self.get_business_circle()

    # 通过城区url获取商圈url
    def get_business_circle(self):
        count = 1
        # 查询城区信息
        city_area_list = self.r.lrange("city_area_list", 0, -1)
        # print(city_area_list)
        for index in range(0, len(city_area_list), 2):
            # print(index)
            # 分别获取城区url和城区的名称
            city_area_url = city_area_list[index].decode("utf-8")
            city_area_name = city_area_list[index+1].decode("utf-8")
            print(city_area_url, city_area_name)

            # 获取城区url xml对象
            html_xml = self.get_html(city_area_url)
            # 获取商圈信息
            business_circle_list = html_xml.xpath("//div[@id='filter']/ul[4]/li[position()>1]/a/@href | "
                                                  "//div[@id='filter']/ul[4]/li[position()>1]/a/text()")

            print(business_circle_list)
            for index in range(len(business_circle_list)):
                # 获取商圈列表中的信息
                business_circle = business_circle_list[index]
                # 将城区和商圈用-连接起来 存入数据库
                if index % 2 == 1:
                    business_circle = city_area_name + "-" + business_circle_list[index]
                print(count, business_circle, type(business_circle))
                # print(type(business_circle))
                count += 1

                # 存入数据库
                self.r.rpush("business_circle_list", business_circle)

            # break


class Lian(CityArea):

    def __call__(self, *args, **kwargs):
        self.conn_mysql()
        self.count_ucid = 1
        self.get_page_url()

    def get_page_url(self):
        # 查询数据库中的商圈信息
        business_circle_list = self.r.lrange("business_circle_list", 0, -1)
        # print(business_circle_list)
        # 循环获取商圈url
        for index in range(0, len(business_circle_list), 2):
            # 分别获取商圈url和商圈名称
            business_circle_url = business_circle_list[index].decode("utf-8")
            # 拼接完整的商圈url
            business_circle_url = "https://bj.lianjia.com" + business_circle_url
            business_circle_name = business_circle_list[index+1].decode("utf-8")
            print(f"==================={business_circle_name}开始下载====================")
            print(business_circle_url, business_circle_name)
            # 获取商圈url指定xml页面
            html_xml = self.get_html(business_circle_url)

            # 获取最大页码
            max_page = html_xml.xpath("//div[@class='content__pg']/@data-totalpage")
            # If no max page is found, max_page is an empty list; skip this business circle
            if not max_page:
                continue
            max_page = int(max_page[0])
            # print(max_page, type(max_page))

            # 循环生成分页url
            for page in range(1, max_page+1):
                # 拼接完整的分页url
                page_url = business_circle_url + "pg{}/".format(page)
                # print(page_url)
                # 获取数据
                self.get_data(page_url)
                break
            break

    # 获取指定分页url的数据
    def get_data(self, page_url):
        # 获取分页url页面
        html_xml = self.get_html(page_url)

        # 缩小范围
        div_list = html_xml.xpath("//div[@class='content__list']/div")

        for div in div_list:
            # 图片
            pic = div.xpath(".//img/@data-src")[0]
            pic = pic.replace("250x182", "2500x1800")
            # print(pic)

            # 标题
            title = div.xpath(".//p[@class='content__list--item--title twoline']/a/text()")[0].strip()
            # print(title)

            # 城区
            city_area = div.xpath(".//p[@class='content__list--item--des']/a[1]/text()")[0]

            # 商圈
            business_circle = div.xpath(".//p[@class='content__list--item--des']/a[2]/text()")[0]
            # print(city_area, business_circle)

            # 面积
            area = div.xpath(".//p[@class='content__list--item--des']//text()[4]")
            area = area[0].strip() if area else ""  # 空值处理
            # print(area)

            # 朝向
            toward = div.xpath(".//p[@class='content__list--item--des']//text()[5]")[0].strip()
            # print(toward)

            # 房间信息
            fang_info = div.xpath(".//p[@class='content__list--item--des']//text()[6]")[0].strip()
            # print(fang_info)
            room = re.findall("(\d+)室", fang_info)  # 室
            hall = re.findall("(\d+)厅",fang_info)  # 厅
            toilet = re.findall("(\d+)卫", fang_info)  # 卫
            # 空值处理
            room = int(room[0]) if room else 0
            hall = int(hall[0]) if hall else 0
            toilet = int(toilet[0]) if toilet else 0
            # print(room, hall, toilet)

            # 发布时间
            publish_date = div.xpath(".//p[@class='content__list--item--time oneline']/text()")[0]
            # print(publish_date)

            # 标签
            sign_list = div.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")
            # print(sign_list)
            # 将标签转换为字符串
            sign = "#".join(sign_list)
            # print(sign)

            # 价格
            price = div.xpath(".//em/text()")[0]
            # print(price)

            # 详情url
            detail_url = div.xpath(".//p[@class='content__list--item--title twoline']/a/@href")[0]
            # 拼接完整的详情url
            detail_url = "https://bj.lianjia.com" + detail_url
            # print(detail_url)

            fang_dict = {
                "pic": pic, "title": title, "city_area": city_area, "business_circle": business_circle,
                "area": area, "toward": toward, "room": room, "hall": hall, "toilet": toilet,
                "publish_date": publish_date, "sign": sign, "price": price, "detail_url": detail_url
            }

            self.parse_detail(fang_dict)

    # 解析详情页
    def parse_detail(self, fang_dict):

        # print(fang_dict)
        detail_url = fang_dict['detail_url']
        print(detail_url)

        # 获取详情url对应的xml对象
        html_xml = self.get_html(detail_url)

        floor = html_xml.xpath("//ul/li[@class='fl oneline'][8]/text()")
        floor = floor[0] if floor else ""
        # print(floor)

        # The agent's phone number is not in the page itself;
        # it comes from a separate broker API
        # phone = html_xml.xpath(".//p[@class='content__aside__list--bottom oneline phone']/text()")
        # print(phone)

        # 获取经纪人id号 ucid
        ucid = self.get_ucid(html_xml)
        # print(ucid)
        # 获取house_code
        house_code = re.findall("zufang/(.*?).html", detail_url)[0]
        # print(house_code)

        # 拼接完整的经纪人接口
        agent_url = f"https://bj.lianjia.com/zufang/aj/house/brokers?" \
                    f"house_codes={house_code}&position=bottom" \
                    f"&ucid={ucid}"
        # print(agent_url)
        try:
            # 获取接口中的信息
            headers = {"User-Agent": UserAgent().random}
            json_data = requests.get(agent_url, headers=headers).json()
            # print(json_data)
            phone = json_data.get("data")[house_code][house_code].get("tp_number")

            # print(phone)
        except Exception as e:
            print(e)
            phone = ''

        # 将电话和楼层信息放到fang_dict中
        fang_dict["floor"] = floor
        fang_dict["phone"] = phone

        self.insert_mysql(fang_dict)

    def insert_mysql(self, fang_dict):
        print(self.conn)
        print(self.cur)

    def conn_mysql(self):
        # 创建数据库的连接对象
        self.conn = pymysql.connect(host="127.0.0.1", user="root",
                                    database="0218", charset="utf8")
        # 创建操作数据库的对象
        self.cur = self.conn.cursor()

    def get_ucid(self, html_xml):

        try:
            ucid = html_xml.xpath("//span[@class='contact__im im__online']/@data-info")[0]
            # print(ucid)
            self.count_ucid = 1
            return ucid
        except Exception as e:
            print(e)
            if self.count_ucid == 3:
                return ""
            else:
                self.count_ucid += 1
                return self.get_ucid(html_xml)


# ucid = self.get_ucid() = self.get_ucid(html_xml) = ucid


if __name__ == '__main__':
    # cityarea = CityArea()
    # cityarea()
    # Instantiate BusinessCircle; calling the instance triggers __call__
    # bc = BusinessCircle()
    # bc()
    lian = Lian()
    lian()




'''
Phone-number API analysis:
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2259333770690183168&position=bottom&ucid=1000000026012783
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2234691835526389760&position=bottom&ucid=1000000023002201
'''
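
insert_mysql above only prints the connection objects. A minimal sketch of the actual write is shown below; the table and column names are assumptions for illustration, not part of the original code:

    def insert_mysql(self, fang_dict):
        # Hypothetical table/columns; adjust to your own schema
        sql = ("INSERT INTO lianjia_zufang "
               "(title, city_area, business_circle, area, room, hall, toilet, price, phone, detail_url) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        values = (fang_dict["title"], fang_dict["city_area"], fang_dict["business_circle"],
                  fang_dict["area"], fang_dict["room"], fang_dict["hall"], fang_dict["toilet"],
                  fang_dict["price"], fang_dict["phone"], fang_dict["detail_url"])
        try:
            self.cur.execute(sql, values)
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            print(e)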