Loss Landscape Sightseeing with Multi-Point Optimization
阅读原文时间:2023年07月11日阅读:3

目录

Skorokhodov I, Burtsev M. Loss Landscape Sightseeing with Multi-Point Optimization.[J]. arXiv: Learning, 2019.

@article{skorokhodov2019loss,

title={Loss Landscape Sightseeing with Multi-Point Optimization.},

author={Skorokhodov, Ivan and Burtsev, Mikhail},

journal={arXiv: Learning},

year={2019}}

现在的任务是: 给定第一幅图(一只鸟), 我们希望loss landscape表现得同这只鸟一样. 换言之, 这幅图的每一个像素点代表了一个相同规模的神经网络在某一组网络参数下的损失(或者正确率). 黑色的部分表示我们希望对应的网络能正确识别样本, 白色像素点则表示我们希望对应的网络错误识别样本. 第三幅图就是训练后正确率的热力图.

所以这实际上是一种正则化的过程.

固定神经网络\(\mathcal{F}\)的模式,给定三组参数\(w_O, w_{right}, w_{up}\), 考虑如下线性组合

\[w_{\alpha, \beta} = w_O + \alpha w_{right} + \beta w_{up},
\]

这里我们视\(\alpha, \beta \in \{0, 1, 2, \ldots\}\), \(w_{\alpha, \beta}\)就是图片中第\((\alpha, \beta)\)个元素(看了作者代码,在实际操作中\(\alpha, \beta\)可以再同乘一个系数).

所以, 样本\(x\), 传入网络\(\mathcal{F}_{\alpha, \beta}\),

\[loss =
\left \{
\begin{array}{ll}
\mathcal{L}(\mathcal{F}_{\alpha, \beta}(x), y) &K(\alpha, \beta) = 1 \\
-\mathcal{L}(\mathcal{F}_{\alpha, \beta}(x), y) &K(\alpha, \beta) = -1 \\
\end{array} \right.
\]

其中\(K(\alpha, \beta)=1\)表示需要正确分类, 反之为不正确分类. 在代码中, 发现作者对\(K(\alpha, \beta)=-1\)的损失额外乘上了一个系数.

在作者的代码中, 并非是按序选择\(\mathcal{F}_{\alpha, \beta}\)的, 而是随机选择.

另外, 并非一定要限制在二维, 作者只是为了便于说明.

此外, 可以通过Gram-Schmidt正交化使得\(w_{right}, w_{up}\)正交, 这部分就不讲了(蛮简单的).

作者的代码

"""
这部分相当于是作者代码中的layerops.py
个人认为代码的难点就在于此, 故只重写了这部分
的代码. 作者是完全重新定义Module模块, 我是在
Module的模块上进行修改.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

class Modulerebuild(nn.Module):
    """nn.Module variant whose "parameters" may be arbitrary tensors.

    A stock ``nn.Module`` only registers ``nn.Parameter`` objects, which
    forces every network weight to be a leaf of the autograd graph.  Here a
    weight may be any tensor with ``requires_grad == True`` (for example a
    combination of other tensors), so ``register_parameter`` and
    ``__setattr__`` are redefined to store such tensors in ``_parameters``.
    """

    def register_parameter(self, name: str, param):
        """Store ``param`` in ``self._parameters`` under ``name``.

        :param name: attribute name; a non-empty string without ``"."``.
        :param param: ``None`` or a tensor with ``requires_grad == True``.
        :raises AttributeError: if called before ``Module.__init__()``.
        :raises TypeError: if ``name`` is not a string.
        :raises KeyError: if ``name`` is empty, contains ``"."`` or clashes
            with an existing non-parameter attribute.
        :raises ValueError: if ``param`` does not require gradients.
        """
        if '_parameters' not in self.__dict__:
            raise AttributeError(
                "cannot assign parameter before Module.__init__() call")

        # ``torch._six.string_classes`` no longer exists in PyTorch >= 2.0;
        # on Python 3 it was simply ``str``.
        elif not isinstance(name, str):
            raise TypeError("parameter name should be a string. "
                            "Got {}".format(torch.typename(name)))
        elif '.' in name:
            raise KeyError("parameter name can't contain \".\"")
        elif name == '':
            raise KeyError("parameter name can't be empty string \"\"")
        elif hasattr(self, name) and name not in self._parameters:
            raise KeyError("attribute '{}' already exists".format(name))

        if param is None:
            self._parameters[name] = None
        elif not param.requires_grad:
            raise  ValueError("Invalid parameters, "
                              "the tensor should requires_grad == True")
        else:
            self._parameters[name] = param

    def __setattr__(self, name, value):
        """Route attribute assignment to the matching internal registry.

        * any ``torch.Tensor`` is registered as a parameter;
        * a ``Modulerebuild`` instance is registered as a child module;
        * assigning to an existing parameter / module / buffer slot updates
          that slot (only ``None`` is accepted for an existing module slot);
        * everything else becomes a plain Python attribute.
        """
        def remove_from(*dicts):
            # Drop any previous binding of ``name`` so it lives in one place.
            for d in dicts:
                if name in d:
                    del d[name]

        params = self.__dict__.get('_parameters')
        if isinstance(value, torch.Tensor):
            if params is None:
                raise AttributeError(
                    "cannot assign parameters before Module.__init__() call")
            remove_from(self.__dict__, self._buffers, self._modules)
            self.register_parameter(name, value)
        elif params is not None and name in params:
            self.register_parameter(name, value)
        else:
            modules = self.__dict__.get('_modules')
            if isinstance(value, Modulerebuild):
                if modules is None:
                    raise AttributeError(
                        "cannot assign module before Module.__init__() call")
                remove_from(self.__dict__, self._parameters, self._buffers)
                modules[name] = value
            elif modules is not None and name in modules:
                if value is not None:
                    raise TypeError("cannot assign '{}' as child module '{}' "
                                    "(torch.nn.Module or None expected)"
                                    .format(torch.typename(value), name))
                modules[name] = value
            else:
                buffers = self.__dict__.get('_buffers')
                if buffers is not None and name in buffers:
                    if value is not None and not isinstance(value, torch.Tensor):
                        raise TypeError("cannot assign '{}' as buffer '{}' "
                                        "(torch.Tensor or None expected)"
                                        .format(torch.typename(value), name))
                    buffers[name] = value
                else:
                    object.__setattr__(self, name, value)

    def parameters_(self):
        """Yield this module's registered tensors, then those of its children.

        Kept separate from ``parameters()`` because the two methods do not
        share a signature.  Slots that hold ``None`` (a cleared parameter or
        a cleared child module) are skipped.

        :return: generator over the registered tensors, depth first.
        """
        for p in self._parameters.values():
            if p is not None:
                yield p
        for m in self._modules.values():
            # A child slot may have been reset to None via __setattr__.
            if m is not None:
                yield from m.parameters_()

class Squentialrebuild(Modulerebuild):
    """Chain several layers and apply them in the order given.

    Each layer is also bound as attribute ``module_<i>``; layers that are
    ``Modulerebuild`` instances thereby become registered children, so the
    inherited ``parameters_()`` reaches their tensors.  Other callables
    (e.g. ``nn.ReLU()``) are kept as plain attributes.

    ``parameters_`` is intentionally not overridden: the previous override
    printed debug output and delegated to ``nn.Module.parameters()`` rather
    than to the ``parameters_`` traversal of the base class.
    """

    def __init__(self, *rebs):
        """
        :param rebs: the layers (callables) to apply, first to last.
        """
        super(Squentialrebuild, self).__init__()
        self.rebs = rebs
        for i, m in enumerate(self.rebs):
            setattr(self, f'module_{i}', m)

    def forward(self, x):
        """Feed ``x`` through every layer in turn and return the result."""
        for m in self.rebs:
            x = m(x)

        return x

class Linearrebuild(Modulerebuild):
    """Fully connected layer driven by externally supplied tensors."""

    def __init__(self, weight, bias=None):
        """
        :param weight: weight tensor handed to ``F.linear``.
        :param bias: optional bias tensor handed to ``F.linear``.
        """
        super(Linearrebuild, self).__init__()
        self.weight = weight
        self.bias = bias

    def forward(self, x):
        """Return ``F.linear(x, weight, bias)``.

        Implemented as ``forward`` (not ``__call__``) so the layer follows
        the same convention as the other ``*rebuild`` layers and goes
        through ``nn.Module.__call__``; ``layer(x)`` keeps working.
        """
        return F.linear(x, self.weight, self.bias)

class Conv2drebuild(Modulerebuild):
    """2-D convolution layer driven by externally supplied tensors."""

    def __init__(self, weight, bias=None, **kwargs):
        """
        :param weight: kernel tensor handed to ``F.conv2d``.
        :param bias: optional bias tensor handed to ``F.conv2d``.
        :param kwargs: extra keyword arguments forwarded verbatim to
            ``F.conv2d`` on every call.
        """
        super().__init__()
        self.kwargs = kwargs
        self.weight, self.bias = weight, bias

    def forward(self, x):
        """Convolve ``x`` with the stored kernel, bias and options."""
        conv_options = self.kwargs
        return F.conv2d(x, self.weight, self.bias, **conv_options)

class BatchNormrebuild(Modulerebuild):
    """Batch normalisation layer driven by externally supplied tensors.

    ``F.batch_norm`` is always invoked with ``training=True``, and the
    running-statistics arguments are throw-away tensors created on each
    call, so nothing is carried over between calls.
    """

    def __init__(self, weight, bias, **kwargs):
        """
        :param weight: scale tensor handed to ``F.batch_norm``.
        :param bias: shift tensor handed to ``F.batch_norm``.
        :param kwargs: extra keyword arguments forwarded verbatim to
            ``F.batch_norm`` on every call.
        """
        super().__init__()
        self.kwargs = kwargs
        self.weight, self.bias = weight, bias

    def forward(self, x):
        """Normalise ``x`` and apply the stored scale and shift."""
        # Placeholder running statistics, rebuilt on every call.
        placeholder_mean = torch.zeros_like(self.bias)
        placeholder_var = torch.ones_like(self.weight)
        normalised = F.batch_norm(
            x, placeholder_mean, placeholder_var,
            self.weight, self.bias,
            training=True, **self.kwargs
        )
        return normalised

class Net(Modulerebuild):
    """Network consisting of one ``Squentialrebuild`` stack named ``dense``."""

    def __init__(self, *rebs):
        """
        :param rebs: the layers, in application order, for the stack.
        """
        super().__init__()
        self.dense = Squentialrebuild(*rebs)

    def forward(self, x):
        """Run ``x`` through the ``dense`` stack and return its output."""
        out = self.dense(x)
        return out

if __name__ == "__main__":
    # Demo: a Linear -> BatchNorm -> ReLU -> Linear network whose last bias
    # is a derived (non-leaf) tensor, trained for a single SGD step.

    # Leaf tensors, created in the order they are consumed below.
    fc1_weight = torch.rand(10, 2, requires_grad=True)
    fc1_bias = torch.rand(10, requires_grad=True)
    bn_weight = torch.rand(10, requires_grad=True)
    bn_bias = torch.rand(10, requires_grad=True)
    fc2_weight = torch.rand(1, 10, requires_grad=True)
    fc2_bias = torch.rand(1, requires_grad=True)

    parameters = [
        fc1_weight, fc1_bias,
        bn_weight, bn_bias,
        fc2_weight, fc2_bias,
    ]
    rebs = [
        Linearrebuild(fc1_weight, fc1_bias),
        BatchNormrebuild(bn_weight, bn_bias),
        nn.ReLU(),
        # Note: the bias passed here (fc2_bias + 1) is not a leaf tensor.
        Linearrebuild(fc2_weight, fc2_bias + 1),
    ]

    net = Net(*rebs)
    x = torch.tensor([[1., 2.], [3., 4]])
    y = torch.tensor([[1.], [2.]])

    criterion = nn.MSELoss()
    # The optimiser receives the leaf tensors directly: net.parameters_()
    # can list the network's tensors, but when some of them are not leaves
    # the optimiser still cannot update them.
    opti = torch.optim.SGD(parameters, lr=0.001)

    print(parameters[-1])
    loss = criterion(net(x), y)
    opti.zero_grad()
    loss.backward()
    opti.step()
    # The underlying leaf bias has nevertheless been updated.
    print(parameters[-1])