zero-cost-nas/foresight/pruners/measures/grasp.py

# Copyright 2021 Samsung Electronics Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd

from . import measure
from ..p_utils import get_layer_metric_array


@measure('grasp', bn=True, mode='param')
def compute_grasp_per_weight(net, inputs, targets, mode, loss_fn, T=1, num_iters=1, split_data=1):
    """Compute the GraSP sensitivity ``-theta * (H g)`` for Conv2d/Linear weights.

    Two passes are made over the batch, each processed in ``split_data``
    consecutive chunks:

    1. The gradient of the loss w.r.t. every Conv2d/Linear weight is
       accumulated over all chunks (and ``num_iters`` repetitions) into ``g``.
    2. For every chunk the loss gradient is recomputed with
       ``create_graph=True`` and ``z = sum(g * grad)`` is back-propagated, so
       that each ``weight.grad`` accumulates the Hessian-gradient product.

    Args:
        net: module whose ``nn.Conv2d`` / ``nn.Linear`` layers are scored.
        inputs: input batch; it is sliced along dimension 0.
        targets: targets sliced with the same indices as ``inputs``.
        mode: forwarded unchanged to ``get_layer_metric_array``.
        loss_fn: callable invoked as ``loss_fn(outputs, targets)``.
        T: divisor applied to the network outputs before the loss.
        num_iters: number of first-pass repetitions per chunk. The same data
            is reused on every repetition, so this only scales ``g``.
        split_data: number of chunks the batch is split into.

    Returns:
        The result of ``get_layer_metric_array(net, grasp, mode)``, where
        ``grasp(layer)`` is ``-weight * weight.grad`` or a zero tensor shaped
        like the weight when no gradient reached that layer.
    """
    # get all applicable weights (same order as net.modules())
    layers = [m for m in net.modules() if isinstance(m, (nn.Conv2d, nn.Linear))]
    weights = [layer.weight for layer in layers]
    for weight in weights:
        weight.requires_grad_(True)  # TODO isn't this already true?

    # NOTE original code had some input/target splitting into 2
    # I am guessing this was because of GPU mem limit
    net.zero_grad()
    N = inputs.shape[0]

    def chunk_bounds(sp):
        # Integer bounds of chunk `sp`; together the chunks cover [0, N).
        return sp * N // split_data, (sp + 1) * N // split_data

    # forward/grad pass #1
    # The accumulator is created once, *outside* the chunk loop, so that the
    # gradients of every chunk are summed instead of keeping only the last one.
    grad_w = [None] * len(weights)
    for sp in range(split_data):
        st, en = chunk_bounds(sp)
        for _ in range(num_iters):
            # TODO get new data, otherwise num_iters is useless!
            outputs = net.forward(inputs[st:en]) / T
            loss = loss_fn(outputs, targets[st:en])
            grad_w_p = autograd.grad(loss, weights, allow_unused=True)
            for idx, grad in enumerate(grad_w_p):
                if grad is None:
                    # allow_unused=True yields None for weights that do not
                    # take part in the forward pass; nothing to accumulate.
                    continue
                if grad_w[idx] is None:
                    grad_w[idx] = grad
                else:
                    grad_w[idx] += grad

    for sp in range(split_data):
        st, en = chunk_bounds(sp)

        # forward/grad pass #2
        outputs = net.forward(inputs[st:en]) / T
        loss = loss_fn(outputs, targets[st:en])
        grad_f = autograd.grad(loss, weights, create_graph=True, allow_unused=True)

        # accumulate gradients computed in previous step and call backwards
        z = 0
        for g_w, g_f in zip(grad_w, grad_f):
            if g_w is not None and g_f is not None:
                # g_w is treated as a constant; only g_f carries the graph.
                z += (g_w.detach() * g_f).sum()
        # z stays the plain int 0 when no weight contributed a gradient; in
        # that case there is nothing to back-propagate.
        if isinstance(z, torch.Tensor):
            z.backward()

    # compute final sensitivity metric and put in grads
    def grasp(layer):
        if layer.weight.grad is not None:
            # NOTE in the grasp code they take the *bottom* (1-p)% of values
            # but we take the *top* (1-p)%, therefore we remove the -ve sign
            # EDIT accuracy seems to be negatively correlated with this
            # metric, so we add -ve sign here!
            return -layer.weight.data * layer.weight.grad   # -theta_q Hg
        return torch.zeros_like(layer.weight)

    grads = get_layer_metric_array(net, grasp, mode)

    return grads
upload 2023-05-04 13:09:03 +08:00			`# Copyright 2021 Samsung Electronics Co., Ltd.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`

			`# http://www.apache.org/licenses/LICENSE-2.0`

			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`# =============================================================================`

			`import torch`
			`import torch.nn as nn`
			`import torch.nn.functional as F`
			`import torch.autograd as autograd`

			`from . import measure`
			`from ..p_utils import get_layer_metric_array`


			`@measure('grasp', bn=True, mode='param')`
			`def compute_grasp_per_weight(net, inputs, targets, mode, loss_fn, T=1, num_iters=1, split_data=1):`

			`# get all applicable weights`
			`weights = []`
			`for layer in net.modules():`
			`if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear):`
			`weights.append(layer.weight)`
			`layer.weight.requires_grad_(True) # TODO isn't this already true?`

			`# NOTE original code had some input/target splitting into 2`
			`# I am guessing this was because of GPU mem limit`
			`net.zero_grad()`
			`N = inputs.shape[0]`
			`for sp in range(split_data):`
			`st=sp*N//split_data`
			`en=(sp+1)*N//split_data`

			`#forward/grad pass #1`
			`grad_w = None`
			`for _ in range(num_iters):`
			`#TODO get new data, otherwise num_iters is useless!`
			`outputs = net.forward(inputs[st:en])/T`
			`loss = loss_fn(outputs, targets[st:en])`
			`grad_w_p = autograd.grad(loss, weights, allow_unused=True)`
			`if grad_w is None:`
			`grad_w = list(grad_w_p)`
			`else:`
			`for idx in range(len(grad_w)):`
			`grad_w[idx] += grad_w_p[idx]`


			`for sp in range(split_data):`
			`st=sp*N//split_data`
			`en=(sp+1)*N//split_data`

			`# forward/grad pass #2`
			`outputs = net.forward(inputs[st:en])/T`
			`loss = loss_fn(outputs, targets[st:en])`
			`grad_f = autograd.grad(loss, weights, create_graph=True, allow_unused=True)`

			`# accumulate gradients computed in previous step and call backwards`
			`z, count = 0,0`
			`for layer in net.modules():`
			`if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear):`
			`if grad_w[count] is not None:`
			`z += (grad_w[count].data * grad_f[count]).sum()`
			`count += 1`
			`z.backward()`

			`# compute final sensitivity metric and put in grads`
			`def grasp(layer):`
			`if layer.weight.grad is not None:`
			`return -layer.weight.data * layer.weight.grad # -theta_q Hg`
			`#NOTE in the grasp code they take the bottom (1-p)% of values`
			`#but we take the top (1-p)%, therefore we remove the -ve sign`
			`#EDIT accuracy seems to be negatively correlated with this metric, so we add -ve sign here!`
			`else:`
			`return torch.zeros_like(layer.weight)`

			`grads = get_layer_metric_array(net, grasp, mode)`

			`return grads`