# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import defaultdict
from paddle.amp import GradScaler
from paddle import _C_ops
import paddle


class LSCGradScaler(GradScaler):
    def __init__(self,
                 enable=True,
                 init_loss_scaling=2.**15,
                 incr_ratio=2.0,
                 decr_ratio=0.5,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 use_dynamic_loss_scaling=True,
                 max_loss_scaling=32768.0):
        super(LSCGradScaler, self).__init__(
            enable, init_loss_scaling, incr_ratio, decr_ratio,
            incr_every_n_steps, decr_every_n_nan_or_inf,
            use_dynamic_loss_scaling)
        self.max_loss_scaling = max_loss_scaling

    def step(self, optimizer, classifier=None):
        if not self._enable:
            if classifier is not None:
                classifier.step(optimizer)
            return optimizer.step()

        # unscale the grad
        self._unscale(optimizer)

        if self._found_inf:
            self._cache_founf_inf = True
        else:
            optimizer.step()
            if classifier is not None:
                classifier.step(optimizer)

            self._cache_founf_inf = False

        if self._use_dynamic_loss_scaling:
            # update the scale
            self._update()

    def _unscale(self, optimizer):
        if not self._enable:
            return

        param_grads_dict = defaultdict(list)
        dist_param_grads_dict = defaultdict(list)
        if getattr(optimizer, '_param_groups', None) and isinstance(
                optimizer._param_groups[0], dict):
            for group in optimizer._param_groups:
                for param in group['params']:
                    if not param.is_distributed:
                        if param._grad_ivar() is not None:
                            param_grads_dict[param._grad_ivar().dtype].append(
                                param._grad_ivar())
                    else:
                        if param._grad_ivar() is not None:
                            dist_param_grads_dict[param._grad_ivar(
                            ).dtype].append(param._grad_ivar())
        else:
            for param in optimizer._parameter_list:
                if not param.is_distributed:
                    if param._grad_ivar() is not None:
                        param_grads_dict[param._grad_ivar().dtype].append(
                            param._grad_ivar())
                else:
                    if param._grad_ivar() is not None:
                        dist_param_grads_dict[param._grad_ivar().dtype].append(
                            param._grad_ivar())
        for dtype in dist_param_grads_dict:
            for grad in dist_param_grads_dict[dtype]:
                self._found_inf = paddle.logical_not(
                    paddle.all(paddle.isfinite(grad)))
                if self._found_inf:
                    print('Found inf or nan in classifier, dtype is', dtype)
                    return

        for dtype in param_grads_dict:
            param_grads = param_grads_dict[dtype]
            _C_ops.check_finite_and_unscale(param_grads, self._scale,
                                            param_grads, self._found_inf)
            if self._found_inf:
                print('Found inf or nan in backbone, dtype is', dtype)
                break