# Author: Kilian Fatras
#
# License: MIT License

import numpy as np


##############################################################################
# Optimization toolbox for SEMI-DUAL problems
##############################################################################


def coordinate_grad_semi_dual(b, M, reg, beta, i):
    r'''
    Compute the coordinate gradient update for regularized discrete
    distributions for (i, :)

    The function computes the gradient of the semi dual problem:

    .. math::
        \max_v \sum_i a_i \left[\sum_j v_j b_j - reg \cdot
        \log\left(\sum_j b_j \exp\left(\frac{v_j - M_{i,j}}{reg}\right)
        \right)\right]

    Where :
    - M is the (ns,nt) metric cost matrix
    - v is a dual variable in R^J
    - reg is the regularization term
    - a and b are source and target weights (sum to 1)

    The algorithms used for solving the problem are the ASGD and SAG
    algorithms as proposed in [18]_ [alg.1 & alg.2]

    Parameters
    ----------
    b : np.ndarray(nt,)
        target measure
    M : np.ndarray(ns, nt)
        cost matrix
    reg : float
        regularization term > 0
    beta : np.ndarray(nt,)
        dual variable
    i : int
        index of the picked source sample

    Returns
    -------
    coordinate gradient : np.ndarray(nt,)

    Examples
    --------
    >>> import ot
    >>> import numpy as np
    >>> n_source = 7
    >>> n_target = 4
    >>> reg = 1
    >>> numItermax = 300000
    >>> a = ot.utils.unif(n_source)
    >>> b = ot.utils.unif(n_target)
    >>> rng = np.random.RandomState(0)
    >>> X_source = rng.randn(n_source, 2)
    >>> Y_target = rng.randn(n_target, 2)
    >>> M = ot.dist(X_source, Y_target)
    >>> method = "ASGD"
    >>> asgd_pi = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method, numItermax)
    >>> print(asgd_pi)

    References
    ----------
    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016).
        Stochastic Optimization for Large-scale Optimal Transport,
        Advances in Neural Information Processing Systems (2016),
        arXiv preprint arXiv:1605.08527.
    '''
    r = M[i, :] - beta
    exp_beta = np.exp(-r / reg) * b
    khi = exp_beta / np.sum(exp_beta)
    return b - khi
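
# --- Illustrative sketch (not part of the original module) -----------------
# A finite-difference sanity check of coordinate_grad_semi_dual. The i-th
# term of the semi-dual objective is
#     h_i(v) = <v, b> - reg * log(sum_j b_j exp((v_j - M_ij) / reg)),
# whose gradient in v is exactly b - khi as computed above. The helper name,
# problem sizes and tolerances below are assumptions made for this sketch.
def _check_coordinate_grad_semi_dual(reg=1., eps=1e-6, seed=0):
    rng = np.random.RandomState(seed)
    ns, nt = 5, 4
    b = np.full(nt, 1. / nt)
    M = rng.rand(ns, nt)
    beta = rng.randn(nt)
    i = 2

    def h(v):
        # i-th coordinate of the semi-dual objective (up to the a_i factor)
        return v.dot(b) - reg * np.log(np.sum(b * np.exp((v - M[i, :]) / reg)))

    grad = coordinate_grad_semi_dual(b, M, reg, beta, i)
    # forward finite differences along each canonical direction
    num = np.array([(h(beta + eps * np.eye(nt)[j]) - h(beta)) / eps
                    for j in range(nt)])
    return np.allclose(grad, num, atol=1e-4)
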
def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
    r'''
    Compute the SAG algorithm to solve the regularized discrete measures
    optimal transport max problem

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \ \gamma 1 = a

             \gamma^T 1 = b

             \gamma \geq 0

    Where :
    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term with
      :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the SAG algorithm
    as proposed in [18]_ [alg.1]

    Parameters
    ----------
    a : np.ndarray(ns,)
        source measure
    b : np.ndarray(nt,)
        target measure
    M : np.ndarray(ns, nt)
        cost matrix
    reg : float
        regularization term > 0
    numItermax : int
        number of iterations
    lr : float
        learning rate

    Returns
    -------
    v : np.ndarray(nt,)
        dual variable

    Examples
    --------
    >>> import ot
    >>> import numpy as np
    >>> n_source = 7
    >>> n_target = 4
    >>> reg = 1
    >>> numItermax = 300000
    >>> a = ot.utils.unif(n_source)
    >>> b = ot.utils.unif(n_target)
    >>> rng = np.random.RandomState(0)
    >>> X_source = rng.randn(n_source, 2)
    >>> Y_target = rng.randn(n_target, 2)
    >>> M = ot.dist(X_source, Y_target)
    >>> method = "ASGD"
    >>> asgd_pi = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method, numItermax)
    >>> print(asgd_pi)

    References
    ----------
    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016).
        Stochastic Optimization for Large-scale Optimal Transport,
        Advances in Neural Information Processing Systems (2016),
        arXiv preprint arXiv:1605.08527.
    '''
    if lr is None:
        lr = 1. / max(a / reg)
    n_source = np.shape(M)[0]
    n_target = np.shape(M)[1]
    cur_beta = np.zeros(n_target)
    stored_gradient = np.zeros((n_source, n_target))
    sum_stored_gradient = np.zeros(n_target)
    for _ in range(numItermax):
        i = np.random.randint(n_source)
        cur_coord_grad = a[i] * coordinate_grad_semi_dual(b, M, reg,
                                                          cur_beta, i)
        sum_stored_gradient += (cur_coord_grad - stored_gradient[i])
        stored_gradient[i] = cur_coord_grad
        cur_beta += lr * (1. / n_source) * sum_stored_gradient
    return cur_beta


def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
    r'''
    Compute the ASGD algorithm to solve the regularized semi-continuous
    measures optimal transport max problem

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \ \gamma 1 = a

             \gamma^T 1 = b

             \gamma \geq 0

    Where :
    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term with
      :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the ASGD algorithm
    as proposed in [18]_ [alg.2]

    Parameters
    ----------
    a : np.ndarray(ns,)
        source measure
    b : np.ndarray(nt,)
        target measure
    M : np.ndarray(ns, nt)
        cost matrix
    reg : float
        regularization term > 0
    numItermax : int
        number of iterations
    lr : float
        learning rate

    Returns
    -------
    ave_v : np.ndarray(nt,)
        dual variable

    Examples
    --------
    >>> import ot
    >>> import numpy as np
    >>> n_source = 7
    >>> n_target = 4
    >>> reg = 1
    >>> numItermax = 300000
    >>> a = ot.utils.unif(n_source)
    >>> b = ot.utils.unif(n_target)
    >>> rng = np.random.RandomState(0)
    >>> X_source = rng.randn(n_source, 2)
    >>> Y_target = rng.randn(n_target, 2)
    >>> M = ot.dist(X_source, Y_target)
    >>> method = "ASGD"
    >>> asgd_pi = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method, numItermax)
    >>> print(asgd_pi)

    References
    ----------
    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016).
        Stochastic Optimization for Large-scale Optimal Transport,
        Advances in Neural Information Processing Systems (2016),
        arXiv preprint arXiv:1605.08527.
    '''
    if lr is None:
        lr = 1. / max(a / reg)
    n_source = np.shape(M)[0]
    n_target = np.shape(M)[1]
    cur_beta = np.zeros(n_target)
    ave_beta = np.zeros(n_target)
    for cur_iter in range(numItermax):
        k = cur_iter + 1
        i = np.random.randint(n_source)
        cur_coord_grad = coordinate_grad_semi_dual(b, M, reg, cur_beta, i)
        cur_beta += (lr / np.sqrt(k)) * cur_coord_grad
        ave_beta = (1. / k) * cur_beta + (1 - 1. / k) * ave_beta
    return ave_beta
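
# --- Illustrative sketch (not part of the original module) -----------------
# SAG and ASGD optimize the same semi-dual objective, so on a small problem
# their dual iterates should land close to each other. The semi-dual is
# invariant to adding a constant to beta, hence the comparison of centered
# variables. The helper name, sizes, iteration budget and tolerance below
# are assumptions chosen for this toy sketch; agreement is typical, not
# guaranteed, since both runs are stochastic.
def _compare_sag_asgd(reg=1., n_iter=50000, seed=0):
    rng = np.random.RandomState(seed)
    ns, nt = 6, 4
    a = np.full(ns, 1. / ns)
    b = np.full(nt, 1. / nt)
    M = rng.rand(ns, nt)
    beta_sag = sag_entropic_transport(a, b, M, reg, n_iter)
    beta_asgd = averaged_sgd_entropic_transport(a, b, M, reg, n_iter)
    # compare the centered dual variables
    return np.allclose(beta_sag - beta_sag.mean(),
                       beta_asgd - beta_asgd.mean(), atol=1e-1)
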
def c_transform_entropic(b, M, reg, beta):
    r'''
    The goal is to recover u from the c-transform.

    The function computes the c-transform of a dual variable from the other
    dual variable:

    .. math::
        u_i = v^{c,reg}_i = -reg \cdot \log\left(\sum_j b_j
        \exp\left(\frac{v_j - M_{i,j}}{reg}\right)\right)

    Where :
    - M is the (ns,nt) metric cost matrix
    - u, v are dual variables in R^I x R^J
    - reg is the regularization term

    It is used to recover an optimal u from optimal v solving the semi dual
    problem, see Proposition 2.1 of [18]_

    Parameters
    ----------
    b : np.ndarray(nt,)
        target measure
    M : np.ndarray(ns, nt)
        cost matrix
    reg : float
        regularization term > 0
    beta : np.ndarray(nt,)
        dual variable

    Returns
    -------
    u : np.ndarray(ns,)
        dual variable

    Examples
    --------
    >>> import ot
    >>> import numpy as np
    >>> n_source = 7
    >>> n_target = 4
    >>> reg = 1
    >>> numItermax = 300000
    >>> a = ot.utils.unif(n_source)
    >>> b = ot.utils.unif(n_target)
    >>> rng = np.random.RandomState(0)
    >>> X_source = rng.randn(n_source, 2)
    >>> Y_target = rng.randn(n_target, 2)
    >>> M = ot.dist(X_source, Y_target)
    >>> method = "ASGD"
    >>> asgd_pi = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method, numItermax)
    >>> print(asgd_pi)

    References
    ----------
    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016).
        Stochastic Optimization for Large-scale Optimal Transport,
        Advances in Neural Information Processing Systems (2016),
        arXiv preprint arXiv:1605.08527.
    '''
    n_source = np.shape(M)[0]
    alpha = np.zeros(n_source)
    for i in range(n_source):
        r = M[i, :] - beta
        # log-sum-exp stabilization: shift by the minimum of r
        min_r = np.min(r)
        exp_beta = np.exp(-(r - min_r) / reg) * b
        alpha[i] = min_r - reg * np.log(np.sum(exp_beta))
    return alpha
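
# --- Illustrative sketch (not part of the original module) -----------------
# For any beta, pairing it with its entropic c-transform alpha makes the rows
# of pi = exp((alpha_i + beta_j - M_ij)/reg) * a_i * b_j sum exactly to a:
# the c-transform is precisely the normalization that enforces the source
# marginal. The helper name and problem sizes are assumptions for this sketch.
def _check_c_transform_rows(reg=1., seed=0):
    rng = np.random.RandomState(seed)
    ns, nt = 5, 4
    a = np.full(ns, 1. / ns)
    b = np.full(nt, 1. / nt)
    M = rng.rand(ns, nt)
    beta = rng.randn(nt)
    alpha = c_transform_entropic(b, M, reg, beta)
    pi = (np.exp((alpha[:, None] + beta[None, :] - M) / reg) *
          a[:, None] * b[None, :])
    return np.allclose(pi.sum(axis=1), a)
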
def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None,
                             log=False):
    r'''
    Compute the transportation matrix to solve the regularized discrete
    measures optimal transport max problem

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \ \gamma 1 = a

             \gamma^T 1 = b

             \gamma \geq 0

    Where :
    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term with
      :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the SAG or ASGD algorithm
    as proposed in [18]_

    Parameters
    ----------
    a : np.ndarray(ns,)
        source measure
    b : np.ndarray(nt,)
        target measure
    M : np.ndarray(ns, nt)
        cost matrix
    reg : float
        regularization term > 0
    method : str
        used method (SAG or ASGD)
    numItermax : int
        number of iterations
    lr : float
        learning rate
    log : bool, optional
        record log if True

    Returns
    -------
    pi : np.ndarray(ns, nt)
        transportation matrix
    log : dict
        log dictionary returned only if log==True in parameters

    Examples
    --------
    >>> import ot
    >>> import numpy as np
    >>> n_source = 7
    >>> n_target = 4
    >>> reg = 1
    >>> numItermax = 300000
    >>> a = ot.utils.unif(n_source)
    >>> b = ot.utils.unif(n_target)
    >>> rng = np.random.RandomState(0)
    >>> X_source = rng.randn(n_source, 2)
    >>> Y_target = rng.randn(n_target, 2)
    >>> M = ot.dist(X_source, Y_target)
    >>> method = "ASGD"
    >>> asgd_pi = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method, numItermax)
    >>> print(asgd_pi)

    References
    ----------
    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016).
        Stochastic Optimization for Large-scale Optimal Transport,
        Advances in Neural Information Processing Systems (2016),
        arXiv preprint arXiv:1605.08527.
    '''
    if method.lower() == "sag":
        opt_beta = sag_entropic_transport(a, b, M, reg, numItermax, lr)
    elif method.lower() == "asgd":
        opt_beta = averaged_sgd_entropic_transport(a, b, M, reg,
                                                   numItermax, lr)
    else:
        raise ValueError("Unknown method '%s': choose 'SAG' or 'ASGD'"
                         % method)

    opt_alpha = c_transform_entropic(b, M, reg, opt_beta)
    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :]) / reg) *
          a[:, None] * b[None, :])

    if log:
        log = {}
        log['alpha'] = opt_alpha
        log['beta'] = opt_beta
        return pi, log
    else:
        return pi


##############################################################################
# Optimization toolbox for DUAL problems
##############################################################################


def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha,
                    batch_beta):
    r'''
    Computes the partial gradient of the dual optimal transport problem.

    For each (i,j) in a batch of coordinates, the partial gradients are :

    .. math::
        \partial_{u_i} F = \frac{b_s}{l_v} u_i - \sum_{j \in B_v}
        \exp\left(\frac{u_i + v_j - M_{i,j}}{reg}\right) a_i b_j

        \partial_{v_j} F = \frac{b_s}{l_u} v_j - \sum_{i \in B_u}
        \exp\left(\frac{u_i + v_j - M_{i,j}}{reg}\right) a_i b_j

    Where :
    - M is the (ns,nt) metric cost matrix
    - u, v are dual variables in R^I x R^J
    - reg is the regularization term
    - :math:`B_u` and :math:`B_v` are lists of indices
    - :math:`b_s` is the size of the batches :math:`B_u` and :math:`B_v`
    - :math:`l_u` and :math:`l_v` are the lengths of :math:`B_u` and
      :math:`B_v`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the dual problem is the SGD algorithm
    as proposed in [19]_ [alg.1]

    Parameters
    ----------
    a : np.ndarray(ns,)
        source measure
    b : np.ndarray(nt,)
        target measure
    M : np.ndarray(ns, nt)
        cost matrix
    reg : float
        regularization term > 0
    alpha : np.ndarray(ns,)
        dual variable
    beta : np.ndarray(nt,)
        dual variable
    batch_size : int
        size of the batch
    batch_alpha : np.ndarray(bs,)
        batch of indices of alpha
    batch_beta : np.ndarray(bs,)
        batch of indices of beta

    Returns
    -------
    grad_alpha : np.ndarray(ns,)
        partial gradient of F with respect to alpha
    grad_beta : np.ndarray(nt,)
        partial gradient of F with respect to beta

    Examples
    --------
    >>> import ot
    >>> import numpy as np
    >>> n_source = 7
    >>> n_target = 4
    >>> reg = 1
    >>> numItermax = 20000
    >>> lr = 0.1
    >>> batch_size = 3
    >>> log = True
    >>> a = ot.utils.unif(n_source)
    >>> b = ot.utils.unif(n_target)
    >>> rng = np.random.RandomState(0)
    >>> X_source = rng.randn(n_source, 2)
    >>> Y_target = rng.randn(n_target, 2)
    >>> M = ot.dist(X_source, Y_target)
    >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size, numItermax, lr, log)
    >>> print(log['alpha'], log['beta'])
    >>> print(sgd_dual_pi)

    References
    ----------
    .. [19] Seguy, V., Damodaran, B. B., Flamary, R., Courty, N., Rolet, A.
        & Blondel, M. (2018). Large-scale Optimal Transport and Mapping
        Estimation, International Conference on Learning Representations
        (2018), arXiv preprint arXiv:1711.02283.
    '''
    G = - (np.exp((alpha[batch_alpha, None] + beta[None, batch_beta] -
                   M[batch_alpha, :][:, batch_beta]) / reg) *
           a[batch_alpha, None] * b[None, batch_beta])
    grad_beta = np.zeros(np.shape(M)[1])
    grad_alpha = np.zeros(np.shape(M)[0])
    grad_beta[batch_beta] = (b[batch_beta] * len(batch_alpha) /
                             np.shape(M)[0] + G.sum(0))
    grad_alpha[batch_alpha] = (a[batch_alpha] * len(batch_beta) /
                               np.shape(M)[1] + G.sum(1))
    return grad_alpha, grad_beta
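
# --- Illustrative sketch (not part of the original module) -----------------
# Minimal usage of batch_grad_dual: a single stochastic gradient evaluation
# on random batches. Only the batch coordinates of each returned array are
# touched; entries outside the batch stay exactly zero, which is what makes
# the sparse in-place updates in the SGD loop below valid. The helper name
# and problem sizes are assumptions for this sketch.
def _batch_grad_dual_demo(reg=1., seed=0):
    rng = np.random.RandomState(seed)
    ns, nt, bs = 6, 5, 3
    a = np.full(ns, 1. / ns)
    b = np.full(nt, 1. / nt)
    M = rng.rand(ns, nt)
    alpha, beta = np.zeros(ns), np.zeros(nt)
    batch_alpha = rng.choice(ns, bs, replace=False)
    batch_beta = rng.choice(nt, bs, replace=False)
    g_alpha, g_beta = batch_grad_dual(a, b, M, reg, alpha, beta, bs,
                                      batch_alpha, batch_beta)
    out_alpha = np.setdiff1d(np.arange(ns), batch_alpha)
    return np.all(g_alpha[out_alpha] == 0.) and g_beta.shape == (nt,)
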
def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr):
    r'''
    Compute the SGD algorithm to solve the regularized discrete measures
    optimal transport dual problem

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \ \gamma 1 = a

             \gamma^T 1 = b

             \gamma \geq 0

    Where :
    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term with
      :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the SGD algorithm
    as proposed in [19]_ [alg.1]

    Parameters
    ----------
    a : np.ndarray(ns,)
        source measure
    b : np.ndarray(nt,)
        target measure
    M : np.ndarray(ns, nt)
        cost matrix
    reg : float
        regularization term > 0
    batch_size : int
        size of the batch
    numItermax : int
        number of iterations
    lr : float
        learning rate

    Returns
    -------
    alpha : np.ndarray(ns,)
        dual variable
    beta : np.ndarray(nt,)
        dual variable

    Examples
    --------
    >>> import ot
    >>> import numpy as np
    >>> n_source = 7
    >>> n_target = 4
    >>> reg = 1
    >>> numItermax = 20000
    >>> lr = 0.1
    >>> batch_size = 3
    >>> log = True
    >>> a = ot.utils.unif(n_source)
    >>> b = ot.utils.unif(n_target)
    >>> rng = np.random.RandomState(0)
    >>> X_source = rng.randn(n_source, 2)
    >>> Y_target = rng.randn(n_target, 2)
    >>> M = ot.dist(X_source, Y_target)
    >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size, numItermax, lr, log)
    >>> print(log['alpha'], log['beta'])
    >>> print(sgd_dual_pi)

    References
    ----------
    .. [19] Seguy, V., Damodaran, B. B., Flamary, R., Courty, N., Rolet, A.
        & Blondel, M. (2018). Large-scale Optimal Transport and Mapping
        Estimation, International Conference on Learning Representations
        (2018), arXiv preprint arXiv:1711.02283.
    '''
    n_source = np.shape(M)[0]
    n_target = np.shape(M)[1]
    cur_alpha = np.zeros(n_source)
    cur_beta = np.zeros(n_target)
    for cur_iter in range(numItermax):
        # decaying step size in O(1/sqrt(k))
        k = np.sqrt(cur_iter + 1)
        batch_alpha = np.random.choice(n_source, batch_size, replace=False)
        batch_beta = np.random.choice(n_target, batch_size, replace=False)
        update_alpha, update_beta = batch_grad_dual(a, b, M, reg, cur_alpha,
                                                    cur_beta, batch_size,
                                                    batch_alpha, batch_beta)
        cur_alpha[batch_alpha] += (lr / k) * update_alpha[batch_alpha]
        cur_beta[batch_beta] += (lr / k) * update_beta[batch_beta]
    return cur_alpha, cur_beta
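
# --- Illustrative sketch (not part of the original module) -----------------
# The SGD loop above performs ascent on the entropic dual objective
#     F(u, v) = <u, a> + <v, b> - reg * sum_ij a_i b_j exp((u_i + v_j - M_ij)/reg).
# On a toy problem the objective should typically improve over the zero
# initialization, though the runs are stochastic. The helper name, sizes
# and iteration budget below are assumptions for this sketch.
def _check_dual_objective_improves(reg=1., seed=0):
    rng = np.random.RandomState(seed)
    ns, nt = 7, 5
    a = np.full(ns, 1. / ns)
    b = np.full(nt, 1. / nt)
    M = rng.rand(ns, nt)

    def dual_obj(u, v):
        K = (np.exp((u[:, None] + v[None, :] - M) / reg) *
             a[:, None] * b[None, :])
        return u.dot(a) + v.dot(b) - reg * K.sum()

    alpha, beta = sgd_entropic_regularization(a, b, M, reg, batch_size=3,
                                              numItermax=20000, lr=0.1)
    return dual_obj(alpha, beta) > dual_obj(np.zeros(ns), np.zeros(nt))
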
def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
                        log=False):
    r'''
    Compute the transportation matrix to solve the regularized discrete
    measures optimal transport dual problem

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \ \gamma 1 = a

             \gamma^T 1 = b

             \gamma \geq 0

    Where :
    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term with
      :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    Parameters
    ----------
    a : np.ndarray(ns,)
        source measure
    b : np.ndarray(nt,)
        target measure
    M : np.ndarray(ns, nt)
        cost matrix
    reg : float
        regularization term > 0
    batch_size : int
        size of the batch
    numItermax : int
        number of iterations
    lr : float
        learning rate
    log : bool, optional
        record log if True

    Returns
    -------
    pi : np.ndarray(ns, nt)
        transportation matrix
    log : dict
        log dictionary returned only if log==True in parameters

    Examples
    --------
    >>> import ot
    >>> import numpy as np
    >>> n_source = 7
    >>> n_target = 4
    >>> reg = 1
    >>> numItermax = 20000
    >>> lr = 0.1
    >>> batch_size = 3
    >>> log = True
    >>> a = ot.utils.unif(n_source)
    >>> b = ot.utils.unif(n_target)
    >>> rng = np.random.RandomState(0)
    >>> X_source = rng.randn(n_source, 2)
    >>> Y_target = rng.randn(n_target, 2)
    >>> M = ot.dist(X_source, Y_target)
    >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size, numItermax, lr, log)
    >>> print(log['alpha'], log['beta'])
    >>> print(sgd_dual_pi)

    References
    ----------
    .. [19] Seguy, V., Damodaran, B. B., Flamary, R., Courty, N., Rolet, A.
        & Blondel, M. (2018). Large-scale Optimal Transport and Mapping
        Estimation, International Conference on Learning Representations
        (2018), arXiv preprint arXiv:1711.02283.
    '''
    opt_alpha, opt_beta = sgd_entropic_regularization(a, b, M, reg,
                                                      batch_size, numItermax,
                                                      lr)
    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :]) / reg) *
          a[:, None] * b[None, :])

    if log:
        log = {}
        log['alpha'] = opt_alpha
        log['beta'] = opt_beta
        return pi, log
    else:
        return pi
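
# --- Illustrative sketch (not part of the original module) -----------------
# End-to-end usage on random data: the learned plan should approximately
# satisfy both marginal constraints. The tolerance, iteration budget and
# helper name are assumptions chosen for this small toy instance.
def _solve_dual_entropic_demo(reg=1., seed=0):
    rng = np.random.RandomState(seed)
    ns, nt = 7, 4
    a = np.full(ns, 1. / ns)
    b = np.full(nt, 1. / nt)
    M = rng.rand(ns, nt)
    pi = solve_dual_entropic(a, b, M, reg, batch_size=3, numItermax=20000,
                             lr=0.1)
    return (np.allclose(pi.sum(axis=1), a, atol=1e-2) and
            np.allclose(pi.sum(axis=0), b, atol=1e-2))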