# -*- coding: utf-8 -*- """ Bregman projections for regularized OT with GPU """ import numpy as np import cudamat def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False, log=False, returnAsGPU=False): """ Solve the entropic regularization optimal transport problem on GPU The function solves the following optimization problem: .. math:: \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) s.t. \gamma 1 = a \gamma^T 1= b \gamma\geq 0 where : - M is the (ns,nt) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - a and b are source and target weights (sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [2]_ Parameters ---------- a : np.ndarray (ns,) samples weights in the source domain b : np.ndarray (nt,) samples in the target domain M_GPU : cudamat.CUDAMatrix (ns,nt) loss matrix reg : float Regularization term >0 numItermax : int, optional Max number of iterations stopThr : float, optional Stop threshol on error (>0) verbose : bool, optional Print information along iterations log : bool, optional record log if True returnAsGPU : bool, optional return the OT matrix as a cudamat.CUDAMatrix Returns ------- gamma : (ns x nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters Examples -------- >>> import ot >>> a=[.5,.5] >>> b=[.5,.5] >>> M=[[0.,1.],[1.,0.]] >>> ot.sinkhorn(a,b,M,1) array([[ 0.36552929, 0.13447071], [ 0.13447071, 0.36552929]]) References ---------- .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 See Also -------- ot.lp.emd : Unregularized OT ot.optim.cg : General regularized OT """ # init data Nini = len(a) Nfin = len(b) if log: log = {'err': []} # we assume that no distances are null except those of the diagonal of # distances u = (np.ones(Nini)/Nini).reshape((Nini, 1)) u_GPU = cudamat.CUDAMatrix(u) a_GPU = cudamat.CUDAMatrix(a.reshape((Nini, 1))) ones_GPU = cudamat.empty(u_GPU.shape).assign(1) v = (np.ones(Nfin)/Nfin).reshape((Nfin, 1)) v_GPU = cudamat.CUDAMatrix(v) b_GPU = cudamat.CUDAMatrix(b.reshape((Nfin, 1))) M_GPU.divide(-reg) K_GPU = cudamat.exp(M_GPU) ones_GPU.divide(a_GPU, target=a_GPU) Kp_GPU = cudamat.empty(K_GPU.shape) K_GPU.mult_by_col(a_GPU, target=Kp_GPU) tmp_GPU = cudamat.empty(K_GPU.shape) cpt = 0 err = 1 while (err > stopThr and cpt < numItermax): uprev_GPU = u_GPU.copy() vprev_GPU = v_GPU.copy() KtransposeU_GPU = K_GPU.transpose().dot(u_GPU) b_GPU.divide(KtransposeU_GPU, target=v_GPU) ones_GPU.divide(Kp_GPU.dot(v_GPU), target=u_GPU) if (np.any(KtransposeU_GPU.asarray() == 0) or not u_GPU.allfinite() or not v_GPU.allfinite()): # we have reached the machine precision # come back to previous solution and quit loop print('Warning: numerical errors at iteration', cpt) u_GPU = uprev_GPU.copy() v_GPU = vprev_GPU.copy() break if cpt % 10 == 0: # we can speed up the process by checking for the error only all # the 10th iterations K_GPU.mult_by_col(u_GPU, target=tmp_GPU) tmp_GPU.mult_by_row(v_GPU.transpose(), target=tmp_GPU) bcopy_GPU = b_GPU.copy().transpose() bcopy_GPU.add_sums(tmp_GPU, axis=0, beta=-1) err = bcopy_GPU.euclid_norm()**2 if log: log['err'].append(err) if verbose: if cpt % 200 == 0: print('{:5s}|{:12s}'.format('It.', 'Err')+'\n'+'-'*19) print('{:5d}|{:8e}|'.format(cpt, err)) cpt += 1 if log: log['u'] = u_GPU.asarray() log['v'] = v_GPU.asarray() K_GPU.mult_by_col(u_GPU, target=K_GPU) K_GPU.mult_by_row(v_GPU.transpose(), target=K_GPU) if returnAsGPU: res = K_GPU else: res = K_GPU.asarray() if log: return res, log else: return res