Diffstat (limited to 'ot')
-rw-r--r--  ot/__init__.py        |    6
-rw-r--r--  ot/bregman.py         |  512
-rw-r--r--  ot/da.py              | 1506
-rw-r--r--  ot/datasets.py        |  111
-rw-r--r--  ot/dr.py              |  201
-rw-r--r--  ot/gpu/__init__.py    |    5
-rw-r--r--  ot/gpu/bregman.py     |   18
-rw-r--r--  ot/gpu/da.py          |   10
-rw-r--r--  ot/lp/EMD.h           |    8
-rw-r--r--  ot/lp/EMD_wrapper.cpp |    9
-rw-r--r--  ot/lp/__init__.py     |   48
-rw-r--r--  ot/lp/emd_wrap.pyx    |  101
-rw-r--r--  ot/optim.py           |  139
-rw-r--r--  ot/plot.py            |  103
-rw-r--r--  ot/utils.py           |  346
15 files changed, 2202 insertions(+), 921 deletions(-)
diff --git a/ot/__init__.py b/ot/__init__.py
index a79a5ce..6d4c4c6 100644
--- a/ot/__init__.py
+++ b/ot/__init__.py
@@ -4,6 +4,10 @@
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#
+# License: MIT License
+
# All submodules and packages
from . import lp
@@ -24,6 +28,6 @@ from .utils import dist, unif, tic, toc, toq
__version__ = "0.3.1"
-__all__ = ["emd", "emd2", "sinkhorn","sinkhorn2", "utils", 'datasets',
+__all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets',
'bregman', 'lp', 'plot', 'tic', 'toc', 'toq',
'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim']
diff --git a/ot/bregman.py b/ot/bregman.py
index 0d68602..d63c51d 100644
--- a/ot/bregman.py
+++ b/ot/bregman.py
@@ -3,9 +3,15 @@
Bregman projections for regularized OT
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+# Nicolas Courty <ncourty@irisa.fr>
+#
+# License: MIT License
+
import numpy as np
-def sinkhorn(a,b, M, reg,method='sinkhorn', numItermax = 1000, stopThr=1e-9, verbose=False, log=False,**kwargs):
+
+def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs):
u"""
Solve the entropic regularization optimal transport problem and return the OT matrix
@@ -92,22 +98,29 @@ def sinkhorn(a,b, M, reg,method='sinkhorn', numItermax = 1000, stopThr=1e-9, ver
"""
- if method.lower()=='sinkhorn':
- sink= lambda: sinkhorn_knopp(a,b, M, reg,numItermax=numItermax,
- stopThr=stopThr, verbose=verbose, log=log,**kwargs)
- elif method.lower()=='sinkhorn_stabilized':
- sink= lambda: sinkhorn_stabilized(a,b, M, reg,numItermax=numItermax,
- stopThr=stopThr, verbose=verbose, log=log, **kwargs)
- elif method.lower()=='sinkhorn_epsilon_scaling':
- sink= lambda: sinkhorn_epsilon_scaling(a,b, M, reg,numItermax=numItermax,
- stopThr=stopThr, verbose=verbose, log=log, **kwargs)
+ if method.lower() == 'sinkhorn':
+ def sink():
+ return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose, log=log, **kwargs)
+ elif method.lower() == 'sinkhorn_stabilized':
+ def sink():
+ return sinkhorn_stabilized(a, b, M, reg, numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose, log=log, **kwargs)
+ elif method.lower() == 'sinkhorn_epsilon_scaling':
+ def sink():
+ return sinkhorn_epsilon_scaling(
+ a, b, M, reg, numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose, log=log, **kwargs)
else:
print('Warning: unknown method, using classic Sinkhorn-Knopp')
- sink= lambda: sinkhorn_knopp(a,b, M, reg, **kwargs)
+
+ def sink():
+ return sinkhorn_knopp(a, b, M, reg, **kwargs)
return sink()
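
The dispatcher above selects a solver by name and falls back to classic Sinkhorn-Knopp for unknown names. A minimal usage sketch (toy values, not part of this patch):

    import numpy as np
    import ot

    a = np.array([.5, .5])               # source weights
    b = np.array([.5, .5])               # target weights
    M = np.array([[0., 1.], [1., 0.]])   # cost matrix

    # default solver: classic Sinkhorn-Knopp
    G = ot.sinkhorn(a, b, M, reg=1.)

    # log-stabilized solver, safer for small reg
    G_stab = ot.sinkhorn(a, b, M, reg=1e-2, method='sinkhorn_stabilized')
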
-def sinkhorn2(a,b, M, reg,method='sinkhorn', numItermax = 1000, stopThr=1e-9, verbose=False, log=False,**kwargs):
+
+def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs):
u"""
Solve the entropic regularization optimal transport problem and return the loss
@@ -170,7 +183,7 @@ def sinkhorn2(a,b, M, reg,method='sinkhorn', numItermax = 1000, stopThr=1e-9, ve
>>> M=[[0.,1.],[1.,0.]]
>>> ot.sinkhorn2(a,b,M,1)
array([ 0.26894142])
-
+
References
@@ -194,27 +207,33 @@ def sinkhorn2(a,b, M, reg,method='sinkhorn', numItermax = 1000, stopThr=1e-9, ve
"""
- if method.lower()=='sinkhorn':
- sink= lambda: sinkhorn_knopp(a,b, M, reg,numItermax=numItermax,
- stopThr=stopThr, verbose=verbose, log=log,**kwargs)
- elif method.lower()=='sinkhorn_stabilized':
- sink= lambda: sinkhorn_stabilized(a,b, M, reg,numItermax=numItermax,
- stopThr=stopThr, verbose=verbose, log=log, **kwargs)
- elif method.lower()=='sinkhorn_epsilon_scaling':
- sink= lambda: sinkhorn_epsilon_scaling(a,b, M, reg,numItermax=numItermax,
- stopThr=stopThr, verbose=verbose, log=log, **kwargs)
+ if method.lower() == 'sinkhorn':
+ def sink():
+ return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose, log=log, **kwargs)
+ elif method.lower() == 'sinkhorn_stabilized':
+ def sink():
+ return sinkhorn_stabilized(a, b, M, reg, numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose, log=log, **kwargs)
+ elif method.lower() == 'sinkhorn_epsilon_scaling':
+ def sink():
+ return sinkhorn_epsilon_scaling(
+ a, b, M, reg, numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose, log=log, **kwargs)
else:
print('Warning: unknown method, using classic Sinkhorn-Knopp')
- sink= lambda: sinkhorn_knopp(a,b, M, reg, **kwargs)
-
- b=np.asarray(b,dtype=np.float64)
- if len(b.shape)<2:
- b=b.reshape((-1,1))
+
+ def sink():
+ return sinkhorn_knopp(a, b, M, reg, **kwargs)
+
+ b = np.asarray(b, dtype=np.float64)
+ if len(b.shape) < 2:
+ b = b.reshape((-1, 1))
return sink()
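
sinkhorn2 reshapes a 1-D b into a column vector so that the solver takes its multi-target branch and returns the loss rather than the coupling. Reusing the toy example from the docstring above:

    import ot

    a = [.5, .5]
    b = [.5, .5]
    M = [[0., 1.], [1., 0.]]

    loss = ot.sinkhorn2(a, b, M, 1)   # array([ 0.26894142])
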
-def sinkhorn_knopp(a,b, M, reg, numItermax = 1000, stopThr=1e-9, verbose=False, log=False,**kwargs):
+def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs):
"""
Solve the entropic regularization optimal transport problem and return the OT matrix
@@ -290,100 +309,101 @@ def sinkhorn_knopp(a,b, M, reg, numItermax = 1000, stopThr=1e-9, verbose=False,
"""
- a=np.asarray(a,dtype=np.float64)
- b=np.asarray(b,dtype=np.float64)
- M=np.asarray(M,dtype=np.float64)
-
-
- if len(a)==0:
- a=np.ones((M.shape[0],),dtype=np.float64)/M.shape[0]
- if len(b)==0:
- b=np.ones((M.shape[1],),dtype=np.float64)/M.shape[1]
+ a = np.asarray(a, dtype=np.float64)
+ b = np.asarray(b, dtype=np.float64)
+ M = np.asarray(M, dtype=np.float64)
+ if len(a) == 0:
+ a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
+ if len(b) == 0:
+ b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1]
# init data
Nini = len(a)
Nfin = len(b)
- if len(b.shape)>1:
- nbb=b.shape[1]
+ if len(b.shape) > 1:
+ nbb = b.shape[1]
else:
- nbb=0
-
+ nbb = 0
if log:
- log={'err':[]}
+ log = {'err': []}
- # we assume that no distances are null except those of the diagonal of distances
+ # we assume that no distances are null except those of the diagonal of
+ # distances
if nbb:
- u = np.ones((Nini,nbb))/Nini
- v = np.ones((Nfin,nbb))/Nfin
+ u = np.ones((Nini, nbb)) / Nini
+ v = np.ones((Nfin, nbb)) / Nfin
else:
- u = np.ones(Nini)/Nini
- v = np.ones(Nfin)/Nfin
+ u = np.ones(Nini) / Nini
+ v = np.ones(Nfin) / Nfin
+ # print(reg)
- #print(reg)
+ K = np.exp(-M / reg)
+ # print(np.min(K))
- K = np.exp(-M/reg)
- #print(np.min(K))
-
- Kp = (1/a).reshape(-1, 1) * K
+ Kp = (1 / a).reshape(-1, 1) * K
cpt = 0
- err=1
- while (err>stopThr and cpt<numItermax):
+ err = 1
+ while (err > stopThr and cpt < numItermax):
uprev = u
vprev = v
KtransposeU = np.dot(K.T, u)
v = np.divide(b, KtransposeU)
- u = 1./np.dot(Kp,v)
+ u = 1. / np.dot(Kp, v)
- if (np.any(KtransposeU==0) or
- np.any(np.isnan(u)) or np.any(np.isnan(v)) or
- np.any(np.isinf(u)) or np.any(np.isinf(v))):
+ if (np.any(KtransposeU == 0) or
+ np.any(np.isnan(u)) or np.any(np.isnan(v)) or
+ np.any(np.isinf(u)) or np.any(np.isinf(v))):
# we have reached the machine precision
# come back to previous solution and quit loop
print('Warning: numerical errors at iteration', cpt)
u = uprev
v = vprev
break
- if cpt%10==0:
- # we can speed up the process by checking for the error only all the 10th iterations
+ if cpt % 10 == 0:
+ # we can speed up the process by checking the error only every
+ # 10th iteration
if nbb:
- err = np.sum((u-uprev)**2)/np.sum((u)**2)+np.sum((v-vprev)**2)/np.sum((v)**2)
+ err = np.sum((u - uprev)**2) / np.sum((u)**2) + \
+ np.sum((v - vprev)**2) / np.sum((v)**2)
else:
transp = u.reshape(-1, 1) * (K * v)
- err = np.linalg.norm((np.sum(transp,axis=0)-b))**2
+ err = np.linalg.norm((np.sum(transp, axis=0) - b))**2
if log:
log['err'].append(err)
if verbose:
- if cpt%200 ==0:
- print('{:5s}|{:12s}'.format('It.','Err')+'\n'+'-'*19)
- print('{:5d}|{:8e}|'.format(cpt,err))
- cpt = cpt +1
+ if cpt % 200 == 0:
+ print(
+ '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
+ print('{:5d}|{:8e}|'.format(cpt, err))
+ cpt = cpt + 1
if log:
- log['u']=u
- log['v']=v
+ log['u'] = u
+ log['v'] = v
- if nbb: #return only loss
- res=np.zeros((nbb))
+ if nbb: # return only loss
+ res = np.zeros((nbb))
for i in range(nbb):
- res[i]=np.sum(u[:,i].reshape((-1,1))*K*v[:,i].reshape((1,-1))*M)
+ res[i] = np.sum(
+ u[:, i].reshape((-1, 1)) * K * v[:, i].reshape((1, -1)) * M)
if log:
- return res,log
+ return res, log
else:
return res
- else: # return OT matrix
+ else: # return OT matrix
if log:
- return u.reshape((-1,1))*K*v.reshape((1,-1)),log
+ return u.reshape((-1, 1)) * K * v.reshape((1, -1)), log
else:
- return u.reshape((-1,1))*K*v.reshape((1,-1))
+ return u.reshape((-1, 1)) * K * v.reshape((1, -1))
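
Stripped of logging, convergence checks and the multi-target branch, the loop above reduces to alternating marginal scalings of the Gibbs kernel; a bare sketch (no error handling, fixed iteration count):

    import numpy as np

    def sinkhorn_knopp_sketch(a, b, M, reg, n_iter=1000):
        """Bare-bones Sinkhorn-Knopp: alternate scaling of u and v."""
        K = np.exp(-M / reg)             # Gibbs kernel
        u = np.ones(len(a)) / len(a)
        for _ in range(n_iter):
            v = b / K.T.dot(u)           # match the target marginal
            u = a / K.dot(v)             # match the source marginal
        return u.reshape(-1, 1) * K * v.reshape(1, -1)
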
-def sinkhorn_stabilized(a,b, M, reg, numItermax = 1000,tau=1e3, stopThr=1e-9,warmstart=None, verbose=False,print_period=20, log=False,**kwargs):
+def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, warmstart=None, verbose=False, print_period=20, log=False, **kwargs):
"""
Solve the entropic regularization OT problem with log stabilization
@@ -468,21 +488,21 @@ def sinkhorn_stabilized(a,b, M, reg, numItermax = 1000,tau=1e3, stopThr=1e-9,war
"""
- a=np.asarray(a,dtype=np.float64)
- b=np.asarray(b,dtype=np.float64)
- M=np.asarray(M,dtype=np.float64)
+ a = np.asarray(a, dtype=np.float64)
+ b = np.asarray(b, dtype=np.float64)
+ M = np.asarray(M, dtype=np.float64)
- if len(a)==0:
- a=np.ones((M.shape[0],),dtype=np.float64)/M.shape[0]
- if len(b)==0:
- b=np.ones((M.shape[1],),dtype=np.float64)/M.shape[1]
+ if len(a) == 0:
+ a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
+ if len(b) == 0:
+ b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1]
# test if multiple target
- if len(b.shape)>1:
- nbb=b.shape[1]
- a=a[:,np.newaxis]
+ if len(b.shape) > 1:
+ nbb = b.shape[1]
+ a = a[:, np.newaxis]
else:
- nbb=0
+ nbb = 0
# init data
na = len(a)
@@ -490,81 +510,80 @@ def sinkhorn_stabilized(a,b, M, reg, numItermax = 1000,tau=1e3, stopThr=1e-9,war
cpt = 0
if log:
- log={'err':[]}
+ log = {'err': []}
- # we assume that no distances are null except those of the diagonal of distances
+ # we assume that no distances are null except those of the diagonal of
+ # distances
if warmstart is None:
- alpha,beta=np.zeros(na),np.zeros(nb)
+ alpha, beta = np.zeros(na), np.zeros(nb)
else:
- alpha,beta=warmstart
+ alpha, beta = warmstart
if nbb:
- u,v = np.ones((na,nbb))/na,np.ones((nb,nbb))/nb
+ u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
else:
- u,v = np.ones(na)/na,np.ones(nb)/nb
+ u, v = np.ones(na) / na, np.ones(nb) / nb
- def get_K(alpha,beta):
+ def get_K(alpha, beta):
"""log space computation"""
- return np.exp(-(M-alpha.reshape((na,1))-beta.reshape((1,nb)))/reg)
+ return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg)
- def get_Gamma(alpha,beta,u,v):
+ def get_Gamma(alpha, beta, u, v):
"""log space gamma computation"""
- return np.exp(-(M-alpha.reshape((na,1))-beta.reshape((1,nb)))/reg+np.log(u.reshape((na,1)))+np.log(v.reshape((1,nb))))
+ return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log(v.reshape((1, nb))))
- #print(np.min(K))
+ # print(np.min(K))
- K=get_K(alpha,beta)
+ K = get_K(alpha, beta)
transp = K
- loop=1
+ loop = 1
cpt = 0
- err=1
+ err = 1
while loop:
-
-
uprev = u
vprev = v
# sinkhorn update
- v = b/(np.dot(K.T,u)+1e-16)
- u = a/(np.dot(K,v)+1e-16)
-
+ v = b / (np.dot(K.T, u) + 1e-16)
+ u = a / (np.dot(K, v) + 1e-16)
# remove numerical problems and store them in K
- if np.abs(u).max()>tau or np.abs(v).max()>tau:
+ if np.abs(u).max() > tau or np.abs(v).max() > tau:
if nbb:
- alpha,beta=alpha+reg*np.max(np.log(u),1),beta+reg*np.max(np.log(v))
+ alpha, beta = alpha + reg * \
+ np.max(np.log(u), 1), beta + reg * np.max(np.log(v))
else:
- alpha,beta=alpha+reg*np.log(u),beta+reg*np.log(v)
+ alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v)
if nbb:
- u,v = np.ones((na,nbb))/na,np.ones((nb,nbb))/nb
+ u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
else:
- u,v = np.ones(na)/na,np.ones(nb)/nb
- K=get_K(alpha,beta)
-
+ u, v = np.ones(na) / na, np.ones(nb) / nb
+ K = get_K(alpha, beta)
- if cpt%print_period==0:
- # we can speed up the process by checking for the error only all the 10th iterations
+ if cpt % print_period == 0:
+ # we can speed up the process by checking the error only every
+ # 10th iteration
if nbb:
- err = np.sum((u-uprev)**2)/np.sum((u)**2)+np.sum((v-vprev)**2)/np.sum((v)**2)
+ err = np.sum((u - uprev)**2) / np.sum((u)**2) + \
+ np.sum((v - vprev)**2) / np.sum((v)**2)
else:
- transp = get_Gamma(alpha,beta,u,v)
- err = np.linalg.norm((np.sum(transp,axis=0)-b))**2
+ transp = get_Gamma(alpha, beta, u, v)
+ err = np.linalg.norm((np.sum(transp, axis=0) - b))**2
if log:
log['err'].append(err)
if verbose:
- if cpt%(print_period*20) ==0:
- print('{:5s}|{:12s}'.format('It.','Err')+'\n'+'-'*19)
- print('{:5d}|{:8e}|'.format(cpt,err))
+ if cpt % (print_period * 20) == 0:
+ print(
+ '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
+ print('{:5d}|{:8e}|'.format(cpt, err))
+ if err <= stopThr:
+ loop = False
- if err<=stopThr:
- loop=False
-
- if cpt>=numItermax:
- loop=False
-
+ if cpt >= numItermax:
+ loop = False
if np.any(np.isnan(u)) or np.any(np.isnan(v)):
# we have reached the machine precision
@@ -574,34 +593,34 @@ def sinkhorn_stabilized(a,b, M, reg, numItermax = 1000,tau=1e3, stopThr=1e-9,war
v = vprev
break
- cpt = cpt +1
+ cpt = cpt + 1
-
- #print('err=',err,' cpt=',cpt)
+ # print('err=',err,' cpt=',cpt)
if log:
- log['logu']=alpha/reg+np.log(u)
- log['logv']=beta/reg+np.log(v)
- log['alpha']=alpha+reg*np.log(u)
- log['beta']=beta+reg*np.log(v)
- log['warmstart']=(log['alpha'],log['beta'])
+ log['logu'] = alpha / reg + np.log(u)
+ log['logv'] = beta / reg + np.log(v)
+ log['alpha'] = alpha + reg * np.log(u)
+ log['beta'] = beta + reg * np.log(v)
+ log['warmstart'] = (log['alpha'], log['beta'])
if nbb:
- res=np.zeros((nbb))
+ res = np.zeros((nbb))
for i in range(nbb):
- res[i]=np.sum(get_Gamma(alpha,beta,u[:,i],v[:,i])*M)
- return res,log
+ res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
+ return res, log
else:
- return get_Gamma(alpha,beta,u,v),log
+ return get_Gamma(alpha, beta, u, v), log
else:
if nbb:
- res=np.zeros((nbb))
+ res = np.zeros((nbb))
for i in range(nbb):
- res[i]=np.sum(get_Gamma(alpha,beta,u[:,i],v[:,i])*M)
+ res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
return res
else:
- return get_Gamma(alpha,beta,u,v)
+ return get_Gamma(alpha, beta, u, v)
+
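The duals stored in log['warmstart'] let a later call resume from the absorbed potentials, which is exactly what sinkhorn_epsilon_scaling relies on below. An illustrative hand-off (random data, not from this patch):

    import numpy as np
    import ot
    from ot.bregman import sinkhorn_stabilized

    xs, xt = np.random.randn(50, 2), np.random.randn(50, 2)
    a, b = ot.unif(50), ot.unif(50)
    M = ot.dist(xs, xt)
    M /= M.max()

    # first solve with a loose regularization, keeping the duals
    G1, log1 = sinkhorn_stabilized(a, b, M, reg=1e-1, log=True)

    # re-solve with a tighter reg, warm-started from those duals
    G2 = sinkhorn_stabilized(a, b, M, reg=1e-2, warmstart=log1['warmstart'])
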
-def sinkhorn_epsilon_scaling(a,b, M, reg, numItermax = 100, epsilon0=1e4, numInnerItermax = 100,tau=1e3, stopThr=1e-9,warmstart=None, verbose=False,print_period=10, log=False,**kwargs):
+def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInnerItermax=100, tau=1e3, stopThr=1e-9, warmstart=None, verbose=False, print_period=10, log=False, **kwargs):
"""
Solve the entropic regularization optimal transport problem with log
stabilization and epsilon scaling.
@@ -690,14 +709,14 @@ def sinkhorn_epsilon_scaling(a,b, M, reg, numItermax = 100, epsilon0=1e4, numInn
"""
- a=np.asarray(a,dtype=np.float64)
- b=np.asarray(b,dtype=np.float64)
- M=np.asarray(M,dtype=np.float64)
+ a = np.asarray(a, dtype=np.float64)
+ b = np.asarray(b, dtype=np.float64)
+ M = np.asarray(M, dtype=np.float64)
- if len(a)==0:
- a=np.ones((M.shape[0],),dtype=np.float64)/M.shape[0]
- if len(b)==0:
- b=np.ones((M.shape[1],),dtype=np.float64)/M.shape[1]
+ if len(a) == 0:
+ a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
+ if len(b) == 0:
+ b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1]
# init data
na = len(a)
@@ -705,88 +724,94 @@ def sinkhorn_epsilon_scaling(a,b, M, reg, numItermax = 100, epsilon0=1e4, numInn
# relative numerical precision with 64 bits
numItermin = 35
- numItermax=max(numItermin,numItermax) # ensure that last velue is exact
-
+ numItermax = max(numItermin, numItermax) # ensure that last value is exact
cpt = 0
if log:
- log={'err':[]}
+ log = {'err': []}
- # we assume that no distances are null except those of the diagonal of distances
+ # we assume that no distances are null except those of the diagonal of
+ # distances
if warmstart is None:
- alpha,beta=np.zeros(na),np.zeros(nb)
+ alpha, beta = np.zeros(na), np.zeros(nb)
else:
- alpha,beta=warmstart
-
+ alpha, beta = warmstart
- def get_K(alpha,beta):
+ def get_K(alpha, beta):
"""log space computation"""
- return np.exp(-(M-alpha.reshape((na,1))-beta.reshape((1,nb)))/reg)
+ return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg)
- #print(np.min(K))
- def get_reg(n): # exponential decreasing
- return (epsilon0-reg)*np.exp(-n)+reg
+ # print(np.min(K))
+ def get_reg(n): # exponentially decreasing
+ return (epsilon0 - reg) * np.exp(-n) + reg
- loop=1
+ loop = 1
cpt = 0
- err=1
+ err = 1
while loop:
- regi=get_reg(cpt)
+ regi = get_reg(cpt)
- G,logi=sinkhorn_stabilized(a,b, M, regi, numItermax = numInnerItermax, stopThr=1e-9,warmstart=(alpha,beta), verbose=False,print_period=20,tau=tau, log=True)
+ G, logi = sinkhorn_stabilized(a, b, M, regi, numItermax=numInnerItermax, stopThr=1e-9, warmstart=(
+ alpha, beta), verbose=False, print_period=20, tau=tau, log=True)
- alpha=logi['alpha']
- beta=logi['beta']
+ alpha = logi['alpha']
+ beta = logi['beta']
- if cpt>=numItermax:
- loop=False
+ if cpt >= numItermax:
+ loop = False
- if cpt%(print_period)==0: # spsion nearly converged
- # we can speed up the process by checking for the error only all the 10th iterations
+ if cpt % (print_period) == 0: # epsilon nearly converged
+ # we can speed up the process by checking the error only every
+ # 10th iteration
transp = G
- err = np.linalg.norm((np.sum(transp,axis=0)-b))**2+np.linalg.norm((np.sum(transp,axis=1)-a))**2
+ err = np.linalg.norm(
+ (np.sum(transp, axis=0) - b))**2 + np.linalg.norm((np.sum(transp, axis=1) - a))**2
if log:
log['err'].append(err)
if verbose:
- if cpt%(print_period*10) ==0:
- print('{:5s}|{:12s}'.format('It.','Err')+'\n'+'-'*19)
- print('{:5d}|{:8e}|'.format(cpt,err))
+ if cpt % (print_period * 10) == 0:
+ print(
+ '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
+ print('{:5d}|{:8e}|'.format(cpt, err))
- if err<=stopThr and cpt>numItermin:
- loop=False
+ if err <= stopThr and cpt > numItermin:
+ loop = False
- cpt = cpt +1
- #print('err=',err,' cpt=',cpt)
+ cpt = cpt + 1
+ # print('err=',err,' cpt=',cpt)
if log:
- log['alpha']=alpha
- log['beta']=beta
- log['warmstart']=(log['alpha'],log['beta'])
- return G,log
+ log['alpha'] = alpha
+ log['beta'] = beta
+ log['warmstart'] = (log['alpha'], log['beta'])
+ return G, log
else:
return G
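
get_reg decays the regularization geometrically from epsilon0 down to the target reg, so early, heavily smoothed problems warm-start the later, sharper ones. A quick check of the schedule in isolation (parameter values illustrative):

    import numpy as np

    epsilon0, reg = 1e4, 1e-1

    def get_reg(n):  # same schedule as above
        return (epsilon0 - reg) * np.exp(-n) + reg

    print([round(get_reg(n), 1) for n in range(5)])
    # [10000.0, 3678.9, 1353.4, 498.0, 183.3] -> decays toward reg
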
-def geometricBar(weights,alldistribT):
+def geometricBar(weights, alldistribT):
"""return the weighted geometric mean of distributions"""
- assert(len(weights)==alldistribT.shape[1])
- return np.exp(np.dot(np.log(alldistribT),weights.T))
+ assert(len(weights) == alldistribT.shape[1])
+ return np.exp(np.dot(np.log(alldistribT), weights.T))
+
def geometricMean(alldistribT):
"""return the geometric mean of distributions"""
- return np.exp(np.mean(np.log(alldistribT),axis=1))
+ return np.exp(np.mean(np.log(alldistribT), axis=1))
-def projR(gamma,p):
+
+def projR(gamma, p):
"""return the KL projection on the row constrints """
- return np.multiply(gamma.T,p/np.maximum(np.sum(gamma,axis=1),1e-10)).T
+ return np.multiply(gamma.T, p / np.maximum(np.sum(gamma, axis=1), 1e-10)).T
+
-def projC(gamma,q):
+def projC(gamma, q):
"""return the KL projection on the column constrints """
- return np.multiply(gamma,q/np.maximum(np.sum(gamma,axis=0),1e-10))
+ return np.multiply(gamma, q / np.maximum(np.sum(gamma, axis=0), 1e-10))
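
projR and projC are the KL (Bregman) projections onto the row- and column-marginal constraints: they rescale gamma so the requested marginal holds exactly. A quick sanity check (random toy matrix):

    import numpy as np
    from ot.bregman import projR, projC

    gamma = np.random.rand(3, 4)
    p = np.array([.2, .3, .5])    # desired row sums
    q = np.ones(4) / 4            # desired column sums

    assert np.allclose(projR(gamma, p).sum(axis=1), p)
    assert np.allclose(projC(gamma, q).sum(axis=0), q)
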
-def barycenter(A,M,reg, weights=None, numItermax = 1000, stopThr=1e-4,verbose=False,log=False):
+def barycenter(A, M, reg, weights=None, numItermax=1000, stopThr=1e-4, verbose=False, log=False):
"""Compute the entropic regularized wasserstein barycenter of distributions A
The function solves the following optimization problem:
@@ -837,49 +862,49 @@ def barycenter(A,M,reg, weights=None, numItermax = 1000, stopThr=1e-4,verbose=Fa
"""
-
if weights is None:
- weights=np.ones(A.shape[1])/A.shape[1]
+ weights = np.ones(A.shape[1]) / A.shape[1]
else:
- assert(len(weights)==A.shape[1])
+ assert(len(weights) == A.shape[1])
if log:
- log={'err':[]}
+ log = {'err': []}
- #M = M/np.median(M) # suggested by G. Peyre
- K = np.exp(-M/reg)
+ # M = M/np.median(M) # suggested by G. Peyre
+ K = np.exp(-M / reg)
cpt = 0
- err=1
+ err = 1
- UKv=np.dot(K,np.divide(A.T,np.sum(K,axis=0)).T)
- u = (geometricMean(UKv)/UKv.T).T
+ UKv = np.dot(K, np.divide(A.T, np.sum(K, axis=0)).T)
+ u = (geometricMean(UKv) / UKv.T).T
- while (err>stopThr and cpt<numItermax):
- cpt = cpt +1
- UKv=u*np.dot(K,np.divide(A,np.dot(K,u)))
- u = (u.T*geometricBar(weights,UKv)).T/UKv
+ while (err > stopThr and cpt < numItermax):
+ cpt = cpt + 1
+ UKv = u * np.dot(K, np.divide(A, np.dot(K, u)))
+ u = (u.T * geometricBar(weights, UKv)).T / UKv
- if cpt%10==1:
- err=np.sum(np.std(UKv,axis=1))
+ if cpt % 10 == 1:
+ err = np.sum(np.std(UKv, axis=1))
# log and verbose print
if log:
log['err'].append(err)
if verbose:
- if cpt%200 ==0:
- print('{:5s}|{:12s}'.format('It.','Err')+'\n'+'-'*19)
- print('{:5d}|{:8e}|'.format(cpt,err))
+ if cpt % 200 == 0:
+ print(
+ '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
+ print('{:5d}|{:8e}|'.format(cpt, err))
if log:
- log['niter']=cpt
- return geometricBar(weights,UKv),log
+ log['niter'] = cpt
+ return geometricBar(weights, UKv), log
else:
- return geometricBar(weights,UKv)
+ return geometricBar(weights, UKv)
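
A usage sketch for the barycenter solver on a 1-D grid, with hand-built histograms (all values illustrative):

    import numpy as np
    import ot

    n = 50
    x = np.arange(n, dtype=np.float64).reshape(-1, 1)
    M = ot.dist(x, x)      # ground cost on the grid
    M /= M.max()

    # two normalized bumps as the columns of A
    a1 = np.exp(-(np.arange(n) - 15.) ** 2 / 20.)
    a2 = np.exp(-(np.arange(n) - 35.) ** 2 / 20.)
    A = np.vstack((a1 / a1.sum(), a2 / a2.sum())).T

    bary = ot.bregman.barycenter(A, M, reg=1e-1,
                                 weights=np.array([.5, .5]))
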
-def unmix(a,D,M,M0,h0,reg,reg0,alpha,numItermax = 1000, stopThr=1e-3,verbose=False,log=False):
+def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, stopThr=1e-3, verbose=False, log=False):
"""
Compute the unmixing of an observation with a given dictionary using Wasserstein distance
@@ -942,44 +967,45 @@ def unmix(a,D,M,M0,h0,reg,reg0,alpha,numItermax = 1000, stopThr=1e-3,verbose=Fal
"""
- #M = M/np.median(M)
- K = np.exp(-M/reg)
+ # M = M/np.median(M)
+ K = np.exp(-M / reg)
- #M0 = M0/np.median(M0)
- K0 = np.exp(-M0/reg0)
+ # M0 = M0/np.median(M0)
+ K0 = np.exp(-M0 / reg0)
old = h0
- err=1
- cpt=0
- #log = {'niter':0, 'all_err':[]}
+ err = 1
+ cpt = 0
+ # log = {'niter':0, 'all_err':[]}
if log:
- log={'err':[]}
-
-
- while (err>stopThr and cpt<numItermax):
- K = projC(K,a)
- K0 = projC(K0,h0)
- new = np.sum(K0,axis=1)
- inv_new = np.dot(D,new) # we recombine the current selection from dictionnary
- other = np.sum(K,axis=1)
- delta = np.exp(alpha*np.log(other)+(1-alpha)*np.log(inv_new)) # geometric interpolation
- K = projR(K,delta)
- K0 = np.dot(np.diag(np.dot(D.T,delta/inv_new)),K0)
-
- err=np.linalg.norm(np.sum(K0,axis=1)-old)
+ log = {'err': []}
+
+ while (err > stopThr and cpt < numItermax):
+ K = projC(K, a)
+ K0 = projC(K0, h0)
+ new = np.sum(K0, axis=1)
+ # we recombine the current selection from dictionary
+ inv_new = np.dot(D, new)
+ other = np.sum(K, axis=1)
+ # geometric interpolation
+ delta = np.exp(alpha * np.log(other) + (1 - alpha) * np.log(inv_new))
+ K = projR(K, delta)
+ K0 = np.dot(np.diag(np.dot(D.T, delta / inv_new)), K0)
+
+ err = np.linalg.norm(np.sum(K0, axis=1) - old)
old = new
if log:
log['err'].append(err)
if verbose:
- if cpt%200 ==0:
- print('{:5s}|{:12s}'.format('It.','Err')+'\n'+'-'*19)
- print('{:5d}|{:8e}|'.format(cpt,err))
+ if cpt % 200 == 0:
+ print('{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
+ print('{:5d}|{:8e}|'.format(cpt, err))
- cpt = cpt+1
+ cpt = cpt + 1
if log:
- log['niter']=cpt
- return np.sum(K0,axis=1),log
+ log['niter'] = cpt
+ return np.sum(K0, axis=1), log
else:
- return np.sum(K0,axis=1)
+ return np.sum(K0, axis=1)
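
The delta update in unmix is an elementwise weighted geometric mean between the data-fit marginal and the current dictionary reconstruction; in isolation (toy vectors):

    import numpy as np

    alpha = 0.5
    other = np.array([.2, .8])     # marginal of the data-fit plan
    inv_new = np.array([.4, .6])   # reconstruction D.dot(new)

    # geometric interpolation, as in the loop above
    delta = np.exp(alpha * np.log(other) + (1 - alpha) * np.log(inv_new))
    print(delta)   # elementwise other**alpha * inv_new**(1 - alpha)
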
diff --git a/ot/da.py b/ot/da.py
index ddf1c60..564c7b7 100644
--- a/ot/da.py
+++ b/ot/da.py
@@ -3,22 +3,34 @@
Domain adaptation with optimal transport
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+# Nicolas Courty <ncourty@irisa.fr>
+# Michael Perrot <michael.perrot@univ-st-etienne.fr>
+#
+# License: MIT License
+
import numpy as np
+
from .bregman import sinkhorn
from .lp import emd
-from .utils import unif,dist,kernel
+from .utils import unif, dist, kernel, cost_normalization
+from .utils import check_params, deprecated, BaseEstimator
from .optim import cg
from .optim import gcg
-def sinkhorn_lpl1_mm(a,labels_a, b, M, reg, eta=0.1,numItermax = 10,numInnerItermax = 200,stopInnerThr=1e-9,verbose=False,log=False):
+def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10,
+ numInnerItermax=200, stopInnerThr=1e-9, verbose=False,
+ log=False):
"""
- Solve the entropic regularization optimal transport problem with nonconvex group lasso regularization
+ Solve the entropic regularization optimal transport problem with nonconvex
+ group lasso regularization
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+ \eta \Omega_g(\gamma)
+ \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)
+ + \eta \Omega_g(\gamma)
s.t. \gamma 1 = a
@@ -28,11 +40,16 @@ def sinkhorn_lpl1_mm(a,labels_a, b, M, reg, eta=0.1,numItermax = 10,numInnerIter
where :
- M is the (ns,nt) metric cost matrix
- - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
- - :math:`\Omega_g` is the group lasso regulaization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1` where :math:`\mathcal{I}_c` are the index of samples from class c in the source domain.
+ - :math:`\Omega_e` is the entropic regularization term
+ :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+ - :math:`\Omega_g` is the group lasso regularization term
+ :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1`
+ where :math:`\mathcal{I}_c` are the index of samples from class c
+ in the source domain.
- a and b are source and target weights (sum to 1)
- The algorithm used for solving the problem is the generalised conditional gradient as proposed in [5]_ [7]_
+ The algorithm used for solving the problem is the generalised conditional
+ gradient as proposed in [5]_ [7]_
Parameters
@@ -72,8 +89,13 @@ def sinkhorn_lpl1_mm(a,labels_a, b, M, reg, eta=0.1,numItermax = 10,numInnerIter
References
----------
- .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
- .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567.
+ .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
+ "Optimal Transport for Domain Adaptation," in IEEE
+ Transactions on Pattern Analysis and Machine Intelligence ,
+ vol.PP, no.99, pp.1-1
+ .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015).
+ Generalized conditional gradient: analysis of convergence
+ and applications. arXiv preprint arXiv:1510.06567.
See Also
--------
@@ -94,7 +116,7 @@ def sinkhorn_lpl1_mm(a,labels_a, b, M, reg, eta=0.1,numItermax = 10,numInnerIter
W = np.zeros(M.shape)
for cpt in range(numItermax):
- Mreg = M + eta*W
+ Mreg = M + eta * W
transp = sinkhorn(a, b, Mreg, reg, numItermax=numInnerItermax,
stopThr=stopInnerThr)
# the transport has been computed. Check if classes are really
@@ -102,19 +124,24 @@ def sinkhorn_lpl1_mm(a,labels_a, b, M, reg, eta=0.1,numItermax = 10,numInnerIter
W = np.ones(M.shape)
for (i, c) in enumerate(classes):
majs = np.sum(transp[indices_labels[i]], axis=0)
- majs = p*((majs+epsilon)**(p-1))
+ majs = p * ((majs + epsilon)**(p - 1))
W[indices_labels[i]] = majs
return transp
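
A usage sketch with random labeled source data (illustrative values; M normalized for numerical comfort):

    import numpy as np
    import ot

    ns, nt = 20, 20
    xs = np.random.randn(ns, 2)
    xt = np.random.randn(nt, 2)
    ys = np.repeat([0, 1], ns // 2)   # source class labels

    a, b = ot.unif(ns), ot.unif(nt)
    M = ot.dist(xs, xt)
    M /= M.max()

    # entropic OT with the nonconvex group-lasso penalty on source classes
    G = ot.da.sinkhorn_lpl1_mm(a, ys, b, M, reg=1e-1, eta=1e-1)
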
-def sinkhorn_l1l2_gl(a,labels_a, b, M, reg, eta=0.1,numItermax = 10,numInnerItermax = 200,stopInnerThr=1e-9,verbose=False,log=False):
+
+def sinkhorn_l1l2_gl(a, labels_a, b, M, reg, eta=0.1, numItermax=10,
+ numInnerItermax=200, stopInnerThr=1e-9, verbose=False,
+ log=False):
"""
- Solve the entropic regularization optimal transport problem with group lasso regularization
+ Solve the entropic regularization optimal transport problem with group
+ lasso regularization
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+ \eta \Omega_g(\gamma)
+ \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+
+ \eta \Omega_g(\gamma)
s.t. \gamma 1 = a
@@ -124,11 +151,16 @@ def sinkhorn_l1l2_gl(a,labels_a, b, M, reg, eta=0.1,numItermax = 10,numInnerIter
where :
- M is the (ns,nt) metric cost matrix
- - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
- - :math:`\Omega_g` is the group lasso regulaization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^2` where :math:`\mathcal{I}_c` are the index of samples from class c in the source domain.
+ - :math:`\Omega_e` is the entropic regularization term
+ :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+ - :math:`\Omega_g` is the group lasso regularization term
+ :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^2`
+ where :math:`\mathcal{I}_c` are the index of samples from class
+ c in the source domain.
- a and b are source and target weights (sum to 1)
- The algorithm used for solving the problem is the generalised conditional gradient as proposed in [5]_ [7]_
+ The algorithm used for solving the problem is the generalised conditional
+ gradient as proposed in [5]_ [7]_
Parameters
@@ -168,46 +200,54 @@ def sinkhorn_l1l2_gl(a,labels_a, b, M, reg, eta=0.1,numItermax = 10,numInnerIter
References
----------
- .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
- .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567.
+ .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
+ "Optimal Transport for Domain Adaptation," in IEEE Transactions
+ on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
+ .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015).
+ Generalized conditional gradient: analysis of convergence and
+ applications. arXiv preprint arXiv:1510.06567.
See Also
--------
ot.optim.gcg : Generalized conditional gradient for OT problems
"""
- lstlab=np.unique(labels_a)
+ lstlab = np.unique(labels_a)
def f(G):
- res=0
+ res = 0
for i in range(G.shape[1]):
for lab in lstlab:
- temp=G[labels_a==lab,i]
- res+=np.linalg.norm(temp)
+ temp = G[labels_a == lab, i]
+ res += np.linalg.norm(temp)
return res
def df(G):
- W=np.zeros(G.shape)
+ W = np.zeros(G.shape)
for i in range(G.shape[1]):
for lab in lstlab:
- temp=G[labels_a==lab,i]
- n=np.linalg.norm(temp)
+ temp = G[labels_a == lab, i]
+ n = np.linalg.norm(temp)
if n:
- W[labels_a==lab,i]=temp/n
+ W[labels_a == lab, i] = temp / n
return W
-
- return gcg(a,b,M,reg,eta,f,df,G0=None,numItermax = numItermax,numInnerItermax=numInnerItermax, stopThr=stopInnerThr,verbose=verbose,log=log)
+ return gcg(a, b, M, reg, eta, f, df, G0=None, numItermax=numItermax,
+ numInnerItermax=numInnerItermax, stopThr=stopInnerThr,
+ verbose=verbose, log=log)
-
-def joint_OT_mapping_linear(xs,xt,mu=1,eta=0.001,bias=False,verbose=False,verbose2=False,numItermax = 100,numInnerItermax = 10,stopInnerThr=1e-6,stopThr=1e-5,log=False,**kwargs):
+def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False,
+ verbose2=False, numItermax=100, numInnerItermax=10,
+ stopInnerThr=1e-6, stopThr=1e-5, log=False,
+ **kwargs):
"""Joint OT and linear mapping estimation as proposed in [8]
The function solves the following optimization problem:
.. math::
- \min_{\gamma,L}\quad \|L(X_s) -n_s\gamma X_t\|^2_F + \mu<\gamma,M>_F + \eta \|L -I\|^2_F
+ \min_{\gamma,L}\quad \|L(X_s) -n_s\gamma X_t\|^2_F +
+ \mu<\gamma,M>_F + \eta \|L -I\|^2_F
s.t. \gamma 1 = a
@@ -216,8 +256,10 @@ def joint_OT_mapping_linear(xs,xt,mu=1,eta=0.001,bias=False,verbose=False,verbos
\gamma\geq 0
where :
- - M is the (ns,nt) squared euclidean cost matrix between samples in Xs and Xt (scaled by ns)
- - :math:`L` is a dxd linear operator that approximates the barycentric mapping
+ - M is the (ns,nt) squared euclidean cost matrix between samples in
+ Xs and Xt (scaled by ns)
+ - :math:`L` is a dxd linear operator that approximates the barycentric
+ mapping
- :math:`I` is the identity matrix (neutral linear mapping)
- a and b are uniform source and target weights
@@ -272,7 +314,9 @@ def joint_OT_mapping_linear(xs,xt,mu=1,eta=0.001,bias=False,verbose=False,verbos
References
----------
- .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard, "Mapping estimation for discrete optimal transport", Neural Information Processing Systems (NIPS), 2016.
+ .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard,
+ "Mapping estimation for discrete optimal transport",
+ Neural Information Processing Systems (NIPS), 2016.
See Also
--------
@@ -281,103 +325,116 @@ def joint_OT_mapping_linear(xs,xt,mu=1,eta=0.001,bias=False,verbose=False,verbos
"""
- ns,nt,d=xs.shape[0],xt.shape[0],xt.shape[1]
+ ns, nt, d = xs.shape[0], xt.shape[0], xt.shape[1]
if bias:
- xs1=np.hstack((xs,np.ones((ns,1))))
- xstxs=xs1.T.dot(xs1)
- I=np.eye(d+1)
- I[-1]=0
- I0=I[:,:-1]
- sel=lambda x : x[:-1,:]
+ xs1 = np.hstack((xs, np.ones((ns, 1))))
+ xstxs = xs1.T.dot(xs1)
+ I = np.eye(d + 1)
+ I[-1] = 0
+ I0 = I[:, :-1]
+
+ def sel(x):
+ return x[:-1, :]
else:
- xs1=xs
- xstxs=xs1.T.dot(xs1)
- I=np.eye(d)
- I0=I
- sel=lambda x : x
+ xs1 = xs
+ xstxs = xs1.T.dot(xs1)
+ I = np.eye(d)
+ I0 = I
+
+ def sel(x):
+ return x
if log:
- log={'err':[]}
+ log = {'err': []}
- a,b=unif(ns),unif(nt)
- M=dist(xs,xt)*ns
- G=emd(a,b,M)
+ a, b = unif(ns), unif(nt)
+ M = dist(xs, xt) * ns
+ G = emd(a, b, M)
- vloss=[]
+ vloss = []
- def loss(L,G):
+ def loss(L, G):
"""Compute full loss"""
- return np.sum((xs1.dot(L)-ns*G.dot(xt))**2)+mu*np.sum(G*M)+eta*np.sum(sel(L-I0)**2)
+ return np.sum((xs1.dot(L) - ns * G.dot(xt))**2) + mu * np.sum(G * M) + eta * np.sum(sel(L - I0)**2)
def solve_L(G):
""" solve L problem with fixed G (least square)"""
- xst=ns*G.dot(xt)
- return np.linalg.solve(xstxs+eta*I,xs1.T.dot(xst)+eta*I0)
+ xst = ns * G.dot(xt)
+ return np.linalg.solve(xstxs + eta * I, xs1.T.dot(xst) + eta * I0)
- def solve_G(L,G0):
+ def solve_G(L, G0):
"""Update G with CG algorithm"""
- xsi=xs1.dot(L)
+ xsi = xs1.dot(L)
+
def f(G):
- return np.sum((xsi-ns*G.dot(xt))**2)
+ return np.sum((xsi - ns * G.dot(xt))**2)
+
def df(G):
- return -2*ns*(xsi-ns*G.dot(xt)).dot(xt.T)
- G=cg(a,b,M,1.0/mu,f,df,G0=G0,numItermax=numInnerItermax,stopThr=stopInnerThr)
+ return -2 * ns * (xsi - ns * G.dot(xt)).dot(xt.T)
+ G = cg(a, b, M, 1.0 / mu, f, df, G0=G0,
+ numItermax=numInnerItermax, stopThr=stopInnerThr)
return G
+ L = solve_L(G)
- L=solve_L(G)
-
- vloss.append(loss(L,G))
+ vloss.append(loss(L, G))
if verbose:
- print('{:5s}|{:12s}|{:8s}'.format('It.','Loss','Delta loss')+'\n'+'-'*32)
- print('{:5d}|{:8e}|{:8e}'.format(0,vloss[-1],0))
-
+ print('{:5s}|{:12s}|{:8s}'.format(
+ 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32)
+ print('{:5d}|{:8e}|{:8e}'.format(0, vloss[-1], 0))
# init loop
- if numItermax>0:
- loop=1
+ if numItermax > 0:
+ loop = 1
else:
- loop=0
- it=0
+ loop = 0
+ it = 0
while loop:
- it+=1
+ it += 1
# update G
- G=solve_G(L,G)
+ G = solve_G(L, G)
- #update L
- L=solve_L(G)
+ # update L
+ L = solve_L(G)
- vloss.append(loss(L,G))
+ vloss.append(loss(L, G))
- if it>=numItermax:
- loop=0
+ if it >= numItermax:
+ loop = 0
- if abs(vloss[-1]-vloss[-2])/abs(vloss[-2])<stopThr:
- loop=0
+ if abs(vloss[-1] - vloss[-2]) / abs(vloss[-2]) < stopThr:
+ loop = 0
if verbose:
- if it%20==0:
- print('{:5s}|{:12s}|{:8s}'.format('It.','Loss','Delta loss')+'\n'+'-'*32)
- print('{:5d}|{:8e}|{:8e}'.format(it,vloss[-1],(vloss[-1]-vloss[-2])/abs(vloss[-2])))
+ if it % 20 == 0:
+ print('{:5s}|{:12s}|{:8s}'.format(
+ 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32)
+ print('{:5d}|{:8e}|{:8e}'.format(
+ it, vloss[-1], (vloss[-1] - vloss[-2]) / abs(vloss[-2])))
if log:
- log['loss']=vloss
- return G,L,log
+ log['loss'] = vloss
+ return G, L, log
else:
- return G,L
+ return G, L
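
A usage sketch for the alternating solver (random data; with bias=True the returned L is a (d+1) x d matrix that expects a constant column appended, as in OTDA_mapping_linear.predict below):

    import numpy as np
    import ot

    ns, nt, d = 30, 30, 2
    xs = np.random.randn(ns, d)
    xt = np.random.randn(nt, d) + 2.

    G, L = ot.da.joint_OT_mapping_linear(xs, xt, mu=1, eta=1e-3, bias=True)

    # map new source points: append the bias column, then apply L
    xnew = np.random.randn(5, d)
    xmapped = np.hstack((xnew, np.ones((5, 1)))).dot(L)
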
-def joint_OT_mapping_kernel(xs,xt,mu=1,eta=0.001,kerneltype='gaussian',sigma=1,bias=False,verbose=False,verbose2=False,numItermax = 100,numInnerItermax = 10,stopInnerThr=1e-6,stopThr=1e-5,log=False,**kwargs):
+def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian',
+ sigma=1, bias=False, verbose=False, verbose2=False,
+ numItermax=100, numInnerItermax=10,
+ stopInnerThr=1e-6, stopThr=1e-5, log=False,
+ **kwargs):
"""Joint OT and nonlinear mapping estimation with kernels as proposed in [8]
The function solves the following optimization problem:
.. math::
- \min_{\gamma,L\in\mathcal{H}}\quad \|L(X_s) -n_s\gamma X_t\|^2_F + \mu<\gamma,M>_F + \eta \|L\|^2_\mathcal{H}
+ \min_{\gamma,L\in\mathcal{H}}\quad \|L(X_s) -
+ n_s\gamma X_t\|^2_F + \mu<\gamma,M>_F + \eta \|L\|^2_\mathcal{H}
s.t. \gamma 1 = a
@@ -386,8 +443,10 @@ def joint_OT_mapping_kernel(xs,xt,mu=1,eta=0.001,kerneltype='gaussian',sigma=1,b
\gamma\geq 0
where :
- - M is the (ns,nt) squared euclidean cost matrix between samples in Xs and Xt (scaled by ns)
- - :math:`L` is a ns x d linear operator on a kernel matrix that approximates the barycentric mapping
+ - M is the (ns,nt) squared euclidean cost matrix between samples in
+ Xs and Xt (scaled by ns)
+ - :math:`L` is a ns x d linear operator on a kernel matrix that
+ approximates the barycentric mapping
- a and b are uniform source and target weights
The problem consists of solving jointly an optimal transport matrix
@@ -445,7 +504,9 @@ def joint_OT_mapping_kernel(xs,xt,mu=1,eta=0.001,kerneltype='gaussian',sigma=1,b
References
----------
- .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard, "Mapping estimation for discrete optimal transport", Neural Information Processing Systems (NIPS), 2016.
+ .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard,
+ "Mapping estimation for discrete optimal transport",
+ Neural Information Processing Systems (NIPS), 2016.
See Also
--------
@@ -454,161 +515,170 @@ def joint_OT_mapping_kernel(xs,xt,mu=1,eta=0.001,kerneltype='gaussian',sigma=1,b
"""
- ns,nt=xs.shape[0],xt.shape[0]
+ ns, nt = xs.shape[0], xt.shape[0]
- K=kernel(xs,xs,method=kerneltype,sigma=sigma)
+ K = kernel(xs, xs, method=kerneltype, sigma=sigma)
if bias:
- K1=np.hstack((K,np.ones((ns,1))))
- I=np.eye(ns+1)
- I[-1]=0
- Kp=np.eye(ns+1)
- Kp[:ns,:ns]=K
+ K1 = np.hstack((K, np.ones((ns, 1))))
+ I = np.eye(ns + 1)
+ I[-1] = 0
+ Kp = np.eye(ns + 1)
+ Kp[:ns, :ns] = K
# ls regu
- #K0 = K1.T.dot(K1)+eta*I
- #Kreg=I
+ # K0 = K1.T.dot(K1)+eta*I
+ # Kreg=I
# RKHS regul
- K0 = K1.T.dot(K1)+eta*Kp
- Kreg=Kp
+ K0 = K1.T.dot(K1) + eta * Kp
+ Kreg = Kp
else:
- K1=K
- I=np.eye(ns)
+ K1 = K
+ I = np.eye(ns)
# ls regul
- #K0 = K1.T.dot(K1)+eta*I
- #Kreg=I
+ # K0 = K1.T.dot(K1)+eta*I
+ # Kreg=I
# proper kernel ridge
- K0=K+eta*I
- Kreg=K
-
-
-
+ K0 = K + eta * I
+ Kreg = K
if log:
- log={'err':[]}
+ log = {'err': []}
- a,b=unif(ns),unif(nt)
- M=dist(xs,xt)*ns
- G=emd(a,b,M)
+ a, b = unif(ns), unif(nt)
+ M = dist(xs, xt) * ns
+ G = emd(a, b, M)
- vloss=[]
+ vloss = []
- def loss(L,G):
+ def loss(L, G):
"""Compute full loss"""
- return np.sum((K1.dot(L)-ns*G.dot(xt))**2)+mu*np.sum(G*M)+eta*np.trace(L.T.dot(Kreg).dot(L))
+ return np.sum((K1.dot(L) - ns * G.dot(xt))**2) + mu * np.sum(G * M) + eta * np.trace(L.T.dot(Kreg).dot(L))
def solve_L_nobias(G):
""" solve L problem with fixed G (least square)"""
- xst=ns*G.dot(xt)
- return np.linalg.solve(K0,xst)
+ xst = ns * G.dot(xt)
+ return np.linalg.solve(K0, xst)
def solve_L_bias(G):
""" solve L problem with fixed G (least square)"""
- xst=ns*G.dot(xt)
- return np.linalg.solve(K0,K1.T.dot(xst))
+ xst = ns * G.dot(xt)
+ return np.linalg.solve(K0, K1.T.dot(xst))
- def solve_G(L,G0):
+ def solve_G(L, G0):
"""Update G with CG algorithm"""
- xsi=K1.dot(L)
+ xsi = K1.dot(L)
+
def f(G):
- return np.sum((xsi-ns*G.dot(xt))**2)
+ return np.sum((xsi - ns * G.dot(xt))**2)
+
def df(G):
- return -2*ns*(xsi-ns*G.dot(xt)).dot(xt.T)
- G=cg(a,b,M,1.0/mu,f,df,G0=G0,numItermax=numInnerItermax,stopThr=stopInnerThr)
+ return -2 * ns * (xsi - ns * G.dot(xt)).dot(xt.T)
+ G = cg(a, b, M, 1.0 / mu, f, df, G0=G0,
+ numItermax=numInnerItermax, stopThr=stopInnerThr)
return G
if bias:
- solve_L=solve_L_bias
+ solve_L = solve_L_bias
else:
- solve_L=solve_L_nobias
+ solve_L = solve_L_nobias
- L=solve_L(G)
+ L = solve_L(G)
- vloss.append(loss(L,G))
+ vloss.append(loss(L, G))
if verbose:
- print('{:5s}|{:12s}|{:8s}'.format('It.','Loss','Delta loss')+'\n'+'-'*32)
- print('{:5d}|{:8e}|{:8e}'.format(0,vloss[-1],0))
-
+ print('{:5s}|{:12s}|{:8s}'.format(
+ 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32)
+ print('{:5d}|{:8e}|{:8e}'.format(0, vloss[-1], 0))
# init loop
- if numItermax>0:
- loop=1
+ if numItermax > 0:
+ loop = 1
else:
- loop=0
- it=0
+ loop = 0
+ it = 0
while loop:
- it+=1
+ it += 1
# update G
- G=solve_G(L,G)
+ G = solve_G(L, G)
- #update L
- L=solve_L(G)
+ # update L
+ L = solve_L(G)
- vloss.append(loss(L,G))
+ vloss.append(loss(L, G))
- if it>=numItermax:
- loop=0
+ if it >= numItermax:
+ loop = 0
- if abs(vloss[-1]-vloss[-2])/abs(vloss[-2])<stopThr:
- loop=0
+ if abs(vloss[-1] - vloss[-2]) / abs(vloss[-2]) < stopThr:
+ loop = 0
if verbose:
- if it%20==0:
- print('{:5s}|{:12s}|{:8s}'.format('It.','Loss','Delta loss')+'\n'+'-'*32)
- print('{:5d}|{:8e}|{:8e}'.format(it,vloss[-1],(vloss[-1]-vloss[-2])/abs(vloss[-2])))
+ if it % 20 == 0:
+ print('{:5s}|{:12s}|{:8s}'.format(
+ 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32)
+ print('{:5d}|{:8e}|{:8e}'.format(
+ it, vloss[-1], (vloss[-1] - vloss[-2]) / abs(vloss[-2])))
if log:
- log['loss']=vloss
- return G,L,log
+ log['loss'] = vloss
+ return G, L, log
else:
- return G,L
+ return G, L
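
The kernel variant maps out-of-sample points through the kernel evaluated against the source samples, the same path OTDA_mapping_kernel.predict takes below; a sketch without bias (random data, illustrative parameters):

    import numpy as np
    import ot
    from ot.utils import kernel

    ns, nt, d = 30, 30, 2
    xs = np.random.randn(ns, d)
    xt = np.random.randn(nt, d) + 2.

    G, L = ot.da.joint_OT_mapping_kernel(xs, xt, mu=1, eta=1e-3,
                                         kerneltype='gaussian', sigma=1)

    # out-of-sample mapping: kernel against the source points, then L
    xnew = np.random.randn(5, d)
    K = kernel(xnew, xs, method='gaussian', sigma=1)
    xmapped = K.dot(L)
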
+@deprecated("The class OTDA is deprecated in 0.3.1 and will be "
+ "removed in 0.5"
+ "\n\tfor standard transport use class EMDTransport instead.")
class OTDA(object):
+
"""Class for domain adaptation with optimal transport as proposed in [5]
References
----------
- .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
+ .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
+ "Optimal Transport for Domain Adaptation," in IEEE Transactions on
+ Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
"""
- def __init__(self,metric='sqeuclidean'):
+ def __init__(self, metric='sqeuclidean', norm=None):
""" Class initialization"""
- self.xs=0
- self.xt=0
- self.G=0
- self.metric=metric
- self.computed=False
-
-
- def fit(self,xs,xt,ws=None,wt=None,norm=None):
- """ Fit domain adaptation between samples is xs and xt (with optional weights)"""
- self.xs=xs
- self.xt=xt
+ self.xs = 0
+ self.xt = 0
+ self.G = 0
+ self.metric = metric
+ self.norm = norm
+ self.computed = False
+
+ def fit(self, xs, xt, ws=None, wt=None, max_iter=100000):
+ """Fit domain adaptation between samples is xs and xt
+ (with optional weights)"""
+ self.xs = xs
+ self.xt = xt
if wt is None:
- wt=unif(xt.shape[0])
+ wt = unif(xt.shape[0])
if ws is None:
- ws=unif(xs.shape[0])
+ ws = unif(xs.shape[0])
- self.ws=ws
- self.wt=wt
+ self.ws = ws
+ self.wt = wt
- self.M=dist(xs,xt,metric=self.metric)
- self.normalizeM(norm)
- self.G=emd(ws,wt,self.M)
- self.computed=True
+ self.M = dist(xs, xt, metric=self.metric)
+ self.M = cost_normalization(self.M, self.norm)
+ self.G = emd(ws, wt, self.M, max_iter)
+ self.computed = True
- def interp(self,direction=1):
+ def interp(self, direction=1):
"""Barycentric interpolation for the source (1) or target (-1) samples
This Barycentric interpolation solves for each source (resp target)
@@ -623,28 +693,28 @@ class OTDA(object):
metric could be used in the future.
"""
- if direction>0: # >0 then source to target
- G=self.G
- w=self.ws.reshape((self.xs.shape[0],1))
- x=self.xt
+ if direction > 0: # >0 then source to target
+ G = self.G
+ w = self.ws.reshape((self.xs.shape[0], 1))
+ x = self.xt
else:
- G=self.G.T
- w=self.wt.reshape((self.xt.shape[0],1))
- x=self.xs
+ G = self.G.T
+ w = self.wt.reshape((self.xt.shape[0], 1))
+ x = self.xs
if self.computed:
- if self.metric=='sqeuclidean':
- return np.dot(G/w,x) # weighted mean
+ if self.metric == 'sqeuclidean':
+ return np.dot(G / w, x) # weighted mean
else:
- print("Warning, metric not handled yet, using weighted average")
- return np.dot(G/w,x) # weighted mean
+ print(
+ "Warning, metric not handled yet, using weighted average")
+ return np.dot(G / w, x) # weighted mean
return None
else:
print("Warning, model not fitted yet, returning None")
return None
-
- def predict(self,x,direction=1):
+ def predict(self, x, direction=1):
""" Out of sample mapping using the formulation from [6]
For each sample x to map, it finds the nearest source sample xs and
@@ -654,183 +724,1019 @@ class OTDA(object):
References
----------
- .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882.
+ .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014).
+ Regularized discrete optimal transport. SIAM Journal on Imaging
+ Sciences, 7(3), 1853-1882.
"""
- if direction>0: # >0 then source to target
- xf=self.xt
- x0=self.xs
+ if direction > 0: # >0 then source to target
+ xf = self.xt
+ x0 = self.xs
else:
- xf=self.xs
- x0=self.xt
-
- D0=dist(x,x0) # dist netween new samples an source
- idx=np.argmin(D0,1) # closest one
- xf=self.interp(direction)# interp the source samples
- return xf[idx,:]+x-x0[idx,:] # aply the delta to the interpolation
-
- def normalizeM(self, norm):
- """ Apply normalization to the loss matrix
-
-
- Parameters
- ----------
- norm : str
- type of normalization from 'median','max','log','loglog'
-
- """
-
- if norm == "median":
- self.M /= float(np.median(self.M))
- elif norm == "max":
- self.M /= float(np.max(self.M))
- elif norm == "log":
- self.M = np.log(1 + self.M)
- elif norm == "loglog":
- self.M = np.log(1 + np.log(1 + self.M))
-
+ xf = self.xs
+ x0 = self.xt
+ D0 = dist(x, x0) # dist between new samples and source
+ idx = np.argmin(D0, 1) # closest one
+ xf = self.interp(direction) # interp the source samples
+ # apply the delta to the interpolation
+ return xf[idx, :] + x - x0[idx, :]
+
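The deprecation messages point users at the new estimator API added by this changeset; old and replacement calls side by side (EMDTransport is defined further down in da.py, outside this excerpt, and named in the warning above):

    import numpy as np
    import ot

    xs = np.random.randn(20, 2)
    xt = np.random.randn(20, 2) + 2.

    # old API, emits a DeprecationWarning as of 0.3.1
    da = ot.da.OTDA(metric='sqeuclidean')
    da.fit(xs, xt)
    xs_mapped = da.interp()

    # replacement named in the deprecation message
    emd_t = ot.da.EMDTransport()
    emd_t.fit(Xs=xs, Xt=xt)
    xs_mapped_new = emd_t.transform(Xs=xs)
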
+@deprecated("The class OTDA_sinkhorn is deprecated in 0.3.1 and will be"
+ " removed in 0.5 \nUse class SinkhornTransport instead.")
class OTDA_sinkhorn(OTDA):
- """Class for domain adaptation with optimal transport with entropic regularization"""
- def fit(self,xs,xt,reg=1,ws=None,wt=None,norm=None,**kwargs):
- """ Fit regularized domain adaptation between samples is xs and xt (with optional weights)"""
- self.xs=xs
- self.xt=xt
+ """Class for domain adaptation with optimal transport with entropic
+ regularization
+
+
+ """
+
+ def fit(self, xs, xt, reg=1, ws=None, wt=None, **kwargs):
+ """Fit regularized domain adaptation between samples is xs and xt
+ (with optional weights)"""
+ self.xs = xs
+ self.xt = xt
if wt is None:
- wt=unif(xt.shape[0])
+ wt = unif(xt.shape[0])
if ws is None:
- ws=unif(xs.shape[0])
+ ws = unif(xs.shape[0])
- self.ws=ws
- self.wt=wt
+ self.ws = ws
+ self.wt = wt
- self.M=dist(xs,xt,metric=self.metric)
- self.normalizeM(norm)
- self.G=sinkhorn(ws,wt,self.M,reg,**kwargs)
- self.computed=True
+ self.M = dist(xs, xt, metric=self.metric)
+ self.M = cost_normalization(self.M, self.norm)
+ self.G = sinkhorn(ws, wt, self.M, reg, **kwargs)
+ self.computed = True
+@deprecated("The class OTDA_lpl1 is deprecated in 0.3.1 and will be"
+ " removed in 0.5 \nUse class SinkhornLpl1Transport instead.")
class OTDA_lpl1(OTDA):
- """Class for domain adaptation with optimal transport with entropic and group regularization"""
+ """Class for domain adaptation with optimal transport with entropic and
+ group regularization"""
- def fit(self,xs,ys,xt,reg=1,eta=1,ws=None,wt=None,norm=None,**kwargs):
- """ Fit regularized domain adaptation between samples is xs and xt (with optional weights), See ot.da.sinkhorn_lpl1_mm for fit parameters"""
- self.xs=xs
- self.xt=xt
+ def fit(self, xs, ys, xt, reg=1, eta=1, ws=None, wt=None, **kwargs):
+ """Fit regularized domain adaptation between samples is xs and xt
+ (with optional weights), See ot.da.sinkhorn_lpl1_mm for fit
+ parameters"""
+ self.xs = xs
+ self.xt = xt
if wt is None:
- wt=unif(xt.shape[0])
+ wt = unif(xt.shape[0])
if ws is None:
- ws=unif(xs.shape[0])
+ ws = unif(xs.shape[0])
+
+ self.ws = ws
+ self.wt = wt
- self.ws=ws
- self.wt=wt
+ self.M = dist(xs, xt, metric=self.metric)
+ self.M = cost_normalization(self.M, self.norm)
+ self.G = sinkhorn_lpl1_mm(ws, ys, wt, self.M, reg, eta, **kwargs)
+ self.computed = True
- self.M=dist(xs,xt,metric=self.metric)
- self.normalizeM(norm)
- self.G=sinkhorn_lpl1_mm(ws,ys,wt,self.M,reg,eta,**kwargs)
- self.computed=True
+@deprecated("The class OTDA_l1L2 is deprecated in 0.3.1 and will be"
+ " removed in 0.5 \nUse class SinkhornL1l2Transport instead.")
class OTDA_l1l2(OTDA):
- """Class for domain adaptation with optimal transport with entropic and group lasso regularization"""
+ """Class for domain adaptation with optimal transport with entropic
+ and group lasso regularization"""
- def fit(self,xs,ys,xt,reg=1,eta=1,ws=None,wt=None,norm=None,**kwargs):
- """ Fit regularized domain adaptation between samples is xs and xt (with optional weights), See ot.da.sinkhorn_lpl1_gl for fit parameters"""
- self.xs=xs
- self.xt=xt
+ def fit(self, xs, ys, xt, reg=1, eta=1, ws=None, wt=None, **kwargs):
+ """Fit regularized domain adaptation between samples is xs and xt
+ (with optional weights), See ot.da.sinkhorn_lpl1_gl for fit
+ parameters"""
+ self.xs = xs
+ self.xt = xt
if wt is None:
- wt=unif(xt.shape[0])
+ wt = unif(xt.shape[0])
if ws is None:
- ws=unif(xs.shape[0])
+ ws = unif(xs.shape[0])
+
+ self.ws = ws
+ self.wt = wt
- self.ws=ws
- self.wt=wt
+ self.M = dist(xs, xt, metric=self.metric)
+ self.M = cost_normalization(self.M, self.norm)
+ self.G = sinkhorn_l1l2_gl(ws, ys, wt, self.M, reg, eta, **kwargs)
+ self.computed = True
- self.M=dist(xs,xt,metric=self.metric)
- self.normalizeM(norm)
- self.G=sinkhorn_l1l2_gl(ws,ys,wt,self.M,reg,eta,**kwargs)
- self.computed=True
+@deprecated("The class OTDA_mapping_linear is deprecated in 0.3.1 and will be"
+ " removed in 0.5 \nUse class MappingTransport instead.")
class OTDA_mapping_linear(OTDA):
- """Class for optimal transport with joint linear mapping estimation as in [8]"""
+ """Class for optimal transport with joint linear mapping estimation as in
+ [8]
+ """
def __init__(self):
""" Class initialization"""
+ self.xs = 0
+ self.xt = 0
+ self.G = 0
+ self.L = 0
+ self.bias = False
+ self.computed = False
+ self.metric = 'sqeuclidean'
- self.xs=0
- self.xt=0
- self.G=0
- self.L=0
- self.bias=False
- self.computed=False
- self.metric='sqeuclidean'
-
- def fit(self,xs,xt,mu=1,eta=1,bias=False,**kwargs):
+ def fit(self, xs, xt, mu=1, eta=1, bias=False, **kwargs):
""" Fit domain adaptation between samples is xs and xt (with optional
weights)"""
- self.xs=xs
- self.xt=xt
- self.bias=bias
+ self.xs = xs
+ self.xt = xt
+ self.bias = bias
+ self.ws = unif(xs.shape[0])
+ self.wt = unif(xt.shape[0])
- self.ws=unif(xs.shape[0])
- self.wt=unif(xt.shape[0])
-
- self.G,self.L=joint_OT_mapping_linear(xs,xt,mu=mu,eta=eta,bias=bias,**kwargs)
- self.computed=True
+ self.G, self.L = joint_OT_mapping_linear(
+ xs, xt, mu=mu, eta=eta, bias=bias, **kwargs)
+ self.computed = True
def mapping(self):
return lambda x: self.predict(x)
-
- def predict(self,x):
+ def predict(self, x):
""" Out of sample mapping estimated during the call to fit"""
if self.computed:
if self.bias:
- x=np.hstack((x,np.ones((x.shape[0],1))))
- return x.dot(self.L) # aply the delta to the interpolation
+ x = np.hstack((x, np.ones((x.shape[0], 1))))
+ return x.dot(self.L) # aply the delta to the interpolation
else:
print("Warning, model not fitted yet, returning None")
return None
-class OTDA_mapping_kernel(OTDA_mapping_linear):
- """Class for optimal transport with joint nonlinear mapping estimation as in [8]"""
+@deprecated("The class OTDA_mapping_kernel is deprecated in 0.3.1 and will be"
+ " removed in 0.5 \nUse class MappingTransport instead.")
+class OTDA_mapping_kernel(OTDA_mapping_linear):
+ """Class for optimal transport with joint nonlinear mapping
+ estimation as in [8]"""
- def fit(self,xs,xt,mu=1,eta=1,bias=False,kerneltype='gaussian',sigma=1,**kwargs):
+ def fit(self, xs, xt, mu=1, eta=1, bias=False, kerneltype='gaussian',
+ sigma=1, **kwargs):
""" Fit domain adaptation between samples is xs and xt """
- self.xs=xs
- self.xt=xt
- self.bias=bias
-
- self.ws=unif(xs.shape[0])
- self.wt=unif(xt.shape[0])
- self.kernel=kerneltype
- self.sigma=sigma
- self.kwargs=kwargs
-
+ self.xs = xs
+ self.xt = xt
+ self.bias = bias
- self.G,self.L=joint_OT_mapping_kernel(xs,xt,mu=mu,eta=eta,bias=bias,**kwargs)
- self.computed=True
+ self.ws = unif(xs.shape[0])
+ self.wt = unif(xt.shape[0])
+ self.kernel = kerneltype
+ self.sigma = sigma
+ self.kwargs = kwargs
+ self.G, self.L = joint_OT_mapping_kernel(
+ xs, xt, mu=mu, eta=eta, bias=bias, **kwargs)
+ self.computed = True
- def predict(self,x):
+ def predict(self, x):
""" Out of sample mapping estimated during the call to fit"""
if self.computed:
- K=kernel(x,self.xs,method=self.kernel,sigma=self.sigma,**self.kwargs)
+ K = kernel(
+ x, self.xs, method=self.kernel, sigma=self.sigma,
+ **self.kwargs)
if self.bias:
- K=np.hstack((K,np.ones((x.shape[0],1))))
+ K = np.hstack((K, np.ones((x.shape[0], 1))))
return K.dot(self.L)
else:
print("Warning, model not fitted yet, returning None")
- return None
\ No newline at end of file
+ return None
+
+
+def distribution_estimation_uniform(X):
+ """estimates a uniform distribution from an array of samples X
+
+ Parameters
+ ----------
+ X : array-like, shape (n_samples, n_features)
+ The array of samples
+
+ Returns
+ -------
+ mu : array-like, shape (n_samples,)
+ The uniform distribution estimated from X
+ """
+
+ return unif(X.shape[0])
+
+
+class BaseTransport(BaseEstimator):
+ """Base class for OTDA objects
+
+ Notes
+ -----
+ All estimators should specify all the parameters that can be set
+ at the class level in their ``__init__`` as explicit keyword
+ arguments (no ``*args`` or ``**kwargs``).
+
+ fit method should:
+ - estimate a cost matrix and store it in a `cost_` attribute
+ - estimate a coupling matrix and store it in a `coupling_`
+ attribute
+ - estimate distributions from source and target data and store them in
+ mu_s and mu_t attributes
+ - store Xs and Xt in attributes to be used later on in transform and
+ inverse_transform methods
+
+ transform method should always get as input a Xs parameter
+ inverse_transform method should always get as input a Xt parameter
+ """
+
+ def fit(self, Xs=None, ys=None, Xt=None, yt=None):
+ """Build a coupling matrix from source and target sets of samples
+ (Xs, ys) and (Xt, yt)
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
+
+ # check the necessary inputs parameters are here
+ if check_params(Xs=Xs, Xt=Xt):
+
+ # pairwise distance
+ self.cost_ = dist(Xs, Xt, metric=self.metric)
+ self.cost_ = cost_normalization(self.cost_, self.norm)
+
+ if (ys is not None) and (yt is not None):
+
+ if self.limit_max != np.infty:
+ self.limit_max = self.limit_max * np.max(self.cost_)
+
+ # assumes labeled source samples occupy the first rows
+ # and labeled target samples occupy the first columns
+ classes = np.unique(ys)
+ for c in classes:
+ idx_s = np.where((ys != c) & (ys != -1))
+ idx_t = np.where(yt == c)
+
+ # all the coefficients corresponding to a source sample
+ # and a target sample with different labels
+ # get an infinite cost
+ for j in idx_t[0]:
+ self.cost_[idx_s[0], j] = self.limit_max
+
+ # distribution estimation
+ self.mu_s = self.distribution_estimation(Xs)
+ self.mu_t = self.distribution_estimation(Xt)
+
+ # store arrays of samples
+ self.xs_ = Xs
+ self.xt_ = Xt
+
+ return self
+
+ def fit_transform(self, Xs=None, ys=None, Xt=None, yt=None):
+ """Build a coupling matrix from source and target sets of samples
+ (Xs, ys) and (Xt, yt) and transports source samples Xs onto target
+ ones Xt
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+
+ Returns
+ -------
+ transp_Xs : array-like, shape (n_source_samples, n_features)
+            The transported source samples.
+ """
+
+ return self.fit(Xs, ys, Xt, yt).transform(Xs, ys, Xt, yt)
+
+ def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128):
+ """Transports source samples Xs onto target ones Xt
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+ batch_size : int, optional (default=128)
+            The batch size for out of sample transform
+
+ Returns
+ -------
+ transp_Xs : array-like, shape (n_source_samples, n_features)
+            The transported source samples.
+ """
+
+ # check the necessary inputs parameters are here
+ if check_params(Xs=Xs):
+
+ if np.array_equal(self.xs_, Xs):
+
+ # perform standard barycentric mapping
+ transp = self.coupling_ / np.sum(self.coupling_, 1)[:, None]
+
+ # set nans to 0
+ transp[~ np.isfinite(transp)] = 0
+
+ # compute transported samples
+ transp_Xs = np.dot(transp, self.xt_)
+ else:
+ # perform out of sample mapping
+ indices = np.arange(Xs.shape[0])
+ batch_ind = [
+ indices[i:i + batch_size]
+ for i in range(0, len(indices), batch_size)]
+
+ transp_Xs = []
+ for bi in batch_ind:
+
+ # get the nearest neighbor in the source domain
+ D0 = dist(Xs[bi], self.xs_)
+ idx = np.argmin(D0, axis=1)
+
+ # transport the source samples
+ transp = self.coupling_ / np.sum(
+ self.coupling_, 1)[:, None]
+ transp[~ np.isfinite(transp)] = 0
+ transp_Xs_ = np.dot(transp, self.xt_)
+
+                    # define the transported points: barycentric image
+                    # of the nearest source neighbor, shifted by the
+                    # offset of the new sample ("ferradans" mapping)
+ transp_Xs_ = transp_Xs_[idx, :] + Xs[bi] - self.xs_[idx, :]
+
+ transp_Xs.append(transp_Xs_)
+
+ transp_Xs = np.concatenate(transp_Xs, axis=0)
+
+ return transp_Xs
+
+ def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None,
+ batch_size=128):
+ """Transports target samples Xt onto target samples Xs
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+ batch_size : int, optional (default=128)
+ The batch size for out of sample inverse transform
+
+ Returns
+ -------
+        transp_Xt : array-like, shape (n_target_samples, n_features)
+ The transported target samples.
+ """
+
+ # check the necessary inputs parameters are here
+ if check_params(Xt=Xt):
+
+ if np.array_equal(self.xt_, Xt):
+
+ # perform standard barycentric mapping
+ transp_ = self.coupling_.T / np.sum(self.coupling_, 0)[:, None]
+
+ # set nans to 0
+ transp_[~ np.isfinite(transp_)] = 0
+
+ # compute transported samples
+ transp_Xt = np.dot(transp_, self.xs_)
+ else:
+ # perform out of sample mapping
+ indices = np.arange(Xt.shape[0])
+ batch_ind = [
+ indices[i:i + batch_size]
+ for i in range(0, len(indices), batch_size)]
+
+ transp_Xt = []
+ for bi in batch_ind:
+
+ D0 = dist(Xt[bi], self.xt_)
+ idx = np.argmin(D0, axis=1)
+
+ # transport the target samples
+ transp_ = self.coupling_.T / np.sum(
+ self.coupling_, 0)[:, None]
+ transp_[~ np.isfinite(transp_)] = 0
+ transp_Xt_ = np.dot(transp_, self.xs_)
+
+                    # define the transported points: barycentric image
+                    # of the nearest target neighbor, shifted by the
+                    # offset of the new sample ("ferradans" mapping)
+ transp_Xt_ = transp_Xt_[idx, :] + Xt[bi] - self.xt_[idx, :]
+
+ transp_Xt.append(transp_Xt_)
+
+ transp_Xt = np.concatenate(transp_Xt, axis=0)
+
+ return transp_Xt
+
+
+class SinkhornTransport(BaseTransport):
+ """Domain Adapatation OT method based on Sinkhorn Algorithm
+
+ Parameters
+ ----------
+ reg_e : float, optional (default=1)
+ Entropic regularization parameter
+    max_iter : int, optional (default=1000)
+        The maximum number of iterations before stopping the optimization
+        algorithm if it has not converged
+    tol : float, optional (default=10e-9)
+        The precision required to stop the optimization algorithm.
+    out_of_sample_map : string, optional (default="ferradans")
+        The kind of out of sample mapping to apply to transport samples
+        from a domain into another one. With "ferradans", samples that
+        were not used to estimate the coupling are mapped through their
+        nearest neighbor among the fitted samples.
+    metric : string, optional (default="sqeuclidean")
+        The ground metric for the Wasserstein problem
+    norm : string, optional (default=None)
+        If given, normalize the ground metric to avoid numerical errors
+        that can occur with large metric values.
+    distribution_estimation : callable, optional (defaults to uniform)
+        The kind of distribution estimation to employ
+    verbose : bool, optional (default=False)
+        Controls the verbosity of the optimization algorithm
+    log : bool, optional (default=False)
+        Controls the logs of the optimization algorithm
+    limit_max: float, optional (default=np.infty)
+ Controls the semi supervised mode. Transport between labeled source
+ and target samples of different classes will exhibit an infinite cost
+
+ Attributes
+ ----------
+ coupling_ : array-like, shape (n_source_samples, n_target_samples)
+ The optimal coupling
+ log_ : dictionary
+        The dictionary of log, empty dict if parameter log is not True
+
+ References
+ ----------
+ .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
+ "Optimal Transport for Domain Adaptation," in IEEE Transactions
+       on Pattern Analysis and Machine Intelligence, vol. PP, no. 99, pp. 1-1
+ .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal
+ Transport, Advances in Neural Information Processing Systems (NIPS)
+ 26, 2013
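+
+    Examples
+    --------
+    A minimal usage sketch on random toy data (hypothetical, for
+    illustration only):
+
+    >>> import numpy as np
+    >>> from ot.da import SinkhornTransport
+    >>> Xs, Xt = np.random.randn(10, 2), np.random.randn(20, 2)
+    >>> ot_sinkhorn = SinkhornTransport(reg_e=1.)
+    >>> transp_Xs = ot_sinkhorn.fit(Xs=Xs, Xt=Xt).transform(Xs=Xs)
+    >>> transp_Xs.shape
+    (10, 2)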
+ """
+
+ def __init__(self, reg_e=1., max_iter=1000,
+ tol=10e-9, verbose=False, log=False,
+ metric="sqeuclidean", norm=None,
+ distribution_estimation=distribution_estimation_uniform,
+ out_of_sample_map='ferradans', limit_max=np.infty):
+
+ self.reg_e = reg_e
+ self.max_iter = max_iter
+ self.tol = tol
+ self.verbose = verbose
+ self.log = log
+ self.metric = metric
+ self.norm = norm
+ self.limit_max = limit_max
+ self.distribution_estimation = distribution_estimation
+ self.out_of_sample_map = out_of_sample_map
+
+ def fit(self, Xs=None, ys=None, Xt=None, yt=None):
+ """Build a coupling matrix from source and target sets of samples
+ (Xs, ys) and (Xt, yt)
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
+
+ super(SinkhornTransport, self).fit(Xs, ys, Xt, yt)
+
+ # coupling estimation
+ returned_ = sinkhorn(
+ a=self.mu_s, b=self.mu_t, M=self.cost_, reg=self.reg_e,
+ numItermax=self.max_iter, stopThr=self.tol,
+ verbose=self.verbose, log=self.log)
+
+ # deal with the value of log
+ if self.log:
+ self.coupling_, self.log_ = returned_
+ else:
+ self.coupling_ = returned_
+ self.log_ = dict()
+
+ return self
+
+
+class EMDTransport(BaseTransport):
+ """Domain Adapatation OT method based on Earth Mover's Distance
+
+ Parameters
+ ----------
+    out_of_sample_map : string, optional (default="ferradans")
+        The kind of out of sample mapping to apply to transport samples
+        from a domain into another one. With "ferradans", samples that
+        were not used to estimate the coupling are mapped through their
+        nearest neighbor among the fitted samples.
+    metric : string, optional (default="sqeuclidean")
+        The ground metric for the Wasserstein problem
+    norm : string, optional (default=None)
+        If given, normalize the ground metric to avoid numerical errors
+        that can occur with large metric values.
+    distribution_estimation : callable, optional (defaults to uniform)
+        The kind of distribution estimation to employ
+    limit_max: float, optional (default=10)
+        Controls the semi supervised mode. Transport between labeled source
+        and target samples of different classes will exhibit a cost equal to
+        limit_max times the maximum value of the cost matrix
+ max_iter : int, optional (default=100000)
+ The maximum number of iterations before stopping the optimization
+ algorithm if it has not converged.
+
+ Attributes
+ ----------
+ coupling_ : array-like, shape (n_source_samples, n_target_samples)
+ The optimal coupling
+
+ References
+ ----------
+ .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
+ "Optimal Transport for Domain Adaptation," in IEEE Transactions
+       on Pattern Analysis and Machine Intelligence, vol. PP, no. 99, pp. 1-1
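+
+    Examples
+    --------
+    A minimal usage sketch on random toy data (hypothetical, for
+    illustration only):
+
+    >>> import numpy as np
+    >>> from ot.da import EMDTransport
+    >>> Xs, Xt = np.random.randn(10, 2), np.random.randn(20, 2)
+    >>> ot_emd = EMDTransport().fit(Xs=Xs, Xt=Xt)
+    >>> ot_emd.coupling_.shape
+    (10, 20)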
+ """
+
+ def __init__(self, metric="sqeuclidean", norm=None,
+ distribution_estimation=distribution_estimation_uniform,
+ out_of_sample_map='ferradans', limit_max=10,
+ max_iter=100000):
+
+ self.metric = metric
+ self.norm = norm
+ self.limit_max = limit_max
+ self.distribution_estimation = distribution_estimation
+ self.out_of_sample_map = out_of_sample_map
+ self.max_iter = max_iter
+
+ def fit(self, Xs, ys=None, Xt=None, yt=None):
+ """Build a coupling matrix from source and target sets of samples
+ (Xs, ys) and (Xt, yt)
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
+
+ super(EMDTransport, self).fit(Xs, ys, Xt, yt)
+
+ # coupling estimation
+ self.coupling_ = emd(
+ a=self.mu_s, b=self.mu_t, M=self.cost_, numItermax=self.max_iter
+ )
+
+ return self
+
+
+class SinkhornLpl1Transport(BaseTransport):
+ """Domain Adapatation OT method based on sinkhorn algorithm +
+ LpL1 class regularization.
+
+ Parameters
+ ----------
+ reg_e : float, optional (default=1)
+ Entropic regularization parameter
+ reg_cl : float, optional (default=0.1)
+ Class regularization parameter
+    out_of_sample_map : string, optional (default="ferradans")
+        The kind of out of sample mapping to apply to transport samples
+        from a domain into another one. With "ferradans", samples that
+        were not used to estimate the coupling are mapped through their
+        nearest neighbor among the fitted samples.
+    metric : string, optional (default="sqeuclidean")
+        The ground metric for the Wasserstein problem
+    norm : string, optional (default=None)
+        If given, normalize the ground metric to avoid numerical errors
+        that can occur with large metric values.
+    distribution_estimation : callable, optional (defaults to uniform)
+        The kind of distribution estimation to employ
+    max_iter : int, optional (default=10)
+        The maximum number of iterations before stopping the optimization
+        algorithm if it has not converged
+    max_inner_iter : int, optional (default=200)
+        The number of iterations in the inner loop
+    tol : float, optional (default=10e-9)
+        Stop threshold on error (inner sinkhorn solver) (>0)
+    verbose : bool, optional (default=False)
+        Controls the verbosity of the optimization algorithm
+    limit_max: float, optional (default=np.infty)
+ Controls the semi supervised mode. Transport between labeled source
+ and target samples of different classes will exhibit an infinite cost
+
+ Attributes
+ ----------
+ coupling_ : array-like, shape (n_source_samples, n_target_samples)
+ The optimal coupling
+
+ References
+ ----------
+
+ .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
+ "Optimal Transport for Domain Adaptation," in IEEE
+           Transactions on Pattern Analysis and Machine Intelligence,
+           vol. PP, no. 99, pp. 1-1
+ .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015).
+ Generalized conditional gradient: analysis of convergence
+ and applications. arXiv preprint arXiv:1510.06567.
+
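+    Examples
+    --------
+    A minimal usage sketch; the LpL1 class regularizer needs the source
+    labels ys (random toy data, for illustration only):
+
+    >>> import numpy as np
+    >>> from ot.da import SinkhornLpl1Transport
+    >>> Xs, Xt = np.random.randn(10, 2), np.random.randn(20, 2)
+    >>> ys = np.random.randint(0, 2, 10)
+    >>> ot_lpl1 = SinkhornLpl1Transport(reg_e=1., reg_cl=0.1)
+    >>> ot_lpl1 = ot_lpl1.fit(Xs=Xs, ys=ys, Xt=Xt)
+    >>> ot_lpl1.coupling_.shape
+    (10, 20)
+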
+ """
+
+ def __init__(self, reg_e=1., reg_cl=0.1,
+ max_iter=10, max_inner_iter=200,
+ tol=10e-9, verbose=False,
+ metric="sqeuclidean", norm=None,
+ distribution_estimation=distribution_estimation_uniform,
+ out_of_sample_map='ferradans', limit_max=np.infty):
+
+ self.reg_e = reg_e
+ self.reg_cl = reg_cl
+ self.max_iter = max_iter
+ self.max_inner_iter = max_inner_iter
+ self.tol = tol
+ self.verbose = verbose
+ self.metric = metric
+ self.norm = norm
+ self.distribution_estimation = distribution_estimation
+ self.out_of_sample_map = out_of_sample_map
+ self.limit_max = limit_max
+
+ def fit(self, Xs, ys=None, Xt=None, yt=None):
+ """Build a coupling matrix from source and target sets of samples
+ (Xs, ys) and (Xt, yt)
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
+
+ # check the necessary inputs parameters are here
+ if check_params(Xs=Xs, Xt=Xt, ys=ys):
+
+ super(SinkhornLpl1Transport, self).fit(Xs, ys, Xt, yt)
+
+ self.coupling_ = sinkhorn_lpl1_mm(
+ a=self.mu_s, labels_a=ys, b=self.mu_t, M=self.cost_,
+ reg=self.reg_e, eta=self.reg_cl, numItermax=self.max_iter,
+ numInnerItermax=self.max_inner_iter, stopInnerThr=self.tol,
+ verbose=self.verbose)
+
+ return self
+
+
+class SinkhornL1l2Transport(BaseTransport):
+ """Domain Adapatation OT method based on sinkhorn algorithm +
+ l1l2 class regularization.
+
+ Parameters
+ ----------
+ reg_e : float, optional (default=1)
+ Entropic regularization parameter
+ reg_cl : float, optional (default=0.1)
+ Class regularization parameter
+    out_of_sample_map : string, optional (default="ferradans")
+        The kind of out of sample mapping to apply to transport samples
+        from a domain into another one. With "ferradans", samples that
+        were not used to estimate the coupling are mapped through their
+        nearest neighbor among the fitted samples.
+    metric : string, optional (default="sqeuclidean")
+        The ground metric for the Wasserstein problem
+    norm : string, optional (default=None)
+        If given, normalize the ground metric to avoid numerical errors
+        that can occur with large metric values.
+    distribution_estimation : callable, optional (defaults to uniform)
+        The kind of distribution estimation to employ
+    max_iter : int, optional (default=10)
+        The maximum number of iterations before stopping the optimization
+        algorithm if it has not converged
+    max_inner_iter : int, optional (default=200)
+        The number of iterations in the inner loop
+    tol : float, optional (default=10e-9)
+        Stop threshold on error (inner sinkhorn solver) (>0)
+    verbose : bool, optional (default=False)
+        Controls the verbosity of the optimization algorithm
+    log : bool, optional (default=False)
+        Controls the logs of the optimization algorithm
+    limit_max: float, optional (default=10)
+        Controls the semi supervised mode. Transport between labeled source
+        and target samples of different classes will exhibit a cost equal to
+        limit_max times the maximum value of the cost matrix
+
+ Attributes
+ ----------
+ coupling_ : array-like, shape (n_source_samples, n_target_samples)
+ The optimal coupling
+ log_ : dictionary
+        The dictionary of log, empty dict if parameter log is not True
+
+ References
+ ----------
+
+ .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
+ "Optimal Transport for Domain Adaptation," in IEEE
+           Transactions on Pattern Analysis and Machine Intelligence,
+           vol. PP, no. 99, pp. 1-1
+ .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015).
+ Generalized conditional gradient: analysis of convergence
+ and applications. arXiv preprint arXiv:1510.06567.
+
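+    Examples
+    --------
+    A minimal usage sketch; the l1l2 class regularizer needs the source
+    labels ys (random toy data, for illustration only):
+
+    >>> import numpy as np
+    >>> from ot.da import SinkhornL1l2Transport
+    >>> Xs, Xt = np.random.randn(10, 2), np.random.randn(20, 2)
+    >>> ys = np.random.randint(0, 2, 10)
+    >>> ot_l1l2 = SinkhornL1l2Transport(reg_e=1., reg_cl=0.1)
+    >>> ot_l1l2 = ot_l1l2.fit(Xs=Xs, ys=ys, Xt=Xt)
+    >>> ot_l1l2.coupling_.shape
+    (10, 20)
+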
+ """
+
+ def __init__(self, reg_e=1., reg_cl=0.1,
+ max_iter=10, max_inner_iter=200,
+ tol=10e-9, verbose=False, log=False,
+ metric="sqeuclidean", norm=None,
+ distribution_estimation=distribution_estimation_uniform,
+ out_of_sample_map='ferradans', limit_max=10):
+
+ self.reg_e = reg_e
+ self.reg_cl = reg_cl
+ self.max_iter = max_iter
+ self.max_inner_iter = max_inner_iter
+ self.tol = tol
+ self.verbose = verbose
+ self.log = log
+ self.metric = metric
+ self.norm = norm
+ self.distribution_estimation = distribution_estimation
+ self.out_of_sample_map = out_of_sample_map
+ self.limit_max = limit_max
+
+ def fit(self, Xs, ys=None, Xt=None, yt=None):
+ """Build a coupling matrix from source and target sets of samples
+ (Xs, ys) and (Xt, yt)
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
+
+ # check the necessary inputs parameters are here
+ if check_params(Xs=Xs, Xt=Xt, ys=ys):
+
+ super(SinkhornL1l2Transport, self).fit(Xs, ys, Xt, yt)
+
+ returned_ = sinkhorn_l1l2_gl(
+ a=self.mu_s, labels_a=ys, b=self.mu_t, M=self.cost_,
+ reg=self.reg_e, eta=self.reg_cl, numItermax=self.max_iter,
+ numInnerItermax=self.max_inner_iter, stopInnerThr=self.tol,
+ verbose=self.verbose, log=self.log)
+
+ # deal with the value of log
+ if self.log:
+ self.coupling_, self.log_ = returned_
+ else:
+ self.coupling_ = returned_
+ self.log_ = dict()
+
+ return self
+
+
+class MappingTransport(BaseEstimator):
+ """MappingTransport: DA methods that aims at jointly estimating a optimal
+ transport coupling and the associated mapping
+
+ Parameters
+ ----------
+ mu : float, optional (default=1)
+ Weight for the linear OT loss (>0)
+ eta : float, optional (default=0.001)
+ Regularization term for the linear mapping L (>0)
+ bias : bool, optional (default=False)
+ Estimate linear mapping with constant bias
+ metric : string, optional (default="sqeuclidean")
+ The ground metric for the Wasserstein problem
+ norm : string, optional (default=None)
+ If given, normalize the ground metric to avoid numerical errors that
+ can occur with large metric values.
+ kernel : string, optional (default="linear")
+ The kernel to use either linear or gaussian
+ sigma : float, optional (default=1)
+ The gaussian kernel parameter
+ max_iter : int, optional (default=100)
+ Max number of BCD iterations
+ tol : float, optional (default=1e-5)
+ Stop threshold on relative loss decrease (>0)
+ max_inner_iter : int, optional (default=10)
+ Max number of iterations (inner CG solver)
+ inner_tol : float, optional (default=1e-6)
+ Stop threshold on error (inner CG solver) (>0)
+ verbose : bool, optional (default=False)
+ Print information along iterations
+ log : bool, optional (default=False)
+ record log if True
+
+ Attributes
+ ----------
+ coupling_ : array-like, shape (n_source_samples, n_target_samples)
+ The optimal coupling
+    mapping_ : array-like
+        The associated mapping; of shape (n_features (+ 1), n_features)
+        if kernel == "linear", or of shape
+        (n_source_samples (+ 1), n_features) if kernel == "gaussian"
+        ("+ 1" when bias is True)
+ log_ : dictionary
+        The dictionary of log, empty dict if parameter log is not True
+
+ References
+ ----------
+
+ .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard,
+ "Mapping estimation for discrete optimal transport",
+ Neural Information Processing Systems (NIPS), 2016.
+
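+    Examples
+    --------
+    A minimal usage sketch with the Gaussian kernel mapping (random toy
+    data, for illustration only):
+
+    >>> import numpy as np
+    >>> from ot.da import MappingTransport
+    >>> Xs, Xt = np.random.randn(10, 2), np.random.randn(20, 2)
+    >>> ot_map = MappingTransport(kernel="gaussian", sigma=1.)
+    >>> transp_Xs = ot_map.fit(Xs=Xs, Xt=Xt).transform(Xs=Xs)
+    >>> transp_Xs.shape
+    (10, 2)
+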
+ """
+
+ def __init__(self, mu=1, eta=0.001, bias=False, metric="sqeuclidean",
+ norm=None, kernel="linear", sigma=1, max_iter=100, tol=1e-5,
+ max_inner_iter=10, inner_tol=1e-6, log=False, verbose=False,
+ verbose2=False):
+
+ self.metric = metric
+ self.norm = norm
+ self.mu = mu
+ self.eta = eta
+ self.bias = bias
+ self.kernel = kernel
+ self.sigma = sigma
+ self.max_iter = max_iter
+ self.tol = tol
+ self.max_inner_iter = max_inner_iter
+ self.inner_tol = inner_tol
+ self.log = log
+ self.verbose = verbose
+ self.verbose2 = verbose2
+
+ def fit(self, Xs=None, ys=None, Xt=None, yt=None):
+ """Builds an optimal coupling and estimates the associated mapping
+ from source and target sets of samples (Xs, ys) and (Xt, yt)
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+ ys : array-like, shape (n_source_samples,)
+ The class labels
+ Xt : array-like, shape (n_target_samples, n_features)
+ The training input samples.
+ yt : array-like, shape (n_labeled_target_samples,)
+ The class labels
+
+ Returns
+ -------
+ self : object
+ Returns self
+ """
+
+ # check the necessary inputs parameters are here
+ if check_params(Xs=Xs, Xt=Xt):
+
+ self.xs_ = Xs
+ self.xt_ = Xt
+
+ if self.kernel == "linear":
+ returned_ = joint_OT_mapping_linear(
+ Xs, Xt, mu=self.mu, eta=self.eta, bias=self.bias,
+ verbose=self.verbose, verbose2=self.verbose2,
+ numItermax=self.max_iter,
+ numInnerItermax=self.max_inner_iter, stopThr=self.tol,
+ stopInnerThr=self.inner_tol, log=self.log)
+
+ elif self.kernel == "gaussian":
+ returned_ = joint_OT_mapping_kernel(
+ Xs, Xt, mu=self.mu, eta=self.eta, bias=self.bias,
+ sigma=self.sigma, verbose=self.verbose,
+ verbose2=self.verbose, numItermax=self.max_iter,
+ numInnerItermax=self.max_inner_iter,
+ stopInnerThr=self.inner_tol, stopThr=self.tol,
+ log=self.log)
+
+ # deal with the value of log
+ if self.log:
+ self.coupling_, self.mapping_, self.log_ = returned_
+ else:
+ self.coupling_, self.mapping_ = returned_
+ self.log_ = dict()
+
+ return self
+
+ def transform(self, Xs):
+ """Transports source samples Xs onto target ones Xt
+
+ Parameters
+ ----------
+ Xs : array-like, shape (n_source_samples, n_features)
+ The training input samples.
+
+ Returns
+ -------
+ transp_Xs : array-like, shape (n_source_samples, n_features)
+            The transported source samples.
+ """
+
+ # check the necessary inputs parameters are here
+ if check_params(Xs=Xs):
+
+ if np.array_equal(self.xs_, Xs):
+ # perform standard barycentric mapping
+ transp = self.coupling_ / np.sum(self.coupling_, 1)[:, None]
+
+ # set nans to 0
+ transp[~ np.isfinite(transp)] = 0
+
+ # compute transported samples
+ transp_Xs = np.dot(transp, self.xt_)
+ else:
+ if self.kernel == "gaussian":
+ K = kernel(Xs, self.xs_, method=self.kernel,
+ sigma=self.sigma)
+ elif self.kernel == "linear":
+ K = Xs
+ if self.bias:
+ K = np.hstack((K, np.ones((Xs.shape[0], 1))))
+ transp_Xs = K.dot(self.mapping_)
+
+ return transp_Xs
diff --git a/ot/datasets.py b/ot/datasets.py
index 7816833..e4fe118 100644
--- a/ot/datasets.py
+++ b/ot/datasets.py
@@ -2,12 +2,16 @@
Simple example datasets for OT
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#
+# License: MIT License
+
import numpy as np
import scipy as sp
-def get_1D_gauss(n,m,s):
+def get_1D_gauss(n, m, s):
"""return a 1D histogram for a gaussian distribution (n bins, mean m and std s)
Parameters
@@ -27,12 +31,12 @@ def get_1D_gauss(n,m,s):
1D histogram for a gaussian distribution
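+
+    Examples
+    --------
+    A quick sketch (100 bins, mean 50, std 5):
+
+    >>> from ot.datasets import get_1D_gauss
+    >>> h = get_1D_gauss(100, m=50, s=5)
+    >>> h.shape
+    (100,)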
"""
- x=np.arange(n,dtype=np.float64)
- h=np.exp(-(x-m)**2/(2*s**2))
- return h/h.sum()
+ x = np.arange(n, dtype=np.float64)
+ h = np.exp(-(x - m)**2 / (2 * s**2))
+ return h / h.sum()
-def get_2D_samples_gauss(n,m,sigma):
+def get_2D_samples_gauss(n, m, sigma):
"""return n samples drawn from 2D gaussian N(m,sigma)
Parameters
@@ -52,17 +56,17 @@ def get_2D_samples_gauss(n,m,sigma):
n samples drawn from N(m,sigma)
"""
- if np.isscalar(sigma):
- sigma=np.array([sigma,])
- if len(sigma)>1:
- P=sp.linalg.sqrtm(sigma)
- res= np.random.randn(n,2).dot(P)+m
+ if np.isscalar(sigma):
+ sigma = np.array([sigma, ])
+ if len(sigma) > 1:
+ P = sp.linalg.sqrtm(sigma)
+ res = np.random.randn(n, 2).dot(P) + m
else:
- res= np.random.randn(n,2)*np.sqrt(sigma)+m
+ res = np.random.randn(n, 2) * np.sqrt(sigma) + m
return res
-def get_data_classif(dataset,n,nz=.5,theta=0,**kwargs):
+def get_data_classif(dataset, n, nz=.5, theta=0, **kwargs):
""" dataset generation for classification problems
Parameters
@@ -84,48 +88,53 @@ def get_data_classif(dataset,n,nz=.5,theta=0,**kwargs):
labels of the samples
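+
+    Examples
+    --------
+    A quick sketch with the '3gauss' toy dataset:
+
+    >>> from ot.datasets import get_data_classif
+    >>> x, y = get_data_classif('3gauss', 100, nz=.5)
+    >>> x.shape, y.shape
+    ((100, 2), (100,))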
"""
- if dataset.lower()=='3gauss':
- y=np.floor((np.arange(n)*1.0/n*3))+1
- x=np.zeros((n,2))
+ if dataset.lower() == '3gauss':
+ y = np.floor((np.arange(n) * 1.0 / n * 3)) + 1
+ x = np.zeros((n, 2))
# class 1
- x[y==1,0]=-1.; x[y==1,1]=-1.
- x[y==2,0]=-1.; x[y==2,1]=1.
- x[y==3,0]=1. ; x[y==3,1]=0
-
- x[y!=3,:]+=1.5*nz*np.random.randn(sum(y!=3),2)
- x[y==3,:]+=2*nz*np.random.randn(sum(y==3),2)
-
- elif dataset.lower()=='3gauss2':
- y=np.floor((np.arange(n)*1.0/n*3))+1
- x=np.zeros((n,2))
- y[y==4]=3
+ x[y == 1, 0] = -1.
+ x[y == 1, 1] = -1.
+ x[y == 2, 0] = -1.
+ x[y == 2, 1] = 1.
+ x[y == 3, 0] = 1.
+ x[y == 3, 1] = 0
+
+ x[y != 3, :] += 1.5 * nz * np.random.randn(sum(y != 3), 2)
+ x[y == 3, :] += 2 * nz * np.random.randn(sum(y == 3), 2)
+
+ elif dataset.lower() == '3gauss2':
+ y = np.floor((np.arange(n) * 1.0 / n * 3)) + 1
+ x = np.zeros((n, 2))
+ y[y == 4] = 3
# class 1
- x[y==1,0]=-2.; x[y==1,1]=-2.
- x[y==2,0]=-2.; x[y==2,1]=2.
- x[y==3,0]=2. ; x[y==3,1]=0
-
- x[y!=3,:]+=nz*np.random.randn(sum(y!=3),2)
- x[y==3,:]+=2*nz*np.random.randn(sum(y==3),2)
-
- elif dataset.lower()=='gaussrot' :
- rot=np.array([[np.cos(theta),np.sin(theta)],[-np.sin(theta),np.cos(theta)]])
- m1=np.array([-1,1])
- m2=np.array([1,-1])
- y=np.floor((np.arange(n)*1.0/n*2))+1
- n1=np.sum(y==1)
- n2=np.sum(y==2)
- x=np.zeros((n,2))
-
- x[y==1,:]=get_2D_samples_gauss(n1,m1,nz)
- x[y==2,:]=get_2D_samples_gauss(n2,m2,nz)
-
- x=x.dot(rot)
-
-
+ x[y == 1, 0] = -2.
+ x[y == 1, 1] = -2.
+ x[y == 2, 0] = -2.
+ x[y == 2, 1] = 2.
+ x[y == 3, 0] = 2.
+ x[y == 3, 1] = 0
+
+ x[y != 3, :] += nz * np.random.randn(sum(y != 3), 2)
+ x[y == 3, :] += 2 * nz * np.random.randn(sum(y == 3), 2)
+
+ elif dataset.lower() == 'gaussrot':
+ rot = np.array(
+ [[np.cos(theta), np.sin(theta)], [-np.sin(theta), np.cos(theta)]])
+ m1 = np.array([-1, 1])
+ m2 = np.array([1, -1])
+ y = np.floor((np.arange(n) * 1.0 / n * 2)) + 1
+ n1 = np.sum(y == 1)
+ n2 = np.sum(y == 2)
+ x = np.zeros((n, 2))
+
+ x[y == 1, :] = get_2D_samples_gauss(n1, m1, nz)
+ x[y == 2, :] = get_2D_samples_gauss(n2, m2, nz)
+
+ x = x.dot(rot)
else:
- x=np.array(0)
- y=np.array(0)
+ x = np.array(0)
+ y = np.array(0)
print("unknown dataset")
- return x,y.astype(int) \ No newline at end of file
+ return x, y.astype(int)
diff --git a/ot/dr.py b/ot/dr.py
index 763ce35..d30ab30 100644
--- a/ot/dr.py
+++ b/ot/dr.py
@@ -3,43 +3,50 @@
Dimension reduction with optimal transport
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#
+# License: MIT License
+
+from scipy import linalg
import autograd.numpy as np
from pymanopt.manifolds import Stiefel
from pymanopt import Problem
from pymanopt.solvers import SteepestDescent, TrustRegions
-import scipy.linalg as la
-def dist(x1,x2):
+
+def dist(x1, x2):
""" Compute squared euclidean distance between samples (autograd)
"""
- x1p2=np.sum(np.square(x1),1)
- x2p2=np.sum(np.square(x2),1)
- return x1p2.reshape((-1,1))+x2p2.reshape((1,-1))-2*np.dot(x1,x2.T)
+ x1p2 = np.sum(np.square(x1), 1)
+ x2p2 = np.sum(np.square(x2), 1)
+ return x1p2.reshape((-1, 1)) + x2p2.reshape((1, -1)) - 2 * np.dot(x1, x2.T)
+
-def sinkhorn(w1,w2,M,reg,k):
+def sinkhorn(w1, w2, M, reg, k):
"""Sinkhorn algorithm with fixed number of iteration (autograd)
"""
- K=np.exp(-M/reg)
- ui=np.ones((M.shape[0],))
- vi=np.ones((M.shape[1],))
+ K = np.exp(-M / reg)
+ ui = np.ones((M.shape[0],))
+ vi = np.ones((M.shape[1],))
for i in range(k):
- vi=w2/(np.dot(K.T,ui))
- ui=w1/(np.dot(K,vi))
- G=ui.reshape((M.shape[0],1))*K*vi.reshape((1,M.shape[1]))
+ vi = w2 / (np.dot(K.T, ui))
+ ui = w1 / (np.dot(K, vi))
+ G = ui.reshape((M.shape[0], 1)) * K * vi.reshape((1, M.shape[1]))
return G
-def split_classes(X,y):
+
+def split_classes(X, y):
"""split samples in X by classes in y
"""
- lstsclass=np.unique(y)
- return [X[y==i,:].astype(np.float32) for i in lstsclass]
+ lstsclass = np.unique(y)
+ return [X[y == i, :].astype(np.float32) for i in lstsclass]
+
+
+def fda(X, y, p=2, reg=1e-16):
+ """
+ Fisher Discriminant Analysis
-def fda(X,y,p=2,reg=1e-16):
- """
- Fisher Discriminant Analysis
-
-
Parameters
----------
X : numpy.ndarray (n,d)
@@ -59,62 +66,62 @@ def fda(X,y,p=2,reg=1e-16):
proj : fun
projection function including mean centering
-
- """
-
- mx=np.mean(X)
- X-=mx.reshape((1,-1))
-
+
+ """
+
+ mx = np.mean(X)
+ X -= mx.reshape((1, -1))
+
# data split between classes
- d=X.shape[1]
- xc=split_classes(X,y)
- nc=len(xc)
-
- p=min(nc-1,p)
-
- Cw=0
+ d = X.shape[1]
+ xc = split_classes(X, y)
+ nc = len(xc)
+
+ p = min(nc - 1, p)
+
+ Cw = 0
for x in xc:
- Cw+=np.cov(x,rowvar=False)
- Cw/=nc
-
- mxc=np.zeros((d,nc))
-
+ Cw += np.cov(x, rowvar=False)
+ Cw /= nc
+
+ mxc = np.zeros((d, nc))
+
for i in range(nc):
- mxc[:,i]=np.mean(xc[i])
-
- mx0=np.mean(mxc,1)
- Cb=0
+ mxc[:, i] = np.mean(xc[i])
+
+ mx0 = np.mean(mxc, 1)
+ Cb = 0
for i in range(nc):
- Cb+=(mxc[:,i]-mx0).reshape((-1,1))*(mxc[:,i]-mx0).reshape((1,-1))
-
- w,V=la.eig(Cb,Cw+reg*np.eye(d))
-
- idx=np.argsort(w.real)
-
- Popt=V[:,idx[-p:]]
-
-
-
+ Cb += (mxc[:, i] - mx0).reshape((-1, 1)) * \
+ (mxc[:, i] - mx0).reshape((1, -1))
+
+ w, V = linalg.eig(Cb, Cw + reg * np.eye(d))
+
+ idx = np.argsort(w.real)
+
+ Popt = V[:, idx[-p:]]
+
def proj(X):
- return (X-mx.reshape((1,-1))).dot(Popt)
-
+ return (X - mx.reshape((1, -1))).dot(Popt)
+
return Popt, proj
-def wda(X,y,p=2,reg=1,k=10,solver = None,maxiter=100,verbose=0,P0=None):
- """
+
+def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None):
+ """
Wasserstein Discriminant Analysis [11]_
-
+
The function solves the following optimization problem:
.. math::
P = \\text{arg}\min_P \\frac{\\sum_i W(PX^i,PX^i)}{\\sum_{i,j\\neq i} W(PX^i,PX^j)}
where :
-
+
- :math:`P` is a linear projection operator in the Stiefel(p,d) manifold
- :math:`W` is entropic regularized Wasserstein distances
- - :math:`X^i` are samples in the dataset corresponding to class i
-
+ - :math:`X^i` are samples in the dataset corresponding to class i
+
Parameters
----------
X : numpy.ndarray (n,d)
@@ -147,54 +154,50 @@ def wda(X,y,p=2,reg=1,k=10,solver = None,maxiter=100,verbose=0,P0=None):
----------
.. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016). Wasserstein Discriminant Analysis. arXiv preprint arXiv:1608.08063.
-
- """
-
- mx=np.mean(X)
- X-=mx.reshape((1,-1))
-
+
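+    Examples
+    --------
+    A minimal sketch on random labeled data (requires pymanopt; the
+    solver may print progress information):
+
+    >>> import numpy as np
+    >>> from ot.dr import wda
+    >>> X = np.random.randn(20, 5)
+    >>> y = np.random.randint(0, 2, 20)
+    >>> Popt, proj = wda(X, y, p=2, reg=1, k=10, maxiter=20)
+    >>> Xp = proj(X)
+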
+ """ # noqa
+
+ mx = np.mean(X)
+ X -= mx.reshape((1, -1))
+
# data split between classes
- d=X.shape[1]
- xc=split_classes(X,y)
+ d = X.shape[1]
+ xc = split_classes(X, y)
    # compute uniform weights
- wc=[np.ones((x.shape[0]),dtype=np.float32)/x.shape[0] for x in xc]
-
+ wc = [np.ones((x.shape[0]), dtype=np.float32) / x.shape[0] for x in xc]
+
def cost(P):
# wda loss
- loss_b=0
- loss_w=0
-
- for i,xi in enumerate(xc):
- xi=np.dot(xi,P)
- for j,xj in enumerate(xc[i:]):
- xj=np.dot(xj,P)
- M=dist(xi,xj)
- G=sinkhorn(wc[i],wc[j+i],M,reg,k)
- if j==0:
- loss_w+=np.sum(G*M)
+ loss_b = 0
+ loss_w = 0
+
+ for i, xi in enumerate(xc):
+ xi = np.dot(xi, P)
+ for j, xj in enumerate(xc[i:]):
+ xj = np.dot(xj, P)
+ M = dist(xi, xj)
+ G = sinkhorn(wc[i], wc[j + i], M, reg, k)
+ if j == 0:
+ loss_w += np.sum(G * M)
else:
- loss_b+=np.sum(G*M)
-
- # loss inversed because minimization
- return loss_w/loss_b
-
-
+ loss_b += np.sum(G * M)
+
+        # ratio inverted (within / between) because we minimize
+ return loss_w / loss_b
+
# declare manifold and problem
- manifold = Stiefel(d, p)
+ manifold = Stiefel(d, p)
problem = Problem(manifold=manifold, cost=cost)
-
+
# declare solver and solve
if solver is None:
- solver= SteepestDescent(maxiter=maxiter,logverbosity=verbose)
- elif solver in ['tr','TrustRegions']:
- solver= TrustRegions(maxiter=maxiter,logverbosity=verbose)
-
- Popt = solver.solve(problem,x=P0)
-
- def proj(X):
- return (X-mx.reshape((1,-1))).dot(Popt)
-
- return Popt, proj
+ solver = SteepestDescent(maxiter=maxiter, logverbosity=verbose)
+ elif solver in ['tr', 'TrustRegions']:
+ solver = TrustRegions(maxiter=maxiter, logverbosity=verbose)
+    Popt = solver.solve(problem, x=P0)
+
+    def proj(X):
+        return (X - mx.reshape((1, -1))).dot(Popt)
+
+    return Popt, proj
diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py
index 40b11c0..c8f9433 100644
--- a/ot/gpu/__init__.py
+++ b/ot/gpu/__init__.py
@@ -4,4 +4,9 @@ from . import bregman
from . import da
from .bregman import sinkhorn
+# Author: Remi Flamary <remi.flamary@unice.fr>
+# Leo Gautheron <https://github.com/aje>
+#
+# License: MIT License
+
__all__ = ["bregman", "da", "sinkhorn"]
diff --git a/ot/gpu/bregman.py b/ot/gpu/bregman.py
index 2c3e317..47939c4 100644
--- a/ot/gpu/bregman.py
+++ b/ot/gpu/bregman.py
@@ -3,13 +3,18 @@
Bregman projections for regularized OT with GPU
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+# Leo Gautheron <https://github.com/aje>
+#
+# License: MIT License
+
import numpy as np
import cudamat
def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
log=False, returnAsGPU=False):
- """
+ r"""
Solve the entropic regularization optimal transport problem on GPU
The function solves the following optimization problem:
@@ -82,7 +87,7 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
ot.lp.emd : Unregularized OT
ot.optim.cg : General regularized OT
- """
+ """
# init data
Nini = len(a)
Nfin = len(b)
@@ -92,11 +97,11 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
# we assume that no distances are null except those of the diagonal of
# distances
- u = (np.ones(Nini)/Nini).reshape((Nini, 1))
+ u = (np.ones(Nini) / Nini).reshape((Nini, 1))
u_GPU = cudamat.CUDAMatrix(u)
a_GPU = cudamat.CUDAMatrix(a.reshape((Nini, 1)))
ones_GPU = cudamat.empty(u_GPU.shape).assign(1)
- v = (np.ones(Nfin)/Nfin).reshape((Nfin, 1))
+ v = (np.ones(Nfin) / Nfin).reshape((Nfin, 1))
v_GPU = cudamat.CUDAMatrix(v)
b_GPU = cudamat.CUDAMatrix(b.reshape((Nfin, 1)))
@@ -121,7 +126,7 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
ones_GPU.divide(Kp_GPU.dot(v_GPU), target=u_GPU)
if (np.any(KtransposeU_GPU.asarray() == 0) or
- not u_GPU.allfinite() or not v_GPU.allfinite()):
+ not u_GPU.allfinite() or not v_GPU.allfinite()):
# we have reached the machine precision
# come back to previous solution and quit loop
print('Warning: numerical errors at iteration', cpt)
@@ -142,7 +147,8 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
if verbose:
if cpt % 200 == 0:
- print('{:5s}|{:12s}'.format('It.', 'Err')+'\n'+'-'*19)
+ print(
+ '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
print('{:5d}|{:8e}|'.format(cpt, err))
cpt += 1
if log:
diff --git a/ot/gpu/da.py b/ot/gpu/da.py
index b05ff70..05c580f 100644
--- a/ot/gpu/da.py
+++ b/ot/gpu/da.py
@@ -3,6 +3,14 @@
Domain adaptation with optimal transport with GPU implementation
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+# Nicolas Courty <ncourty@irisa.fr>
+# Michael Perrot <michael.perrot@univ-st-etienne.fr>
+# Leo Gautheron <https://github.com/aje>
+#
+# License: MIT License
+
+
import numpy as np
from ..utils import unif
from ..da import OTDA
@@ -167,7 +175,7 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
tmpC_GPU = cudamat.empty((Nfin, nbRow)).assign(0)
transp_GPU.transpose().select_columns(indices_labels[i], tmpC_GPU)
majs_GPU = tmpC_GPU.sum(axis=1).add(epsilon)
- cudamat.pow(majs_GPU, (p-1))
+ cudamat.pow(majs_GPU, (p - 1))
majs_GPU.mult(p)
tmpC_GPU.assign(0)
diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h
index fb7feca..15e9115 100644
--- a/ot/lp/EMD.h
+++ b/ot/lp/EMD.h
@@ -23,8 +23,12 @@
using namespace lemon;
typedef unsigned int node_id_type;
+enum ProblemType {
+ INFEASIBLE,
+ OPTIMAL,
+ UNBOUNDED
+};
-void EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G,
- double* alpha, double* beta, double *cost, int max_iter);
+int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int max_iter);
#endif
diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp
index 0977e75..8ac43c7 100644
--- a/ot/lp/EMD_wrapper.cpp
+++ b/ot/lp/EMD_wrapper.cpp
@@ -15,10 +15,11 @@
#include "EMD.h"
-void EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
+int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
double* alpha, double* beta, double *cost, int max_iter) {
    // beware M and C are stored in row major C style!!!
- int n, m, i,cur;
+ int n, m, i, cur;
+ double max;
typedef FullBipartiteDigraph Digraph;
DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
@@ -44,7 +45,7 @@ void EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
std::vector<int> indI(n), indJ(m);
std::vector<double> weights1(n), weights2(m);
Digraph di(n, m);
- NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m,max_iter);
+ NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m, max_iter);
// Set supply and demand, don't account for 0 values (faster)
@@ -108,5 +109,5 @@ void EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
};
-
+ return ret;
}
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 915a18c..a14d4e4 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -3,6 +3,10 @@
Solvers for the original linear program OT problem
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#
+# License: MIT License
+
import numpy as np
# import compiled emd
from .emd_wrap import emd_c, emd2_c
@@ -10,8 +14,7 @@ from ..utils import parmap
import multiprocessing
-
-def emd(a, b, M, dual_variables=False, max_iter=-1):
+def emd(a, b, M, numItermax=100000, dual_variables=False):
"""Solves the Earth Movers distance problem and returns the OT matrix
@@ -36,6 +39,9 @@ def emd(a, b, M, dual_variables=False, max_iter=-1):
Target histogram (uniform weigth if empty list)
M : (ns,nt) ndarray, float64
loss matrix
+ numItermax : int, optional (default=100000)
+ The maximum number of iterations before stopping the optimization
+ algorithm if it has not converged.
Returns
-------
@@ -48,7 +54,7 @@ def emd(a, b, M, dual_variables=False, max_iter=-1):
Simple example with obvious solution. The function emd accepts lists and
 performs automatic conversion to numpy arrays
-
+
>>> import ot
>>> a=[.5,.5]
>>> b=[.5,.5]
@@ -80,13 +86,13 @@ def emd(a, b, M, dual_variables=False, max_iter=-1):
if len(b) == 0:
b = np.ones((M.shape[1], ), dtype=np.float64)/M.shape[1]
- G, alpha, beta = emd_c(a, b, M, max_iter)
+ G, alpha, beta = emd_c(a, b, M, numItermax)
if dual_variables:
return G, alpha, beta
return G
-def emd2(a, b, M,processes=multiprocessing.cpu_count(), max_iter=-1):
- """Solves the Earth Movers distance problem and returns the loss
+def emd2(a, b, M, processes=multiprocessing.cpu_count(), numItermax=100000):
+ """Solves the Earth Movers distance problem and returns the loss
.. math::
\gamma = arg\min_\gamma <\gamma,M>_F
@@ -109,6 +115,9 @@ def emd2(a, b, M,processes=multiprocessing.cpu_count(), max_iter=-1):
Target histogram (uniform weigth if empty list)
M : (ns,nt) ndarray, float64
loss matrix
+ numItermax : int, optional (default=100000)
+ The maximum number of iterations before stopping the optimization
+ algorithm if it has not converged.
Returns
-------
@@ -121,15 +130,15 @@ def emd2(a, b, M,processes=multiprocessing.cpu_count(), max_iter=-1):
Simple example with obvious solution. The function emd accepts lists and
 performs automatic conversion to numpy arrays
-
-
+
+
>>> import ot
>>> a=[.5,.5]
>>> b=[.5,.5]
>>> M=[[0.,1.],[1.,0.]]
>>> ot.emd2(a,b,M)
0.0
-
+
References
----------
@@ -152,16 +161,13 @@ def emd2(a, b, M,processes=multiprocessing.cpu_count(), max_iter=-1):
a = np.ones((M.shape[0], ), dtype=np.float64)/M.shape[0]
if len(b) == 0:
b = np.ones((M.shape[1], ), dtype=np.float64)/M.shape[1]
-
+
if len(b.shape)==1:
- return emd2_c(a, b, M, max_iter)[0]
- else:
- nb=b.shape[1]
- #res=[emd2_c(a,b[:,i].copy(),M) for i in range(nb)]
- def f(b):
- return emd2_c(a,b,M, max_iter)[0]
- res= parmap(f, [b[:,i] for i in range(nb)],processes)
- return np.array(res)
-
-
- \ No newline at end of file
+ return emd2_c(a, b, M, numItermax)[0]
+ nb = b.shape[1]
+ # res = [emd2_c(a, b[:, i].copy(), M, numItermax) for i in range(nb)]
+
+    def f(b):
+        return emd2_c(a, b, M, numItermax)[0]
+
+    res = parmap(f, [b[:, i] for i in range(nb)], processes)
+    return np.array(res)
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 4febb32..7056e0e 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -1,9 +1,12 @@
# -*- coding: utf-8 -*-
"""
-Created on Thu Sep 11 08:42:08 2014
-
-@author: rflamary
+Cython linker with C solver
"""
+
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#
+# License: MIT License
+
import numpy as np
cimport numpy as np
@@ -12,47 +15,50 @@ cimport cython
cdef extern from "EMD.h":
- void EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double *cost,
- double* alpha, double* beta, int max_iter)
+ int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int max_iter)
+ cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED
@cython.boundscheck(False)
@cython.wraparound(False)
-def emd_c( np.ndarray[double, ndim=1, mode="c"] a,np.ndarray[double, ndim=1, mode="c"] b,np.ndarray[double, ndim=2, mode="c"] M, int maxiter):
+def emd_c( np.ndarray[double, ndim=1, mode="c"] a,np.ndarray[double, ndim=1, mode="c"] b,np.ndarray[double, ndim=2, mode="c"] M, int max_iter):
"""
Solves the Earth Movers distance problem and returns the optimal transport matrix
-
+
gamm=emd(a,b,M)
-
+
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F
-
+ \gamma = arg\min_\gamma <\gamma,M>_F
+
s.t. \gamma 1 = a
-
- \gamma^T 1= b
-
+
+ \gamma^T 1= b
+
\gamma\geq 0
where :
-
+
- M is the metric cost matrix
- a and b are the sample weights
-
+
Parameters
----------
a : (ns,) ndarray, float64
- source histogram
+ source histogram
b : (nt,) ndarray, float64
target histogram
M : (ns,nt) ndarray, float64
- loss matrix
-
-
+ loss matrix
+ max_iter : int
+ The maximum number of iterations before stopping the optimization
+ algorithm if it has not converged.
+
+
Returns
-------
gamma: (ns x nt) ndarray
Optimal transportation matrix for the given parameters
-
+
"""
cdef int n1= M.shape[0]
cdef int n2= M.shape[1]
@@ -61,7 +67,8 @@ def emd_c( np.ndarray[double, ndim=1, mode="c"] a,np.ndarray[double, ndim=1, mod
cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([n1, n2])
cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1)
cdef np.ndarray[double, ndim=1, mode="c"] beta=np.zeros(n2)
-
+
+
if not len(a):
a=np.ones((n1,))/n1
@@ -69,47 +76,54 @@ def emd_c( np.ndarray[double, ndim=1, mode="c"] a,np.ndarray[double, ndim=1, mod
b=np.ones((n2,))/n2
# calling the function
- EMD_wrap(n1,n2,<double*> a.data,<double*> b.data,<double*> M.data,<double*> G.data,
- <double*> alpha.data, <double*> beta.data, <double*> &cost, maxiter)
+ cdef int resultSolver = EMD_wrap(n1,n2,<double*> a.data,<double*> b.data,<double*> M.data,<double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
+ if resultSolver != OPTIMAL:
+ if resultSolver == INFEASIBLE:
+ print("Problem infeasible. Try to increase numItermax.")
+ elif resultSolver == UNBOUNDED:
+ print("Problem unbounded")
return G, alpha, beta
@cython.boundscheck(False)
@cython.wraparound(False)
-def emd2_c( np.ndarray[double, ndim=1, mode="c"] a,np.ndarray[double, ndim=1, mode="c"] b,np.ndarray[double, ndim=2, mode="c"] M, int maxiter):
+def emd2_c( np.ndarray[double, ndim=1, mode="c"] a,np.ndarray[double, ndim=1, mode="c"] b,np.ndarray[double, ndim=2, mode="c"] M, int max_iter):
"""
Solves the Earth Movers distance problem and returns the optimal transport loss
-
+
gamm=emd(a,b,M)
-
+
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F
-
+ \gamma = arg\min_\gamma <\gamma,M>_F
+
s.t. \gamma 1 = a
-
- \gamma^T 1= b
-
+
+ \gamma^T 1= b
+
\gamma\geq 0
where :
-
+
- M is the metric cost matrix
- a and b are the sample weights
-
+
Parameters
----------
a : (ns,) ndarray, float64
- source histogram
+ source histogram
b : (nt,) ndarray, float64
target histogram
M : (ns,nt) ndarray, float64
- loss matrix
-
-
+ loss matrix
+ max_iter : int
+ The maximum number of iterations before stopping the optimization
+ algorithm if it has not converged.
+
+
Returns
-------
gamma: (ns x nt) ndarray
Optimal transportation matrix for the given parameters
-
+
"""
cdef int n1= M.shape[0]
cdef int n2= M.shape[1]
@@ -119,16 +133,19 @@ def emd2_c( np.ndarray[double, ndim=1, mode="c"] a,np.ndarray[double, ndim=1, mo
cdef np.ndarray[double, ndim = 1, mode = "c"] alpha = np.zeros([n1])
cdef np.ndarray[double, ndim = 1, mode = "c"] beta = np.zeros([n2])
-
+
if not len(a):
a=np.ones((n1,))/n1
if not len(b):
b=np.ones((n2,))/n2
# calling the function
- EMD_wrap(n1,n2,<double*> a.data,<double*> b.data,<double*> M.data,<double*> G.data,
- <double*> alpha.data, <double*> beta.data, <double*> &cost, maxiter)
-
+ cdef int resultSolver = EMD_wrap(n1,n2,<double*> a.data,<double*> b.data,<double*> M.data,<double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
+ if resultSolver != OPTIMAL:
+ if resultSolver == INFEASIBLE:
+ print("Problem infeasible. Try to inscrease numItermax.")
+ elif resultSolver == UNBOUNDED:
+ print("Problem unbounded")
return cost, alpha, beta
diff --git a/ot/optim.py b/ot/optim.py
index 79f4f66..1d09adc 100644
--- a/ot/optim.py
+++ b/ot/optim.py
@@ -3,13 +3,19 @@
Optimization algorithms for OT
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#
+# License: MIT License
+
import numpy as np
from scipy.optimize.linesearch import scalar_search_armijo
from .lp import emd
from .bregman import sinkhorn
# The corresponding scipy function does not work for matrices
-def line_search_armijo(f,xk,pk,gfk,old_fval,args=(),c1=1e-4,alpha0=0.99):
+
+
+def line_search_armijo(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=0.99):
"""
Armijo linesearch function that works with matrices
@@ -51,20 +57,21 @@ def line_search_armijo(f,xk,pk,gfk,old_fval,args=(),c1=1e-4,alpha0=0.99):
def phi(alpha1):
fc[0] += 1
- return f(xk + alpha1*pk, *args)
+ return f(xk + alpha1 * pk, *args)
if old_fval is None:
phi0 = phi(0.)
else:
phi0 = old_fval
- derphi0 = np.sum(pk*gfk) # Quickfix for matrices
- alpha,phi1 = scalar_search_armijo(phi,phi0,derphi0,c1=c1,alpha0=alpha0)
+ derphi0 = np.sum(pk * gfk) # Quickfix for matrices
+ alpha, phi1 = scalar_search_armijo(
+ phi, phi0, derphi0, c1=c1, alpha0=alpha0)
- return alpha,fc[0],phi1
+ return alpha, fc[0], phi1
-def cg(a,b,M,reg,f,df,G0=None,numItermax = 200,stopThr=1e-9,verbose=False,log=False):
+def cg(a, b, M, reg, f, df, G0=None, numItermax=200, stopThr=1e-9, verbose=False, log=False):
"""
Solve the general regularized OT problem with conditional gradient
@@ -128,74 +135,74 @@ def cg(a,b,M,reg,f,df,G0=None,numItermax = 200,stopThr=1e-9,verbose=False,log=Fa
"""
- loop=1
+ loop = 1
if log:
- log={'loss':[]}
+ log = {'loss': []}
if G0 is None:
- G=np.outer(a,b)
+ G = np.outer(a, b)
else:
- G=G0
+ G = G0
def cost(G):
- return np.sum(M*G)+reg*f(G)
+ return np.sum(M * G) + reg * f(G)
- f_val=cost(G)
+ f_val = cost(G)
if log:
log['loss'].append(f_val)
- it=0
+ it = 0
if verbose:
- print('{:5s}|{:12s}|{:8s}'.format('It.','Loss','Delta loss')+'\n'+'-'*32)
- print('{:5d}|{:8e}|{:8e}'.format(it,f_val,0))
+ print('{:5s}|{:12s}|{:8s}'.format(
+ 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32)
+ print('{:5d}|{:8e}|{:8e}'.format(it, f_val, 0))
while loop:
- it+=1
- old_fval=f_val
-
+ it += 1
+ old_fval = f_val
# problem linearization
- Mi=M+reg*df(G)
+ Mi = M + reg * df(G)
# set M positive
- Mi+=Mi.min()
+ Mi += Mi.min()
# solve linear program
- Gc=emd(a,b,Mi)
+ Gc = emd(a, b, Mi)
- deltaG=Gc-G
+ deltaG = Gc - G
# line search
- alpha,fc,f_val = line_search_armijo(cost,G,deltaG,Mi,f_val)
+ alpha, fc, f_val = line_search_armijo(cost, G, deltaG, Mi, f_val)
- G=G+alpha*deltaG
+ G = G + alpha * deltaG
# test convergence
- if it>=numItermax:
- loop=0
-
- delta_fval=(f_val-old_fval)/abs(f_val)
- if abs(delta_fval)<stopThr:
- loop=0
+ if it >= numItermax:
+ loop = 0
+ delta_fval = (f_val - old_fval) / abs(f_val)
+ if abs(delta_fval) < stopThr:
+ loop = 0
if log:
log['loss'].append(f_val)
if verbose:
- if it%20 ==0:
- print('{:5s}|{:12s}|{:8s}'.format('It.','Loss','Delta loss')+'\n'+'-'*32)
- print('{:5d}|{:8e}|{:8e}'.format(it,f_val,delta_fval))
-
+ if it % 20 == 0:
+ print('{:5s}|{:12s}|{:8s}'.format(
+ 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32)
+ print('{:5d}|{:8e}|{:8e}'.format(it, f_val, delta_fval))
if log:
- return G,log
+ return G, log
else:
return G
-def gcg(a,b,M,reg1,reg2,f,df,G0=None,numItermax = 10,numInnerItermax = 200,stopThr=1e-9,verbose=False,log=False):
+
+def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, numInnerItermax=200, stopThr=1e-9, verbose=False, log=False):
"""
Solve the general regularized OT problem with the generalized conditional gradient
@@ -264,70 +271,68 @@ def gcg(a,b,M,reg1,reg2,f,df,G0=None,numItermax = 10,numInnerItermax = 200,stopT
"""
- loop=1
+ loop = 1
if log:
- log={'loss':[]}
+ log = {'loss': []}
if G0 is None:
- G=np.outer(a,b)
+ G = np.outer(a, b)
else:
- G=G0
+ G = G0
def cost(G):
- return np.sum(M*G)+ reg1*np.sum(G*np.log(G)) + reg2*f(G)
+ return np.sum(M * G) + reg1 * np.sum(G * np.log(G)) + reg2 * f(G)
- f_val=cost(G)
+ f_val = cost(G)
if log:
log['loss'].append(f_val)
- it=0
+ it = 0
if verbose:
- print('{:5s}|{:12s}|{:8s}'.format('It.','Loss','Delta loss')+'\n'+'-'*32)
- print('{:5d}|{:8e}|{:8e}'.format(it,f_val,0))
+ print('{:5s}|{:12s}|{:8s}'.format(
+ 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32)
+ print('{:5d}|{:8e}|{:8e}'.format(it, f_val, 0))
while loop:
- it+=1
- old_fval=f_val
-
+ it += 1
+ old_fval = f_val
# problem linearization
- Mi=M+reg2*df(G)
+ Mi = M + reg2 * df(G)
# solve linear program with Sinkhorn
- #Gc = sinkhorn_stabilized(a,b, Mi, reg1, numItermax = numInnerItermax)
- Gc = sinkhorn(a,b, Mi, reg1, numItermax = numInnerItermax)
+ # Gc = sinkhorn_stabilized(a,b, Mi, reg1, numItermax = numInnerItermax)
+ Gc = sinkhorn(a, b, Mi, reg1, numItermax=numInnerItermax)
- deltaG=Gc-G
+ deltaG = Gc - G
# line search
- dcost=Mi+reg1*(1+np.log(G)) #??
- alpha,fc,f_val = line_search_armijo(cost,G,deltaG,dcost,f_val)
+        dcost = Mi + reg1 * (1 + np.log(G))  # gradient of the full cost
+ alpha, fc, f_val = line_search_armijo(cost, G, deltaG, dcost, f_val)
- G=G+alpha*deltaG
+ G = G + alpha * deltaG
# test convergence
- if it>=numItermax:
- loop=0
-
- delta_fval=(f_val-old_fval)/abs(f_val)
- if abs(delta_fval)<stopThr:
- loop=0
+ if it >= numItermax:
+ loop = 0
+ delta_fval = (f_val - old_fval) / abs(f_val)
+ if abs(delta_fval) < stopThr:
+ loop = 0
if log:
log['loss'].append(f_val)
if verbose:
- if it%20 ==0:
- print('{:5s}|{:12s}|{:8s}'.format('It.','Loss','Delta loss')+'\n'+'-'*32)
- print('{:5d}|{:8e}|{:8e}'.format(it,f_val,delta_fval))
-
+ if it % 20 == 0:
+ print('{:5s}|{:12s}|{:8s}'.format(
+ 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32)
+ print('{:5d}|{:8e}|{:8e}'.format(it, f_val, delta_fval))
if log:
- return G,log
+ return G, log
else:
return G
-
diff --git a/ot/plot.py b/ot/plot.py
index 9737f8a..784a372 100644
--- a/ot/plot.py
+++ b/ot/plot.py
@@ -2,91 +2,84 @@
Functions for plotting OT matrices
"""
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#
+# License: MIT License
import numpy as np
import matplotlib.pylab as pl
from matplotlib import gridspec
-def plot1D_mat(a,b,M,title=''):
- """ Plot matrix M with the source and target 1D distribution
-
- Creates a subplot with the source distribution a on the left and
+def plot1D_mat(a, b, M, title=''):
+ """ Plot matrix M with the source and target 1D distribution
+
+ Creates a subplot with the source distribution a on the left and
 target distribution b on the top. The matrix M is shown in between.
-
-
+
+
Parameters
----------
-
- a : np.array (na,)
+ a : np.array, shape (na,)
Source distribution
- b : np.array (nb,)
- Target distribution
- M : np.array (na,nb)
+ b : np.array, shape (nb,)
+ Target distribution
+ M : np.array, shape (na,nb)
Matrix to plot
-
-
-
"""
-
- na=M.shape[0]
- nb=M.shape[1]
-
+ na, nb = M.shape
+
gs = gridspec.GridSpec(3, 3)
-
-
- xa=np.arange(na)
- xb=np.arange(nb)
-
-
- ax1=pl.subplot(gs[0,1:])
- pl.plot(xb,b,'r',label='Target distribution')
+
+ xa = np.arange(na)
+ xb = np.arange(nb)
+
+ ax1 = pl.subplot(gs[0, 1:])
+ pl.plot(xb, b, 'r', label='Target distribution')
pl.yticks(())
pl.title(title)
-
- #pl.axis('off')
-
- ax2=pl.subplot(gs[1:,0])
- pl.plot(a,xa,'b',label='Source distribution')
+
+ ax2 = pl.subplot(gs[1:, 0])
+ pl.plot(a, xa, 'b', label='Source distribution')
pl.gca().invert_xaxis()
pl.gca().invert_yaxis()
pl.xticks(())
- #pl.ylim((0,n))
- #pl.axis('off')
-
- pl.subplot(gs[1:,1:],sharex=ax1,sharey=ax2)
- pl.imshow(M,interpolation='nearest')
-
- pl.xlim((0,nb))
+ pl.subplot(gs[1:, 1:], sharex=ax1, sharey=ax2)
+ pl.imshow(M, interpolation='nearest')
+ pl.axis('off')
+
+ pl.xlim((0, nb))
+ pl.tight_layout()
+ pl.subplots_adjust(wspace=0., hspace=0.2)
-def plot2D_samples_mat(xs,xt,G,thr=1e-8,**kwargs):
+
+def plot2D_samples_mat(xs, xt, G, thr=1e-8, **kwargs):
""" Plot matrix M in 2D with lines using alpha values
-
- Plot lines between source and target 2D samples with a color
+
+ Plot lines between source and target 2D samples with a color
proportional to the value of the matrix G between samples.
-
-
+
+
Parameters
----------
-
- xs : np.array (ns,2)
+ xs : ndarray, shape (ns,2)
Source samples positions
- b : np.array (nt,2)
+ b : ndarray, shape (nt,2)
Target samples positions
- G : np.array (na,nb)
+ G : ndarray, shape (na,nb)
OT matrix
thr : float, optional
threshold above which the line is drawn
**kwargs : dict
- paameters given to the plot functions (default color is black if nothing given)
-
+        parameters given to the plot functions (default color is black if
+ nothing given)
"""
- if ('color' not in kwargs) and ('c' not in kwargs):
- kwargs['color']='k'
- mx=G.max()
+ if ('color' not in kwargs) and ('c' not in kwargs):
+ kwargs['color'] = 'k'
+ mx = G.max()
for i in range(xs.shape[0]):
for j in range(xt.shape[0]):
- if G[i,j]/mx>thr:
- pl.plot([xs[i,0],xt[j,0]],[xs[i,1],xt[j,1]],alpha=G[i,j]/mx,**kwargs)
- \ No newline at end of file
+ if G[i, j] / mx > thr:
+ pl.plot([xs[i, 0], xt[j, 0]], [xs[i, 1], xt[j, 1]],
+ alpha=G[i, j] / mx, **kwargs)
diff --git a/ot/utils.py b/ot/utils.py
index 7ad7637..31a002b 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -2,36 +2,50 @@
"""
 Various functions that can be useful
"""
+
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#
+# License: MIT License
+
+import multiprocessing
+from functools import reduce
+import time
+
import numpy as np
from scipy.spatial.distance import cdist
-import multiprocessing
+import sys
+import warnings
+
+
+__time_tic_toc = time.time()
-import time
-__time_tic_toc=time.time()
def tic():
""" Python implementation of Matlab tic() function """
global __time_tic_toc
- __time_tic_toc=time.time()
+ __time_tic_toc = time.time()
+
def toc(message='Elapsed time : {} s'):
""" Python implementation of Matlab toc() function """
- t=time.time()
- print(message.format(t-__time_tic_toc))
- return t-__time_tic_toc
+ t = time.time()
+ print(message.format(t - __time_tic_toc))
+ return t - __time_tic_toc
+
def toq():
""" Python implementation of Julia toc() function """
- t=time.time()
- return t-__time_tic_toc
+ t = time.time()
+ return t - __time_tic_toc
-def kernel(x1,x2,method='gaussian',sigma=1,**kwargs):
+def kernel(x1, x2, method='gaussian', sigma=1, **kwargs):
"""Compute kernel matrix"""
- if method.lower() in ['gaussian','gauss','rbf']:
- K=np.exp(-dist(x1,x2)/(2*sigma**2))
+ if method.lower() in ['gaussian', 'gauss', 'rbf']:
+ K = np.exp(-dist(x1, x2) / (2 * sigma**2))
return K
+
def unif(n):
""" return a uniform histogram of length n (simplex)
@@ -48,17 +62,19 @@ def unif(n):
"""
- return np.ones((n,))/n
+ return np.ones((n,)) / n
+
-def clean_zeros(a,b,M):
- """ Remove all components with zeros weights in a and b
+def clean_zeros(a, b, M):
+ """ Remove all components with zeros weights in a and b
"""
- M2=M[a>0,:][:,b>0].copy() # copy force c style matrix (froemd)
- a2=a[a>0]
- b2=b[b>0]
- return a2,b2,M2
+    M2 = M[a > 0, :][:, b > 0].copy()  # copy forces a C-style matrix (for emd)
+ a2 = a[a > 0]
+ b2 = b[b > 0]
+ return a2, b2, M2
-def dist(x1,x2=None,metric='sqeuclidean'):
+
+def dist(x1, x2=None, metric='sqeuclidean'):
"""Compute distance between samples in x1 and x2 using function scipy.spatial.distance.cdist
Parameters
@@ -84,12 +100,12 @@ def dist(x1,x2=None,metric='sqeuclidean'):
"""
if x2 is None:
- x2=x1
+ x2 = x1
- return cdist(x1,x2,metric=metric)
+ return cdist(x1, x2, metric=metric)
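A sketch of dist; any metric name accepted by scipy's cdist can be passed through:

    import numpy as np
    from ot.utils import dist

    x = np.random.randn(5, 3)
    y = np.random.randn(7, 3)
    M_sq = dist(x, y)                       # squared Euclidean by default
    M_eu = dist(x, y, metric='euclidean')   # any cdist metric name works
    M_in = dist(x)                          # x2 defaults to x1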
-def dist0(n,method='lin_square'):
+def dist0(n, method='lin_square'):
"""Compute standard cost matrices of size (n,n) for OT problems
Parameters
@@ -111,16 +127,50 @@ def dist0(n,method='lin_square'):
"""
- res=0
- if method=='lin_square':
- x=np.arange(n,dtype=np.float64).reshape((n,1))
- res=dist(x,x)
+ res = 0
+ if method == 'lin_square':
+ x = np.arange(n, dtype=np.float64).reshape((n, 1))
+ res = dist(x, x)
return res
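For instance, the values follow directly from the definition above:

    from ot.utils import dist0

    M = dist0(5)
    # M[i, j] == (i - j) ** 2, e.g. M[0] == [0., 1., 4., 9., 16.]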
+def cost_normalization(C, norm=None):
+ """ Apply normalization to the loss matrix
+
+
+ Parameters
+ ----------
+    C : ndarray, shape (n1, n2)
+ The cost matrix to normalize.
+ norm : str
+        type of normalization from 'median', 'max', 'log', 'loglog'. Any
+        other value leaves the matrix unchanged.
+
+
+ Returns
+ -------
+
+    C : ndarray, shape (n1, n2)
+        The input cost matrix normalized according to the given norm.
+
+ """
+
+ if norm == "median":
+ C /= float(np.median(C))
+ elif norm == "max":
+ C /= float(np.max(C))
+ elif norm == "log":
+ C = np.log(1 + C)
+ elif norm == "loglog":
+ C = np.log(1 + np.log(1 + C))
+
+ return C
+
+
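A sketch of the four normalizations; note that 'median' and 'max' divide the input array in place, so pass a copy when the original matrix must be preserved:

    import numpy as np
    from ot.utils import cost_normalization

    C = np.array([[1., 2.], [3., 4.]])
    C_max = cost_normalization(C.copy(), 'max')     # C / C.max()
    C_med = cost_normalization(C.copy(), 'median')  # C / median(C)
    C_log = cost_normalization(C.copy(), 'log')     # log(1 + C)
    C_ll = cost_normalization(C.copy(), 'loglog')   # log(1 + log(1 + C))
    C_id = cost_normalization(C.copy(), None)       # returned unchanged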
def dots(*args):
""" dots function for multiple matrix multiply """
- return reduce(np.dot,args)
+ return reduce(np.dot, args)
+
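A one-line illustration of dots:

    import numpy as np
    from ot.utils import dots

    A, B, C = np.eye(3), np.ones((3, 2)), np.ones((2, 4))
    assert dots(A, B, C).shape == (3, 4)  # same as A.dot(B).dot(C)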
def fun(f, q_in, q_out):
""" Utility function for parmap with no serializing problems """
@@ -130,6 +180,7 @@ def fun(f, q_in, q_out):
break
q_out.put((i, f(x)))
+
def parmap(f, X, nprocs=multiprocessing.cpu_count()):
""" paralell map for multiprocessing """
q_in = multiprocessing.Queue(1)
@@ -147,4 +198,241 @@ def parmap(f, X, nprocs=multiprocessing.cpu_count()):
[p.join() for p in proc]
- return [x for i, x in sorted(res)] \ No newline at end of file
+ return [x for i, x in sorted(res)]
+
+
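A sketch of parmap; the mapped function should be defined at module level and the call guarded by __main__, since worker processes are spawned:

    from ot.utils import parmap

    def square(x):
        return x * x

    if __name__ == '__main__':
        print(parmap(square, range(8), nprocs=2))
        # [0, 1, 4, 9, 16, 25, 36, 49] -- input order is preserved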
+def check_params(**kwargs):
+ """check_params: check whether some parameters are missing
+ """
+
+ missing_params = []
+ check = True
+
+ for param in kwargs:
+ if kwargs[param] is None:
+ missing_params.append(param)
+
+ if len(missing_params) > 0:
+ print("POT - Warning: following necessary parameters are missing")
+ for p in missing_params:
+ print("\n", p)
+
+ check = False
+
+ return check
+
+
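A sketch of how a fit method can use check_params as a guard (the fit signature here is illustrative, not the library's API):

    from ot.utils import check_params

    def fit(Xs=None, Xt=None):
        if not check_params(Xs=Xs, Xt=Xt):
            return None   # a warning listing the missing names was printed
        # ... both parameters are present, proceed

    fit(Xs=[[0.]])        # Xt is missing -> warns and returns None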
+class deprecated(object):
+ """Decorator to mark a function or class as deprecated.
+
+ deprecated class from scikit-learn package
+ https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/deprecation.py
+ Issue a warning when the function is called/the class is instantiated and
+ adds a warning to the docstring.
+ The optional extra argument will be appended to the deprecation message
+ and the docstring. Note: to use this with the default value for extra, put
+    in an empty pair of parentheses:
+    >>> from ot.utils import deprecated
+ >>> @deprecated()
+ ... def some_function(): pass
+
+ Parameters
+ ----------
+ extra : string
+ to be added to the deprecation messages
+ """
+
+ # Adapted from http://wiki.python.org/moin/PythonDecoratorLibrary,
+ # but with many changes.
+
+ def __init__(self, extra=''):
+ self.extra = extra
+
+ def __call__(self, obj):
+ """Call method
+ Parameters
+ ----------
+ obj : object
+ """
+ if isinstance(obj, type):
+ return self._decorate_class(obj)
+ else:
+ return self._decorate_fun(obj)
+
+ def _decorate_class(self, cls):
+ msg = "Class %s is deprecated" % cls.__name__
+ if self.extra:
+ msg += "; %s" % self.extra
+
+ # FIXME: we should probably reset __new__ for full generality
+ init = cls.__init__
+
+ def wrapped(*args, **kwargs):
+ warnings.warn(msg, category=DeprecationWarning)
+ return init(*args, **kwargs)
+
+ cls.__init__ = wrapped
+
+ wrapped.__name__ = '__init__'
+ wrapped.__doc__ = self._update_doc(init.__doc__)
+ wrapped.deprecated_original = init
+
+ return cls
+
+ def _decorate_fun(self, fun):
+ """Decorate function fun"""
+
+ msg = "Function %s is deprecated" % fun.__name__
+ if self.extra:
+ msg += "; %s" % self.extra
+
+ def wrapped(*args, **kwargs):
+ warnings.warn(msg, category=DeprecationWarning)
+ return fun(*args, **kwargs)
+
+ wrapped.__name__ = fun.__name__
+ wrapped.__dict__ = fun.__dict__
+ wrapped.__doc__ = self._update_doc(fun.__doc__)
+
+ return wrapped
+
+ def _update_doc(self, olddoc):
+ newdoc = "DEPRECATED"
+ if self.extra:
+ newdoc = "%s: %s" % (newdoc, self.extra)
+ if olddoc:
+ newdoc = "%s\n\n%s" % (newdoc, olddoc)
+ return newdoc
+
+
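A sketch of the decorator in action (the function name and message are illustrative):

    import warnings
    from ot.utils import deprecated

    @deprecated("use new_function instead")
    def old_function():
        return 42

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        old_function()    # emits a DeprecationWarning
    assert caught[0].category is DeprecationWarning
    # old_function.__doc__ now starts with "DEPRECATED: use new_function instead"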
+def _is_deprecated(func):
+ """Helper to check if func is wraped by our deprecated decorator"""
+ if sys.version_info < (3, 5):
+ raise NotImplementedError("This is only available for python3.5 "
+ "or above")
+ closures = getattr(func, '__closure__', [])
+ if closures is None:
+ closures = []
+ is_deprecated = ('deprecated' in ''.join([c.cell_contents
+ for c in closures
+ if isinstance(c.cell_contents, str)]))
+ return is_deprecated
+
+
+class BaseEstimator(object):
+ """Base class for most objects in POT
+ adapted from sklearn BaseEstimator class
+
+ Notes
+ -----
+ All estimators should specify all the parameters that can be set
+ at the class level in their ``__init__`` as explicit keyword
+ arguments (no ``*args`` or ``**kwargs``).
+ """
+
+ @classmethod
+ def _get_param_names(cls):
+ """Get parameter names for the estimator"""
+ try:
+ from inspect import signature
+ except ImportError:
+ from .externals.funcsigs import signature
+ # fetch the constructor or the original constructor before
+ # deprecation wrapping if any
+ init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
+ if init is object.__init__:
+ # No explicit constructor to introspect
+ return []
+
+ # introspect the constructor arguments to find the model parameters
+ # to represent
+ init_signature = signature(init)
+ # Consider the constructor parameters excluding 'self'
+ parameters = [p for p in init_signature.parameters.values()
+ if p.name != 'self' and p.kind != p.VAR_KEYWORD]
+ for p in parameters:
+ if p.kind == p.VAR_POSITIONAL:
+ raise RuntimeError("POT estimators should always "
+ "specify their parameters in the signature"
+ " of their __init__ (no varargs)."
+ " %s with constructor %s doesn't "
+                                   "follow this convention."
+ % (cls, init_signature))
+ # Extract and sort argument names excluding 'self'
+ return sorted([p.name for p in parameters])
+
+ def get_params(self, deep=True):
+ """Get parameters for this estimator.
+
+ Parameters
+ ----------
+ deep : boolean, optional
+ If True, will return the parameters for this estimator and
+ contained subobjects that are estimators.
+
+ Returns
+ -------
+ params : mapping of string to any
+ Parameter names mapped to their values.
+ """
+ out = dict()
+ for key in self._get_param_names():
+ # We need deprecation warnings to always be on in order to
+ # catch deprecated param values.
+ # This is set in utils/__init__.py but it gets overwritten
+ # when running under python3 somehow.
+ warnings.simplefilter("always", DeprecationWarning)
+ try:
+ with warnings.catch_warnings(record=True) as w:
+ value = getattr(self, key, None)
+ if len(w) and w[0].category == DeprecationWarning:
+ # if the parameter is deprecated, don't show it
+ continue
+ finally:
+ warnings.filters.pop(0)
+
+ # XXX: should we rather test if instance of estimator?
+ if deep and hasattr(value, 'get_params'):
+ deep_items = value.get_params().items()
+ out.update((key + '__' + k, val) for k, val in deep_items)
+ out[key] = value
+ return out
+
+ def set_params(self, **params):
+ """Set the parameters of this estimator.
+
+ The method works on simple estimators as well as on nested objects
+ (such as pipelines). The latter have parameters of the form
+ ``<component>__<parameter>`` so that it's possible to update each
+ component of a nested object.
+
+ Returns
+ -------
+ self
+ """
+ if not params:
+ # Simple optimisation to gain speed (inspect is slow)
+ return self
+ valid_params = self.get_params(deep=True)
+ # for key, value in iteritems(params):
+ for key, value in params.items():
+ split = key.split('__', 1)
+ if len(split) > 1:
+ # nested objects case
+ name, sub_name = split
+ if name not in valid_params:
+ raise ValueError('Invalid parameter %s for estimator %s. '
+ 'Check the list of available parameters '
+ 'with `estimator.get_params().keys()`.' %
+ (name, self))
+ sub_object = valid_params[name]
+ sub_object.set_params(**{sub_name: value})
+ else:
+ # simple objects case
+ if key not in valid_params:
+ raise ValueError('Invalid parameter %s for estimator %s. '
+ 'Check the list of available parameters '
+ 'with `estimator.get_params().keys()`.' %
+ (key, self.__class__.__name__))
+ setattr(self, key, value)
+ return self
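To round off, a sketch of the get_params/set_params machinery on a minimal subclass (the class and its parameters are illustrative, not part of the library):

    from ot.utils import BaseEstimator

    class ToyTransport(BaseEstimator):
        def __init__(self, reg=1., metric='sqeuclidean'):
            self.reg = reg
            self.metric = metric

    est = ToyTransport()
    print(est.get_params())   # {'metric': 'sqeuclidean', 'reg': 1.0}
    est.set_params(reg=10.)
    assert est.reg == 10.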