11 files changed, 412 insertions, 457 deletions
diff --git a/Makefile b/Makefile
index 4cdb7d1..89c30c3 100644
--- a/Makefile
+++ b/Makefile
@@ -74,4 +74,7 @@ autopep8 :
 aautopep8 :
 	autopep8 -air test ot examples --jobs -1
 
+pydocstyle :
+	pydocstyle ot
+
 FORCE :
diff --git a/ot/bregman.py b/ot/bregman.py
index 13dfa3b..f39145d 100644
--- a/ot/bregman.py
+++ b/ot/bregman.py
@@ -40,12 +40,12 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000,
 
     Parameters
     ----------
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
+    b : ndarray, shape (nt,) or ndarray, shape (nt, nbb)
         samples in the target domain, compute sinkhorn with multiple targets
         and fixed M if b is a matrix (return OT loss + dual variables in log)
-    M : np.ndarray (ns,nt)
+    M : ndarray, shape (ns, nt)
         loss matrix
     reg : float
         Regularization term >0
@@ -64,7 +64,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000,
 
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
@@ -155,12 +155,12 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000,
 
     Parameters
     ----------
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
+    b : ndarray, shape (nt,) or ndarray, shape (nt, nbb)
         samples in the target domain, compute sinkhorn with multiple targets
         and fixed M if b is a matrix (return OT loss + dual variables in log)
-    M : np.ndarray (ns,nt)
+    M : ndarray, shape (ns, nt)
         loss matrix
     reg : float
         Regularization term >0
@@ -176,7 +176,6 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000,
     log : bool, optional
         record log if True
 
-
     Returns
     -------
     W : (nt) ndarray or float
@@ -272,12 +271,12 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000,
 
     Parameters
     ----------
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
+    b : ndarray, shape (nt,) or ndarray, shape (nt, nbb)
         samples in the target domain, compute sinkhorn with multiple targets
         and fixed M if b is a matrix (return OT loss + dual variables in log)
-    M : np.ndarray (ns,nt)
+    M : ndarray, shape (ns, nt)
         loss matrix
     reg : float
         Regularization term >0
@@ -290,10 +289,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000,
     log : bool, optional
         record log if True
 
-
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
@@ -453,12 +451,12 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log=
 
     Parameters
     ----------
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
+    b : ndarray, shape (nt,) or ndarray, shape (nt, nbb)
         samples in the target domain, compute sinkhorn with multiple targets
         and fixed M if b is a matrix (return OT loss + dual variables in log)
-    M : np.ndarray (ns,nt)
+    M : ndarray, shape (ns, nt)
         loss matrix
     reg : float
         Regularization term >0
@@ -469,10 +467,9 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log=
     log : bool, optional
         record log if True
 
-
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
@@ -602,11 +599,11 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9,
 
     Parameters
     ----------
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,)
+    b : ndarray, shape (nt,)
         samples in the target domain
-    M : np.ndarray (ns,nt)
+    M : ndarray, shape (ns, nt)
         loss matrix
     reg : float
         Regularization term >0
@@ -623,10 +620,9 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9,
     log : bool, optional
         record log if True
 
-
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
@@ -823,11 +819,11 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne
 
     Parameters
     ----------
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,)
+    b : ndarray, shape (nt,)
         samples in the target domain
-    M : np.ndarray (ns,nt)
+    M : ndarray, shape (ns, nt)
         loss matrix
     reg : float
         Regularization term >0
@@ -835,7 +831,7 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne
         thershold for max value in u or v for log scaling
     tau : float
         thershold for max value in u or v for log scaling
-    warmstart : tible of vectors
+    warmstart : tuple of vectors
         if given then sarting values for alpha an beta log scalings
     numItermax : int, optional
         Max number of iterations
@@ -850,10 +846,9 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne
     log : bool, optional
         record log if True
 
-
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
@@ -1006,13 +1001,13 @@ def barycenter(A, M, reg, weights=None, numItermax=1000,
 
     Parameters
     ----------
-    A : np.ndarray (d,n)
+    A : ndarray, shape (d,n)
         n training distributions a_i of size d
-    M : np.ndarray (d,d)
+    M : ndarray, shape (d,d)
         loss matrix   for OT
     reg : float
         Regularization term >0
-    weights : np.ndarray (n,)
+    weights : ndarray, shape (n,)
         Weights of each histogram a_i on the simplex (barycentric coodinates)
     numItermax : int, optional
         Max number of iterations
@@ -1102,11 +1097,11 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1
 
     Parameters
     ----------
-    A : np.ndarray (n,w,h)
+    A : ndarray, shape (n, w, h)
         n distributions (2D images) of size w x h
     reg : float
         Regularization term >0
-    weights : np.ndarray (n,)
+    weights : ndarray, shape (n,)
         Weights of each image on the simplex (barycentric coodinates)
     numItermax : int, optional
         Max number of iterations
@@ -1119,15 +1114,13 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1
     log : bool, optional
         record log if True
 
-
     Returns
     -------
-    a : (w,h) ndarray
+    a : ndarray, shape (w, h)
         2D Wasserstein barycenter
     log : dict
         log dictionary return only if log==True in parameters
 
-
     References
     ----------
 
@@ -1217,15 +1210,15 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000,
 
     Parameters
     ----------
-    a : np.ndarray (d)
+    a : ndarray, shape (d)
         observed distribution
-    D : np.ndarray (d,n)
+    D : ndarray, shape (d, n)
         dictionary matrix
-    M : np.ndarray (d,d)
+    M : ndarray, shape (d, d)
         loss matrix
-    M0 : np.ndarray (n,n)
+    M0 : ndarray, shape (n, n)
         loss matrix
-    h0 : np.ndarray (n,)
+    h0 : ndarray, shape (n,)
         prior on h
     reg : float
         Regularization term >0 (Wasserstein data fitting)
@@ -1245,7 +1238,7 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000,
 
     Returns
     -------
-    a : (d,) ndarray
+    a : ndarray, shape (d,)
         Wasserstein barycenter
     log : dict
         log dictionary return only if log==True in parameters
@@ -1325,15 +1318,15 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numI
 
     Parameters
     ----------
-    X_s : np.ndarray (ns, d)
+    X_s : ndarray, shape (ns, d)
         samples in the source domain
-    X_t : np.ndarray (nt, d)
+    X_t : ndarray, shape (nt, d)
         samples in the target domain
     reg : float
         Regularization term >0
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,)
+    b : ndarray, shape (nt,)
         samples weights in the target domain
     numItermax : int, optional
         Max number of iterations
@@ -1347,7 +1340,7 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numI
 
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Regularized optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
@@ -1415,15 +1408,15 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num
 
     Parameters
     ----------
-    X_s : np.ndarray (ns, d)
+    X_s : ndarray, shape (ns, d)
         samples in the source domain
-    X_t : np.ndarray (nt, d)
+    X_t : ndarray, shape (nt, d)
         samples in the target domain
     reg : float
         Regularization term >0
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,)
+    b : ndarray, shape (nt,)
         samples weights in the target domain
     numItermax : int, optional
         Max number of iterations
@@ -1437,7 +1430,7 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num
 
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Regularized optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
@@ -1523,15 +1516,15 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli
 
     Parameters
     ----------
-    X_s : np.ndarray (ns, d)
+    X_s : ndarray, shape (ns, d)
         samples in the source domain
-    X_t : np.ndarray (nt, d)
+    X_t : ndarray, shape (nt, d)
         samples in the target domain
     reg : float
         Regularization term >0
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,)
+    b : ndarray, shape (nt,)
         samples weights in the target domain
     numItermax : int, optional
         Max number of iterations
@@ -1542,17 +1535,15 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli
     log : bool, optional
         record log if True
 
-
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Regularized optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
 
     Examples
     --------
-
     >>> n_s = 2
     >>> n_t = 4
     >>> reg = 0.1
@@ -1564,7 +1555,6 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli
 
     References
     ----------
-
     .. [23] Aude Genevay, Gabriel Peyré, Marco Cuturi, Learning Generative Models with Sinkhorn Divergences,  Proceedings of the Twenty-First International Conference on Artficial Intelligence and Statistics, (AISTATS) 21, 2018
     '''
     if log:
diff --git a/ot/datasets.py b/ot/datasets.py
index e76e75d..ba0cfd9 100644
--- a/ot/datasets.py
+++ b/ot/datasets.py
@@ -17,7 +17,6 @@ def make_1D_gauss(n, m, s):
 
     Parameters
     ----------
-
     n : int
         number of bins in the histogram
     m : float
@@ -25,12 +24,10 @@ def make_1D_gauss(n, m, s):
     s : float
         standard deviaton of the gaussian distribution
 
-
     Returns
     -------
-    h : np.array (n,)
-          1D histogram for a gaussian distribution
-
+    h : ndarray (n,)
+        1D histogram for a gaussian distribution
     """
     x = np.arange(n, dtype=np.float64)
     h = np.exp(-(x - m)**2 / (2 * s**2))
@@ -44,16 +41,15 @@ def get_1D_gauss(n, m, sigma):
 
 
 def make_2D_samples_gauss(n, m, sigma, random_state=None):
-    """return n samples drawn from 2D gaussian N(m,sigma)
+    """Return n samples drawn from 2D gaussian N(m,sigma)
 
     Parameters
     ----------
-
     n : int
         number of samples to make
-    m : np.array (2,)
+    m : ndarray, shape (2,)
         mean value of the gaussian distribution
-    sigma : np.array (2,2)
+    sigma : ndarray, shape (2, 2)
         covariance matrix of the gaussian distribution
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
@@ -63,9 +59,8 @@ def make_2D_samples_gauss(n, m, sigma, random_state=None):
 
     Returns
     -------
-    X : np.array (n,2)
-          n samples drawn from  N(m,sigma)
-
+    X : ndarray, shape (n, 2)
+        n samples drawn from N(m, sigma).
     """
 
     generator = check_random_state(random_state)
@@ -86,11 +81,10 @@ def get_2D_samples_gauss(n, m, sigma, random_state=None):
 
 
 def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs):
-    """ dataset generation for classification problems
+    """Dataset generation for classification problems
 
     Parameters
     ----------
-
     dataset : str
         type of classification problem (see code)
     n : int
@@ -105,13 +99,11 @@ def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs):
 
     Returns
     -------
-    X : np.array (n,d)
-          n observation of size d
-    y : np.array (n,)
-          labels of the samples
-
+    X : ndarray, shape (n, d)
+        n observation of size d
+    y : ndarray, shape (n,)
+        labels of the samples.
     """
-
     generator = check_random_state(random_state)
 
     if dataset.lower() == '3gauss':
diff --git a/ot/dr.py b/ot/dr.py
index d2bf6e2..680dabf 100644
--- a/ot/dr.py
+++ b/ot/dr.py
@@ -49,30 +49,25 @@ def split_classes(X, y):
 
 
 def fda(X, y, p=2, reg=1e-16):
-    """
-    Fisher Discriminant Analysis
-
+    """Fisher Discriminant Analysis
 
     Parameters
     ----------
-    X : numpy.ndarray (n,d)
-        Training samples
-    y : np.ndarray (n,)
-        labels for training samples
+    X : ndarray, shape (n, d)
+        Training samples.
+    y : ndarray, shape (n,)
+        Labels for training samples.
     p : int, optional
-        size of dimensionnality reduction
+        Size of dimensionnality reduction.
     reg : float, optional
         Regularization term >0 (ridge regularization)
 
-
     Returns
     -------
-    P : (d x p) ndarray
+    P : ndarray, shape (d, p)
         Optimal transportation matrix for the given parameters
-    proj : fun
+    proj : callable
         projection function including mean centering
-
-
     """
 
     mx = np.mean(X)
@@ -130,37 +125,33 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None):
 
     Parameters
     ----------
-    X : numpy.ndarray (n,d)
-        Training samples
-    y : np.ndarray (n,)
-        labels for training samples
+    X : ndarray, shape (n, d)
+        Training samples.
+    y : ndarray, shape (n,)
+        Labels for training samples.
     p : int, optional
-        size of dimensionnality reduction
+        Size of dimensionnality reduction.
     reg : float, optional
         Regularization term >0 (entropic regularization)
-    solver : str, optional
-        None for steepest decsent or 'TrustRegions' for trust regions algorithm
-        else shoudl be a pymanopt.solvers
-    P0 : numpy.ndarray (d,p)
-        Initial starting point for projection
+    solver : None | str, optional
+        None for steepest descent or 'TrustRegions' for trust regions algorithm
+        else should be a pymanopt.solvers
+    P0 : ndarray, shape (d, p)
+        Initial starting point for projection.
     verbose : int, optional
-        Print information along iterations
-
-
+        Print information along iterations.
 
     Returns
     -------
-    P : (d x p) ndarray
+    P : ndarray, shape (d, p)
         Optimal transportation matrix for the given parameters
-    proj : fun
-        projection function including mean centering
-
+    proj : callable
+        Projection function including mean centering.
 
     References
     ----------
-
-    .. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016). Wasserstein Discriminant Analysis. arXiv preprint arXiv:1608.08063.
-
+    .. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016).
+            Wasserstein Discriminant Analysis. arXiv preprint arXiv:1608.08063.
     """  # noqa
 
     mx = np.mean(X)
diff --git a/ot/gromov.py b/ot/gromov.py
index 3a7e24c..699ae4c 100644
--- a/ot/gromov.py
+++ b/ot/gromov.py
@@ -1,9 +1,6 @@
-
 # -*- coding: utf-8 -*-
 """
 Gromov-Wasserstein transport method
-
-
 """
 
 # Author: Erwan Vautier <erwan.vautier@gmail.com>
@@ -22,7 +19,7 @@ from .optim import cg
 
 
 def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
-    """ Return loss matrices and tensors for Gromov-Wasserstein fast computation
+    """Return loss matrices and tensors for Gromov-Wasserstein fast computation
 
     Returns the value of \mathcal{L}(C1,C2) \otimes T with the selected loss
     function as the loss function of Gromow-Wasserstein discrepancy.
@@ -51,23 +48,21 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
     Parameters
     ----------
     C1 : ndarray, shape (ns, ns)
-         Metric cost matrix in the source space
+        Metric cost matrix in the source space
     C2 : ndarray, shape (nt, nt)
-         Metric costfr matrix in the target space
+        Metric costfr matrix in the target space
     T :  ndarray, shape (ns, nt)
-         Coupling between source and target spaces
+        Coupling between source and target spaces
     p : ndarray, shape (ns,)
 
-
     Returns
     -------
-
     constC : ndarray, shape (ns, nt)
-           Constant C matrix in Eq. (6)
+        Constant C matrix in Eq. (6)
     hC1 : ndarray, shape (ns, ns)
-           h1(C1) matrix in Eq. (6)
+        h1(C1) matrix in Eq. (6)
     hC2 : ndarray, shape (nt, nt)
-           h2(C) matrix in Eq. (6)
+        h2(C) matrix in Eq. (6)
 
     References
     ----------
@@ -114,25 +109,23 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
 
 
 def tensor_product(constC, hC1, hC2, T):
-    """ Return the tensor for Gromov-Wasserstein fast computation
+    """Return the tensor for Gromov-Wasserstein fast computation
 
     The tensor is computed as described in Proposition 1 Eq. (6) in [12].
 
     Parameters
     ----------
     constC : ndarray, shape (ns, nt)
-           Constant C matrix in Eq. (6)
+        Constant C matrix in Eq. (6)
     hC1 : ndarray, shape (ns, ns)
-           h1(C1) matrix in Eq. (6)
+        h1(C1) matrix in Eq. (6)
     hC2 : ndarray, shape (nt, nt)
-           h2(C) matrix in Eq. (6)
-
+        h2(C) matrix in Eq. (6)
 
     Returns
     -------
-
     tens : ndarray, shape (ns, nt)
-           \mathcal{L}(C1,C2) \otimes T tensor-matrix multiplication result
+        \mathcal{L}(C1,C2) \otimes T tensor-matrix multiplication result
 
     References
     ----------
@@ -148,26 +141,25 @@ def tensor_product(constC, hC1, hC2, T):
 
 
 def gwloss(constC, hC1, hC2, T):
-    """ Return the Loss for Gromov-Wasserstein
+    """Return the Loss for Gromov-Wasserstein
 
     The loss is computed as described in Proposition 1 Eq. (6) in [12].
 
     Parameters
     ----------
     constC : ndarray, shape (ns, nt)
-           Constant C matrix in Eq. (6)
+        Constant C matrix in Eq. (6)
     hC1 : ndarray, shape (ns, ns)
-           h1(C1) matrix in Eq. (6)
+        h1(C1) matrix in Eq. (6)
     hC2 : ndarray, shape (nt, nt)
-           h2(C) matrix in Eq. (6)
+        h2(C) matrix in Eq. (6)
     T : ndarray, shape (ns, nt)
-           Current value of transport matrix T
+        Current value of transport matrix T
 
     Returns
     -------
-
     loss : float
-           Gromov Wasserstein loss
+        Gromov Wasserstein loss
 
     References
     ----------
@@ -183,24 +175,23 @@ def gwloss(constC, hC1, hC2, T):
 
 
 def gwggrad(constC, hC1, hC2, T):
-    """ Return the gradient for Gromov-Wasserstein
+    """Return the gradient for Gromov-Wasserstein
 
     The gradient is computed as described in Proposition 2 in [12].
 
     Parameters
     ----------
     constC : ndarray, shape (ns, nt)
-           Constant C matrix in Eq. (6)
+        Constant C matrix in Eq. (6)
     hC1 : ndarray, shape (ns, ns)
-           h1(C1) matrix in Eq. (6)
+        h1(C1) matrix in Eq. (6)
     hC2 : ndarray, shape (nt, nt)
-           h2(C) matrix in Eq. (6)
+        h2(C) matrix in Eq. (6)
     T : ndarray, shape (ns, nt)
-           Current value of transport matrix T
+        Current value of transport matrix T
 
     Returns
     -------
-
     grad : ndarray, shape (ns, nt)
            Gromov Wasserstein gradient
 
@@ -222,19 +213,19 @@ def update_square_loss(p, lambdas, T, Cs):
 
     Parameters
     ----------
-    p  : ndarray, shape (N,)
-         masses in the targeted barycenter
+    p : ndarray, shape (N,)
+        Masses in the targeted barycenter.
     lambdas : list of float
-              list of the S spaces' weights
-    T : list of S np.ndarray(ns,N)
-        the S Ts couplings calculated at each iteration
+        List of the S spaces' weights.
+    T : list of S np.ndarray of shape (ns,N)
+        The S Ts couplings calculated at each iteration.
     Cs : list of S ndarray, shape(ns,ns)
-         Metric cost matrices
+        Metric cost matrices.
 
     Returns
     ----------
-    C : ndarray, shape (nt,nt)
-        updated C matrix
+    C : ndarray, shape (nt, nt)
+        Updated C matrix.
     """
     tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s])
                   for s in range(len(T))])
@@ -251,12 +242,12 @@ def update_kl_loss(p, lambdas, T, Cs):
     Parameters
     ----------
     p  : ndarray, shape (N,)
-         weights in the targeted barycenter
+        Weights in the targeted barycenter.
     lambdas : list of the S spaces' weights
-    T : list of S np.ndarray(ns,N)
-        the S Ts couplings calculated at each iteration
+    T : list of S np.ndarray of shape (ns,N)
+        The S Ts couplings calculated at each iteration.
     Cs : list of S ndarray, shape(ns,ns)
-         Metric cost matrices
+        Metric cost matrices.
 
     Returns
     ----------
@@ -290,14 +281,14 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
     Parameters
     ----------
     C1 : ndarray, shape (ns, ns)
-         Metric cost matrix in the source space
+        Metric cost matrix in the source space
     C2 : ndarray, shape (nt, nt)
-         Metric costfr matrix in the target space
-    p :  ndarray, shape (ns,)
-         distribution in the source space
-    q :  ndarray, shape (nt,)
-         distribution in the target space
-    loss_fun :  string
+        Metric costfr matrix in the target space
+    p : ndarray, shape (ns,)
+        Distribution in the source space
+    q : ndarray, shape (nt,)
+        Distribution in the target space
+    loss_fun : str
         loss function used for the solver either 'square_loss' or 'kl_loss'
 
     max_iter : int, optional
@@ -317,10 +308,10 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
     Returns
     -------
     T : ndarray, shape (ns, nt)
-        coupling between the two spaces that minimizes :
+        Doupling between the two spaces that minimizes:
             \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
     log : dict
-        convergence information and loss
+        Convergence information and loss.
 
     References
     ----------
@@ -374,18 +365,18 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
 
     Parameters
     ----------
-    M  : ndarray, shape (ns, nt)
-         Metric cost matrix between features across domains
+    M : ndarray, shape (ns, nt)
+        Metric cost matrix between features across domains
     C1 : ndarray, shape (ns, ns)
-         Metric cost matrix representative of the structure in the source space
+        Metric cost matrix representative of the structure in the source space
     C2 : ndarray, shape (nt, nt)
-         Metric cost matrix representative of the structure in the target space
-    p :  ndarray, shape (ns,)
-         distribution in the source space
-    q :  ndarray, shape (nt,)
-         distribution in the target space
-    loss_fun :  string,optional
-        loss function used for the solver
+        Metric cost matrix representative of the structure in the target space
+    p : ndarray, shape (ns,)
+        Distribution in the source space
+    q : ndarray, shape (nt,)
+        Distribution in the target space
+    loss_fun : str, optional
+        Loss function used for the solver
     max_iter : int, optional
         Max number of iterations
     tol : float, optional
@@ -402,11 +393,10 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
 
     Returns
     -------
-    gamma : (ns x nt) ndarray
-        Optimal transportation matrix for the given parameters
+    gamma : ndarray, shape (ns, nt)
+        Optimal transportation matrix for the given parameters.
     log : dict
-        log dictionary return only if log==True in parameters
-
+        Log dictionary return only if log==True in parameters.
 
     References
     ----------
@@ -414,7 +404,6 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
         and Courty Nicolas "Optimal Transport for structured data with
         application on graphs", International Conference on Machine Learning
         (ICML). 2019.
-
     """
 
     constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -457,18 +446,18 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5
 
     Parameters
     ----------
-    M  : ndarray, shape (ns, nt)
-         Metric cost matrix between features across domains
+    M : ndarray, shape (ns, nt)
+        Metric cost matrix between features across domains
     C1 : ndarray, shape (ns, ns)
-         Metric cost matrix respresentative of the structure in the source space
+        Metric cost matrix respresentative of the structure in the source space.
     C2 : ndarray, shape (nt, nt)
-         Metric cost matrix espresentative of the structure in the target space
+        Metric cost matrix espresentative of the structure in the target space.
     p :  ndarray, shape (ns,)
-         distribution in the source space
+        Distribution in the source space.
     q :  ndarray, shape (nt,)
-         distribution in the target space
-    loss_fun :  string,optional
-        loss function used for the solver
+        Distribution in the target space.
+    loss_fun : str, optional
+        Loss function used for the solver.
     max_iter : int, optional
         Max number of iterations
     tol : float, optional
@@ -476,19 +465,19 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5
     verbose : bool, optional
         Print information along iterations
     log : bool, optional
-        record log if True
+        Record log if True.
     armijo : bool, optional
-        If True the steps of the line-search is found via an armijo research. Else closed form is used.
-        If there is convergence issues use False.
+        If True the steps of the line-search is found via an armijo research.
+        Else closed form is used. If there is convergence issues use False.
     **kwargs : dict
-        parameters can be directly pased to the ot.optim.cg solver
+        Parameters can be directly pased to the ot.optim.cg solver.
 
     Returns
     -------
-    gamma : (ns x nt) ndarray
-        Optimal transportation matrix for the given parameters
+    gamma : ndarray, shape (ns, nt)
+        Optimal transportation matrix for the given parameters.
     log : dict
-        log dictionary return only if log==True in parameters
+        Log dictionary return only if log==True in parameters.
 
     References
     ----------
@@ -537,16 +526,15 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
     Parameters
     ----------
     C1 : ndarray, shape (ns, ns)
-         Metric cost matrix in the source space
+        Metric cost matrix in the source space
     C2 : ndarray, shape (nt, nt)
-         Metric cost matrix in the target space
-    p :  ndarray, shape (ns,)
-         distribution in the source space
+        Metric cost matrix in the target space
+    p : ndarray, shape (ns,)
+        Distribution in the source space.
     q :  ndarray, shape (nt,)
-         distribution in the target space
-    loss_fun :  string
+        Distribution in the target space.
+    loss_fun :  str
         loss function used for the solver either 'square_loss' or 'kl_loss'
-
     max_iter : int, optional
         Max number of iterations
     tol : float, optional
@@ -558,6 +546,7 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
     armijo : bool, optional
         If True the steps of the line-search is found via an armijo research. Else closed form is used.
         If there is convergence issues use False.
+
     Returns
     -------
     gw_dist : float
@@ -624,25 +613,25 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
     Parameters
     ----------
     C1 : ndarray, shape (ns, ns)
-         Metric cost matrix in the source space
+        Metric cost matrix in the source space
     C2 : ndarray, shape (nt, nt)
-         Metric costfr matrix in the target space
+        Metric costfr matrix in the target space
     p :  ndarray, shape (ns,)
-         distribution in the source space
+        Distribution in the source space
     q :  ndarray, shape (nt,)
-         distribution in the target space
+        Distribution in the target space
     loss_fun :  string
-        loss function used for the solver either 'square_loss' or 'kl_loss'
+        Loss function used for the solver either 'square_loss' or 'kl_loss'
     epsilon : float
         Regularization term >0
     max_iter : int, optional
-       Max number of iterations
+        Max number of iterations
     tol : float, optional
         Stop threshold on error (>0)
     verbose : bool, optional
         Print information along iterations
     log : bool, optional
-        record log if True
+        Record log if True.
 
     Returns
     -------
@@ -725,15 +714,15 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
     Parameters
     ----------
     C1 : ndarray, shape (ns, ns)
-         Metric cost matrix in the source space
+        Metric cost matrix in the source space
     C2 : ndarray, shape (nt, nt)
-         Metric costfr matrix in the target space
+        Metric costfr matrix in the target space
     p :  ndarray, shape (ns,)
-         distribution in the source space
+        Distribution in the source space
     q :  ndarray, shape (nt,)
-         distribution in the target space
-    loss_fun :  string
-        loss function used for the solver either 'square_loss' or 'kl_loss'
+        Distribution in the target space
+    loss_fun : str
+        Loss function used for the solver either 'square_loss' or 'kl_loss'
     epsilon : float
         Regularization term >0
     max_iter : int, optional
@@ -743,7 +732,7 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
     verbose : bool, optional
         Print information along iterations
     log : bool, optional
-        record log if True
+        Record log if True.
 
     Returns
     -------
@@ -757,7 +746,6 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
         International Conference on Machine Learning (ICML). 2016.
 
     """
-
     gw, logv = entropic_gromov_wasserstein(
         C1, C2, p, q, loss_fun, epsilon, max_iter, tol, verbose, log=True)
 
@@ -789,19 +777,21 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
 
     Parameters
     ----------
-    N  : Integer
-         Size of the targeted barycenter
-    Cs : list of S np.ndarray(ns,ns)
-         Metric cost matrices
-    ps : list of S np.ndarray(ns,)
-         sample weights in the S spaces
-    p  : ndarray, shape(N,)
-         weights in the targeted barycenter
+    N : int
+        Size of the targeted barycenter
+    Cs : list of S np.ndarray of shape (ns,ns)
+        Metric cost matrices
+    ps : list of S np.ndarray of shape (ns,)
+        Sample weights in the S spaces
+    p : ndarray, shape(N,)
+        Weights in the targeted barycenter
     lambdas : list of float
-              list of the S spaces' weights
-    loss_fun :  tensor-matrix multiplication function based on specific loss function
-    update : function(p,lambdas,T,Cs) that updates C according to a specific Kernel
-             with the S Ts couplings calculated at each iteration
+        List of the S spaces' weights.
+    loss_fun : callable
+        Tensor-matrix multiplication function based on specific loss function.
+    update : callable
+        function(p,lambdas,T,Cs) that updates C according to a specific Kernel
+        with the S Ts couplings calculated at each iteration
     epsilon : float
         Regularization term >0
     max_iter : int, optional
@@ -809,11 +799,11 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
     tol : float, optional
         Stop threshol on error (>0)
     verbose : bool, optional
-        Print information along iterations
+        Print information along iterations.
     log : bool, optional
-        record log if True
-    init_C : bool, ndarray, shape(N,N)
-             random initial value for the C matrix provided by user
+        Record log if True.
+    init_C : bool | ndarray, shape (N, N)
+        Random initial value for the C matrix provided by user.
 
     Returns
     -------
@@ -825,7 +815,6 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
         "Gromov-Wasserstein averaging of kernel and distance matrices."
         International Conference on Machine Learning (ICML). 2016.
-
     """
 
     S = len(Cs)
@@ -835,6 +824,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
 
     # Initialization of C : random SPD matrix (if not provided by user)
     if init_C is None:
+        # XXX use random state
         xalea = np.random.randn(N, 2)
         C = dist(xalea, xalea)
         C /= C.max()
@@ -846,7 +836,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
 
     error = []
 
-    while(err > tol and cpt < max_iter):
+    while (err > tol) and (cpt < max_iter):
         Cprev = C
 
         T = [entropic_gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, epsilon,
@@ -890,7 +880,6 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
     .. math::
         C = argmin_C\in R^NxN \sum_s \lambda_s GW(C,Cs,p,ps)
 
-
     Where :
 
     - Cs : metric cost matrix
@@ -898,29 +887,29 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
 
     Parameters
     ----------
-    N  : Integer
-         Size of the targeted barycenter
-    Cs : list of S np.ndarray(ns,ns)
-         Metric cost matrices
-    ps : list of S np.ndarray(ns,)
-         sample weights in the S spaces
-    p  : ndarray, shape(N,)
-         weights in the targeted barycenter
+    N : int
+        Size of the targeted barycenter
+    Cs : list of S np.ndarray of shape (ns, ns)
+        Metric cost matrices
+    ps : list of S np.ndarray of shape (ns,)
+        Sample weights in the S spaces
+    p : ndarray, shape (N,)
+        Weights in the targeted barycenter
     lambdas : list of float
-              list of the S spaces' weights
+        List of the S spaces' weights
     loss_fun :  tensor-matrix multiplication function based on specific loss function
     update : function(p,lambdas,T,Cs) that updates C according to a specific Kernel
              with the S Ts couplings calculated at each iteration
     max_iter : int, optional
         Max number of iterations
     tol : float, optional
-        Stop threshol on error (>0)
+        Stop threshol on error (>0).
     verbose : bool, optional
-        Print information along iterations
+        Print information along iterations.
     log : bool, optional
-        record log if True
-    init_C : bool, ndarray, shape(N,N)
-             random initial value for the C matrix provided by user
+        Record log if True.
+    init_C : bool | ndarray, shape(N,N)
+        Random initial value for the C matrix provided by user.
 
     Returns
     -------
@@ -934,7 +923,6 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
         International Conference on Machine Learning (ICML). 2016.
 
     """
-
     S = len(Cs)
 
     Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
@@ -942,6 +930,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
 
     # Initialization of C : random SPD matrix (if not provided by user)
     if init_C is None:
+        # XXX : should use a random state and not use the global seed
         xalea = np.random.randn(N, 2)
         C = dist(xalea, xalea)
         C /= C.max()
@@ -987,8 +976,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
 def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False,
                     p=None, loss_fun='square_loss', max_iter=100, tol=1e-9,
                     verbose=False, log=False, init_C=None, init_X=None):
-    """
-    Compute the fgw barycenter as presented eq (5) in [24].
+    """Compute the fgw barycenter as presented eq (5) in [24].
 
     Parameters
     ----------
@@ -997,30 +985,32 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
     Ys: list of ndarray, each element has shape (ns,d)
         Features of all samples
     Cs : list of ndarray, each element has shape (ns,ns)
-         Structure matrices of all samples
+        Structure matrices of all samples
     ps : list of ndarray, each element has shape (ns,)
-        masses of all samples
+        Masses of all samples.
     lambdas : list of float
-              list of the S spaces' weights
+        List of the S spaces' weights
     alpha : float
-            Alpha parameter for the fgw distance
-    fixed_structure :  bool
-                       Wether to fix the structure of the barycenter during the updates
-    fixed_features :  bool
-                       Wether to fix the feature of the barycenter during the updates
-    init_C :  ndarray, shape (N,N), optional
-              initialization for the barycenters' structure matrix. If not set random init
-    init_X :  ndarray, shape (N,d), optional
-              initialization for the barycenters' features. If not set random init
+        Alpha parameter for the fgw distance
+    fixed_structure : bool
+        Whether to fix the structure of the barycenter during the updates
+    fixed_features : bool
+        Whether to fix the feature of the barycenter during the updates
+    init_C : ndarray, shape (N,N), optional
+        Initialization for the barycenters' structure matrix. If not set
+        a random init is used.
+    init_X : ndarray, shape (N,d), optional
+        Initialization for the barycenters' features. If not set a
+        random init is used.
 
     Returns
     -------
-    X : ndarray, shape (N,d)
+    X : ndarray, shape (N, d)
         Barycenters' features
-    C : ndarray, shape (N,N)
+    C : ndarray, shape (N, N)
         Barycenters' structure matrix
-    log_: dictionary
-        Only returned when log=True
+    log_: dict
+        Only returned when log=True. It contains the keys:
         T : list of (N,ns) transport matrices
         Ms : all distance matrices between the feature of the barycenter and the
         other features dist(X,Ys) shape (N,ns)
@@ -1032,7 +1022,6 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
         "Optimal Transport for structured data with application on graphs"
         International Conference on Machine Learning (ICML). 2019.
     """
-
     S = len(Cs)
     d = Ys[0].shape[1]  # dimension on the node features
     if p is None:
@@ -1095,7 +1084,8 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
                 T_temp = [t.T for t in T]
                 C = update_sructure_matrix(p, lambdas, T_temp, Cs)
 
-        T = [fused_gromov_wasserstein((1 - alpha) * Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha, numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
+        T = [fused_gromov_wasserstein((1 - alpha) * Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
+                                      numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
 
         # T is N,ns
         err_feature = np.linalg.norm(X - Xprev.reshape(N, d))
@@ -1114,6 +1104,7 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
             print('{:5d}|{:8e}|'.format(cpt, err_feature))
 
         cpt += 1
+
     if log:
         log_['T'] = T  # from target to Ys
         log_['p'] = p
@@ -1126,25 +1117,25 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
 
 
 def update_sructure_matrix(p, lambdas, T, Cs):
-    """
-    Updates C according to the L2 Loss kernel with the S Ts couplings
-    calculated at each iteration
+    """Updates C according to the L2 Loss kernel with the S Ts couplings.
+
+    It is calculated at each iteration
 
     Parameters
     ----------
-    p  : ndarray, shape (N,)
-         masses in the targeted barycenter
+    p : ndarray, shape (N,)
+        Masses in the targeted barycenter.
     lambdas : list of float
-              list of the S spaces' weights
-    T : list of S np.ndarray(ns,N)
-        the S Ts couplings calculated at each iteration
-    Cs : list of S ndarray, shape(ns,ns)
-         Metric cost matrices
+        List of the S spaces' weights.
+    T : list of S ndarray of shape (ns, N)
+        The S Ts couplings calculated at each iteration.
+    Cs : list of S ndarray, shape (ns, ns)
+         Metric cost matrices.
 
     Returns
-    ----------
-    C : ndarray, shape (nt,nt)
-        updated C matrix
+    -------
+    C : ndarray, shape (nt, nt)
+        Updated C matrix.
     """
     tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s]) for s in range(len(T))])
     ppt = np.outer(p, p)
@@ -1153,24 +1144,26 @@ def update_sructure_matrix(p, lambdas, T, Cs):
 
 
 def update_feature_matrix(lambdas, Ys, Ts, p):
-    """
-    Updates the feature with respect to the S Ts couplings. See "Solving the barycenter problem with Block Coordinate Descent (BCD)" in [24]
-    calculated at each iteration
+    """Updates the feature with respect to the S Ts couplings.
+
+
+    See "Solving the barycenter problem with Block Coordinate Descent (BCD)"
+    in [24] calculated at each iteration
 
     Parameters
     ----------
-    p  : ndarray, shape (N,)
-         masses in the targeted barycenter
+    p : ndarray, shape (N,)
+        masses in the targeted barycenter
     lambdas : list of float
-              list of the S spaces' weights
+        List of the S spaces' weights
     Ts : list of S np.ndarray(ns,N)
         the S Ts couplings calculated at each iteration
     Ys : list of S ndarray, shape(d,ns)
-         The features
+        The features.
 
     Returns
-    ----------
-    X : ndarray, shape (d,N)
+    -------
+    X : ndarray, shape (d, N)
 
     References
     ----------
@@ -1179,9 +1172,8 @@ def update_feature_matrix(lambdas, Ys, Ts, p):
         "Optimal Transport for structured data with application on graphs"
         International Conference on Machine Learning (ICML). 2019.
     """
+    p = np.array(1. / p).reshape(-1,)
 
-    p = np.diag(np.array(1 / p).reshape(-1,))
-
-    tmpsum = sum([lambdas[s] * np.dot(Ys[s], Ts[s].T).dot(p) for s in range(len(Ts))])
+    tmpsum = sum([lambdas[s] * np.dot(Ys[s], Ts[s].T) * p[None, :] for s in range(len(Ts))])
 
     return tmpsum
diff --git a/ot/optim.py b/ot/optim.py
index f94aceb..0abd9e9 100644
--- a/ot/optim.py
+++ b/ot/optim.py
@@ -26,14 +26,13 @@ def line_search_armijo(f, xk, pk, gfk, old_fval,
 
     Parameters
     ----------
-
-    f : function
+    f : callable
         loss function
-    xk : np.ndarray
+    xk : ndarray
         initial position
-    pk : np.ndarray
+    pk : ndarray
         descent direction
-    gfk : np.ndarray
+    gfk : ndarray
         gradient of f at xk
     old_fval : float
         loss value at xk
@@ -161,15 +160,15 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200,
 
     Parameters
     ----------
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,)
+    b : ndarray, shape (nt,)
         samples in the target domain
-    M : np.ndarray (ns,nt)
+    M : ndarray, shape (ns, nt)
         loss matrix
     reg : float
         Regularization term >0
-    G0 :  np.ndarray (ns,nt), optional
+    G0 :  ndarray, shape (ns,nt), optional
         initial guess (default is indep joint density)
     numItermax : int, optional
         Max number of iterations
@@ -299,17 +298,17 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10,
 
     Parameters
     ----------
-    a : np.ndarray (ns,)
+    a : ndarray, shape (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,)
+    b : ndarrayv (nt,)
         samples in the target domain
-    M : np.ndarray (ns,nt)
+    M : ndarray, shape (ns, nt)
         loss matrix
     reg1 : float
         Entropic Regularization term >0
     reg2 : float
         Second Regularization term >0
-    G0 :  np.ndarray (ns,nt), optional
+    G0 : ndarray, shape (ns, nt), optional
         initial guess (default is indep joint density)
     numItermax : int, optional
         Max number of iterations
@@ -326,15 +325,13 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10,
 
     Returns
     -------
-    gamma : (ns x nt) ndarray
+    gamma : ndarray, shape (ns, nt)
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
 
-
     References
     ----------
-
     .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
     .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567.
 
@@ -422,13 +419,12 @@ def solve_1d_linesearch_quad(a, b, c):
     Parameters
     ----------
     a,b,c : float
-            The coefficients of the quadratic function
+        The coefficients of the quadratic function
 
     Returns
     -------
     x : float
         The optimal value which leads to the minimal cost
-
     """
     f0 = c
     df0 = b
diff --git a/ot/plot.py b/ot/plot.py
index a409d4a..f403e98 100644
--- a/ot/plot.py
+++ b/ot/plot.py
@@ -26,11 +26,11 @@ def plot1D_mat(a, b, M, title=''):
 
     Parameters
     ----------
-    a : np.array, shape (na,)
+    a : ndarray, shape (na,)
         Source distribution
-    b : np.array, shape (nb,)
+    b : ndarray, shape (nb,)
         Target distribution
-    M : np.array, shape (na,nb)
+    M : ndarray, shape (na, nb)
         Matrix to plot
     """
     na, nb = M.shape
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 5754968..13ed9cc 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -38,22 +38,20 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
 
     Parameters
     ----------
-
-    b : np.ndarray(nt,)
-        target measure
-    M : np.ndarray(ns, nt)
-        cost matrix
-    reg : float nu
-        Regularization term > 0
-    v : np.ndarray(nt,)
-        dual variable
-    i : number int
-        picked number i
+    b : ndarray, shape (nt,)
+        Target measure.
+    M : ndarray, shape (ns, nt)
+        Cost matrix.
+    reg : float
+        Regularization term > 0.
+    v : ndarray, shape (nt,)
+        Dual variable.
+    i : int
+        Picked number i.
 
     Returns
     -------
-
-    coordinate gradient : np.ndarray(nt,)
+    coordinate gradient : ndarray, shape (nt,)
 
     Examples
     --------
@@ -78,14 +76,11 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
 
     References
     ----------
-
     [Genevay et al., 2016] :
-                    Stochastic Optimization for Large-scale Optimal Transport,
-                     Advances in Neural Information Processing Systems (2016),
-                      arXiv preprint arxiv:1605.08527.
-
+        Stochastic Optimization for Large-scale Optimal Transport,
+         Advances in Neural Information Processing Systems (2016),
+          arXiv preprint arxiv:1605.08527.
     '''
-
     r = M[i, :] - beta
     exp_beta = np.exp(-r / reg) * b
     khi = exp_beta / (np.sum(exp_beta))
@@ -121,24 +116,23 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
     Parameters
     ----------
 
-    a : np.ndarray(ns,),
-        source measure
-    b : np.ndarray(nt,),
-        target measure
-    M : np.ndarray(ns, nt),
-        cost matrix
-    reg : float number,
+    a : ndarray, shape (ns,),
+        Source measure.
+    b : ndarray, shape (nt,),
+        Target measure.
+    M : ndarray, shape (ns, nt),
+        Cost matrix.
+    reg : float
         Regularization term > 0
-    numItermax : int number
-        number of iteration
-    lr : float number
-        learning rate
+    numItermax : int
+        Number of iteration.
+    lr : float
+        Learning rate.
 
     Returns
     -------
-
-    v : np.ndarray(nt,)
-        dual variable
+    v : ndarray, shape (nt,)
+        Dual variable.
 
     Examples
     --------
@@ -213,23 +207,20 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
 
     Parameters
     ----------
-
-    b : np.ndarray(nt,)
+    b : ndarray, shape (nt,)
         target measure
-    M : np.ndarray(ns, nt)
+    M : ndarray, shape (ns, nt)
         cost matrix
-    reg : float number
+    reg : float
         Regularization term > 0
-    numItermax : int number
-        number of iteration
-    lr : float number
-        learning rate
-
+    numItermax : int
+        Number of iteration.
+    lr : float
+        Learning rate.
 
     Returns
     -------
-
-    ave_v : np.ndarray(nt,)
+    ave_v : ndarray, shape (nt,)
         dual variable
 
     Examples
@@ -256,9 +247,9 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
     ----------
 
     [Genevay et al., 2016] :
-                    Stochastic Optimization for Large-scale Optimal Transport,
-                     Advances in Neural Information Processing Systems (2016),
-                      arXiv preprint arxiv:1605.08527.
+        Stochastic Optimization for Large-scale Optimal Transport,
+         Advances in Neural Information Processing Systems (2016),
+          arXiv preprint arxiv:1605.08527.
     '''
 
     if lr is None:
@@ -298,21 +289,19 @@ def c_transform_entropic(b, M, reg, beta):
 
     Parameters
     ----------
-
-    b : np.ndarray(nt,)
-        target measure
-    M : np.ndarray(ns, nt)
-        cost matrix
+    b : ndarray, shape (nt,)
+        Target measure
+    M : ndarray, shape (ns, nt)
+        Cost matrix
     reg : float
-        regularization term > 0
-    v : np.ndarray(nt,)
-        dual variable
+        Regularization term > 0
+    v : ndarray, shape (nt,)
+        Dual variable.
 
     Returns
     -------
-
-    u : np.ndarray(ns,)
-        dual variable
+    u : ndarray, shape (ns,)
+        Dual variable.
 
     Examples
     --------
@@ -338,9 +327,9 @@ def c_transform_entropic(b, M, reg, beta):
     ----------
 
     [Genevay et al., 2016] :
-                    Stochastic Optimization for Large-scale Optimal Transport,
-                     Advances in Neural Information Processing Systems (2016),
-                      arXiv preprint arxiv:1605.08527.
+        Stochastic Optimization for Large-scale Optimal Transport,
+         Advances in Neural Information Processing Systems (2016),
+          arXiv preprint arxiv:1605.08527.
     '''
 
     n_source = np.shape(M)[0]
@@ -382,31 +371,30 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None,
     Parameters
     ----------
 
-    a : np.ndarray(ns,)
+    a : ndarray, shape (ns,)
         source measure
-    b : np.ndarray(nt,)
+    b : ndarray, shape (nt,)
         target measure
-    M : np.ndarray(ns, nt)
+    M : ndarray, shape (ns, nt)
         cost matrix
-    reg : float number
+    reg : float
         Regularization term > 0
     methode : str
         used method (SAG or ASGD)
-    numItermax : int number
+    numItermax : int
         number of iteration
-    lr : float number
+    lr : float
         learning rate
-    n_source : int number
+    n_source : int
         size of the source measure
-    n_target : int number
+    n_target : int
         size of the target measure
     log : bool, optional
         record log if True
 
     Returns
     -------
-
-    pi : np.ndarray(ns, nt)
+    pi : ndarray, shape (ns, nt)
         transportation matrix
     log : dict
         log dictionary return only if log==True in parameters
@@ -495,30 +483,28 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha,
 
     Parameters
     ----------
-
-    a : np.ndarray(ns,)
+    a : ndarray, shape (ns,)
         source measure
-    b : np.ndarray(nt,)
+    b : ndarray, shape (nt,)
         target measure
-    M : np.ndarray(ns, nt)
+    M : ndarray, shape (ns, nt)
         cost matrix
-    reg : float number
+    reg : float
         Regularization term > 0
-    alpha : np.ndarray(ns,)
+    alpha : ndarray, shape (ns,)
         dual variable
-    beta : np.ndarray(nt,)
+    beta : ndarray, shape (nt,)
         dual variable
-    batch_size : int number
+    batch_size : int
         size of the batch
-    batch_alpha : np.ndarray(bs,)
+    batch_alpha : ndarray, shape (bs,)
         batch of index of alpha
-    batch_beta : np.ndarray(bs,)
+    batch_beta : ndarray, shape (bs,)
         batch of index of beta
 
     Returns
     -------
-
-    grad : np.ndarray(ns,)
+    grad : ndarray, shape (ns,)
         partial grad F
 
     Examples
@@ -591,28 +577,26 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr):
 
     Parameters
     ----------
-
-    a : np.ndarray(ns,)
+    a : ndarray, shape (ns,)
         source measure
-    b : np.ndarray(nt,)
+    b : ndarray, shape (nt,)
         target measure
-    M : np.ndarray(ns, nt)
+    M : ndarray, shape (ns, nt)
         cost matrix
-    reg : float number
+    reg : float
         Regularization term > 0
-    batch_size : int number
+    batch_size : int
         size of the batch
-    numItermax : int number
+    numItermax : int
         number of iteration
-    lr : float number
+    lr : float
         learning rate
 
     Returns
     -------
-
-    alpha : np.ndarray(ns,)
+    alpha : ndarray, shape (ns,)
         dual variable
-    beta : np.ndarray(nt,)
+    beta : ndarray, shape (nt,)
         dual variable
 
     Examples
@@ -648,10 +632,9 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr):
 
     References
     ----------
-
     [Seguy et al., 2018] :
-                    International Conference on Learning Representation (2018),
-                      arXiv preprint arxiv:1711.02283.
+        International Conference on Learning Representation (2018),
+          arXiv preprint arxiv:1711.02283.
     '''
 
     n_source = np.shape(M)[0]
@@ -696,28 +679,26 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
 
     Parameters
     ----------
-
-    a : np.ndarray(ns,)
+    a : ndarray, shape (ns,)
         source measure
-    b : np.ndarray(nt,)
+    b : ndarray, shape (nt,)
         target measure
-    M : np.ndarray(ns, nt)
+    M : ndarray, shape (ns, nt)
         cost matrix
-    reg : float number
+    reg : float
         Regularization term > 0
-    batch_size : int number
+    batch_size : int
         size of the batch
-    numItermax : int number
+    numItermax : int
         number of iteration
-    lr : float number
+    lr : float
         learning rate
     log : bool, optional
         record log if True
 
     Returns
     -------
-
-    pi : np.ndarray(ns, nt)
+    pi : ndarray, shape (ns, nt)
         transportation matrix
     log : dict
         log dictionary return only if log==True in parameters
@@ -757,8 +738,8 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
     ----------
 
     [Seguy et al., 2018] :
-                    International Conference on Learning Representation (2018),
-                      arXiv preprint arxiv:1711.02283.
+        International Conference on Learning Representation (2018),
+          arXiv preprint arxiv:1711.02283.
     '''
 
     opt_alpha, opt_beta = sgd_entropic_regularization(a, b, M, reg, batch_size,
diff --git a/ot/unbalanced.py b/ot/unbalanced.py
index 1453b31..14e9e36 100644
--- a/ot/unbalanced.py
+++ b/ot/unbalanced.py
@@ -395,7 +395,8 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, mu, numItermax=1000,
                     print(
                         '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
                 print('{:5d}|{:8e}|'.format(cpt, err))
-        cpt = cpt + 1
+        cpt += 1
+
     if log:
         log['logu'] = np.log(u + 1e-16)
         log['logv'] = np.log(v + 1e-16)
diff --git a/ot/utils.py b/ot/utils.py
index e8249ef..8419c83 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -111,12 +111,12 @@ def dist(x1, x2=None, metric='sqeuclidean'):
     Parameters
     ----------
 
-    x1 : np.array (n1,d)
+    x1 : ndarray, shape (n1,d)
         matrix with n1 samples of size d
-    x2 : np.array (n2,d), optional
+    x2 : array, shape (n2,d), optional
         matrix with n2 samples of size d (if None then x2=x1)
-    metric : str, fun, optional
-        name of the metric to be computed (full list in the doc of scipy),  If a string,
+    metric : str | callable, optional
+        Name of the metric to be computed (full list in the doc of scipy),  If a string,
         the distance function can be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
         'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski',
         'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
@@ -138,26 +138,21 @@ def dist(x1, x2=None, metric='sqeuclidean'):
 
 
 def dist0(n, method='lin_square'):
-    """Compute standard cost matrices of size (n,n) for OT problems
+    """Compute standard cost matrices of size (n, n) for OT problems
 
     Parameters
     ----------
-
     n : int
-        size of the cost matrix
+        Size of the cost matrix.
     method : str, optional
         Type of loss matrix chosen from:
 
         * 'lin_square' : linear sampling between 0 and n-1, quadratic loss
 
-
     Returns
     -------
-
-    M : np.array (n1,n2)
-        distance matrix computed with given metric
-
-
+    M : ndarray, shape (n1,n2)
+        Distance matrix computed with given metric.
     """
     res = 0
     if method == 'lin_square':
@@ -169,22 +164,18 @@ def dist0(n, method='lin_square'):
 def cost_normalization(C, norm=None):
     """ Apply normalization to the loss matrix
 
-
     Parameters
     ----------
-    C : np.array (n1, n2)
+    C : ndarray, shape (n1, n2)
         The cost matrix to normalize.
     norm : str
-        type of normalization from 'median','max','log','loglog'. Any other
-        value do not normalize.
-
+        Type of normalization from 'median', 'max', 'log', 'loglog'. Any
+        other value do not normalize.
 
     Returns
     -------
-
-    C : np.array (n1, n2)
+    C : ndarray, shape (n1, n2)
         The input cost matrix normalized according to given norm.
-
     """
 
     if norm == "median":
@@ -194,7 +185,7 @@ def cost_normalization(C, norm=None):
     elif norm == "log":
         C = np.log(1 + C)
     elif norm == "loglog":
-        C = np.log(1 + np.log(1 + C))
+        C = np.log1p(np.log1p(C))
 
     return C
 
@@ -256,6 +247,7 @@ def check_params(**kwargs):
 
 def check_random_state(seed):
     """Turn seed into a np.random.RandomState instance
+
     Parameters
     ----------
     seed : None | int | instance of RandomState
@@ -275,7 +267,6 @@ def check_random_state(seed):
 
 
 class deprecated(object):
-
     """Decorator to mark a function or class as deprecated.
 
     deprecated class from scikit-learn package
@@ -291,8 +282,8 @@ class deprecated(object):
 
     Parameters
     ----------
-    extra : string
-          to be added to the deprecation messages
+    extra : str
+        To be added to the deprecation messages.
     """
 
     # Adapted from http://wiki.python.org/moin/PythonDecoratorLibrary,
@@ -373,9 +364,9 @@ def _is_deprecated(func):
 
 
 class BaseEstimator(object):
-
     """Base class for most objects in POT
-    adapted from sklearn BaseEstimator class
+
+    Code adapted from sklearn BaseEstimator class
 
     Notes
     -----
@@ -417,7 +408,7 @@ class BaseEstimator(object):
 
         Parameters
         ----------
-        deep : boolean, optional
+        deep : bool, optional
             If True, will return the parameters for this estimator and
             contained subobjects that are estimators.
 
diff --git a/setup.cfg b/setup.cfg
index aa0ff62..6be91fe 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,3 +4,21 @@ description-file = README.md
 [flake8]
 exclude = __init__.py
 ignore = E265,E501,W605,W503,W504
+
+[tool:pytest]
+addopts =
+    --showlocals --durations=20 --doctest-modules -ra --cov-report= --cov=ot
+    --doctest-ignore-import-errors --junit-xml=junit-results.xml
+    --ignore=docs --ignore=examples --ignore=notebooks
+
+[pycodestyle]
+exclude = __init__.py,*externals*,constants.py,fixes.py
+ignore = E241,E305,W504
+
+[pydocstyle]
+convention = pep257
+match_dir = ^(?!\.|docs|examples).*$
+match = (?!tests/__init__\.py|fixes).*\.py
+add-ignore = D100,D104,D107,D413
+add-select = D214,D215,D404,D405,D406,D407,D408,D409,D410,D411
+ignore-decorators = ^(copy_.*_doc_to_|on_trait_change|cached_property|deprecated|property|.*setter).*