From cb6bdc516697e3bad6776b897f22c8b6a22f13cd Mon Sep 17 00:00:00 2001 From: LeoGautheron Date: Wed, 11 Jul 2018 22:28:38 +0200 Subject: Speed-up Sinkhorn Speed-up in 3 places: - the computation of pairwise distance is faster with sklearn.metrics.pairwise.euclidean_distances - faster computation of K = np.exp(-M / reg) - faster computation of the error every 10 iterations Example with this little script: import time import numpy as np import ot rng = np.random.RandomState(0) transport = ot.da.SinkhornTransport() time1 = time.time() Xs, ys, Xt = rng.randn(10000, 100), rng.randint(0, 2, size=10000), rng.randn(10000, 100) transport.fit(Xs=Xs, Xt=Xt) time2 = time.time() print("OT Computation Time {:6.2f} sec".format(time2-time1)) transport = ot.da.SinkhornLpl1Transport() transport.fit(Xs=Xs, ys=ys, Xt=Xt) time3 = time.time() print("OT LpL1 Computation Time {:6.2f} sec".format(time3-time2)) Before OT Computation Time 19.93 sec OT LpL1 Computation Time 133.43 sec After OT Computation Time 7.55 sec OT LpL1 Computation Time 82.25 sec --- ot/bregman.py | 14 +++++++++++--- ot/utils.py | 4 +++- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'ot') diff --git a/ot/bregman.py b/ot/bregman.py index b017c1a..55c44f6 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -344,8 +344,13 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, # print(reg) - K = np.exp(-M / reg) + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + # print(np.min(K)) + tmp = np.empty(K.shape, dtype=M.dtype) + tmp2 = np.empty(b.shape, dtype=M.dtype) Kp = (1 / a).reshape(-1, 1) * K cpt = 0 @@ -373,8 +378,11 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, err = np.sum((u - uprev)**2) / np.sum((u)**2) + \ np.sum((v - vprev)**2) / np.sum((v)**2) else: - transp = u.reshape(-1, 1) * (K * v) - err = np.linalg.norm((np.sum(transp, axis=0) - b))**2 + np.multiply(u.reshape(-1, 1), K, out=tmp) + np.multiply(tmp, v.reshape(1, -1), out=tmp) + np.sum(tmp, axis=0, out=tmp2) + tmp2 -= b + err = np.linalg.norm(tmp2)**2 if log: log['err'].append(err) diff --git a/ot/utils.py b/ot/utils.py index 7dac283..5b052ac 100644 --- a/ot/utils.py +++ b/ot/utils.py @@ -13,6 +13,7 @@ import time import numpy as np from scipy.spatial.distance import cdist +from sklearn.metrics.pairwise import euclidean_distances import sys import warnings try: @@ -104,7 +105,8 @@ def dist(x1, x2=None, metric='sqeuclidean'): """ if x2 is None: x2 = x1 - + if metric == "sqeuclidean": + return euclidean_distances(x1, x2, squared=True) return cdist(x1, x2, metric=metric) -- cgit v1.2.3 From 73e61546b5509648ba6b7924d6e5a3ebe7fade5b Mon Sep 17 00:00:00 2001 From: LeoGautheron Date: Mon, 16 Jul 2018 06:48:54 +0200 Subject: Remove dependency sklearn --- ot/utils.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'ot') diff --git a/ot/utils.py b/ot/utils.py index 5b052ac..14cc805 100644 --- a/ot/utils.py +++ b/ot/utils.py @@ -13,7 +13,6 @@ import time import numpy as np from scipy.spatial.distance import cdist -from sklearn.metrics.pairwise import euclidean_distances import sys import warnings try: @@ -77,6 +76,33 @@ def clean_zeros(a, b, M): b2 = b[b > 0] return a2, b2, M2 +def euclidean_distances(X, Y, squared=False): + """ + Considering the rows of X (and Y=X) as vectors, compute the + distance matrix between each pair of vectors. + Parameters + ---------- + X : {array-like}, shape (n_samples_1, n_features) + Y : {array-like}, shape (n_samples_2, n_features) + squared : boolean, optional + Return squared Euclidean distances. + Returns + ------- + distances : {array}, shape (n_samples_1, n_samples_2) + """ + XX = np.einsum('ij,ij->i', X, X)[:, np.newaxis] + YY = np.einsum('ij,ij->i', Y, Y)[np.newaxis, :] + distances = np.dot(X, Y.T) + distances *= -2 + distances += XX + distances += YY + np.maximum(distances, 0, out=distances) + if X is Y: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + distances.flat[::distances.shape[0] + 1] = 0.0 + return distances if squared else np.sqrt(distances, out=distances) + def dist(x1, x2=None, metric='sqeuclidean'): """Compute distance between samples in x1 and x2 using function scipy.spatial.distance.cdist -- cgit v1.2.3 From 0764e356325df7e18f72c0ff468bfa8f8ee35059 Mon Sep 17 00:00:00 2001 From: LeoGautheron Date: Mon, 16 Jul 2018 06:56:34 +0200 Subject: Add comment & fix flake8 error --- ot/bregman.py | 1 + ot/utils.py | 1 + 2 files changed, 2 insertions(+) (limited to 'ot') diff --git a/ot/bregman.py b/ot/bregman.py index 55c44f6..c8e69ce 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -344,6 +344,7 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, # print(reg) + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute K = np.empty(M.shape, dtype=M.dtype) np.divide(M, -reg, out=K) np.exp(K, out=K) diff --git a/ot/utils.py b/ot/utils.py index 14cc805..bb21b38 100644 --- a/ot/utils.py +++ b/ot/utils.py @@ -76,6 +76,7 @@ def clean_zeros(a, b, M): b2 = b[b > 0] return a2, b2, M2 + def euclidean_distances(X, Y, squared=False): """ Considering the rows of X (and Y=X) as vectors, compute the -- cgit v1.2.3