Diffstat (limited to 'ot/gpu/cudamat/examples')
-rw-r--r--  ot/gpu/cudamat/examples/bench_cudamat.py   97
-rw-r--r--  ot/gpu/cudamat/examples/nn_cudamat.py     133
-rw-r--r--  ot/gpu/cudamat/examples/rbm_cudamat.py     98
-rw-r--r--  ot/gpu/cudamat/examples/rbm_numpy.py       72
-rw-r--r--  ot/gpu/cudamat/examples/util.py            22
5 files changed, 422 insertions, 0 deletions
diff --git a/ot/gpu/cudamat/examples/bench_cudamat.py b/ot/gpu/cudamat/examples/bench_cudamat.py
new file mode 100644
index 0000000..b3a5c19
--- /dev/null
+++ b/ot/gpu/cudamat/examples/bench_cudamat.py
@@ -0,0 +1,97 @@
+from __future__ import print_function, division
+import sys
+import numpy as np
+import cudamat as cmt
+import time
+import timeit
+from inspect import getmodule, getmembers, isfunction
+try: from itertools import ifilter as filter  # Python 2 compatibility
+except ImportError: pass
+
+# heat-up time in seconds before starting the benchmark
+HEATUP = 2
+
+# shapes used for the small and large test matrix
+XS_SHAPE = (400, 256)
+XL_SHAPE = (4096, 4096)
+
+# timeit number and repeat parameter
+NUM_ITER = 100
+NUM_REPEATS = 5
+
+def setup(shape):
+ """Creates two matrices and corresponding row/column vectors"""
+ mat = cmt.empty(shape).fill_with_randn()
+ mat2 = cmt.empty(shape).fill_with_randn()
+ col = cmt.empty((shape[0], 1)).assign(0)
+ row = cmt.empty((1, shape[1])).assign(0)
+ return mat, mat2, col, row
+
+def bench_dot(X, Y, col, row):
+ cmt.dot(X.T, Y)
+
+def bench_add(X, Y, col, row):
+ X.add(Y)
+bench_add.repeats = 5 # 5 times more repetitions than usual
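+# (per-benchmark 'repeats' attributes like the one above are picked up
+# in main() via getattr(func, 'repeats', 1))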
+
+def bench_mult(X, Y, col, row):
+ X.mult(Y)
+
+def bench_sigm(X, Y, col, row):
+ X.apply_sigmoid()
+
+def bench_colsum(X, Y, col, row):
+ X.sum(axis=0, target=row)
+
+def bench_rowsum(X, Y, col, row):
+ X.sum(axis=1, target=col)
+
+def bench_addcolsum(X, Y, col, row):
+ row.add_sums(X, axis=0, mult=3.2, beta=0.2)
+
+def bench_addrowsum(X, Y, col, row):
+ col.add_sums(X, axis=1, mult=3.2, beta=0.2)
+
+def bench_colmax(X, Y, col, row):
+ X.max(axis=0, target=row)
+
+def bench_rowmax(X, Y, col, row):
+ X.max(axis=1, target=col)
+
+def bench_addcolmult(X, Y, col, row):
+ X.add_col_mult(col, mult=3.2)
+
+def heatup(duration):
+ """Heat-up the GPU for a while so it enters full-performance mode"""
+ t1 = time.time()
+ while time.time() - t1 < duration:
+ cmt.dot(cmt.empty((200, 200)), cmt.empty((200, 200)))
+
+def main():
+ cmt.init()
+ cmt.CUDAMatrix.init_random()
+ if HEATUP:
+ print("heating up for %g seconds..." % HEATUP, end=' ')
+ sys.stdout.flush()
+ heatup(HEATUP)
+ print("done.")
+ print("small matrix shape:", XS_SHAPE)
+ print("large matrix shape:", XL_SHAPE)
+ for funcname, func in filter(lambda f: f[0].startswith('bench_'),
+ getmembers(getmodule(main), isfunction)):
+ print("%-15s" % funcname[len('bench_'):], end=' ')
+ sys.stdout.flush()
+        for size, shape, factor in ('small', XS_SHAPE, 10), ('large', XL_SHAPE, 1):
+            repeat = NUM_REPEATS * getattr(func, 'repeats', 1)
+            # report the best (minimum) mean time per call across all repeats
+            best_time = min(timeit.repeat(
+                setup="from __main__ import setup, %s\nmats = setup(%s)" % (funcname, shape),
+                stmt="%s(*mats)" % funcname, repeat=repeat,
+                number=NUM_ITER * factor)) / (NUM_ITER * factor)
+            print("%.3es (%s) " % (best_time, size), end=' ')
+            sys.stdout.flush()
+        print()
+ cmt.shutdown()
+
+if __name__ == "__main__":
+ main()
+
diff --git a/ot/gpu/cudamat/examples/nn_cudamat.py b/ot/gpu/cudamat/examples/nn_cudamat.py
new file mode 100644
index 0000000..7c56c7d
--- /dev/null
+++ b/ot/gpu/cudamat/examples/nn_cudamat.py
@@ -0,0 +1,133 @@
+# This file shows how to implement a single hidden layer neural network for
+# performing binary classification on the GPU using cudamat.
+
+from __future__ import division
+import time
+import numpy as np
+import cudamat as cm
+from cudamat import learn as cl
+import util
+
+# initialize CUDA
+cm.cublas_init()
+
+# load data
+util.load('mnist49.dat', globals())
+
+# Put training data onto the GPU.
+dat_train = dat_train/255.
+dat_train = dat_train - (np.mean(dat_train, 1)+10**-8)[:, np.newaxis]
+dev_train = cm.CUDAMatrix(dat_train)
+dev_lbl = cm.CUDAMatrix(lbl_train)
+
+# training parameters
+epsilon = 0.01
+momentum = 0.9
+
+num_epochs = 30
+batch_size = 128
+num_batches = dat_train.shape[1]//batch_size
+
+# model parameters
+dim_in = dat_train.shape[0]
+dim_out = 1
+num_hid = 1024
+
+# initialize weights
+w_w1 = cm.CUDAMatrix(dim_in ** -0.5 * np.random.randn(dim_in, num_hid))
+w_b1 = cm.CUDAMatrix(np.zeros((num_hid, 1)))
+w_w2 = cm.CUDAMatrix(num_hid ** -0.5 * np.random.randn(num_hid, dim_out))
+w_b2 = cm.CUDAMatrix(np.zeros((dim_out, 1)))
+
+# initialize weight update matrices
+wu_w1 = cm.empty(w_w1.shape).assign(0)
+wu_b1 = cm.empty(w_b1.shape).assign(0)
+wu_w2 = cm.empty(w_w2.shape).assign(0)
+wu_b2 = cm.empty(w_b2.shape).assign(0)
+
+# initialize temporary storage
+h = cm.empty((num_hid, batch_size))
+out = cm.empty((dim_out, batch_size))
+delta = cm.empty((num_hid, batch_size))
+
+# Train neural network.
+start_time = time.time()
+for epoch in range(num_epochs):
+ print("Epoch %i" % (epoch + 1))
+ err = []
+
+ for batch in range(num_batches):
+ # get current minibatch
+ inp = dev_train.slice(batch*batch_size,(batch + 1)*batch_size)
+ target = dev_lbl.slice(batch*batch_size,(batch + 1)*batch_size)
+
+ # forward pass
+ cm.dot(w_w1.T, inp, target = h)
+
+ h.add_col_vec(w_b1)
+ h.apply_sigmoid()
+
+ cm.dot(w_w2.T, h, target = out)
+
+ out.add_col_vec(w_b2)
+ out.apply_sigmoid()
+
+ # back prop errors
+ out.subtract(target) # compute error
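+        # (using prediction - target directly as the output delta matches the
+        # sigmoid-plus-cross-entropy gradient, where the derivative terms
+        # cancel, so no sigmoid derivative is applied at this layer)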
+
+ # gradients for w_w2 and w_b2
+ wu_w2.add_dot(h, out.T, beta = momentum)
+ wu_b2.add_sums(out, axis = 1, beta = momentum)
+
+ # compute delta
+ cm.dot(w_w2, out, target = delta)
+
+ # delta = delta * h * (1 - h)
+ cl.mult_by_sigmoid_deriv(delta, h)
+
+ # gradients for w_w1 and w_b1
+ wu_w1.add_dot(inp, delta.T, beta = momentum)
+ wu_b1.add_sums(delta, axis = 1, beta = momentum)
+
+ # update weights
+ w_w1.subtract_mult(wu_w1, epsilon/batch_size)
+ w_b1.subtract_mult(wu_b1, epsilon/batch_size)
+ w_w2.subtract_mult(wu_w2, epsilon/batch_size)
+ w_b2.subtract_mult(wu_b2, epsilon/batch_size)
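+        # (each wu_* holds momentum * previous update + current gradient, so
+        # these four lines implement SGD with momentum at learning rate epsilon)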
+
+ # calculate error on current minibatch
+ err.append(np.abs(out.asarray())>0.5)
+
+ print("Training misclassification rate: %f" % np.mean(err))
+ print("Time: %f" % (time.time() - start_time))
+
+# Evaluate neural network on test data.
+
+# Load test data onto the GPU.
+dat_test = dat_test/255.
+dat_test = dat_test - np.mean(dat_test, 1)[:, np.newaxis]
+dev_test = cm.CUDAMatrix(dat_test)
+dev_lbl = cm.CUDAMatrix(lbl_test)
+
+# Initialize temporary storage.
+h = cm.empty((num_hid, dat_test.shape[1]))
+out = cm.empty((dim_out, dat_test.shape[1]))
+
+# forward pass
+cm.dot(w_w1.T, dev_test, target = h)
+
+h.add_col_vec(w_b1)
+h.apply_sigmoid()
+
+cm.dot(w_w2.T, h, target = out)
+
+out.add_col_vec(w_b2)
+out.apply_sigmoid()
+
+# compute error
+out.subtract(dev_lbl)
+
+print("Testing misclassification rate: %f" % np.mean(np.abs(out.asarray())>0.5))
+
+cm.cublas_shutdown()
diff --git a/ot/gpu/cudamat/examples/rbm_cudamat.py b/ot/gpu/cudamat/examples/rbm_cudamat.py
new file mode 100644
index 0000000..3f6a900
--- /dev/null
+++ b/ot/gpu/cudamat/examples/rbm_cudamat.py
@@ -0,0 +1,98 @@
+from __future__ import division
+import time
+import numpy as np
+import cudamat as cm
+import util
+
+# initialize CUDA
+cm.cublas_init()
+cm.CUDAMatrix.init_random(1)
+
+# load data
+util.load('mnist.dat', globals())
+dev_dat = cm.CUDAMatrix(cm.reformat(dat/255.))
+
+# training parameters
+epsilon = 0.1
+momentum = 0.9
+
+num_epochs = 30
+batch_size = 128
+num_batches = dat.shape[1]//batch_size
+
+# model parameters
+num_vis = dat.shape[0]
+num_hid = 4096
+
+# initialize weights
+w_vh = cm.CUDAMatrix(0.1 * np.random.randn(num_vis, num_hid))
+w_v = cm.CUDAMatrix(np.zeros((num_vis, 1)))
+w_h = cm.CUDAMatrix(-4.*np.ones((num_hid, 1)))
+
+# initialize weight updates
+wu_vh = cm.CUDAMatrix(np.zeros((num_vis, num_hid)))
+wu_v = cm.CUDAMatrix(np.zeros((num_vis, 1)))
+wu_h = cm.CUDAMatrix(np.zeros((num_hid, 1)))
+
+# initialize temporary storage
+v = cm.empty((num_vis, batch_size))
+h = cm.empty((num_hid, batch_size))
+r = cm.empty((num_hid, batch_size))
+
+start_time = time.time()
+for epoch in range(num_epochs):
+ print("Epoch %i" % (epoch + 1))
+ err = []
+
+ for batch in range(num_batches):
+ # get current minibatch
+ v_true = dev_dat.slice(batch*batch_size,(batch + 1)*batch_size)
+ v.assign(v_true)
+
+ # apply momentum
+ wu_vh.mult(momentum)
+ wu_v.mult(momentum)
+ wu_h.mult(momentum)
+
+ # positive phase
+ cm.dot(w_vh.T, v, target = h)
+ h.add_col_vec(w_h)
+ h.apply_sigmoid()
+
+ wu_vh.add_dot(v, h.T)
+ wu_v.add_sums(v, axis = 1)
+ wu_h.add_sums(h, axis = 1)
+
+ # sample hiddens
+ r.fill_with_rand()
+ r.less_than(h, target = h)
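+        # (h now holds Bernoulli samples: a unit is 1 exactly where a uniform
+        # random draw in r fell below its activation probability)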
+
+ # negative phase
+ cm.dot(w_vh, h, target = v)
+ v.add_col_vec(w_v)
+ v.apply_sigmoid()
+
+ cm.dot(w_vh.T, v, target = h)
+ h.add_col_vec(w_h)
+ h.apply_sigmoid()
+
+ wu_vh.subtract_dot(v, h.T)
+ wu_v.add_sums(v, axis = 1, mult = -1.)
+ wu_h.add_sums(h, axis = 1, mult = -1.)
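+        # wu_* now holds the CD-1 gradient estimate: data-phase statistics
+        # minus reconstruction-phase statistics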
+
+ # update weights
+ w_vh.add_mult(wu_vh, epsilon/batch_size)
+ w_v.add_mult(wu_v, epsilon/batch_size)
+ w_h.add_mult(wu_h, epsilon/batch_size)
+
+ # calculate reconstruction error
+ v.subtract(v_true)
+ err.append(v.euclid_norm()**2/(num_vis*batch_size))
+
+ print("Mean squared error: %f" % np.mean(err))
+ print("Time: %f" % (time.time() - start_time))
+
+w_vh.copy_to_host()
+util.save('weights.dat', 'w_vh', {'w_vh': w_vh.numpy_array})
+
+cm.cublas_shutdown()
diff --git a/ot/gpu/cudamat/examples/rbm_numpy.py b/ot/gpu/cudamat/examples/rbm_numpy.py
new file mode 100644
index 0000000..1331566
--- /dev/null
+++ b/ot/gpu/cudamat/examples/rbm_numpy.py
@@ -0,0 +1,72 @@
+from __future__ import division
+import time
+import numpy as np
+import util
+
+# load data
+util.load('mnist.dat', globals())
+dat = dat/255.
+
+# training parameters
+epsilon = 0.01
+momentum = 0.9
+
+num_epochs = 10
+batch_size = 64
+num_batches = dat.shape[1]//batch_size
+
+# model parameters
+num_vis = dat.shape[0]
+num_hid = 1024
+
+# initialize weights
+w_vh = 0.1 * np.random.randn(num_vis, num_hid)
+w_v = np.zeros((num_vis, 1))
+w_h = np.zeros((num_hid, 1))
+
+# initialize weight updates
+wu_vh = np.zeros((num_vis, num_hid))
+wu_v = np.zeros((num_vis, 1))
+wu_h = np.zeros((num_hid, 1))
+
+start_time = time.time()
+for epoch in range(num_epochs):
+ print("Epoch %i" % (epoch + 1))
+ err = []
+
+ for batch in range(num_batches):
+ v_true = dat[:, batch*batch_size:(batch + 1)*batch_size]
+ v = v_true
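+        # (a reference is fine here: v is only rebound later, never mutated
+        # in place, so v_true stays intact for the error computation)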
+
+ # apply momentum
+ wu_vh *= momentum
+ wu_v *= momentum
+ wu_h *= momentum
+
+ # positive phase
+ h = 1. / (1 + np.exp(-(np.dot(w_vh.T, v) + w_h)))
+
+ wu_vh += np.dot(v, h.T)
+ wu_v += v.sum(1)[:, np.newaxis]
+ wu_h += h.sum(1)[:, np.newaxis]
+
+ # sample hiddens
+ h = 1. * (h > np.random.rand(num_hid, batch_size))
+
+ # negative phase
+ v = 1. / (1 + np.exp(-(np.dot(w_vh, h) + w_v)))
+ h = 1. / (1 + np.exp(-(np.dot(w_vh.T, v) + w_h)))
+
+ wu_vh -= np.dot(v, h.T)
+ wu_v -= v.sum(1)[:, np.newaxis]
+ wu_h -= h.sum(1)[:, np.newaxis]
+
+ # update weights
+ w_vh += epsilon/batch_size * wu_vh
+ w_v += epsilon/batch_size * wu_v
+ w_h += epsilon/batch_size * wu_h
+
+ err.append(np.mean((v - v_true)**2))
+
+ print("Mean squared error: %f" % np.mean(err))
+ print("Time: %f" % (time.time() - start_time))
diff --git a/ot/gpu/cudamat/examples/util.py b/ot/gpu/cudamat/examples/util.py
new file mode 100644
index 0000000..79ceead
--- /dev/null
+++ b/ot/gpu/cudamat/examples/util.py
@@ -0,0 +1,22 @@
+from __future__ import division
+import gzip
+try: import cPickle as pickle  # Python 2 compatibility
+except ImportError: import pickle
+
+def save(fname, var_list, source_dict):
+ var_list = [var.strip() for var in var_list.split() if len(var.strip())>0]
+ fo = gzip.GzipFile(fname, 'wb')
+ pickle.dump(var_list, fo)
+ for var in var_list:
+ pickle.dump(source_dict[var], fo, protocol=2)
+ fo.close()
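+# The resulting file holds one pickled list of names followed by one pickled
+# object per name, in order; load() below reads them back the same way.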
+
+def load(fname, target_dict, verbose = True):
+ fo = gzip.GzipFile(fname, 'rb')
+ var_list = pickle.load(fo)
+ if verbose:
+ print(var_list)
+ for var in var_list:
+ target_dict[var] = pickle.load(fo)
+ fo.close()
+
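+# Example usage (file and variable names here are illustrative only):
+#   import numpy as np, util
+#   util.save('demo.dat', 'w b', {'w': np.zeros((4, 2)), 'b': np.zeros(2)})
+#   util.load('demo.dat', globals())  # prints ['w', 'b'] and binds both names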