Diffstat (limited to 'ot/gpu/cudamat/cudamat/cudamat.py')
-rw-r--r-- | ot/gpu/cudamat/cudamat/cudamat.py | 1575 |
1 file changed, 1575 insertions, 0 deletions
diff --git a/ot/gpu/cudamat/cudamat/cudamat.py b/ot/gpu/cudamat/cudamat/cudamat.py
new file mode 100644
index 0000000..f855ed2
--- /dev/null
+++ b/ot/gpu/cudamat/cudamat/cudamat.py
@@ -0,0 +1,1575 @@
+import os
+import platform
+import warnings
+import sysconfig
+import sys
+
+import ctypes as ct
+import numpy as np
+
+def load_library(basename):
+    if platform.system() == 'Windows':
+        ext = '.dll'
+    else:
+        ext = sysconfig.get_config_var('SO')
+    return ct.cdll.LoadLibrary(os.path.join(
+        os.path.dirname(__file__) or os.path.curdir,
+        basename + ext))
+
+_cudamat = load_library('libcudamat')
+
+_cudamat.get_last_cuda_error.restype = ct.c_char_p
+_cudamat.get_last_clib_error.restype = ct.c_char_p
+_cudamat.cublas_init.restype = ct.c_int
+_cudamat.cublas_shutdown.restype = ct.c_int
+_cudamat.cuda_set_device.restype = ct.c_int
+_cudamat.init_random.restype = ct.c_int
+
+_cudamat.init_empty.restype = ct.c_int
+_cudamat.reshape.restype = ct.c_int
+_cudamat.copy_to_host.restype = ct.c_int
+_cudamat.allocate_device_memory.restype = ct.c_int
+_cudamat.copy_to_device.restype = ct.c_int
+_cudamat.copy_on_device.restype = ct.c_int
+_cudamat.free_device_memory.restype = ct.c_int
+
+_cudamat.get_slice.restype = ct.c_int
+_cudamat.get_row_slice.restype = ct.c_int
+_cudamat.set_row_slice.restype = ct.c_int
+_cudamat.copy_transpose.restype = ct.c_int
+_cudamat.get_vector_slice.restype = ct.c_int
+_cudamat.fill_with_rand.restype = ct.c_int
+_cudamat.fill_with_randn.restype = ct.c_int
+
+_cudamat.add_col_vec.restype = ct.c_int
+_cudamat.add_col_mult.restype = ct.c_int
+_cudamat.add_row_vec.restype = ct.c_int
+_cudamat.mult_by_col_vec.restype = ct.c_int
+_cudamat.mult_by_row_vec.restype = ct.c_int
+_cudamat.divide_by_col_vec.restype = ct.c_int
+_cudamat.divide_by_row_vec.restype = ct.c_int
+
+_cudamat.less_than.restype = ct.c_int
+_cudamat.less_than_scalar.restype = ct.c_int
+_cudamat.greater_than.restype = ct.c_int
+_cudamat.greater_than_scalar.restype = ct.c_int
+_cudamat.equals.restype = ct.c_int
+_cudamat.equals_scalar.restype = ct.c_int
+_cudamat.minimum.restype = ct.c_int
+_cudamat.minimum_scalar.restype = ct.c_int
+_cudamat.maximum.restype = ct.c_int
+_cudamat.maximum_scalar.restype = ct.c_int
+_cudamat.min_by_axis.restype = ct.c_int
+_cudamat.max_by_axis.restype = ct.c_int
+_cudamat.argmin_by_axis.restype = ct.c_int
+_cudamat.argmax_by_axis.restype = ct.c_int
+_cudamat.sign.restype = ct.c_int
+_cudamat.apply_sigmoid.restype = ct.c_int
+_cudamat.apply_tanh.restype = ct.c_int
+_cudamat.apply_soft_threshold.restype = ct.c_int
+_cudamat.apply_abs.restype = ct.c_int
+_cudamat.apply_log_1_plus_exp.restype = ct.c_int
+_cudamat.apply_log.restype = ct.c_int
+_cudamat.apply_exp.restype = ct.c_int
+_cudamat.apply_gamma.restype = ct.c_int
+_cudamat.apply_lgamma.restype = ct.c_int
+_cudamat.apply_sqrt.restype = ct.c_int
+_cudamat.apply_pow.restype = ct.c_int
+_cudamat.apply_pow_matrix.restype = ct.c_int
+_cudamat.reciprocal.restype = ct.c_int
+
+_cudamat.add_elementwise.restype = ct.c_int
+_cudamat.subtract_elementwise.restype = ct.c_int
+_cudamat.divide_elementwise.restype = ct.c_int
+_cudamat.mult_elementwise.restype = ct.c_int
+_cudamat.assign_scalar.restype = ct.c_int
+_cudamat.mult_by_scalar.restype = ct.c_int
+_cudamat.divide_by_scalar.restype = ct.c_int
+_cudamat.add_scalar.restype = ct.c_int
+
+_cudamat.euclid_norm.restype = ct.c_double
+_cudamat.manhattan_norm.restype = ct.c_double
+_cudamat.selectRows.restype = ct.c_int
+_cudamat.setSelectedRows.restype = ct.c_int
+_cudamat.vdot.restype = ct.c_double
+_cudamat.dot.restype = ct.c_int
+
+_cudamat.where.restype = ct.c_int
+
+_cudamat.correlate.restype = ct.c_int
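The declarations above are needed because ctypes assumes every foreign function returns a C int; double-returning functions such as euclid_norm, manhattan_norm, and vdot would be read back as garbage without an explicit restype. A minimal self-contained sketch of the same pattern, against glibc's math library rather than libcudamat (assumes a Linux system where libm.so.6 is loadable):

    import ctypes as ct

    libm = ct.cdll.LoadLibrary('libm.so.6')  # assumption: glibc libm
    libm.sqrt.argtypes = [ct.c_double]

    print(libm.sqrt(4.0))             # wrong: result read as the default c_int
    libm.sqrt.restype = ct.c_double   # declare the true return type
    print(libm.sqrt(4.0))             # 2.0 once restype is set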
+
+
+def deprecated(func):
+    """This is a decorator which can be used to mark functions
+    as deprecated. It will result in a warning being emitted
+    when the function is used."""
+
+    def newFunc(*args, **kwargs):
+        warnings.warn("Call to deprecated function %s." % func.__name__,
+                      category=DeprecationWarning)
+        return func(*args, **kwargs)
+    newFunc.__name__ = func.__name__
+    newFunc.__doc__ = func.__doc__
+    newFunc.__dict__.update(func.__dict__)
+    return newFunc
+
+
+class CUDAMatException(Exception):
+    pass
+
+
+def get_last_cuda_error():
+    errmsg = _cudamat.get_last_cuda_error()
+    if sys.version_info >= (3,):
+        return bytes(errmsg).decode()
+    else:
+        return str(errmsg)
+
+
+def get_last_clib_error():
+    errmsg = _cudamat.get_last_clib_error()
+    if sys.version_info >= (3,):
+        return bytes(errmsg).decode()
+    else:
+        return str(errmsg)
+
+
+def generate_exception(err_code, **kwargs):
+    """
+    Return a CUDAMatException object based on the error code err_code.
+    Additional arguments are error-specific and optional.
+    """
+
+    if err_code == -1:
+        return CUDAMatException("Incompatible matrix dimensions.")
+    elif err_code == -2:
+        return CUDAMatException("CUBLAS error.")
+    elif err_code == -3:
+        return CUDAMatException("CUDA error: " + get_last_cuda_error())
+    elif err_code == -4:
+        return CUDAMatException("Operation not supported on views.")
+    elif err_code == -5:
+        return CUDAMatException("Operation not supported on "
+                                "transposed matrices.")
+    elif err_code == -6:
+        return CUDAMatException("")
+    elif err_code == -7:
+        return CUDAMatException("Incompatible transposedness.")
+    elif err_code == -8:
+        return CUDAMatException("Matrix is not in device memory.")
+    elif err_code == -9:
+        return CUDAMatException("Operation not supported.")
+    elif err_code == -10:
+        filepath = kwargs.get("filepath", "")
+        if filepath:
+            filepath = ": '%s'" % filepath
+        return CUDAMatException("Cannot open file%s: %s" % (filepath, get_last_clib_error()))
+    elif err_code == -11:
+        filepath = kwargs.get("filepath", "")
+        if filepath:
+            filepath = ": '%s'" % filepath
+        return CUDAMatException("Cannot parse file%s." % filepath)
+    else:
+        return CUDAMatException("")
+
+
+class cudamat(ct.Structure):
+    _fields_ = [('data_host', ct.POINTER(ct.c_double)),
+                ('data_device', ct.POINTER(ct.c_double)),
+                ('on_device', ct.c_int),
+                ('on_host', ct.c_int),
+                ('size', ct.c_int * 2),
+                ('is_trans', ct.c_int),
+                ('owns_data', ct.c_int)]
+
+
+class rnd_struct(ct.Structure):
+    _fields_ = [('dev_rnd_mults', ct.POINTER(ct.c_uint)),
+                ('dev_rnd_words', ct.POINTER(ct.c_longlong))]
+
+
+class TransposedCUDAMatrix(object):
+    def __init__(self, mat):
+        self.mat = cudamat()
+        ct.memmove(ct.pointer(self.mat), ct.pointer(mat), ct.sizeof(self.mat))
+        self.mat.is_trans = 1
+        self.p_mat = ct.pointer(self.mat)
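A quick check of how the deprecated decorator behaves (sketch, not part of the diff; assumes the module is importable as `cudamat`, which requires the compiled libcudamat, and `old_helper` is a hypothetical function for illustration only):

    import warnings
    from cudamat import deprecated  # assumption: built libcudamat on the path

    @deprecated
    def old_helper():  # hypothetical function
        return 1

    with warnings.catch_warnings(record=True) as log:
        warnings.simplefilter('always')
        old_helper()
    assert log and issubclass(log[-1].category, DeprecationWarning)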
+
+
+class CUDAMatrix(object):
+    """
+    A CUDAMatrix object represents a matrix of double precision floating point
+    numbers on a GPU.
+    """
+
+    def __init__(self, array, copy_to_device=True, copy_on_host=True):
+        """
+        Initializes a new matrix object in one of two ways. If array is a numpy
+        ndarray, memory for a matrix with the same dimensions is allocated on
+        the GPU. If the copy_to_device flag is set to True, the GPU matrix is
+        initialized with the given ndarray. If the copy_on_host flag is set to
+        True, a copy of the matrix will be created in host memory even if the
+        matrix is of the correct type (float64, Fortran-contiguous order).
+        If array is not an ndarray, it must be a cudamat structure (typically
+        the user will never use this way of calling __init__).
+        """
+
+        if type(array) in [np.ndarray, np.memmap]:
+            # Convert array to float64 in FORTRAN order
+            array = reformat(array, copy=copy_on_host)
+
+            # Initialize as a ndarray-tied matrix.
+            self.mat = cudamat()
+            self.size = self.mat.size
+            self.p_mat = ct.pointer(self.mat)
+            self.numpy_array = array
+
+            _cudamat.init_from_array(
+                self.p_mat,
+                array.ctypes.data_as(ct.POINTER(ct.c_double)),
+                ct.c_int(array.shape[0]),
+                ct.c_int(array.shape[1]))
+
+            if copy_to_device:
+                err_code = _cudamat.copy_to_device(self.p_mat)
+                if err_code:
+                    raise generate_exception(err_code)
+
+        else:
+            # Initialize based on existing cudamat structure.
+            mat = array
+            self.mat = mat
+            self.p_mat = ct.pointer(self.mat)
+
+        self.T = TransposedCUDAMatrix(self.mat)
+
+        # Keep a reference to free device memory in case of a crash.
+        self.__free_device_memory = _cudamat.free_device_memory
+
+    def __del__(self):
+        try:
+            if 'p_mat' in self.__dict__:
+                err_code = self.__free_device_memory(self.p_mat)
+                if err_code:
+                    raise generate_exception(err_code)
+        except AttributeError:
+            pass
+
+    @staticmethod
+    def init_random(seed=0):
+        """
+        Initialize and seed the random number generator.
+        """
+
+        CUDAMatrix.rndInitialized = 1
+        CUDAMatrix.rnd_state = rnd_struct()
+        CUDAMatrix.rnd_state_p = ct.pointer(CUDAMatrix.rnd_state)
+
+        cudamat_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
+                                    'rnd_multipliers_32bit.txt')
+
+        if sys.version_info >= (3,):
+            cudamat_path = cudamat_path.encode(sys.getfilesystemencoding())
+
+        err_code = _cudamat.init_random(CUDAMatrix.rnd_state_p,
+                                        ct.c_int(seed),
+                                        cudamat_path)
+        if err_code:
+            if sys.version_info >= (3,):
+                cudamat_path = cudamat_path.decode(sys.getfilesystemencoding())
+            raise generate_exception(err_code, filepath=cudamat_path)
+
+    @property
+    def shape(self):
+        return (self.mat.size[0], self.mat.size[1])
+
+    def reshape(self, shape):
+        """
+        Reshapes self to have the given shape. The number of elements cannot
+        change as this only changes how the contents are interpreted.
+        """
+
+        m = ct.c_uint(shape[0])
+        n = ct.c_uint(shape[1])
+
+        # Reshape the default matrix
+        err_code = _cudamat.reshape(self.p_mat, m, n)
+        if err_code:
+            raise generate_exception(err_code)
+        # Reshape the transposed matrix
+        err_code = _cudamat.reshape(self.T.p_mat, m, n)
+        if err_code:
+            raise generate_exception(err_code)
+        # Reshape the CPU matrix
+        if self.mat.on_host:
+            self.numpy_array = np.reshape(self.numpy_array, shape, order='F')
+
+        return self
+
+    def asarray(self):
+        """
+        Copies the matrix to an ndarray on the CPU and returns it.
+        """
+
+        self.copy_to_host()
+
+        return self.numpy_array
+
+    def copy_to_device(self):
+        """
+        Copy the matrix to the GPU.
+        """
+
+        err_code = _cudamat.copy_to_device(self.p_mat)
+        if err_code:
+            raise generate_exception(err_code)
+
+    def copy_to_host(self):
+        """
+        Copy the matrix to the CPU.
+        """
+
+        if not self.mat.on_host:
+            # allocate host storage if necessary
+            m = self.mat.size[0]
+            n = self.mat.size[1]
+
+            self.numpy_array = np.empty((m, n), dtype=np.float64, order='F')
+            self.mat.data_host = \
+                self.numpy_array.ctypes.data_as(ct.POINTER(ct.c_double))
+
+            self.mat.on_host = 1
+
+        err_code = _cudamat.copy_to_host(self.p_mat)
+        if err_code:
+            raise generate_exception(err_code)
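The construction round-trip in practice (illustrative sketch, not part of the diff; assumes a CUDA device, the compiled libcudamat, and that the module is importable as `cudamat`):

    import numpy as np
    import cudamat as cm

    cm.cublas_init()
    a = cm.CUDAMatrix(np.arange(6.0).reshape(2, 3))  # host array copied to GPU
    a.reshape((3, 2))      # same elements, reinterpreted in column-major order
    print(a.asarray())     # copied back to the host as float64, order='F'
    cm.cublas_shutdown()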
+ """ + + if not self.mat.on_host: + # allocate host storage if necessary + m = self.mat.size[0] + n = self.mat.size[1] + + self.numpy_array = np.empty((m, n), dtype=np.float64, order='F') + self.mat.data_host = \ + self.numpy_array.ctypes.data_as(ct.POINTER(ct.c_double)) + + self.mat.on_host = 1 + + err_code = _cudamat.copy_to_host(self.p_mat) + if err_code: + raise generate_exception(err_code) + + def copy(self, include_host=False): + """ + Create a copy of the matrix on GPU. If include_host is True, also + creates a copy of the matrix on CPU if there was any. + """ + + new_mat = empty(self.shape).assign(self) + + if include_host and self.mat.on_host: + new_mat.numpy_array = self.numpy_array.copy() + new_mat.mat.data_host = \ + new_mat.numpy_array.ctypes.data_as(ct.POINTER(ct.c_double)) + new_mat.mat.on_host = 1 + + return new_mat + + def assign(self, val): + """Assign val to self, where val can be a scalar or a CUDAMatrix + with the same dimensions as self. """ + + if isinstance(val, CUDAMatrix): + err_code = _cudamat.copy_on_device(val.p_mat, self.p_mat) + elif isinstance(val, (int, float)): + err_code = _cudamat.assign_scalar(self.p_mat, ct.c_double(val)) + else: + raise ValueError("Assigned value must be of type" + "CUDAMatrix, int, or float.") + + if err_code: + raise generate_exception(err_code) + + return self + + def free_device_memory(self): + """ + Free memory used up by the matrix on the GPU. + """ + + err_code = _cudamat.free_device_memory(self.p_mat) + if err_code: + raise generate_exception(err_code) + + def set_trans(self, is_trans): + """ + Set the transposedness flag to is_trans. + """ + + _cudamat.set_transpose(self.p_mat, ct.c_int(1 * is_trans)) + + def slice(self, first_col, last_col, include_host=False): + """ + Creates a view into a consecutive range of columns of an existing + matrix on GPU. If include_host is set to True, also creates a view + into the CPU copy of the matrix (i.e., the numpy_array). + """ + mat = cudamat() + + if self.mat.size[0] == 1 or self.mat.size[1] == 1: + err_code = _cudamat.get_vector_slice(self.p_mat, + ct.pointer(mat), + ct.c_int(first_col), + ct.c_int(last_col)) + else: + err_code = _cudamat.get_slice(self.p_mat, + ct.pointer(mat), + ct.c_int(first_col), + ct.c_int(last_col)) + + if err_code: + raise generate_exception(err_code) + + new_mat = CUDAMatrix(mat) + + try: + new_mat.sliceof = self.sliceof + except: + new_mat.sliceof = self + + # reproduce the slice on the host as well (if requested) + if include_host and self.mat.on_host: + new_mat.numpy_array = self.numpy_array[:, first_col:last_col] + new_mat.mat.data_host = \ + new_mat.numpy_array.ctypes.data_as(ct.POINTER(ct.c_double)) + new_mat.mat.on_host = 1 + + return new_mat + + def get_col_slice(self, first_col, last_col, target=None): + """ + Get the columns with indices first_col through last_col. If a target + is provided, columns are copied into the target. Otherwise, returns a + view into the existing memory on GPU. + """ + col_slice = self.slice(first_col, last_col) + + if target: + target.assign(col_slice) + return target + else: + return col_slice + + def set_col_slice(self, first_col, last_col, mat): + """ + Assign the contents of mat to the columns with indices first_col + through last_col. + """ + self.slice(first_col, last_col).assign(mat) + + return self + + def get_row_slice(self, start, end, target=None): + """ + Get the rows with indices start through end. If target is not provided + memory for a new matrix will be allocated. 
+ """ + + width = self.shape[1] + + if not target: + target = empty((end-start, width)) + + err_code = _cudamat.get_row_slice(self.p_mat, + target.p_mat, + ct.c_int(start), + ct.c_int(end)) + if err_code: + raise generate_exception(err_code) + + return target + + def set_row_slice(self, start, end, mat): + """ + Assign the contents of mat to the rows with indices start through end. + """ + + err_code = _cudamat.set_row_slice(mat.p_mat, self.p_mat, + ct.c_int(start), ct.c_int(end)) + if err_code: + raise generate_exception(err_code) + + return self + + def transpose(self, target=None): + """ + Return a transposed copy of the matrix. + """ + if not target: + target = empty((self.shape[1], self.shape[0])) + + err_code = _cudamat.copy_transpose(self.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def fill_with_rand(self): + """ + Fill matrix on the GPU with random numbers drawn from the uniform + distribution over the (0,1) interval. + """ + + err_code = _cudamat.fill_with_rand(CUDAMatrix.rnd_state_p, self.p_mat) + if err_code: + raise generate_exception(err_code) + + return self + + def fill_with_randn(self): + """ + Fill matrix on the GPU with random numbers drawn from + the standard normal distribution. + """ + + err_code = _cudamat.fill_with_randn(CUDAMatrix.rnd_state_p, self.p_mat) + if err_code: + raise generate_exception(err_code) + + return self + + def add_col_vec(self, vec, target=None): + """ + Add vector vec to every column of the matrix. If a target is provided, + it is used to store the result instead of self. + """ + + if not target: + target = self + + err_code = _cudamat.add_col_vec(self.p_mat, vec.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def add_col_mult(self, vec, mult, target=None): + """ + Add a multiple of vector vec to every column of the matrix. If a target + is provided, it is used to store the result instead of self. + """ + + if not target: + target = self + + err_code = _cudamat.add_col_mult(self.p_mat, vec.p_mat, + target.p_mat, ct.c_double(mult)) + if err_code: + raise generate_exception(err_code) + + return target + + def add_row_vec(self, vec, target=None): + """ + Add vector vec to every row of the matrix. If a target is provided, + it is used to store the result instead of self. + """ + + if not target: + target = self + + err_code = _cudamat.add_row_vec(self.p_mat, vec.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def mult_by_col(self, vec, target=None): + """ + Multiply vector vec into every column of the matrix. If a target is + provided, it is used to store the result instead of self. + """ + + if not target: + target = self + + err_code = _cudamat.mult_by_col_vec(self.p_mat, vec.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def mult_by_row(self, vec, target=None): + """ + Multiply vector vec into every row of the matrix. If a target is + provided, it is used to store the result instead of self. + """ + + if not target: + target = self + + err_code = _cudamat.mult_by_row_vec(self.p_mat, vec.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def div_by_col(self, vec, target=None): + """ + Divide every column of the matrix by vector vec. If a target is + provided, it is used to store the result instead of self. 
+ """ + + if not target: + target = self + + err_code = _cudamat.divide_by_col_vec(self.p_mat, vec.p_mat, + target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def div_by_row(self, vec, target=None): + """ + Divide every row of the matrix by vector vec. If a target is + provided, it is used to store the result instead of self. + """ + + if not target: + target = self + + err_code = _cudamat.divide_by_row_vec(self.p_mat, vec.p_mat, + target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def sum(self, axis, target=None, mult=1.): + """ + Sum the matrix along the given dimension, where 0 represents the leading + dimension and 1 represents the non-leading dimension. If a target is + not provided, a new vector is created for storing the result. The result + is multiplied by the given factor mult (defaults to 1). + """ + + return sum(self, axis, target, mult) + + def mean(self, axis, target=None): + """ + Compute the mean of the matrix along the given dimension, where 0 + represents the leading dimension and 1 represents the non-leading + dimension. If a target is not provided, a new vector is created for + storing the result. + """ + + return mean(self, axis, target) + + def add_sums(self, mat, axis, mult=1., beta=1.): + """ + Add a multiple of the sums of the matrix mat along the given dimension + to self. Self is scaled by beta before adding anything. + """ + + m = _cudamat.get_leading_dimension(mat.p_mat) + n = _cudamat.get_nonleading_dimension(mat.p_mat) + + if axis == 0: + # sum along leading dimension + check_ones_matrix(m) + left = CUDAMatrix.ones.slice(0, m) + left.set_trans(True) + right = mat + + elif axis == 1: + # sum along non-leading dimension + left = mat + check_ones_matrix(n) + right = CUDAMatrix.ones.slice(0, n) + + err_code = _cudamat.dot(left.p_mat, right.p_mat, self.p_mat, + ct.c_double(beta), ct.c_double(mult)) + if err_code: + raise generate_exception(err_code) + + return self + + def less_than(self, val, target=None): + """ + Perform the operation target = 1. * (self < val), + where val can be a matrix or a scalar. + """ + + if not target: + target = self + + if isinstance(val, (int, float)): + err_code = _cudamat.less_than_scalar(self.p_mat, ct.c_double(val), + target.p_mat) + else: + err_code = _cudamat.less_than(self.p_mat, val.p_mat, target.p_mat) + + if err_code: + raise generate_exception(err_code) + + return target + + def greater_than(self, val, target=None): + """ + Perform the operation target = 1. * (self > val), + where val can be a matrix or a scalar. + """ + + if not target: + target = self + + if isinstance(val, (int, float)): + err_code = _cudamat.greater_than_scalar(self.p_mat, + ct.c_double(val), + target.p_mat) + else: + err_code = _cudamat.greater_than(self.p_mat, val.p_mat, + target.p_mat) + + if err_code: + raise generate_exception(err_code) + + return target + + def equals(self, val, target=None): + """ + Perform the operation target = 1. * (self == val), + where val can be a matrix or a scalar. + """ + + if not target: + target = self + + if isinstance(val, (int, float)): + err_code = _cudamat.equals_scalar(self.p_mat, ct.c_double(val), + target.p_mat) + else: + err_code = _cudamat.equals(self.p_mat, val.p_mat, target.p_mat) + + if err_code: + raise generate_exception(err_code) + + return target + + def minimum(self, val, target=None): + """ + Perform the element-wise operation target = min(self, val), where + val can be a matrix or a scalar. 
+ """ + + if not target: + target = self + + if isinstance(val, (int, float)): + err_code = _cudamat.minimum_scalar(self.p_mat, ct.c_double(val), + target.p_mat) + else: + err_code = _cudamat.minimum(self.p_mat, val.p_mat, target.p_mat) + + if err_code: + raise generate_exception(err_code) + + return target + + def maximum(self, val, target=None): + """ + Perform the element-wise operation target = max(self, val), where + val can be a matrix or a scalar. + """ + + if not target: + target = self + + if isinstance(val, (int, float)): + err_code = _cudamat.maximum_scalar(self.p_mat, ct.c_double(val), + target.p_mat) + else: + err_code = _cudamat.maximum(self.p_mat, val.p_mat, target.p_mat) + + if err_code: + raise generate_exception(err_code) + + return target + + def min(self, axis, target=None): + """ + Find the minimum value along the given dimension, where 0 represents the + leading dimension and 1 represents the non-leading dimension. If a + target is not prvided, a new vector is created for storing the result. + """ + + m, n = self.shape + + if axis == 0: + if not target: + target = empty((1, n)) + + elif axis == 1: + if not target: + target = empty((m, 1)) + + err_code = _cudamat.min_by_axis(self.p_mat, target.p_mat, + ct.c_int(axis)) + if err_code: + raise generate_exception(err_code) + + return target + + def max(self, axis, target=None): + """ + Find the maximum value along the given dimension, where 0 represents the + leading dimension and 1 represents the non-leading dimension. If a + target is not prvided, a new vector is created for storing the result. + """ + + m, n = self.shape + + if axis == 0: + if not target: + target = empty((1, n)) + + elif axis == 1: + if not target: + target = empty((m, 1)) + + err_code = _cudamat.max_by_axis(self.p_mat, target.p_mat, + ct.c_int(axis)) + if err_code: + raise generate_exception(err_code) + + return target + + def argmin(self, axis, target=None): + """ + Find the index of the minimum value along the given dimension, where 0 + represents the leading dimension and 1 represents the non-leading + dimension. If a target is not provided, a new vector is created for + storing the result. + """ + + m, n = self.shape + + if axis == 0: + if not target: + target = empty((1, n)) + + elif axis == 1: + if not target: + target = empty((m, 1)) + + err_code = _cudamat.argmin_by_axis(self.p_mat, target.p_mat, + ct.c_int(axis)) + if err_code: + raise generate_exception(err_code) + + return target + + def argmax(self, axis, target=None): + """ + Find the index of the maximum value along the given dimension, where 0 + represents the leading dimension and 1 represents the non-leading + dimension. If a target is not provided, a new vector is created for + storing the result. + """ + + m, n = self.shape + + if axis == 0: + if not target: + target = empty((1, n)) + + elif axis == 1: + if not target: + target = empty((m, 1)) + + err_code = _cudamat.argmax_by_axis(self.p_mat, target.p_mat, + ct.c_int(axis)) + if err_code: + raise generate_exception(err_code) + + return target + + def sign(self, target=None): + """ + Find the sign of each element of the matrix. + """ + + if not target: + target = empty((self.mat.size[0], self.mat.size[1])) + + err_code = _cudamat.sign(self.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def apply_sigmoid(self, target=None): + """ + Apply the logistic sigmoid to each element of the matrix. 
+ """ + + return sigmoid(self, target) + + def apply_tanh(self, target=None): + """ + Apply the tanh to each element of the matrix. + """ + + return tanh(self, target) + + def apply_soft_threshold(self, alpha, target=None): + """ + Apply the soft threshold function to each element of the matrix: + + x = sign(x) * max(0, abs(x) - alpha) + """ + + return soft_threshold(self, alpha, target) + + def reciprocal(self, target=None): + """ + Find the reciprocal of each element of the matrix. + """ + + if not target: + target = self + + err_code = _cudamat.reciprocal(self.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def dot(self, mat2, target=None): + """ + Multiply the matrix by mat2 from the right. + """ + + return dot(self, mat2, target) + + def add_dot(self, m1, m2, mult=1., beta=1.): + """ + Add the dot product of m1 and m2 to the matrix, scaled by mult. + Self is scaled by beta before adding anything. + """ + + err_code = _cudamat.dot(m1.p_mat, m2.p_mat, self.p_mat, + ct.c_double(beta), ct.c_double(mult)) + if err_code: + raise generate_exception(err_code) + + return self + + def subtract_dot(self, m1, m2, mult=1., beta=1.): + """ + Subtract the dot product of m1 and m2 from the matrix, scaled by mult. + Self is scaled by beta before subtracting anything. + """ + + return self.add_dot(m1, m2, mult=-1. * mult, beta=beta) + + def add_mult(self, mat2, alpha=1.): + """ + Add multiple of mat2 to the matrix. + """ + + err_code = _cudamat.add_mult(self.p_mat, mat2.p_mat, ct.c_double(alpha)) + if err_code: + raise generate_exception(err_code) + + return self + + def subtract_mult(self, mat2, alpha=1.): + """ + Subtract a multiple of mat2 from the matrix. + """ + + err_code = _cudamat.add_mult(self.p_mat, mat2.p_mat, + ct.c_double(-1. * alpha)) + if err_code: + raise generate_exception(err_code) + + return self + + def add(self, val, target=None): + """Add val to self, where val can be a scalar or a CUDAMatrix with the + same dimensions as self. """ + + if not target: + target = self + + if isinstance(val, CUDAMatrix): + err_code = _cudamat.add_elementwise(self.p_mat, val.p_mat, + target.p_mat) + elif isinstance(val, (int, float)): + err_code = _cudamat.add_scalar(self.p_mat, ct.c_double(val), + target.p_mat) + else: + raise ValueError("Value must be of type CUDAMatrix, int, or float.") + + if err_code: + raise generate_exception(err_code) + + return target + + def subtract(self, val, target=None): + """Subtract val from self, where val can be a scalar or a CUDAMatrix with + the same dimensions as self. """ + + if not target: + target = self + + if isinstance(val, CUDAMatrix): + err_code = _cudamat.subtract_elementwise(self.p_mat, val.p_mat, + target.p_mat) + elif isinstance(val, (int, float)): + err_code = _cudamat.add_scalar(self.p_mat, ct.c_double(-1*val), + target.p_mat) + else: + raise ValueError("Value must be of type CUDAMatrix, int, or float.") + + if err_code: + raise generate_exception(err_code) + + return target + + def divide(self, val, target=None): + """Divide self by val, where val can be a scalar or + a CUDAMatrix with the same dimensions as self. 
""" + + if not target: + target = self + + if isinstance(val, CUDAMatrix): + err_code = _cudamat.divide_elementwise(self.p_mat, val.p_mat, + target.p_mat) + elif isinstance(val, (int, float)): + err_code = _cudamat.divide_by_scalar(self.p_mat, ct.c_double(val), + target.p_mat) + else: + raise ValueError("Value must be of type CUDAMatrix, int, or float.") + + if err_code: + raise generate_exception(err_code) + + return target + + def mult(self, val, target=None): + """Multiply self by val, where val can be a scalar or a CUDAMatrix with + the same dimensions as self. """ + + if not target: + target = self + + if isinstance(val, CUDAMatrix): + err_code = _cudamat.mult_elementwise(self.p_mat, val.p_mat, + target.p_mat) + elif isinstance(val, (int, float)): + err_code = _cudamat.mult_by_scalar(self.p_mat, ct.c_double(val), + target.p_mat) + else: + raise ValueError("Value must be of type CUDAMatrix, int, or float.") + + if err_code: + raise generate_exception(err_code) + + return target + + @deprecated + def assign_scalar(self, alpha): + """ + Assign scalar alpha to every element of the matrix. + """ + + err_code = _cudamat.assign_scalar(self.p_mat, ct.c_double(alpha)) + if err_code: + raise generate_exception(err_code) + + return self + + @deprecated + def mult_by_scalar(self, alpha, target=None): + """ + Multiply the matrix by a scalar. + """ + + if not target: + target = self + + err_code = _cudamat.mult_by_scalar(self.p_mat, ct.c_double(alpha), + target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + @deprecated + def div_by_scalar(self, alpha, target=None): + """ + Divide the matrix by a scalar. + """ + + if not target: + target = self + + err_code = _cudamat.divide_by_scalar(self.p_mat, ct.c_double(alpha), + target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + @deprecated + def add_scalar(self, alpha, target=None): + """ + Increment the matrix by a scalar. + """ + + if not target: + target = self + + err_code = _cudamat.add_scalar(self.p_mat, ct.c_double(alpha), + target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + def euclid_norm(self): + """ + Returns the L2 norm of the matrix flattened to a vector. + """ + err_code = ct.c_int(0) + res = _cudamat.euclid_norm(self.p_mat, ct.byref(err_code)) + + if err_code: + raise generate_exception(err_code.value) + + return res + + def manhattan_norm(self): + """ + Returns the L1 norm of the matrix flattened to a vector. + """ + err_code = ct.c_int(0) + res = _cudamat.manhattan_norm(self.p_mat, ct.byref(err_code)) + + if err_code: + raise generate_exception(err_code.value) + + return res + + def allfinite(self): + """ + Checks if all entries in this matrix are finite, i.e., there is no + NaN and no positive or negative infinity. + """ + # Caveat: For a very large matrix of very large finite numbers, the + # manhattan norm may overflow and allfinite() may return False. + return np.isfinite(self.manhattan_norm()) + + def select_columns(self, indices, target): + """ + Copies some columns of self into target. + <indices> must be a row vector. Its elements are float64's representing + integers, e.g. "34.0" means the integer "34". + After this call, for all r,c, target[r,c]=self[r,indices[c]]. + This returns target. + Negative indices are interpreted in the usual Python way: all + elements of <indices> had better be in the range + [-self.shape[1], self.shape[1]-1]. 
+
+    def select_columns(self, indices, target):
+        """
+        Copies some columns of self into target.
+        <indices> must be a row vector. Its elements are float64's representing
+        integers, e.g. "34.0" means the integer "34".
+        After this call, for all r,c, target[r,c]=self[r,indices[c]].
+        This returns target.
+        Negative indices are interpreted in the usual Python way: all
+        elements of <indices> must be in the range
+        [-self.shape[1], self.shape[1]-1].
+        This does bounds checking, but out of bounds indices do not raise an
+        exception (because the programmer was lazy). Instead, they result
+        in NaN values in <target>.
+        """
+
+        err_code = _cudamat.selectRows(self.p_mat, target.p_mat, indices.p_mat)
+
+        if err_code:
+            raise generate_exception(err_code)
+
+        return target
+
+    def set_selected_columns(self, indices, source):
+        """
+        Copies all columns of source into some columns of self.
+        <indices> must be a row vector. Its elements are float64's representing
+        integers, e.g. "34.0" means the integer "34". After this call, for all
+        r,c, self[r,indices[c]]=source[r,c]. This returns self.
+        Negative indices are interpreted in the usual Python way: all elements
+        of <indices> must be in the range
+        [-self.shape[1], self.shape[1]-1].
+        This does bounds checking, but out of bounds indices do not raise an
+        exception (because the programmer was lazy). Instead, they result in
+        NaN values in <self>.
+        """
+
+        err_code = _cudamat.setSelectedRows(self.p_mat, source.p_mat,
+                                            indices.p_mat)
+        if err_code:
+            raise generate_exception(err_code)
+
+        return self
+
+
+def empty(shape):
+    """
+    Creates and returns a new CUDAMatrix with the given shape.
+    """
+
+    mat = cudamat()
+    err_code = _cudamat.init_empty(ct.pointer(mat), ct.c_int(shape[0]),
+                                   ct.c_int(shape[1]))
+
+    if err_code:
+        raise generate_exception(err_code)
+
+    return CUDAMatrix(mat)
+
+
+def check_ones_matrix(min_size):
+    if min_size > CUDAMatrix.ones.shape[0]:
+        raise CUDAMatException(
+            'Not enough memory allocated for reduction. '
+            '({} needed, {} actual), use cudamat.init() '
+            'to allocate more'.format(min_size, CUDAMatrix.ones.shape[0]))
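select_columns gathers columns by index, with the indices passed as a 1 x k row vector of float64 values (2.0 means column 2) and negative values counting from the end, as described in the docstring above. Sketch (same assumptions):

    import numpy as np
    import cudamat as cm

    cm.cublas_init()
    src = cm.CUDAMatrix(np.arange(8.0).reshape(2, 4))
    idx = cm.CUDAMatrix(np.array([[0.0, -1.0]]))   # first and last columns
    out = cm.empty((2, 2))
    src.select_columns(idx, out)
    print(out.asarray())
    cm.cublas_shutdown()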
+ """ + + if not target: + m = _cudamat.get_leading_dimension(m1.p_mat) + n = _cudamat.get_nonleading_dimension(m2.p_mat) + + target = empty((m, n)) + + err_code = _cudamat.dot(m1.p_mat, m2.p_mat, + target.p_mat, ct.c_double(beta), + ct.c_double(alpha)) + if err_code: + raise generate_exception(err_code) + + return target + + +def vdot(m1, m2): + """ + Compute the vector dot product of matrices m1 and m2. + """ + + err_code = ct.c_int(0) + res = _cudamat.vdot(m1.p_mat, m2.p_mat, ct.byref(err_code)) + + if err_code: + raise generate_exception(err_code.value) + + return res + + +def sigmoid(mat, target=None): + """ + Apply the logistic sigmoid to each element of the matrix mat. + """ + + if not target: + target = mat + + err_code = _cudamat.apply_sigmoid(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def tanh(mat, target=None): + """ + Apply the tanh to each element of the matrix mat. + """ + + if not target: + target = mat + + err_code = _cudamat.apply_tanh(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def soft_threshold(mat, alpha, target=None): + """ + Apply the soft threshold function to each element of the matrix: + + mat = sign(mat) * max(0, abs(mat) - alpha) + """ + + if not target: + target = mat + + err_code = _cudamat.apply_soft_threshold(mat.p_mat, ct.c_double(alpha), + target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def abs(mat, target=None): + """ + Apply abs to each element of the matrix mat. + """ + + if not target: + target = mat + + err_code = _cudamat.apply_abs(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def log_1_plus_exp(mat, target=None): + """ + Apply log(1+exp(x)) to each element of the matrix mat. + """ + + if not target: + target = mat + + err_code = _cudamat.apply_log_1_plus_exp(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def log(mat, target=None): + """ + Find the natural logarithm of each element of the matrix mat. + """ + + if not target: + target = mat + + err_code = _cudamat.apply_log(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def exp(mat, target=None): + """ + Apply the exponential function to each element of the matrix mat. + """ + + if not target: + target = mat + + err_code = _cudamat.apply_exp(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def gamma(mat, target=None): + """ + Apply the gamma function to each element of the matrix mat. + """ + + if not target: + target = mat + + err_code = _cudamat.apply_gamma(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def lgamma(mat, target=None): + """ + Apply the log gamma function to each element of the matrix mat. + """ + + if not target: + target = mat + + err_code = _cudamat.apply_lgamma(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def sqrt(mat, target=None): + """ + Compute the square root of each element of the matrix mat. 
+ """ + + if not target: + target = mat + + err_code = _cudamat.apply_sqrt(mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def pow(mat, p, target=None): + """ + If p is a scalar, compute the 'p'th power of each element of the matrix mat, + otherwise raise each element of the matrix mat to the power given by the + corresponding element of the matrix p. + """ + + if not target: + target = mat + + if isinstance(p, CUDAMatrix): + err_code = _cudamat.apply_pow_matrix(mat.p_mat, p.p_mat, target.p_mat) + elif isinstance(p, (int, float)): + err_code = _cudamat.apply_pow(mat.p_mat, ct.c_double(p), target.p_mat) + else: + raise ValueError("Value must be of type CUDAMatrix, int, or float.") + + if err_code: + raise generate_exception(err_code) + + return target + + +def where(condition_mat, if_mat, else_mat, target=None): + """ + For each element i, j, store if_math[i, j] in target[i,j] if + condition_mat[i, j] is True, and else_mat[i, j] otherwise. + """ + if not target: + target = condition_mat + + err_code = _cudamat.where(condition_mat.p_mat, if_mat.p_mat, + else_mat.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def correlate(mat, kernel, target=None): + """ + Cross-correlate a matrix with a kernel matrix. + The kernel matrix is centered over each element of the matrix mat. + Width and height of the kernel matrix must be an odd integer. + If a target is not provided, a new matrix is created for storing the result. + Note that this function cannot operate in-place. + """ + if not target: + m = _cudamat.get_leading_dimension(mat.p_mat) + n = _cudamat.get_nonleading_dimension(mat.p_mat) + + target = empty((m, n)) + + err_code = _cudamat.correlate(mat.p_mat, kernel.p_mat, target.p_mat) + if err_code: + raise generate_exception(err_code) + + return target + + +def cuda_sync_threads(): + _cudamat.cuda_sync_threads() + + +def reformat(array, copy=True): + """ + Returns array as a float64 array in FORTRAN order. + If copy is set to False, the array will only be copied if it is not already + in the correct format. + """ + return np.array(array, dtype=np.float64, order='F', copy=copy) + + +def cuda_set_device(dev_id): + """ + Selects the CUDA device with the given ID. + """ + + err_code = _cudamat.cuda_set_device(ct.c_int(dev_id)) + if err_code: + raise generate_exception(err_code) + + +def cublas_init(max_ones=(1024*256)): + """ + Initialize Cublas. + + 'max_ones' is an optional argument that determines the length of + the largest sum that can be computed using Cublas matrix multiply. + A larger value causes more memory to be allocated for this purpose. + """ + + err = _cudamat.cublas_init() + if err: + raise CUDAMatException('error initializing CUBLAS: (err=%u)' % err) + CUDAMatrix.ones = empty((max_ones, 1)).assign(1.0) + +init = cublas_init + + +def cublas_shutdown(): + """ + Shut down Cublas. + """ + + CUDAMatrix.ones = 0 + _cudamat.cublas_shutdown() + +shutdown = cublas_shutdown |