""" Simple example datasets for OT """ import numpy as np import scipy as sp def get_1D_gauss(n,m,s): """return a 1D histogram for a gaussian distribution (n bins, mean m and std s) Parameters ---------- n : int number of bins in the histogram m : float mean value of the gaussian distribution s : float standard deviaton of the gaussian distribution Returns ------- h : np.array (n,) 1D histogram for a gaussian distribution """ x=np.arange(n,dtype=np.float64) h=np.exp(-(x-m)**2/(2*s**2)) return h/h.sum() def get_2D_samples_gauss(n,m,sigma): """return n samples drawn from 2D gaussian N(m,sigma) Parameters ---------- n : int number of bins in the histogram m : np.array (2,) mean value of the gaussian distribution sigma : np.array (2,2) covariance matrix of the gaussian distribution Returns ------- X : np.array (n,2) n samples drawn from N(m,sigma) """ if np.isscalar(sigma): sigma=np.array([sigma,]) if len(sigma)>1: P=sp.linalg.sqrtm(sigma) res= np.random.randn(n,2).dot(P)+m else: res= np.random.randn(n,2)*np.sqrt(sigma)+m return res def get_data_classif(dataset,n,nz=.5,theta=0,**kwargs): """ dataset generation for classification problems Parameters ---------- dataset : str type of classification problem (see code) n : int number of training samples nz : float noise level (>0) Returns ------- X : np.array (n,d) n observation of size d y : np.array (n,) labels of the samples """ if dataset.lower()=='3gauss': y=np.floor((np.arange(n)*1.0/n*3))+1 x=np.zeros((n,2)) # class 1 x[y==1,0]=-1.; x[y==1,1]=-1. x[y==2,0]=-1.; x[y==2,1]=1. x[y==3,0]=1. ; x[y==3,1]=0 x[y!=3,:]+=1.5*nz*np.random.randn(sum(y!=3),2) x[y==3,:]+=2*nz*np.random.randn(sum(y==3),2) elif dataset.lower()=='3gauss2': y=np.floor((np.arange(n)*1.0/n*3))+1 x=np.zeros((n,2)) y[y==4]=3 # class 1 x[y==1,0]=-2.; x[y==1,1]=-2. x[y==2,0]=-2.; x[y==2,1]=2. x[y==3,0]=2. ; x[y==3,1]=0 x[y!=3,:]+=nz*np.random.randn(sum(y!=3),2) x[y==3,:]+=2*nz*np.random.randn(sum(y==3),2) elif dataset.lower()=='gaussrot' : rot=np.array([[np.cos(theta),np.sin(theta)],[-np.sin(theta),np.cos(theta)]]) m1=np.array([-1,1]) m2=np.array([1,-1]) y=np.floor((np.arange(n)*1.0/n*2))+1 n1=np.sum(y==1) n2=np.sum(y==2) x=np.zeros((n,2)) x[y==1,:]=get_2D_samples_gauss(n1,m1,nz) x[y==2,:]=get_2D_samples_gauss(n2,m2,nz) x=x.dot(rot) else: x=np.array(0) y=np.array(0) print("unknown dataset") return x,y.astype(int)