# Source code for mdcgenpy.clusters.generate

from __future__ import division
import math
import numpy as np
import scipy.linalg


def generate_mass(clus_cfg):
    """
    Get the number of samples to generate for each cluster.

    The relative cluster weights are taken from ``clus_cfg.k`` when it is a
    list, and drawn uniformly at random otherwise. They are scaled so that
    they add up to ``clus_cfg.n_samples``; afterwards samples are moved from
    the largest cluster to the smallest one until every cluster holds at least
    the minimum number of samples.

    Args:
        clus_cfg (clusters.DataConfig): Configuration. The attributes read
            here are ``k``, ``n_clusters``, ``n_samples``, ``min_samples`` and
            ``ki_coeff``.

    Returns:
        np.array: Float array with one entry per cluster, where each entry is
        the number of samples to generate in the corresponding cluster.

    Raises:
        ValueError: If there are no cluster weights or they sum to zero, or if
            the minimum cluster size cannot be met with the available samples
            (the rebalancing loop below would otherwise never terminate).
    """
    if isinstance(clus_cfg.k, list):
        mass = np.array(clus_cfg.k)
    else:
        mass = np.random.uniform(0, 1, clus_cfg.n_clusters)

    total_mass = mass.sum()
    if mass.size == 0 or total_mass == 0:
        raise ValueError("cluster masses must be non-empty and must not sum to zero")

    # scale the weights to whole numbers of samples (rounding down)
    mass = np.floor(clus_cfg.n_samples * mass / total_mass).astype(int)
    abs_mass = mass.sum()
    if abs_mass < clus_cfg.n_samples:
        # if samples are unassigned, send them to the cluster with least samples
        mass[np.argmin(mass)] += clus_cfg.n_samples - abs_mass

    # guarantee there are enough samples in each cluster
    if clus_cfg.min_samples <= 0:
        min_mass = int(round(clus_cfg.n_samples / (clus_cfg.ki_coeff * clus_cfg.n_clusters)))
    else:
        # round up so the integer bookkeeping below can always reach the minimum
        min_mass = int(math.ceil(clus_cfg.min_samples))

    if min_mass * mass.size > mass.sum():
        raise ValueError(
            "cannot give each of the {} clusters at least {} samples with only {} samples".format(
                mass.size, min_mass, mass.sum()))

    # repeatedly top up the smallest cluster at the expense of the largest one;
    # the feasibility check above guarantees the largest cluster always has
    # more than `min_mass` samples, so the total shortfall shrinks every pass
    while True:
        min_ind = np.argmin(mass)
        if mass[min_ind] >= min_mass:
            break
        max_ind = np.argmax(mass)
        extra = min_mass - mass[min_ind]
        mass[max_ind] -= extra
        mass[min_ind] += extra

    return mass.astype(dtype=float)
def locate_centroids(clus_cfg):
    """
    Generate locations for the centroids of the clusters.

    A regular grid is laid over the leading features, with
    ``clus_cfg._cmax[j]`` cells along feature ``j``. Features are added to the
    grid until it has more than
    ``2 * n_clusters + outliers / n_clusters`` cells. The cell ids are
    shuffled and the first ``n_clusters`` of them are decoded into one
    coordinate per grid feature. For the remaining features a cell is drawn
    at random. Every coordinate is finally perturbed by uniform noise scaled
    by ``clus_cfg.compactness_factor``.

    Args:
        clus_cfg (clusters.DataConfig): Configuration.

    Returns:
        tuple: ``(centroids, locis, idx)`` where ``centroids`` is the
        ``(n_clusters, n_feats)`` matrix with the positions of the centroids,
        ``locis`` is the shuffled array of all grid cell ids and ``idx`` is
        the number of leading features that are covered by the grid.
    """
    cmax = clus_cfg._cmax
    k = clus_cfg.n_clusters
    centroids = np.zeros((k, clus_cfg.n_feats))

    # extend the grid feature by feature until it has enough cells
    n_cells = 1.
    last_grid_dim = 1
    for dim, cells_in_dim in enumerate(cmax):
        n_cells *= cells_in_dim
        if n_cells > 2 * clus_cfg.n_clusters + clus_cfg.outliers / clus_cfg.n_clusters:
            last_grid_dim = dim
            break
    idx = last_grid_dim + 1

    # random order of the grid cells; the first `k` go to the clusters
    locis = np.arange(n_cells)
    np.random.shuffle(locis)

    def jitter():
        # uniform noise centred on zero, one value per cluster
        return (np.random.rand(k) - 0.5) * clus_cfg.compactness_factor

    # decode each cell id into one coordinate per grid feature
    remaining = locis[:k]
    for j in range(idx):
        digit = remaining % cmax[j]
        centroids[:, j] = (digit + 1) / (cmax[j] + 1) + jitter()
        remaining = np.floor(remaining / cmax[j])

    # features outside the grid get a randomly drawn cell
    for j in range(idx, clus_cfg.n_feats):
        cell = np.floor(cmax[j] * np.random.rand(k) + 1)
        centroids[:, j] = cell / (cmax[j] + 1) + jitter()

    return centroids, locis, idx
def generate_clusters(clus_cfg, batch_size=0):
    """
    Generate data, one batch at a time.

    Before the first batch is produced, every cluster in
    ``clus_cfg.clusters`` is given a random correlation matrix (stored as
    ``correlation_matrix``), the Cholesky factor of that matrix (stored as
    ``corr_matrix``) and, when ``cluster.rotate`` is set, a
    ``rotation_matrix``.

    Args:
        clus_cfg (clusters.DataConfig): Configuration.
        batch_size (int): Number of samples for each batch. With the default
            of ``0`` a single batch of ``clus_cfg.n_samples`` samples is
            generated.

    Yields:
        np.array: Generated samples.
        np.array: Labels for the samples, as a column of shape
        ``(len(labels), 1)``.
    """
    beta_shape = 4
    for cluster in clus_cfg.clusters:
        n_feats = clus_cfg.n_feats
        # Random symmetric matrix with ones in the diagonal, built with the
        # vine method described here:
        # http://stats.stackexchange.com/questions/2746/how-to-efficiently-generate-random-positive-semidefinite-correlation-matrices
        # `cluster.corr` bounds the magnitude of the partial correlations.
        corr = np.eye(n_feats)
        partial = np.zeros(corr.shape)
        for row in range(n_feats - 1):
            for col in range(row + 1, n_feats):
                draw = np.random.beta(beta_shape, beta_shape)
                partial[row, col] = 2 * cluster.corr * (draw - 0.5)
                value = partial[row, col]
                # turn the partial correlation into a plain correlation
                for prev in reversed(range(row)):
                    scale = np.sqrt((1 - partial[prev, col]**2) * (1 - partial[prev, row]**2))
                    value = value * scale + partial[prev, col] * partial[prev, row]
                corr[row, col] = value
                corr[col, row] = value

        # shuffle the features (same permutation on rows and columns)
        order = np.random.permutation(n_feats)
        corr = corr[order, :][:, order]
        cluster.corr_matrix = np.linalg.cholesky(corr)
        cluster.correlation_matrix = corr

        if cluster.rotate:
            cluster.rotation_matrix = get_rotation_matrix(n_feats)

    if batch_size == 0:
        batch_size = clus_cfg.n_samples
    n_batches = ((clus_cfg.n_samples - 1) // batch_size) + 1
    for batch in range(n_batches):
        already_done = batch * batch_size
        # the last batch may be smaller than the others
        n_samples = min(batch_size, clus_cfg.n_samples - already_done)
        data, labels = compute_batch(clus_cfg, n_samples)
        yield data, np.reshape(labels, (len(labels), 1))
def get_rotation_matrix(n_feats):
    """
    Draw a random square matrix with orthonormal columns.

    A matrix with entries uniform in ``[-1, 1)`` is drawn and orthonormalised
    with ``scipy.linalg.orth``. If the draw is rank deficient, ``orth``
    returns fewer columns, and a new matrix is drawn.

    Args:
        n_feats (int): Number of rows and columns of the matrix.

    Returns:
        np.array: Matrix of shape ``(n_feats, n_feats)``.
    """
    while True:
        candidate = 2 * (np.random.rand(n_feats, n_feats) - 0.5)
        basis = scipy.linalg.orth(candidate)
        # only a full rank `candidate` yields a basis of the same shape
        if basis.shape == candidate.shape:
            return basis
def compute_batch(clus_cfg, n_samples):
    """
    Generates one batch of data.

    Every sample is first assigned to a cluster or to the outliers, with
    probabilities proportional to ``clus_cfg.mass`` and ``clus_cfg.outliers``.
    Cluster samples are produced by ``cluster.generate_data``, multiplied by
    the cluster's ``corr_matrix`` (and ``rotation_matrix`` if
    ``cluster.rotate`` is set) and shifted to the cluster centroid. Outliers
    are placed in the grid cells of ``clus_cfg._locis`` that follow the first
    ``n_clusters`` entries.

    Args:
        clus_cfg (clusters.DataConfig): Configuration.
        n_samples (int): Number of samples in the batch.

    Returns:
        tuple: ``(data, labels)`` where ``data`` is the
        ``(n_samples, n_feats)`` array of generated samples and ``labels`` is
        the array with the label of each sample (``-1`` for outliers).
    """
    # get probabilities of each class; class 0 is the outliers for now (it
    # becomes -1 below). A float copy is used so `clus_cfg.mass` is never
    # modified and integer masses are accepted as well.
    weights = np.insert(np.asarray(clus_cfg.mass, dtype=float), 0, clus_cfg.outliers)
    probs = weights / weights.sum()
    labels = np.random.choice(clus_cfg.n_clusters + 1, n_samples, p=probs) - 1  # label -1 corresponds to outliers

    data = np.zeros((n_samples, clus_cfg.n_feats))

    # generate samples for each cluster
    for label in range(clus_cfg.n_clusters):
        cluster = clus_cfg.clusters[label]
        indexes = (labels == label)
        samples = int(np.count_nonzero(indexes))  # nr of samples in this cluster
        data[indexes] = cluster.generate_data(samples)
        data[indexes] = data[indexes].dot(cluster.corr_matrix)  # apply correlation to data
        # apply rotation
        if cluster.rotate:
            data[indexes] = data[indexes].dot(cluster.rotation_matrix)
        # add centroid
        data[indexes] += clus_cfg._centroids[label]
        # add noisy variables
        for d in cluster.n_noise:
            data[indexes, d] = np.random.rand(samples)

    # generate outliers
    indexes = (labels == -1)
    out = int(np.count_nonzero(indexes))
    # the grid cells that follow the first `n_clusters` entries of `_locis`
    # are reused cyclically for the outliers
    locis = clus_cfg._locis[clus_cfg.n_clusters:]
    res = locis[np.arange(out) % len(locis)]
    for j in range(clus_cfg._idx):
        cell_width = 1 / (clus_cfg._cmax[j] + 1)
        center = ((res % clus_cfg._cmax[j]) + 1) / (clus_cfg._cmax[j] + 1)
        # uniform noise of at most half a cell in either direction
        noise = cell_width * np.random.rand(out) - (1 / (2 * (clus_cfg._cmax[j] + 1)))
        data[indexes, j] = center + noise
        res = np.floor(res / clus_cfg._cmax[j])
    for j in range(clus_cfg._idx, clus_cfg.n_feats):
        cell_width = 1 / (clus_cfg._cmax[j] + 1)
        # Draw a random cell in 1..cmax, the same way `locate_centroids` does
        # for the features outside the grid. The previous form,
        # floor(cmax * (rand + 1)), produced cells in cmax..2*cmax-1 and so
        # pushed outliers beyond the range used by the centroids.
        center = np.floor(clus_cfg._cmax[j] * np.random.rand(out) + 1) / (clus_cfg._cmax[j] + 1)
        noise = cell_width * np.random.rand(out) - (1 / (2 * (clus_cfg._cmax[j] + 1)))
        data[indexes, j] = center + noise

    return data, labels