Source code for mdcgenpy.clusters.generate

from __future__ import division
import math
import numpy as np
import scipy.linalg


[docs]def generate_mass(clus_cfg):
    """
    Get the number of samples to generate for each cluster.

    Args:
        clus_cfg (clusters.DataConfig): Configuration

    Returns:
        np.array: Array with len == nr of clusters, where each entry is the number of samples in the corresponding
            to generate in the corresponding cluster.
    """
    if type(clus_cfg.k) == list:
        mass = np.array(clus_cfg.k)
    else:
        mass = np.random.uniform(0, 1, clus_cfg.n_clusters)
        total_mass = mass.sum()
        mass = np.vectorize(math.floor)(clus_cfg.n_samples * mass / total_mass)
        abs_mass = mass.sum()
        if abs_mass < clus_cfg.n_samples:  # if samples are unassigned, send them to the cluster with least samples
            min_ind = np.argmin(mass)
            mass[min_ind] += clus_cfg.n_samples - abs_mass

        # guarantee there are enough samples in each cluster
        if clus_cfg.min_samples <= 0:
            min_mass = round(clus_cfg.n_samples / (clus_cfg.ki_coeff * clus_cfg.n_clusters))
        else:
            min_mass = clus_cfg.min_samples
        need_to_add = True
        while need_to_add:
            need_to_add = False
            min_ind = np.argmin(mass)
            if mass[min_ind] < min_mass:
                max_ind = np.argmax(mass)
                extra = min_mass - mass[min_ind]
                mass[max_ind] -= extra
                mass[min_ind] += extra
                need_to_add = True

    return mass.astype(dtype=float)


[docs]def locate_centroids(clus_cfg):
    """
    Generate locations for the centroids of the clusters.

    Args:
        clus_cfg (clusters.DataConfig): Configuration.

    Returns:
        np.array: Matrix (n_clusters, n_feats) with positions of centroids.
    """
    centroids = np.zeros((clus_cfg.n_clusters, clus_cfg.n_feats))

    p = 1.
    idx = 1
    for i, c in enumerate(clus_cfg._cmax):
        p *= c
        if p > 2 * clus_cfg.n_clusters + clus_cfg.outliers / clus_cfg.n_clusters:
            idx = i
            break
    idx += 1

    locis = np.arange(p)
    np.random.shuffle(locis)
    clin = locis[:clus_cfg.n_clusters]

    # voodoo magic for obtaining centroids
    res = clin
    for j in range(idx):
        center = ((res % clus_cfg._cmax[j]) + 1) / (clus_cfg._cmax[j] + 1)
        noise = (np.random.rand(clus_cfg.n_clusters) - 0.5) * clus_cfg.compactness_factor
        centroids[:, j] = center + noise
        res = np.floor(res / clus_cfg._cmax[j])
    for j in range(idx, clus_cfg.n_feats):
        center = np.floor(clus_cfg._cmax[j] * np.random.rand(clus_cfg.n_clusters) + 1) / (clus_cfg._cmax[j] + 1)
        noise = (np.random.rand(clus_cfg.n_clusters) - 0.5) * clus_cfg.compactness_factor
        centroids[:, j] = center + noise

    return centroids, locis, idx


[docs]def generate_clusters(clus_cfg, batch_size = 0):
    """
    Generate data.

    Args:
        clus_cfg (clusters.DataConfig): Configuration.
        batch_size (int): Number of samples for each batch.

    Yields:
        np.array: Generated samples.
        np.array: Labels for the samples.
    """
    # generate correlation and rotation matrices
    for cluster in clus_cfg.clusters:
        # generate random symmetric matrix with ones in the diagonal
        # uses the vine method described here
        # http://stats.stackexchange.com/questions/2746/how-to-efficiently-generate-random-positive-semidefinite-correlation-matrices
        # using the correlation input parameter to set a threshold on the values of the correlation matrix
        corr = np.eye(clus_cfg.n_feats)
        aux = np.zeros(corr.shape)

        beta_param = 4

        for k in range(clus_cfg.n_feats - 1):
            for i in range(k + 1, clus_cfg.n_feats):
                aux[k, i] = 2 * cluster.corr * (np.random.beta(beta_param, beta_param) - 0.5)
                p = aux[k, i]
                for l in range(k - 1, -1, -1):
                    p = p * np.sqrt((1 - aux[l, i]**2) * (1 - aux[l, k]**2)) + aux[l, i] * aux[l, k]
                corr[k, i] = p
                corr[i, k] = p
        perm = np.random.permutation(clus_cfg.n_feats)
        corr = corr[perm, :][:, perm]
        cluster.corr_matrix = np.linalg.cholesky(corr)
        cluster.correlation_matrix = corr

        # rotation matrix
        if cluster.rotate:
            cluster.rotation_matrix = get_rotation_matrix(clus_cfg.n_feats)

    if batch_size == 0:
        batch_size = clus_cfg.n_samples
    for batch in range(((clus_cfg.n_samples - 1) // batch_size) + 1):
        n_samples = min(batch_size, clus_cfg.n_samples - batch * batch_size)
        data, labels = compute_batch(clus_cfg, n_samples)
        yield data, np.reshape(labels, (len(labels), 1))


[docs]def get_rotation_matrix(n_feats):
    rot_mat = 2 * (np.random.rand(n_feats, n_feats) - 0.5)
    ort = scipy.linalg.orth(rot_mat)
    if ort.shape == rot_mat.shape:  # check if `rot_mat` is full rank, so that `ort` keeps the same shape
        return ort
    else:
        return get_rotation_matrix(n_feats)


[docs]def compute_batch(clus_cfg, n_samples):
    """
    Generates one batch of data.

    Args:
        clus_cfg (clusters.DataConfig): Configuration.
        n_samples (int): Number of samples in the batch.

    Returns:
        np.array: Generated sample.
    """
    # get probabilities of each class
    mass = clus_cfg.mass
    mass = np.insert(mass, 0, clus_cfg.outliers)  # class 0 is now the outliers (this changes to -1 further down)
    mass /= mass.sum()

    labels = np.random.choice(clus_cfg.n_clusters + 1, n_samples, p=mass) - 1
    # label -1 corresponds to outliers
    data = np.zeros((n_samples, clus_cfg.n_feats))

    # generate samples for each cluster
    for label in range(clus_cfg.n_clusters):
        cluster = clus_cfg.clusters[label]
        indexes = (labels == label)
        samples = sum(indexes)  # nr of samples in this cluster
        data[indexes] = cluster.generate_data(samples)

        data[indexes] = data[indexes].dot(cluster.corr_matrix)  # apply correlation to data

        # apply rotation
        if cluster.rotate:
            data[indexes] = data[indexes].dot(cluster.rotation_matrix)

        # add centroid
        data[indexes] += clus_cfg._centroids[label]

        # add noisy variables
        for d in cluster.n_noise:
            data[indexes, d] = np.random.rand(samples)

    # generate outliers
    indexes = (labels == -1)
    out = sum(indexes)

    # voodoo magic for generating outliers
    locis = clus_cfg._locis[clus_cfg.n_clusters:]
    res = locis[np.arange(out) % len(locis)]
    for j in range(clus_cfg._idx):
        center = ((res % clus_cfg._cmax[j]) + 1) / (clus_cfg._cmax[j] + 1)
        noise = (1 / (clus_cfg._cmax[j] + 1)) * np.random.rand(out) - (1 / (2 * (clus_cfg._cmax[j] + 1)))
        data[indexes, j] = center + noise
        res = np.floor(res / clus_cfg._cmax[j])
    for j in range(clus_cfg._idx, clus_cfg.n_feats):
        center = np.floor(clus_cfg._cmax[j] * (np.random.rand(out) + 1)) / (clus_cfg._cmax[j] + 1)
        noise = (1 / (clus_cfg._cmax[j] + 1)) * np.random.rand(out) - (1 / (2 * (clus_cfg._cmax[j] + 1)))
        data[indexes, j] = center + noise

    return data, labels