# Source code for deeprob.spn.learning.splitting.entropy

# MIT License: Copyright (c) 2021 Lorenzo Loconte, Gennaro Gala, Federico Luzzi

from typing import Union, Type, List

import numpy as np

from deeprob.spn.structure.leaf import Leaf, LeafType


def entropy_cols(
    data: np.ndarray,
    distributions: List[Type[Leaf]],
    domains: List[Union[list, tuple]],
    random_state: np.random.RandomState,
    e: float = 0.3,
    alpha: float = 0.1
) -> np.ndarray:
    """
    Entropy based column splitting method.

    For each feature a histogram of its values is built, smoothed with the
    Laplace factor ``alpha`` and normalized into a probability vector. The
    base-2 Shannon entropy of that vector is then compared against ``e``.
    For continuous features the entropy is additionally divided by the
    logarithm of the number of bins.

    :param data: The data, as a two-dimensional array (samples by features).
    :param distributions: Distributions of the features.
    :param domains: Range of values of the features.
    :param random_state: The random state. It is not used by this method.
    :param e: Threshold below which the entropy is considered significant.
    :param alpha: Laplace smoothing factor to apply to the frequencies.
    :return: A partitioning of features: 1 for the features having entropy
             less than the threshold, 0 for all the others.
    :raises ValueError: If a leaf distribution is neither discrete nor continuous.
    """
    _, n_features = data.shape
    n_samples = len(data)
    partition = np.zeros(n_features, dtype=np.int64)

    # Compute entropy for each variable
    for i in range(n_features):
        leaf_type = distributions[i].LEAF_TYPE
        if leaf_type == LeafType.DISCRETE:
            # One bin per domain value, closed by a trailing edge.
            # list(...) makes tuple domains work as well as list domains.
            # NOTE(review): the trailing edge len(domain) presumably assumes
            # domain values 0, 1, ..., k - 1 -- confirm against callers.
            bins = list(domains[i]) + [len(domains[i])]
            normalize = False
        elif leaf_type == LeafType.CONTINUOUS:
            bins = 'scott'
            normalize = True
        else:
            raise ValueError("Leaves distributions must be either discrete or continuous")

        hist, _ = np.histogram(data[:, i], bins=bins)
        probs = (hist + alpha) / (n_samples + len(hist) * alpha)

        # Zero-probability bins contribute nothing to the entropy
        # (0 * log 0 := 0); skipping them avoids NaN when alpha is zero.
        positive_probs = probs[probs > 0.0]
        entropy = -np.sum(positive_probs * np.log2(positive_probs))

        # A single bin (e.g. a constant column) gives log2(1) == 0 as the
        # normalizer: keep the zero entropy instead of dividing by zero.
        if normalize and len(hist) > 1:
            entropy /= np.log2(len(hist))

        # Add to cluster if entropy is less than the threshold
        if entropy < e:
            partition[i] = 1

    return partition


def entropy_adaptive_cols(
    data: np.ndarray,
    distributions: List[Type[Leaf]],
    domains: List[Union[list, tuple]],
    random_state: np.random.RandomState,
    e: float = 0.3,
    alpha: float = 0.1,
    size: Union[int, None] = None
) -> np.ndarray:
    """
    Adaptive Entropy based column splitting method.

    It delegates to :func:`entropy_cols`, using an entropy threshold scaled by
    the fraction of the whole dataset that ``data`` represents.

    :param data: The data.
    :param distributions: Distributions of the features.
    :param domains: Range of values of the features.
    :param random_state: The random state, forwarded to :func:`entropy_cols`.
    :param e: Threshold below which the entropy is considered significant,
              before being scaled by ``len(data) / size``.
    :param alpha: Laplace smoothing factor to apply to the frequencies.
    :param size: Size of whole dataset.
    :return: A partitioning of features.
    :raises ValueError: If the size of the data is missing or not positive.
    """
    if size is None:
        raise ValueError("Missing data size for Adaptive Entropy column splitting method")
    if size <= 0:
        # A zero size would otherwise surface as a bare ZeroDivisionError below
        raise ValueError("The data size for Adaptive Entropy column splitting method must be positive")

    # Scale the threshold by the fraction of data, keeping it strictly positive
    data_fraction = len(data) / size
    adaptive_e = max(e * data_fraction, np.finfo(np.float32).eps)

    return entropy_cols(
        data, distributions, domains, random_state,
        e=adaptive_e,
        alpha=alpha
    )