Source code for polytex.stats.bw_opt

import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV


def bw_scott(sigma, n=''):
    """
    Scott's rule for bandwidth selection.

    The bandwidth is computed as

    .. math::
        h = \\sigma \\left(\\frac{4}{3 n}\\right)^{1/5}

    Parameters
    ----------
    sigma : float
        The standard deviation of the data.
    n : int
        The number of data points. Must be strictly positive. The default
        value ``''`` is only a "not supplied" placeholder; ``n`` has to be
        passed explicitly.

    Returns
    -------
    bw : float
        The bandwidth of the kernel.

    Raises
    ------
    TypeError
        If ``n`` is not supplied (left as ``''``, or passed as ``None`` or
        any other string).
    ValueError
        If ``n`` is zero or negative.
    """
    # The placeholder default cannot be used in arithmetic; fail with a
    # message that names the missing argument instead of "float / str".
    if n is None or isinstance(n, str):
        raise TypeError(
            "bw_scott() requires the number of data points `n` "
            "as a positive number"
        )
    # n == 0 divides by zero and n < 0 raises a negative base to a
    # fractional power (a complex result for Python floats), so reject both.
    # np.any keeps array-like `n` working as it did before.
    if np.any(np.asarray(n) <= 0):
        raise ValueError("`n` must be strictly positive, got {!r}".format(n))
    return sigma * (4.0 / 3.0 / n) ** 0.2


def opt_bandwidth(variable, x_test, bw):
    """
    Select the best kernel bandwidth by cross-validated grid search.

    A Gaussian ``KernelDensity`` estimator is fitted to ``variable`` once
    per candidate in ``bw`` through ``GridSearchCV`` (with its default
    settings), and the bandwidth of the best-scoring estimator is returned.

    As a side effect the function prints the log-density of the best
    estimator evaluated at ``x_test`` and, when more than one candidate
    was given, the selected bandwidth.

    Parameters
    ----------
    variable : Numpy array
        A N x 1 dimension numpy array. The data the kernel density
        estimator is fitted to.
    x_test : Numpy array
        Points at which the log-density of the best estimator is
        evaluated. Only used for the printed report.
    bw : list of float
        The candidate bandwidths to be tested.

    Returns
    -------
    bandwidth : float
        The bandwidth of the best estimator found by the grid search.
    """
    param_grid = {'bandwidth': bw}
    search = GridSearchCV(KernelDensity(kernel='gaussian'), param_grid)
    search.fit(variable)

    best = search.best_estimator_

    # Report the log-density at the test points for the selected model.
    print("the log-likelihood of data: ", best.score_samples(x_test))

    # With a single candidate there is nothing to choose between, so the
    # "optimal" report is only printed when several were compared.
    if len(bw) > 1:
        print("optimal bandwidth: " + format(best.bandwidth, ".4f"))

    return best.bandwidth


def log_likelihood(pdf):
    """
    Compute the average log-likelihood of probability density values.

    The returned quantity is the mean of the natural logarithm of the
    given density values:

    .. math::
        LL = \\frac{1}{N}\\sum_{i=1}^{N} \\log f(x_i)

    Parameters
    ----------
    pdf : Numpy array
        The probability density values :math:`f(x_i)`.

    Returns
    -------
    LL : float
        The average log-likelihood of the given density values.
    """
    log_density = np.log(pdf)
    return np.average(log_density)