import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
def bw_scott(sigma, n=''):
    """
    Scott's rule for bandwidth selection.

    The bandwidth is computed as ``sigma * (4 / (3 * n)) ** 0.2``.

    Parameters
    ----------
    sigma : float
        The standard deviation of the data.
    n : int
        The number of data points. The signature carries a placeholder
        default, but a numeric value must always be supplied.

    Returns
    -------
    bw : float
        The bandwidth of the kernel.

    Raises
    ------
    TypeError
        If `n` is omitted (left at its placeholder default), is ``None``
        or is a string.
    """
    # The placeholder default cannot take part in the division below; fail
    # with an explicit message instead of an obscure "float / str" error.
    if n is None or isinstance(n, str):
        raise TypeError(
            "bw_scott() requires the number of data points `n` as a number"
        )
    return sigma * (4.0 / 3.0 / n) ** 0.2
def opt_bandwidth(variable, x_test, bw):
    """
    Select the best kernel bandwidth by cross-validated grid search.

    A Gaussian ``KernelDensity`` estimator is tuned over the candidate
    values in `bw` with ``GridSearchCV`` (default settings), fitted on
    `variable`. The best estimator found is then evaluated on `x_test`.

    Parameters
    ----------
    variable : Numpy array
        A N x 1 dimension numpy array. The data to apply the kernel
        density estimation.
    x_test : Numpy array
        Test data to get the density distribution. It is only used for the
        log-density values that are printed; it does not influence the
        selected bandwidth.
    bw : list of float
        The bandwidth of the kernels to be tested.

    Returns
    -------
    kde.bandwidth : float
        The optimal bandwidth of the kernel, i.e. the ``bandwidth`` of the
        best estimator found by the grid search.

    Notes
    -----
    This function prints to standard output: the values returned by
    ``score_samples(x_test)`` for the best estimator and, when more than
    one candidate bandwidth was supplied, the selected bandwidth.
    """
    search = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bw})
    search.fit(variable)
    kde = search.best_estimator_

    # score_samples returns one value per row of x_test, so an array is
    # printed here rather than a single number.
    log_dens = kde.score_samples(x_test)
    print("the log-likelihood of data: ", log_dens)

    # With a single candidate there is nothing to choose between, so the
    # "optimal" value is only reported when several were tried.
    if len(bw) > 1:
        print("optimal bandwidth: {:.4f}".format(kde.bandwidth))
    return kde.bandwidth
def log_likelihood(pdf):
    r"""
    Calculate the average log-likelihood of the given density values.

    The value returned is:

    .. math::

        LL = \frac{1}{N}\sum_{i=1}^{N} \log f(x_i)

    where :math:`f(x_i)` are the entries of `pdf`.

    Parameters
    ----------
    pdf : Numpy array
        The probability density function evaluated at the data points.

    Returns
    -------
    LL : float
        The log-likelihood of the given probability density function,
        averaged over its entries.
    """
    # Natural logarithm of every density value, then the arithmetic mean.
    LL = np.average(np.log(pdf))
    return LL