"""
Implementation of the Earth Mover's Distance.
"""
import numpy as np
from scipy.optimize import linprog
from ..helpers import normalize_pmfs, numerical_test
def categorical_distances(n):
"""
Construct a categorical distances matrix.
Parameters
----------
n : int
The size of the matrix.
Returns
-------
ds : np.ndarray
The matrix of distances.
"""
return 1 - np.eye(n)
def numerical_distances(x_values, y_values):
"""
Construct matrix of distances between real values.
Parameters
----------
x_values : np.ndarray
The real values on the x dimension.
y_values : np.ndarray
The real values on the y dimension.
Returns
-------
ds : np.ndarray
The matrix of distances.
"""
xx, yy = np.meshgrid(x_values, y_values)
return abs(xx - yy)
def earth_movers_distance_pmf(x, y, distances=None):
"""
Compute the Earth Mover's Distance between `p` and `q`.
Parameters
----------
p : np.ndarray
The first pmf.
q : np.ndarray
The second pmf.
distances : np.ndarray, None
The cost of moving probability from p[i] to q[j]. If None,
the cost is assumed to be i != j.
Returns
-------
emd : float
The Earth Mover's Distance.
"""
n = len(x)
if distances is None:
# assume categorical distribution
distances = categorical_distances(n)
eye = np.eye(n)
A = np.vstack([np.dstack([eye]*n).reshape(n, n**2), np.tile(eye, n)])
b = np.concatenate([x, y], axis=0)
c = distances.flatten()
res = linprog(c, A_eq=A, b_eq=b, bounds=[0, None])
return res.fun
[docs]def earth_movers_distance(dist1, dist2, distances=None):
"""
Compute the Earth Mover's Distance (EMD) between `dist1` and `dist2`. The EMD
is the least amount of "probability mass flow" that must occur to transform
`dist1` to `dist2`.
Parameters
----------
dist1 : Distribution
The first distribution.
dist2 : Distribution
The second distribution.
distances : np.ndarray, None
A matrix of distances between outcomes of the distributions.
If None, a distance matrix is constructed; if the distributions
are categorical each non-equal event is considered at unit distance,
and if numerical abs(x, y) is used as the distance.
Returns
-------
emd : float
The Earth Mover's Distance.
"""
if distances is None:
try:
numerical_test(dist1)
numerical_test(dist2)
p, q = dist1.pmf, dist2.pmf
distances = numerical_distances(dist1.outcomes, dist2.outcomes)
except TypeError:
p, q = normalize_pmfs(dist1, dist2)
distances = categorical_distances(len(p))
else:
p, q = dist1.pmf, dist2.pmf
return earth_movers_distance_pmf(p, q, distances)