Skip to content

Commit 96a2596

Browse files
agoscinskivictorprincipe
authored andcommitted
Adding a numerically more accurate way to compute convex hull distance.
1 parent 41df941 commit 96a2596

File tree

2 files changed

+133
-64
lines changed

2 files changed

+133
-64
lines changed

skmatter/sample_selection/_base.py

Lines changed: 87 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,29 @@ def __init__(
399399
)
400400

401401

402+
def _directional_distance(equations, points):
403+
"""
404+
Computes the distance of the points to the planes defined by the equations
405+
with respect to the direction of the first dimension.
406+
407+
equations : ndarray of shape (n_facets, n_dim)
408+
each row contains the coefficienst for the plane equation of the form
409+
equations[i, 0]*x_1 + ...
410+
+ equations[i, -2]*x_{n_dim} = equations[i, -1]
411+
-equations[i, -1] is the offset
412+
points : ndarray of shape (n_samples, n_dim)
413+
points to compute the directional distance from
414+
415+
Returns
416+
-------
417+
directional_distance : ndarray of shape (nsamples, nequations)
418+
closest distance wrt. the first dimension of the point to the planes
419+
defined by the equations
420+
"""
421+
orthogonal_distances = -(points @ equations[:, :-1].T) - equations[:, -1:].T
422+
return -orthogonal_distances / equations[:, :1].T
423+
424+
402425
class DirectionalConvexHull:
403426
"""
404427
Performs Sample Selection by constructing a Directional Convex Hull and determining the distance to the hull as outlined in the reference
@@ -411,6 +434,14 @@ class DirectionalConvexHull:
411434
directional convex hull construction (also known as the low-
412435
dimensional (LD) hull). By default [0] is used.
413436
437+
tolerance : float, default=1.0E-12
438+
Tolerance for the negative distances to the directional
439+
convex hull to consider a point below the convex hull. Depending
440+
if a point is below or above the convex hull the distance is
441+
differently computed. A very low value can result in a completely
442+
wrong distances. Distances cannot be distinguished from zero up
443+
to tolerance. It is recommended to leave the default setting.
444+
414445
Attributes
415446
----------
416447
@@ -426,22 +457,20 @@ class DirectionalConvexHull:
426457
Interpolater for the features in the high-
427458
dimensional space
428459
429-
interpolator_y_ : scipy.interpolate.interpnd.LinearNDInterpolator
430-
Interpolater for the targets y
431-
432460
References
433461
----------
434462
.. [dch] A. Anelli, E. A. Engel, C. J. Pickard and M. Ceriotti,
435463
Physical Review Materials, 2018.
436464
"""
437465

438-
def __init__(self, low_dim_idx=None):
466+
def __init__(self, low_dim_idx=None, tolerance=1e-12):
439467
self.low_dim_idx = low_dim_idx
440468

441469
if low_dim_idx is None:
442470
self.low_dim_idx = [0]
443471
else:
444472
self.low_dim_idx = low_dim_idx
473+
self.tolerance = tolerance
445474

446475
def fit(self, X, y):
447476
"""
@@ -490,39 +519,43 @@ def fit(self, X, y):
490519
# create high-dimensional feature matrix
491520
high_dim_feats = X[:, self.high_dim_idx_]
492521
# get scipy convex hull
493-
convex_hull = ConvexHull(convex_hull_data)
522+
self.convex_hull_ = ConvexHull(convex_hull_data, incremental=True)
494523
# get normal equations to the hull simplices
495-
y_normal = convex_hull.equations[:, 0]
524+
y_normal = self.convex_hull_.equations[:, 0]
496525

497526
# get vertices_idx of the convex hull
498-
self.selected_idx_ = np.unique(
499-
convex_hull.simplices[np.where(y_normal < 0)[0]].flatten()
500-
)
527+
directional_facets_idx = np.where(y_normal < 0)[0]
528+
self.directional_simplices_ = self.convex_hull_.simplices[
529+
directional_facets_idx
530+
]
531+
self._directional_equations_ = self.convex_hull_.equations[
532+
directional_facets_idx
533+
]
534+
535+
self.selected_idx_ = np.unique(self.directional_simplices_.flatten())
536+
self._directional_points_ = convex_hull_data[self.selected_idx_]
501537

502538
# required for the score_feature_matrix function
503539
self.interpolator_high_dim_ = _linear_interpolator(
504540
points=convex_hull_data[self.selected_idx_, 1:],
505541
values=high_dim_feats[self.selected_idx_],
506542
)
507543

508-
# required to compute the distance of the low-dimensional feature to the convex hull
509-
self.interpolator_y_ = _linear_interpolator(
510-
points=convex_hull_data[self.selected_idx_, 1:],
511-
values=convex_hull_data[self.selected_idx_, 0],
512-
)
513-
514544
return self
515545

546+
@property
547+
def directional_vertices_(self):
548+
return self.selected_idx_
549+
516550
def _check_X_y(self, X, y):
517-
return check_X_y(X, y, ensure_min_features=2, multi_output=False)
551+
return check_X_y(X, y, ensure_min_features=1, multi_output=False)
518552

519553
def _check_is_fitted(self, X):
520554
check_is_fitted(
521555
self,
522556
[
523557
"high_dim_idx_",
524558
"interpolator_high_dim_",
525-
"interpolator_y_",
526559
"selected_idx_",
527560
],
528561
)
@@ -556,7 +589,7 @@ def score_samples(self, X, y):
556589
557590
Returns
558591
-------
559-
dch_distance : numpy.array of shape (n_samples, len(high_dim_idx_))
592+
dch_distance : ndarray of shape (n_samples, len(high_dim_idx_))
560593
The distance (residuals) of samples to the convex hull in
561594
the higher-dimensional space.
562595
"""
@@ -565,17 +598,46 @@ def score_samples(self, X, y):
565598

566599
# features used for the convex hull construction
567600
low_dim_feats = X[:, self.low_dim_idx]
601+
hull_space_data = np.hstack((y.reshape(-1, 1), low_dim_feats))
568602

569603
# the X points projected on the convex surface
570-
interpolated_y = self.interpolator_y_(low_dim_feats).reshape(y.shape)
604+
return self._directional_convex_hull_distance(hull_space_data).reshape(y.shape)
571605

572-
if np.any(np.isnan(interpolated_y)):
573-
warnings.warn(
574-
"There are samples in X with a low-dimensional part that is outside of the range of the convex surface. Distance will contain nans.",
575-
UserWarning,
576-
)
606+
def _directional_convex_hull_distance(self, points):
607+
"""
608+
Get the distance to the fitted directional convex hull in the target dimension y.
609+
610+
If a point lies above the convex hull, it will have a positive distance.
611+
If a point lies below the convex hull, it will have a negative distance - this should only
612+
occur if you pass a point that the convex hull has not been fitted on in the `fit` function.
613+
If a point lies on the convex hull, it will have a distance of zero.
577614
578-
return y - interpolated_y
615+
Parameters
616+
----------
617+
points : The points for which you would like to calculate the distance to the
618+
fitted directional convex hull.
619+
"""
620+
# directional distances to all plane equations
621+
all_directional_distances = _directional_distance(
622+
self._directional_equations_, points
623+
)
624+
print(all_directional_distances)
625+
# we get negative distances for each plane to check if any distance is below the threshold
626+
below_directional_convex_hull = np.any(
627+
all_directional_distances < -self.tolerance, axis=1
628+
)
629+
# directional distances to corresponding plane equation
630+
directional_distances = np.zeros(len(points))
631+
directional_distances[~below_directional_convex_hull] = np.min(
632+
all_directional_distances[~below_directional_convex_hull], axis=1
633+
)
634+
# some distances can be positive, so we take the max of all negative distances
635+
negative_directional_distances = all_directional_distances.copy()
636+
negative_directional_distances[all_directional_distances > 0] = -np.inf
637+
directional_distances[below_directional_convex_hull] = np.max(
638+
negative_directional_distances[below_directional_convex_hull], axis=1
639+
)
640+
return directional_distances
579641

580642
def score_feature_matrix(self, X):
581643
"""

tests/test_dch.py

Lines changed: 46 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def test_negative_score(self):
101101
distance = selector.score_samples(
102102
self.below_hull_point[:, 1:], self.below_hull_point[:, 0]
103103
)[0]
104+
print("distance", distance)
104105
self.assertTrue(distance < 0.0)
105106

106107
def test_positive_score(self):
@@ -109,34 +110,54 @@ def test_positive_score(self):
109110
In an old implementation we observed this bug for the dataset we use in this test
110111
(see issue #162)
111112
"""
112-
X = [[1.88421449, 0.86675162],
113-
[1.88652863, 0.86577001],
114-
[1.89200182, 0.86573224],
115-
[1.89664107, 0.86937211],
116-
[1.90181908, 0.85964603],
117-
[1.90313135, 0.85695238],
118-
[1.90063025, 0.84948309],
119-
[1.90929015, 0.87526563],
120-
[1.90924666, 0.85509754],
121-
[1.91139146, 0.86115512],
122-
[1.91199225, 0.8681867 ],
123-
[1.90681563, 0.85036791],
124-
[1.90193881, 0.84168907],
125-
[1.90544262, 0.84451744],
126-
[1.91498802, 0.86010812],
127-
[1.91305204, 0.85333203],
128-
[1.89779902, 0.83731807],
129-
[1.91725967, 0.86630218],
130-
[1.91309514, 0.85046796],
131-
[1.89822103, 0.83522425]]
132-
y = [-2.69180967, -2.72443825, -2.77293913, -2.797828 , -2.12097652, -2.69428482,
133-
-2.70275134, -2.80617667, -2.79199375, -2.01707974, -2.74203922, -2.24217962,
134-
-2.03472 , -2.72612763, -2.7071123 , -2.75706683, -2.68925596, -2.77160335,
135-
-2.69528665, -2.70911598]
113+
X = [
114+
[1.88421449, 0.86675162],
115+
[1.88652863, 0.86577001],
116+
[1.89200182, 0.86573224],
117+
[1.89664107, 0.86937211],
118+
[1.90181908, 0.85964603],
119+
[1.90313135, 0.85695238],
120+
[1.90063025, 0.84948309],
121+
[1.90929015, 0.87526563],
122+
[1.90924666, 0.85509754],
123+
[1.91139146, 0.86115512],
124+
[1.91199225, 0.8681867],
125+
[1.90681563, 0.85036791],
126+
[1.90193881, 0.84168907],
127+
[1.90544262, 0.84451744],
128+
[1.91498802, 0.86010812],
129+
[1.91305204, 0.85333203],
130+
[1.89779902, 0.83731807],
131+
[1.91725967, 0.86630218],
132+
[1.91309514, 0.85046796],
133+
[1.89822103, 0.83522425],
134+
]
135+
y = [
136+
-2.69180967,
137+
-2.72443825,
138+
-2.77293913,
139+
-2.797828,
140+
-2.12097652,
141+
-2.69428482,
142+
-2.70275134,
143+
-2.80617667,
144+
-2.79199375,
145+
-2.01707974,
146+
-2.74203922,
147+
-2.24217962,
148+
-2.03472,
149+
-2.72612763,
150+
-2.7071123,
151+
-2.75706683,
152+
-2.68925596,
153+
-2.77160335,
154+
-2.69528665,
155+
-2.70911598,
156+
]
136157
selector = DirectionalConvexHull(low_dim_idx=[0, 1])
137158
selector.fit(X, y)
138159
distances = selector.score_samples(X, y)
139-
self.assertTrue(np.all(distances >= -1e-12))
160+
self.assertTrue(np.all(distances >= -selector.tolerance))
140161

141162
def test_score_function_warnings(self):
142163
"""
@@ -153,20 +174,6 @@ def test_score_function_warnings(self):
153174
y = [1.0, 2.0, 3.0]
154175
selector.fit(X, y)
155176

156-
# check for score_samples
157-
with warnings.catch_warnings(record=True) as warning:
158-
# Cause all warnings to always be triggered.
159-
warnings.simplefilter("always")
160-
# Trigger a warning because it is outsite of range [0, 3]
161-
selector.score_samples([[4.0, 1.0]], [1.0])
162-
# Verify some things
163-
self.assertTrue(len(warning) == 1)
164-
self.assertTrue(issubclass(warning[0].category, UserWarning))
165-
self.assertTrue(
166-
"There are samples in X with a low-dimensional part that is outside of the range of the convex surface. Distance will contain nans."
167-
== str(warning[0].message)
168-
)
169-
170177
# check for score_feature_matrix
171178
with warnings.catch_warnings(record=True) as warning:
172179
# Cause all warnings to always be triggered.

0 commit comments

Comments
 (0)