Skip to content

Commit a18b6fd

Browse files
committed
vectorized CVXPY code and have been toying with optimization changes to get the Nelder-Mead to make fewer queries so hyperparameter optimization can be fast. At a point where I want to pull it down and try running the experiments on the big machines, so here is the code, cloud
1 parent a666317 commit a18b6fd

2 files changed

Lines changed: 59 additions & 44 deletions

File tree

pynumdiff/kalman_smooth/_kalman_smooth.py

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
from warnings import warn
33
from scipy.linalg import expm, sqrtm
4+
from scipy.stats import norm
5+
from time import time
46

57
try: import cvxpy
68
except ImportError: pass
@@ -107,15 +109,15 @@ def rts_smooth(_t, A, xhat_pre, xhat_post, P_pre, P_post, compute_P_smooth=True)
107109
return xhat_smooth if not compute_P_smooth else (xhat_smooth, P_smooth)
108110

109111

110-
def rtsdiff(x, _t, order, qr_ratio, forwardbackward):
112+
def rtsdiff(x, _t, order, log_qr_ratio, forwardbackward):
111113
"""Perform Rauch-Tung-Striebel smoothing with a naive constant derivative model. Makes use of :code:`kalman_filter`
112114
and :code:`rts_smooth`, which are made public. :code:`constant_X` methods in this module call this function.
113115
114116
:param np.array[float] x: data series to differentiate
115117
:param float or array[float] _t: This function supports variable step size. This parameter is either the constant
116118
step size if given as a single float, or data locations if given as an array of same length as :code:`x`.
117119
:param int order: which derivative to stabilize in the constant-derivative model
118-
:param qr_ratio: the process noise level of the divided by the measurement noise level, because,
120+
:param log_qr_ratio: the log of the process noise level divided by the measurement noise level, because,
119121
per `our analysis <https://github.com/florisvb/PyNumDiff/issues/139>`_, the mean result is
120122
dependent on the relative rather than absolute size of :math:`q` and :math:`r`.
121123
:param bool forwardbackward: indicates whether to run smoother forwards and backwards
@@ -129,8 +131,8 @@ def rtsdiff(x, _t, order, qr_ratio, forwardbackward):
129131
raise ValueError("If `_t` is given as array-like, must have same length as `x`.")
130132
x = np.asarray(x) # to flexibly allow array-like inputs
131133

132-
q = 10**int(np.log10(qr_ratio)/2) # even-ish split of the powers across 0
133-
r = q/qr_ratio
134+
q = 10**int(log_qr_ratio/2) # even-ish split of the powers across 0
135+
r = q/(10**log_qr_ratio)
134136

135137
A = np.diag(np.ones(order), 1) # continuous-time A just has 1s on the first diagonal (where 0th is main diagonal)
136138
Q = np.zeros(A.shape); Q[-1,-1] = q # continuous-time uncertainty around the last derivative
@@ -262,24 +264,30 @@ def constant_jerk(x, dt, params=None, options=None, r=None, q=None, forwardbackw
262264
return rtsdiff(x, dt, 3, q/r, forwardbackward)
263265

264266

265-
def robustdiff(x, dt, order, q, r, proc_huberM=6, meas_huberM=1.345):
267+
def robustdiff(x, dt, order, log_q, log_r, proc_huberM=6, meas_huberM=1.345):
266268
"""Perform outlier-robust differentiation by solving the Maximum A Priori optimization problem:
267269
:math:`\\min_{\\{x_n\\}} \\sum_{n=0}^{N-1} V(R^{-1/2}(y_n - C x_n)) + \\sum_{n=1}^{N-1} J(Q^{-1/2}(x_n - A x_{n-1}))`,
268270
where :math:`A,Q,C,R` come from an assumed constant derivative model and :math:`V,J` are the :math:`\\ell_1` norm or Huber
269271
loss rather than the :math:`\\ell_2` norm optimized by RTS smoothing. This problem is convex, so this method calls
270272
:code:`convex_smooth`.
271273
272-
Note that for Huber losses, :code:`M` is the radius where the Huber loss function turns from quadratic to linear, in units of
273-
standard deviation, because all inputs to Huber are normalized by noise levels, :code:`q` and :code:`r`. This choice affects
274-
which portion of inputs fall beyond the transition: Assuming Gaussian inliers, :code:`M = scipy.norm.ppf(1 - outlier_portion/2)`.
275-
:code:`M=0` reduces to the :math:`\\ell_1` norm, and :code:`M=6` is essentially the :math:`\\ell_2` norm, becaue the portion of
276-
examples beyond :math:`6\\sigma` is miniscule, about :code:`1.97e-9`.
274+
Note that for Huber losses, :code:`M` is the radius where the Huber loss function turns from quadratic to linear. Because
275+
all inputs to Huber are normalized by noise level, :code:`q` or :code:`r`, :code:`M` is in units of standard deviation.
276+
In other words, this choice affects which portion of inputs are treated as outliers. For example, assuming Gaussian inliers,
277+
the portion beyond :math:`M\\sigma` is :code:`outlier_portion = 2*(1 - scipy.stats.norm.cdf(M))`. The inverse of this is
278+
:code:`M = scipy.stats.norm.ppf(1 - outlier_portion/2)`. By :code:`M=6`, Huber is essentially indistinguishable from the
279+
1/2-sum-of-squares case, :math:`\\frac{1}{2}\\|\\cdot\\|_2^2`, because the portion of examples beyond :math:`6\\sigma` is
280+
miniscule, about :code:`1.97e-9`, and the normalization constant of the Huber loss approaches 1 as :math:`M` increases.
281+
(See :math:`c_2` `in section 6 <https://jmlr.org/papers/volume14/aravkin13a/aravkin13a.pdf>`_, missing a :math:`\\sqrt{\\cdot}`
282+
term there, see p2700). Similarly, as :code:`M` approaches 0, Huber reduces to the :math:`\\ell_1` norm case, because
283+
:math:`c_2` approaches :math:`\\frac{\\sqrt{2}}{M}`, cancelling the :math:`M` multiplying :math:`|\\cdot|` and leaving
284+
behind :math:`\\sqrt{2}`, the proper normalization.
277285
278286
:param np.array[float] x: data series to differentiate
279287
:param float dt: step size
280288
:param int order: which derivative to stabilize in the constant-derivative model (1=velocity, 2=acceleration, 3=jerk)
281-
:param float q: the process noise level
282-
:param float r: measurement noise level
289+
:param float log_q: base 10 logarithm of the process noise level, so :code:`q = 10**log_q`
290+
:param float log_r: base 10 logarithm of the measurement noise level, so :code:`r = 10**log_r`
283291
:param float proc_huberM: quadratic-to-linear transition point for process loss
284292
:param float meas_huberM: quadratic-to-linear transition point for measurement loss
285293
@@ -288,9 +296,9 @@ def robustdiff(x, dt, order, q, r, proc_huberM=6, meas_huberM=1.345):
288296
- **dxdt_hat** -- estimated derivative of x
289297
"""
290298
A_c = np.diag(np.ones(order), 1) # continuous-time A just has 1s on the first diagonal (where 0th is main diagonal)
291-
Q_c = np.zeros(A_c.shape); Q_c[-1,-1] = q # continuous-time uncertainty around the last derivative
299+
Q_c = np.zeros(A_c.shape); Q_c[-1,-1] = 10**log_q # continuous-time uncertainty around the last derivative
292300
C = np.zeros((1, order+1)); C[0,0] = 1 # we measure only y = noisy x
293-
R = np.array([[r]]) # 1 observed state, so this is 1x1
301+
R = np.array([[10**log_r]]) # 1 observed state, so this is 1x1
294302

295303
# convert to discrete time using matrix exponential
296304
eM = expm(np.block([[A_c, Q_c], [np.zeros(A_c.shape), -A_c.T]]) * dt) # Note this could handle variable dt, similar to rtsdiff
@@ -299,12 +307,13 @@ def robustdiff(x, dt, order, q, r, proc_huberM=6, meas_huberM=1.345):
299307
if np.linalg.cond(Q_d) > 1e12: Q_d += np.eye(order + 1)*1e-12 # for numerical stability with convex solver. Doesn't change answers appreciably (or at all).
300308

301309
x_states = convex_smooth(x, A_d, Q_d, C, R, proc_huberM, meas_huberM) # outsource solution of the convex optimization problem
302-
return x_states[:, 0], x_states[:, 1]
310+
return x_states[0], x_states[1]
303311

304312

305313
def convex_smooth(y, A, Q, C, R, proc_huberM=6, meas_huberM=1.345):
306314
"""Solve the optimization problem for robust smoothing using CVXPY. Note this currently assumes constant dt
307-
but could be extended to handle variable step sizes by finding discrete-time A and Q for requisite gaps.
315+
but could be extended to handle variable step sizes by finding discrete-time A and Q for requisite gaps as
316+
in the :code:`kalman_filter` function.
308317
309318
:param np.array[float] y: measurements
310319
:param np.array A: discrete-time state transition matrix
@@ -316,30 +325,36 @@ def convex_smooth(y, A, Q, C, R, proc_huberM=6, meas_huberM=1.345):
316325
:return: np.array -- state estimates (N x state_dim)
317326
"""
318327
N = len(y)
319-
x_states = cvxpy.Variable((N, A.shape[0])) # each row is [position, velocity, acceleration, ...] at step n
320-
321-
Q_sqrt_inv = np.linalg.inv(sqrtm(Q))
322-
R_sqrt_inv = np.linalg.inv(sqrtm(R))
323-
# Process terms: sum of J(Q^(-1/2)(x_n - A x_{n-1}))
324-
objective = cvxpy.sum([cvxpy.norm(Q_sqrt_inv @ (x_states[n] - A @ x_states[n-1]), 1) if proc_huberM < 1e-3 # l1 case
325-
else cvxpy.sum_squares(Q_sqrt_inv @ (x_states[n] - A @ x_states[n-1])) if proc_huberM > (6 - 1e-3) # l2 case
326-
else cvxpy.sum(cvxpy.huber(Q_sqrt_inv @ (x_states[n] - A @ x_states[n-1]), proc_huberM)) # proper Huber
327-
for n in range(1, N)])
328-
# Measurement terms: sum of g V(R^(-1/2)(y_n - C x_n))
329-
objective += cvxpy.sum([cvxpy.norm(R_sqrt_inv @ (y[n] - C @ x_states[n]), 1) if meas_huberM < 1e-3
330-
else cvxpy.sum_squares(R_sqrt_inv @ (y[n] - C @ x_states[n])) if meas_huberM > (6 - 1e-3)
331-
else cvxpy.sum(cvxpy.huber(R_sqrt_inv @ (y[n] - C @ x_states[n]), meas_huberM))
332-
for n in range(N)])
328+
x_states = cvxpy.Variable((A.shape[0], N)) # each column is [position, velocity, acceleration, ...] at step n
329+
330+
def huber_const(M): # from https://jmlr.org/papers/volume14/aravkin13a/aravkin13a.pdf, with correction for missing sqrt
331+
a = 2*np.exp(-M**2 / 2)/M
332+
b = np.sqrt(2*np.pi)*(2*norm.cdf(M) - 1)
333+
return np.sqrt((2*a*(1 + M**2)/M**2 + b)/(a + b))
334+
335+
# It is extremely important to run time that CVXPY expressions be in vectorized form
336+
proc_resids = np.linalg.inv(sqrtm(Q)) @ (x_states[:,1:] - A @ x_states[:,:-1]) # all Q^(-1/2)(x_n - A x_{n-1})
337+
meas_resids = np.linalg.inv(sqrtm(R)) @ (y.reshape(1,-1) - C @ x_states) # all R^(-1/2)(y_n - C x_n)
338+
# Process terms: sum of J(proc_resids)
339+
objective = 0.5*cvxpy.sum_squares(proc_resids) if proc_huberM > (6 - 1e-3) \
340+
else np.sqrt(2)*cvxpy.sum(cvxpy.abs(proc_resids)) if proc_huberM < 1e-3 \
341+
else huber_const(proc_huberM)*cvxpy.sum(cvxpy.huber(proc_resids, proc_huberM)) # 1/2 l2^2, l1, or Huber
342+
# Measurement terms: sum of V(meas_resids)
343+
objective += 0.5*cvxpy.sum_squares(meas_resids) if meas_huberM > (6 - 1e-3) \
344+
else np.sqrt(2)*cvxpy.sum(cvxpy.abs(meas_resids)) if meas_huberM < 1e-3 \
345+
else huber_const(meas_huberM)*cvxpy.sum(cvxpy.huber(meas_resids, meas_huberM)) # CVXPY quirk: norm(, 1) != sum(abs()) for matrices
333346

334347
problem = cvxpy.Problem(cvxpy.Minimize(objective))
335348
try:
349+
before = time()
336350
problem.solve(solver=cvxpy.CLARABEL)
337-
print("CLARABEL succeeded")
338-
except cvxpy.error.SolverError:
339-
print(f"CLARABEL failed. Retrying with SCS.")
340-
problem.solve(solver=cvxpy.SCS) # SCS is a lot slower but pretty bulletproof even with big condition numbers
341-
if x_states.value is None: # There is occasional solver failure with huber as opposed to 1-norm
342-
print("Convex solvers failed with status {problem.status}. Returning NaNs.")
343-
return np.full((N, A.shape[0]), np.nan)
351+
print(f"CLARABEL succeeded in {time() - before}s")
352+
except cvxpy.error.SolverError: pass
353+
#print("CLARABEL failed with status {problem.status}. Returning NaNs.")
354+
#return np.full((A.shape[0], N), np.nan)
355+
# problem.solve(solver=cvxpy.SCS) # Alternatively, SCS is a lot slower but pretty bulletproof even with big condition numbers
356+
if x_states.value is None: # There can be solver failure
357+
#print(f"Convex solvers failed with status {problem.status}. Returning NaNs.")
358+
return np.full((A.shape[0], N), np.nan)
344359

345360
return x_states.value

pynumdiff/optimize/_optimize.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,20 @@
8080
'window_size': (3, 1000)}),
8181
rtsdiff: ({'forwardbackward': {True, False},
8282
'order': {1, 2, 3}, # for this few options, the optimization works better if this is categorical
83-
'qr_ratio': [10**k for k in range(-9, 10, 2)] + [1e12, 1e16]},
84-
{'qr_ratio': [1e-10, 1e20]}), # qr_ratio is usually >>1
83+
'log_qr_ratio': [float(k) for k in range(-9, 10, 2)] + [12, 16]},
84+
{'log_qr_ratio': [-10, 20]}), # qr_ratio is usually >>1
8585
constant_velocity: ({'q': [1e-8, 1e-4, 1e-1, 1e1, 1e4, 1e8], # Deprecated method
8686
'r': [1e-8, 1e-4, 1e-1, 1e1, 1e4, 1e8],
8787
'forwardbackward': {True, False}},
8888
{'q': (1e-10, 1e10),
8989
'r': (1e-10, 1e10)}),
9090
robustdiff: ({'order': {1, 2, 3}, # warning: order 1 hacks the loss function when tvgamma is used, tends to win but is usually suboptimal choice in terms of true RMSE
91-
'q': [1e-1, 1e1, 1e4, 1e8, 1e12],
92-
'r': [1e-1, 1e1, 1e4, 1e8, 1e12],
93-
'proc_huberM': [0., 2, 6], # 0 is l1 norm, 1.345 is Huber 95% "efficiency", 2 assumes about 5% outliers,
91+
'log_q': [1., 4, 8, 12], # decimal after first entry ensure this is treated as float type
92+
'log_r': [-1., 1, 4, 8],
93+
#'proc_huberM': [0., 2, 6], # 0 is l1 norm, 1.345 is Huber 95% "efficiency", 2 assumes about 5% outliers,
9494
'meas_huberM': [0., 2, 6]}, # and 6 assumes basically no outliers -> l2 norm. Try (1 - norm.cdf(M))*2 to see outlier portion
95-
{'q': (1e-1, 1e18),
96-
'r': (1e-5, 1e18),
95+
{'log_q': (-1, 18),
96+
'log_r': (-5, 18),
9797
'proc_huberM': (0, 6),
9898
'meas_huberM': (0, 6)}),
9999
lineardiff: ({'kernel': 'gaussian',

0 commit comments

Comments
 (0)