diff --git a/src/DataDrivenDiffEq.jl b/src/DataDrivenDiffEq.jl index 91719257..e5ebf3f4 100644 --- a/src/DataDrivenDiffEq.jl +++ b/src/DataDrivenDiffEq.jl @@ -119,6 +119,10 @@ export collocate_data include("./utils/utils.jl") export optimal_shrinkage, optimal_shrinkage! +include("./utils/data_augmentation.jl") +export hankel_matrix, delay_embedding +export truncated_svd, reduce_dimension + include("./problem/type.jl") export DataDrivenProblem diff --git a/src/utils/data_augmentation.jl b/src/utils/data_augmentation.jl new file mode 100644 index 00000000..ba2f22a7 --- /dev/null +++ b/src/utils/data_augmentation.jl @@ -0,0 +1,293 @@ +""" +Data augmentation and reduction utilities for the SINDY workflow. + +This module provides functions for: +- Delay embedding (time-delay coordinates) +- Hankel matrix construction +- Dimensionality reduction via truncated SVD + +These are useful for HAVOK analysis and handling chaotic systems where +time-delay embeddings can reveal underlying structure. + +# References + +- Brunton, S. L., Brunton, B. W., Proctor, J. L., Kaiser, E., & Kutz, J. N. (2017). + Chaos as an intermittently forced linear system. Nature Communications, 8(1), 19. + https://doi.org/10.1038/s41467-017-00030-8 + +- Arbabi, H., & Mezic, I. (2017). Ergodic theory, dynamic mode decomposition, + and computation of spectral properties of the Koopman operator. + SIAM Journal on Applied Dynamical Systems, 16(4), 2096-2126. +""" + +""" + $(SIGNATURES) + +Construct a Hankel matrix from a 1D time series vector. + +Given a vector `x = [x₁, x₂, ..., xₙ]` and `num_rows` rows, constructs the Hankel matrix: + +``` +H = [x₁ x₂ x₃ ... xₙ₋ₘ₊₁] + [x₂ x₃ x₄ ... xₙ₋ₘ₊₂] + [x₃ x₄ x₅ ... xₙ₋ₘ₊₃] + [⋮ ⋮ ⋮ ⋱ ⋮ ] + [xₘ xₘ₊₁ xₘ₊₂ ... xₙ ] +``` + +where `m = num_rows`. + +# Arguments + + - `x`: Input vector (1D time series) + - `num_rows`: Number of rows in the Hankel matrix (number of delays + 1) + +# Returns + + - Hankel matrix of size `(num_rows, length(x) - num_rows + 1)` + +# Example + +```julia +x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] +H = hankel_matrix(x, 4) +# Returns: +# 4×7 Matrix: +# 1.0 2.0 3.0 4.0 5.0 6.0 7.0 +# 2.0 3.0 4.0 5.0 6.0 7.0 8.0 +# 3.0 4.0 5.0 6.0 7.0 8.0 9.0 +# 4.0 5.0 6.0 7.0 8.0 9.0 10.0 +``` +""" +function hankel_matrix(x::AbstractVector, num_rows::Integer) + n = length(x) + @assert num_rows >= 1 "num_rows must be at least 1" + @assert num_rows <= n "num_rows ($num_rows) cannot exceed length of x ($n)" + + num_cols = n - num_rows + 1 + H = similar(x, num_rows, num_cols) + + for j in 1:num_cols + for i in 1:num_rows + H[i, j] = x[i + j - 1] + end + end + + return H +end + +""" + $(SIGNATURES) + +Create delay-embedded coordinates from a data matrix. + +Given data matrix `X` with states as rows and time samples as columns, +creates new coordinates by stacking time-delayed versions of each state. + +For a single state `x = [x(t₁), x(t₂), ...]` with `num_delays` delays and +delay interval `τ`, the embedded coordinates are: + +``` +[x(t), x(t+τ), ...] +[x(t-τ), x(t), ...] +[x(t-2τ), x(t-τ), ...] + ⋮ ⋮ ⋱ +[x(t-mτ), x(t-(m-1)τ), ...] +``` + +where `m = num_delays`. + +# Arguments + + - `X`: Data matrix of size `(n_states, n_samples)` + - `num_delays`: Number of time delays to add (resulting in `num_delays + 1` + copies of each state at different delays) + +# Keyword Arguments + + - `τ::Integer = 1`: Delay interval in samples (default: 1 sample) + +# Returns + + - Embedded data matrix of size `(n_states * (num_delays + 1), n_samples - num_delays * τ)` + +# Example + +```julia +# Single state variable +X = reshape(1.0:10.0, 1, 10) # 1×10 matrix +X_embed = delay_embedding(X, 2) +# Creates 3×8 matrix with original and 2 delayed copies + +# Multiple state variables +X = randn(3, 100) # 3 states, 100 samples +X_embed = delay_embedding(X, 3, τ = 2) +# Creates 12×94 matrix (3 states × 4 time copies) +``` + +See also: [`hankel_matrix`](@ref), [`reduce_dimension`](@ref) +""" +function delay_embedding(X::AbstractMatrix, num_delays::Integer; τ::Integer = 1) + n_states, n_samples = size(X) + @assert num_delays >= 0 "num_delays must be non-negative" + @assert τ >= 1 "delay interval τ must be at least 1" + @assert n_samples > num_delays * τ "Not enough samples for requested delays" + + new_n_states = n_states * (num_delays + 1) + new_n_samples = n_samples - num_delays * τ + + X_embedded = similar(X, new_n_states, new_n_samples) + + for d in 0:num_delays + offset = num_delays * τ - d * τ + for i in 1:n_states + row_idx = d * n_states + i + for j in 1:new_n_samples + X_embedded[row_idx, j] = X[i, j + offset] + end + end + end + + return X_embedded +end + +""" + $(SIGNATURES) + +Delay embedding for a 1D time series vector. + +Convenience method that converts a vector to a 1-row matrix, applies +delay embedding, and returns the result. + +# Arguments + + - `x`: Input vector (1D time series) + - `num_delays`: Number of time delays to add + +# Keyword Arguments + + - `τ::Integer = 1`: Delay interval in samples + +# Returns + + - Embedded data matrix of size `(num_delays + 1, length(x) - num_delays * τ)` + +# Example + +```julia +x = collect(1.0:10.0) +X_embed = delay_embedding(x, 2) +# Returns 3×8 matrix +``` +""" +function delay_embedding(x::AbstractVector, num_delays::Integer; τ::Integer = 1) + X = reshape(x, 1, length(x)) + return delay_embedding(X, num_delays; τ = τ) +end + +""" + $(SIGNATURES) + +Compute a truncated singular value decomposition of data matrix `X`. + +# Arguments + + - `X`: Data matrix + - `rank`: Number of singular values/vectors to retain + +# Returns + + - Named tuple `(U = U_r, S = S_r, V = V_r)` where: + + + `U_r` is the first `rank` columns of U + + `S_r` is a vector of the first `rank` singular values + + `V_r` is the first `rank` columns of V + +# Example + +```julia +X = randn(100, 50) +result = truncated_svd(X, 5) +X_approx = result.U * Diagonal(result.S) * result.V' +``` + +See also: [`reduce_dimension`](@ref), [`optimal_shrinkage`](@ref) +""" +function truncated_svd(X::AbstractMatrix, rank::Integer) + @assert rank >= 1 "rank must be at least 1" + max_rank = minimum(size(X)) + rank = min(rank, max_rank) + + U, S, V = svd(X) + return (U = U[:, 1:rank], S = S[1:rank], V = V[:, 1:rank]) +end + +""" + $(SIGNATURES) + +Reduce the dimensionality of data by projecting onto the top singular vectors. + +Projects data matrix `X` onto its first `rank` left singular vectors, +reducing the number of effective dimensions while preserving the most +important features. + +# Arguments + + - `X`: Data matrix of size `(n_features, n_samples)` + - `rank`: Number of dimensions to retain + +# Returns + + - Reduced data matrix of size `(rank, n_samples)` + - The reduced coordinates `Y = U_r' * X` where `U_r` contains the first `rank` left singular vectors + +# Example + +```julia +X = randn(100, 50) # 100 features, 50 samples +X_reduced = reduce_dimension(X, 5) # Reduce to 5 dimensions +# X_reduced is 5×50 +``` + +See also: [`truncated_svd`](@ref), [`optimal_shrinkage`](@ref) +""" +function reduce_dimension(X::AbstractMatrix, rank::Integer) + result = truncated_svd(X, rank) + return result.U' * X +end + +""" + $(SIGNATURES) + +Reduce the dimensionality of data using automatic rank selection. + +Uses the optimal singular value hard threshold from Gavish & Donoho (2014) +to automatically determine the number of significant singular values, +then projects onto those dimensions. + +# Arguments + + - `X`: Data matrix of size `(n_features, n_samples)` + +# Returns + + - Reduced data matrix with automatically selected dimensionality + +# Example + +```julia +X = randn(100, 50) + 0.1 * randn(100, 50) # Low-rank + noise +X_reduced = reduce_dimension(X) # Automatically selects rank +``` + +See also: [`reduce_dimension(X, rank)`](@ref), [`optimal_shrinkage`](@ref) +""" +function reduce_dimension(X::AbstractMatrix) + m, n = minimum(size(X)), maximum(size(X)) + U, S, V = svd(X) + τ = optimal_svht(m, n) + threshold = τ * median(S) + rank = count(s -> s >= threshold, S) + rank = max(rank, 1) # Keep at least one dimension + return U[:, 1:rank]' * X +end diff --git a/test/utils.jl b/test/utils.jl index d9ea2296..fb722e14 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,6 +1,7 @@ using DataDrivenDiffEq using DataDrivenDiffEq: collocate_data using LinearAlgebra +using Statistics @testset "Optimal Shrinkage" begin t = collect(-2:0.01:2) @@ -53,3 +54,123 @@ end @test x̂≈x atol=1e-1 rtol=1e-1 end end + +@testset "Hankel Matrix" begin + # Test basic Hankel matrix construction + x = collect(1.0:10.0) + H = hankel_matrix(x, 4) + + @test size(H) == (4, 7) + # First column should be [1, 2, 3, 4] + @test H[:, 1] == [1.0, 2.0, 3.0, 4.0] + # Last column should be [7, 8, 9, 10] + @test H[:, end] == [7.0, 8.0, 9.0, 10.0] + # Check Hankel structure: constant along anti-diagonals + @test H[1, 2] == H[2, 1] + @test H[2, 3] == H[3, 2] + @test H[1, 4] == H[4, 1] + + # Edge case: num_rows = 1 + H1 = hankel_matrix(x, 1) + @test size(H1) == (1, 10) + @test vec(H1) == x + + # Edge case: num_rows = length(x) + Hmax = hankel_matrix(x, 10) + @test size(Hmax) == (10, 1) + @test vec(Hmax) == x + + # Test with Float32 + x32 = Float32.(x) + H32 = hankel_matrix(x32, 3) + @test eltype(H32) == Float32 +end + +@testset "Delay Embedding" begin + # Test with 2D matrix input + X = reshape(1.0:20.0, 2, 10) # 2 states, 10 samples + X_embed = delay_embedding(X, 2) + + # Should have 2 * (2 + 1) = 6 rows, 10 - 2 = 8 columns + @test size(X_embed) == (6, 8) + + # Check structure: first 2 rows are current time, next 2 are t-1, etc. + # Row 1 should be state 1 at current time: X[1, 3:10] + @test X_embed[1, :] == X[1, 3:10] + # Row 3 should be state 1 at t-1: X[1, 2:9] + @test X_embed[3, :] == X[1, 2:9] + # Row 5 should be state 1 at t-2: X[1, 1:8] + @test X_embed[5, :] == X[1, 1:8] + + # Test with delay interval τ = 2 + X_embed_tau2 = delay_embedding(X, 2, τ = 2) + # Should have 6 rows, 10 - 2*2 = 6 columns + @test size(X_embed_tau2) == (6, 6) + + # Test with 1D vector input + x = collect(1.0:10.0) + x_embed = delay_embedding(x, 2) + @test size(x_embed) == (3, 8) + @test x_embed[1, :] == x[3:10] + @test x_embed[2, :] == x[2:9] + @test x_embed[3, :] == x[1:8] + + # Test num_delays = 0 (no embedding) + x_no_embed = delay_embedding(x, 0) + @test size(x_no_embed) == (1, 10) + @test vec(x_no_embed) == x +end + +@testset "Truncated SVD" begin + # Create a low-rank matrix + U_true = randn(50, 3) + V_true = randn(30, 3) + S_true = [10.0, 5.0, 1.0] + X = U_true * Diagonal(S_true) * V_true' + + # Truncate to rank 2 + result = truncated_svd(X, 2) + @test length(result.S) == 2 + @test size(result.U) == (50, 2) + @test size(result.V) == (30, 2) + + # Check that singular values are in decreasing order + @test result.S[1] >= result.S[2] + + # Reconstruction should be close to best rank-2 approximation + X_approx = result.U * Diagonal(result.S) * result.V' + _, S_full, _ = svd(X) + @test norm(X - X_approx) ≈ S_full[3] atol = 1e-10 + + # Test rank larger than matrix dimension (should be clamped) + result_large = truncated_svd(X, 100) + @test length(result_large.S) == min(50, 30) +end + +@testset "Reduce Dimension" begin + # Create a low-rank matrix with noise + U_true = randn(100, 5) + V_true = randn(50, 5) + S_true = [100.0, 50.0, 10.0, 5.0, 1.0] + X_true = U_true * Diagonal(S_true) * V_true' + X = X_true + 0.01 * randn(100, 50) + + # Reduce to specified rank + X_reduced = reduce_dimension(X, 3) + @test size(X_reduced) == (3, 50) + + # Verify the reduced data can be projected back to original space + U, _, _ = svd(X) + X_reconstructed = U[:, 1:3] * X_reduced + # Reconstruction error should be similar to what we get from best rank-3 approximation + _, S_full, _ = svd(X) + theoretical_error = sqrt(sum(S_full[4:end] .^ 2)) + @test norm(X - X_reconstructed) ≈ theoretical_error atol = 1e-10 + + # Test automatic rank selection + X_auto = reduce_dimension(X) + # Should have between 1 and min(100, 50) dimensions + @test 1 <= size(X_auto, 1) <= 50 + # Number of samples should be preserved + @test size(X_auto, 2) == 50 +end