From 2c43fa70ec853b49c7b25d182231573a31cd433e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 14 Oct 2024 19:12:15 +0200 Subject: [PATCH 001/266] first restructure --- Project.toml | 14 +++ src/NetworkHistogram.jl | 28 ++---- src/UI.jl | 51 +++++++++++ src/assignments/Assignments.jl | 85 +++++++++++++++++++ src/assignments/assignments_binary.jl | 12 +++ .../assignments_finitely_decorated.jl | 13 +++ src/group_numbering.jl | 15 +--- src/{ => old}/assignment.jl | 0 src/{ => old}/config_rules/accept_rule.jl | 0 .../config_rules/bandwidth_selection_rule.jl | 0 .../config_rules/starting_assignment_rule.jl | 0 src/{ => old}/config_rules/stop_rule.jl | 0 src/{ => old}/config_rules/swap_rule.jl | 0 src/{ => old}/data/datasets.jl | 0 src/{ => old}/data/gt.jl | 0 src/{ => old}/data/utils.jl | 0 src/old/group_numbering.jl | 43 ++++++++++ src/{ => old}/histogram.jl | 0 src/{ => old}/history.jl | 0 src/{ => old}/optimize.jl | 0 src/{ => old}/proposal.jl | 0 src/{ => old}/utils.jl | 0 src/optimisation/opti.jl | 0 src/sbm.jl | 28 ++++++ 24 files changed, 255 insertions(+), 34 deletions(-) create mode 100644 src/UI.jl create mode 100644 src/assignments/Assignments.jl create mode 100644 src/assignments/assignments_binary.jl create mode 100644 src/assignments/assignments_finitely_decorated.jl rename src/{ => old}/assignment.jl (100%) rename src/{ => old}/config_rules/accept_rule.jl (100%) rename src/{ => old}/config_rules/bandwidth_selection_rule.jl (100%) rename src/{ => old}/config_rules/starting_assignment_rule.jl (100%) rename src/{ => old}/config_rules/stop_rule.jl (100%) rename src/{ => old}/config_rules/swap_rule.jl (100%) rename src/{ => old}/data/datasets.jl (100%) rename src/{ => old}/data/gt.jl (100%) rename src/{ => old}/data/utils.jl (100%) create mode 100644 src/old/group_numbering.jl rename src/{ => old}/histogram.jl (100%) rename src/{ => old}/history.jl (100%) rename src/{ => old}/optimize.jl (100%) rename src/{ => old}/proposal.jl (100%) rename src/{ => 
old}/utils.jl (100%) create mode 100644 src/optimisation/opti.jl create mode 100644 src/sbm.jl diff --git a/Project.toml b/Project.toml index a7719ad..2f466f0 100644 --- a/Project.toml +++ b/Project.toml @@ -8,12 +8,19 @@ ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8" Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MetaGraphsNext = "fa8bd995-216d-47f1-8a91-f3b68fbeb377" +Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" +PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SimpleWeightedGraphs = "47aef6b3-ad0c-573a-a1e2-d07658019622" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7" @@ -23,10 +30,17 @@ ArnoldiMethod = "0.2.0" Arpack = "0.5.4" BenchmarkTools = "1.3.2" CodecZstd = "0.7.2" +DensityInterface = "0.4.0" +Distributions = "0.25.112" +Graphs = "1.9.0" HTTP = "1.7.4" JLD = "0.13.3" Kronecker = "0.5" +MetaGraphsNext = "0.7.0" +Metis = "1.5.0" +PermutationSymmetricTensors = "0.2.0" ProgressMeter = "1.7.2" +SimpleWeightedGraphs = "1.4.0" StatsBase = "0.33.21" TranscodingStreams = "0.9.11" ValueHistories = "0.5.4" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 1272972..a0aa0b0 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -1,29 +1,11 @@ module NetworkHistogram -using ValueHistories, StatsBase, Random, LinearAlgebra, 
Kronecker - -using Arpack: eigs -using ArnoldiMethod: partialschur, partialeigen, SR, LR - -export graphhist, PreviousBestValue, Strict, RandomNodeSwap -export OrderedStart, RandomStart, EigenStart, DistStart +using LinearAlgebra, DensityInterface +using Graphs, Distributions +using PermutationSymmetricTensors include("group_numbering.jl") -include("assignment.jl") -include("history.jl") - -include("config_rules/starting_assignment_rule.jl") -include("config_rules/swap_rule.jl") -include("config_rules/accept_rule.jl") -include("config_rules/stop_rule.jl") -include("config_rules/bandwidth_selection_rule.jl") - -include("optimize.jl") -include("histogram.jl") -include("proposal.jl") - -include("utils.jl") +include("sbm.jl") +include("assignments/Assignments.jl") -include("data/gt.jl") -include("data/datasets.jl") end diff --git a/src/UI.jl b/src/UI.jl new file mode 100644 index 0000000..b82009b --- /dev/null +++ b/src/UI.jl @@ -0,0 +1,51 @@ +using Graphs +using MetaGraphsNext +using Distributions +using SimpleWeightedGraphs + +cities = MetaGraph( + Graph(); + label_type = String, + vertex_data_type = Vector{Float64}, + edge_data_type = Float64, + graph_data = nothing, + default_weight = -Inf, + weight_function = identity, +); + +cities["Paris"] = [2.3, 48.9]; +cities["London"] = [0.1, 51.5]; +cities["Berlin"] = [13.4, 52.5]; +cities["Lausanne"] = [6.6, 46.5]; +cities["Paris", "London"] = 0.5; +cities["Paris", "Berlin"] = 1; +cities["London", "Berlin"] = 0; +cities["Paris", "Lausanne"] = 1; + +getindex(cities, "Paris") + + +typeof(cities) + +index = code_for(cities, "Paris") +label_for.(Ref(cities), neighbors(cities, index)) +adjacency_matrix(cities) +collect(weights(cities)) + +G = Graph(20, 20) +adjacency_matrix(G) + + +sources = [1, 2, 1]; + +destinations = [2, 3, 3]; + +weight_edges = [0.5, 0.8, 2.0]; + +g = SimpleWeightedGraph(sources, destinations, weight_edges) +add_vertices!(g, 5) + + +function get_obs(g::SimpleGraph{T}, i::Int, j::Int) where {T} + return 
has_edge(g, i, j) +end diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl new file mode 100644 index 0000000..7df5814 --- /dev/null +++ b/src/assignments/Assignments.jl @@ -0,0 +1,85 @@ +abstract type AdditionalData end + +struct BasicAdditionalInformation <: AdditionalData + test::Int +end + + +struct Assignment{T} <: AbstractVector{Vector{Int}} + group_size::GroupSize{T} + node_labels::Vector{Int} + additional_data::BasicAdditionalInformation +end + + +function number_groups(assignment::Assignment) + return length(assignment.group_size) +end + +function number_nodes(assignment::Assignment) + return length(assignment.node_labels) +end + +function get_vertex_in_group(assignment::Assignment, group::Int) + return findall(assignment.node_labels .== group) +end + + +function get_edge_indices(a::Assignment, i::Int, j::Int) + return [(x, y) for x in get_vertex_in_group(a, i) + for y in get_vertex_in_group(a, j)] +end + +function get_edge_indices(a::Assignment,i::Int) + nodes_i = get_vertex_in_group(a, i) + return [(x, y) for x in nodes_i for y in nodes_i if x < y] +end + + +Base.size(a::Assignment) = (number_groups(a),number_groups(a)) +Base.@propagate_inbounds function Base.getindex(a::Assignment, i::Int) + @boundscheck checkbounds(a, i) + return get_vertex_in_group(a, i) +end + + + + + +function get_obs(g::SimpleGraph{T}, x::Tuple) where {T} + return get_obs(g, x[1], x[2]) +end + +function get_obs(g::SimpleGraph{T}, i::Int, j::Int) where {T} + return convert(Bool, has_edge(g, i, j)) +end + + +function loglikelihood(a::Assignment, dists::SBM, g) + loglikelihood = 0.0 + for i in 1:number_nodes(a) + label_a = a.node_labels[i] + for j in i+1:number_nodes(a) + label_b = a.node_labels[j] + loglikelihood += logdensityof(dists[label_a,label_b], get_obs(g,i,j)) + end + end + return loglikelihood +end + + +function fit(a::Assignment,g, distribution) + sizes = [a.group_size[i] for i in 1:number_groups(a)]/number_nodes(a) + dists = initialize_sbm(sizes, 
distribution) + for group1 in 1:number_groups(a) + for group2 in group1:number_groups(a) + edges = get_edge_indices(a,group1,group2) + dists[group1,group2] = fit(distribution, g, edges) + end + end + return dists +end + +function fit(distribution, g, edges) + return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) +end diff --git a/src/assignments/assignments_binary.jl b/src/assignments/assignments_binary.jl new file mode 100644 index 0000000..c809eff --- /dev/null +++ b/src/assignments/assignments_binary.jl @@ -0,0 +1,12 @@ +mutable struct AdditionalDataBinary <: AdditionalDataBinary + counts :: Matrix{Int} + realized :: Matrix{Float64} + estimated_theta :: Matrix{Float64} + loglikelihood::Float64 +end + +struct DefaultAssignmentBinary <: AssignmentBinary + group_size :: GroupSize{Int} + node_labels :: Vector{Int} + additional_data :: AdditionalDataBinary +end diff --git a/src/assignments/assignments_finitely_decorated.jl b/src/assignments/assignments_finitely_decorated.jl new file mode 100644 index 0000000..4fecb1c --- /dev/null +++ b/src/assignments/assignments_finitely_decorated.jl @@ -0,0 +1,13 @@ +mutable struct AdditionalDataCategorical <: AdditionalData + counts :: Matrix{Int} + realized :: Array{Float64, 3} + estimated_theta :: Array{Float64, 3} + loglikelihood :: Float64 +end + + +struct DefaultAssignmentCategorical <: Assignment + group_size :: GroupSize{Int} + node_labels :: Vector{Int} + additional_data :: AdditionalDataCategorical +end diff --git a/src/group_numbering.jl b/src/group_numbering.jl index 12d9691..e1fa865 100644 --- a/src/group_numbering.jl +++ b/src/group_numbering.jl @@ -1,5 +1,6 @@ """ -Array-like storage for the number of nodes in each group. +Array-like storage for the number of nodes in each group. Try to split the number of nodes +into equal groups, but if it is not possible, the last group may have mode nodes. 
""" struct GroupSize{T} <: AbstractVector{Int} group_number::T @@ -17,16 +18,8 @@ struct GroupSize{T} <: AbstractVector{Int} if number_groups * standard_group == number_nodes new{Int}(standard_group, number_groups) else - remainder_group = number_nodes - number_groups * standard_group - if remainder_group == 1 - @warn "h has to be changed, only one node in remainder group" - standard_group -= 1 - remainder_group = number_groups + 1 # because equal to 1+number_groups because we take 1 from each standard group, and there are number_groups of them - if standard_group == 1 - error("Standard group size now 1, please choose a new value for h.") - end - end - new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups + 1) + remainder_group = standard_group + mod(number_nodes, standard_group) + new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups) end end end diff --git a/src/assignment.jl b/src/old/assignment.jl similarity index 100% rename from src/assignment.jl rename to src/old/assignment.jl diff --git a/src/config_rules/accept_rule.jl b/src/old/config_rules/accept_rule.jl similarity index 100% rename from src/config_rules/accept_rule.jl rename to src/old/config_rules/accept_rule.jl diff --git a/src/config_rules/bandwidth_selection_rule.jl b/src/old/config_rules/bandwidth_selection_rule.jl similarity index 100% rename from src/config_rules/bandwidth_selection_rule.jl rename to src/old/config_rules/bandwidth_selection_rule.jl diff --git a/src/config_rules/starting_assignment_rule.jl b/src/old/config_rules/starting_assignment_rule.jl similarity index 100% rename from src/config_rules/starting_assignment_rule.jl rename to src/old/config_rules/starting_assignment_rule.jl diff --git a/src/config_rules/stop_rule.jl b/src/old/config_rules/stop_rule.jl similarity index 100% rename from src/config_rules/stop_rule.jl rename to src/old/config_rules/stop_rule.jl diff --git a/src/config_rules/swap_rule.jl b/src/old/config_rules/swap_rule.jl similarity 
index 100% rename from src/config_rules/swap_rule.jl rename to src/old/config_rules/swap_rule.jl diff --git a/src/data/datasets.jl b/src/old/data/datasets.jl similarity index 100% rename from src/data/datasets.jl rename to src/old/data/datasets.jl diff --git a/src/data/gt.jl b/src/old/data/gt.jl similarity index 100% rename from src/data/gt.jl rename to src/old/data/gt.jl diff --git a/src/data/utils.jl b/src/old/data/utils.jl similarity index 100% rename from src/data/utils.jl rename to src/old/data/utils.jl diff --git a/src/old/group_numbering.jl b/src/old/group_numbering.jl new file mode 100644 index 0000000..12d9691 --- /dev/null +++ b/src/old/group_numbering.jl @@ -0,0 +1,43 @@ +""" +Array-like storage for the number of nodes in each group. +""" +struct GroupSize{T} <: AbstractVector{Int} + group_number::T + number_groups::Int + + function GroupSize(number_nodes, h::Real) + @assert 0 < h < 1 + standard_group = floor(Int, number_nodes * h) + GroupSize(number_nodes, standard_group) + end + + function GroupSize(number_nodes, standard_group::Integer) + @assert 1 < standard_group < number_nodes + number_groups = number_nodes ÷ standard_group # number of standard groups! 
+ if number_groups * standard_group == number_nodes + new{Int}(standard_group, number_groups) + else + remainder_group = number_nodes - number_groups * standard_group + if remainder_group == 1 + @warn "h has to be changed, only one node in remainder group" + standard_group -= 1 + remainder_group = number_groups + 1 # because equal to 1+number_groups because we take 1 from each standard group, and there are number_groups of them + if standard_group == 1 + error("Standard group size now 1, please choose a new value for h.") + end + end + new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups + 1) + end + end +end + +Base.size(g::GroupSize) = (g.number_groups,) +Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) + @boundscheck checkbounds(g, i) + return g.group_number +end + +Base.@propagate_inbounds function Base.getindex(g::GroupSize{Tuple{Int, Int}}, i::Int) + @boundscheck checkbounds(g, i) + return i < length(g) ? g.group_number[1] : g.group_number[2] +end diff --git a/src/histogram.jl b/src/old/histogram.jl similarity index 100% rename from src/histogram.jl rename to src/old/histogram.jl diff --git a/src/history.jl b/src/old/history.jl similarity index 100% rename from src/history.jl rename to src/old/history.jl diff --git a/src/optimize.jl b/src/old/optimize.jl similarity index 100% rename from src/optimize.jl rename to src/old/optimize.jl diff --git a/src/proposal.jl b/src/old/proposal.jl similarity index 100% rename from src/proposal.jl rename to src/old/proposal.jl diff --git a/src/utils.jl b/src/old/utils.jl similarity index 100% rename from src/utils.jl rename to src/old/utils.jl diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl new file mode 100644 index 0000000..e69de29 diff --git a/src/sbm.jl b/src/sbm.jl new file mode 100644 index 0000000..9235f50 --- /dev/null +++ b/src/sbm.jl @@ -0,0 +1,28 @@ +struct SBM{T,K} <: AbstractMatrix{T} + sizes :: Vector{Float64} + probs:: SymmetricTensor{T, K, K} + +end 
+ + + +function initialize_sbm(sizes::Vector{Float64}, dist, k = length(sizes)) + probs = Vector{typeof(dist)}(undef, binomial(k + 1, 2)) + fill!(probs, dist) + return SBM(sizes, SymmetricTensor(probs, Val(k), Val(k))) +end + +function initialize_sbm(k::Int, dist) + return initialize_sbm(ones(k) / k, dist) +end + +number_blocks(::SBM{T,K}) where {T,K} = K + +Base.size(s::SBM)= size(s.probs) +Base.ndims(s::SBM) = 2 +Base.eltype(s::SBM{T,K}) where {T,K} = T +Base.setindex!(s::SBM, v, i, j) = setindex!(s.probs, v, i, j) +Base.@propagate_inbounds function Base.getindex(s::SBM, i, j) + @boundscheck checkbounds(s.probs, i, j) + return Base.getindex(s.probs, i, j) +end From f5f0cd36c9b22df4809391868de3e23084e4c06d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 15 Oct 2024 15:04:51 +0200 Subject: [PATCH 002/266] add SBM utilities --- Project.toml | 4 ++ src/NetworkHistogram.jl | 7 ++- src/UI.jl | 34 +++++++++++ src/assignments/Assignments.jl | 10 +--- src/assignments/assignments_binary.jl | 12 ---- .../assignments_finitely_decorated.jl | 13 ----- src/sbm.jl | 58 +++++++++++++++---- 7 files changed, 94 insertions(+), 44 deletions(-) delete mode 100644 src/assignments/assignments_binary.jl delete mode 100644 src/assignments/assignments_finitely_decorated.jl diff --git a/Project.toml b/Project.toml index 2f466f0..cf14ead 100644 --- a/Project.toml +++ b/Project.toml @@ -21,6 +21,8 @@ PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SimpleWeightedGraphs = "47aef6b3-ad0c-573a-a1e2-d07658019622" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7" @@ -41,6 +43,8 @@ Metis = "1.5.0" PermutationSymmetricTensors = "0.2.0" ProgressMeter = 
"1.7.2" SimpleWeightedGraphs = "1.4.0" +SparseArrays = "1.11.0" +StaticArrays = "1.9.7" StatsBase = "0.33.21" TranscodingStreams = "0.9.11" ValueHistories = "0.5.4" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index a0aa0b0..45648d7 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -1,9 +1,12 @@ module NetworkHistogram -using LinearAlgebra, DensityInterface -using Graphs, Distributions +using LinearAlgebra, SparseArrays +using Distributions,DensityInterface +using Graphs using PermutationSymmetricTensors +import StatsBase, Random + include("group_numbering.jl") include("sbm.jl") include("assignments/Assignments.jl") diff --git a/src/UI.jl b/src/UI.jl index b82009b..4f60963 100644 --- a/src/UI.jl +++ b/src/UI.jl @@ -49,3 +49,37 @@ add_vertices!(g, 5) function get_obs(g::SimpleGraph{T}, i::Int, j::Int) where {T} return has_edge(g, i, j) end + +## + +using NetworkHistogram +group_number = NetworkHistogram.GroupSize(nv(G), 3) +if typeof(group_number) == NetworkHistogram.GroupSize{Tuple{Int, Int}} + node_labels = repeat(1:length(group_number)-1, inner = group_number[1]) + last_labels = fill(length(group_number), group_number[end]) + node_labels = vcat(node_labels, last_labels) +else + node_labels = repeat(1:length(group_number), inner = group_number[1]) +end +additional_info = NetworkHistogram.BasicAdditionalInformation(1) +a = NetworkHistogram.Assignment(group_number, node_labels, additional_info) +dist = Bernoulli(0.5) +sbm_fit = NetworkHistogram.fit(a, G, dist) + + +sbm = NetworkHistogram.initialize_sbm([1/3, 1/3, 1/3], dist) +for i in 1:3 + sbm[i, i] = Bernoulli(0.8) + for j in i+1:3 + sbm[i, j] = Bernoulli(0.01+0.1*(i+j)) + end +end + +size_per_block = 200 +A, node_labels = NetworkHistogram.sample(sbm,3*size_per_block); +node_labels = repeat(1:3, inner = size_per_block) +group_number = NetworkHistogram.GroupSize(size(A,1), size_per_block) +a_star = NetworkHistogram.Assignment(group_number, node_labels, additional_info) 
+sbm_fitted = NetworkHistogram.fit(a_star, SimpleGraph(A), dist) + +sbm_fitted diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 7df5814..f7f37d8 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -5,10 +5,10 @@ struct BasicAdditionalInformation <: AdditionalData end -struct Assignment{T} <: AbstractVector{Vector{Int}} +struct Assignment{T,B<:AdditionalData} <: AbstractVector{Vector{Int}} group_size::GroupSize{T} node_labels::Vector{Int} - additional_data::BasicAdditionalInformation + additional_data::B end @@ -43,9 +43,6 @@ Base.@propagate_inbounds function Base.getindex(a::Assignment, i::Int) end - - - function get_obs(g::SimpleGraph{T}, x::Tuple) where {T} return get_obs(g, x[1], x[2]) end @@ -69,8 +66,7 @@ end function fit(a::Assignment,g, distribution) - sizes = [a.group_size[i] for i in 1:number_groups(a)]/number_nodes(a) - dists = initialize_sbm(sizes, distribution) + dists = initialize_sbm(a.group_size, distribution) for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) edges = get_edge_indices(a,group1,group2) diff --git a/src/assignments/assignments_binary.jl b/src/assignments/assignments_binary.jl deleted file mode 100644 index c809eff..0000000 --- a/src/assignments/assignments_binary.jl +++ /dev/null @@ -1,12 +0,0 @@ -mutable struct AdditionalDataBinary <: AdditionalDataBinary - counts :: Matrix{Int} - realized :: Matrix{Float64} - estimated_theta :: Matrix{Float64} - loglikelihood::Float64 -end - -struct DefaultAssignmentBinary <: AssignmentBinary - group_size :: GroupSize{Int} - node_labels :: Vector{Int} - additional_data :: AdditionalDataBinary -end diff --git a/src/assignments/assignments_finitely_decorated.jl b/src/assignments/assignments_finitely_decorated.jl deleted file mode 100644 index 4fecb1c..0000000 --- a/src/assignments/assignments_finitely_decorated.jl +++ /dev/null @@ -1,13 +0,0 @@ -mutable struct AdditionalDataCategorical <: AdditionalData - counts :: 
Matrix{Int} - realized :: Array{Float64, 3} - estimated_theta :: Array{Float64, 3} - loglikelihood :: Float64 -end - - -struct DefaultAssignmentCategorical <: Assignment - group_size :: GroupSize{Int} - node_labels :: Vector{Int} - additional_data :: AdditionalDataCategorical -end diff --git a/src/sbm.jl b/src/sbm.jl index 9235f50..0c2e1fb 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -1,15 +1,34 @@ -struct SBM{T,K} <: AbstractMatrix{T} - sizes :: Vector{Float64} - probs:: SymmetricTensor{T, K, K} +struct SBM{T,K,F<:Real} <: AbstractMatrix{T} + sizes :: Vector{F} + probs:: SymmetricTensor{T, K, 2} +end + +function _check_sizes(sizes) + @assert sum(sizes) ≈ 1 "Sizes must sum to 1, got $(sum(sizes))" + return sizes end +function _check_sizes(sizes::Vector{Int}) + return sizes./sum(sizes) +end -function initialize_sbm(sizes::Vector{Float64}, dist, k = length(sizes)) - probs = Vector{typeof(dist)}(undef, binomial(k + 1, 2)) +function initialize_sbm(sizes::Vector, dist, k = length(sizes)) + sizes = _check_sizes(sizes) + n_dims = binomial(k + 1, 2) + probs = Vector{typeof(dist)}(undef, n_dims) fill!(probs, dist) - return SBM(sizes, SymmetricTensor(probs, Val(k), Val(k))) + return SBM(sizes, SymmetricTensor(probs, Val(k), Val(2))) +end + + +function initialize_sbm(sizes::GroupSize, dist, k = length(sizes)) + size_bins = sizes./sum(sizes) + n_dims = binomial(k + 1, 2) + probs = Vector{typeof(dist)}(undef, n_dims) + fill!(probs, dist) + return SBM(size_bins, SymmetricTensor(probs, Val(k), Val(2))) end function initialize_sbm(k::Int, dist) @@ -19,10 +38,29 @@ end number_blocks(::SBM{T,K}) where {T,K} = K Base.size(s::SBM)= size(s.probs) -Base.ndims(s::SBM) = 2 -Base.eltype(s::SBM{T,K}) where {T,K} = T +Base.ndims(::SBM) = 2 +Base.eltype(::SBM{T,K}) where {T,K} = T Base.setindex!(s::SBM, v, i, j) = setindex!(s.probs, v, i, j) Base.@propagate_inbounds function Base.getindex(s::SBM, i, j) - @boundscheck checkbounds(s.probs, i, j) - return Base.getindex(s.probs, i, j) + return 
getindex(s.probs, i, j) +end + + + +function sample(rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) + n_blocks = number_blocks(sbm) + node_labels = StatsBase.sample( + rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) + if sorted + sort!(node_labels) + end + A = BitMatrix(undef, n_nodes, n_nodes) + for i in 1:n_nodes + for j in i+1:n_nodes + A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) + A[j, i] = A[i, j] + end + end + return sparse(A), node_labels end +sample(sbm::SBM, n_nodes::Int) = sample(Random.default_rng(), sbm, n_nodes) From 0a131ba6002aa4371d591a9fbf6fd2fbc91707a0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 15 Oct 2024 17:59:23 +0200 Subject: [PATCH 003/266] need to parametrize the init rule based on additional type --- Project.toml | 2 + src/NetworkHistogram.jl | 8 +- src/UI.jl | 22 +- src/assignments/Assignments.jl | 52 +++-- src/assignments/data_structures.jl | 1 + .../specialised_data/Bernoulli_struct.jl | 49 +++++ src/observations.jl | 9 + src/old/assignment.jl | 195 ------------------ src/old/config_rules/accept_rule.jl | 26 --- .../config_rules/starting_assignment_rule.jl | 57 ----- src/old/config_rules/stop_rule.jl | 25 --- src/old/data/datasets.jl | 17 -- src/old/data/gt.jl | 40 ---- src/old/data/utils.jl | 20 -- src/old/group_numbering.jl | 43 ---- src/old/histogram.jl | 43 ---- src/old/history.jl | 88 -------- src/old/optimize.jl | 154 -------------- src/old/proposal.jl | 135 ------------ src/old/utils.jl | 109 ---------- src/optimisation/config_rules/InitRule.jl | 40 ++++ src/optimisation/config_rules/accept_rule.jl | 17 ++ .../config_rules/bandwidth_selection_rule.jl | 7 - src/optimisation/config_rules/stop_rule.jl | 35 ++++ .../config_rules/swap_rule.jl | 14 +- src/optimisation/opti.jl | 15 ++ src/optimisation/swap.jl | 18 ++ src/sbm.jl | 25 +-- test/config_rules/accept_rule_test.jl | 33 --- test/config_rules/config_rule_test.jl | 6 - .../starting_assigment_rule_test.jl | 33 --- 
test/config_rules/stop_rule_test.jl | 20 -- test/config_rules/swap_rule_test.jl | 7 - test/data_tests/utils.jl | 13 -- test/error_handling_tests.jl | 27 --- test/oracle_bandwidth_test.jl | 18 -- test/pipeline_test.jl | 84 -------- test/proposal_test.jl | 30 --- test/runtests.jl | 9 +- test/simple_test_example.jl | 39 ---- test/starting_labels_test.jl | 41 ---- test/test_files/sbm.jld | Bin 528752 -> 0 bytes test/test_multilayer.jl | 10 - test/test_swap.jl | 7 + 44 files changed, 249 insertions(+), 1394 deletions(-) create mode 100644 src/assignments/data_structures.jl create mode 100644 src/assignments/specialised_data/Bernoulli_struct.jl create mode 100644 src/observations.jl delete mode 100644 src/old/assignment.jl delete mode 100644 src/old/config_rules/accept_rule.jl delete mode 100644 src/old/config_rules/starting_assignment_rule.jl delete mode 100644 src/old/config_rules/stop_rule.jl delete mode 100644 src/old/data/datasets.jl delete mode 100644 src/old/data/gt.jl delete mode 100644 src/old/data/utils.jl delete mode 100644 src/old/group_numbering.jl delete mode 100644 src/old/histogram.jl delete mode 100644 src/old/history.jl delete mode 100644 src/old/optimize.jl delete mode 100644 src/old/proposal.jl delete mode 100644 src/old/utils.jl create mode 100644 src/optimisation/config_rules/InitRule.jl create mode 100644 src/optimisation/config_rules/accept_rule.jl rename src/{old => optimisation}/config_rules/bandwidth_selection_rule.jl (85%) create mode 100644 src/optimisation/config_rules/stop_rule.jl rename src/{old => optimisation}/config_rules/swap_rule.jl (50%) create mode 100644 src/optimisation/swap.jl delete mode 100644 test/config_rules/accept_rule_test.jl delete mode 100644 test/config_rules/config_rule_test.jl delete mode 100644 test/config_rules/starting_assigment_rule_test.jl delete mode 100644 test/config_rules/stop_rule_test.jl delete mode 100644 test/config_rules/swap_rule_test.jl delete mode 100644 test/data_tests/utils.jl delete mode 100644 
test/error_handling_tests.jl delete mode 100644 test/oracle_bandwidth_test.jl delete mode 100644 test/pipeline_test.jl delete mode 100644 test/proposal_test.jl delete mode 100644 test/simple_test_example.jl delete mode 100644 test/starting_labels_test.jl delete mode 100644 test/test_files/sbm.jld delete mode 100644 test/test_multilayer.jl create mode 100644 test/test_swap.jl diff --git a/Project.toml b/Project.toml index cf14ead..e52cc10 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" @@ -32,6 +33,7 @@ ArnoldiMethod = "0.2.0" Arpack = "0.5.4" BenchmarkTools = "1.3.2" CodecZstd = "0.7.2" +DataStructures = "0.18.20" DensityInterface = "0.4.0" Distributions = "0.25.112" Graphs = "1.9.0" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 45648d7..df6a79a 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -1,7 +1,7 @@ module NetworkHistogram -using LinearAlgebra, SparseArrays -using Distributions,DensityInterface +using LinearAlgebra, SparseArrays, DataStructures +using Distributions, DensityInterface using Graphs using PermutationSymmetricTensors @@ -9,6 +9,10 @@ import StatsBase, Random include("group_numbering.jl") include("sbm.jl") +include("observations.jl") include("assignments/Assignments.jl") +include("optimisation/swap.jl") +include("optimisation/opti.jl") +include("assignments/data_structures.jl") end diff --git a/src/UI.jl b/src/UI.jl index 4f60963..03478f5 100644 --- a/src/UI.jl +++ b/src/UI.jl @@ -10,7 +10,7 @@ cities = MetaGraph( edge_data_type = Float64, graph_data = nothing, 
default_weight = -Inf, - weight_function = identity, + weight_function = identity ); cities["Paris"] = [2.3, 48.9]; @@ -24,7 +24,6 @@ cities["Paris", "Lausanne"] = 1; getindex(cities, "Paris") - typeof(cities) index = code_for(cities, "Paris") @@ -35,7 +34,6 @@ collect(weights(cities)) G = Graph(20, 20) adjacency_matrix(G) - sources = [1, 2, 1]; destinations = [2, 3, 3]; @@ -45,7 +43,6 @@ weight_edges = [0.5, 0.8, 2.0]; g = SimpleWeightedGraph(sources, destinations, weight_edges) add_vertices!(g, 5) - function get_obs(g::SimpleGraph{T}, i::Int, j::Int) where {T} return has_edge(g, i, j) end @@ -55,30 +52,29 @@ end using NetworkHistogram group_number = NetworkHistogram.GroupSize(nv(G), 3) if typeof(group_number) == NetworkHistogram.GroupSize{Tuple{Int, Int}} - node_labels = repeat(1:length(group_number)-1, inner = group_number[1]) + node_labels = repeat(1:(length(group_number) - 1), inner = group_number[1]) last_labels = fill(length(group_number), group_number[end]) node_labels = vcat(node_labels, last_labels) else node_labels = repeat(1:length(group_number), inner = group_number[1]) end -additional_info = NetworkHistogram.BasicAdditionalInformation(1) -a = NetworkHistogram.Assignment(group_number, node_labels, additional_info) +additional_info = 1 +a = NetworkHistogram.Assignment(group_number, node_labels) dist = Bernoulli(0.5) sbm_fit = NetworkHistogram.fit(a, G, dist) - -sbm = NetworkHistogram.initialize_sbm([1/3, 1/3, 1/3], dist) +sbm = NetworkHistogram.initialize_sbm([1 / 3, 1 / 3, 1 / 3], dist) for i in 1:3 sbm[i, i] = Bernoulli(0.8) - for j in i+1:3 - sbm[i, j] = Bernoulli(0.01+0.1*(i+j)) + for j in (i + 1):3 + sbm[i, j] = Bernoulli(0.01 + 0.1 * (i + j)) end end size_per_block = 200 -A, node_labels = NetworkHistogram.sample(sbm,3*size_per_block); +A, node_labels = NetworkHistogram.sample(sbm, 3 * size_per_block); node_labels = repeat(1:3, inner = size_per_block) -group_number = NetworkHistogram.GroupSize(size(A,1), size_per_block) +group_number = 
NetworkHistogram.GroupSize(size(A, 1), size_per_block) a_star = NetworkHistogram.Assignment(group_number, node_labels, additional_info) sbm_fitted = NetworkHistogram.fit(a_star, SimpleGraph(A), dist) diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index f7f37d8..ebe584e 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -1,16 +1,23 @@ -abstract type AdditionalData end - -struct BasicAdditionalInformation <: AdditionalData - test::Int -end - - -struct Assignment{T,B<:AdditionalData} <: AbstractVector{Vector{Int}} +struct Assignment{T, B} <: AbstractVector{Vector{Int}} group_size::GroupSize{T} node_labels::Vector{Int} additional_data::B + + function Assignment(group_size::GroupSize{T}, node_labels::Vector{Int}, + additional_data::B) where {T, B} + if length(node_labels) != sum(group_size) + throw(ArgumentError("The length of `node_labels` must be equal to the sum of `group_size`")) + end + return new{T, B}(group_size, node_labels, additional_data) + end end +function Assignment(group_size::GroupSize{T}, node_labels::Vector{Int}) where {T} + if length(node_labels) != sum(group_size) + throw(ArgumentError("The length of `node_labels` must be equal to the sum of `group_size`")) + end + return Assignment(group_size, node_labels, nothing) +end function number_groups(assignment::Assignment) return length(assignment.group_size) @@ -24,53 +31,40 @@ function get_vertex_in_group(assignment::Assignment, group::Int) return findall(assignment.node_labels .== group) end - function get_edge_indices(a::Assignment, i::Int, j::Int) return [(x, y) for x in get_vertex_in_group(a, i) - for y in get_vertex_in_group(a, j)] + for y in get_vertex_in_group(a, j)] end -function get_edge_indices(a::Assignment,i::Int) +function get_edge_indices(a::Assignment, i::Int) nodes_i = get_vertex_in_group(a, i) return [(x, y) for x in nodes_i for y in nodes_i if x < y] end - -Base.size(a::Assignment) = (number_groups(a),number_groups(a)) 
+Base.size(a::Assignment) = (number_groups(a),) Base.@propagate_inbounds function Base.getindex(a::Assignment, i::Int) @boundscheck checkbounds(a, i) return get_vertex_in_group(a, i) end - -function get_obs(g::SimpleGraph{T}, x::Tuple) where {T} - return get_obs(g, x[1], x[2]) -end - -function get_obs(g::SimpleGraph{T}, i::Int, j::Int) where {T} - return convert(Bool, has_edge(g, i, j)) -end - - function loglikelihood(a::Assignment, dists::SBM, g) loglikelihood = 0.0 for i in 1:number_nodes(a) label_a = a.node_labels[i] - for j in i+1:number_nodes(a) + for j in (i + 1):number_nodes(a) label_b = a.node_labels[j] - loglikelihood += logdensityof(dists[label_a,label_b], get_obs(g,i,j)) + loglikelihood += logdensityof(dists[label_a, label_b], get_obs(g, i, j)) end end return loglikelihood end - -function fit(a::Assignment,g, distribution) +function fit(a::Assignment, g, distribution) dists = initialize_sbm(a.group_size, distribution) for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) - edges = get_edge_indices(a,group1,group2) - dists[group1,group2] = fit(distribution, g, edges) + edges = get_edge_indices(a, group1, group2) + dists[group1, group2] = fit(distribution, g, edges) end end return dists diff --git a/src/assignments/data_structures.jl b/src/assignments/data_structures.jl new file mode 100644 index 0000000..683cd65 --- /dev/null +++ b/src/assignments/data_structures.jl @@ -0,0 +1 @@ +include("specialised_data/Bernoulli_struct.jl") diff --git a/src/assignments/specialised_data/Bernoulli_struct.jl b/src/assignments/specialised_data/Bernoulli_struct.jl new file mode 100644 index 0000000..87e94d9 --- /dev/null +++ b/src/assignments/specialised_data/Bernoulli_struct.jl @@ -0,0 +1,49 @@ +struct BernoulliData{T} + counts::Matrix{Int} + realized::Matrix{Int} + estimated_theta::Matrix{T} + A::BitMatrix +end + +const BernoulliAssignment{T} = Assignment{T, BernoulliData} +const BernoulliInitRule{S,T} = InitRule{S, Val{BernoulliData{T}}} + +function 
BernoulliAssignment( + G, node_labels::Vector{Int}, group_size::GroupSize{T}) where {T} + k = length(group_size) + return BernoulliAssignment{T}(group_size, node_labels, + BernoulliData(zeros(Int, k, k), zeros(Int, k, k), zeros(T, k, k), BitMatrix(G))) +end + +function make_assignment(G, h, init_rule::BernoulliInitRule{S}) where S + return BernoulliAssignment(initialize_node_labels( + G, h, init_rule.starting_assignment_rule)...) +end + + + + + +mutable struct BernoulliSwap{T} <: Swap + index1::Int + index2::Int + old_assignment::BernoulliAssignment{T} +end + +function make_swap(assignment::BernoulliAssignment{T}, id::Tuple{Int}) where {T} + return BernoulliSwap(id[1], id[2], deepcopy(assignment)) +end + +function make_swap!(swap::BernoulliSwap{T}, assignment::BernoulliAssignment{T}, + id::Tuple{Int}) where {T} + swap.index1, swap.index2 = id + swap.old_assignment = deepcopy(assignment) +end + +function revert_swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} + assignment = deepcopy(swap.old_assignment) +end + +function swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} + # perform fast update +end diff --git a/src/observations.jl b/src/observations.jl new file mode 100644 index 0000000..a38c19c --- /dev/null +++ b/src/observations.jl @@ -0,0 +1,9 @@ +# getters for observations + +function get_obs(g::SimpleGraph, x::Tuple) + return get_obs(g, x[1], x[2]) +end + +function get_obs(g::SimpleGraph, i::Int, j::Int) + return convert(Bool, has_edge(g, i, j)) +end diff --git a/src/old/assignment.jl b/src/old/assignment.jl deleted file mode 100644 index ea06117..0000000 --- a/src/old/assignment.jl +++ /dev/null @@ -1,195 +0,0 @@ -mutable struct Assignment{T, M} - const group_size::GroupSize{T} - - const node_labels::Vector{Int} - const counts::Matrix{Int} - const realized::Array{Float64, M} - const estimated_theta::Array{Float64, M} - const number_layers::Int - - likelihood::Float64 - - function Assignment(A, node_labels, 
group_size::GroupSize{T}) where {T} - M = ndims(A) - number_groups = length(group_size) - - counts = zeros(Int64, number_groups, number_groups) - realized = zeros(Int64, number_groups, number_groups) - - @inbounds @simd for k in 1:number_groups - for l in k:number_groups - realized[k, l] = sum(A[node_labels .== k, node_labels .== l]) - realized[l, k] = realized[k, l] - counts[k, l] = group_size[k] * group_size[l] - counts[l, k] = counts[k, l] - end - end - - @inbounds @simd for k in 1:number_groups - counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2 - realized[k, k] = sum(A[node_labels .== k, node_labels .== k]) ÷ 2 - end - - estimated_theta = realized ./ counts - likelihood = compute_log_likelihood(number_groups, estimated_theta, counts, - size(A, 1)) - - new{T, M}(group_size, - node_labels, - counts, - realized, - estimated_theta, - 1, - likelihood) - end - - function Assignment(A::Array{I, 3}, - node_labels, - group_size::GroupSize{T}) where {I, T} - M = ndims(A) - number_groups = length(group_size) - - counts = zeros(Int64, number_groups, number_groups) - realized = zeros(Int64, number_groups, number_groups, 2^size(A, 3)) - - A_updated = zeros(Int64, size(A, 1), size(A, 2)) - for i in 1:size(A, 1) - for j in (i + 1):size(A, 2) - A_updated[i, j] = _binary_to_index(A[i, j, :]) - A_updated[j, i] = A_updated[i, j] - end - end - - @inbounds @simd for m in 1:size(realized, 3) - for k in 1:number_groups - for l in k:number_groups - realized[k, l, m] = sum(A_updated[node_labels .== k, - node_labels .== l] .== m) - realized[l, k, m] = realized[k, l, m] - counts[k, l] = group_size[k] * group_size[l] - counts[l, k] = counts[k, l] - end - end - end - - @inbounds @simd for m in 1:size(realized, 3) - for k in 1:number_groups - counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2 - realized[k, k, m] = sum(A_updated[node_labels .== k, node_labels .== k] .== - m) ÷ 2 - end - end - estimated_theta = realized ./ counts - likelihood = 
compute_multivariate_log_likelihood(number_groups, - estimated_theta, - realized) - - new{T, M}(group_size, - node_labels, - counts, - realized, - estimated_theta, - size(A, 3), - likelihood) - end -end - -function _binary_to_index(binary_vector::Vector{Int}) - total = 1 - for i in 1:length(binary_vector) - total += binary_vector[i] * 2^(i - 1) - end - return total -end - -function _index_to_binary(index::Int, M) - binary_vector = zeros(Int, M) - index -= 1 - for i in 1:M - binary_vector[i] = index % 2 - index = index ÷ 2 - end - return binary_vector -end - -""" - compute_log_likelihood(number_groups, estimated_theta, counts, number_nodes) - -Compute the scaled log-likelihood in terms of communities: -```math -l(z;A) = \\frac{1}{n} \\sum_{g_1 = 1}^{G} \\sum_{g_2 \\geq g_1}^{G} \\left[ \\theta_{g_1g_2} \\log(\\theta_{g_1g_2}) + (1 - \\theta_{g_1g_2}) \\log(1 - \\theta_{g_1g_2}) \\right] \\cdot c_{g_1g_2}, -``` - -where ``c_{g_1g_2}`` (``\\theta_{g_1g_2}``) is the number of possible edges (estimated -probability) between communities ``g_1`` and ``g_2``, ``n`` is the number of nodes, and -``z_i ∈ \\{1, \\dots, G\\}`` is the community assignment of node ``i``. -""" -function compute_log_likelihood(number_groups, estimated_theta, counts, number_nodes) - loglik = 0.0 - @inbounds @simd for i in 1:number_groups - for j in i:number_groups - θ = estimated_theta[i, j] - θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 1 - 1e-14 : θ) - loglik += (θ_c * log(θ_c) + (1 - θ_c) * log(1 - θ_c)) * counts[i, j] - end - end - return loglik -end - -function compute_multivariate_log_likelihood(number_groups, estimated_theta, realized) - loglik = 0.0 - @inbounds @simd for i in 1:number_groups - for j in i:number_groups - for m in 1:size(realized, 3) - if realized[i, j, m] != 0 - θ = estimated_theta[i, j, m] - θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 
1 - 1e-14 : θ) - loglik += log(θ_c) * realized[i, j, m] - end - end - end - end - return loglik -end - -""" - compute_log_likelihood(assignment::Assignment) - -Compute the scaled log-likelihood of the assignment. - -```math - l(z;A) = \\frac{1}{n}\\sum\\limits_{i=1}^n \\sum\\limits_{j>i}^n \\left[ A_{ij} \\log(\\hat{\\theta}_{z_i z_j}) + (1 - A_{ij}) \\log(1 - \\hat{\\theta}_{z_i z_j}) \\right], -``` - -where ``\\hat{\\theta}_{ab}`` is the estimated probability of an edge between communities -``a`` and ``b`` - -```math - \\hat{\\theta}_{ab} = \\frac{\\sum\\limits_{i current.likelihood - deepcopy!(current, proposal) - end - - update_current!(history, iteration, current.likelihood) - return current -end diff --git a/src/old/config_rules/starting_assignment_rule.jl b/src/old/config_rules/starting_assignment_rule.jl deleted file mode 100644 index 1ee0261..0000000 --- a/src/old/config_rules/starting_assignment_rule.jl +++ /dev/null @@ -1,57 +0,0 @@ -abstract type StartingAssignment end -struct OrderedStart <: StartingAssignment end -struct RandomStart <: StartingAssignment end -struct EigenStart <: StartingAssignment end -struct DistStart <: StartingAssignment end - -""" - initialize_node_labels(A, h, starting_assignment_rule::StartingAssignment) - -initialize node labels based on the `starting_assignment_rule`, and return a vector of -node labels and a `GroupSize` object. - -# Implemenented rules -- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. -- `RandomStart()`: Randomly assign nodes to groups. -- `EigenStart()`: Assign nodes to groups based on the second eigenvector of the normalized Laplacian. -- `DistStart()`: Assign nodes to groups based on the Hamming distance between rows of `A`. 
-""" -initialize_node_labels - -function initialize_node_labels(A, h, ::OrderedStart) - group_size = GroupSize(size(A, 1), h) - node_labels = inverse_rle(1:length(group_size), group_size) - return node_labels, group_size -end - -function initialize_node_labels(A, h, ::RandomStart) - node_labels, group_size = initialize_node_labels(A, h, OrderedStart()) - node_labels = shuffle!(node_labels) - return node_labels, group_size -end - -function initialize_node_labels(A, h, ::EigenStart) - group_size = GroupSize(size(A, 1), h) - node_labels = zeros(Int, size(A, 1)) - - laplacian = normalized_laplacian(A) - _, eigenvectors = eigs(laplacian, nev = 2, which = :LR, tol = 1e-2) - #_, eigenvectors = eigen(Symmetric(laplacian), (size(A, 1) - 1):(size(A, 1) - 1)) - - # get 2nd eigenvector, sort its components - indices = sortperm(eigenvectors[:, 1]) - # bin them into groups of correct size - start = 1 - for (i, group) in enumerate(group_size) - stop = start + group - 1 - node_labels[indices[start:stop]] .= i - start = stop + 1 - end - return node_labels, group_size -end - -function initialize_node_labels(A, h, ::DistStart) - group_size = GroupSize(size(A, 1), h) - node_labels = spectral_clustering(A, h) - return node_labels, group_size -end diff --git a/src/old/config_rules/stop_rule.jl b/src/old/config_rules/stop_rule.jl deleted file mode 100644 index d2ae852..0000000 --- a/src/old/config_rules/stop_rule.jl +++ /dev/null @@ -1,25 +0,0 @@ -abstract type StopRule end -struct PreviousBestValue <: StopRule - k::Int - function PreviousBestValue(k::Int) - @assert k > 0 - new(k) - end -end - -""" - stopping_rule(history, stop_rule::StopRule) - -Returns a Bool with true if we should stop the optimization based on the `stop_rule`. - -# Implemented rules -- `PreviousBestValue(k)`: Stop if the current iteration is `k` iterations away from the - iteration with the best value. 
-""" -stopping_rule - -function stopping_rule(history::GraphOptimizationHistory, stop_rule::PreviousBestValue) - current_itr = get_currentitr(history) - prev_best_itr = get_bestitr(history) - return current_itr[1] - prev_best_itr[1] > stop_rule.k -end diff --git a/src/old/data/datasets.jl b/src/old/data/datasets.jl deleted file mode 100644 index 8c7e071..0000000 --- a/src/old/data/datasets.jl +++ /dev/null @@ -1,17 +0,0 @@ -using HTTP, CodecZstd, TranscodingStreams - -const url_ref = "https://networks.skewed.de" - -include("utils.jl") - -function get_netzschleuder_network(name::String) - url = joinpath(url_ref, "net", name, "files", name * ".gt.zst") - res = HTTP.get(url) - - if res.status != 200 - error("Error downloading network" * res.status) - end - - decompressed = Base.IOBuffer(transcode(ZstdDecompressor, res.body)) - return readgt(decompressed) -end diff --git a/src/old/data/gt.jl b/src/old/data/gt.jl deleted file mode 100644 index 77e283f..0000000 --- a/src/old/data/gt.jl +++ /dev/null @@ -1,40 +0,0 @@ -"""Utils to read .gt files - -Inspired from the library Erdos.jl from CarloLucibello (precisely the file -`src/persistence.jl`) -""" - -const start_gt_format = "⛾ gt" - -function readgt_simple_network!(io::IO, adj, ::Type{T}) where {T} - n = size(adj, 1) - for i in 1:n - k = read(io, UInt64) - for _ in 1:k - j = read(io, T) + 1 - if i != j - adj[i, j] = 1 - adj[j, i] = 1 - end - end - end -end - -function readgt(io::IO) - @assert String(read(io, 6))==start_gt_format "gt file not correctly formatted" - ver = read(io, UInt8) ## version - indian = read(io, Bool) - @assert indian == false - lencomment = read(io, UInt64) - read(io, lencomment) - isdir = read(io, Bool) - n = read(io, UInt64) - T = minutype(n) - if isdir - @warn "Directed graphs are not supported, automatically converting to undirected." 
- end - adj = zeros(Int, n, n) - - readgt_simple_network!(io, adj, T) - return adj -end diff --git a/src/old/data/utils.jl b/src/old/data/utils.jl deleted file mode 100644 index 083fa67..0000000 --- a/src/old/data/utils.jl +++ /dev/null @@ -1,20 +0,0 @@ -function drop_isolated_vertices(A) - degrees = vec(sum(A, dims = 2)) - return A[degrees .> 0, degrees .> 0] -end - -####### From Erdos.jl ####### - -function minutype(n::Integer) - @assert n ≥ 0 - if n < 2^8 - return UInt8 - elseif n < 2^16 - return UInt16 - elseif n < 2^32 - return UInt32 - elseif n < 2^64 - return UInt64 - end - error("No type big enough") -end diff --git a/src/old/group_numbering.jl b/src/old/group_numbering.jl deleted file mode 100644 index 12d9691..0000000 --- a/src/old/group_numbering.jl +++ /dev/null @@ -1,43 +0,0 @@ -""" -Array-like storage for the number of nodes in each group. -""" -struct GroupSize{T} <: AbstractVector{Int} - group_number::T - number_groups::Int - - function GroupSize(number_nodes, h::Real) - @assert 0 < h < 1 - standard_group = floor(Int, number_nodes * h) - GroupSize(number_nodes, standard_group) - end - - function GroupSize(number_nodes, standard_group::Integer) - @assert 1 < standard_group < number_nodes - number_groups = number_nodes ÷ standard_group # number of standard groups! 
- if number_groups * standard_group == number_nodes - new{Int}(standard_group, number_groups) - else - remainder_group = number_nodes - number_groups * standard_group - if remainder_group == 1 - @warn "h has to be changed, only one node in remainder group" - standard_group -= 1 - remainder_group = number_groups + 1 # because equal to 1+number_groups because we take 1 from each standard group, and there are number_groups of them - if standard_group == 1 - error("Standard group size now 1, please choose a new value for h.") - end - end - new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups + 1) - end - end -end - -Base.size(g::GroupSize) = (g.number_groups,) -Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) - @boundscheck checkbounds(g, i) - return g.group_number -end - -Base.@propagate_inbounds function Base.getindex(g::GroupSize{Tuple{Int, Int}}, i::Int) - @boundscheck checkbounds(g, i) - return i < length(g) ? g.group_number[1] : g.group_number[2] -end diff --git a/src/old/histogram.jl b/src/old/histogram.jl deleted file mode 100644 index 126d1ad..0000000 --- a/src/old/histogram.jl +++ /dev/null @@ -1,43 +0,0 @@ -struct GraphHist{T, M} - θ::Array{T, M} - node_labels::Vector{Int} - num_layers::Int - function GraphHist(a::Assignment{T, M}) where {T, M} - θ = a.estimated_theta - node_labels = a.node_labels - new{typeof(θ[1]), M}(θ, node_labels, a.number_layers) - end -end - -""" -Network Histogram approximation [1]. - -Contains the estimated network histogram and the node labels. - -# Fields -- `θ::Matrix{T}`: Estimated stochastic block model parameters. -- `node_labels::Vector{Int}`: Node labels for each node in the adjacency matrix used - to estimate the network histogram. - -# References -[1] - Olhede, Sofia C., and Patrick J. Wolfe. "Network histograms and universality of -blockmodel approximation." Proceedings of the National Academy of Sciences 111.41 (2014): 14722-14727. 
-""" -GraphHist - -function get_moment_representation(g::GraphHist{T, 2}) where {T} - return g.θ -end - -function get_moment_representation(g::GraphHist{T, 3}) where {T} - moments = zeros(size(g.θ, 1), size(g.θ, 2), 2^g.num_layers - 1) - transition = collect(kronecker([1 1; 0 1], g.num_layers)) - for i in 1:size(g.θ, 1) - for j in 1:size(g.θ, 2) - moments[i, j, :] .= (transition * g.θ[i, j, :])[2:end] - end - end - indices_for_moments = [findall(x -> x == 1, _index_to_binary(e, g.num_layers)) - for e in 2:size(g.θ, 3)] - return moments, indices_for_moments -end diff --git a/src/old/history.jl b/src/old/history.jl deleted file mode 100644 index 381f175..0000000 --- a/src/old/history.jl +++ /dev/null @@ -1,88 +0,0 @@ -abstract type GraphOptimizationHistory end -struct TraceHistory{M <: MVHistory} <: GraphOptimizationHistory - history::M -end -mutable struct NoTraceHistory <: GraphOptimizationHistory - current_iteration::Int - best_iteration::Int - best_likelihood::Float64 -end - -""" - initialize_history(best, current, proposal, ::Val{true}) - - initialize the history when `record_trace=true` is passed to `graphhist`. -""" -function initialize_history(best, current, proposal, ::Val{true}) - history = MVHistory(Dict([ - :proposal_likelihood => QHistory(Float64), - :current_likelihood => QHistory(Float64), - :best_likelihood => QHistory(Float64), - ])) - push!(history, :proposal_likelihood, 0, proposal.likelihood) - push!(history, :current_likelihood, 0, current.likelihood) - push!(history, :best_likelihood, 0, best.likelihood) - return TraceHistory(history) -end - -""" - initialize_history(best, current, proposal, ::Val{false}) - -initialize the history when `record_trace=false` is passed to `graphhist`. -""" -function initialize_history(best, current, proposal, ::Val{false}) - return NoTraceHistory(0, 0, best.likelihood) -end - -""" - get_currentitr(history::GraphOptimizationHistory) - -Return the current iteration of the optimization from the history. 
-""" -get_currentitr(history::TraceHistory) = last(history.history, :current_likelihood) -get_currentitr(history::NoTraceHistory) = history.current_iteration - -""" - get_bestitr(history::GraphOptimizationHistory) - -Return the best iteration of the optimization from the history. -""" -get_bestitr(history::TraceHistory) = last(history.history, :best_likelihood) -get_bestitr(history::NoTraceHistory) = history.best_iteration - -""" - update_current!(history::GraphOptimizationHistory, iteration, likelihood) - -Updates the current value and iteration in history. -""" -function update_current!(history::TraceHistory, iteration, likelihood) - push!(history.history, :current_likelihood, iteration, likelihood) -end -function update_current!(history::NoTraceHistory, iteration, likelihood) - history.current_iteration = iteration -end - -""" - update_best!(history::GraphOptimizationHistory, iteration, likelihood) - -Updates the best value and iteration in history. -""" -function update_best!(history::TraceHistory, iteration, likelihood) - push!(history.history, :best_likelihood, iteration, likelihood) -end -function update_best!(history::NoTraceHistory, iteration, likelihood) - history.best_iteration = iteration - history.best_likelihood = likelihood -end - -""" - update_previous!(history::GraphOptimizationHistory, iteration, likelihood) - -Updates the previous value and iteration in history. - -Note this does not apply is `history` is a `NoTraceHistory`, so nothign happens. 
-""" -function update_proposal!(history::TraceHistory, iteration, likelihood) - push!(history.history, :proposal_likelihood, iteration, likelihood) -end -update_proposal!(history::NoTraceHistory, iteration, likelihood) = nothing diff --git a/src/old/optimize.jl b/src/old/optimize.jl deleted file mode 100644 index 751ffcf..0000000 --- a/src/old/optimize.jl +++ /dev/null @@ -1,154 +0,0 @@ -function checkadjacency(A) - @assert eltype(A) <: Real - if !(eltype(A) === Bool) - @assert all(a ∈ [zero(eltype(A)), one(eltype(A))] for a in A) "All elements of the ajacency matrix should be zero or one." - end - check_symmetry_and_diag(A) - return nothing -end - -function check_symmetry_and_diag(A) - @assert issymmetric(A) - @assert all(A[i, i] == zero(eltype(A)) for i in 1:size(A, 1)) "The diagonal of the adjacency matrix should all be zeros." -end - -function check_symmetry_and_diag(A::Array{T, 3}) where {T} - for layer in eachslice(A, dims = 3) - check_symmetry_and_diag(layer) - @assert all(layer[i, i] == zero(eltype(layer)) for i in 1:size(layer, 1)) "The diagonal of the adjacency matrix should all be zeros." - end -end - -function update_adj(A::Array{T, 2}) where {T} - return A -end - -function update_adj(A::Array{T, 3}) where {T} - A_updated = zeros(Int64, size(A, 1), size(A, 2)) - for i in 1:size(A, 1) - for j in (i + 1):size(A, 2) - A_updated[i, j] = _binary_to_index(A[i, j, :]) - A_updated[j, i] = A_updated[i, j] - end - end - return A_updated -end - -""" - graphhist(A; h = select_bandwidth(A), maxitr = 1000, swap_rule = RandomNodeSwap(), - starting_assignment_rule = RandomStart(), accept_rule = Strict(), - stop_rule = PreviousBestValue(3), record_trace=true) - -Computes the graph histogram approximation. 
- -# Arguments -- `A`: adjacency matrix of a simple graph - -- `h`: bandwidth of the graph histogram (number of nodes in a group or percentage (in [0,1]) of - nodes in a group) - -- `record_trace` (optional): whether to record the trace of the optimization process and return - it as part of the output. Default is `true`. - -# Returns -named tuple with the following fields: -- `graphhist`: the graph histogram approximation -- `trace`: the trace of the optimization process (if `record_trace` is `true`) -- `likelihood`: the loglikelihood of the graph histogram approximation - -# Examples -```julia -julia> A = [0 0 1 0 1 0 1 1 0 1 - 0 0 1 1 1 1 1 1 0 0 - 1 1 0 1 0 0 0 0 1 0 - 0 1 1 0 1 0 1 0 0 0 - 1 1 0 1 0 0 1 0 0 1 - 0 1 0 0 0 0 0 1 0 0 - 1 1 0 1 1 0 0 1 0 1 - 1 1 0 0 0 1 1 0 0 1 - 0 0 1 0 0 0 0 0 0 1 - 1 0 0 0 1 0 1 1 1 0] -julia> out = graphhist(A); -julia> graphist_approx = out.graphist -... -julia> trace = out.trace -NetworkHistogram.TraceHistory{...} - :best_likelihood => 1 elements {Int64,Float64} - :proposal_likelihood => 5 elements {Int64,Float64} - :current_likelihood => 5 elements {Int64,Float64}) -julia> loglikelihood = out.likelihood --22.337057781338277 -``` -""" -function graphhist(A; h = select_bandwidth(A), maxitr = 10000, - swap_rule::NodeSwapRule = RandomNodeSwap(), - starting_assignment_rule::StartingAssignment = EigenStart(), - accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = PreviousBestValue(100), record_trace = true) - checkadjacency(A) - @assert maxitr > 0 - A = drop_disconnected_components(A) - - return _graphhist(A, Val{record_trace}(), h = h, maxitr = maxitr, swap_rule = swap_rule, - starting_assignment_rule = starting_assignment_rule, - accept_rule = accept_rule, - stop_rule = stop_rule) -end - -""" - _graphhist(A, record_trace=Val{true}(); h, maxitr, swap_rule, starting_assignment_rule, accept_rule, stop_rule) - -Internal version of `graphhist` which is type stable. 
-""" -function _graphhist(A, record_trace = Val{true}(); h, maxitr, swap_rule, - starting_assignment_rule, accept_rule, stop_rule) - best, current, proposal, history, A = initialize(A, h, starting_assignment_rule, - record_trace) - - for i in 1:maxitr - proposal = create_proposal!(history, i, proposal, current, A, swap_rule) - current = accept_reject_update!(history, i, proposal, current, accept_rule) - best = update_best!(history, i, current, best) - if stopping_rule(history, stop_rule) - break - end - end - - return graphhist_format_output(best, history) -end - -""" - graphhist_format_output(best, history) - -Formates the `graphhist` output depending on the type of `history` requested by the user. -""" -function graphhist_format_output(best, history::TraceHistory) - return (graphhist = GraphHist(best), trace = history, likelihood = best.likelihood) -end -function graphhist_format_output(best, history::NoTraceHistory) - return (graphhist = GraphHist(best), likelihood = history.best_likelihood) -end - -function update_best!(history::GraphOptimizationHistory, iteration::Int, - current::Assignment, - best::Assignment) - if current.likelihood > best.likelihood - update_best!(history, iteration, current.likelihood) - deepcopy!(best, current) - end - return best -end - -""" - initialize(A, h, starting_assignment_rule, record_trace) - -Initialize the memory required for finding optimal graph histogram. 
-""" -function initialize(A, h, starting_assignment_rule, record_trace) - node_labels, group_size = initialize_node_labels(A, h, starting_assignment_rule) - proposal = Assignment(A, node_labels, group_size) - current = deepcopy(proposal) - best = deepcopy(proposal) - history = initialize_history(best, current, proposal, record_trace) - return best, current, proposal, history, update_adj(A) -end diff --git a/src/old/proposal.jl b/src/old/proposal.jl deleted file mode 100644 index 848a2fb..0000000 --- a/src/old/proposal.jl +++ /dev/null @@ -1,135 +0,0 @@ -"""Functions to create and evaluate possible labels update.""" - -""" - create_proposal!(history::GraphOptimizationHistory, iteration::Int, proposal::Assignment, - current::Assignment, A, swap_rule) - -Create a new proposal by swapping the labels of two nodes. The new assignment is stored in -`proposal`. The swap is selected using the `swap_rule` function. The likelihood of the new -proposal is stored in the history. - -!!! warning - The `proposal` assignment is modified in place to avoid unnecessary memory allocation. -""" -function create_proposal!(history::GraphOptimizationHistory, iteration::Int, - proposal::Assignment, - current::Assignment, A, swap_rule) - swap = select_swap(current, A, swap_rule) - make_proposal!(proposal, current, swap, A) - update_proposal!(history, iteration, proposal.likelihood) - return proposal -end - -""" - make_proposal!(proposal::Assignment, current::Assignment, swap::Tuple{Int, Int}, A) - -From the current assignment, create a new assignment by swapping the labels of the nodes -specified in `swap`. The new assignment is stored in `proposal`. -""" -function make_proposal!(proposal::Assignment, current::Assignment, swap::Tuple{Int, Int}, A) - # copy current in proposal - deepcopy!(proposal, current) - # update realized, estimated_theta - update_observed!(proposal, swap, A) - # update node labels (has to happen after!!!) 
- update_labels!(proposal, swap, current) - # update ll - updateLL!(proposal) -end - -""" - update_labels!(proposal::Assignment, swap::Tuple{Int, Int}, current::Assignment) - -Update the labels of the nodes specified in `swap` in the `proposal` assignment. -""" -function update_labels!(proposal::Assignment, swap::Tuple{Int, Int}, current::Assignment) - proposal.node_labels[swap[1]] = current.node_labels[swap[2]] - proposal.node_labels[swap[2]] = current.node_labels[swap[1]] -end - -""" - updateLL!(proposal::Assignment) - -Update the likelihood of the `proposal` assignment based on its observed and estimated -attributes. -""" -function updateLL!(proposal::Assignment) - # O(G^2) where G is the number of groups - proposal.likelihood = NetworkHistogram.compute_log_likelihood(proposal) -end - -""" - update_observed!(proposal::Assignment, swap::Tuple{Int, Int}, A) - -Update the observed and estimated attributes of the `proposal` assignment based on the -swap of the nodes specified in `swap`. - -NOTE labels of the nodes before the swap -""" - -function update_observed!(proposal::Assignment{T, 2}, swap::Tuple{Int, Int}, A) where {T} - group_node_1 = proposal.node_labels[swap[1]] - group_node_2 = proposal.node_labels[swap[2]] - - for i in axes(A, 1) - if i == swap[1] || i == swap[2] || A[swap[1], i] == A[swap[2], i] - continue - end - group_i = proposal.node_labels[i] - if A[i, swap[1]] == 1 - proposal.realized[group_node_1, group_i] -= 1 - proposal.realized[group_i, group_node_1] = proposal.realized[group_node_1, - group_i] - - proposal.realized[group_node_2, group_i] += 1 - proposal.realized[group_i, group_node_2] = proposal.realized[group_node_2, - group_i] - end - if A[i, swap[2]] == 1 - proposal.realized[group_node_2, group_i] -= 1 - proposal.realized[group_i, group_node_2] = proposal.realized[group_node_2, - group_i] - - proposal.realized[group_node_1, group_i] += 1 - proposal.realized[group_i, group_node_1] = proposal.realized[group_node_1, - group_i] - end - end - - 
@. proposal.estimated_theta = proposal.realized / proposal.counts - - return nothing -end - -function update_observed!(proposal::Assignment{T, 3}, swap::Tuple{Int, Int}, A) where {T} - group_node_1 = proposal.node_labels[swap[1]] - group_node_2 = proposal.node_labels[swap[2]] - if group_node_1 == group_node_2 - return nothing - end - - for i in axes(A, 1) - if i == swap[1] || i == swap[2] || A[swap[1], i] == A[swap[2], i] - continue - end - group_i = proposal.node_labels[i] - - proposal.realized[group_node_1, group_i, A[i, swap[1]]] -= 1 - proposal.realized[group_i, group_node_1, A[i, swap[1]]] = proposal.realized[group_node_1, - group_i, A[i, swap[1]]] - proposal.realized[group_node_2, group_i, A[i, swap[1]]] += 1 - proposal.realized[group_i, group_node_2, A[i, swap[1]]] = proposal.realized[group_node_2, - group_i, A[i, swap[1]]] - - proposal.realized[group_node_1, group_i, A[i, swap[2]]] += 1 - proposal.realized[group_i, group_node_1, A[i, swap[2]]] = proposal.realized[group_node_1, - group_i, A[i, swap[2]]] - proposal.realized[group_node_2, group_i, A[i, swap[2]]] -= 1 - proposal.realized[group_i, group_node_2, A[i, swap[2]]] = proposal.realized[group_node_2, - group_i, A[i, swap[2]]] - end - - @. 
proposal.estimated_theta = proposal.realized / proposal.counts - - return nothing -end diff --git a/src/old/utils.jl b/src/old/utils.jl deleted file mode 100644 index 3b85a6f..0000000 --- a/src/old/utils.jl +++ /dev/null @@ -1,109 +0,0 @@ -function laplacian(A) - s = sum(A; dims = 1) - return diagm(vec(s)) - A -end - -function normalized_laplacian(A) - L = zeros(size(A)) - degrees = vec(sum(A, dims = 1)) - for j in 1:size(A, 1) - for i in 1:size(A, 2) - if i == j - L[i, j] = 1 - else - L[i, j] = A[i, j] / sqrt(degrees[i] * degrees[j]) - end - end - end - return L -end - -function normalized_laplacian(A::AbstractArray{T, 3}) where {T} - L = zeros(size(A, 1), size(A, 2)) - for layer in eachslice(A, dims = 3) - L .+= normalized_laplacian(layer) - end - return L ./ size(A, 3) -end - -function drop_disconnected_components(A::AbstractArray{T, 2}) where {T} - indices = findall(x -> x != 0, vec(sum(A, dims = 1))) - return A[indices, indices] -end - -function drop_disconnected_components(A::AbstractArray{T, 3}) where {T} - indices = findall(x -> x != 0, vec(sum(A, dims = (1, 3)))) - return A[indices, indices, :] -end - -""" - hamming_distance(x, y) - -Compute the normalized Hamming distance between two vectors `x` and `y`. -""" -function hamming_distance(x::Vector{T}, y::Vector{T}) where {T} - return sum(x .!= y) / length(x) -end - -""" - pairwise_hamming_distance(A) - -Compute the pairwise Hamming distance between all rows of `A`. If `A` is a 3D -array, then the average Hamming distance for each layer of the array is returned. 
-""" -pairwise_hamming_distance - -function pairwise_hamming_distance(matrix::AbstractArray{T, 2}) where {T} - n = size(matrix, 1) - dist_matrix = zeros(n, n) - for i in 1:n, j in (i + 1):n - dist_matrix[i, j] = hamming_distance(matrix[i, :], matrix[j, :]) - dist_matrix[j, i] = dist_matrix[i, j] # Symmetric matrix - end - return dist_matrix -end - -function pairwise_hamming_distance(matrix::AbstractArray{T, 3}) where {T} - n = size(matrix, 1) - dist_matrix = zeros(n, n) - for layer in eachslice(matrix, dims = 3) - dist_matrix .+= pairwise_hamming_distance(layer) - end - return dist_matrix ./ size(matrix, 3) -end - -function spectral_clustering(A, h) - n = size(A, 1) - - L = 1 .- pairwise_hamming_distance(A) ./ n - - # Compute the degree matrix - d = sum(L, dims = 2) - - # Compute the normalized Laplacian - normalized_L = sum(1.0 ./ d) .* L .- sum(d) / sqrt(sum(d .^ 2)) - - # Compute eigenvalues and eigenvectors of the normalized Laplacian - - decomp, history = partialschur(normalized_L, nev = 2, which = LR()) - _, eigen_vecs = partialeigen(decomp) - - # Extract the second eigenvector - u = real.(eigen_vecs[:, 1]) - u = u .* sign(u[1]) # Set the first coordinate >= 0 wlog - - # Sort based on the embedding - ind = sortperm(u, alg = QuickSort, rev = false) - - # Determine the number of clusters - k = ceil(Int, n / h) - - # Initialize cluster assignments - idxInit = zeros(Int, n) - for i in 1:k - for j in ((i - 1) * h + 1):min(n, i * h) - idxInit[ind[j]] = i - end - end - return idxInit -end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl new file mode 100644 index 0000000..65320ff --- /dev/null +++ b/src/optimisation/config_rules/InitRule.jl @@ -0,0 +1,40 @@ +abstract type StartingAssignment end +struct OrderedStart <: StartingAssignment end +struct RandomStart <: StartingAssignment end + + +struct InitRule{S <: StartingAssignment, I} + starting_assignment_rule::S + assignment_rule::I +end + +function 
make_assignment(A, h, init_rule::InitRule{S, Nothing}) where S + return Assignment(initialize_node_labels(A, h, init_rule.starting_assignment_rule)...) +end + +""" + initialize_node_labels(A, h, starting_assignment_rule::StartingAssignment) + +initialize node labels based on the `starting_assignment_rule`, and return a vector of +node labels and a `GroupSize` object. + +# Implemenented rules +- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. +- `RandomStart()`: Randomly assign nodes to groups. +""" +initialize_node_labels + +function initialize_node_labels(A, h, ::OrderedStart) + group_size = GroupSize(size(A, 1), h) + node_labels = inverse_rle(1:length(group_size), group_size) + return node_labels, group_size +end + +function initialize_node_labels(A, h, ::RandomStart) + node_labels, group_size = initialize_node_labels(A, h, OrderedStart()) + node_labels = shuffle!(node_labels) + return node_labels, group_size +end + +# check https://github.com/TrainOfCode/LocalFennelPartitioning.jl/tree/main +# check https://github.com/JuliaSparse/Metis.jl diff --git a/src/optimisation/config_rules/accept_rule.jl b/src/optimisation/config_rules/accept_rule.jl new file mode 100644 index 0000000..e3e57ea --- /dev/null +++ b/src/optimisation/config_rules/accept_rule.jl @@ -0,0 +1,17 @@ +abstract type AcceptRule end +struct Strict <: AcceptRule end + +""" + accept_reject_update!(proposal::Assignment, current::Assignment, + accept_rule::AcceptRule) + + +Return the updated `current` assignment based on the `accept_rule`. + +# Implemented rules +- `Strict()`: Accept the proposal if it has a higher likelihood than the current assignment. +""" +accept_reject_update! 
+ +function accept_reject_update!(swap::Swap, current::Assignment, ::Strict) +end diff --git a/src/old/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl similarity index 85% rename from src/old/config_rules/bandwidth_selection_rule.jl rename to src/optimisation/config_rules/bandwidth_selection_rule.jl index fc0eb40..15ff178 100644 --- a/src/old/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -4,13 +4,6 @@ function select_bandwidth(A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int return max(2, min(size(A)[1], round(Int, h))) end -function select_bandwidth(A::Array{T, 3}; type = "degs", alpha = 1, c = 1)::Int where {T} - hs = [select_bandwidth(A[:, :, i]; type, alpha, c) for i in 1:size(A, 3)] - @warn "Naive bandwidth selection for multilayer graph histogram: using minimum over layers" - h = max(2, min(size(A, 1), round(Int, minimum(hs)))) - return h -end - """ oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl new file mode 100644 index 0000000..73e6de4 --- /dev/null +++ b/src/optimisation/config_rules/stop_rule.jl @@ -0,0 +1,35 @@ +abstract type StopRule end +mutable struct PreviousBestValue{T} <: StopRule + k::Int + past_values::Queue{T} + function PreviousBestValue(k::Int) + @assert k > 0 + new{T}(k, Queue{T}(k)) + end +end + +""" + stopping_rule(assignment::Assignment, stop_rule::StopRule) + +Returns a Bool with true if we should stop the optimization based on the `stop_rule`. + +# Implemented rules +- `PreviousBestValue(k)`: Stop if the current iteration is `k` iterations away from the + iteration with the best value. 
+""" +stopping_rule + +function stopping_rule(assignment::Assignment, stop_rule::PreviousBestValue) + loglikelihood = loglikelihood(assignment) + if length(stop_rule.past_values) == 0 + push!(stop_rule.past_values, loglikelihood) + return false + elseif loglikelihood > first(stop_rule.past_values) + empty!(stop_rule.past_values) + push!(stop_rule.past_values, loglikelihood) + return false + else + push!(stop_rule.past_values, loglikelihood) + return length(stop_rule.past_values) == stop_rule.k + 1 #always keep the best value + end +end diff --git a/src/old/config_rules/swap_rule.jl b/src/optimisation/config_rules/swap_rule.jl similarity index 50% rename from src/old/config_rules/swap_rule.jl rename to src/optimisation/config_rules/swap_rule.jl index dcd21a7..be76aab 100644 --- a/src/old/config_rules/swap_rule.jl +++ b/src/optimisation/config_rules/swap_rule.jl @@ -3,7 +3,7 @@ abstract type NodeSwapRule end struct RandomNodeSwap <: NodeSwapRule end """ - select_swap(node_assignment::Assignment, A, ::NodeSwapRule) + select_swap(node_assignment::Assignment, ::NodeSwapRule) Selects two nodes to swap based on the `NodeSwapRule`, the adjacency matrix `A` and the current assignment `node_assignment`. @@ -13,15 +13,15 @@ current assignment `node_assignment`. 
""" select_swap -function select_swap(node_assignment::Assignment, A, ::RandomNodeSwap) - index1 = rand(1:size(A, 1)) - label1 = node_assignment.node_labels[index1] +function select_swap(assignment::Assignment, ::RandomNodeSwap) + index1 = rand(1:number_nodes(assignment)) + label1 = assignment.node_labels[index1] index2 = index1 for _ in 1:10 - index2 = rand(1:size(A, 1)) - if node_assignment.node_labels[index2] != label1 + index2 = rand(1:number_nodes(assignment)) + if assignment.node_labels[index2] != label1 break end end - return (index1, index2) + return make_swap(assignment, (index1, index2)) end diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl index e69de29..529f921 100644 --- a/src/optimisation/opti.jl +++ b/src/optimisation/opti.jl @@ -0,0 +1,15 @@ +include("config_rules/swap_rule.jl") +include("config_rules/accept_rule.jl") +include("config_rules/InitRule.jl") +include("config_rules/stop_rule.jl") +include("config_rules/bandwidth_selection_rule.jl") + + + +function greedy_improve!(a::Assignment, G; max_iter::Int=1000, + swap_rule::NodeSwapRule = RandomNodeSwap(), + accept_rule::AcceptRule = Strict(), + initialise_rule::InitRule = InitRule(RandomStart(), nothing), +) + +end diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl new file mode 100644 index 0000000..3fb09ff --- /dev/null +++ b/src/optimisation/swap.jl @@ -0,0 +1,18 @@ +abstract type Swap end + +mutable struct DefaultSwap <: Swap + index1::Int + index2::Int +end + +make_swap(::Assignment{T,Nothing}, id::Tuple{Int}) where {T} = DefaultSwap(id[1], id[2]) +function make_swap!(swap::DefaultSwap, assignment::Assignment, id::Tuple{Int}) + swap.index1, swap.index2 = id +end + +function swap!(assignment::Assignment, swap::DefaultSwap) + assignment.node_labels[swap.index1], assignment.node_labels[swap.index2] = assignment.node_labels[swap.index2], + assignment.node_labels[swap.index1] +end + +revert_swap!(assignment::Assignment, swap::DefaultSwap) = swap!(assignment, swap) diff 
--git a/src/sbm.jl b/src/sbm.jl index 0c2e1fb..7f8e4e2 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -1,19 +1,17 @@ -struct SBM{T,K,F<:Real} <: AbstractMatrix{T} - sizes :: Vector{F} - probs:: SymmetricTensor{T, K, 2} +struct SBM{T, K, F <: Real} <: AbstractMatrix{T} + sizes::Vector{F} + probs::SymmetricTensor{T, K, 2} end - function _check_sizes(sizes) - @assert sum(sizes) ≈ 1 "Sizes must sum to 1, got $(sum(sizes))" + @assert sum(sizes)≈1 "Sizes must sum to 1, got $(sum(sizes))" return sizes end function _check_sizes(sizes::Vector{Int}) - return sizes./sum(sizes) + return sizes ./ sum(sizes) end - function initialize_sbm(sizes::Vector, dist, k = length(sizes)) sizes = _check_sizes(sizes) n_dims = binomial(k + 1, 2) @@ -22,9 +20,8 @@ function initialize_sbm(sizes::Vector, dist, k = length(sizes)) return SBM(sizes, SymmetricTensor(probs, Val(k), Val(2))) end - function initialize_sbm(sizes::GroupSize, dist, k = length(sizes)) - size_bins = sizes./sum(sizes) + size_bins = sizes ./ sum(sizes) n_dims = binomial(k + 1, 2) probs = Vector{typeof(dist)}(undef, n_dims) fill!(probs, dist) @@ -35,18 +32,16 @@ function initialize_sbm(k::Int, dist) return initialize_sbm(ones(k) / k, dist) end -number_blocks(::SBM{T,K}) where {T,K} = K +number_blocks(::SBM{T, K}) where {T, K} = K -Base.size(s::SBM)= size(s.probs) +Base.size(s::SBM) = size(s.probs) Base.ndims(::SBM) = 2 -Base.eltype(::SBM{T,K}) where {T,K} = T +Base.eltype(::SBM{T, K}) where {T, K} = T Base.setindex!(s::SBM, v, i, j) = setindex!(s.probs, v, i, j) Base.@propagate_inbounds function Base.getindex(s::SBM, i, j) return getindex(s.probs, i, j) end - - function sample(rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) n_blocks = number_blocks(sbm) node_labels = StatsBase.sample( @@ -56,7 +51,7 @@ function sample(rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) end A = BitMatrix(undef, n_nodes, n_nodes) for i in 1:n_nodes - for j in i+1:n_nodes + for j in (i + 1):n_nodes A[i, j] = 
Random.rand(rng, sbm[node_labels[i], node_labels[j]]) A[j, i] = A[i, j] end diff --git a/test/config_rules/accept_rule_test.jl b/test/config_rules/accept_rule_test.jl deleted file mode 100644 index 56de1f3..0000000 --- a/test/config_rules/accept_rule_test.jl +++ /dev/null @@ -1,33 +0,0 @@ -import NetworkHistogram: accept_reject_update!, initialize_history -@testset "accept rule" begin - iteration = 3 - A, node_labels, group_size, proposal = make_simple_example() - - proposal.likelihood = 0.0 - current = deepcopy(proposal) - best = deepcopy(proposal) - - histories = [ - initialize_history(best, current, proposal, Val{true}()), - initialize_history(best, current, proposal, Val{false}()), - ] - test_likelihoods = [-0.1, 0.1] - for history in histories, lik in test_likelihoods - proposal.likelihood = lik # set proposal - accept_reject_update!(history, iteration, proposal, current, Strict()) - - @testset "Strict with history is $(typeof(history).name.name), likelihood=$lik" begin - @test current.likelihood == max(lik, 0.0) # should have accepted if better - if history isa NetworkHistogram.TraceHistory - @test get(history.history, :current_likelihood)[1][end] == iteration - @test get(history.history, :current_likelihood)[2][end] == - current.likelihood - else - @test history.current_iteration == iteration - end - end - - current.likelihood = 0.0 # reset for next example - iteration += 1 # otherwise will get an error from history - end -end diff --git a/test/config_rules/config_rule_test.jl b/test/config_rules/config_rule_test.jl deleted file mode 100644 index 7c7c95c..0000000 --- a/test/config_rules/config_rule_test.jl +++ /dev/null @@ -1,6 +0,0 @@ -@testset "config rules" begin - include("accept_rule_test.jl") - include("starting_assigment_rule_test.jl") - include("stop_rule_test.jl") - include("swap_rule_test.jl") -end diff --git a/test/config_rules/starting_assigment_rule_test.jl b/test/config_rules/starting_assigment_rule_test.jl deleted file mode 100644 index 
5ff2617..0000000 --- a/test/config_rules/starting_assigment_rule_test.jl +++ /dev/null @@ -1,33 +0,0 @@ -import NetworkHistogram: initialize_node_labels - -@testset "starting assignment rules" begin - @testset "starting assigment rule simple graphs" begin - A, _, _, _ = make_simple_example() - for method in (OrderedStart(), - RandomStart(), - EigenStart(), - DistStart()) - node_labels, group_size = initialize_node_labels(A, 4, method) - if method isa OrderedStart - @test sort(node_labels) == node_labels - end - @test all(sum(n -> n == j, node_labels) == group_size[j] - for j in unique(node_labels)) - end - end - - @testset "starting assigment rule multilayer graphs" begin - A, _, _, _ = make_multivariate_example() - for method in (OrderedStart(), - RandomStart(), - EigenStart(), - DistStart()) - node_labels, group_size = initialize_node_labels(A, 4, method) - if method isa OrderedStart - @test sort(node_labels) == node_labels - end - @test all(sum(n -> n == j, node_labels) == group_size[j] - for j in unique(node_labels)) - end - end -end diff --git a/test/config_rules/stop_rule_test.jl b/test/config_rules/stop_rule_test.jl deleted file mode 100644 index e814c70..0000000 --- a/test/config_rules/stop_rule_test.jl +++ /dev/null @@ -1,20 +0,0 @@ -import NetworkHistogram: stopping_rule -@testset "stop rule" begin - A, node_labels, group_size, proposal = make_simple_example() - - proposal.likelihood = -0.1 - current = deepcopy(proposal) - best = deepcopy(proposal) - - histories = [ - initialize_history(best, current, proposal, Val{true}()), - initialize_history(best, current, proposal, Val{false}()), - ] - for history in histories - for i in 1:4 - NetworkHistogram.update_current!(history, i, 0.0) - end - @test stopping_rule(history, PreviousBestValue(3)) == true - @test stopping_rule(history, PreviousBestValue(4)) == false - end -end diff --git a/test/config_rules/swap_rule_test.jl b/test/config_rules/swap_rule_test.jl deleted file mode 100644 index bcf4075..0000000 --- 
a/test/config_rules/swap_rule_test.jl +++ /dev/null @@ -1,7 +0,0 @@ -import NetworkHistogram: select_swap -@testset "swap rule" begin - A, node_labels, group_size, assignment = make_simple_example() - x = select_swap(assignment, A, RandomNodeSwap()) - @test x isa Tuple{Int, Int} - @test all(1 .≤ x .≤ size(A, 1)) -end diff --git a/test/data_tests/utils.jl b/test/data_tests/utils.jl deleted file mode 100644 index d37db8a..0000000 --- a/test/data_tests/utils.jl +++ /dev/null @@ -1,13 +0,0 @@ -@testset "Data utils" begin - A = [0 0 0 1 - 0 0 0 0 - 1 0 0 1 - 0 0 1 0] - - @testset "drop isolated vertices" begin - B = NetworkHistogram.drop_isolated_vertices(A) - @test B == [0 1 1 - 1 0 1 - 0 1 0] - end -end diff --git a/test/error_handling_tests.jl b/test/error_handling_tests.jl deleted file mode 100644 index 76fae2b..0000000 --- a/test/error_handling_tests.jl +++ /dev/null @@ -1,27 +0,0 @@ -@testset "Error handling" begin - @testset "Adjacency matrix" begin - As = [ - [0 1 - 0 0], [1 1 - 1 0], [0 2 - 2 0], [0 1 - 1 0 - 0 1], - ] - for A in As - @test_throws AssertionError graphhist(A, h = 2) - @test_throws AssertionError graphhist(Bool.(min.(A, 1)), h = 2) - @test_throws AssertionError graphhist(Float64.(A), h = 2) - end - @test_throws AssertionError graphhist(["0" "1"; "1" "0"], h = 2) - end - @testset "maxitr" begin - @test_throws AssertionError graphhist([0 1; 1 0], h = 2, - maxitr = -1) - end - @testset "h" begin - for h in (3, -1, 1.1, -0.1) - @test_throws AssertionError graphhist([0 1; 1 0], h = h) - end - end -end diff --git a/test/oracle_bandwidth_test.jl b/test/oracle_bandwidth_test.jl deleted file mode 100644 index 27e1f26..0000000 --- a/test/oracle_bandwidth_test.jl +++ /dev/null @@ -1,18 +0,0 @@ -@testset "oracle bandwidth test" begin - A = [0 0 1 0 1 0 1 1 0 1 - 0 0 1 1 1 1 1 1 0 0 - 1 1 0 1 0 0 0 0 1 0 - 0 1 1 0 1 0 1 0 0 0 - 1 1 0 1 0 0 1 0 0 1 - 0 1 0 0 0 0 0 1 0 0 - 1 1 0 1 1 0 0 1 0 1 - 1 1 0 0 0 1 1 0 0 1 - 0 0 1 0 0 0 0 0 0 1 - 1 0 0 0 1 0 1 1 1 0] - 
h = NetworkHistogram.oracle_bandwidth(A) - rho = sum(A) / (size(A, 1) * (size(A, 1) - 1)) - h_true_nethist = 2.643731 # version 0.2.3 from nethist package - h_clean = 3 - @test h≈h_true_nethist atol=1e-4 - @test NetworkHistogram.select_bandwidth(A) == h_clean -end diff --git a/test/pipeline_test.jl b/test/pipeline_test.jl deleted file mode 100644 index 7f86664..0000000 --- a/test/pipeline_test.jl +++ /dev/null @@ -1,84 +0,0 @@ -@testset "Pipeline" begin - A = [0 0 1 0 1 0 1 1 0 1 - 0 0 1 1 1 1 1 1 0 0 - 1 1 0 1 0 0 0 0 1 0 - 0 1 1 0 1 0 1 0 0 0 - 1 1 0 1 0 0 1 0 0 1 - 0 1 0 0 0 0 0 1 0 0 - 1 1 0 1 1 0 0 1 0 1 - 1 1 0 0 0 1 1 0 0 1 - 0 0 1 0 0 0 0 0 0 1 - 1 0 0 0 1 0 1 1 1 0] - @testset "dummy run" begin - @testset "run bandwidth float" begin - estimated = graphhist(A; h = 0.5) - @test all(estimated.graphhist.θ .>= 0.0) - @test all(estimated.graphhist.θ .<= 1.0) - @test size(estimated.graphhist.θ) == (2, 2) - end - @testset "run bandwidth int" begin - estimated = graphhist(A; h = 5) - @test all(estimated.graphhist.θ .>= 0.0) - @test all(estimated.graphhist.θ .<= 1.0) - @test size(estimated.graphhist.θ) == (2, 2) - end - @testset "run with automatic bandwidth" begin - estimated = graphhist(A) - @test all(estimated.graphhist.θ .>= 0.0) - @test all(estimated.graphhist.θ .<= 1.0) - end - end - - @testset "associative stochastic block model" begin - adjacencies = load(pwd() * "/test_files/sbm.jld") - - for (name, adjacency) in adjacencies - @testset "$name" begin - estimated, history = graphhist(adjacency; h = 0.3, - stop_rule = PreviousBestValue(100), - starting_assignment_rule = OrderedStart()) - @test all(estimated.θ .>= 0.0) - estimated, history = graphhist(adjacency; h = 0.3, - stop_rule = PreviousBestValue(100), - starting_assignment_rule = OrderedStart(), - record_trace = false) - @test all(estimated.θ .>= 0.0) - end - end - end - - @testset "multilayer run" begin - @testset "2 layers perfectly correlated" begin - A_2 = cat(A, A, dims = 3) - estimated, history = 
graphhist(A_2; h = 0.5) - @test all(estimated.θ .>= 0.0) - @test all(estimated.θ .<= 1.0) - @test size(estimated.θ) == (2, 2, 4) - end - @testset "run with automatic bandwidth" begin - A_2 = cat(A, A, dims = 3) - estimated, history = graphhist(A_2) - @test all(estimated.θ .>= 0.0) - @test all(estimated.θ .<= 1.0) - end - - @testset "2 layers perfectly anti-correlated" begin - A_2 = cat(A, abs.(A .- 1), dims = 3) - for i in 1:size(A, 1) - A_2[i, i, 2] = 0 - end - estimated, history = graphhist(A_2; h = 0.5) - @test all(estimated.θ .>= 0.0) - @test all(estimated.θ .<= 1.0) - @test size(estimated.θ) == (2, 2, 4) - end - - @testset "3 layers" begin - A_3 = cat(A, A, A, dims = 3) - estimated, history = graphhist(A_3; h = 0.5) - @test all(estimated.θ .>= 0.0) - @test all(estimated.θ .<= 1.0) - @test size(estimated.θ) == (2, 2, 8) - end - end -end diff --git a/test/proposal_test.jl b/test/proposal_test.jl deleted file mode 100644 index 0328b8e..0000000 --- a/test/proposal_test.jl +++ /dev/null @@ -1,30 +0,0 @@ -@testset "Proposal" begin - A, node_labels, group_size, assignment = make_simple_example() - h = 0.5 - swap = (2, 5) - proposal = deepcopy(assignment) - NetworkHistogram.make_proposal!(proposal, assignment, swap, A) - reference_proposal = NetworkHistogram.Assignment(A, [1, 2, 1, 1, 1, 2, 2, 2], - group_size) - - @testset "update labels" begin - @test proposal.node_labels[swap[1]] == reference_proposal.node_labels[swap[1]] == 2 - @test proposal.node_labels[swap[2]] == reference_proposal.node_labels[swap[2]] == 1 - end - - @testset "update realized edges" begin - @test proposal.realized[1, 2] == reference_proposal.realized[1, 2] == 8 - @test proposal.realized[2, 1] == reference_proposal.realized[2, 1] == 8 - @test proposal.realized[1, 1] == reference_proposal.realized[1, 1] == 2 - @test proposal.realized[2, 2] == reference_proposal.realized[2, 2] == 2 - end - - @testset "fast likelihood update" begin - # inside each group likelihood contribution - 
theoretical_after_update = 2 * (2 * log(2 / 6) + log(4 / 6) * 4) - # between group likelihood contribution - theoretical_after_update += 8 * log(8 / 16) * 2 - @test proposal.likelihood == theoretical_after_update == - reference_proposal.likelihood - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 643bfc7..d785c62 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,14 +2,7 @@ using NetworkHistogram using Test using JLD -include("simple_test_example.jl") @testset "NetworkHistogram.jl" begin - include("pipeline_test.jl") - include("test_multilayer.jl") - include("proposal_test.jl") - include("starting_labels_test.jl") - include("oracle_bandwidth_test.jl") - include("error_handling_tests.jl") - include("config_rules/config_rule_test.jl") + include("test_swap.jl") end diff --git a/test/simple_test_example.jl b/test/simple_test_example.jl deleted file mode 100644 index 010f511..0000000 --- a/test/simple_test_example.jl +++ /dev/null @@ -1,39 +0,0 @@ -""" - make_simple_example() - -Makes the simple example used in many tests. 
-Returns A, node_labels, group_size, assignment -""" -function make_simple_example() - A = [0 1 1 1 0 0 1 0 - 1 0 1 1 0 0 0 0 - 1 1 0 0 0 0 0 0 - 1 1 0 0 0 0 0 1 - 0 0 0 0 0 1 1 1 - 0 0 0 0 1 0 1 1 - 1 0 0 0 1 1 0 0 - 0 0 0 1 1 1 0 0] - node_labels = [1, 1, 1, 1, 2, 2, 2, 2] - group_size = NetworkHistogram.GroupSize(8, 4) - assignment = NetworkHistogram.Assignment(A, node_labels, group_size) - return A, node_labels, group_size, assignment -end - -function make_multivariate_example() - A = [0 1 1 1 0 0 1 0 - 1 0 1 1 0 0 0 0 - 1 1 0 0 0 0 0 0 - 1 1 0 0 0 0 0 1 - 0 0 0 0 0 1 1 1 - 0 0 0 0 1 0 1 1 - 1 0 0 0 1 1 0 0 - 0 0 0 1 1 1 0 0] - A = cat(A, abs.(A .- 1), dims = 3) - for i in size(A, 1) - A[i, i, :] .= 0 - end - node_labels = [1, 1, 1, 1, 2, 2, 2, 2] - group_size = NetworkHistogram.GroupSize(8, 4) - assignment = NetworkHistogram.Assignment(A, node_labels, group_size) - return A, node_labels, group_size, assignment -end diff --git a/test/starting_labels_test.jl b/test/starting_labels_test.jl deleted file mode 100644 index e94c3e4..0000000 --- a/test/starting_labels_test.jl +++ /dev/null @@ -1,41 +0,0 @@ - -""" - test_basic_node_labels(node_labels, group_size) - -Test that the node labels are valid: - - correct number of labels - - labels are positive - - labels are within the range of number of groups -""" -function test_basic_node_labels(node_labels, group_size) - @test length(node_labels) == sum(group_size) - @test all(node_labels .> 0) - @test all(node_labels .<= length(group_size)) - for (i, group_s) in enumerate(group_size) - @test count(x -> x == i, node_labels) == group_s - end -end - -@testset "Initial node labels" begin - A, _, _, _ = make_simple_example() - h = 0.5 - - @testset "random start" begin - node_labels, group_size = NetworkHistogram.initialize_node_labels(A, h, - RandomStart()) - test_basic_node_labels(node_labels, group_size) - end - - @testset "ordered start" begin - node_labels, group_size = NetworkHistogram.initialize_node_labels(A, h, - 
OrderedStart()) - test_basic_node_labels(node_labels, group_size) - @test node_labels == sort(node_labels) - end - - @testset "eigenvalue start" begin - node_labels, group_size = NetworkHistogram.initialize_node_labels(A, h, - EigenStart()) - test_basic_node_labels(node_labels, group_size) - end -end diff --git a/test/test_files/sbm.jld b/test/test_files/sbm.jld deleted file mode 100644 index dcd8b9f3e9f9f4d083b653c1d9854da5b66b5b90..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 528752 zcmeI5Kd)}dv(`IPWSlYPm}AaouJ;@}4gA4>`13#e#pnO} z7k~MS&;QMz{rR7M{+%Cu@1OqTzyA+D|BrwAtAGDzfAQ}=|A)W(kAC;}-qruy^#cFt z{rCQzZ~eV*eSF_HzxnPr-hKP&sb717|M&bqzx>G{O*5#kpZw}ifAN=p@hdpL%P$Zb@w4~8*Vlf(Up)U1KQ8q?^Xd2d z&i;NI$CmwO7tr4?|Bn6D^S^66m)|SDEA(Fc*^htnz0cnNf|>XE*PoJ@^MhypZ)K0( zd3Wvm?A>pE{CD7+&xhlp-0|zr|4w}S{Q-XC;}Jgp_`kpL{_-C@U;FH9AFbbf{t{n* z@A@G8ooD{x&;RTH{*OO-_wKJh_Fd(F{rW%8fBR|UFJ$p=J-;6r|6l*s>;4sY{_xws z%boxDZ@zvv?)u+;=k@xEyMFkO-mBmBYrp<)ru;Wf@ac}f`CR|(+n@dRAN=!w_QUV~ z{13nTNB`sp|NHsp`~L56^52E`KfFn*)%*U} zPpbR=@vC?H(RcsgCqMnp-*{g6llpJ{i=X`Tdq4l#5C7%&zj*#f_-*Z1D^zqeoe z`fq&op4a;K_LI9k{~mwz!yo_Tr~mosxApG*ukq=>8NdI%e(U=`{@xG2`{SSg{!f1N zU!MQf`0}nF|HJ3woxk((i#-3w(+9)T{}}xBf6l!h_%!p2kN#i(p1;O_&OPn%_0N9! 
z{D0};e~SO=clhN~;o$!r{r%Vf{QC8M-+hz+Q-l9X^>OoG*T4PQH-7f}Kl=Ia{h#N5 zkdyD??9NVIG4f~Y`(q+fZYQ+Md{oWJz33)H>G z?iY2}eWMGVTd(i4=Kj%LwXU44R9B9#7hRsyvvaf`4w9oB9gOPu9A^8~r(E-0-Rbme zT{*tOITz6TDxWiDUP^YqtS{PZR!%g>)mP15mFwP7`n@jD`uo8T?s5(G0ekmy{aIao ztvtK8=F;bM>JhsBNT&|!oK{`?i$CTC-0x=hYg~2j-S1lat-V0+@3ys1{f?u9?)6%C z936Cdqk3z8uFL2iPIcsj~b+58#>${Hj z*B#ehT<@!#=}ynqoiECzFTyA%s=Ivp6!D|O9cF#>XZ`p<`bSaolxu!eSAWE3=SB0X zUKe=e3tav0+gbacHFuZas(ZiFvwM&BXXo_#qWxUm>DfK#2eq$Nr=LC$9aLR8zATRR zXXmJI*4$Y+x?l0Sz!fgwe*aFt!kb2Ry!yL!f926_vi__unnyj#i|XWnkw2=FGYgNN z$KKK0s7}sNxZ6HQy}Rbr1wLG0_U}KNm-W?G+T#MNzE?I^eaiJpQC&F0k9* zqwGFeU-XU00ek)QgXF*+;#-Bx&E{wQS)b00`l7n#DIe`8Z&pq=Z}vRqf~)hHb2qHJ zUGs|`_1%t|6Mco=JzD#+e)TDL@0sn>Ihqd#@hgvX>LC4!)D`im?zsAu)7PrIK9}SC z=!)ck)FY1XuBh+7>pSaRZ(rc(-@(!I-hO>mXI&uruCA^+9ArN@sJi3mvyk~4Ssl4kKSwce$#&9H(o4#K{5GuX@C_zxZQaKz|?Z^;w_mzl&aXxXYJ3=NGXMN0#{90G;a-0wU zD6Y;Q-R~~_(Ve6EZg*I<@2cFh?svEMoOQ=pb7y@Im*ezV`$sv^{HPw~NA=!(`W!}c zU7b9wx9Zy8%3J3-f9u@x*9GbVb%DA-UEr+?^#12f)aU9>*B!Gy?Nd$;NF5HMD@Jwv z_&{_;>Y&!)QQi5nee~ZId+*TOch()|OWa?Nph=pcQouy+o9pxaOD%3WXSb%9SWP`{f`uTg(?7pULK%geZXpo1^F@{E2k z>dgAc!4Ikq2i1oTMs<9O^nqH3XY1;NgXHxhIbd%NeZ4uoeay|`QFGX5^*qg2-nxhD zC#M%z<@WAVeRY9Hyuhr#hrKzyeXZYW_8iSq&VJ}1dN0n((fQH>GXz}5dgx9+cg+dkR7%XjSy@Vf)` zHRxb#UHz@x&27zz{H=4;4+mGzbNSi%u3vTXK?GZ^)9Xb>W71^^POMwlxts9SHJRXf8?VM>Kr(A5M5F0%GK|<>q7@!ztemB zTwdwdzCi2mq1DfIdi&_VIzIaQcl&;4eUH((+OJ&itaUi3{!y;E%E?Cu(LtxXy7N1o z9B{TzbF=x{2Um1?=nl!D4hNl1U9r{g`dyCGHD@<|)xCS~_-OC#y;Jo)UEooFFVVZB zdi48o{gLkKz4Kfja}=E)9Y0*Lw;uKB9JuzuBXs@fpv$4|kUr|lRd;oKg>x@Z_nv#t zN~;T8`2wr{Js?++TcN{`MxF1t>bj?^J3Z>p_GwPU$p!I89N#R|d84`HfuniazbaSf z6hG$z^?lE|`=d>LwBH%;1^M2}(MQqcs;)emdz7C!AbUj|-%+Uh=v=MC6}9itIQJNR z_pF@JJ*t0?3)JsnkMD5zEqZ@vQJmKxjt^Mksm7}@v)%naRy!Hk3ccOgv*R9&un^MI*8P;Pnj7WrvPZP%QT5h)s9$+@ z@7_MmCr43zqn!NFoYnnT*)zJsUEUGB^Xm2bPFMee$L!zxJBj*S-RZ5b)g0xmdCtEo zclJE?1ZU^FJanB42k{jidjakb9y`6J)dlJTJ70i*7eL+B>8-xe9QqyVKE0fL?RR?C zm+jX)$H`N4esuN2sXL5(T31ePq`Nx(5svmp`BiVbz;54Tbf0X0*4K8;FWuSvtnbU@ 
z*Z(vt=k@ebkGg>Sj!uvEq7PKR>dMt0ar(i?PaQvq4mv%m(+A>@5I-3CT|Mfj4!WGI zt~rjYPQL2O@wH-WPOHClpWXbsymyy9cKbe8-zWQBvp#o^(SDaZs;}N_bw2Np=1>Re z%VN|=9gK3bK5|<9^vy>00JHh4{kjM9Kj4yFF?v-?DTt&`h}nx~vxr&CwN4^mf- zu9)@p_N~rmuk0LrApH)rKG$FR+zZtAn0wFOw7Y+YquzJaoZj!es&Cf2HAgx7s}2Y8 zDN+Zi!xgDVnC;8@qIsir_8=&ym!rN`uDKC!&3At0fX?UiY+u&r=AdVD zvOaPP>jGz7;OyV&?D@0v&iE!Z^SBG}dr@QqbhyK;&-JT5nwRY#_0bRNJ0c$SQ3sRCTNu(cmQs5$6a9PQ7}Dc{@+TKl=Ww_0RrJv-7TY z&+NXBawpyiy8E~~IvDjkA3ErABHi_)gS|PfPjj;zAL#PYvp)O|vpJD3s%Pi-=D7T; z>UD>Q@8FAG3;Aw}`U>i*!$<4nfKeX4 z2(vj^Uv}QCJU1`Or>^Mo(D6Ht4z~KWPy4kFFFfZ0?)#jx&2^{N9oKw2y{q!sQ&;Do zwXeQQIeR&sy5cH-lpEFAGx9}saw2TaQ9m4Ho^teAIGW$uANiyDtaq=@+4UW}?zQ@R zkgxAoj<0a+0*`vv-QPXBAN^qKuO#ZzIvi}BH_IQ*rw-1VOU|m?v+{Wl$R34tfg>*v z{r;l*k-NS&Yxn(L+j&I0`g_mXzjtob*ITC#)Zav_uKlf?d0^D9J~-Igul{U~>!Tl3 zpK^R4b;r>a>Dvw2bGN-}pSr+`3%K9A(^u?JnfqO!_4`NusLloOMW}xGC^FaOP>+y) zd~k>AQ=aX^=lY#aUlv<)TK(BQ9>sUn`(1T!{axMV9r~__kG^}fKicnZ^+&xI?^e9p z_htXH_s;H-_3@rozxMU^YYzM_n7vEZm%YoZyjeNfy&v7zdhe`1>wEP3>-+C^H+>Hr z)b~5?{7xr_x*~N&eCTjR>L7KvVpLaumTRu^Yz{v96{%;De)T(DeG%83**uy*yRQ4s z&cAEk)$VrozGu(h?RQxBnDa&SjuF?X>eo8l`Orc9AUcSyNF8)_)g5;^>YwGy=IDHQ z)K6WJenot$JB}Z8zPssXz4xfON6oq0@1ovQ7r4>|M*p5`zjAJ&Ivm6YI-Poi^eehP ze6AkpQNPxelaCIfM~EN9A8~x3^Q(>@qz9_QdgvYb>v-Vzhjp@&b~vl>AVXZPkcM!s5x(MvA+5!?_7V$wy1&)0jb z;*avqRrl-7&E7rhi}r9i=wOsX9Y2T;w(9hO^nfq}4>6}*1e)NHnPJK4&{MLN>6*Y&t>Jg_8jQm=s57au` z)tye?Z0w!a+c$gv${V{t>vwGRk8M}oyIgreS`khXHRB!d+FFf}G+3&h~%ho+w{j2wP`EF0AyPVSN0_e@Dt+ z>v>Cy`lGsLt6w>Or&Et`w4eOmoT!hwB6Ah-sou(6j?<%j`V^zSs@DbHx&XhYw>&+y zRk=O8eA=_?UZ3t*f9}7)(cf3+jq>Q_%*T#azXmwpz81}w&tkc&2c_- z(DkE_`m+7ZE36CD1;#H>fA7X`*phJdd!Mzx&Q%_LgRCDP*qW<-$~7;lFBy|ZKl`qCzu(p0ch&u`viDhgG8c^g z9f*9c?sT1p4~*(rpXMlcIqHLF{aGKmAbpBjhet?`^F{U6zS((heyiTQS2Sl;uA6)G zJh#Vcz23R{!gKQP`knM$%I93~xD>a)>dJM)sE$AD!><_ikJgzN<>1=|*%w^3*X;Z# zFRHJ4C%I?OXOCXI`u?r&cJFn1Ykv9b0$093^!LEkRac(934NgUwd(Y{KI&lBhd)A> z7wN99^VA0i)!)lqp6YPLXfA$y%F)57j&B#-ec#>pbKmi*d-I;F@6S65?|A|D`$PwM 
zz4Az>9^tIqY>xVr&)SzdGUXxzrXAr_&~j;)A2=hd=ZiZ;#ZD-G_nV{YQE+xUp1Fp=7Et; zy%%*(Z%*~iy+D1Jx%Yh3wE7+OeR;hb93&MEqJynEeXCISfji9ZLBI2BUAgOQf&1wcg94xl!(DKl$V+QlE`(4tg{XABZ0gst#X` zI)C-N%C8HIT;NfEuaElfk>5(DSMRDklku9THK*18+RP7HHYe-TIrJ$;KI+*%{0_-+ zb=6z_QGaWm`jtoXw7x5k-nHxAyX#kSZchQ2`=_&}{I&-%14;;V9_xmB+V)CKASM_*vozjv#0 z^*%aBIew>82l0XEqevbYstw(*-9cmsNKm2H9KB)6m zhlBVdjxURv7xm#+qz@flcstE{ocx3uaWJ?2Qmk)sCCED70E-7I6e@+qSlq;R~W}I!S9O=8x^m6Q@+fDtp53E-k9mRU_w|@psaMwpUS5E|C*aF! z%(PFrCXCjzdC?qlvOfG-Bv+AsboeNe2ae`xKe-@vh0^59$^;vk-9NqVdzH6dlYX$ZKkL_g<=Gs3u0N|gANdi|uNd|9)|o@Ua&*P4Px~Ct=HUZfUg`I` zKz;9Xzi03Ld;89PZGQUQ-rU|u>i#R!ot$kTPKIXxDQFD~L+^la^Ke_Co zNIgROB41S39OY5}+4ZaKd-lDqHvcN$_bPi{?faj-FW(iMwNGnatDilizNk*lD7rc5 zAamd#dKP>8viY-o**&wq>|V3;vUAHf_X71D=iW0)+x6b7_tJdj(cM#5r@K7-AU@^jpw^W;A9@xwS2;PYI(>!rxPbfq_jq0V zE9xHYH@ic}S>L^NZmWOv+hpfueeBcf*FNo6j$cuI%GD3Yr$`-)>gt0llB0UWvwc}# zH1Fv8RrjttJ$f zL;9&Jk96t|U7q^!fm&A{`KW{R!xgE6)Zvecb%#g&^7njq>+iYMf6uF}e^vI*_(0vv z>G+~LKG6A{P9K={;a8*|)H*z>)YiXZVz-t?uZUo)Oy6D{;2Nq(Lv40a{9q+pZe$rsVi6A)$xJqi@5fw zpE?-T@hQ4Kr)&S+dG8%upVRMt$9mt~3)Jsu?me%X#_s@ZeTVBq2Q?QR)VkxXewU-V z<~puAIZmhEisUJ_=0tw-@xejW;fm^~?sV$A;nnty?zj3q+5D_8de3Nmmpw+`QT>l` zfmy%T)j2NT>7!q7RW5t2&TpN=KA`K@I=)$``O0U_A!l{|ZgY-$FYhj#dx83XbMHAT z?d<-9|6<@jcy%Xhk)oApONS9d!5M0I=-&dQ1A>{8$Dz0p2V z{ph>yw$IV;fAo20fB#kUSLM3j3-dtdbGr5^cYUq8%gyS}#~#!{)#0G}&?D^ai*lp- z?76LZ>Q^4^-&((Ve(QUW_OHJ0(eGJ3@2Gn*2VDIw%|C0d?(t~6_wL#kad%hkbG}iX zIpC~3&6$<2xsJPePIoy+>+Gkfdr(*XF8t{C&7QY=Ui3Sey?->Xwa$KEYk#Z%s5z~B zmH(Uz%=$gd%E?|q=P4%#9aLR8K9D--bm|di`?EebPx~Y8az=HXt9emf{YP=Pm(!2h ztLD@N>H>9v)fdoz=is25AJw%l>(BZ$AMX0Iy7Q5**y`7QxXX$3s7^lU{M5l#AAV4C z&_U{jb1sm*Yt}bst4cZR0(@tXJ(QzI*xMK7UcJ8SUEE%(!$JKX=!)u#IQ{sPqbuSA zwGNN!_`p_ww$J6Z>dXVP{mw@Yhz~>`Ma@%AzSF5^aaK-tp8C$7ulqb2Uw!}8`A2`R zyS?Y=`(15b>-TQ;NB3}f=wOtib^1W9JC5HWeOgBcqq_RwStMWmPFJ7u-agGKKIZ~= z{~oj6H<~lrKj&*dZpzX3XCC;dm*@S8*?Z8J^{dbIJ3ZU4zERFT;AmcNKl4G&K?k+o z$~8y1=0$b&J3cGN<+tj)?c?@u-Luv2=9XR;IN}2Jdwu)Lz4saI%l6{~-+s+ktFF)K 
zUrl&YoF5(3esqv}#PLPwa#SbB>C_$e_PIQ#>l`>ppYlkjUU;nwxbM8%D~{Ups5!0o z)xDIr=9T~27g+uMqkLCK2cs+F10#QHJ)4{Lwa$0`(RuWP(H!-)a+kkK*ZpSAS(Q6$ zzpLaNb;qOTT;=Yy=iCdl{=QoMv$l=qM)hb9`oYLgUH!`Ofm&DYeCVL-qdp7W9CXFr zx#ZBV9DNj{+|l~ze$~In1)|@<9$S971^a?uR`&Xn?x%dOZEKs|E}*}ch_~*qdGtG- z`YdD)NUm~pkUEI2n5{>?Y_8_OLCuM{_9>@db>+^74!S<-g>`|tKwV(h3-Irk;;#F; zz1W|+@<^xNiaJMmHqZ5`4hN%r=Tlv|=0*{sY(b%@hRf#Me=(4tFP?>(Y>jIE*UPFXs_AxR^{;yuyu~>b2|B>b#fHR1*t1XFTBPD>ie~QC+?-#mQwCV zy8yqJ=-pYL=4AWS2M0B`mAB?Nzv_;w-n#Em{_LJvU+dnjx$0NmI=}pF7s%c_>uZ~S zocrqgA9tym^yvj2<#*HjPWr%4Z};h+SJywC_359pbMb+nX1w<2tWSHheOaI8!?oXW zbddfm(g(Ko;iumrzQVe|P8Ziw>| zckZjqb7k+8^|6`ymE!}cgORRvc!Zj-e(E4~IEbD_?Nd%pq`Nx(it3MeYrp!HGY{0d za_4ip_8-l6-OJtKYWwOt&ffp*`P^fdJ=kYgtUJB@GTZ}vS)Iqo2VF9CFzZu4TrtX< zRcEfto24@coRvE(=jgqTp2wZen#=sz7`-Q|&)%o<9_s=}{l1Qxv-<0;dPi?=Z=bsp zd7$&Du72hCRgXA*vry;t=FZAt{%9WkAh{8?`lGzN*Xv&AeBrCzeb&CSa?bf`HI)nK zcZ7q^hpwo7)SXVV@~Zz**n>thu{>y}HM)U*?>>X8j&qj?;NQQmpA?^64|egXG;b^7Zm z$EmCK(p)&$`$E@0OK;taxuEu|u3Y_a^+lXMF!ECeU0<(GpQ6sAjt-(<4X?ic)$YA& z-};WLE<2W6-(~E6+8y09s=GY;Kzwiz9dtVNRwPf69CWxte2VI)j_$bA>2s)kSs#5M zeQ-tUkAl4aQQo`zJL`MA{e`mk$@<=&_U_%RoM=wf_qqVz9n|g6L9Hvtr>H(SKIN)w z9X=b~T;@b|_2CDpM~GiB>Z48{+~KH?esFY->o5K30@0m!ztdIrpS@>q-s-;I{M~i9wDkLtaB^FyS`REn&awjj`K&l%R^V}&5Qc7b@supJkqJ}itatT z?t9n0&iW2#&7JifNB7tMR_@-RInLLrugY!R<7hwc26aBVqWWfY=FQH{<}n}4_Ko^R z=SBV0LAUQ{-Q}WdzH)qpb%7l&aF^fbUEZ<7Z{iL+jQ$>a`(2;Yxph>>7hyIh>x<@5 zCl5pi(LwYqx<03CZsc=yr)xeOzan*zI=pbr1+I4IUG~*Il&|?(_e~wW-)R3)m)&(w zx7RFP@9T22zHGndwQ}Zwt^Mksl|wFhAo^}AObIX731^Diu+orXSU(w~OPM_+^oev$%_TlU8qhHbGsn79jf7a*bl)lFW zj{1&A&DrCskF!O6*JHo6-Wv|;Ht3+%BOdj+daKSJ^nvK0>dNssR6iVaeNLxO(fL)6 z`lxH4a_4ip_7|@Ue7HdUUgrJ|>Yj7&S!rz-X#M@Q`gI%aSFZk9oH?N8I$eG6tUvNa zb>=B%`|yF91E&tE4hQi?h`(^|1-Sd%)cdB@z3%&M>t*o;`2B-%N1G#?oAtTr^n>`| zAi5%TQ0qr==7ZU}t$uP{pVR46RDZbj?L*RQ(rY!1GtA72)uJnCRJ*ZDL@x$BE`a=^$>UH!`O^&)v-Z_cW|UH6*( z-bZ=o?7KyIk9OC)zW3~R9X+3U;Mu=(-9ve_Uu&IwkbdP^oj#C0MOSya_AA#uIH>+C zr(cnNr&He@*?+e^XYDmB=ji(!z4y`c+&y>M!|iufee|xQ{a3vkdyl?9>esq*_HgL@ 
zPIo!II&(nHK?hsw>W8~L>R{G~|7^V4zGvU7=I1U@zpLDxuHvfiIp<4pM=+bJe&yt- zt{mS{7|oCBZqMGn);yh~+~u|Enyb7uul##mVD|T%l{YJAuWPT{rtUR%+uiS%-LKWB z{)ltsUZ3lCy3WCu#nyiOV0Mo4kptpW-mB9WA@e}z*E$@`_Bmhab%9SW5dE$`_12&3 z0=*YF`*(KqeCC0AH>ax)-s+F~$OG{^jC`&>OLu#r>l`?!KBwbTR6lj6Q+L?f7x}4! zv*x;-((3|sfx3WRAo}-5{g1}E54h_6ny);XtA6F$PaV`cTkm?4dq5=zNh*J&QUo+t=!g=D51kTj!U5&IRgw%(i9tQJB|+0KZ@jmt$DqDt#cy3?x8&DKdQd#URQhnqu=qU?{$?q z^?TU+OYwa`-4>Nb`Jfme)?Lmb;Cwoy{LoqDMoet zkuR!~Q&<;x?E>}tdF?9K)9O2(`_1(J%GoZ`se|+>Y8?(neywM@=EFgkqjlx358e4& zb@Cj#Jm-&e@<9B`(ZQ&W&tYqStKZEn{q+m1zVGV%*I(7I&fa%+UjNIgGjoBfe6Lx1 z&dP~yo1KFXBv&~)7}fEC_!X%u;zL*NbmzklI-lxrkbcFej^FvvLHrK0KG$FRxfiJK zch3E1-F;ThQCE2MeWUkBb@!d3zNoIb>Q|0mb>;XJ@qyHpqbuTz(Dk9y4|nK%PIo!2 zy32Pu`B5ETgybk{A9dBQ%J23bzQbAXYMtBa=Y55Bfk(K2`}?7~a&F^v>R{G~-=XFx zr%!d|_#*7>BRAWR52Sw<_U1-?T30@6|5-Wg2S&MBUzAhz$GkxH_mlNK<`u8^>Z^QD z_Egl1Q&&CWv->npx$cv#t1sd@$8pVZKJ->UeOYw*PS575FUr$89Mrrlcl}P!=9X{m z1@!wUUpsw$`rY2M{$e%Zhzmr&ORXzsM|2R~Vb({#`r(TB74fOA9G{}|Wp(vMocZ{a z_v-Y4E|>ajWF9y>|E_sgxz|igc@9_m-P~4P_k@Ekk2>glSJm(Ou3f)J-Fw$B^H_WFI|N%_-`m$Z zr`OMZU~i7=(_H1+7uD6T-1V#OxYKn`mXoL0+Mo5~V;=k{Ts?Pn{!w>2d(P}Tx;&?! 
z{r*++ukzlj?78YYjpn-hIem2ARr{lRMD<%8IkR$f4{|}R--RFjzN6<|^*hy_=6<Bd4uNRM=*SqJezPrEosP`Q;XV!PGoVq~E1)|?~ zRBxHp@*Q+It%r>_@R<@WBg zy6@56yXu}-x#Ly#T=kvK&b`|Gci&g{Q=a|ay?vTb4ybi_RL9qfE*Bkixl!HqW%bc~ z`oZj6e7$}27uE&p0>@v#{k?Vi@f%)i;_ACa`BA<1nwsl4I_Ty_b?tM0bVcpYa&pwK zx^nzZ*ShmLo&HgD`A#35w|jr~-C5u6?`6N;-!pr^*?HV?HC}aZ?sE10dB@qPcT}$X zXpa&V-IZoI9R_=0DSMKIyee`Gj_(1v{W__;z zQS`dgqkQSHch>Ku?|$43Bqieb6tz$5SL6B~<-Et~uI~KkisUHbi*)C6efUA=Q(d|1 zSD$j%*Q>ie=8o3MaTw*hK6Ls)^rP|Y`MvLTdpo^%PPT9L+|~KncRs4G?*ICif0Vnp zcRHPGX&nyY1JObBR@8nt809%%q*GUP^Rhner>|FU?Qfk!K1hGW@!cKu4y)dK)SOlO zlk48$boy4|S#wvtXLfGgXZA&p$eWwgI{ZuQj#u6%*r{@&<5tvvfJ$OW|z9dvco z;q)m|cZlEBRaZY;QTvp)`dvRddGOV^Y96`Z>UTu>yX+C|*;>EqyRN!->-Smhzv~@) z-__f<`aA8K@9u@}u=PFUM1AU09`(-<)}dLQz2FZ5Adl{>oEX#dsUb@ly6zh7^E z-L3a>Pkp`j>+NgZ_NY0nd#&~#^=>!E>7(x^7t}nbh9f6AARSl+|j+N z|K$bhJH5OItnR`k=Ahma4!SAS!N}+8s=GX=)9>m|r!R||qdc24>eIQ(HQ&`$Z{<2i zd6cLAR<8N2{pElC0#~`~RrY-S%GX!3_s{yyzDRb@tZ(!^tLJFG@@W57zve~#QC;&K zcRA=0YOeZS9UXs`<5Sd}*1G!PQLgj3KBtqD#aTJoc~|q*eb#=ly6@Wi9-Us_^XcMx zhluNUQoq*WkuR!~Df9ykUUUzcosE> zK6DWMC_HQKQSUo@&Z_&4=8pECeb?6cyT4cWQ{MVs{Ihb%+a1q-@9g=Sr=0JhIvm8; zicx=5pFPjzW%p1Y9CZ0krw??#R$cRyyFB$NcYSx)^$yDK{{DL3V_o2GzwhWhqxI2W zM>OB%_x5R?%TZlYSDij|<>;WRJDom7=T}|1=D@Q) z{0>KRM*H1fXX|%;*V%Wi`E`LEFA)8G*Sd1vq`KpgPF;~Xt8sMhtbTnbxvEE;KE%IL)_vzkO=iG1f9kX&q_vr1v`u@G|Ztbf(wO(w@f7N@lH>g)|y80p>C_AB0-s(W`+LdyKF$90&!hdHX3hBX=su(UGv3gX zan-$h-rCc1mHV(K7`-*}MRoG=fvUq5)vtBOwNJV0Yt>yYI!LZ^bVc>SLHx?m9k%*g zb7tkcxuuU?V3+T6)E-C88T%^LJ$iw&zH{%~-oD-M#JfP<*Xin8#hDMznnNBK-w2?sxICASA5@=m=R+^73)BVHU%>snTs`@$xvSrG_59KA z;`&>4y`%QSLG>$lKBvQ?R=yIIT`Bit^>299t$~8~x5g+Z3@>}b>?bEvd z?tbnI*1o%6p!U1t1@!M<#JPd`&}SoaK+Si$`jl&*a`h`$zw*eh^(+b&S|ZM#>w&$&STPS3f!PIcd5RCoP4UwLa^)}Qr_?&JD(zH<7U9@TsM$n8bu zD)#0b)yKZG=aCD}&euGhqdf9!y?9-qF7V|AqQ6gHI_pn87vR2NHbec6ql4rtM{mX6 z9PNuZdu4s<*FNfs_!QOG%d@%oM(6bQ@76bZkKNu=`;1*6`aMMTvFX?AKI$Gv&56Fi ztUbtykbUsMTXD2s=V=}s#IGDZi}Zt0U)D#C`Xiq0!`F-Kq1c;OeS2Pj@2lH6U46yt z0%u&HexJFkb0^SsX&nx_K6K~H>dr_0*{J&|KYKssM|){qIXPLRAJjgl_Z(6svdFr 
zK=q?5W_{Y{`0PA4C#$m$nC-(ypQ5WrI(0?zo$l(+uliki_Ac)INB57;bMLsD{%Ci4 zwD;cS_uBg1@Po7OFq%t0==O{1uFvV&ysXd7iS*I@(SF^JJkZtAosaq`uFk*vexvWJ z{(D_u_rFi~y7s!)=y!~Gw2AgRf7a*vRnO){zTP^0pzarO`oPG)Tb;eZ-S)fcKD*v? z^jrE9xBjolgBKIQov!{@HihZJ*KiUFAKadye*x-kJVg_F$h!$I?l?K1^R?=l2Um2t>VvD_>G)i|RqxHCFNxD>;>d)@6i|_95JL=ANyW>&snKfrt&QZUY);YcJ>+RG1$N{xJ%3ZG0ndj z>FSYA9c1+|zTYdP!Y(9RFJ`g=Z^(&A1wI20Hbe0)dE zVcy+v)%{lG&c6Ru_MH9h)p>Q-w=W#sC#t`l?yKa2-uZgP^{EaAUr&qGs6VRf1pHvO zjvu5?Il7|rsjggeTDj&akMi(?SwBA4hptFpE0WXNhkvx6oLRWrKC|AnD(C9=Vt+7u zr&fP9SA9p#(YaZ^`+oY4UH5f+IbGl7cyE7ipW7p=GZ&;U;`l)I_v-Y4E|+>Qk_$#T z>Qf%|SN$3nsP8)Vo$7vL_p9!)3$*_JoZsngJN)2mUzDr!B7aoZJUD$IdM|2@a+m9T zPM@8_95Bj{e6AmTw4WTsRk=E+m+OAYdvkZ|yX(7lyL0U`=K@#xy|(VDe&wy#i~6HF z^PEq0<@AGEhlBVcRKMd{J(?HQbx!1`p6yega`Hgxpz82dsB>4%>CJ83uhrkXSM}8e z>H_CqVAkLFYz{t<3plR2^P@-T^5~-ust!L3*$Zsl!}Xzqt+~#xdCFa1tG+6?b&vAj z;{sRzJzahO-F`)N&!;cq?%;I&j*%~_YaX0F5FK>7t4DtN9MTWsgM;Xx)2V}5AAT_E zb9Ho(JkaUXTXA+?>-_RRU10P(jP}=e$X#H!JNNG6`kbD-&f~d`zSpXGqq*7stWSSQ z+UGdBqRYdl^;z8Ix9aTU`lu_aUwP!$dhypU5Z(3luJcvhTlwk@kDj;sJ+1k@@9*tv z-LL#}E->qR&B}4FTAj~aH%E2l+VAT4K=r{v=R*h8j}B5-r0&r9(OqA!uDKEKol|`; zFQ9jRS>1=)ID96>aIry?U_#@OjI2iS7 zojy?OSK(LR|0;K`J)bU+{k>&ipil&g#4BJPh_l(xBy7%b2d;7T;NDdrCk1+B__1=8Vg*%MqP^TZh3ZpsH!Dx@j=jy7ra`uY) z)R*3r*-u`S~ z)~EYNeNnwN&-qu+(Yd?u)_3o+H+x)tf8G({Ro;2jo=44zzDw1gF2MHy-CbPW>EvkL zanQCzWsFHxC`u9H;mCwD0abd%wHi zq25;)_;7*IzYnARcl!!@54d9X-Sny7>FRTQw7)l(xuDBYz11J}Yo2oL&+;gTI+)E> zAN?S8<>-ayTtL5v@^iL&ys5j~eV0A%_6xYmU3Je^uHQ}jsXLweDr7Dg&B^+X%8BMs zXKxTaLiHDd#hJq#ci82LAuU+@jJ1Di^}$^Zb%#-3RCl?h z*9GbV_q#y-d+_p`>OJ7#OLv~p?~o+w@CfmPS-c-WSOJzVLx(zp8hS-oDoNMsu=te2Q)#=TjZ7 zIguXK$-@V#?l^j3U7#+o?*;VlHJdt}df^@y$bKj1LkHcfqPpvwrR#jznbH_ce&@? 
z_lVxtTaWtm7Zv%n9`V-vQNQ-Xbx-uGA@2Znf2a5QwD0abd%wHi;ZfgL-+%8fKl*(} z`?J@{&QTv+F*~=_#~jz!Tc-~sUpe|H_U5+ssb9H!$84XQb9Fs?r>noS_OAYBR_R8 zyNC0U1L9MTo<;3f9_8Q%@heAH#HXmfh|?eWu3FE&FYB9ihok4s+WYLBRd<`6J3H^} zyVv}+7ij&x;0M_b4yq3CMRFB;^UmsHe>dONo$hkbqrRw4PK25ham`b|){EB#>H_mG zFzfH#tep8Td`diPzq97LJGyaV}5)Uz*e_IK0y%H4g`r`+{Ly5_(^*B{lRKKeoR zql47pAbKlmKO7`aIXXxkL?6W{k2*Lyuln0AaQC}k^?keSo!ukrYx{MU`|1}s>vzFi zFnh;Ve>N9iYd(IkH-|n&&DHuS*L>xgAJs?w+V8mA!|5)k^p_W?`@Ou!)mVMMHD7Gi zU1#M+Ia-G+y6fT7Iy&g;s>6Hx=m%X6bujD0uSmZlK6Lmfx;%8n)$=r;`B`0kkH+1- z?sA9L_qF=3cBi|)FZv#j{{HAby>)k2r@J}M?{spggXoH`uDWu~>E+}qY94jfl}A46 zp!SdQDA(1|LH1CN4pIlvBg79zey#WRX-@ISxSoCyVrf+LzUxkG!akFT$g8X3gO}v-4)>uG)`13y-!tk-KpFWU&5PcLi zPdWLjE63+h{mQkEI;izl9_46VdF#Bp`Rg8c`=a%pmJ96u``>N9)_q$2?%RL z^^NAZesqv`C`SifU3KO3D`tJ#H=FAo%H2HmDR+IPAA5mS-(^+qvDc|-XMe}+eDx{k zOFEtUYV4iAy0151@0HCN^=12YKgV63(_PM}uJe?S=4(IsAoYmjQ|#^2oZ@wXy1-X2 zpnt!<>L}$AFTn2tjBXJ5TpeAJdFWZLKJBNz8e8Y$2WRisn(O>d&+e=JaM0z9)?IGt zb%DA-U7#+IyFm2su%8`Fkla=r?QflTv|rz+{OEn6d9(MMofqwY)%xl?W%IMXS$95qUUu(Y_sZ_CKHdXT zSB{>=(SA3lH5WfPE0>%o7axcpR2|-n9egWehgx>etdBQ2ztf{W>L58F`q?Id|`K z_j3BId#ujC+n&+AqWY|R+$HDed+9voy?1pvsw?lE*V{L1zUC;8_H%yKBd&AsgRYLQ zKIQmkq2?=}HRn-t>Yk7Ku{|*t~&jot2>>(R-B#3{A{lJ zB2NA&>Kx5$tw(p%{I0IL^E+MpXK^<#(y4>leX@S%Q{8dZqy5wm2cv%KS%20?ZsE%d)cs!G zqu9#})bHcvJ$7eDeJ`D(T<@fH<&j_OaB>u>JH)Sb)!`1cAC4cMMeT!wQBKz9az^`I zuG4j%7Ic(m_w^m~t<_xe}fN3DB2eT&(5AP>xbarMJN%~4%> z)vBR}=s{Ool$w%+CJeQW-n7pULS*moJd-)O&X zKRZwJM|pO?tWWpRzKCmo#5E7U>dNs&*xRSME(cvv`{DTD5svy>`*k1J?{w{-&0FWT z`bYPjJ#Tj2==)mxue$$L_ip_@t^U@#wEA23$@*)L?4{oJ&AKBw;M-G=uj*dPTQ_q3 zP9M$B=4O5D6ZxY$IlJLi_jY$Y>ODuzaqlesF)q;hJ8Sjtdd1dvs2>ixcSd#B=XA}3 zJ9IgbPG015b?sN~`p{eb^o^pMtGe>hxvu}HIc|QhzH0APxxMdvl)k&)ee`{!{g3iH zI{KYg{Vucjc6qA96YTIg zfArkh^E6NStlz_`+*$iwEhl?V^(p7wPNyCr{fbdvwoWd7<>*<|e&sGleGzv#sz*6d zUGrvf=7QuZM+c)izE)h7+q%cy{72vQ=y|(;udCkUs(X+A9(H|qv{zK;F3#t4`W(7G jbg;MI^{K9Nl&c@EKIQmYksPqMU;8vqx%y{uo%8 Date: Wed, 16 Oct 2024 08:22:56 +0200 Subject: [PATCH 004/266] Finally got Value types Should 
use `isa` for my testing instead of `<:`. See design pattern book p. 64: I'm testing if a `Val(Foo{T})` is an instance of `Val{Foo}`, not if they are subtypes of each other. --- .../specialised_data/Bernoulli_struct.jl | 23 ++++++++++++++----- src/optimisation/config_rules/InitRule.jl | 3 +-- src/optimisation/opti.jl | 7 ++---- src/optimisation/swap.jl | 2 +- src/{UI.jl => run_dev.jl} | 0 5 files changed, 21 insertions(+), 14 deletions(-) rename src/{UI.jl => run_dev.jl} (100%) diff --git a/src/assignments/specialised_data/Bernoulli_struct.jl b/src/assignments/specialised_data/Bernoulli_struct.jl index 87e94d9..430efbd 100644 --- a/src/assignments/specialised_data/Bernoulli_struct.jl +++ b/src/assignments/specialised_data/Bernoulli_struct.jl @@ -6,8 +6,10 @@ struct BernoulliData{T} end const BernoulliAssignment{T} = Assignment{T, BernoulliData} -const BernoulliInitRule{S,T} = InitRule{S, Val{BernoulliData{T}}} +const BernoulliInitRule{S} = InitRule{S, Val{BernoulliData}} + +# is this type stable? should this be BernoulliAssignment{T,F}? see line 8 above function BernoulliAssignment( G, node_labels::Vector{Int}, group_size::GroupSize{T}) where {T} k = length(group_size) @@ -15,15 +17,11 @@ function BernoulliAssignment( BernoulliData(zeros(Int, k, k), zeros(Int, k, k), zeros(T, k, k), BitMatrix(G))) end -function make_assignment(G, h, init_rule::BernoulliInitRule{S}) where S +function make_assignment(G, h, init_rule::BernoulliInitRule{S}) where {S} return BernoulliAssignment(initialize_node_labels( G, h, init_rule.starting_assignment_rule)...) 
end - - - - mutable struct BernoulliSwap{T} <: Swap index1::Int index2::Int @@ -47,3 +45,16 @@ end function swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} # perform fast update end + + + +function test_init_rule(rule::InitRule{S, Val{T}}) where {S, T <: BernoulliData} + println("Method with BernoulliData") + println("Starting assignment rule: ", rule.starting_assignment_rule) +end + + +function test_init_rule_with_alias(rule::BernoulliInitRule{S}) where {S} + println("Method with BernoulliData") + println("Starting assignment rule: ", rule.starting_assignment_rule) +end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 65320ff..58e67a5 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -2,13 +2,12 @@ abstract type StartingAssignment end struct OrderedStart <: StartingAssignment end struct RandomStart <: StartingAssignment end - struct InitRule{S <: StartingAssignment, I} starting_assignment_rule::S assignment_rule::I end -function make_assignment(A, h, init_rule::InitRule{S, Nothing}) where S +function make_assignment(A, h, init_rule::InitRule{S, Nothing}) where {S} return Assignment(initialize_node_labels(A, h, init_rule.starting_assignment_rule)...) 
end diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl index 529f921..9693e97 100644 --- a/src/optimisation/opti.jl +++ b/src/optimisation/opti.jl @@ -4,12 +4,9 @@ include("config_rules/InitRule.jl") include("config_rules/stop_rule.jl") include("config_rules/bandwidth_selection_rule.jl") - - -function greedy_improve!(a::Assignment, G; max_iter::Int=1000, +function greedy_improve!(a::Assignment, G; max_iter::Int = 1000, swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), - initialise_rule::InitRule = InitRule(RandomStart(), nothing), + initialise_rule::InitRule = InitRule(RandomStart(), nothing) ) - end diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl index 3fb09ff..425a9a8 100644 --- a/src/optimisation/swap.jl +++ b/src/optimisation/swap.jl @@ -5,7 +5,7 @@ mutable struct DefaultSwap <: Swap index2::Int end -make_swap(::Assignment{T,Nothing}, id::Tuple{Int}) where {T} = DefaultSwap(id[1], id[2]) +make_swap(::Assignment{T, Nothing}, id::Tuple{Int}) where {T} = DefaultSwap(id[1], id[2]) function make_swap!(swap::DefaultSwap, assignment::Assignment, id::Tuple{Int}) swap.index1, swap.index2 = id end diff --git a/src/UI.jl b/src/run_dev.jl similarity index 100% rename from src/UI.jl rename to src/run_dev.jl From b537c55f2614a6af5b7e61e0123fb25a8c5f3e6f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 09:19:55 +0200 Subject: [PATCH 005/266] renaming and updating folder structure --- src/assignments/BernoulliAssignment/struct.jl | 24 ++++++++ src/assignments/BernoulliAssignment/swap.jl | 23 +++++++ src/assignments/data_structures.jl | 2 +- .../specialised_data/Bernoulli_struct.jl | 60 ------------------- 4 files changed, 48 insertions(+), 61 deletions(-) create mode 100644 src/assignments/BernoulliAssignment/struct.jl create mode 100644 src/assignments/BernoulliAssignment/swap.jl delete mode 100644 src/assignments/specialised_data/Bernoulli_struct.jl diff --git 
a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl new file mode 100644 index 0000000..149af79 --- /dev/null +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -0,0 +1,24 @@ +struct BernoulliData{T} + counts::Matrix{Int} + realized::Matrix{Int} + estimated_theta::Matrix{T} + A::BitMatrix +end + +const BernoulliAssignment{T} = Assignment{T, BernoulliData} +const BernoulliInitRule{S} = InitRule{S, Val{BernoulliData}} + +# is this type stable? should this be BernoulliAssignment{T,F}? see line 8 above +function BernoulliAssignment( + G, node_labels::Vector{Int}, group_size::GroupSize{T}) where {T} + k = length(group_size) + return BernoulliAssignment{T}(group_size, node_labels, + BernoulliData(zeros(Int, k, k), zeros(Int, k, k), zeros(T, k, k), BitMatrix(G))) +end + +function make_assignment(G, h, init_rule::BernoulliInitRule{S}) where {S} + return BernoulliAssignment(initialize_node_labels( + G, h, init_rule.starting_assignment_rule)...) +end + +include("swap.jl") diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl new file mode 100644 index 0000000..63ca218 --- /dev/null +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -0,0 +1,23 @@ +mutable struct BernoulliSwap{T} <: Swap + index1::Int + index2::Int + old_assignment::BernoulliAssignment{T} +end + +function make_swap(assignment::BernoulliAssignment{T}, id::Tuple{Int}) where {T} + return BernoulliSwap(id[1], id[2], deepcopy(assignment)) +end + +function make_swap!(swap::BernoulliSwap{T}, assignment::BernoulliAssignment{T}, + id::Tuple{Int}) where {T} + swap.index1, swap.index2 = id + swap.old_assignment = deepcopy(assignment) +end + +function revert_swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} + assignment = deepcopy(swap.old_assignment) +end + +function swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} + # perform fast update +end diff --git 
a/src/assignments/data_structures.jl b/src/assignments/data_structures.jl index 683cd65..e6e4e65 100644 --- a/src/assignments/data_structures.jl +++ b/src/assignments/data_structures.jl @@ -1 +1 @@ -include("specialised_data/Bernoulli_struct.jl") +include("BernoulliAssignment/struct.jl") diff --git a/src/assignments/specialised_data/Bernoulli_struct.jl b/src/assignments/specialised_data/Bernoulli_struct.jl deleted file mode 100644 index 430efbd..0000000 --- a/src/assignments/specialised_data/Bernoulli_struct.jl +++ /dev/null @@ -1,60 +0,0 @@ -struct BernoulliData{T} - counts::Matrix{Int} - realized::Matrix{Int} - estimated_theta::Matrix{T} - A::BitMatrix -end - -const BernoulliAssignment{T} = Assignment{T, BernoulliData} -const BernoulliInitRule{S} = InitRule{S, Val{BernoulliData}} - - -# is this type stable? should this be BernoulliAssignment{T,F}? see line 8 above -function BernoulliAssignment( - G, node_labels::Vector{Int}, group_size::GroupSize{T}) where {T} - k = length(group_size) - return BernoulliAssignment{T}(group_size, node_labels, - BernoulliData(zeros(Int, k, k), zeros(Int, k, k), zeros(T, k, k), BitMatrix(G))) -end - -function make_assignment(G, h, init_rule::BernoulliInitRule{S}) where {S} - return BernoulliAssignment(initialize_node_labels( - G, h, init_rule.starting_assignment_rule)...) 
-end - -mutable struct BernoulliSwap{T} <: Swap - index1::Int - index2::Int - old_assignment::BernoulliAssignment{T} -end - -function make_swap(assignment::BernoulliAssignment{T}, id::Tuple{Int}) where {T} - return BernoulliSwap(id[1], id[2], deepcopy(assignment)) -end - -function make_swap!(swap::BernoulliSwap{T}, assignment::BernoulliAssignment{T}, - id::Tuple{Int}) where {T} - swap.index1, swap.index2 = id - swap.old_assignment = deepcopy(assignment) -end - -function revert_swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} - assignment = deepcopy(swap.old_assignment) -end - -function swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} - # perform fast update -end - - - -function test_init_rule(rule::InitRule{S, Val{T}}) where {S, T <: BernoulliData} - println("Method with BernoulliData") - println("Starting assignment rule: ", rule.starting_assignment_rule) -end - - -function test_init_rule_with_alias(rule::BernoulliInitRule{S}) where {S} - println("Method with BernoulliData") - println("Starting assignment rule: ", rule.starting_assignment_rule) -end From 82104ee1eb331dcdb4d7a017e19a5be7c6158827 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 09:22:04 +0200 Subject: [PATCH 006/266] Add Observations structure for handling data Contain both the data and the distribution to be fitted, can be used to carry around all the information needed in a neat way. Will be build based on user data (with automatic detection for some case, e.g. simple unweighted graphs will be assigned a Bernoulli) and then pass around internally. 
--- src/assignments/Assignments.jl | 21 +++++++----- src/observations.jl | 10 ++++++ src/optimisation/config_rules/stop_rule.jl | 19 ++++++----- src/optimisation/opti.jl | 39 ++++++++++++++++++++-- src/run_dev.jl | 6 ++-- 5 files changed, 74 insertions(+), 21 deletions(-) diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index ebe584e..204635f 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -47,29 +47,34 @@ Base.@propagate_inbounds function Base.getindex(a::Assignment, i::Int) return get_vertex_in_group(a, i) end -function loglikelihood(a::Assignment, dists::SBM, g) - loglikelihood = 0.0 +function log_likelihood(a::Assignment, dists::SBM, g) + log_likelihood = 0.0 for i in 1:number_nodes(a) label_a = a.node_labels[i] for j in (i + 1):number_nodes(a) label_b = a.node_labels[j] - loglikelihood += logdensityof(dists[label_a, label_b], get_obs(g, i, j)) + log_likelihood += logdensityof(dists[label_a, label_b], get_obs(g, i, j)) end end - return loglikelihood + return log_likelihood end -function fit(a::Assignment, g, distribution) - dists = initialize_sbm(a.group_size, distribution) +function fit(a::Assignment, g::Observations) + dists = initialize_sbm(a.group_size, g.dist_ref) for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) - edges = get_edge_indices(a, group1, group2) - dists[group1, group2] = fit(distribution, g, edges) + edge_indices = get_edge_indices(a, group1, group2) + dists[group1, group2] = fit(g.dist_ref, g.graph, edge_indices) end end return dists end +function log_likelihood(a::Assignment, g::Observations) + dists = fit(a, g.graph, g.dist_ref) + return log_likelihood(a, dists, g.graph) +end + function fit(distribution, g, edges) return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) end diff --git a/src/observations.jl b/src/observations.jl index a38c19c..9b34845 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -1,5 +1,15 @@ # getters for 
observations +# +struct Observations{G,D} + graph::G + dist_ref::D +end + +function get_obs(g::Observations, x::Tuple) + return get_obs(g.graph, x[1], x[2]) +end + function get_obs(g::SimpleGraph, x::Tuple) return get_obs(g, x[1], x[2]) end diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl index 73e6de4..3a0c998 100644 --- a/src/optimisation/config_rules/stop_rule.jl +++ b/src/optimisation/config_rules/stop_rule.jl @@ -2,14 +2,15 @@ abstract type StopRule end mutable struct PreviousBestValue{T} <: StopRule k::Int past_values::Queue{T} - function PreviousBestValue(k::Int) + function PreviousBestValue( + k::Int, x::T=0.0) where {T <: Real} @assert k > 0 - new{T}(k, Queue{T}(k)) + new{T}(k, Queue{T}()) end end """ - stopping_rule(assignment::Assignment, stop_rule::StopRule) + stopping_rule(assignment::Assignment,G, stop_rule::StopRule) Returns a Bool with true if we should stop the optimization based on the `stop_rule`. @@ -19,17 +20,17 @@ Returns a Bool with true if we should stop the optimization based on the `stop_r """ stopping_rule -function stopping_rule(assignment::Assignment, stop_rule::PreviousBestValue) - loglikelihood = loglikelihood(assignment) +function stopping_rule(assignment::Assignment, G, stop_rule::PreviousBestValue) + log_likelihood = log_likelihood(assignment, G) if length(stop_rule.past_values) == 0 - push!(stop_rule.past_values, loglikelihood) + push!(stop_rule.past_values, log_likelihood) return false - elseif loglikelihood > first(stop_rule.past_values) + elseif log_likelihood > first(stop_rule.past_values) empty!(stop_rule.past_values) - push!(stop_rule.past_values, loglikelihood) + push!(stop_rule.past_values, log_likelihood) return false else - push!(stop_rule.past_values, loglikelihood) + push!(stop_rule.past_values, log_likelihood) return length(stop_rule.past_values) == stop_rule.k + 1 #always keep the best value end end diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl 
index 9693e97..efb39ef 100644 --- a/src/optimisation/opti.jl +++ b/src/optimisation/opti.jl @@ -4,9 +4,44 @@ include("config_rules/InitRule.jl") include("config_rules/stop_rule.jl") include("config_rules/bandwidth_selection_rule.jl") -function greedy_improve!(a::Assignment, G; max_iter::Int = 1000, +function optimize(G, h; initialise_rule::InitRule = InitRule(RandomStart(), nothing)) + a = make_assignment(G, h, initialise_rule) + greedy_improve!(a, G) + return a +end + +function optimize(G, h = select_bandwidth(G); + max_iter::Int = 1000, + initialise_rule::InitRule = InitRule(RandomStart(), nothing), swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), - initialise_rule::InitRule = InitRule(RandomStart(), nothing) + stop_rule::StopRule = MaxIter(1000) +) + a = make_assignment(G, h, initialise_rule) + for i in 1:max_iter + local_search!(a, G, swap_rule = swap_rule, accept_rule = accept_rule) + if stop_rule(a, G, stop_rule) + break + end + end + greedy_improve!(a, G; max_iter, swap_rule, accept_rule, stop_rule) + return a +end + +function greedy_improve!(a::Assignment, G, max_iter::Int = 1000; + swap_rule::NodeSwapRule = RandomNodeSwap(), + accept_rule::AcceptRule = Strict(), + stop_rule::StopRule = MaxIter(1000) +) +end + +# perform local search by trying a swap and accepting it if it improves the likelihood +function local_search!(a::Assignment, G; + swap_rule::NodeSwapRule = RandomNodeSwap(), + accept_rule::AcceptRule = Strict() ) + swap = select_swap(a, swap_rule) + if accept_swap(a, swap, G, accept_rule) + apply_swap!(a, swap) + end end diff --git a/src/run_dev.jl b/src/run_dev.jl index 03478f5..d8b632b 100644 --- a/src/run_dev.jl +++ b/src/run_dev.jl @@ -61,7 +61,8 @@ end additional_info = 1 a = NetworkHistogram.Assignment(group_number, node_labels) dist = Bernoulli(0.5) -sbm_fit = NetworkHistogram.fit(a, G, dist) +obs = NetworkHistogram.Observations(G, dist) +sbm_fit = NetworkHistogram.fit(a, obs) sbm = 
NetworkHistogram.initialize_sbm([1 / 3, 1 / 3, 1 / 3], dist) for i in 1:3 @@ -76,6 +77,7 @@ A, node_labels = NetworkHistogram.sample(sbm, 3 * size_per_block); node_labels = repeat(1:3, inner = size_per_block) group_number = NetworkHistogram.GroupSize(size(A, 1), size_per_block) a_star = NetworkHistogram.Assignment(group_number, node_labels, additional_info) -sbm_fitted = NetworkHistogram.fit(a_star, SimpleGraph(A), dist) +obs_star = NetworkHistogram.Observations(SimpleGraph(A), dist) +sbm_fitted = NetworkHistogram.fit(a_star, obs_star) sbm_fitted From 8841c2268a00030f9be6e649c27829f69c1f1da7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 11:44:08 +0200 Subject: [PATCH 007/266] functionning naive implementation --- src/NetworkHistogram.jl | 6 ++- src/assignments/Assignments.jl | 36 ++------------ src/assignments/BernoulliAssignment/swap.jl | 2 +- src/assignments/fit.jl | 38 ++++++++++++++ .../{data_structures.jl => include.jl} | 0 src/observations.jl | 8 +-- src/optimisation/config_rules/InitRule.jl | 26 +++++----- src/optimisation/config_rules/accept_rule.jl | 18 +++++-- src/optimisation/config_rules/stop_rule.jl | 37 +++++++++----- src/optimisation/config_rules/swap_rule.jl | 14 ++---- src/optimisation/opti.jl | 49 ++++++++++--------- src/optimisation/swap.jl | 15 +++--- src/run_dev.jl | 22 +++++++-- 13 files changed, 162 insertions(+), 109 deletions(-) create mode 100644 src/assignments/fit.jl rename src/assignments/{data_structures.jl => include.jl} (100%) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index df6a79a..9bbb3c4 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -4,7 +4,7 @@ using LinearAlgebra, SparseArrays, DataStructures using Distributions, DensityInterface using Graphs using PermutationSymmetricTensors - +using ProgressMeter: Progress,next!,finish! 
import StatsBase, Random include("group_numbering.jl") @@ -13,6 +13,8 @@ include("observations.jl") include("assignments/Assignments.jl") include("optimisation/swap.jl") include("optimisation/opti.jl") -include("assignments/data_structures.jl") + +# more specialised and faster assignment types and methods +include("assignments/include.jl") end diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 204635f..cdb2b26 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -14,7 +14,9 @@ end function Assignment(group_size::GroupSize{T}, node_labels::Vector{Int}) where {T} if length(node_labels) != sum(group_size) - throw(ArgumentError("The length of `node_labels` must be equal to the sum of `group_size`")) + message = "The length of `node_labels` $(length(node_labels)) must be equal " + message *= "to the sum of `group_size` $(sum(group_size))" + throw(ArgumentError(message)) end return Assignment(group_size, node_labels, nothing) end @@ -47,34 +49,4 @@ Base.@propagate_inbounds function Base.getindex(a::Assignment, i::Int) return get_vertex_in_group(a, i) end -function log_likelihood(a::Assignment, dists::SBM, g) - log_likelihood = 0.0 - for i in 1:number_nodes(a) - label_a = a.node_labels[i] - for j in (i + 1):number_nodes(a) - label_b = a.node_labels[j] - log_likelihood += logdensityof(dists[label_a, label_b], get_obs(g, i, j)) - end - end - return log_likelihood -end - -function fit(a::Assignment, g::Observations) - dists = initialize_sbm(a.group_size, g.dist_ref) - for group1 in 1:number_groups(a) - for group2 in group1:number_groups(a) - edge_indices = get_edge_indices(a, group1, group2) - dists[group1, group2] = fit(g.dist_ref, g.graph, edge_indices) - end - end - return dists -end - -function log_likelihood(a::Assignment, g::Observations) - dists = fit(a, g.graph, g.dist_ref) - return log_likelihood(a, dists, g.graph) -end - -function fit(distribution, g, edges) - return 
Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) -end +include("fit.jl") diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index 63ca218..7514742 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -18,6 +18,6 @@ function revert_swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T} assignment = deepcopy(swap.old_assignment) end -function swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} +function apply_swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} # perform fast update end diff --git a/src/assignments/fit.jl b/src/assignments/fit.jl new file mode 100644 index 0000000..e39fbab --- /dev/null +++ b/src/assignments/fit.jl @@ -0,0 +1,38 @@ +# method to compute estimator from node clustering as specified in assignment +function fit(a::Assignment, g::Observations) + dists = initialize_sbm(a.group_size, g.dist_ref) + for group1 in 1:number_groups(a) + for group2 in group1:number_groups(a) + edge_indices = get_edge_indices(a, group1, group2) + dists[group1, group2] = fit(g.dist_ref, g.graph, edge_indices) + end + end + return dists +end + +function fit(distribution, g, edges) + return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) +end + +# method to compute the log likelihood of fitted SBM +function log_likelihood(a::Assignment, sbm::SBM, g) + log_likelihood = 0.0 + for i in 1:number_nodes(a) + label_a = a.node_labels[i] + for j in (i + 1):number_nodes(a) + label_b = a.node_labels[j] + log_likelihood += logdensityof(sbm[label_a, label_b], get_obs(g, i, j)) + end + end + return log_likelihood +end + +function log_likelihood(a::Assignment, g::Observations) + dists = fit(a, g) + return log_likelihood(a, dists, g.graph) +end + +# default score is the log likelihood +function score(a::Assignment, g::Observations) + return log_likelihood(a, g)/binomial(number_nodes(a), 
2) +end diff --git a/src/assignments/data_structures.jl b/src/assignments/include.jl similarity index 100% rename from src/assignments/data_structures.jl rename to src/assignments/include.jl diff --git a/src/observations.jl b/src/observations.jl index 9b34845..8e11684 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -1,11 +1,13 @@ -# getters for observations - # -struct Observations{G,D} +struct Observations{G, D} graph::G dist_ref::D end +function number_nodes(g::Observations) + return nv(g.graph) +end + function get_obs(g::Observations, x::Tuple) return get_obs(g.graph, x[1], x[2]) end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 58e67a5..5a1eff1 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -7,15 +7,15 @@ struct InitRule{S <: StartingAssignment, I} assignment_rule::I end -function make_assignment(A, h, init_rule::InitRule{S, Nothing}) where {S} - return Assignment(initialize_node_labels(A, h, init_rule.starting_assignment_rule)...) +function make_assignment(g, h, init_rule::InitRule{S, Nothing}) where {S} + return Assignment(initialize_node_labels(g, h, init_rule.starting_assignment_rule)...) end """ - initialize_node_labels(A, h, starting_assignment_rule::StartingAssignment) + initialize_node_labels(g, h, starting_assignment_rule::StartingAssignment) -initialize node labels based on the `starting_assignment_rule`, and return a vector of -node labels and a `GroupSize` object. +initialize node labels based on the `starting_assignment_rule`, and return a `GroupSize` +objecta vector of node labels. # Implemenented rules - `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. @@ -23,16 +23,16 @@ node labels and a `GroupSize` object. 
""" initialize_node_labels -function initialize_node_labels(A, h, ::OrderedStart) - group_size = GroupSize(size(A, 1), h) - node_labels = inverse_rle(1:length(group_size), group_size) - return node_labels, group_size +function initialize_node_labels(g, h, ::OrderedStart) + group_size = GroupSize(number_nodes(g), h) + node_labels = StatsBase.inverse_rle(1:length(group_size), group_size) + return group_size, node_labels end -function initialize_node_labels(A, h, ::RandomStart) - node_labels, group_size = initialize_node_labels(A, h, OrderedStart()) - node_labels = shuffle!(node_labels) - return node_labels, group_size +function initialize_node_labels(g, h, ::RandomStart) + group_size, node_labels = initialize_node_labels(g, h, OrderedStart()) + Random.shuffle!(node_labels) + return group_size, node_labels end # check https://github.com/TrainOfCode/LocalFennelPartitioning.jl/tree/main diff --git a/src/optimisation/config_rules/accept_rule.jl b/src/optimisation/config_rules/accept_rule.jl index e3e57ea..4f864f7 100644 --- a/src/optimisation/config_rules/accept_rule.jl +++ b/src/optimisation/config_rules/accept_rule.jl @@ -2,16 +2,26 @@ abstract type AcceptRule end struct Strict <: AcceptRule end """ - accept_reject_update!(proposal::Assignment, current::Assignment, - accept_rule::AcceptRule) + accept_reject_update!(a::Assignment, swap::Swap, g, accept_rule::AcceptRule) -Return the updated `current` assignment based on the `accept_rule`. +Perform the swap and accept it if it improves the likelihood of the assignment. `a` will +be updated in place if the swap is accepted. # Implemented rules - `Strict()`: Accept the proposal if it has a higher likelihood than the current assignment. """ accept_reject_update! 
-function accept_reject_update!(swap::Swap, current::Assignment, ::Strict) +function accept_reject_update!(a::Assignment, swap::Swap, g, ::Strict) + # calculate the score of the current assignment + current_score = score(a, g) + # perform the swap + apply_swap!(a, swap) + # calculate the score of the new assignment + new_score = score(a, g) + # if the new assignment is worse, revert the swap + if new_score < current_score + revert_swap!(a, swap) + end end diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl index 3a0c998..6efc980 100644 --- a/src/optimisation/config_rules/stop_rule.jl +++ b/src/optimisation/config_rules/stop_rule.jl @@ -1,16 +1,30 @@ abstract type StopRule end + +function initialise_stop_rule!( + stop_rule::StopRule, a, g) +end + mutable struct PreviousBestValue{T} <: StopRule k::Int - past_values::Queue{T} + past_values::CircularDeque{T} function PreviousBestValue( - k::Int, x::T=0.0) where {T <: Real} + k::Int, x::T = -Inf) where {T <: Real} @assert k > 0 - new{T}(k, Queue{T}()) + # queue stores the best values and at most k subsequent values + queue = CircularDeque{T}(k + 1) + push!(queue, x) + new{T}(k, queue) end end +function initialise_stop_rule!(stop_rule::PreviousBestValue, a, g) + score_value = score(a, g) + empty!(stop_rule.past_values) + push!(stop_rule.past_values, score_value) +end + """ - stopping_rule(assignment::Assignment,G, stop_rule::StopRule) + stopping_rule(assignment::Assignment,g, stop_rule::StopRule) Returns a Bool with true if we should stop the optimization based on the `stop_rule`. 
@@ -20,17 +34,16 @@ Returns a Bool with true if we should stop the optimization based on the `stop_r """ stopping_rule -function stopping_rule(assignment::Assignment, G, stop_rule::PreviousBestValue) - log_likelihood = log_likelihood(assignment, G) - if length(stop_rule.past_values) == 0 - push!(stop_rule.past_values, log_likelihood) +function stopping_rule(assignment::Assignment, g, stop_rule::PreviousBestValue) + score_value = score(assignment, g) + if isempty(stop_rule.past_values) + push!(stop_rule.past_values, score_value) return false - elseif log_likelihood > first(stop_rule.past_values) + elseif score_value > first(stop_rule.past_values) empty!(stop_rule.past_values) - push!(stop_rule.past_values, log_likelihood) + push!(stop_rule.past_values, score_value) return false else - push!(stop_rule.past_values, log_likelihood) - return length(stop_rule.past_values) == stop_rule.k + 1 #always keep the best value + return length(stop_rule.past_values) == capacity(stop_rule.past_values) end end diff --git a/src/optimisation/config_rules/swap_rule.jl b/src/optimisation/config_rules/swap_rule.jl index be76aab..c1a8afa 100644 --- a/src/optimisation/config_rules/swap_rule.jl +++ b/src/optimisation/config_rules/swap_rule.jl @@ -14,14 +14,8 @@ current assignment `node_assignment`. 
select_swap function select_swap(assignment::Assignment, ::RandomNodeSwap) - index1 = rand(1:number_nodes(assignment)) - label1 = assignment.node_labels[index1] - index2 = index1 - for _ in 1:10 - index2 = rand(1:number_nodes(assignment)) - if assignment.node_labels[index2] != label1 - break - end - end - return make_swap(assignment, (index1, index2)) + groups = StatsBase.sample(1:number_groups(assignment), 2; replace=false) + index1 = rand(get_vertex_in_group(assignment, groups[1])) + index2 = rand(get_vertex_in_group(assignment, groups[2])) + return (index1, index2) end diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl index efb39ef..a09e8ab 100644 --- a/src/optimisation/opti.jl +++ b/src/optimisation/opti.jl @@ -4,44 +4,47 @@ include("config_rules/InitRule.jl") include("config_rules/stop_rule.jl") include("config_rules/bandwidth_selection_rule.jl") -function optimize(G, h; initialise_rule::InitRule = InitRule(RandomStart(), nothing)) - a = make_assignment(G, h, initialise_rule) - greedy_improve!(a, G) - return a -end - -function optimize(G, h = select_bandwidth(G); +function optimize(g, h = select_bandwidth(g); max_iter::Int = 1000, initialise_rule::InitRule = InitRule(RandomStart(), nothing), swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = MaxIter(1000) + stop_rule::StopRule = PreviousBestValue(10), + progress_bar::Bool = false ) - a = make_assignment(G, h, initialise_rule) - for i in 1:max_iter - local_search!(a, G, swap_rule = swap_rule, accept_rule = accept_rule) - if stop_rule(a, G, stop_rule) - break - end - end - greedy_improve!(a, G; max_iter, swap_rule, accept_rule, stop_rule) + a = make_assignment(g, h, initialise_rule) + initialise_stop_rule!(stop_rule, a, g) + greedy_improve!(a, g; max_iter, swap_rule, accept_rule, stop_rule, progress_bar) return a end -function greedy_improve!(a::Assignment, G, max_iter::Int = 1000; +function greedy_improve!(a::Assignment, g; max_iter::Int = 
1000, swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = MaxIter(1000) + stop_rule::StopRule = PreviousBestValue(10), + progress_bar::Bool = false, ) + # swap memory allocation + swap = make_swap(a, (1, 1)) + p = Progress(max_iter; enabled = progress_bar) + # perform local search until the stopping rule is met + for i in 1:max_iter + local_search!(a, g, swap, swap_rule = swap_rule, accept_rule = accept_rule) + next!(p) + if stopping_rule(a, g, stop_rule) + finish!(p) + break + end + end end # perform local search by trying a swap and accepting it if it improves the likelihood -function local_search!(a::Assignment, G; +function local_search!(a::Assignment, g, swap::Swap = make_swap(a, (1, 1)); swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict() ) - swap = select_swap(a, swap_rule) - if accept_swap(a, swap, G, accept_rule) - apply_swap!(a, swap) - end + # select two nodes to swap and build the swap object + make_swap!(swap, a, select_swap(a, swap_rule)) + # perform the swap and accept it if it improves the likelihood + accept_reject_update!(a, swap, g, accept_rule) end diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl index 425a9a8..17d5a5e 100644 --- a/src/optimisation/swap.jl +++ b/src/optimisation/swap.jl @@ -5,14 +5,17 @@ mutable struct DefaultSwap <: Swap index2::Int end -make_swap(::Assignment{T, Nothing}, id::Tuple{Int}) where {T} = DefaultSwap(id[1], id[2]) -function make_swap!(swap::DefaultSwap, assignment::Assignment, id::Tuple{Int}) +function make_swap(::Assignment{T, Nothing}, id::Tuple{Int, Int}) where {T} + return DefaultSwap(id[1], id[2]) +end + +function make_swap!(swap::DefaultSwap, a::Assignment, id::Tuple{Int, Int}) swap.index1, swap.index2 = id end -function swap!(assignment::Assignment, swap::DefaultSwap) - assignment.node_labels[swap.index1], assignment.node_labels[swap.index2] = assignment.node_labels[swap.index2], - 
assignment.node_labels[swap.index1] +function apply_swap!(a::Assignment, s::DefaultSwap) + a.node_labels[s.index1], a.node_labels[s.index2] = a.node_labels[s.index2], + a.node_labels[s.index1] end -revert_swap!(assignment::Assignment, swap::DefaultSwap) = swap!(assignment, swap) +revert_swap!(assignment::Assignment, swap::DefaultSwap) = apply_swap!(assignment, swap) diff --git a/src/run_dev.jl b/src/run_dev.jl index d8b632b..6a40b25 100644 --- a/src/run_dev.jl +++ b/src/run_dev.jl @@ -49,7 +49,8 @@ end ## -using NetworkHistogram +using NetworkHistogram, Random + group_number = NetworkHistogram.GroupSize(nv(G), 3) if typeof(group_number) == NetworkHistogram.GroupSize{Tuple{Int, Int}} node_labels = repeat(1:(length(group_number) - 1), inner = group_number[1]) @@ -66,18 +67,33 @@ sbm_fit = NetworkHistogram.fit(a, obs) sbm = NetworkHistogram.initialize_sbm([1 / 3, 1 / 3, 1 / 3], dist) for i in 1:3 - sbm[i, i] = Bernoulli(0.8) + sbm[i, i] = Bernoulli(0.2) for j in (i + 1):3 - sbm[i, j] = Bernoulli(0.01 + 0.1 * (i + j)) + sbm[i, j] = Bernoulli(0.01) end end size_per_block = 200 A, node_labels = NetworkHistogram.sample(sbm, 3 * size_per_block); node_labels = repeat(1:3, inner = size_per_block) +Random.shuffle!(node_labels) group_number = NetworkHistogram.GroupSize(size(A, 1), size_per_block) a_star = NetworkHistogram.Assignment(group_number, node_labels, additional_info) obs_star = NetworkHistogram.Observations(SimpleGraph(A), dist) sbm_fitted = NetworkHistogram.fit(a_star, obs_star) sbm_fitted + +init_rule = NetworkHistogram.InitRule(NetworkHistogram.RandomStart(), nothing) +ll_old = NetworkHistogram.score(a_star, obs_star) +println("Log likelihood: ", ll_old) +a_best = NetworkHistogram.optimize(obs_star, size_per_block, max_iter = 100; + stop_rule = NetworkHistogram.PreviousBestValue(5), + initialise_rule = init_rule, progress_bar = true) +ll_new = NetworkHistogram.score(a_best, obs_star) +println("Log likelihood: ", ll_new) +if ll_old < ll_new + println("Optimization 
improved the log likelihood.") +else + println("Optimization did not improve the log likelihood.") +end From c8f4173efa9fecd01f735d32934703eb659f2c65 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 11:54:14 +0200 Subject: [PATCH 008/266] refactoring --- src/NetworkHistogram.jl | 4 ++-- src/assignments/Assignments.jl | 2 -- src/{assignments => }/fit.jl | 26 +++++++++++------------- src/optimisation/config_rules/include.jl | 5 +++++ src/optimisation/include.jl | 2 ++ src/optimisation/loss.jl | 4 ++++ src/optimisation/opti.jl | 7 ++----- src/optimisation/swap.jl | 2 +- 8 files changed, 28 insertions(+), 24 deletions(-) rename src/{assignments => }/fit.jl (65%) create mode 100644 src/optimisation/config_rules/include.jl create mode 100644 src/optimisation/include.jl create mode 100644 src/optimisation/loss.jl diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 9bbb3c4..3a05bdb 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -11,8 +11,8 @@ include("group_numbering.jl") include("sbm.jl") include("observations.jl") include("assignments/Assignments.jl") -include("optimisation/swap.jl") -include("optimisation/opti.jl") +include("fit.jl") +include("optimisation/include.jl") # more specialised and faster assignment types and methods include("assignments/include.jl") diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index cdb2b26..8ed6dd5 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -48,5 +48,3 @@ Base.@propagate_inbounds function Base.getindex(a::Assignment, i::Int) @boundscheck checkbounds(a, i) return get_vertex_in_group(a, i) end - -include("fit.jl") diff --git a/src/assignments/fit.jl b/src/fit.jl similarity index 65% rename from src/assignments/fit.jl rename to src/fit.jl index e39fbab..c42744f 100644 --- a/src/assignments/fit.jl +++ b/src/fit.jl @@ -1,21 +1,29 @@ +# Slow fallback methods for the Assignment type +# speed up by implementing specialized 
methods for the BernoulliAssignment type and others # method to compute estimator from node clustering as specified in assignment function fit(a::Assignment, g::Observations) dists = initialize_sbm(a.group_size, g.dist_ref) for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) edge_indices = get_edge_indices(a, group1, group2) - dists[group1, group2] = fit(g.dist_ref, g.graph, edge_indices) + dists[group1, group2] = _fit(g.dist_ref, g.graph, edge_indices) end end return dists end -function fit(distribution, g, edges) +function _fit(distribution, g, edges) return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) end -# method to compute the log likelihood of fitted SBM -function log_likelihood(a::Assignment, sbm::SBM, g) +# method to compute the log likelihood of a SBM fitted according to the assignment +function log_likelihood(a::Assignment, g::Observations) + dists = fit(a, g) + return _log_likelihood(a, dists, g.graph) +end + + +function _log_likelihood(a::Assignment, sbm::SBM, g) log_likelihood = 0.0 for i in 1:number_nodes(a) label_a = a.node_labels[i] @@ -26,13 +34,3 @@ function log_likelihood(a::Assignment, sbm::SBM, g) end return log_likelihood end - -function log_likelihood(a::Assignment, g::Observations) - dists = fit(a, g) - return log_likelihood(a, dists, g.graph) -end - -# default score is the log likelihood -function score(a::Assignment, g::Observations) - return log_likelihood(a, g)/binomial(number_nodes(a), 2) -end diff --git a/src/optimisation/config_rules/include.jl b/src/optimisation/config_rules/include.jl new file mode 100644 index 0000000..8c5b6bf --- /dev/null +++ b/src/optimisation/config_rules/include.jl @@ -0,0 +1,5 @@ +include("swap_rule.jl") +include("accept_rule.jl") +include("InitRule.jl") +include("stop_rule.jl") +include("bandwidth_selection_rule.jl") diff --git a/src/optimisation/include.jl b/src/optimisation/include.jl new file mode 100644 index 0000000..28efb63 --- /dev/null +++ 
b/src/optimisation/include.jl @@ -0,0 +1,2 @@ +include("swap.jl") +include("opti.jl") diff --git a/src/optimisation/loss.jl b/src/optimisation/loss.jl new file mode 100644 index 0000000..4cf0897 --- /dev/null +++ b/src/optimisation/loss.jl @@ -0,0 +1,4 @@ +# default score is the log likelihood +function score(a::Assignment, g::Observations) + return log_likelihood(a, g) / binomial(number_nodes(a), 2) +end diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl index a09e8ab..8e3dd9e 100644 --- a/src/optimisation/opti.jl +++ b/src/optimisation/opti.jl @@ -1,8 +1,5 @@ -include("config_rules/swap_rule.jl") -include("config_rules/accept_rule.jl") -include("config_rules/InitRule.jl") -include("config_rules/stop_rule.jl") -include("config_rules/bandwidth_selection_rule.jl") +include("config_rules/include.jl") +include("loss.jl") function optimize(g, h = select_bandwidth(g); max_iter::Int = 1000, diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl index 17d5a5e..56eeeda 100644 --- a/src/optimisation/swap.jl +++ b/src/optimisation/swap.jl @@ -5,7 +5,7 @@ mutable struct DefaultSwap <: Swap index2::Int end -function make_swap(::Assignment{T, Nothing}, id::Tuple{Int, Int}) where {T} +function make_swap(::Assignment, id::Tuple{Int, Int}) where {T} return DefaultSwap(id[1], id[2]) end From db753cd30feecf93e440351615a4f211dc826f86 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 11:57:07 +0200 Subject: [PATCH 009/266] revert over-refactoring of lossfunction --- src/optimisation/config_rules/stop_rule.jl | 9 +++++++-- src/optimisation/loss.jl | 4 ---- src/optimisation/opti.jl | 1 - src/run_dev.jl | 1 + 4 files changed, 8 insertions(+), 7 deletions(-) delete mode 100644 src/optimisation/loss.jl diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl index 6efc980..1777c2d 100644 --- a/src/optimisation/config_rules/stop_rule.jl +++ b/src/optimisation/config_rules/stop_rule.jl @@ -1,7 +1,12 @@ abstract 
type StopRule end -function initialise_stop_rule!( - stop_rule::StopRule, a, g) +function initialise_stop_rule!(stop_rule::StopRule, a, g) +end + + +# default score is the log likelihood +function score(a::Assignment, g::Observations) + return log_likelihood(a, g) / binomial(number_nodes(a), 2) end mutable struct PreviousBestValue{T} <: StopRule diff --git a/src/optimisation/loss.jl b/src/optimisation/loss.jl deleted file mode 100644 index 4cf0897..0000000 --- a/src/optimisation/loss.jl +++ /dev/null @@ -1,4 +0,0 @@ -# default score is the log likelihood -function score(a::Assignment, g::Observations) - return log_likelihood(a, g) / binomial(number_nodes(a), 2) -end diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl index 8e3dd9e..6874038 100644 --- a/src/optimisation/opti.jl +++ b/src/optimisation/opti.jl @@ -1,5 +1,4 @@ include("config_rules/include.jl") -include("loss.jl") function optimize(g, h = select_bandwidth(g); max_iter::Int = 1000, diff --git a/src/run_dev.jl b/src/run_dev.jl index 6a40b25..ab57fa6 100644 --- a/src/run_dev.jl +++ b/src/run_dev.jl @@ -50,6 +50,7 @@ end ## using NetworkHistogram, Random +G = Graph(20, 20) group_number = NetworkHistogram.GroupSize(nv(G), 3) if typeof(group_number) == NetworkHistogram.GroupSize{Tuple{Int, Int}} From b0efdcce11e923c7a1325c3ff332d20a813268d0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 17:02:03 +0200 Subject: [PATCH 010/266] Fast Bernoulli data is functionning but wrong Algorithm is super fast, but does not seem to be able to fit simple SBM --- Project.toml | 2 + src/NetworkHistogram.jl | 2 +- src/assignments/Assignments.jl | 4 + src/assignments/BernoulliAssignment/struct.jl | 82 ++++++++++++-- src/assignments/BernoulliAssignment/swap.jl | 83 +++++++++++++-- src/fit.jl | 1 - src/optimisation/config_rules/stop_rule.jl | 1 - src/optimisation/config_rules/swap_rule.jl | 2 +- src/optimisation/opti.jl | 10 +- src/optimisation/swap.jl | 11 +- src/run_dev.jl | 100 ------------------ 
src/sbm.jl | 22 ++-- 12 files changed, 180 insertions(+), 140 deletions(-) delete mode 100644 src/run_dev.jl diff --git a/Project.toml b/Project.toml index e52cc10..f9a9df9 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.5.2" ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" @@ -32,6 +33,7 @@ ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7" ArnoldiMethod = "0.2.0" Arpack = "0.5.4" BenchmarkTools = "1.3.2" +CairoMakie = "0.12.14" CodecZstd = "0.7.2" DataStructures = "0.18.20" DensityInterface = "0.4.0" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 3a05bdb..850f4ae 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -4,7 +4,7 @@ using LinearAlgebra, SparseArrays, DataStructures using Distributions, DensityInterface using Graphs using PermutationSymmetricTensors -using ProgressMeter: Progress,next!,finish! +using ProgressMeter: Progress, next!, finish! 
import StatsBase, Random include("group_numbering.jl") diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 8ed6dd5..220e570 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -33,6 +33,10 @@ function get_vertex_in_group(assignment::Assignment, group::Int) return findall(assignment.node_labels .== group) end +function get_group_of_vertex(assignment::Assignment, vertex::Int) + return assignment.node_labels[vertex] +end + function get_edge_indices(a::Assignment, i::Int, j::Int) return [(x, y) for x in get_vertex_in_group(a, i) for y in get_vertex_in_group(a, j)] diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index 149af79..eeacbac 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -1,24 +1,84 @@ -struct BernoulliData{T} +mutable struct BernoulliData{F} counts::Matrix{Int} realized::Matrix{Int} - estimated_theta::Matrix{T} + estimated_theta::Matrix{F} A::BitMatrix + log_likelihood::F end -const BernoulliAssignment{T} = Assignment{T, BernoulliData} -const BernoulliInitRule{S} = InitRule{S, Val{BernoulliData}} +const BernoulliAssignment{T, F} = Assignment{T, BernoulliData{F}} +const BernoulliInitRule{S, F} = InitRule{S, Val{BernoulliData}} # is this type stable? should this be BernoulliAssignment{T,F}? 
see line 8 above function BernoulliAssignment( - G, node_labels::Vector{Int}, group_size::GroupSize{T}) where {T} - k = length(group_size) - return BernoulliAssignment{T}(group_size, node_labels, - BernoulliData(zeros(Int, k, k), zeros(Int, k, k), zeros(T, k, k), BitMatrix(G))) + g, group_size::GroupSize, node_labels::Vector{Int}) + bernoulli_data = make_bernoulli_data(g, node_labels, group_size) + return Assignment(group_size, node_labels, bernoulli_data) end -function make_assignment(G, h, init_rule::BernoulliInitRule{S}) where {S} - return BernoulliAssignment(initialize_node_labels( - G, h, init_rule.starting_assignment_rule)...) +function make_assignment(g, h, init_rule::BernoulliInitRule) + group_size, node_labels = initialize_node_labels( + g, h, init_rule.starting_assignment_rule) + return BernoulliAssignment(g, group_size, node_labels) end +function make_bernoulli_data(g, node_labels, group_size) + number_groups = length(group_size) + n = length(node_labels) + counts = zeros(Int, number_groups, number_groups) + realized = zeros(Int, number_groups, number_groups) + A = convert_bitmatrix(g) + @inbounds @simd for k in 1:number_groups + for l in k:number_groups + realized[k, l] = sum(A[node_labels .== k, node_labels .== l]) + realized[l, k] = realized[k, l] + counts[k, l] = group_size[k] * group_size[l] + counts[l, k] = counts[k, l] + end + end + + @inbounds @simd for k in 1:number_groups + counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2 + realized[k, k] = sum(A[node_labels .== k, node_labels .== k]) ÷ 2 + end + + estimated_theta = realized ./ counts + ll = compute_log_likelihood(estimated_theta, counts) + return BernoulliData(counts, realized, estimated_theta, A, ll) +end + +function convert_bitmatrix(g::Observations{<:AbstractGraph, D}) where {D} + A = collect(adjacency_matrix(g.graph)) + return convert(BitMatrix, collect(adjacency_matrix(g.graph))) +end + +function convert_bitmatrix(g::Observations{<:AbstractMatrix, D}) where {D} + return 
convert(BitMatrix, g.graph) +end + +function compute_log_likelihood(estimated_theta, counts) + number_groups = size(estimated_theta, 1) + loglik = 0.0 + @inbounds @simd for i in 1:number_groups + for j in i:number_groups + θ = estimated_theta[i, j] + θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 1 - 1e-14 : θ) + loglik += (θ_c * log(θ_c) + (1 - θ_c) * log(1 - θ_c)) * counts[i, j] + end + end + return loglik +end + +function log_likelihood(assignment::BernoulliAssignment) + return assignment.additional_data.log_likelihood +end + +function force_recompute_ll(a::BernoulliAssignment, g::Observations) + a_simple = Assignment(a.group_size, a.node_labels) + return log_likelihood(a_simple, g) +end + + +log_likelihood(a::BernoulliAssignment, g::Observations) = log_likelihood(a) + include("swap.jl") diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index 7514742..b949ec0 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -1,23 +1,84 @@ -mutable struct BernoulliSwap{T} <: Swap +mutable struct BernoulliSwap{F} <: Swap index1::Int index2::Int - old_assignment::BernoulliAssignment{T} + realized::Matrix{Int} + estimated_theta::Matrix{F} + log_likelihood::F + node_labels::Vector{Int} end -function make_swap(assignment::BernoulliAssignment{T}, id::Tuple{Int}) where {T} - return BernoulliSwap(id[1], id[2], deepcopy(assignment)) +function make_swap(assignment::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) where {T, F} + return BernoulliSwap(id[1], id[2], copy(assignment.additional_data.realized), + copy(assignment.additional_data.estimated_theta), + assignment.additional_data.log_likelihood, copy(assignment.node_labels)) + # realized = copy(assignment.additional_data.realized) + # estimated_theta = copy(assignment.additional_data.estimated_theta) + # log_likelihood = assignment.additional_data.log_likelihood + # return BernoulliSwap(id[1], id[2], realized, estimated_theta, 
log_likelihood) end -function make_swap!(swap::BernoulliSwap{T}, assignment::BernoulliAssignment{T}, - id::Tuple{Int}) where {T} +function make_swap!(swap::BernoulliSwap{F}, assignment::BernoulliAssignment{T, F}, + id::Tuple{Int, Int}) where {T, F} swap.index1, swap.index2 = id - swap.old_assignment = deepcopy(assignment) + copy!(swap.realized, assignment.additional_data.realized) + copy!(swap.estimated_theta, assignment.additional_data.estimated_theta) + #copy!(swap.node_labels, assignment.node_labels) + swap.log_likelihood = assignment.additional_data.log_likelihood end -function revert_swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} - assignment = deepcopy(swap.old_assignment) +function revert_swap!( + assignment::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} + swap_node_labels!(assignment, swap.index1, swap.index2) + copy!(assignment.additional_data.realized, swap.realized) + copy!(assignment.additional_data.estimated_theta, swap.estimated_theta) + #copy!(assignment.node_labels, swap.node_labels) + assignment.additional_data.log_likelihood = swap.log_likelihood end -function apply_swap!(assignment::BernoulliAssignment{T}, swap::BernoulliSwap{T}) where {T} - # perform fast update +function apply_swap!( + assignment::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} + swap_node_labels!(assignment, swap.index1, swap.index2) + update_observed!(assignment, swap) + update_ll!(assignment) +end + +function update_observed!(a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + + for i in 1:length(a.node_labels) + if i == swap.index1 || i == swap.index2 || + a.additional_data.A[swap.index1, i] == a.additional_data.A[swap.index2, i] + continue + end + group_inter = get_group_of_vertex(a, i) + if a.additional_data.A[swap.index1, i] == 1 + a.additional_data.realized[g1, group_inter] -= 1 + 
a.additional_data.realized[group_inter, g1] = a.additional_data.realized[ + g1, group_inter] + + a.additional_data.realized[g2, group_inter] += 1 + a.additional_data.realized[group_inter, g2] = a.additional_data.realized[ + g2, group_inter] + end + if a.additional_data.A[swap.index2, i] == 1 + a.additional_data.realized[g2, group_inter] -= 1 + a.additional_data.realized[group_inter, g2] = a.additional_data.realized[ + g2, group_inter] + + a.additional_data.realized[g1, group_inter] += 1 + a.additional_data.realized[group_inter, g1] = a.additional_data.realized[ + g1, group_inter] + end + end + + @. a.additional_data.estimated_theta = a.additional_data.realized / + a.additional_data.counts + return nothing +end + +function update_ll!(a::BernoulliAssignment) + a.additional_data.log_likelihood = compute_log_likelihood( + a.additional_data.estimated_theta, a.additional_data.counts) + return nothing end diff --git a/src/fit.jl b/src/fit.jl index c42744f..55ccebb 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -22,7 +22,6 @@ function log_likelihood(a::Assignment, g::Observations) return _log_likelihood(a, dists, g.graph) end - function _log_likelihood(a::Assignment, sbm::SBM, g) log_likelihood = 0.0 for i in 1:number_nodes(a) diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl index 1777c2d..de7a34c 100644 --- a/src/optimisation/config_rules/stop_rule.jl +++ b/src/optimisation/config_rules/stop_rule.jl @@ -3,7 +3,6 @@ abstract type StopRule end function initialise_stop_rule!(stop_rule::StopRule, a, g) end - # default score is the log likelihood function score(a::Assignment, g::Observations) return log_likelihood(a, g) / binomial(number_nodes(a), 2) diff --git a/src/optimisation/config_rules/swap_rule.jl b/src/optimisation/config_rules/swap_rule.jl index c1a8afa..cee5b7a 100644 --- a/src/optimisation/config_rules/swap_rule.jl +++ b/src/optimisation/config_rules/swap_rule.jl @@ -14,7 +14,7 @@ current assignment 
`node_assignment`. select_swap function select_swap(assignment::Assignment, ::RandomNodeSwap) - groups = StatsBase.sample(1:number_groups(assignment), 2; replace=false) + groups = StatsBase.sample(1:number_groups(assignment), 2; replace = false) index1 = rand(get_vertex_in_group(assignment, groups[1])) index2 = rand(get_vertex_in_group(assignment, groups[2])) return (index1, index2) diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl index 6874038..40743ad 100644 --- a/src/optimisation/opti.jl +++ b/src/optimisation/opti.jl @@ -2,13 +2,14 @@ include("config_rules/include.jl") function optimize(g, h = select_bandwidth(g); max_iter::Int = 1000, - initialise_rule::InitRule = InitRule(RandomStart(), nothing), + initialise_rule::InitRule = InitRule(OrderedStart(), nothing), swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), stop_rule::StopRule = PreviousBestValue(10), progress_bar::Bool = false ) a = make_assignment(g, h, initialise_rule) + println("Initial log likelihood: ", score(a, g)) initialise_stop_rule!(stop_rule, a, g) greedy_improve!(a, g; max_iter, swap_rule, accept_rule, stop_rule, progress_bar) return a @@ -18,17 +19,22 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 1000, swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), stop_rule::StopRule = PreviousBestValue(10), - progress_bar::Bool = false, + progress_bar::Bool = false ) # swap memory allocation swap = make_swap(a, (1, 1)) p = Progress(max_iter; enabled = progress_bar) # perform local search until the stopping rule is met + score_value = score(a, g) + new_score_value = score_value for i in 1:max_iter + score_value = new_score_value local_search!(a, g, swap, swap_rule = swap_rule, accept_rule = accept_rule) + new_score_value = score(a, g) next!(p) if stopping_rule(a, g, stop_rule) finish!(p) + println("Stopping rule kicked in") break end end diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl index 
56eeeda..272b1de 100644 --- a/src/optimisation/swap.jl +++ b/src/optimisation/swap.jl @@ -5,7 +5,7 @@ mutable struct DefaultSwap <: Swap index2::Int end -function make_swap(::Assignment, id::Tuple{Int, Int}) where {T} +function make_swap(::Assignment, id::Tuple{Int, Int}) return DefaultSwap(id[1], id[2]) end @@ -13,9 +13,10 @@ function make_swap!(swap::DefaultSwap, a::Assignment, id::Tuple{Int, Int}) swap.index1, swap.index2 = id end -function apply_swap!(a::Assignment, s::DefaultSwap) - a.node_labels[s.index1], a.node_labels[s.index2] = a.node_labels[s.index2], - a.node_labels[s.index1] -end +apply_swap!(a::Assignment, s::DefaultSwap) = swap_node_labels!(a, s.index1, s.index2) revert_swap!(assignment::Assignment, swap::DefaultSwap) = apply_swap!(assignment, swap) + +function swap_node_labels!(a::Assignment, i, j) + a.node_labels[i], a.node_labels[j] = a.node_labels[j],a.node_labels[i] +end diff --git a/src/run_dev.jl b/src/run_dev.jl deleted file mode 100644 index ab57fa6..0000000 --- a/src/run_dev.jl +++ /dev/null @@ -1,100 +0,0 @@ -using Graphs -using MetaGraphsNext -using Distributions -using SimpleWeightedGraphs - -cities = MetaGraph( - Graph(); - label_type = String, - vertex_data_type = Vector{Float64}, - edge_data_type = Float64, - graph_data = nothing, - default_weight = -Inf, - weight_function = identity -); - -cities["Paris"] = [2.3, 48.9]; -cities["London"] = [0.1, 51.5]; -cities["Berlin"] = [13.4, 52.5]; -cities["Lausanne"] = [6.6, 46.5]; -cities["Paris", "London"] = 0.5; -cities["Paris", "Berlin"] = 1; -cities["London", "Berlin"] = 0; -cities["Paris", "Lausanne"] = 1; - -getindex(cities, "Paris") - -typeof(cities) - -index = code_for(cities, "Paris") -label_for.(Ref(cities), neighbors(cities, index)) -adjacency_matrix(cities) -collect(weights(cities)) - -G = Graph(20, 20) -adjacency_matrix(G) - -sources = [1, 2, 1]; - -destinations = [2, 3, 3]; - -weight_edges = [0.5, 0.8, 2.0]; - -g = SimpleWeightedGraph(sources, destinations, weight_edges) 
-add_vertices!(g, 5) - -function get_obs(g::SimpleGraph{T}, i::Int, j::Int) where {T} - return has_edge(g, i, j) -end - -## - -using NetworkHistogram, Random -G = Graph(20, 20) - -group_number = NetworkHistogram.GroupSize(nv(G), 3) -if typeof(group_number) == NetworkHistogram.GroupSize{Tuple{Int, Int}} - node_labels = repeat(1:(length(group_number) - 1), inner = group_number[1]) - last_labels = fill(length(group_number), group_number[end]) - node_labels = vcat(node_labels, last_labels) -else - node_labels = repeat(1:length(group_number), inner = group_number[1]) -end -additional_info = 1 -a = NetworkHistogram.Assignment(group_number, node_labels) -dist = Bernoulli(0.5) -obs = NetworkHistogram.Observations(G, dist) -sbm_fit = NetworkHistogram.fit(a, obs) - -sbm = NetworkHistogram.initialize_sbm([1 / 3, 1 / 3, 1 / 3], dist) -for i in 1:3 - sbm[i, i] = Bernoulli(0.2) - for j in (i + 1):3 - sbm[i, j] = Bernoulli(0.01) - end -end - -size_per_block = 200 -A, node_labels = NetworkHistogram.sample(sbm, 3 * size_per_block); -node_labels = repeat(1:3, inner = size_per_block) -Random.shuffle!(node_labels) -group_number = NetworkHistogram.GroupSize(size(A, 1), size_per_block) -a_star = NetworkHistogram.Assignment(group_number, node_labels, additional_info) -obs_star = NetworkHistogram.Observations(SimpleGraph(A), dist) -sbm_fitted = NetworkHistogram.fit(a_star, obs_star) - -sbm_fitted - -init_rule = NetworkHistogram.InitRule(NetworkHistogram.RandomStart(), nothing) -ll_old = NetworkHistogram.score(a_star, obs_star) -println("Log likelihood: ", ll_old) -a_best = NetworkHistogram.optimize(obs_star, size_per_block, max_iter = 100; - stop_rule = NetworkHistogram.PreviousBestValue(5), - initialise_rule = init_rule, progress_bar = true) -ll_new = NetworkHistogram.score(a_best, obs_star) -println("Log likelihood: ", ll_new) -if ll_old < ll_new - println("Optimization improved the log likelihood.") -else - println("Optimization did not improve the log likelihood.") -end diff --git 
a/src/sbm.jl b/src/sbm.jl index 7f8e4e2..da6a8d7 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -42,15 +42,11 @@ Base.@propagate_inbounds function Base.getindex(s::SBM, i, j) return getindex(s.probs, i, j) end -function sample(rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) - n_blocks = number_blocks(sbm) - node_labels = StatsBase.sample( - rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) - if sorted - sort!(node_labels) - end +function sample(rng::Random.AbstractRNG, sbm::SBM, node_labels::Vector{Int}) + n_nodes = length(node_labels) A = BitMatrix(undef, n_nodes, n_nodes) for i in 1:n_nodes + A[i, i] = zero(eltype(A)) for j in (i + 1):n_nodes A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) A[j, i] = A[i, j] @@ -58,4 +54,16 @@ function sample(rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) end return sparse(A), node_labels end + +sample(sbm::SBM, node_labels::Vector{Int}) = sample(Random.default_rng(), sbm, node_labels) +function sample(rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) + n_blocks = number_blocks(sbm) + node_labels = StatsBase.sample( + rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) + if sorted + sort!(node_labels) + end + return sample(rng, sbm, node_labels) +end + sample(sbm::SBM, n_nodes::Int) = sample(Random.default_rng(), sbm, n_nodes) From 11eee28f07ffa9ae8351932cefe50eabdc6231bd Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 18:21:26 +0200 Subject: [PATCH 011/266] functionning Bernoulli implementation --- Project.toml | 2 ++ src/assignments/BernoulliAssignment/struct.jl | 8 +++++-- src/assignments/BernoulliAssignment/swap.jl | 21 +++++++++++++---- src/optimisation/config_rules/stop_rule.jl | 23 ++++++++----------- src/optimisation/opti.jl | 2 +- 5 files changed, 35 insertions(+), 21 deletions(-) diff --git a/Project.toml b/Project.toml index f9a9df9..9609784 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ 
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -35,6 +36,7 @@ Arpack = "0.5.4" BenchmarkTools = "1.3.2" CairoMakie = "0.12.14" CodecZstd = "0.7.2" +Combinatorics = "1.0.2" DataStructures = "0.18.20" DensityInterface = "0.4.0" Distributions = "0.25.112" diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index eeacbac..f9bfcb8 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -2,7 +2,7 @@ mutable struct BernoulliData{F} counts::Matrix{Int} realized::Matrix{Int} estimated_theta::Matrix{F} - A::BitMatrix + A::BitMatrix # possible improvement by using an adjacency list Graphs.SimpleGraphs.adj(G) log_likelihood::F end @@ -78,7 +78,11 @@ function force_recompute_ll(a::BernoulliAssignment, g::Observations) return log_likelihood(a_simple, g) end - log_likelihood(a::BernoulliAssignment, g::Observations) = log_likelihood(a) +function get_ordered_adjacency_matrix(a::BernoulliAssignment) + perm = sortperm(a.node_labels) + return a.additional_data.A[perm,perm] +end + include("swap.jl") diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index b949ec0..9175b90 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -37,8 +37,9 @@ end function apply_swap!( assignment::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - swap_node_labels!(assignment, swap.index1, swap.index2) + # swap of the labels should happen after the update of the realized and estimated_theta 
update_observed!(assignment, swap) + swap_node_labels!(assignment, swap.index1, swap.index2) update_ll!(assignment) end @@ -46,13 +47,13 @@ function update_observed!(a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) g1 = get_group_of_vertex(a, swap.index1) g2 = get_group_of_vertex(a, swap.index2) - for i in 1:length(a.node_labels) + for i in axes(a.additional_data.A, 2) if i == swap.index1 || i == swap.index2 || a.additional_data.A[swap.index1, i] == a.additional_data.A[swap.index2, i] continue end group_inter = get_group_of_vertex(a, i) - if a.additional_data.A[swap.index1, i] == 1 + if a.additional_data.A[swap.index1, i] a.additional_data.realized[g1, group_inter] -= 1 a.additional_data.realized[group_inter, g1] = a.additional_data.realized[ g1, group_inter] @@ -61,7 +62,7 @@ function update_observed!(a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) a.additional_data.realized[group_inter, g2] = a.additional_data.realized[ g2, group_inter] end - if a.additional_data.A[swap.index2, i] == 1 + if a.additional_data.A[swap.index2, i] a.additional_data.realized[g2, group_inter] -= 1 a.additional_data.realized[group_inter, g2] = a.additional_data.realized[ g2, group_inter] @@ -82,3 +83,15 @@ function update_ll!(a::BernoulliAssignment) a.additional_data.estimated_theta, a.additional_data.counts) return nothing end + + +function fit(a::BernoulliAssignment, g::Observations) + println("Fitting BernoulliAssignment") + dists = initialize_sbm(a.group_size, Bernoulli(0.5)) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + dists[group1, group2] = Bernoulli(a.additional_data.estimated_theta[group1, group2]) + end + end + return dists +end diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl index de7a34c..1153978 100644 --- a/src/optimisation/config_rules/stop_rule.jl +++ b/src/optimisation/config_rules/stop_rule.jl @@ -10,21 +10,19 @@ end mutable struct PreviousBestValue{T} <: StopRule k::Int - 
past_values::CircularDeque{T} + previous_best_value::T + iterations_since_best::Int function PreviousBestValue( k::Int, x::T = -Inf) where {T <: Real} @assert k > 0 # queue stores the best values and at most k subsequent values - queue = CircularDeque{T}(k + 1) - push!(queue, x) - new{T}(k, queue) + new{T}(k, x, 0) end end function initialise_stop_rule!(stop_rule::PreviousBestValue, a, g) score_value = score(a, g) - empty!(stop_rule.past_values) - push!(stop_rule.past_values, score_value) + stop_rule.previous_best_value = score_value end """ @@ -40,14 +38,11 @@ stopping_rule function stopping_rule(assignment::Assignment, g, stop_rule::PreviousBestValue) score_value = score(assignment, g) - if isempty(stop_rule.past_values) - push!(stop_rule.past_values, score_value) - return false - elseif score_value > first(stop_rule.past_values) - empty!(stop_rule.past_values) - push!(stop_rule.past_values, score_value) - return false + if score_value > stop_rule.previous_best_value + stop_rule.previous_best_value = score_value + stop_rule.iterations_since_best = 0 else - return length(stop_rule.past_values) == capacity(stop_rule.past_values) + stop_rule.iterations_since_best += 1 end + return stop_rule.iterations_since_best >= stop_rule.k end diff --git a/src/optimisation/opti.jl b/src/optimisation/opti.jl index 40743ad..8b3c070 100644 --- a/src/optimisation/opti.jl +++ b/src/optimisation/opti.jl @@ -33,8 +33,8 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 1000, new_score_value = score(a, g) next!(p) if stopping_rule(a, g, stop_rule) + println("Stopping rule kicked in at iteration $i.") finish!(p) - println("Stopping rule kicked in") break end end From 10c1d76058ca9f918b9358c0c0eec4047e3335eb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 18:24:52 +0200 Subject: [PATCH 012/266] clean and create skeleton for categorical --- src/assignments/BernoulliAssignment/struct.jl | 2 +- src/assignments/BernoulliAssignment/swap.jl | 17 +++++++---------- 
src/assignments/CategoricalAssignment/struct.jl | 10 ++++++++++ src/assignments/CategoricalAssignment/swap.jl | 4 ++++ src/optimisation/swap.jl | 2 +- 5 files changed, 23 insertions(+), 12 deletions(-) create mode 100644 src/assignments/CategoricalAssignment/struct.jl create mode 100644 src/assignments/CategoricalAssignment/swap.jl diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index f9bfcb8..e210a0b 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -82,7 +82,7 @@ log_likelihood(a::BernoulliAssignment, g::Observations) = log_likelihood(a) function get_ordered_adjacency_matrix(a::BernoulliAssignment) perm = sortperm(a.node_labels) - return a.additional_data.A[perm,perm] + return a.additional_data.A[perm, perm] end include("swap.jl") diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index 9175b90..7f03dca 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -11,10 +11,6 @@ function make_swap(assignment::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) w return BernoulliSwap(id[1], id[2], copy(assignment.additional_data.realized), copy(assignment.additional_data.estimated_theta), assignment.additional_data.log_likelihood, copy(assignment.node_labels)) - # realized = copy(assignment.additional_data.realized) - # estimated_theta = copy(assignment.additional_data.estimated_theta) - # log_likelihood = assignment.additional_data.log_likelihood - # return BernoulliSwap(id[1], id[2], realized, estimated_theta, log_likelihood) end function make_swap!(swap::BernoulliSwap{F}, assignment::BernoulliAssignment{T, F}, @@ -22,7 +18,6 @@ function make_swap!(swap::BernoulliSwap{F}, assignment::BernoulliAssignment{T, F swap.index1, swap.index2 = id copy!(swap.realized, assignment.additional_data.realized) copy!(swap.estimated_theta, 
assignment.additional_data.estimated_theta) - #copy!(swap.node_labels, assignment.node_labels) swap.log_likelihood = assignment.additional_data.log_likelihood end @@ -37,9 +32,7 @@ end function apply_swap!( assignment::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - # swap of the labels should happen after the update of the realized and estimated_theta - update_observed!(assignment, swap) - swap_node_labels!(assignment, swap.index1, swap.index2) + update_observed_and_labels!(assignment, swap) update_ll!(assignment) end @@ -75,6 +68,10 @@ function update_observed!(a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) @. a.additional_data.estimated_theta = a.additional_data.realized / a.additional_data.counts + + # swap of the labels should happen after the update of the realized and estimated_theta + # for the above loop to work correctly + swap_node_labels!(assignment, swap.index1, swap.index2) return nothing end @@ -84,13 +81,13 @@ function update_ll!(a::BernoulliAssignment) return nothing end - function fit(a::BernoulliAssignment, g::Observations) println("Fitting BernoulliAssignment") dists = initialize_sbm(a.group_size, Bernoulli(0.5)) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) - dists[group1, group2] = Bernoulli(a.additional_data.estimated_theta[group1, group2]) + dists[group1, group2] = Bernoulli(a.additional_data.estimated_theta[ + group1, group2]) end end return dists diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl new file mode 100644 index 0000000..b2c8fc6 --- /dev/null +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -0,0 +1,10 @@ +mutable struct CategoricalData{F} + counts::Matrix{Int} + realized::Matrix{Int} + estimated_theta::Matrix{F} + A::Matrix{Int} # possible improvement by using an adjacency list Graphs.SimpleGraphs.adj(G) + log_likelihood::F +end + +const CategoricalAssignment{T, F} = Assignment{T, CategoricalData{F}} +const 
CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl new file mode 100644 index 0000000..d6a721b --- /dev/null +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -0,0 +1,4 @@ +mutable struct CategoricalSwap{F} <: Swap + index1::Int + index2::Int +end diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl index 272b1de..08ef383 100644 --- a/src/optimisation/swap.jl +++ b/src/optimisation/swap.jl @@ -18,5 +18,5 @@ apply_swap!(a::Assignment, s::DefaultSwap) = swap_node_labels!(a, s.index1, s.in revert_swap!(assignment::Assignment, swap::DefaultSwap) = apply_swap!(assignment, swap) function swap_node_labels!(a::Assignment, i, j) - a.node_labels[i], a.node_labels[j] = a.node_labels[j],a.node_labels[i] + a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] end From 4b875e9d459c39da1f4d8d94724562495a6077eb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 18:26:31 +0200 Subject: [PATCH 013/266] minor typos --- src/assignments/BernoulliAssignment/swap.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index 7f03dca..596bb1d 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -36,7 +36,8 @@ function apply_swap!( update_ll!(assignment) end -function update_observed!(a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} +function update_observed_and_labels!( + a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} g1 = get_group_of_vertex(a, swap.index1) g2 = get_group_of_vertex(a, swap.index2) @@ -71,7 +72,7 @@ function update_observed!(a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) # swap of the labels should happen after the update of the realized and estimated_theta # for the above loop to work correctly - 
swap_node_labels!(assignment, swap.index1, swap.index2) + swap_node_labels!(a, swap.index1, swap.index2) return nothing end From 1013bbd05edfee06e2fe40f3c80c1a8b8ca0c531 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 16 Oct 2024 18:28:38 +0200 Subject: [PATCH 014/266] add manual warning to remember what to do --- src/NetworkHistogram.jl | 3 +++ src/optimisation/config_rules/bandwidth_selection_rule.jl | 1 + 2 files changed, 4 insertions(+) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 850f4ae..177dac8 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -17,4 +17,7 @@ include("optimisation/include.jl") # more specialised and faster assignment types and methods include("assignments/include.jl") +@warn "User interface is not yet implemented" + + end diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 15ff178..9991326 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -1,3 +1,4 @@ +@warn "Deprecated bandwidth selection needs to be updated" function select_bandwidth(A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T} h = oracle_bandwidth(A, type, alpha, c) From 85ff2e7d26e4ee14b3d05b36576005ee7d77846d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 17 Oct 2024 11:02:10 +0200 Subject: [PATCH 015/266] update docs, GA, and prepare for benchmark --- .github/dependabot.yml | 10 ++++ .github/workflows/benchmark_pr.yml | 78 ++++++++++++++++++++++++++++++ benchmark/benchmarks.jl | 15 ++++++ docs/make.jl | 1 - docs/src/api.md | 3 +- docs/src/internals.md | 8 --- docs/src/rules.md | 2 +- 7 files changed, 106 insertions(+), 11 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/benchmark_pr.yml create mode 100644 benchmark/benchmarks.jl delete mode 100644 docs/src/internals.md diff --git a/.github/dependabot.yml 
b/.github/dependabot.yml new file mode 100644 index 0000000..8400d25 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,10 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + open-pull-requests-limit: 99 + labels: + - "dependencies" + - "github-actions" \ No newline at end of file diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml new file mode 100644 index 0000000..dfad667 --- /dev/null +++ b/.github/workflows/benchmark_pr.yml @@ -0,0 +1,78 @@ +name: Benchmark a pull request + +on: + pull_request_target: + branches: + - master + +permissions: + pull-requests: write + +jobs: + generate_plots: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: "1.11" + - uses: julia-actions/cache@v2 + - name: Extract Package Name from Project.toml + id: extract-package-name + run: | + PACKAGE_NAME=$(grep "^name" Project.toml | sed 's/^name = "\(.*\)"$/\1/') + echo "::set-output name=package_name::$PACKAGE_NAME" + - name: Build AirspeedVelocity + env: + JULIA_NUM_THREADS: 2 + run: | + # Lightweight build step, as sometimes the runner runs out of memory: + julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(;url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' + julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' + - name: Add ~/.julia/bin to PATH + run: | + echo "$HOME/.julia/bin" >> $GITHUB_PATH + - name: Run benchmarks + run: | + echo $PATH + ls -l ~/.julia/bin + mkdir results + benchpkg ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --url=${{ github.event.repository.clone_url }} --bench-on="${{github.event.repository.default_branch}}" --output-dir=results/ --tune + - name: Create plots from benchmarks + run: | + mkdir -p plots + benchpkgplot ${{ 
steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --npart=10 --format=png --input-dir=results/ --output-dir=plots/ + - name: Upload plot as artifact + uses: actions/upload-artifact@v4 + with: + name: plots + path: plots + - name: Create markdown table from benchmarks + run: | + benchpkgtable ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --input-dir=results/ --ratio > table.md + echo '### Benchmark Results' > body.md + echo '' >> body.md + echo '' >> body.md + cat table.md >> body.md + echo '' >> body.md + echo '' >> body.md + echo '### Benchmark Plots' >> body.md + echo 'A plot of the benchmark results have been uploaded as an artifact to the workflow run for this PR.' >> body.md + echo 'Go to "Actions"->"Benchmark a pull request"->[the most recent run]->"Artifacts" (at the bottom).' >> body.md + + - name: Find Comment + uses: peter-evans/find-comment@v3 + id: fcbenchmark + with: + issue-number: ${{ github.event.pull_request.number }} + comment-author: 'github-actions[bot]' + body-includes: Benchmark Results + + - name: Comment on PR + uses: peter-evans/create-or-update-comment@v4 + with: + comment-id: ${{ steps.fcbenchmark.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + body-path: body.md + edit-mode: replace \ No newline at end of file diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl new file mode 100644 index 0000000..0f0f91a --- /dev/null +++ b/benchmark/benchmarks.jl @@ -0,0 +1,15 @@ +using BenchmarkTools, NetworkHistogram +const SUITE = BenchmarkGroup() + +# Create hierarchy of benchmarks: +SUITE["eval"] = BenchmarkGroup() + +options = Options(; binary_operators = [+, -, *], unary_operators = [cos]) + + +for n in [10, 20] + SUITE["eval_tree_array"][n] = @benchmarkable(eval_tree_array($tree, X, $options), + evals=10, + 
samples=1000, + setup=(X = randn(Float32, 2, $n))) +end diff --git a/docs/make.jl b/docs/make.jl index e4219ec..74181a8 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,7 +18,6 @@ makedocs(; "Home" => "index.md", "API Reference" => "api.md", "Optimization hyperparameters" => "rules.md", - "Development" => "internals.md", ], checkdocs = :none) diff --git a/docs/src/api.md b/docs/src/api.md index 26fd83e..25d01f9 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -5,6 +5,7 @@ Depth = 1 # NetworkHistogram + diff --git a/docs/src/internals.md b/docs/src/internals.md deleted file mode 100644 index 04ce12a..0000000 --- a/docs/src/internals.md +++ /dev/null @@ -1,8 +0,0 @@ -# Notes on optimisation - -- We have three different assignment variables: - - current - - best - - proposal -- We need best because we might accept a proposal which is worse than the curent value. -- This is dealt within `accept_reject_update!()`. \ No newline at end of file diff --git a/docs/src/rules.md b/docs/src/rules.md index 510db50..5f57594 100644 --- a/docs/src/rules.md +++ b/docs/src/rules.md @@ -12,7 +12,7 @@ Pages = ["starting_assignment_rule.jl"] !!! note The groups will be of size `floor(h * n)` where `n` is the number of nodes if `h` is a float. If `h` is an integer, the groups will be of size `h`. The last group may be - smaller if `n` is not exactly divisible by the group size. + bigger if `n` is not exactly divisible by the group size. 
## Swapping rule From 4fd3fb15b0eed73114681257d74a7b61191bfc10 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 17 Oct 2024 16:01:03 +0200 Subject: [PATCH 016/266] format --- .JuliaFormatter.toml | 3 +- src/NetworkHistogram.jl | 1 - src/assignments/Assignments.jl | 2 +- src/assignments/BernoulliAssignment/struct.jl | 8 +-- src/assignments/BernoulliAssignment/swap.jl | 51 ++++++++++--------- src/fit.jl | 2 +- .../config_rules/bandwidth_selection_rule.jl | 12 +++-- src/optimisation/include.jl | 2 +- .../{opti.jl => least_squares.jl} | 0 src/sbm.jl | 2 +- test/runtests.jl | 2 +- test/utils.jl | 5 ++ 12 files changed, 51 insertions(+), 39 deletions(-) rename src/optimisation/{opti.jl => least_squares.jl} (100%) create mode 100644 test/utils.jl diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml index 453925c..9f18d98 100644 --- a/.JuliaFormatter.toml +++ b/.JuliaFormatter.toml @@ -1 +1,2 @@ -style = "sciml" \ No newline at end of file +style = "sciml" +margin = 92 diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 177dac8..bbd71fc 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -19,5 +19,4 @@ include("assignments/include.jl") @warn "User interface is not yet implemented" - end diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 220e570..914bcb6 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -39,7 +39,7 @@ end function get_edge_indices(a::Assignment, i::Int, j::Int) return [(x, y) for x in get_vertex_in_group(a, i) - for y in get_vertex_in_group(a, j)] + for y in get_vertex_in_group(a, j) if x!=y] end function get_edge_indices(a::Assignment, i::Int) diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index e210a0b..5db3b02 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -17,11 +17,13 @@ function BernoulliAssignment( end function 
make_assignment(g, h, init_rule::BernoulliInitRule) - group_size, node_labels = initialize_node_labels( + group_size, + node_labels = initialize_node_labels( g, h, init_rule.starting_assignment_rule) return BernoulliAssignment(g, group_size, node_labels) end +# might be worth using graph accessors instead of the adjacency matrix ? function make_bernoulli_data(g, node_labels, group_size) number_groups = length(group_size) n = length(node_labels) @@ -73,13 +75,13 @@ function log_likelihood(assignment::BernoulliAssignment) return assignment.additional_data.log_likelihood end +log_likelihood(a::BernoulliAssignment, g::Observations) = log_likelihood(a) + function force_recompute_ll(a::BernoulliAssignment, g::Observations) a_simple = Assignment(a.group_size, a.node_labels) return log_likelihood(a_simple, g) end -log_likelihood(a::BernoulliAssignment, g::Observations) = log_likelihood(a) - function get_ordered_adjacency_matrix(a::BernoulliAssignment) perm = sortperm(a.node_labels) return a.additional_data.A[perm, perm] diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index 596bb1d..8c3aaff 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -7,33 +7,32 @@ mutable struct BernoulliSwap{F} <: Swap node_labels::Vector{Int} end -function make_swap(assignment::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) where {T, F} - return BernoulliSwap(id[1], id[2], copy(assignment.additional_data.realized), - copy(assignment.additional_data.estimated_theta), - assignment.additional_data.log_likelihood, copy(assignment.node_labels)) +function make_swap(a::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) where {T, F} + return BernoulliSwap(id[1], id[2], copy(a.additional_data.realized), + copy(a.additional_data.estimated_theta), + a.additional_data.log_likelihood, copy(a.node_labels)) end -function make_swap!(swap::BernoulliSwap{F}, assignment::BernoulliAssignment{T, F}, 
+function make_swap!(swap::BernoulliSwap{F}, a::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) where {T, F} swap.index1, swap.index2 = id - copy!(swap.realized, assignment.additional_data.realized) - copy!(swap.estimated_theta, assignment.additional_data.estimated_theta) - swap.log_likelihood = assignment.additional_data.log_likelihood + copy!(swap.realized, a.additional_data.realized) + copy!(swap.estimated_theta, a.additional_data.estimated_theta) + swap.log_likelihood = a.additional_data.log_likelihood end function revert_swap!( - assignment::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - swap_node_labels!(assignment, swap.index1, swap.index2) - copy!(assignment.additional_data.realized, swap.realized) - copy!(assignment.additional_data.estimated_theta, swap.estimated_theta) - #copy!(assignment.node_labels, swap.node_labels) - assignment.additional_data.log_likelihood = swap.log_likelihood + a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} + swap_node_labels!(a, swap.index1, swap.index2) + copy!(a.additional_data.realized, swap.realized) + copy!(a.additional_data.estimated_theta, swap.estimated_theta) + a.additional_data.log_likelihood = swap.log_likelihood end function apply_swap!( - assignment::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - update_observed_and_labels!(assignment, swap) - update_ll!(assignment) + a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} + update_observed_and_labels!(a, swap) + update_ll!(a) end function update_observed_and_labels!( @@ -43,26 +42,30 @@ function update_observed_and_labels!( for i in axes(a.additional_data.A, 2) if i == swap.index1 || i == swap.index2 || - a.additional_data.A[swap.index1, i] == a.additional_data.A[swap.index2, i] + a.additional_data.A[swap.index1, i] == a.additional_data.A[swap.index2, ia] continue end group_inter = get_group_of_vertex(a, i) if a.additional_data.A[swap.index1, i] a.additional_data.realized[g1, group_inter] -= 1 - 
a.additional_data.realized[group_inter, g1] = a.additional_data.realized[ + a.additional_data.realized[ + group_inter, g1] = a.additional_data.realized[ g1, group_inter] a.additional_data.realized[g2, group_inter] += 1 - a.additional_data.realized[group_inter, g2] = a.additional_data.realized[ + a.additional_data.realized[ + group_inter, g2] = a.additional_data.realized[ g2, group_inter] end if a.additional_data.A[swap.index2, i] a.additional_data.realized[g2, group_inter] -= 1 - a.additional_data.realized[group_inter, g2] = a.additional_data.realized[ + a.additional_data.realized[ + group_inter, g2] = a.additional_data.realized[ g2, group_inter] a.additional_data.realized[g1, group_inter] += 1 - a.additional_data.realized[group_inter, g1] = a.additional_data.realized[ + a.additional_data.realized[ + group_inter, g1] = a.additional_data.realized[ g1, group_inter] end end @@ -83,11 +86,11 @@ function update_ll!(a::BernoulliAssignment) end function fit(a::BernoulliAssignment, g::Observations) - println("Fitting BernoulliAssignment") dists = initialize_sbm(a.group_size, Bernoulli(0.5)) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) - dists[group1, group2] = Bernoulli(a.additional_data.estimated_theta[ + dists[group1, + group2] = Bernoulli(a.additional_data.estimated_theta[ group1, group2]) end end diff --git a/src/fit.jl b/src/fit.jl index 55ccebb..03f2819 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -26,7 +26,7 @@ function _log_likelihood(a::Assignment, sbm::SBM, g) log_likelihood = 0.0 for i in 1:number_nodes(a) label_a = a.node_labels[i] - for j in (i + 1):number_nodes(a) + for j in (i+1):number_nodes(a) label_b = a.node_labels[j] log_likelihood += logdensityof(sbm[label_a, label_b], get_obs(g, i, j)) end diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 9991326..17776b8 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ 
b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -11,10 +11,13 @@ end Oracle bandwidth selection for graph histogram, using ```math -\\widehat{h^*}=\\left(2\\left(\\left(d^T d\\right)^{+}\\right)^2 d^T A d \\cdot \\hat{m} \\hat{b}\\right)^{-\\frac{1}{2}} \\hat{\\rho}_n^{\\frac{1}{4}}, +\\widehat{h^*}=\\left(2\\left(\\left(d^T d\\right)^{+}\\right)^2 d^T A d \\cdot \\hat{m} +\\hat{b}\\right)^{-\\frac{1}{2}} \\hat{\\rho}_n^{\\frac{1}{4}}, ``` -where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n`` is the empirical edge density, and ``m``, ``b`` are the slope and intercept fitted on ``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``. +where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n`` is the +empirical edge density, and ``m``, ``b`` are the slope and intercept fitted on +``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``. """ function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) if type ∉ ["eigs", "degs"] @@ -26,8 +29,8 @@ function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1 end n = size(A, 1) - midPt = collect(max(1, round(Int, (n ÷ 2 - c * sqrt(n)))):round(Int, - (n ÷ 2 + c * sqrt(n)))) + midPt = collect(max(1, round(Int, (n÷2-c*sqrt(n)))):round(Int, + (n÷2+c*sqrt(n)))) rhoHat_inv = inv(sum(A) / (n * (n - 1))) # Rank-1 graphon estimate via fhat(x,y) = mult*u(x)*u(y)*pinv(rhoHat); @@ -50,6 +53,5 @@ function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1 h = (2^(alpha + 1) * alpha * mult^2 * (lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * lmfit_coef[2]^2 * rhoHat_inv)^(-1 / (2 * (alpha + 1))) - #estMSqrd = 2*mult^2*(lmfit_coef[2]*length(uMid)/2+lmfit_coef[1])^2*lmfit_coef[2]^2*rhoHat_inv^2*(n+1)^2 return h[1] end diff --git a/src/optimisation/include.jl b/src/optimisation/include.jl index 28efb63..f3d51eb 100644 --- a/src/optimisation/include.jl +++ b/src/optimisation/include.jl @@ -1,2 +1,2 @@ 
include("swap.jl") -include("opti.jl") +include("least_squares.jl") diff --git a/src/optimisation/opti.jl b/src/optimisation/least_squares.jl similarity index 100% rename from src/optimisation/opti.jl rename to src/optimisation/least_squares.jl diff --git a/src/sbm.jl b/src/sbm.jl index da6a8d7..a8f4cd0 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -47,7 +47,7 @@ function sample(rng::Random.AbstractRNG, sbm::SBM, node_labels::Vector{Int}) A = BitMatrix(undef, n_nodes, n_nodes) for i in 1:n_nodes A[i, i] = zero(eltype(A)) - for j in (i + 1):n_nodes + for j in (i+1):n_nodes A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) A[j, i] = A[i, j] end diff --git a/test/runtests.jl b/test/runtests.jl index d785c62..dc481a0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,7 @@ using NetworkHistogram using Test -using JLD +include("utils.jl") @testset "NetworkHistogram.jl" begin include("test_swap.jl") diff --git a/test/utils.jl b/test/utils.jl new file mode 100644 index 0000000..a0cb036 --- /dev/null +++ b/test/utils.jl @@ -0,0 +1,5 @@ +function to_default_assignment(a_specialised::Assignment{T, B}) where {T, B} + return Assignment(a_specialised.group_size, a_specialised.node_labels) +end + +to_default_assignment(a::Assignment{T, Nothing}) where {T} = a From a286874411e8608b0f440fa1f242e3a1e1928b10 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 17 Oct 2024 19:09:40 +0200 Subject: [PATCH 017/266] add tests for assignments creation and swaps --- .JuliaFormatter.toml | 2 +- Project.toml | 1 + src/NetworkHistogram.jl | 6 ++- src/assignments/Assignments.jl | 37 +++++++++++------ src/assignments/BernoulliAssignment/struct.jl | 3 +- src/assignments/BernoulliAssignment/swap.jl | 8 ++-- .../CategoricalAssignment/struct.jl | 18 ++++---- src/{ => assignments}/group_numbering.jl | 0 src/assignments/include.jl | 1 + src/fit.jl | 22 ++++++---- src/observations.jl | 4 ++ src/optimisation/config_rules/InitRule.jl | 3 +- 
.../config_rules/bandwidth_selection_rule.jl | 12 ++++-- src/optimisation/config_rules/stop_rule.jl | 3 +- src/optimisation/config_rules/swap_rule.jl | 3 +- src/optimisation/least_squares.jl | 10 +++-- src/optimisation/swap.jl | 10 +++-- src/sbm.jl | 10 +++-- test/Project.toml | 6 +++ test/TestNetworkHistogram.jl | 30 ++++++++++++++ test/assignments/bernoulli_assignment.jl | 41 +++++++++++++++++++ test/assignments/categorical_assignment.jl | 0 test/assignments/default_assignment.jl | 16 ++++++++ test/runtests.jl | 9 ++-- test/test_swap.jl | 7 ---- test/utils.jl | 5 --- 26 files changed, 199 insertions(+), 68 deletions(-) rename src/{ => assignments}/group_numbering.jl (100%) create mode 100644 test/Project.toml create mode 100644 test/TestNetworkHistogram.jl create mode 100644 test/assignments/bernoulli_assignment.jl create mode 100644 test/assignments/categorical_assignment.jl create mode 100644 test/assignments/default_assignment.jl delete mode 100644 test/test_swap.jl delete mode 100644 test/utils.jl diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml index 9f18d98..01897c4 100644 --- a/.JuliaFormatter.toml +++ b/.JuliaFormatter.toml @@ -1,2 +1,2 @@ style = "sciml" -margin = 92 +margin = 75 diff --git a/Project.toml b/Project.toml index 9609784..60f9fbf 100644 --- a/Project.toml +++ b/Project.toml @@ -52,6 +52,7 @@ SimpleWeightedGraphs = "1.4.0" SparseArrays = "1.11.0" StaticArrays = "1.9.7" StatsBase = "0.33.21" +Test = "1.11.0" TranscodingStreams = "0.9.11" ValueHistories = "0.5.4" julia = "1.8" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index bbd71fc..84ecfcd 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -6,11 +6,13 @@ using Graphs using PermutationSymmetricTensors using ProgressMeter: Progress, next!, finish! 
import StatsBase, Random +using DensityInterface: logdensityof -include("group_numbering.jl") +import Distributions.fit + +include("assignments/Assignments.jl") include("sbm.jl") include("observations.jl") -include("assignments/Assignments.jl") include("fit.jl") include("optimisation/include.jl") diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 914bcb6..d4cb4c9 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -1,22 +1,35 @@ +include("group_numbering.jl") + struct Assignment{T, B} <: AbstractVector{Vector{Int}} group_size::GroupSize{T} node_labels::Vector{Int} additional_data::B - function Assignment(group_size::GroupSize{T}, node_labels::Vector{Int}, + function Assignment(group_size::GroupSize{T}, node_labels, additional_data::B) where {T, B} if length(node_labels) != sum(group_size) - throw(ArgumentError("The length of `node_labels` must be equal to the sum of `group_size`")) + throw(ArgumentError("The length of `node_labels` must be equal to the sum of \ + `group_size`")) end return new{T, B}(group_size, node_labels, additional_data) end end -function Assignment(group_size::GroupSize{T}, node_labels::Vector{Int}) where {T} +function Assignment(group_size::GroupSize, node_labels) if length(node_labels) != sum(group_size) - message = "The length of `node_labels` $(length(node_labels)) must be equal " - message *= "to the sum of `group_size` $(sum(group_size))" - throw(ArgumentError(message)) + throw(ArgumentError("The length of `node_labels` $(length(node_labels)) must be \ + equal to the sum of `group_size` $(sum(group_size))")) + end + c = StatsBase.countmap(node_labels) + if length(c) != length(group_size) + throw(ArgumentError("The number of unique elements in `node_labels` $(length(c)) \ + must be equal to the length of `group_size` $(length(group_size))")) + end + for (k, v) in c + if v != group_size[k] + throw(ArgumentError("The number of elements in `node_labels` $(v) for group \ + $(k) 
must be equal to the size of the group $(group_size[k])")) + end end return Assignment(group_size, node_labels, nothing) end @@ -29,26 +42,26 @@ function number_nodes(assignment::Assignment) return length(assignment.node_labels) end -function get_vertex_in_group(assignment::Assignment, group::Int) +function get_vertex_in_group(assignment::Assignment, group) return findall(assignment.node_labels .== group) end -function get_group_of_vertex(assignment::Assignment, vertex::Int) +function get_group_of_vertex(assignment::Assignment, vertex) return assignment.node_labels[vertex] end -function get_edge_indices(a::Assignment, i::Int, j::Int) +function get_edge_indices(a::Assignment, i, j) return [(x, y) for x in get_vertex_in_group(a, i) - for y in get_vertex_in_group(a, j) if x!=y] + for y in get_vertex_in_group(a, j) if x < y] end -function get_edge_indices(a::Assignment, i::Int) +function get_edge_indices(a::Assignment, i) nodes_i = get_vertex_in_group(a, i) return [(x, y) for x in nodes_i for y in nodes_i if x < y] end Base.size(a::Assignment) = (number_groups(a),) -Base.@propagate_inbounds function Base.getindex(a::Assignment, i::Int) +Base.@propagate_inbounds function Base.getindex(a::Assignment, i) @boundscheck checkbounds(a, i) return get_vertex_in_group(a, i) end diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index 5db3b02..40eed21 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -65,7 +65,8 @@ function compute_log_likelihood(estimated_theta, counts) for j in i:number_groups θ = estimated_theta[i, j] θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 
1 - 1e-14 : θ) - loglik += (θ_c * log(θ_c) + (1 - θ_c) * log(1 - θ_c)) * counts[i, j] + loglik += (θ_c * log(θ_c) + (1 - θ_c) * log(1 - θ_c)) * + counts[i, j] end end return loglik diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index 8c3aaff..d7192ae 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -7,7 +7,8 @@ mutable struct BernoulliSwap{F} <: Swap node_labels::Vector{Int} end -function make_swap(a::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) where {T, F} +function make_swap( + a::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) where {T, F} return BernoulliSwap(id[1], id[2], copy(a.additional_data.realized), copy(a.additional_data.estimated_theta), a.additional_data.log_likelihood, copy(a.node_labels)) @@ -42,7 +43,8 @@ function update_observed_and_labels!( for i in axes(a.additional_data.A, 2) if i == swap.index1 || i == swap.index2 || - a.additional_data.A[swap.index1, i] == a.additional_data.A[swap.index2, ia] + a.additional_data.A[swap.index1, i] == + a.additional_data.A[swap.index2, i] continue end group_inter = get_group_of_vertex(a, i) @@ -85,7 +87,7 @@ function update_ll!(a::BernoulliAssignment) return nothing end -function fit(a::BernoulliAssignment, g::Observations) +function fit_sbm(a::BernoulliAssignment, g::Observations) dists = initialize_sbm(a.group_size, Bernoulli(0.5)) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index b2c8fc6..54f4068 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -1,10 +1,10 @@ -mutable struct CategoricalData{F} - counts::Matrix{Int} - realized::Matrix{Int} - estimated_theta::Matrix{F} - A::Matrix{Int} # possible improvement by using an adjacency list Graphs.SimpleGraphs.adj(G) - log_likelihood::F -end +# 
mutable struct CategoricalData{F} +# counts::Matrix{Int} +# realized::Matrix{Int} +# estimated_theta::Matrix{F} +# A::Matrix{Int} # possible improvement by using an adjacency list Graphs.SimpleGraphs.adj(G) +# log_likelihood::F +# end -const CategoricalAssignment{T, F} = Assignment{T, CategoricalData{F}} -const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} +# const CategoricalAssignment{T, F} = Assignment{T, CategoricalData{F}} +# const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} diff --git a/src/group_numbering.jl b/src/assignments/group_numbering.jl similarity index 100% rename from src/group_numbering.jl rename to src/assignments/group_numbering.jl diff --git a/src/assignments/include.jl b/src/assignments/include.jl index e6e4e65..fd238a3 100644 --- a/src/assignments/include.jl +++ b/src/assignments/include.jl @@ -1 +1,2 @@ include("BernoulliAssignment/struct.jl") +include("CategoricalAssignment/struct.jl") diff --git a/src/fit.jl b/src/fit.jl index 03f2819..045ee83 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -1,25 +1,32 @@ # Slow fallback methods for the Assignment type # speed up by implementing specialized methods for the BernoulliAssignment type and others # method to compute estimator from node clustering as specified in assignment -function fit(a::Assignment, g::Observations) +function fit_sbm(a::Assignment, g::Observations) dists = initialize_sbm(a.group_size, g.dist_ref) for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) edge_indices = get_edge_indices(a, group1, group2) - dists[group1, group2] = _fit(g.dist_ref, g.graph, edge_indices) + dists[group1, + group2] = fit_group(g.dist_ref, g.graph, edge_indices) end end return dists end -function _fit(distribution, g, edges) - return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) +function fit_group(distribution, g, edges) + return fit(distribution, get_obs.(Ref(g), edges)) end +function fit(dist, data) + error("NetworkHistogram.fit 
method not implemented for \ + $(typeof(dist)) and $(typeof(data))") +end + +fit(d::Distribution, data) = fit(typeof(d), data) + # method to compute the log likelihood of a SBM fitted according to the assignment function log_likelihood(a::Assignment, g::Observations) - dists = fit(a, g) - return _log_likelihood(a, dists, g.graph) + return _log_likelihood(a, fit_sbm(a, g), g.graph) end function _log_likelihood(a::Assignment, sbm::SBM, g) @@ -28,7 +35,8 @@ function _log_likelihood(a::Assignment, sbm::SBM, g) label_a = a.node_labels[i] for j in (i+1):number_nodes(a) label_b = a.node_labels[j] - log_likelihood += logdensityof(sbm[label_a, label_b], get_obs(g, i, j)) + log_likelihood += logdensityof( + sbm[label_a, label_b], get_obs(g, i, j)) end end return log_likelihood diff --git a/src/observations.jl b/src/observations.jl index 8e11684..0bc0fec 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -17,5 +17,9 @@ function get_obs(g::SimpleGraph, x::Tuple) end function get_obs(g::SimpleGraph, i::Int, j::Int) + @warn "this is weird" return convert(Bool, has_edge(g, i, j)) end + +get_obs(g::AbstractArray, x) = get_obs(g, x[1], x[2]) +get_obs(g::AbstractArray, i, j) = g[i, j] diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 5a1eff1..76a3de3 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -8,7 +8,8 @@ struct InitRule{S <: StartingAssignment, I} end function make_assignment(g, h, init_rule::InitRule{S, Nothing}) where {S} - return Assignment(initialize_node_labels(g, h, init_rule.starting_assignment_rule)...) + return Assignment(initialize_node_labels( + g, h, init_rule.starting_assignment_rule)...) 
end """ diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 17776b8..4966043 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -1,6 +1,7 @@ @warn "Deprecated bandwidth selection needs to be updated" -function select_bandwidth(A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T} +function select_bandwidth( + A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T} h = oracle_bandwidth(A, type, alpha, c) return max(2, min(size(A)[1], round(Int, h))) end @@ -19,7 +20,8 @@ where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n` empirical edge density, and ``m``, ``b`` are the slope and intercept fitted on ``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``. """ -function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) +function oracle_bandwidth( + A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) if type ∉ ["eigs", "degs"] error("Invalid input type $(type)") end @@ -29,7 +31,8 @@ function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1 end n = size(A, 1) - midPt = collect(max(1, round(Int, (n÷2-c*sqrt(n)))):round(Int, + midPt = collect(max( + 1, round(Int, (n÷2-c*sqrt(n)))):round(Int, (n÷2+c*sqrt(n)))) rhoHat_inv = inv(sum(A) / (n * (n - 1))) @@ -51,7 +54,8 @@ function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1 lmfit_coef = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid h = (2^(alpha + 1) * alpha * mult^2 * - (lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * lmfit_coef[2]^2 * + (lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * + lmfit_coef[2]^2 * rhoHat_inv)^(-1 / (2 * (alpha + 1))) return h[1] end diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl index 1153978..850ec66 100644 --- 
a/src/optimisation/config_rules/stop_rule.jl +++ b/src/optimisation/config_rules/stop_rule.jl @@ -36,7 +36,8 @@ Returns a Bool with true if we should stop the optimization based on the `stop_r """ stopping_rule -function stopping_rule(assignment::Assignment, g, stop_rule::PreviousBestValue) +function stopping_rule( + assignment::Assignment, g, stop_rule::PreviousBestValue) score_value = score(assignment, g) if score_value > stop_rule.previous_best_value stop_rule.previous_best_value = score_value diff --git a/src/optimisation/config_rules/swap_rule.jl b/src/optimisation/config_rules/swap_rule.jl index cee5b7a..e6c716c 100644 --- a/src/optimisation/config_rules/swap_rule.jl +++ b/src/optimisation/config_rules/swap_rule.jl @@ -14,7 +14,8 @@ current assignment `node_assignment`. select_swap function select_swap(assignment::Assignment, ::RandomNodeSwap) - groups = StatsBase.sample(1:number_groups(assignment), 2; replace = false) + groups = StatsBase.sample( + 1:number_groups(assignment), 2; replace = false) index1 = rand(get_vertex_in_group(assignment, groups[1])) index2 = rand(get_vertex_in_group(assignment, groups[2])) return (index1, index2) diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index 8b3c070..703368d 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -9,9 +9,9 @@ function optimize(g, h = select_bandwidth(g); progress_bar::Bool = false ) a = make_assignment(g, h, initialise_rule) - println("Initial log likelihood: ", score(a, g)) initialise_stop_rule!(stop_rule, a, g) - greedy_improve!(a, g; max_iter, swap_rule, accept_rule, stop_rule, progress_bar) + greedy_improve!( + a, g; max_iter, swap_rule, accept_rule, stop_rule, progress_bar) return a end @@ -29,7 +29,8 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 1000, new_score_value = score_value for i in 1:max_iter score_value = new_score_value - local_search!(a, g, swap, swap_rule = swap_rule, accept_rule = accept_rule) 
+ local_search!( + a, g, swap, swap_rule = swap_rule, accept_rule = accept_rule) new_score_value = score(a, g) next!(p) if stopping_rule(a, g, stop_rule) @@ -41,7 +42,8 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 1000, end # perform local search by trying a swap and accepting it if it improves the likelihood -function local_search!(a::Assignment, g, swap::Swap = make_swap(a, (1, 1)); +function local_search!( + a::Assignment, g, swap::Swap = make_swap(a, (1, 1)); swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict() ) diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl index 08ef383..23c6ab6 100644 --- a/src/optimisation/swap.jl +++ b/src/optimisation/swap.jl @@ -9,13 +9,17 @@ function make_swap(::Assignment, id::Tuple{Int, Int}) return DefaultSwap(id[1], id[2]) end -function make_swap!(swap::DefaultSwap, a::Assignment, id::Tuple{Int, Int}) +function make_swap!(swap::DefaultSwap, ::Assignment, id::Tuple{Int, Int}) swap.index1, swap.index2 = id end -apply_swap!(a::Assignment, s::DefaultSwap) = swap_node_labels!(a, s.index1, s.index2) +function apply_swap!(a::Assignment, s::DefaultSwap) + swap_node_labels!(a, s.index1, s.index2) +end -revert_swap!(assignment::Assignment, swap::DefaultSwap) = apply_swap!(assignment, swap) +function revert_swap!(assignment::Assignment, swap::DefaultSwap) + apply_swap!(assignment, swap) +end function swap_node_labels!(a::Assignment, i, j) a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] diff --git a/src/sbm.jl b/src/sbm.jl index a8f4cd0..f2bd007 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -42,7 +42,8 @@ Base.@propagate_inbounds function Base.getindex(s::SBM, i, j) return getindex(s.probs, i, j) end -function sample(rng::Random.AbstractRNG, sbm::SBM, node_labels::Vector{Int}) +function sample( + rng::Random.AbstractRNG, sbm::SBM, node_labels::Vector{Int}) n_nodes = length(node_labels) A = BitMatrix(undef, n_nodes, n_nodes) for i in 1:n_nodes @@ -55,8 +56,11 @@ 
function sample(rng::Random.AbstractRNG, sbm::SBM, node_labels::Vector{Int}) return sparse(A), node_labels end -sample(sbm::SBM, node_labels::Vector{Int}) = sample(Random.default_rng(), sbm, node_labels) -function sample(rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) +function sample(sbm::SBM, node_labels::Vector{Int}) + sample(Random.default_rng(), sbm, node_labels) +end +function sample( + rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) n_blocks = number_blocks(sbm) node_labels = StatsBase.sample( rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..a23db61 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,6 @@ +[deps] +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/TestNetworkHistogram.jl b/test/TestNetworkHistogram.jl new file mode 100644 index 0000000..7a6f6ed --- /dev/null +++ b/test/TestNetworkHistogram.jl @@ -0,0 +1,30 @@ +module TestNetworkHistogram + +import NetworkHistogram as NH +using Test + +function to_default_assignment(a_specialised::NH.Assignment{T, B}) where {T, B} + return NH.Assignment(a_specialised.group_size, a_specialised.node_labels) +end + +to_default_assignment(a::NH.Assignment{T, Nothing}) where {T} = a + +function test_swap_revertible(a::NH.Assignment, swap::NH.Swap, g::NH.Observations) + a_test = deepcopy(a) + NH.apply_swap!(a_test, swap) + @test NH.get_group_of_vertex(a, swap.index1) == + NH.get_group_of_vertex(a_test, swap.index2) + @test NH.get_group_of_vertex(a, swap.index2) == + NH.get_group_of_vertex(a_test, swap.index1) + + # force recomputation of the log likelihood using default assignment + a_new = to_default_assignment(a_test) + @test NH.log_likelihood(a_new, g) ≈ 
NH.log_likelihood(a_test, g) + + # revert the swap and check if the assignment is the same as before + NH.revert_swap!(a_test, swap) + @test a == a_test + @test NH.log_likelihood(a, g) ≈ NH.log_likelihood(a_test, g) +end + +end diff --git a/test/assignments/bernoulli_assignment.jl b/test/assignments/bernoulli_assignment.jl new file mode 100644 index 0000000..5e09115 --- /dev/null +++ b/test/assignments/bernoulli_assignment.jl @@ -0,0 +1,41 @@ +import NetworkHistogram as NH + +@testset "test construction Bernoulli assignment" begin + using Distributions: Bernoulli + A = [0 1 1 1 0 0 1 0 + 1 0 1 1 0 0 0 0 + 1 1 0 0 0 0 0 0 + 1 1 0 0 0 0 0 1 + 0 0 0 0 0 1 1 1 + 0 0 0 0 1 0 1 1 + 1 0 0 0 1 1 0 0 + 0 0 0 1 1 1 0 0] + obs = NH.Observations(A, Bernoulli(0.5)) + node_labels = [1, 1, 1, 1, 2, 2, 2, 2] + group_size = NH.GroupSize(8, 4) + a = NH.BernoulliAssignment(obs, group_size, node_labels) + for i in 1:8 + @test NH.get_group_of_vertex(a, i) == node_labels[i] + end + @test all(a.additional_data.A .== A) + @test a.additional_data.realized == [5 2; 2 5] + @test a.additional_data.counts == [6 16; 16 6] + @test a.additional_data.estimated_theta == [5/6 1/8; 1/8 5/6] +end + +@testset "test Bernoulli swap" begin + using ..TestNetworkHistogram: test_swap_revertible + using Distributions: Bernoulli + A = [0 1 1 1 0 0 1 0 + 1 0 1 1 0 0 0 0 + 1 1 0 0 0 0 0 0 + 1 1 0 0 0 0 0 1 + 0 0 0 0 0 1 1 1 + 0 0 0 0 1 0 1 1 + 1 0 0 0 1 1 0 0 + 0 0 0 1 1 1 0 0] + obs = NH.Observations(A, Bernoulli(0.5)) + a = NH.BernoulliAssignment(obs, NH.GroupSize(8, 4), [1, 1, 1, 1, 2, 2, 2, 2]) + swap = NH.make_swap(a, (1, 2)) + test_swap_revertible(a, swap, obs) +end diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl new file mode 100644 index 0000000..e69de29 diff --git a/test/assignments/default_assignment.jl b/test/assignments/default_assignment.jl new file mode 100644 index 0000000..97d656a --- /dev/null +++ b/test/assignments/default_assignment.jl @@ 
-0,0 +1,16 @@ +import NetworkHistogram as NH + +@testset "test default swap" begin + using ..TestNetworkHistogram: test_swap_revertible + import Random, LinearAlgebra + using Distributions: Bernoulli + Random.seed!(1234) + n = 20 + k = 5 + data = LinearAlgebra.Symmetric(Random.rand(Bool,n,n)) + g = NH.Observations(data, Bernoulli(0.5)) + labels = repeat(1:n÷k, inner = k) + a = NH.Assignment(NH.GroupSize(n,k),labels) + swap = NH.DefaultSwap(1,2) + test_swap_revertible(a, swap, g) +end diff --git a/test/runtests.jl b/test/runtests.jl index dc481a0..d8fd4cd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,8 +1,9 @@ -using NetworkHistogram using Test -include("utils.jl") +include("TestNetworkHistogram.jl") -@testset "NetworkHistogram.jl" begin - include("test_swap.jl") +@testset "Assignment tests" begin + include("assignments/default_assignment.jl") + include("assignments/bernoulli_assignment.jl") + include("assignments/categorical_assignment.jl") end diff --git a/test/test_swap.jl b/test/test_swap.jl deleted file mode 100644 index 30dc1a0..0000000 --- a/test/test_swap.jl +++ /dev/null @@ -1,7 +0,0 @@ -@testset "swap test" begin - # given a::Assignment and s::Swap - # save copy assignment - a_ref = deepcopy(a) - revert_swap!(make_swap!(a), s) - @assert a == a_ref -end diff --git a/test/utils.jl b/test/utils.jl deleted file mode 100644 index a0cb036..0000000 --- a/test/utils.jl +++ /dev/null @@ -1,5 +0,0 @@ -function to_default_assignment(a_specialised::Assignment{T, B}) where {T, B} - return Assignment(a_specialised.group_size, a_specialised.node_labels) -end - -to_default_assignment(a::Assignment{T, Nothing}) where {T} = a From c795929e83a101dc7179d93132fe71111fb3bee8 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 18 Oct 2024 15:19:34 +0200 Subject: [PATCH 018/266] found fastest ll computation --- Project.toml | 4 + src/assignments/BernoulliAssignment/struct.jl | 12 ++- .../CategoricalAssignment/struct.jl | 77 ++++++++++++++++--- 
test/assignments/default_assignment.jl | 7 +- 4 files changed, 80 insertions(+), 20 deletions(-) diff --git a/Project.toml b/Project.toml index 60f9fbf..aaeb7d4 100644 --- a/Project.toml +++ b/Project.toml @@ -18,6 +18,8 @@ HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8" Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" MetaGraphsNext = "fa8bd995-216d-47f1-8a91-f3b68fbeb377" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" @@ -44,6 +46,8 @@ Graphs = "1.9.0" HTTP = "1.7.4" JLD = "0.13.3" Kronecker = "0.5" +LogExpFunctions = "0.3.28" +LoopVectorization = "0.12.171" MetaGraphsNext = "0.7.0" Metis = "1.5.0" PermutationSymmetricTensors = "0.2.0" diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index 40eed21..066eb3e 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -58,15 +58,13 @@ function convert_bitmatrix(g::Observations{<:AbstractMatrix, D}) where {D} return convert(BitMatrix, g.graph) end -function compute_log_likelihood(estimated_theta, counts) +function compute_log_likelihood(estimated_theta::AbstractMatrix, counts::AbstractMatrix) number_groups = size(estimated_theta, 1) - loglik = 0.0 - @inbounds @simd for i in 1:number_groups - for j in i:number_groups + loglik = zero(eltype(estimated_theta)) + @inbounds for j in 1:number_groups + @simd for i in j:number_groups θ = estimated_theta[i, j] - θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 
1 - 1e-14 : θ) - loglik += (θ_c * log(θ_c) + (1 - θ_c) * log(1 - θ_c)) * - counts[i, j] + loglik += xlogx(θ) + xlogx(1 - θ) * counts[i, j] end end return loglik diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 54f4068..2aa1801 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -1,10 +1,67 @@ -# mutable struct CategoricalData{F} -# counts::Matrix{Int} -# realized::Matrix{Int} -# estimated_theta::Matrix{F} -# A::Matrix{Int} # possible improvement by using an adjacency list Graphs.SimpleGraphs.adj(G) -# log_likelihood::F -# end - -# const CategoricalAssignment{T, F} = Assignment{T, CategoricalData{F}} -# const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} +mutable struct CategoricalData{F} + counts::Matrix{Int} + realized::Matrix{Array{Int,2}} + estimated_theta::Matrix{Array{F,2}} + A::Matrix{Int} + log_likelihood::F +end + +const CategoricalAssignment{T, F} = Assignment{T, CategoricalData{F}} +const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} + +function CategoricalAssignment( + g, group_size::GroupSize, node_labels::Vector{Int}) + categorical_data = make_categorical_data(g, node_labels, group_size) + return Assignment(group_size, node_labels, categorical_data) +end + +function make_assignment(g, h, init_rule::CategoricalInitRule) + group_size, + node_labels = initialize_node_labels( + g, h, init_rule.starting_assignment_rule) + return CategoricalAssignment(g, group_size, node_labels) +end + +function make_categorical_data(g, node_labels, group_size) + number_groups = length(group_size) + n = length(node_labels) + A, num_categories = categorical_matrix(g) + counts = zeros(Int, number_groups, number_groups) + realized = zeros(Int, number_groups, number_groups, num_categories) + @inbounds @simd for k in 1:number_groups + for l in k:number_groups + for m in 1:num_categories + if k == l + c = 
group_size[k] * (group_size[k] - 1) ÷ 2 + r = sum(A[node_labels .== k, node_labels .== l] .== m)/2 + else + c = group_size[k] * group_size[l] + r = sum(A[node_labels .== k, node_labels .== l] .== m) + end + realized[k, l, m] = r + realized[l, k, m] = realized[k, l, m] + counts[k, l] = c + counts[l, k] = c + end + end + end + + estimated_theta = realized ./ counts + ll = compute_log_likelihood(estimated_theta, counts) + return CategoricalData(counts, realized, estimated_theta, A, ll) +end + + + +function compute_log_likelihood_1(estimated_theta, counts) + loglik = 0.0 + @inbounds @simd for coord in CartesianIndices(estimated_theta) + if coord[2] <= coord[1] + θ = estimated_theta[coord] + θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 1 - 1e-14 : θ) + loglik += (θ_c * log(θ_c) + (1 - θ_c) * log(1 - θ_c)) * + counts[coord] + end + end + return loglik +end diff --git a/test/assignments/default_assignment.jl b/test/assignments/default_assignment.jl index 97d656a..06c5fa9 100644 --- a/test/assignments/default_assignment.jl +++ b/test/assignments/default_assignment.jl @@ -3,12 +3,13 @@ import NetworkHistogram as NH @testset "test default swap" begin using ..TestNetworkHistogram: test_swap_revertible import Random, LinearAlgebra - using Distributions: Bernoulli + using Distributions: Bernoulli, Normal Random.seed!(1234) n = 20 k = 5 - data = LinearAlgebra.Symmetric(Random.rand(Bool,n,n)) - g = NH.Observations(data, Bernoulli(0.5)) + #data = LinearAlgebra.Symmetric(Random.rand(Bool,n,n)) + data = Random.rand(Normal(), n,n) + g = NH.Observations(data, Normal(0,1)) labels = repeat(1:n÷k, inner = k) a = NH.Assignment(NH.GroupSize(n,k),labels) swap = NH.DefaultSwap(1,2) From 75e3f14d3695b7e0ee74826cbf32b3b621316de7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 18 Oct 2024 17:49:08 +0200 Subject: [PATCH 019/266] fix make data categorical --- src/NetworkHistogram.jl | 3 +- src/assignments/BernoulliAssignment/struct.jl | 9 +++- .../CategoricalAssignment/struct.jl | 49 ++++++++++++------- 3 
files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 84ecfcd..d8e50cd 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -7,7 +7,8 @@ using PermutationSymmetricTensors using ProgressMeter: Progress, next!, finish! import StatsBase, Random using DensityInterface: logdensityof - +using StaticArrays: MVector, MMatrix +using LogExpFunctions: xlogx, xlogy import Distributions.fit include("assignments/Assignments.jl") diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index 066eb3e..9ec22d3 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -30,6 +30,10 @@ function make_bernoulli_data(g, node_labels, group_size) counts = zeros(Int, number_groups, number_groups) realized = zeros(Int, number_groups, number_groups) A = convert_bitmatrix(g) + + # below needs to be abstracted: not sure how diagonal is handled if nonzero + # addtioally, we should be able to deal with missing values ! 
+ # This concerns the counts matrix above as well @inbounds @simd for k in 1:number_groups for l in k:number_groups realized[k, l] = sum(A[node_labels .== k, node_labels .== l]) @@ -58,13 +62,14 @@ function convert_bitmatrix(g::Observations{<:AbstractMatrix, D}) where {D} return convert(BitMatrix, g.graph) end -function compute_log_likelihood(estimated_theta::AbstractMatrix, counts::AbstractMatrix) +function compute_log_likelihood(estimated_theta::AbstractMatrix{F}, + counts::AbstractMatrix{T}) where {F <: Real, T <: Real} number_groups = size(estimated_theta, 1) loglik = zero(eltype(estimated_theta)) @inbounds for j in 1:number_groups @simd for i in j:number_groups θ = estimated_theta[i, j] - loglik += xlogx(θ) + xlogx(1 - θ) * counts[i, j] + loglik += (xlogx(θ) + xlogx(1 - θ)) * counts[i, j] end end return loglik diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 2aa1801..9a934bd 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -1,8 +1,8 @@ -mutable struct CategoricalData{F} +mutable struct CategoricalData{F, M} counts::Matrix{Int} - realized::Matrix{Array{Int,2}} - estimated_theta::Matrix{Array{F,2}} - A::Matrix{Int} + realized::Matrix{MVector{M, Int}} + estimated_theta::Matrix{MVector{M, F}} + A::Matrix{Int} # possible use of CategoricalArrays.jl ? 
log_likelihood::F end @@ -27,19 +27,23 @@ function make_categorical_data(g, node_labels, group_size) n = length(node_labels) A, num_categories = categorical_matrix(g) counts = zeros(Int, number_groups, number_groups) - realized = zeros(Int, number_groups, number_groups, num_categories) + realized = [MVector{num_categories}(zeros(Int, num_categories)) + for _ in 1:number_groups, _ in 1:number_groups] + + # this is incorrect if the diagonal of the matrix is anything + # else than 0, and that no "categories" is represented by 0 @inbounds @simd for k in 1:number_groups for l in k:number_groups for m in 1:num_categories if k == l c = group_size[k] * (group_size[k] - 1) ÷ 2 - r = sum(A[node_labels .== k, node_labels .== l] .== m)/2 + r = sum(A[node_labels .== k, node_labels .== l] .== m) ÷ 2 else c = group_size[k] * group_size[l] r = sum(A[node_labels .== k, node_labels .== l] .== m) end - realized[k, l, m] = r - realized[l, k, m] = realized[k, l, m] + realized[k, l][m] = r + realized[l, k][m] = r counts[k, l] = c counts[l, k] = c end @@ -47,21 +51,28 @@ function make_categorical_data(g, node_labels, group_size) end estimated_theta = realized ./ counts - ll = compute_log_likelihood(estimated_theta, counts) + ll = compute_log_likelihood(estimated_theta, realized) return CategoricalData(counts, realized, estimated_theta, A, ll) end - - -function compute_log_likelihood_1(estimated_theta, counts) - loglik = 0.0 - @inbounds @simd for coord in CartesianIndices(estimated_theta) - if coord[2] <= coord[1] - θ = estimated_theta[coord] - θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 
1 - 1e-14 : θ) - loglik += (θ_c * log(θ_c) + (1 - θ_c) * log(1 - θ_c)) * - counts[coord] +function compute_log_likelihood( + estimated_theta::Matrix{MVector{M, T}}, counts::Matrix{F}) where {M, T, F} + loglik = zero(T) + number_groups = size(estimated_theta, 1) + @inbounds for j in 1:number_groups + @simd for i in j:number_groups + c = counts[i, j] + loglik += sum(xlogx.(estimated_theta[i, j]) .* c) end end return loglik end + +# to update, just for test now +function categorical_matrix(A::AbstractArray{<:Integer}) + A_inter = A .- minimum(A) .+ 1 + for i in 1:size(A_inter, 1) + A_inter[i, i] = 0 + end + return A_inter, maximum(A_inter) +end From cc54cc45f8f85121f009fc974eb2ae29e200fe01 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 18 Oct 2024 18:10:06 +0200 Subject: [PATCH 020/266] remove useless memeory in BernoulliSwap --- src/assignments/BernoulliAssignment/swap.jl | 3 +-- src/assignments/CategoricalAssignment/struct.jl | 7 ++++++- src/assignments/CategoricalAssignment/swap.jl | 5 ++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index d7192ae..fb56ae1 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -4,14 +4,13 @@ mutable struct BernoulliSwap{F} <: Swap realized::Matrix{Int} estimated_theta::Matrix{F} log_likelihood::F - node_labels::Vector{Int} end function make_swap( a::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) where {T, F} return BernoulliSwap(id[1], id[2], copy(a.additional_data.realized), copy(a.additional_data.estimated_theta), - a.additional_data.log_likelihood, copy(a.node_labels)) + a.additional_data.log_likelihood) end function make_swap!(swap::BernoulliSwap{F}, a::BernoulliAssignment{T, F}, diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 9a934bd..1071a83 100644 --- 
a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -56,7 +56,7 @@ function make_categorical_data(g, node_labels, group_size) end function compute_log_likelihood( - estimated_theta::Matrix{MVector{M, T}}, counts::Matrix{F}) where {M, T, F} + estimated_theta::AbstractMatrix{MVector{M, T}}, counts::AbstractMatrix{F}) where {M, T, F} loglik = zero(T) number_groups = size(estimated_theta, 1) @inbounds for j in 1:number_groups @@ -76,3 +76,8 @@ function categorical_matrix(A::AbstractArray{<:Integer}) end return A_inter, maximum(A_inter) end + + +function categorical_matrix(g::Observations{AbstractMatrix{<:Integer},I}) where {I} + return categorical_matrix(g.graph) +end diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index d6a721b..392488e 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -1,4 +1,7 @@ -mutable struct CategoricalSwap{F} <: Swap +mutable struct CategoricalSwap{M,F} <: Swap index1::Int index2::Int + realized::Matrix{MVector{M, Int}} + estimated_theta::Matrix{MVector{M, F}} + log_likelihood::F end From 798ce65c66758ce73002489da34abb1e12ca4e9e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 20 Oct 2024 11:01:44 +0200 Subject: [PATCH 021/266] working cat assginment before switching to adj list --- src/NetworkHistogram.jl | 1 + src/assignments/AggregAssignment/struct.jl | 7 ++ src/assignments/BernoulliAssignment/struct.jl | 1 - src/assignments/CategoricalAssignment/swap.jl | 108 +++++++++++++++++- src/observations.jl | 5 +- src/sbm.jl | 3 +- test/assignments/categorical_assignment.jl | 26 +++++ 7 files changed, 147 insertions(+), 4 deletions(-) create mode 100644 src/assignments/AggregAssignment/struct.jl diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index d8e50cd..85bc3a5 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -10,6 +10,7 @@ using 
DensityInterface: logdensityof using StaticArrays: MVector, MMatrix using LogExpFunctions: xlogx, xlogy import Distributions.fit +using LoopVectorization: @turbo include("assignments/Assignments.jl") include("sbm.jl") diff --git a/src/assignments/AggregAssignment/struct.jl b/src/assignments/AggregAssignment/struct.jl new file mode 100644 index 0000000..c7a8ad2 --- /dev/null +++ b/src/assignments/AggregAssignment/struct.jl @@ -0,0 +1,7 @@ +struct AggregData{C, R, E, D, F} + counts::AbstractMatrix{C} + realized::AbstractMatrix{R} + estimated_theta::AbstractMatrix{E} + A::D + log_likelihood::F +end diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index 9ec22d3..e759881 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -9,7 +9,6 @@ end const BernoulliAssignment{T, F} = Assignment{T, BernoulliData{F}} const BernoulliInitRule{S, F} = InitRule{S, Val{BernoulliData}} -# is this type stable? should this be BernoulliAssignment{T,F}? 
see line 8 above function BernoulliAssignment( g, group_size::GroupSize, node_labels::Vector{Int}) bernoulli_data = make_bernoulli_data(g, node_labels, group_size) diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 392488e..51b19b3 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -1,7 +1,113 @@ -mutable struct CategoricalSwap{M,F} <: Swap +mutable struct CategoricalSwap{M, F} <: Swap index1::Int index2::Int realized::Matrix{MVector{M, Int}} estimated_theta::Matrix{MVector{M, F}} log_likelihood::F end + +function make_swap(a::CategoricalAssignment, id::Tuple{Int, Int}) + return CategoricalSwap(id[1], id[2], copy(a.additional_data.realized), + copy(a.additional_data.estimated_theta), + a.additional_data.log_likelihood) +end + +function make_swap!(swap::CategoricalSwap{M, F}, a::CategoricalAssignment{T, M, F}, + id::Tuple{Int, Int}) where {T, M, F} + swap.index1, swap.index2 = id + copy!(swap.realized, a.additional_data.realized) + copy!(swap.estimated_theta, a.additional_data.estimated_theta) + swap.log_likelihood = a.additional_data.log_likelihood +end + +function revert_swap!( + a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where {T, M, F} + swap_node_labels!(a, swap.index1, swap.index2) + copy!(a.additional_data.realized, swap.realized) + copy!(a.additional_data.estimated_theta, swap.estimated_theta) + a.additional_data.log_likelihood = swap.log_likelihood +end + +function apply_swap!( + a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where {T, M, F} + update_observed_and_labels!(a, swap) + update_ll!(a) +end + +function update_ll!(a::CategoricalAssignment) + a.additional_data.log_likelihood = compute_log_likelihood( + a.additional_data.estimated_theta, a.additional_data.counts) + return nothing +end + +function fit_sbm(a::CategoricalAssignment{T, M, F}, g::Observations) where {T, M, F} + dists = 
initialize_sbm(a.group_size, Categorical(ones(M)/M)) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + dists[group1, + group2] = Categorical(a.additional_data.estimated_theta[ + group1, group2]) + end + end + return dists +end + +function update_observed_and_labels!( + a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where {T, M, F} + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + + adj_1 = @view a.additional_data.A[:, swap.index1] + adj_2 = @view a.additional_data.A[:, swap.index2] + realized_g1 = @view a.additional_data.realized[:, g1] + realized_g2 = @view a.additional_data.realized[:, g2] + + @inbounds @fastmath for i in axes(a.additional_data.A, 1) + index_1 = adj_1[i] + index_2 = adj_2[i] + if i == swap.index1 || i == swap.index2 || index_1 == index_2 + + else + group_inter = get_group_of_vertex(a, i) + + a_g1_g_inter = a.additional_data.realized[g1, group_inter] + a_g2_g_inter = a.additional_data.realized[g2, group_inter] + a_g_inter_g1 = realized_g1[group_inter] + a_g_inter_g2 = realized_g2[group_inter] + + # send from group 1 to group 2 + a_g1_g_inter[index_1] -= 1 + a_g_inter_g1[index_1] -= 1 + + a_g2_g_inter[index_2] += 1 + a_g_inter_g2[index_2] += 1 + + # send from group 2 to group 1 + a_g2_g_inter[index_2] -= 1 + a_g_inter_g2[index_2] -= 1 + + a_g1_g_inter[index_1] += 1 + a_g_inter_g1[index_1] += 1 + end + end + + _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, + a.additional_data.counts) + + # swap of the labels should happen after the update of the realized and estimated_theta + # for the above loop to work correctly + swap_node_labels!(a, swap.index1, swap.index2) + return nothing +end + + +function _fast_div!(theta, realized, counts) + for j in axes(theta, 2) + for i in axes(theta, 1) + t = theta[i, j] + for k in axes(t, 1) + theta[i, j][k] = realized[i, j][k] / counts[i, j] + end + end + end +end diff --git a/src/observations.jl 
b/src/observations.jl index 0bc0fec..821b7c8 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -4,10 +4,13 @@ struct Observations{G, D} dist_ref::D end -function number_nodes(g::Observations) +function number_nodes(g::Observations{AbstractGraph, D}) where {D} return nv(g.graph) end +function number_nodes(g::Observations) + return size(g.graph,1) +end function get_obs(g::Observations, x::Tuple) return get_obs(g.graph, x[1], x[2]) end diff --git a/src/sbm.jl b/src/sbm.jl index f2bd007..d3283f9 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -45,7 +45,8 @@ end function sample( rng::Random.AbstractRNG, sbm::SBM, node_labels::Vector{Int}) n_nodes = length(node_labels) - A = BitMatrix(undef, n_nodes, n_nodes) + type_input = eltype(sbm.probs[1,1]) + A = Matrix{type_input}(undef, n_nodes, n_nodes) for i in 1:n_nodes A[i, i] = zero(eltype(A)) for j in (i+1):n_nodes diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl index e69de29..2525d32 100644 --- a/test/assignments/categorical_assignment.jl +++ b/test/assignments/categorical_assignment.jl @@ -0,0 +1,26 @@ +import NetworkHistogram as NH + +@testset "test conversion to categorical observations" begin +end + +@testset "test Categorical swap" begin + using ..TestNetworkHistogram: test_swap_revertible + using Distributions: Categorical + using LinearAlgebra: Symmetric + import Random + m = 5 + p = ones(m) ./ m + n = 12 + k = 3 + dist = Categorical(p) + A = Symmetric(Random.rand(dist, n, n)) + # set the diagonal to 0 + for i in 1:n + A[i, i] = 0 + end + obs = NH.Observations(A, dist) + node_labels = repeat(1:k, inner = n÷k) + a = NH.CategoricalAssignment(obs, NH.GroupSize(n, n÷k), node_labels) + swap = NH.make_swap(a, (1, k+1)) + test_swap_revertible(a, swap, obs) +end From 29f59cf87a8a55bd362d51022a2cd4e27376c4cf Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 11:16:30 +0200 Subject: [PATCH 022/266] fix cat assignments --- 
src/assignments/CategoricalAssignment/struct.jl | 15 +++++++++++---- src/observations.jl | 1 - 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 1071a83..d45ea30 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -1,4 +1,4 @@ -mutable struct CategoricalData{F, M} +mutable struct CategoricalData{M, F} counts::Matrix{Int} realized::Matrix{MVector{M, Int}} estimated_theta::Matrix{MVector{M, F}} @@ -6,7 +6,7 @@ mutable struct CategoricalData{F, M} log_likelihood::F end -const CategoricalAssignment{T, F} = Assignment{T, CategoricalData{F}} +const CategoricalAssignment{T, M, F} = Assignment{T, CategoricalData{M, F}} const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} function CategoricalAssignment( @@ -69,7 +69,7 @@ function compute_log_likelihood( end # to update, just for test now -function categorical_matrix(A::AbstractArray{<:Integer}) +function categorical_matrix(A) A_inter = A .- minimum(A) .+ 1 for i in 1:size(A_inter, 1) A_inter[i, i] = 0 @@ -78,6 +78,13 @@ function categorical_matrix(A::AbstractArray{<:Integer}) end -function categorical_matrix(g::Observations{AbstractMatrix{<:Integer},I}) where {I} +function categorical_matrix(g::Observations) return categorical_matrix(g.graph) end + + +function log_likelihood(a::CategoricalAssignment, g::Observations) + return a.additional_data.log_likelihood +end + +include("swap.jl") diff --git a/src/observations.jl b/src/observations.jl index 821b7c8..b2bb0e4 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -20,7 +20,6 @@ function get_obs(g::SimpleGraph, x::Tuple) end function get_obs(g::SimpleGraph, i::Int, j::Int) - @warn "this is weird" return convert(Bool, has_edge(g, i, j)) end From 5bd3dd82e59234e11203ac0428c05082cf4c7a27 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 11:22:29 +0200 Subject: 
[PATCH 023/266] clean --- src/assignments/AggregAssignment/struct.jl | 7 ------- src/observations.jl | 1 + src/optimisation/config_rules/bandwidth_selection_rule.jl | 5 +++++ 3 files changed, 6 insertions(+), 7 deletions(-) delete mode 100644 src/assignments/AggregAssignment/struct.jl diff --git a/src/assignments/AggregAssignment/struct.jl b/src/assignments/AggregAssignment/struct.jl deleted file mode 100644 index c7a8ad2..0000000 --- a/src/assignments/AggregAssignment/struct.jl +++ /dev/null @@ -1,7 +0,0 @@ -struct AggregData{C, R, E, D, F} - counts::AbstractMatrix{C} - realized::AbstractMatrix{R} - estimated_theta::AbstractMatrix{E} - A::D - log_likelihood::F -end diff --git a/src/observations.jl b/src/observations.jl index b2bb0e4..f0d1be6 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -11,6 +11,7 @@ end function number_nodes(g::Observations) return size(g.graph,1) end + function get_obs(g::Observations, x::Tuple) return get_obs(g.graph, x[1], x[2]) end diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 4966043..980f02b 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -1,5 +1,10 @@ @warn "Deprecated bandwidth selection needs to be updated" +function select_number_node_per_bloc(g::Observations) + h = orcacle_bandwidth(g.graph) + return max(2, min(number_nodes(g), round(Int, h))) +end + function select_bandwidth( A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T} h = oracle_bandwidth(A, type, alpha, c) From d5973306bd0bbd99f80a54c2b85c20bcfcda57be Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 12:20:18 +0200 Subject: [PATCH 024/266] clean bandwidth selection --- src/observations.jl | 8 ++ .../config_rules/bandwidth_selection_rule.jl | 90 +++++++++++++++++-- 2 files changed, 90 insertions(+), 8 deletions(-) diff --git a/src/observations.jl 
b/src/observations.jl index f0d1be6..a57c6c9 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -26,3 +26,11 @@ end get_obs(g::AbstractArray, x) = get_obs(g, x[1], x[2]) get_obs(g::AbstractArray, i, j) = g[i, j] + +function density(g::AbstractGraph) + return Graphs.density(g) +end + +function density(g::AbstractMatrix{Bool}) + return sum(g) / ((size(g, 1) * (size(g, 1) - 1))*2) +end diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 980f02b..ce111af 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -1,14 +1,88 @@ -@warn "Deprecated bandwidth selection needs to be updated" +abstract type KSelectionRule end +struct OracleK <: KSelectionRule + K::Int +end +struct OracleM <: KSelectionRule + M::Int + α::Float64 +end + +function OracleM(M::Int) + return OracleM(M, 1.0) +end + +abstract type EstimatedM <: KSelectionRule end +struct EstimatedEigenvalues <: EstimatedM end +struct EstimatedDegrees <: EstimatedM end + +""" + select_number_node_per_block(g::Observations, rule::KSelectionRule) + +How to select the number of blocks `K` for the SBM model. + +# Implemented rules +- `OracleK(K::Int)`: Use the oracle number of blocks `K`. +- `OracleM(M::Int)`: Give the Holder constant `M` of the graphon, use the results from +[Olhede and Wolfe (2014)](https://www.pnas.org/doi/epdf/10.1073/pnas.1400374111) to estimate +the number of blocks `K`. +- `EstimatedEigenvalues()`: Use the estimated eigenvalues of the adjacency matrix to +estimate the Holder constant and then use `OracleM` to estimate the number of blocks `K`. +- `EstimatedDegrees()`: Use the estimated degrees of the adjacency matrix to estimate the +Holder constant and then use `OracleM` to estimate the number of blocks `K`. + + +..note: + - The number of blocks `K` should be at most `n/2` where `n` is the number of nodes in + the graph. 
+ - The estimated Holder constant `M` comes from equation (11) in Olhede and Wolfe (2014). +""" +select_number_node_per_block + +function select_number_node_per_block(g::Observations, rule::OracleK) + if rule.K > number_nodes(g)÷2 + error("The number of blocks $K is too large for the number of nodes \ + $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)") + end + return rule.K +end + +function select_number_node_per_block(g::Observations, rule::OracleM) + rho = density(g.graph) + n = number_nodes(g) + k = max(2, round(Int,2*rule.M^2*rho)^(-1/4)*sqrt(n)) + return select_number_node_per_block(g, OracleK(k)) +end -function select_number_node_per_bloc(g::Observations) - h = orcacle_bandwidth(g.graph) - return max(2, min(number_nodes(g), round(Int, h))) +function select_number_node_per_block(g::Observations, rule::EstimatedM) + n = number_nodes(g) + c = min(4, sqrt(n) / 8) + number_points_from_mid = round(Int, c * sqrt(n)) + mid_points = collect(max(1, n÷2-number_points_from_mid):(n÷2+number_points_from_mid)) + rho = density(g) + M = estimated_holder_constant(g, rule, mid_points, rho) + return select_number_node_per_block(g, OracleM(M)) end -function select_bandwidth( - A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T} - h = oracle_bandwidth(A, type, alpha, c) - return max(2, min(size(A)[1], round(Int, h))) +function estimated_holder_constant(g::Observations, ::EstimatedEigenvalues, points, rho) + eig_res = eigs(g.graph, nev = 1, which = :LM) + u = eig_res.vectors + mult = eig_res.values[1] + return _approx_m_from_delta_f(u, mult, points, rho) +end + +function estimated_holder_constant(g::Observations, ::EstimatedDegrees, points, rho) + d = degree(g.graph) + mult = (d' * g.graph * d) / (sum(d .^ 2))^2 + return _approx_m_from_delta_f(d, mult, points, rho) +end + + +function _approx_m_from_delta_f(u, mult, midpoints, ρ, α=1.0) + sort!(u,dims=1) + uMid = u[midpoints] + β₀, β₁ = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid + h = 2^(α+1) * α * 
mult^2 * (β₁ * length(uMid)/2 + β₀)^2 * β₁^2 * ρ^(-1/(2*(α+1))) + return h^(-1/(2*(α+1))) end """ From d52a6b5e6cd5247690cf74ae7d2610894218b29d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 15:47:45 +0200 Subject: [PATCH 025/266] make docs work --- docs/Project.toml | 1 - docs/make.jl | 2 ++ docs/src/api.md | 22 ++-------------------- docs/src/custom_types.md | 8 ++++++++ docs/src/examples.md | 0 docs/src/index.md | 10 ++++------ docs/src/internal.md | 20 ++++++++++++++++++++ docs/src/rules.md | 20 ++++++++------------ 8 files changed, 44 insertions(+), 39 deletions(-) create mode 100644 docs/src/custom_types.md create mode 100644 docs/src/examples.md create mode 100644 docs/src/internal.md diff --git a/docs/Project.toml b/docs/Project.toml index 56dc1c7..dfa65cd 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,3 +1,2 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" diff --git a/docs/make.jl b/docs/make.jl index 74181a8..27097b4 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,6 +18,8 @@ makedocs(; "Home" => "index.md", "API Reference" => "api.md", "Optimization hyperparameters" => "rules.md", + "Examples" => "examples.md", + "Internal" => "internal.md", ], checkdocs = :none) diff --git a/docs/src/api.md b/docs/src/api.md index 25d01f9..d67e6ad 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -1,27 +1,9 @@ ```@contents Pages = ["api.md"] -Depth = 1 +Depth = 2 ``` -# NetworkHistogram - - diff --git a/docs/src/custom_types.md b/docs/src/custom_types.md new file mode 100644 index 0000000..68ed03d --- /dev/null +++ b/docs/src/custom_types.md @@ -0,0 +1,8 @@ +```@contents +Pages = ["custom_types.md"] +Depth = 2 +``` + + +# How to specialize the `Assignment` type for faster performance + diff --git a/docs/src/examples.md b/docs/src/examples.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/src/index.md b/docs/src/index.md index f496b95..9629542 
100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -11,7 +11,7 @@ Pkg.add("NetworkHistogram") ## Usage -We fit the estimator using [`graphhist`](@ref graphhist) and then extract the estimated graphon matrix and node labels. +We fit the estimator and then extract the estimated graphon matrix and node labels. ```julia using NetworkHistogram, LinearAlgebra @@ -20,10 +20,7 @@ A = Symmetric(rand(0:1, 100, 100)) A[diagind(A)] .= 0 # approximate the graphon with a network histogram -hist = graphhist(A) - -# get the graphist structure -estimate = hist.graphhist +estimate = graph_hist(A) # get the estimated graphon matrix sbm_matrix = estimate.θ @@ -32,4 +29,5 @@ sbm_matrix = estimate.θ node_labels = estimate.node_labels ``` -You can control the optimization process by modifying the rules used in the optimization. Check out [Optimization hyper-parameters](@ref) for more information. \ No newline at end of file +You can control the optimization process by modifying the rules used in the optimization. +Check out [Optimization hyper-parameters](@ref) for more information. 
\ No newline at end of file diff --git a/docs/src/internal.md b/docs/src/internal.md new file mode 100644 index 0000000..307a222 --- /dev/null +++ b/docs/src/internal.md @@ -0,0 +1,20 @@ +```@contents +Pages = ["internal.md"] +Depth = 2 +``` + + +# Assignments and group sizes + + +```@autodocs +Modules = [NetworkHistogram] +Pages = ["Assignments.jl", "group_numbering.jl"] +Private = true +``` + +## How to specialize the `Assignment` type for faster performance + +```@docs +NetworkHistogram.BernoulliData +``` \ No newline at end of file diff --git a/docs/src/rules.md b/docs/src/rules.md index 5f57594..788374e 100644 --- a/docs/src/rules.md +++ b/docs/src/rules.md @@ -4,9 +4,8 @@ Here we discuss the different parameters that can be used to control the optimiz ## Starting node labels -```@autodocs -Modules = [NetworkHistogram] -Pages = ["starting_assignment_rule.jl"] +```@docs; canonical=false +NetworkHistogram.initialize_node_labels ``` !!! note @@ -17,22 +16,19 @@ Pages = ["starting_assignment_rule.jl"] ## Swapping rule -```@autodocs -Modules = [NetworkHistogram] -Pages = ["swap_rule.jl"] +```@docs; canonical=false +NetworkHistogram.select_swap ``` ## Acceptance rule -```@autodocs -Modules = [NetworkHistogram] -Pages = ["accept_rule.jl"] +```@docs; canonical=false +NetworkHistogram.accept_reject_update! 
``` ## Stopping rule -```@autodocs -Modules = [NetworkHistogram] -Pages = ["stop_rule.jl"] +```@docs; canonical=false +NetworkHistogram.stopping_rule ``` \ No newline at end of file From 5d407ca471bf827bb73d06063c30b37caa5e5400 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 15:48:19 +0200 Subject: [PATCH 026/266] add more docstrings --- src/assignments/Assignments.jl | 29 +++++++++++++++++++ src/assignments/BernoulliAssignment/struct.jl | 16 +++++++--- src/assignments/group_numbering.jl | 2 +- src/optimisation/config_rules/InitRule.jl | 4 +-- .../config_rules/bandwidth_selection_rule.jl | 12 ++++---- 5 files changed, 50 insertions(+), 13 deletions(-) diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index d4cb4c9..6e26ec0 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -1,5 +1,29 @@ include("group_numbering.jl") + +""" + struct Assignment{T, B} <: AbstractVector{Vector{Int}} + +A structure representing an assignment of nodes to groups. + +# Fields +- `group_size::GroupSize{T}`: The size of each group. +- `node_labels::Vector{Int}`: A vector of node labels. +- `additional_data::B`: Additional data associated with the assignment. + +# Constructor + Assignment(group_size::GroupSize{T}, node_labels, additional_data::B) where {T, B} + +Creates a new `Assignment` instance. + +# Arguments +- `group_size::GroupSize{T}`: The size of each group. +- `node_labels::Vector{Int}`: A vector of node labels. The length of this vector must be equal to the sum of `group_size`. +- `additional_data::B`: Additional data associated with the assignment. + +# Throws +- `ArgumentError`: If the length of `node_labels` is not equal to the sum of `group_size`. 
+""" struct Assignment{T, B} <: AbstractVector{Vector{Int}} group_size::GroupSize{T} node_labels::Vector{Int} @@ -65,3 +89,8 @@ Base.@propagate_inbounds function Base.getindex(a::Assignment, i) @boundscheck checkbounds(a, i) return get_vertex_in_group(a, i) end + +function get_ordered_adjacency_matrix(a::Assignment) + perm = sortperm(a.node_labels) + return a.additional_data.A[perm, perm] +end diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index e759881..c3c90d0 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -1,3 +1,15 @@ +""" + mutable struct BernoulliData{F} + +A data structure to store information related to a Bernoulli assignment in a network. + +# Fields +- `counts::Matrix{Int}`: A matrix representing the maximum number of edges between groups. +- `realized::Matrix{Int}`: A matrix representing the number of edges between groups. +- `estimated_theta::Matrix{F}`: A matrix of estimated parameters (theta). +- `A::BitMatrix`: An adjacency matrix representing the network structure. +- `log_likelihood::F`: +""" mutable struct BernoulliData{F} counts::Matrix{Int} realized::Matrix{Int} @@ -85,9 +97,5 @@ function force_recompute_ll(a::BernoulliAssignment, g::Observations) return log_likelihood(a_simple, g) end -function get_ordered_adjacency_matrix(a::BernoulliAssignment) - perm = sortperm(a.node_labels) - return a.additional_data.A[perm, perm] -end include("swap.jl") diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index e1fa865..726b70b 100644 --- a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -1,6 +1,6 @@ """ Array-like storage for the number of nodes in each group. Try to split the number of nodes -into equal groups, but if it is not possible, the last group may have mode nodes. +into equal groups, but if it is not possible, the last group may have more nodes. 
""" struct GroupSize{T} <: AbstractVector{Int} group_number::T diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 76a3de3..4da98f6 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -16,9 +16,9 @@ end initialize_node_labels(g, h, starting_assignment_rule::StartingAssignment) initialize node labels based on the `starting_assignment_rule`, and return a `GroupSize` -objecta vector of node labels. +object and a vector of node labels. -# Implemenented rules +# Implemented rules - `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. - `RandomStart()`: Randomly assign nodes to groups. """ diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index ce111af..6f3097b 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -23,17 +23,17 @@ How to select the number of blocks `K` for the SBM model. # Implemented rules - `OracleK(K::Int)`: Use the oracle number of blocks `K`. - `OracleM(M::Int)`: Give the Holder constant `M` of the graphon, use the results from -[Olhede and Wolfe (2014)](https://www.pnas.org/doi/epdf/10.1073/pnas.1400374111) to estimate -the number of blocks `K`. + [Olhede and Wolfe (2014)](https://www.pnas.org/doi/epdf/10.1073/pnas.1400374111) to + estimate the number of blocks `K`. - `EstimatedEigenvalues()`: Use the estimated eigenvalues of the adjacency matrix to -estimate the Holder constant and then use `OracleM` to estimate the number of blocks `K`. + estimate the Holder constant and then use `OracleM` to estimate the number of blocks `K`. - `EstimatedDegrees()`: Use the estimated degrees of the adjacency matrix to estimate the -Holder constant and then use `OracleM` to estimate the number of blocks `K`. 
+ Holder constant and then use `OracleM` to estimate the number of blocks `K`. -..note: +!!! info - The number of blocks `K` should be at most `n/2` where `n` is the number of nodes in - the graph. + the graph. - The estimated Holder constant `M` comes from equation (11) in Olhede and Wolfe (2014). """ select_number_node_per_block From 4feb3d90cef8094b433b5648c4fea56ec4a34b0a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 17:52:14 +0200 Subject: [PATCH 027/266] format --- .JuliaFormatter.toml | 2 +- benchmark/benchmarks.jl | 4 +- docs/make.jl | 5 +- src/NetworkHistogram.jl | 2 + src/assignments/Assignments.jl | 1 - src/assignments/BernoulliAssignment/struct.jl | 1 - src/assignments/BernoulliAssignment/swap.jl | 10 +- .../CategoricalAssignment/struct.jl | 7 +- src/assignments/CategoricalAssignment/swap.jl | 20 ++-- src/assignments/group_numbering.jl | 9 +- src/fit.jl | 4 +- src/observations.jl | 21 +++- .../config_rules/bandwidth_selection_rule.jl | 108 +++++------------- src/sbm.jl | 4 +- test/TestNetworkHistogram.jl | 3 +- test/assignments/bernoulli_assignment.jl | 3 +- test/assignments/categorical_assignment.jl | 9 +- test/assignments/default_assignment.jl | 10 +- test/optimisation/config_rules/init_rule.jl | 44 +++++++ test/runtests.jl | 4 + 20 files changed, 145 insertions(+), 126 deletions(-) create mode 100644 test/optimisation/config_rules/init_rule.jl diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml index 01897c4..90ebcb4 100644 --- a/.JuliaFormatter.toml +++ b/.JuliaFormatter.toml @@ -1,2 +1,2 @@ style = "sciml" -margin = 75 +margin = 79 diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 0f0f91a..158dbef 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -6,9 +6,9 @@ SUITE["eval"] = BenchmarkGroup() options = Options(; binary_operators = [+, -, *], unary_operators = [cos]) - for n in [10, 20] - SUITE["eval_tree_array"][n] = @benchmarkable(eval_tree_array($tree, X, $options), + 
SUITE["eval_tree_array"][n] = @benchmarkable(eval_tree_array( + $tree, X, $options), evals=10, samples=1000, setup=(X = randn(Float32, 2, $n))) diff --git a/docs/make.jl b/docs/make.jl index 27097b4..29d001c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,7 +1,8 @@ using NetworkHistogram using Documenter -DocMeta.setdocmeta!(NetworkHistogram, :DocTestSetup, :(using NetworkHistogram); +DocMeta.setdocmeta!( + NetworkHistogram, :DocTestSetup, :(using NetworkHistogram); recursive = true) makedocs(; @@ -19,7 +20,7 @@ makedocs(; "API Reference" => "api.md", "Optimization hyperparameters" => "rules.md", "Examples" => "examples.md", - "Internal" => "internal.md", + "Internal" => "internal.md" ], checkdocs = :none) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 85bc3a5..7edce95 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -11,6 +11,8 @@ using StaticArrays: MVector, MMatrix using LogExpFunctions: xlogx, xlogy import Distributions.fit using LoopVectorization: @turbo +using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen +import Arpack include("assignments/Assignments.jl") include("sbm.jl") diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 6e26ec0..b47be5a 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -1,6 +1,5 @@ include("group_numbering.jl") - """ struct Assignment{T, B} <: AbstractVector{Vector{Int}} diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index c3c90d0..b64b0ad 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -97,5 +97,4 @@ function force_recompute_ll(a::BernoulliAssignment, g::Observations) return log_likelihood(a_simple, g) end - include("swap.jl") diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index fb56ae1..75afacb 100644 --- 
a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -50,23 +50,23 @@ function update_observed_and_labels!( if a.additional_data.A[swap.index1, i] a.additional_data.realized[g1, group_inter] -= 1 a.additional_data.realized[ - group_inter, g1] = a.additional_data.realized[ + group_inter, g1] = a.additional_data.realized[ g1, group_inter] a.additional_data.realized[g2, group_inter] += 1 a.additional_data.realized[ - group_inter, g2] = a.additional_data.realized[ + group_inter, g2] = a.additional_data.realized[ g2, group_inter] end if a.additional_data.A[swap.index2, i] a.additional_data.realized[g2, group_inter] -= 1 a.additional_data.realized[ - group_inter, g2] = a.additional_data.realized[ + group_inter, g2] = a.additional_data.realized[ g2, group_inter] a.additional_data.realized[g1, group_inter] += 1 a.additional_data.realized[ - group_inter, g1] = a.additional_data.realized[ + group_inter, g1] = a.additional_data.realized[ g1, group_inter] end end @@ -91,7 +91,7 @@ function fit_sbm(a::BernoulliAssignment, g::Observations) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) dists[group1, - group2] = Bernoulli(a.additional_data.estimated_theta[ + group2] = Bernoulli(a.additional_data.estimated_theta[ group1, group2]) end end diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index d45ea30..4e553fb 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -56,7 +56,8 @@ function make_categorical_data(g, node_labels, group_size) end function compute_log_likelihood( - estimated_theta::AbstractMatrix{MVector{M, T}}, counts::AbstractMatrix{F}) where {M, T, F} + estimated_theta::AbstractMatrix{MVector{M, T}}, counts::AbstractMatrix{F}) where { + M, T, F} loglik = zero(T) number_groups = size(estimated_theta, 1) @inbounds for j in 1:number_groups @@ -77,14 +78,12 @@ function 
categorical_matrix(A) return A_inter, maximum(A_inter) end - function categorical_matrix(g::Observations) return categorical_matrix(g.graph) end - function log_likelihood(a::CategoricalAssignment, g::Observations) - return a.additional_data.log_likelihood + return a.additional_data.log_likelihood end include("swap.jl") diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 51b19b3..41e413b 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -12,7 +12,8 @@ function make_swap(a::CategoricalAssignment, id::Tuple{Int, Int}) a.additional_data.log_likelihood) end -function make_swap!(swap::CategoricalSwap{M, F}, a::CategoricalAssignment{T, M, F}, +function make_swap!( + swap::CategoricalSwap{M, F}, a::CategoricalAssignment{T, M, F}, id::Tuple{Int, Int}) where {T, M, F} swap.index1, swap.index2 = id copy!(swap.realized, a.additional_data.realized) @@ -21,7 +22,8 @@ function make_swap!(swap::CategoricalSwap{M, F}, a::CategoricalAssignment{T, M, end function revert_swap!( - a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where {T, M, F} + a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where { + T, M, F} swap_node_labels!(a, swap.index1, swap.index2) copy!(a.additional_data.realized, swap.realized) copy!(a.additional_data.estimated_theta, swap.estimated_theta) @@ -29,7 +31,8 @@ function revert_swap!( end function apply_swap!( - a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where {T, M, F} + a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where { + T, M, F} update_observed_and_labels!(a, swap) update_ll!(a) end @@ -40,12 +43,13 @@ function update_ll!(a::CategoricalAssignment) return nothing end -function fit_sbm(a::CategoricalAssignment{T, M, F}, g::Observations) where {T, M, F} - dists = initialize_sbm(a.group_size, Categorical(ones(M)/M)) +function fit_sbm( + a::CategoricalAssignment{T, M, 
F}, g::Observations) where {T, M, F} + dists = initialize_sbm(a.group_size, Categorical(ones(M) / M)) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) dists[group1, - group2] = Categorical(a.additional_data.estimated_theta[ + group2] = Categorical(a.additional_data.estimated_theta[ group1, group2]) end end @@ -53,7 +57,8 @@ function fit_sbm(a::CategoricalAssignment{T, M, F}, g::Observations) where {T, M end function update_observed_and_labels!( - a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where {T, M, F} + a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where { + T, M, F} g1 = get_group_of_vertex(a, swap.index1) g2 = get_group_of_vertex(a, swap.index2) @@ -100,7 +105,6 @@ function update_observed_and_labels!( return nothing end - function _fast_div!(theta, realized, counts) for j in axes(theta, 2) for i in axes(theta, 1) diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index 726b70b..f7a77e5 100644 --- a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -18,8 +18,10 @@ struct GroupSize{T} <: AbstractVector{Int} if number_groups * standard_group == number_nodes new{Int}(standard_group, number_groups) else - remainder_group = standard_group + mod(number_nodes, standard_group) - new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups) + remainder_group = standard_group + + mod(number_nodes, standard_group) + new{Tuple{Int, Int}}( + (standard_group, remainder_group), number_groups) end end end @@ -30,7 +32,8 @@ Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) return g.group_number end -Base.@propagate_inbounds function Base.getindex(g::GroupSize{Tuple{Int, Int}}, i::Int) +Base.@propagate_inbounds function Base.getindex( + g::GroupSize{Tuple{Int, Int}}, i::Int) @boundscheck checkbounds(g, i) return i < length(g) ? 
g.group_number[1] : g.group_number[2] end diff --git a/src/fit.jl b/src/fit.jl index 045ee83..ac7c5d9 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -7,7 +7,7 @@ function fit_sbm(a::Assignment, g::Observations) for group2 in group1:number_groups(a) edge_indices = get_edge_indices(a, group1, group2) dists[group1, - group2] = fit_group(g.dist_ref, g.graph, edge_indices) + group2] = fit_group(g.dist_ref, g.graph, edge_indices) end end return dists @@ -33,7 +33,7 @@ function _log_likelihood(a::Assignment, sbm::SBM, g) log_likelihood = 0.0 for i in 1:number_nodes(a) label_a = a.node_labels[i] - for j in (i+1):number_nodes(a) + for j in (i + 1):number_nodes(a) label_b = a.node_labels[j] log_likelihood += logdensityof( sbm[label_a, label_b], get_obs(g, i, j)) diff --git a/src/observations.jl b/src/observations.jl index a57c6c9..2d3c7ce 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -9,7 +9,7 @@ function number_nodes(g::Observations{AbstractGraph, D}) where {D} end function number_nodes(g::Observations) - return size(g.graph,1) + return size(g.graph, 1) end function get_obs(g::Observations, x::Tuple) @@ -27,10 +27,27 @@ end get_obs(g::AbstractArray, x) = get_obs(g, x[1], x[2]) get_obs(g::AbstractArray, i, j) = g[i, j] +density(g::Observations) = density(g.graph) function density(g::AbstractGraph) return Graphs.density(g) end function density(g::AbstractMatrix{Bool}) - return sum(g) / ((size(g, 1) * (size(g, 1) - 1))*2) + return sum(g) / ((size(g, 1) * (size(g, 1) - 1))) +end + +function get_degree(g::Observations{AbstractGraph, D}) where {D} + Graphs.degree(g.graph) +end + +function get_degree(g) + return sum(g.graph, dims = 2) +end + +function get_adj(g::Observations{AbstractGraph, D}) where {D} + return Graphs.adjacency_matrix(g.graph) +end + +function get_adj(g::Observations) + return g.graph end diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 6f3097b..05d1b80 100644 --- 
a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -2,12 +2,12 @@ abstract type KSelectionRule end struct OracleK <: KSelectionRule K::Int end -struct OracleM <: KSelectionRule - M::Int - α::Float64 +struct OracleM{F} <: KSelectionRule + M::F + α::F end -function OracleM(M::Int) +function OracleM(M) return OracleM(M, 1.0) end @@ -39,9 +39,9 @@ How to select the number of blocks `K` for the SBM model. select_number_node_per_block function select_number_node_per_block(g::Observations, rule::OracleK) - if rule.K > number_nodes(g)÷2 - error("The number of blocks $K is too large for the number of nodes \ - $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)") + if rule.K > number_nodes(g) ÷ 2 + throw(ArgumentError("The number of blocks $(rule.K) is too large for the number \ + of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)")) end return rule.K end @@ -49,7 +49,7 @@ end function select_number_node_per_block(g::Observations, rule::OracleM) rho = density(g.graph) n = number_nodes(g) - k = max(2, round(Int,2*rule.M^2*rho)^(-1/4)*sqrt(n)) + k = max(2, round(Int, (2 * rule.M * rho)^(-1 / 4) * sqrt(n))) return select_number_node_per_block(g, OracleK(k)) end @@ -57,84 +57,30 @@ function select_number_node_per_block(g::Observations, rule::EstimatedM) n = number_nodes(g) c = min(4, sqrt(n) / 8) number_points_from_mid = round(Int, c * sqrt(n)) - mid_points = collect(max(1, n÷2-number_points_from_mid):(n÷2+number_points_from_mid)) - rho = density(g) - M = estimated_holder_constant(g, rule, mid_points, rho) - return select_number_node_per_block(g, OracleM(M)) + mid_points = max(1, n ÷ 2 - number_points_from_mid):(n ÷ 2 + number_points_from_mid) + m = estimated_number_nodes_per_block(g, rule, mid_points, density(g)) + return select_number_node_per_block(g, OracleK(m)) end -function estimated_holder_constant(g::Observations, ::EstimatedEigenvalues, points, rho) - eig_res = 
eigs(g.graph, nev = 1, which = :LM) - u = eig_res.vectors - mult = eig_res.values[1] - return _approx_m_from_delta_f(u, mult, points, rho) +function estimated_number_nodes_per_block( + g::Observations, ::EstimatedEigenvalues, points, rho) + λ, u = Arpack.eigs(get_adj(g), nev = 1, which = :LM) + return _approx_k_from_delta_f(u, λ[1], points, rho) end -function estimated_holder_constant(g::Observations, ::EstimatedDegrees, points, rho) - d = degree(g.graph) - mult = (d' * g.graph * d) / (sum(d .^ 2))^2 - return _approx_m_from_delta_f(d, mult, points, rho) +function estimated_number_nodes_per_block( + g::Observations, ::EstimatedDegrees, points, rho) + d = get_degree(g) + mult = ((d' * g.graph * d) / (sum(d .^ 2))^2)[1] + return _approx_k_from_delta_f(d, mult, points, rho) end - -function _approx_m_from_delta_f(u, mult, midpoints, ρ, α=1.0) - sort!(u,dims=1) +function _approx_k_from_delta_f(u, mult, midpoints, ρ, α = 1.0) + sort!(u, dims = 1) uMid = u[midpoints] - β₀, β₁ = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid - h = 2^(α+1) * α * mult^2 * (β₁ * length(uMid)/2 + β₀)^2 * β₁^2 * ρ^(-1/(2*(α+1))) - return h^(-1/(2*(α+1))) -end - -""" - oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) - -Oracle bandwidth selection for graph histogram, using - -```math -\\widehat{h^*}=\\left(2\\left(\\left(d^T d\\right)^{+}\\right)^2 d^T A d \\cdot \\hat{m} -\\hat{b}\\right)^{-\\frac{1}{2}} \\hat{\\rho}_n^{\\frac{1}{4}}, -``` - -where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n`` is the -empirical edge density, and ``m``, ``b`` are the slope and intercept fitted on -``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``. 
-""" -function oracle_bandwidth( - A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) - if type ∉ ["eigs", "degs"] - error("Invalid input type $(type)") - end - - if alpha != 1 - error("Currently only supports alpha = 1") - end - - n = size(A, 1) - midPt = collect(max( - 1, round(Int, (n÷2-c*sqrt(n)))):round(Int, - (n÷2+c*sqrt(n)))) - rhoHat_inv = inv(sum(A) / (n * (n - 1))) - - # Rank-1 graphon estimate via fhat(x,y) = mult*u(x)*u(y)*pinv(rhoHat); - if type == "eigs" - eig_res = eigs(A, nev = 1, which = :LM) - u = eig_res.vectors - mult = eig_res.values[1] - elseif type == "degs" - u = sum(A, dims = 2) - mult = (u' * A * u) / (sum(u .^ 2))^2 - else - error("Invalid input type $(type)") - end - - # Calculation bandwidth - u = sort(u, dims = 1) - uMid = u[midPt] - lmfit_coef = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid - - h = (2^(alpha + 1) * alpha * mult^2 * - (lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * - lmfit_coef[2]^2 * - rhoHat_inv)^(-1 / (2 * (alpha + 1))) - return h[1] + β₀, β₁ = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid + # from Olhede and Wolfe (2014), equation (11) + h = (2^(α + 1) * α * mult^2 * (β₁ * length(uMid) / 2 + β₀)^2 * β₁^2 * + ρ^(-1))^(-1 / (2 * (α + 1))) + return round(Int, h) end diff --git a/src/sbm.jl b/src/sbm.jl index d3283f9..b3e6913 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -45,11 +45,11 @@ end function sample( rng::Random.AbstractRNG, sbm::SBM, node_labels::Vector{Int}) n_nodes = length(node_labels) - type_input = eltype(sbm.probs[1,1]) + type_input = eltype(sbm.probs[1, 1]) A = Matrix{type_input}(undef, n_nodes, n_nodes) for i in 1:n_nodes A[i, i] = zero(eltype(A)) - for j in (i+1):n_nodes + for j in (i + 1):n_nodes A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) A[j, i] = A[i, j] end diff --git a/test/TestNetworkHistogram.jl b/test/TestNetworkHistogram.jl index 7a6f6ed..00bc3c9 100644 --- a/test/TestNetworkHistogram.jl +++ b/test/TestNetworkHistogram.jl @@ -9,7 +9,8 @@ end 
to_default_assignment(a::NH.Assignment{T, Nothing}) where {T} = a -function test_swap_revertible(a::NH.Assignment, swap::NH.Swap, g::NH.Observations) +function test_swap_revertible( + a::NH.Assignment, swap::NH.Swap, g::NH.Observations) a_test = deepcopy(a) NH.apply_swap!(a_test, swap) @test NH.get_group_of_vertex(a, swap.index1) == diff --git a/test/assignments/bernoulli_assignment.jl b/test/assignments/bernoulli_assignment.jl index 5e09115..d683e1a 100644 --- a/test/assignments/bernoulli_assignment.jl +++ b/test/assignments/bernoulli_assignment.jl @@ -35,7 +35,8 @@ end 1 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0] obs = NH.Observations(A, Bernoulli(0.5)) - a = NH.BernoulliAssignment(obs, NH.GroupSize(8, 4), [1, 1, 1, 1, 2, 2, 2, 2]) + a = NH.BernoulliAssignment( + obs, NH.GroupSize(8, 4), [1, 1, 1, 1, 2, 2, 2, 2]) swap = NH.make_swap(a, (1, 2)) test_swap_revertible(a, swap, obs) end diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl index 2525d32..1b5697d 100644 --- a/test/assignments/categorical_assignment.jl +++ b/test/assignments/categorical_assignment.jl @@ -1,7 +1,6 @@ import NetworkHistogram as NH -@testset "test conversion to categorical observations" begin -end +@testset "test conversion to categorical observations" begin end @testset "test Categorical swap" begin using ..TestNetworkHistogram: test_swap_revertible @@ -19,8 +18,8 @@ end A[i, i] = 0 end obs = NH.Observations(A, dist) - node_labels = repeat(1:k, inner = n÷k) - a = NH.CategoricalAssignment(obs, NH.GroupSize(n, n÷k), node_labels) - swap = NH.make_swap(a, (1, k+1)) + node_labels = repeat(1:k, inner = n ÷ k) + a = NH.CategoricalAssignment(obs, NH.GroupSize(n, n ÷ k), node_labels) + swap = NH.make_swap(a, (1, k + 1)) test_swap_revertible(a, swap, obs) end diff --git a/test/assignments/default_assignment.jl b/test/assignments/default_assignment.jl index 06c5fa9..76a1479 100644 --- a/test/assignments/default_assignment.jl +++ 
b/test/assignments/default_assignment.jl @@ -8,10 +8,10 @@ import NetworkHistogram as NH n = 20 k = 5 #data = LinearAlgebra.Symmetric(Random.rand(Bool,n,n)) - data = Random.rand(Normal(), n,n) - g = NH.Observations(data, Normal(0,1)) - labels = repeat(1:n÷k, inner = k) - a = NH.Assignment(NH.GroupSize(n,k),labels) - swap = NH.DefaultSwap(1,2) + data = Random.rand(Normal(), n, n) + g = NH.Observations(data, Normal(0, 1)) + labels = repeat(1:(n ÷ k), inner = k) + a = NH.Assignment(NH.GroupSize(n, k), labels) + swap = NH.DefaultSwap(1, 2) test_swap_revertible(a, swap, g) end diff --git a/test/optimisation/config_rules/init_rule.jl b/test/optimisation/config_rules/init_rule.jl new file mode 100644 index 0000000..8c07a2d --- /dev/null +++ b/test/optimisation/config_rules/init_rule.jl @@ -0,0 +1,44 @@ +import NetworkHistogram as NH + +@testset "regression test" begin + using Distributions: Bernoulli + A = BitMatrix([0 0 1 0 1 0 1 1 0 1 + 0 0 1 1 1 1 1 1 0 0 + 1 1 0 1 0 0 0 0 1 0 + 0 1 1 0 1 0 1 0 0 0 + 1 1 0 1 0 0 1 0 0 1 + 0 1 0 0 0 0 0 1 0 0 + 1 1 0 1 1 0 0 1 0 1 + 1 1 0 0 0 1 1 0 0 1 + 0 0 1 0 0 0 0 0 0 1 + 1 0 0 0 1 0 1 1 1 0]) + h_true_nethist = 2.643731 # version 0.2.3 from nethist package + k_true = 3 + obs = NH.Observations(A, Bernoulli(0.5)) + @testset "degrees" begin + k = NH.select_number_node_per_block(obs, NH.EstimatedDegrees()) + @test k == k_true + end + @testset "eigenvalues" begin + k = NH.select_number_node_per_block(obs, NH.EstimatedEigenvalues()) + @test k == k_true + end +end + +@testset "test oracle K" begin + using Distributions: Bernoulli + A = [0 1 1 1 0 0 1 0 + 1 0 1 1 0 0 0 0 + 1 1 0 0 0 0 0 0 + 1 1 0 0 0 0 0 1 + 0 0 0 0 0 1 1 1 + 0 0 0 0 1 0 1 1 + 1 0 0 0 1 1 0 0 + 0 0 0 1 1 1 0 0] + obs = NH.Observations(A, Bernoulli(0.5)) + oracle = NH.OracleK(4) + @test NH.select_number_node_per_block(obs, oracle) == 4 + err = ArgumentError("The number of blocks 5 is too large for the number of nodes \ + 8, it should be at most 4") + @test_throws err 
NH.select_number_node_per_block(obs, NH.OracleK(5)) +end diff --git a/test/runtests.jl b/test/runtests.jl index d8fd4cd..f4d7e6c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,3 +7,7 @@ include("TestNetworkHistogram.jl") include("assignments/bernoulli_assignment.jl") include("assignments/categorical_assignment.jl") end + +@testset "Rule optimization tests" begin + include("optimisation/config_rules/init_rule.jl") +end From 0fc312ab727c0cea5da5e7a42c855d9b38cc96b6 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 19:07:52 +0200 Subject: [PATCH 028/266] add init rules --- src/NetworkHistogram.jl | 2 +- src/assignments/group_numbering.jl | 12 +++++++ src/observations.jl | 24 ++++++++++++++ src/optimisation/config_rules/InitRule.jl | 38 +++++++++++++++++++++-- src/{ => optimisation}/fit.jl | 0 src/optimisation/include.jl | 1 + 6 files changed, 74 insertions(+), 3 deletions(-) rename src/{ => optimisation}/fit.jl (100%) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 7edce95..a5ac3d4 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -13,11 +13,11 @@ import Distributions.fit using LoopVectorization: @turbo using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen import Arpack +import Metis include("assignments/Assignments.jl") include("sbm.jl") include("observations.jl") -include("fit.jl") include("optimisation/include.jl") # more specialised and faster assignment types and methods diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index f7a77e5..4fc13d1 100644 --- a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -37,3 +37,15 @@ Base.@propagate_inbounds function Base.getindex( @boundscheck checkbounds(g, i) return i < length(g) ? 
g.group_number[1] : g.group_number[2] end + +function check_compatiblity(g::GroupSize, node_labels) + counts = StatsBase.countmap(node_labels) + if length(counts) != g.number_groups || size(node_labels, 1) != sum(g) + return false + end + for (i, c) in enumerate(g) + if counts[i] != c + return false + end + end +end diff --git a/src/observations.jl b/src/observations.jl index 2d3c7ce..f40f396 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -51,3 +51,27 @@ end function get_adj(g::Observations) return g.graph end + +function normalized_laplacian(g::Observations) + return normalized_laplacian(g.graph) +end + +function normalized_laplacian(g::AbstractGraph) + return normalized_laplacian(Graphs.adjacency_matrix(g)) +end + +function normalized_laplacian(g::AbstractMatrix) + degrees = sum(g, dims = 1) + n = size(g, 1) + L = similar(g, Float64) + for j in 1:n + for i in 1:n + if i == j + L[i, j] = 1 + elseif g[i, j] == 1 + L[i, j] = -1 / sqrt(degrees[i] * degrees[j]) + end + end + end + return L +end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 4da98f6..64ad225 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -1,6 +1,11 @@ abstract type StartingAssignment end struct OrderedStart <: StartingAssignment end struct RandomStart <: StartingAssignment end +struct SpectralStart <: StartingAssignment end +struct MetisStart <: StartingAssignment end +struct FromAssignment{A} <: StartingAssignment + assignment::A +end struct InitRule{S <: StartingAssignment, I} starting_assignment_rule::S @@ -36,5 +41,34 @@ function initialize_node_labels(g, h, ::RandomStart) return group_size, node_labels end -# check https://github.com/TrainOfCode/LocalFennelPartitioning.jl/tree/main -# check https://github.com/JuliaSparse/Metis.jl +function initialize_node_labels(g, h, ::SpectralStart) + group_size = GroupSize(number_nodes(g), h) + node_labels = zeros(Int, 
number_nodes(g)) + + laplacian = normalized_laplacian(g) + _, eigenvectors = Arpack.eigs(laplacian, nev = 2, which = :LR) + # get 2nd eigenvector, sort its components + indices = sortperm(eigenvectors[:, 1]) + # bin them into groups of correct size + start = 1 + for (i, group) in enumerate(group_size) + stop = start + group - 1 + node_labels[indices[start:stop]] .= i + start = stop + 1 + end + return group_size, node_labels +end + +function initialize_node_labels(g, h, ::MetisStart) + group_size = GroupSize(number_nodes(g), h) + node_labels = convert.( + Int, Metis.partition(Metis.graph(g.graph), length(group_size))) + check_compatiblity(group_size, node_labels) + return group_size, node_labels +end + +function initialize_node_labels(g, h, rule::FromAssignment{A}) where {A} + group_size = GroupSize(number_nodes(g), h) + check_compatiblity(group_size, rule.assignment.node_labels) + return group_size, rule.assignment.node_labels +end diff --git a/src/fit.jl b/src/optimisation/fit.jl similarity index 100% rename from src/fit.jl rename to src/optimisation/fit.jl diff --git a/src/optimisation/include.jl b/src/optimisation/include.jl index f3d51eb..044e887 100644 --- a/src/optimisation/include.jl +++ b/src/optimisation/include.jl @@ -1,2 +1,3 @@ +include("fit.jl") include("swap.jl") include("least_squares.jl") From cb668e55bcb70595c78edaf0e0f185cf0770432d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 19:08:40 +0200 Subject: [PATCH 029/266] add docs for init --- src/optimisation/config_rules/InitRule.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 64ad225..d292ec5 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -26,6 +26,9 @@ object and a vector of node labels. # Implemented rules - `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. 
- `RandomStart()`: Randomly assign nodes to groups. +- `SpectralStart()`: Assign nodes to groups based on spectral clustering. +- `MetisStart()`: Assign nodes to groups based on Metis partitioning. +- `FromAssignment(a)`: Assign nodes to groups based on the given assignment `a`. """ initialize_node_labels From ea12c67f3820b9a8f9eac3d6b840c49aab39f779 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 21 Oct 2024 19:25:17 +0200 Subject: [PATCH 030/266] put first steps for clustering with higher dims --- Project.toml | 4 ++++ src/NetworkHistogram.jl | 2 ++ src/optimisation/config_rules/InitRule.jl | 15 +++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/Project.toml b/Project.toml index aaeb7d4..ee79a60 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" @@ -15,6 +16,7 @@ DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8" Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -37,6 +39,7 @@ ArnoldiMethod = "0.2.0" Arpack = "0.5.4" BenchmarkTools = "1.3.2" CairoMakie = "0.12.14" +Clustering = "0.15.7" CodecZstd = "0.7.2" Combinatorics = "1.0.2" DataStructures = "0.18.20" @@ -44,6 +47,7 @@ DensityInterface = "0.4.0" Distributions = "0.25.112" Graphs = "1.9.0" HTTP = "1.7.4" +IterativeSolvers = "0.9.4" JLD = "0.13.3" 
Kronecker = "0.5" LogExpFunctions = "0.3.28" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index a5ac3d4..db82f84 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -14,6 +14,8 @@ using LoopVectorization: @turbo using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen import Arpack import Metis +import IterativeSolvers +import Clustering include("assignments/Assignments.jl") include("sbm.jl") diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index d292ec5..7ac9f69 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -6,6 +6,9 @@ struct MetisStart <: StartingAssignment end struct FromAssignment{A} <: StartingAssignment assignment::A end +struct HigherOrderSpectralStart <: StartingAssignment + k::Int +end struct InitRule{S <: StartingAssignment, I} starting_assignment_rule::S @@ -75,3 +78,15 @@ function initialize_node_labels(g, h, rule::FromAssignment{A}) where {A} check_compatiblity(group_size, rule.assignment.node_labels) return group_size, rule.assignment.node_labels end + + + + +function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) + group_size = GroupSize(number_nodes(g), h) + + laplacian = normalized_laplacian(g) + results = IterativeSolvers.lobpcg(laplacian, true, rule.k) + throw(ArgumentError("Not implemented yet, need to finish with Clustering.jl")) + return group_size, node_labels +end From 6a8a13c3674c672c59be34c149de48870f717e4c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 22 Oct 2024 12:31:18 +0200 Subject: [PATCH 031/266] using StatsAPI as it should be --- .gitignore | 23 +++++++++++++ Project.toml | 32 ++++--------------- src/NetworkHistogram.jl | 5 ++- src/assignments/BernoulliAssignment/struct.jl | 6 ++-- src/assignments/BernoulliAssignment/swap.jl | 2 +- .../CategoricalAssignment/struct.jl | 2 +- src/assignments/CategoricalAssignment/swap.jl | 2 +- src/observations.jl | 11 ++++++- 
src/optimisation/config_rules/InitRule.jl | 5 ++- .../config_rules/bandwidth_selection_rule.jl | 4 +-- src/optimisation/fit.jl | 17 +++------- test/Project.toml | 1 + test/TestNetworkHistogram.jl | 4 +-- test/runtests.jl | 23 +++++++++---- 14 files changed, 78 insertions(+), 59 deletions(-) diff --git a/.gitignore b/.gitignore index 13c44d4..d02318d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,30 @@ .vscode *dev/ +.CondaPkg/ + +# Files generated by invoking Julia with --code-coverage +*.jl.cov +*.jl.*.cov + +# Files generated by invoking Julia with --track-allocation +*.jl.mem + +# System-specific files and directories generated by the BinaryProvider and BinDeps packages +# They contain absolute paths specific to the host computer, and so should not be committed +deps/deps.jl +deps/build.log +deps/downloads/ +deps/usr/ +deps/src/ + +# Build artifacts for creating documentation generated by the Documenter package docs/build/ +docs/site/ docs/Manifest.toml +# File generated by Pkg, the package manager, based on a corresponding Project.toml +# It records a fixed state of all packages used by the project. As such, it should not be +# committed for packages, but should be committed for applications that require a static +# environment. 
Manifest.toml \ No newline at end of file diff --git a/Project.toml b/Project.toml index ee79a60..4e8cde5 100644 --- a/Project.toml +++ b/Project.toml @@ -6,63 +6,45 @@ version = "0.5.2" [deps] ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" -CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" -Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" -HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" -JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8" -Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" -MetaGraphsNext = "fa8bd995-216d-47f1-8a91-f3b68fbeb377" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SimpleWeightedGraphs = "47aef6b3-ad0c-573a-a1e2-d07658019622" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7" [compat] -ArnoldiMethod = "0.2.0" +ArnoldiMethod = "0.4.0" Arpack = "0.5.4" -BenchmarkTools = "1.3.2" -CairoMakie = "0.12.14" Clustering = "0.15.7" -CodecZstd = "0.7.2" 
-Combinatorics = "1.0.2" DataStructures = "0.18.20" DensityInterface = "0.4.0" Distributions = "0.25.112" -Graphs = "1.9.0" -HTTP = "1.7.4" +Graphs = "1.12.0" IterativeSolvers = "0.9.4" -JLD = "0.13.3" -Kronecker = "0.5" +LinearAlgebra = "1.11.0" LogExpFunctions = "0.3.28" LoopVectorization = "0.12.171" -MetaGraphsNext = "0.7.0" Metis = "1.5.0" PermutationSymmetricTensors = "0.2.0" ProgressMeter = "1.7.2" -SimpleWeightedGraphs = "1.4.0" +Random = "1.11.0" SparseArrays = "1.11.0" StaticArrays = "1.9.7" -StatsBase = "0.33.21" +StatsAPI = "1.7.0" +StatsBase = "0.34.3" Test = "1.11.0" -TranscodingStreams = "0.9.11" -ValueHistories = "0.5.4" julia = "1.8" [extras] diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index db82f84..b6c51ea 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -9,13 +9,16 @@ import StatsBase, Random using DensityInterface: logdensityof using StaticArrays: MVector, MMatrix using LogExpFunctions: xlogx, xlogy -import Distributions.fit using LoopVectorization: @turbo using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen import Arpack import Metis import IterativeSolvers import Clustering +import StatsAPI: loglikelihood, fit + + +export loglikelihood, fit include("assignments/Assignments.jl") include("sbm.jl") diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index b64b0ad..bd37f5e 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -86,15 +86,15 @@ function compute_log_likelihood(estimated_theta::AbstractMatrix{F}, return loglik end -function log_likelihood(assignment::BernoulliAssignment) +function loglikelihood(assignment::BernoulliAssignment) return assignment.additional_data.log_likelihood end -log_likelihood(a::BernoulliAssignment, g::Observations) = log_likelihood(a) +loglikelihood(a::BernoulliAssignment, g::Observations) = loglikelihood(a) function 
force_recompute_ll(a::BernoulliAssignment, g::Observations) a_simple = Assignment(a.group_size, a.node_labels) - return log_likelihood(a_simple, g) + return loglikelihood(a_simple, g) end include("swap.jl") diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index 75afacb..8a06f4f 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -86,7 +86,7 @@ function update_ll!(a::BernoulliAssignment) return nothing end -function fit_sbm(a::BernoulliAssignment, g::Observations) +function fit(a::BernoulliAssignment, g::Observations) dists = initialize_sbm(a.group_size, Bernoulli(0.5)) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 4e553fb..115c50c 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -82,7 +82,7 @@ function categorical_matrix(g::Observations) return categorical_matrix(g.graph) end -function log_likelihood(a::CategoricalAssignment, g::Observations) +function loglikelihood(a::CategoricalAssignment, g::Observations) return a.additional_data.log_likelihood end diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 41e413b..3f11e70 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -43,7 +43,7 @@ function update_ll!(a::CategoricalAssignment) return nothing end -function fit_sbm( +function fit( a::CategoricalAssignment{T, M, F}, g::Observations) where {T, M, F} dists = initialize_sbm(a.group_size, Categorical(ones(M) / M)) for group1 in 1:number_groups(a) diff --git a/src/observations.jl b/src/observations.jl index f40f396..610f009 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -13,7 +13,11 @@ function 
number_nodes(g::Observations) end function get_obs(g::Observations, x::Tuple) - return get_obs(g.graph, x[1], x[2]) + return get_obs(g, x[1], x[2]) +end + +function get_obs(g::Observations, i::Int, j::Int) + return get_obs(g.graph, i, j) end function get_obs(g::SimpleGraph, x::Tuple) @@ -75,3 +79,8 @@ function normalized_laplacian(g::AbstractMatrix) end return L end + + +function Metis.graph(g::Observations) + return Metis.graph(g.graph) +end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 7ac9f69..ccacc64 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -68,7 +68,7 @@ end function initialize_node_labels(g, h, ::MetisStart) group_size = GroupSize(number_nodes(g), h) node_labels = convert.( - Int, Metis.partition(Metis.graph(g.graph), length(group_size))) + Int, Metis.partition(Metis.graph(g), length(group_size))) check_compatiblity(group_size, node_labels) return group_size, node_labels end @@ -83,10 +83,9 @@ end function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) + throw(ArgumentError("Not implemented yet, need to finish with Clustering.jl")) group_size = GroupSize(number_nodes(g), h) - laplacian = normalized_laplacian(g) results = IterativeSolvers.lobpcg(laplacian, true, rule.k) - throw(ArgumentError("Not implemented yet, need to finish with Clustering.jl")) return group_size, node_labels end diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 05d1b80..0d01a3a 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -47,7 +47,7 @@ function select_number_node_per_block(g::Observations, rule::OracleK) end function select_number_node_per_block(g::Observations, rule::OracleM) - rho = density(g.graph) + rho = density(g) n = number_nodes(g) k = max(2, round(Int, (2 * rule.M * 
rho)^(-1 / 4) * sqrt(n))) return select_number_node_per_block(g, OracleK(k)) @@ -71,7 +71,7 @@ end function estimated_number_nodes_per_block( g::Observations, ::EstimatedDegrees, points, rho) d = get_degree(g) - mult = ((d' * g.graph * d) / (sum(d .^ 2))^2)[1] + mult = ((d' * get_adj(g) * d) / (sum(d .^ 2))^2)[1] return _approx_k_from_delta_f(d, mult, points, rho) end diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl index ac7c5d9..211db1a 100644 --- a/src/optimisation/fit.jl +++ b/src/optimisation/fit.jl @@ -1,32 +1,25 @@ # Slow fallback methods for the Assignment type # speed up by implementing specialized methods for the BernoulliAssignment type and others # method to compute estimator from node clustering as specified in assignment -function fit_sbm(a::Assignment, g::Observations) +function fit(a::Assignment, g::Observations) dists = initialize_sbm(a.group_size, g.dist_ref) for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) edge_indices = get_edge_indices(a, group1, group2) dists[group1, - group2] = fit_group(g.dist_ref, g.graph, edge_indices) + group2] = fit_group(g.dist_ref, g, edge_indices) end end return dists end function fit_group(distribution, g, edges) - return fit(distribution, get_obs.(Ref(g), edges)) + return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) end -function fit(dist, data) - error("NetworkHistogram.fit method not implemented for \ - $(typeof(dist)) and $(typeof(data))") -end - -fit(d::Distribution, data) = fit(typeof(d), data) - # method to compute the log likelihood of a SBM fitted according to the assignment -function log_likelihood(a::Assignment, g::Observations) - return _log_likelihood(a, fit_sbm(a, g), g.graph) +function loglikelihood(a::Assignment, g::Observations) + return _log_likelihood(a, fit(a, g), g) end function _log_likelihood(a::Assignment, sbm::SBM, g) diff --git a/test/Project.toml b/test/Project.toml index a23db61..b225864 100644 --- a/test/Project.toml +++ 
b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/test/TestNetworkHistogram.jl b/test/TestNetworkHistogram.jl index 00bc3c9..81c3ee3 100644 --- a/test/TestNetworkHistogram.jl +++ b/test/TestNetworkHistogram.jl @@ -20,12 +20,12 @@ function test_swap_revertible( # force recomputation of the log likelihood using default assignment a_new = to_default_assignment(a_test) - @test NH.log_likelihood(a_new, g) ≈ NH.log_likelihood(a_test, g) + @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) # revert the swap and check if the assignment is the same as before NH.revert_swap!(a_test, swap) @test a == a_test - @test NH.log_likelihood(a, g) ≈ NH.log_likelihood(a_test, g) + @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) end end diff --git a/test/runtests.jl b/test/runtests.jl index f4d7e6c..b73af6e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,13 +1,22 @@ using Test +using Aqua include("TestNetworkHistogram.jl") -@testset "Assignment tests" begin - include("assignments/default_assignment.jl") - include("assignments/bernoulli_assignment.jl") - include("assignments/categorical_assignment.jl") -end +@testset "Tests" begin + @testset "Assignment tests" begin + include("assignments/default_assignment.jl") + include("assignments/bernoulli_assignment.jl") + include("assignments/categorical_assignment.jl") + end + + @testset "Rule optimization tests" begin + include("optimisation/config_rules/init_rule.jl") + end + -@testset "Rule optimization tests" begin - include("optimisation/config_rules/init_rule.jl") + @testset "Aqua.jl for package quality" begin + using NetworkHistogram + Aqua.test_all(NetworkHistogram) + end end From 3ea84134339e4c0ed0962f12d8b2dec41618fa29 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 22 Oct 2024 17:58:08 +0200 
Subject: [PATCH 032/266] handle categorical arrays and fix node labels --- Project.toml | 6 +++ src/NetworkHistogram.jl | 6 +-- src/api.jl | 13 ++++++ src/assignments/BernoulliAssignment/struct.jl | 2 +- .../CategoricalAssignment/struct.jl | 29 ++++++++---- src/assignments/CategoricalAssignment/swap.jl | 19 ++++---- src/assignments/group_numbering.jl | 44 ++++++++++++++++--- src/observations.jl | 16 ++++++- src/optimisation/config_rules/InitRule.jl | 7 +-- .../config_rules/bandwidth_selection_rule.jl | 2 +- src/optimisation/config_rules/stop_rule.jl | 2 +- src/optimisation/fit.jl | 4 +- src/sbm.jl | 28 ++++++------ test/TestNetworkHistogram.jl | 1 - test/assignments/categorical_assignment.jl | 9 ++-- test/runtests.jl | 1 - 16 files changed, 130 insertions(+), 59 deletions(-) create mode 100644 src/api.jl diff --git a/Project.toml b/Project.toml index 4e8cde5..d189d65 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,8 @@ version = "0.5.2" [deps] ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" +CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" @@ -19,6 +21,7 @@ Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SimpleWeightedGraphs = "47aef6b3-ad0c-573a-a1e2-d07658019622" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" @@ -27,6 +30,8 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] ArnoldiMethod = "0.4.0" Arpack = "0.5.4" +CategoricalArrays = "0.10.8" +CategoricalDistributions = 
"0.1.15" Clustering = "0.15.7" DataStructures = "0.18.20" DensityInterface = "0.4.0" @@ -40,6 +45,7 @@ Metis = "1.5.0" PermutationSymmetricTensors = "0.2.0" ProgressMeter = "1.7.2" Random = "1.11.0" +SimpleWeightedGraphs = "1.4.0" SparseArrays = "1.11.0" StaticArrays = "1.9.7" StatsAPI = "1.7.0" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index b6c51ea..c676b4c 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -2,7 +2,7 @@ module NetworkHistogram using LinearAlgebra, SparseArrays, DataStructures using Distributions, DensityInterface -using Graphs +using Graphs, SimpleWeightedGraphs using PermutationSymmetricTensors using ProgressMeter: Progress, next!, finish! import StatsBase, Random @@ -16,9 +16,9 @@ import Metis import IterativeSolvers import Clustering import StatsAPI: loglikelihood, fit +using CategoricalArrays, CategoricalDistributions - -export loglikelihood, fit +export loglikelihood, fit include("assignments/Assignments.jl") include("sbm.jl") diff --git a/src/api.jl b/src/api.jl new file mode 100644 index 0000000..9915ec5 --- /dev/null +++ b/src/api.jl @@ -0,0 +1,13 @@ +import MLJModelInterface +const MMI = MLJModelInterface +using PermutationSymmetricTensors +using Distributions + +MMI.@mlj_model mutable struct SBM <: MMI.Probabilistic + k::Int = 1::(_ > 0) + D::Val{<:Distribution} = Val{Bernoulli}() +end + +function MMI.fit(model::SBM, X, y) + return model +end diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index bd37f5e..dba5a7c 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -14,7 +14,7 @@ mutable struct BernoulliData{F} counts::Matrix{Int} realized::Matrix{Int} estimated_theta::Matrix{F} - A::BitMatrix # possible improvement by using an adjacency list Graphs.SimpleGraphs.adj(G) + A::BitMatrix # possible improvement by using an adjacency list log_likelihood::F end diff --git 
a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 115c50c..0650203 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -1,12 +1,13 @@ -mutable struct CategoricalData{M, F} +mutable struct CategoricalData{M, F, C} counts::Matrix{Int} realized::Matrix{MVector{M, Int}} estimated_theta::Matrix{MVector{M, F}} - A::Matrix{Int} # possible use of CategoricalArrays.jl ? + A::Matrix{C} # possible use of CategoricalArrays.jl ? log_likelihood::F end -const CategoricalAssignment{T, M, F} = Assignment{T, CategoricalData{M, F}} +const CategoricalAssignment{T, M, F, C} = Assignment{ + T, CategoricalData{M, F, C}} const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} function CategoricalAssignment( @@ -32,7 +33,7 @@ function make_categorical_data(g, node_labels, group_size) # this is incorrect if the diagonal of the matrix is anything # else than 0, and that no "categories" is represented by 0 - @inbounds @simd for k in 1:number_groups + @inbounds for k in 1:number_groups for l in k:number_groups for m in 1:num_categories if k == l @@ -69,17 +70,29 @@ function compute_log_likelihood( return loglik end +function categorical_matrix(A::CategoricalMatrix) + @info "Converting CategoricalMatrix to matrix" + categories = levels(A) + return levelcode.(recode( + A, [l => i for (i, l) in enumerate(categories)]..., missing => 0)) +end + # to update, just for test now -function categorical_matrix(A) - A_inter = A .- minimum(A) .+ 1 +function categorical_matrix(A::AbstractMatrix{Int}) + min_A = minimum(A) + if min_A > 1 + A_inter = A .- min_A .+ 1 + else + A_inter = copy(A) + end for i in 1:size(A_inter, 1) A_inter[i, i] = 0 end - return A_inter, maximum(A_inter) + return A_inter end function categorical_matrix(g::Observations) - return categorical_matrix(g.graph) + return categorical_matrix(g.graph), length(support(g.dist_ref)) end function 
loglikelihood(a::CategoricalAssignment, g::Observations) diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 3f11e70..a163f97 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -13,8 +13,8 @@ function make_swap(a::CategoricalAssignment, id::Tuple{Int, Int}) end function make_swap!( - swap::CategoricalSwap{M, F}, a::CategoricalAssignment{T, M, F}, - id::Tuple{Int, Int}) where {T, M, F} + swap::CategoricalSwap{M, F}, a::CategoricalAssignment{T, M, F, C}, + id::Tuple{Int, Int}) where {T, M, F, C} swap.index1, swap.index2 = id copy!(swap.realized, a.additional_data.realized) copy!(swap.estimated_theta, a.additional_data.estimated_theta) @@ -22,8 +22,8 @@ function make_swap!( end function revert_swap!( - a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where { - T, M, F} + a::CategoricalAssignment{T, M, F, C}, swap::CategoricalSwap{M, F}) where { + T, M, F, C} swap_node_labels!(a, swap.index1, swap.index2) copy!(a.additional_data.realized, swap.realized) copy!(a.additional_data.estimated_theta, swap.estimated_theta) @@ -31,8 +31,8 @@ function revert_swap!( end function apply_swap!( - a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where { - T, M, F} + a::CategoricalAssignment{T, M, F, C}, swap::CategoricalSwap{M, F}) where { + T, M, F, C} update_observed_and_labels!(a, swap) update_ll!(a) end @@ -44,7 +44,8 @@ function update_ll!(a::CategoricalAssignment) end function fit( - a::CategoricalAssignment{T, M, F}, g::Observations) where {T, M, F} + a::CategoricalAssignment{T, M, F, C}, g::Observations) where { + T, M, F, C} dists = initialize_sbm(a.group_size, Categorical(ones(M) / M)) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) @@ -57,8 +58,8 @@ function fit( end function update_observed_and_labels!( - a::CategoricalAssignment{T, M, F}, swap::CategoricalSwap{M, F}) where { - T, M, F} + 
a::CategoricalAssignment{T, M, F, C}, swap::CategoricalSwap{M, F}) where { + T, M, F, C} g1 = get_group_of_vertex(a, swap.index1) g2 = get_group_of_vertex(a, swap.index2) diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index 4fc13d1..f56b16c 100644 --- a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -13,7 +13,7 @@ struct GroupSize{T} <: AbstractVector{Int} end function GroupSize(number_nodes, standard_group::Integer) - @assert 1 < standard_group < number_nodes + @assert 1 < standard_group <= number_nodes number_groups = number_nodes ÷ standard_group # number of standard groups! if number_groups * standard_group == number_nodes new{Int}(standard_group, number_groups) @@ -38,14 +38,46 @@ Base.@propagate_inbounds function Base.getindex( return i < length(g) ? g.group_number[1] : g.group_number[2] end -function check_compatiblity(g::GroupSize, node_labels) + +function check_compatiblity!(node_labels,g::GroupSize,) counts = StatsBase.countmap(node_labels) if length(counts) != g.number_groups || size(node_labels, 1) != sum(g) - return false + throw(ArgumentError("The vector of node labels is not compatible with the \ + group size: $(length(counts)) != $(g.number_groups) or $(size(node_labels, 1)) \ + != $(sum(g))")) end - for (i, c) in enumerate(g) - if counts[i] != c - return false + unbalanced = any(((k,v),) -> v != g[k], counts) + if unbalanced + @info "The group size is unbalanced, trying to fix it" + g, node_labels = try_fixing_group_size!(node_labels, g) + if any(((k,v),) -> v != g[k], StatsBase.countmap(node_labels)) + throw(ArgumentError("Could not fix the group size")) + else + @info "Fixed the group size by moving nodes between groups" + end + end +end + + +function try_fixing_group_size!(node_labels,g::GroupSize) + counts = StatsBase.countmap(node_labels) + groups_too_small = filter(((k,v),) -> v < g[k], counts) + groups_too_large = filter(((k,v),) -> v > g[k], counts) + amount_too_small = 
sum(g[k] - v for (k, v) in groups_too_small) + amount_too_large = sum(v - g[k] for (k, v) in groups_too_large) + if amount_too_small == amount_too_large + nodes_to_move = [] + for (l,v) in groups_too_large + number_nodes_to_move = v - g[l] + nodes_to_move = vcat(nodes_to_move, findall(x-> x == l,node_labels)[1:number_nodes_to_move]) + end + for (k, v) in groups_too_small + number_nodes_to_move = g[k] - v + for i in 1:number_nodes_to_move + index = popfirst!(nodes_to_move) + node_labels[index] = k + end end end + return g, node_labels end diff --git a/src/observations.jl b/src/observations.jl index 610f009..b1b9516 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -1,4 +1,4 @@ -# +# switch to MetaGraphsNext.jl ? struct Observations{G, D} graph::G dist_ref::D @@ -80,7 +80,19 @@ function normalized_laplacian(g::AbstractMatrix) return L end +function Metis.graph(g::Observations{<:AbstractGraph, <:Bernoulli}) + return Metis.graph(g.graph) +end -function Metis.graph(g::Observations) +function Metis.graph(g::Observations{<:AbstractMatrix, <:Bernoulli}) return Metis.graph(g.graph) end + +function Metis.graph(g::Observations{<:AbstractMatrix, <:Categorical}) + return Metis.graph(adjacency_matrix(SimpleWeightedGraph(g.graph)), weights= true) +end + +function Metis.graph(g::Observations{<:CategoricalMatrix, <:UnivariateFinite}) + A, _ = categorical_matrix(g) + return Metis.graph(adjacency_matrix(SimpleWeightedGraph(A)), weights= true) +end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index ccacc64..27aabd1 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -69,19 +69,16 @@ function initialize_node_labels(g, h, ::MetisStart) group_size = GroupSize(number_nodes(g), h) node_labels = convert.( Int, Metis.partition(Metis.graph(g), length(group_size))) - check_compatiblity(group_size, node_labels) + check_compatiblity!(node_labels, group_size) return group_size, 
node_labels end function initialize_node_labels(g, h, rule::FromAssignment{A}) where {A} group_size = GroupSize(number_nodes(g), h) - check_compatiblity(group_size, rule.assignment.node_labels) + check_compatiblity!(rule.assignment.node_labels, group_size) return group_size, rule.assignment.node_labels end - - - function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) throw(ArgumentError("Not implemented yet, need to finish with Clustering.jl")) group_size = GroupSize(number_nodes(g), h) diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 0d01a3a..6618eba 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -18,7 +18,7 @@ struct EstimatedDegrees <: EstimatedM end """ select_number_node_per_block(g::Observations, rule::KSelectionRule) -How to select the number of blocks `K` for the SBM model. +How to select the number of blocks `K` for the BlockModel model. # Implemented rules - `OracleK(K::Int)`: Use the oracle number of blocks `K`. 
diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl index 850ec66..89575dc 100644 --- a/src/optimisation/config_rules/stop_rule.jl +++ b/src/optimisation/config_rules/stop_rule.jl @@ -5,7 +5,7 @@ end # default score is the log likelihood function score(a::Assignment, g::Observations) - return log_likelihood(a, g) / binomial(number_nodes(a), 2) + return loglikelihood(a, g) / binomial(number_nodes(a), 2) end mutable struct PreviousBestValue{T} <: StopRule diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl index 211db1a..829976a 100644 --- a/src/optimisation/fit.jl +++ b/src/optimisation/fit.jl @@ -17,12 +17,12 @@ function fit_group(distribution, g, edges) return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) end -# method to compute the log likelihood of a SBM fitted according to the assignment +# method to compute the log likelihood of a BlockModel fitted according to the assignment function loglikelihood(a::Assignment, g::Observations) return _log_likelihood(a, fit(a, g), g) end -function _log_likelihood(a::Assignment, sbm::SBM, g) +function _log_likelihood(a::Assignment, sbm::BlockModel, g) log_likelihood = 0.0 for i in 1:number_nodes(a) label_a = a.node_labels[i] diff --git a/src/sbm.jl b/src/sbm.jl index b3e6913..c6d4bcf 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -1,4 +1,4 @@ -struct SBM{T, K, F <: Real} <: AbstractMatrix{T} +struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} sizes::Vector{F} probs::SymmetricTensor{T, K, 2} end @@ -17,7 +17,7 @@ function initialize_sbm(sizes::Vector, dist, k = length(sizes)) n_dims = binomial(k + 1, 2) probs = Vector{typeof(dist)}(undef, n_dims) fill!(probs, dist) - return SBM(sizes, SymmetricTensor(probs, Val(k), Val(2))) + return BlockModel(sizes, SymmetricTensor(probs, Val(k), Val(2))) end function initialize_sbm(sizes::GroupSize, dist, k = length(sizes)) @@ -25,25 +25,25 @@ function initialize_sbm(sizes::GroupSize, dist, k = length(sizes)) 
n_dims = binomial(k + 1, 2) probs = Vector{typeof(dist)}(undef, n_dims) fill!(probs, dist) - return SBM(size_bins, SymmetricTensor(probs, Val(k), Val(2))) + return BlockModel(size_bins, SymmetricTensor(probs, Val(k), Val(2))) end function initialize_sbm(k::Int, dist) return initialize_sbm(ones(k) / k, dist) end -number_blocks(::SBM{T, K}) where {T, K} = K +number_blocks(::BlockModel{T, K, F}) where {T, K, F} = K -Base.size(s::SBM) = size(s.probs) -Base.ndims(::SBM) = 2 -Base.eltype(::SBM{T, K}) where {T, K} = T -Base.setindex!(s::SBM, v, i, j) = setindex!(s.probs, v, i, j) -Base.@propagate_inbounds function Base.getindex(s::SBM, i, j) +Base.size(s::BlockModel) = size(s.probs) +Base.ndims(::BlockModel) = 2 +Base.eltype(::BlockModel{T, K, F}) where {T, K, F} = T +Base.setindex!(s::BlockModel, v, i, j) = setindex!(s.probs, v, i, j) +Base.@propagate_inbounds function Base.getindex(s::BlockModel, i, j) return getindex(s.probs, i, j) end function sample( - rng::Random.AbstractRNG, sbm::SBM, node_labels::Vector{Int}) + rng::Random.AbstractRNG, sbm::BlockModel, node_labels::Vector{Int}) n_nodes = length(node_labels) type_input = eltype(sbm.probs[1, 1]) A = Matrix{type_input}(undef, n_nodes, n_nodes) @@ -57,11 +57,11 @@ function sample( return sparse(A), node_labels end -function sample(sbm::SBM, node_labels::Vector{Int}) +function sample(sbm::BlockModel, node_labels::Vector{Int}) sample(Random.default_rng(), sbm, node_labels) end function sample( - rng::Random.AbstractRNG, sbm::SBM, n_nodes::Int, sorted = true) + rng::Random.AbstractRNG, sbm::BlockModel, n_nodes::Int, sorted = true) n_blocks = number_blocks(sbm) node_labels = StatsBase.sample( rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) @@ -71,4 +71,6 @@ function sample( return sample(rng, sbm, node_labels) end -sample(sbm::SBM, n_nodes::Int) = sample(Random.default_rng(), sbm, n_nodes) +function sample(sbm::BlockModel, n_nodes::Int) + sample(Random.default_rng(), sbm, n_nodes) +end diff --git 
a/test/TestNetworkHistogram.jl b/test/TestNetworkHistogram.jl index 81c3ee3..0018f70 100644 --- a/test/TestNetworkHistogram.jl +++ b/test/TestNetworkHistogram.jl @@ -17,7 +17,6 @@ function test_swap_revertible( NH.get_group_of_vertex(a_test, swap.index2) @test NH.get_group_of_vertex(a, swap.index2) == NH.get_group_of_vertex(a_test, swap.index1) - # force recomputation of the log likelihood using default assignment a_new = to_default_assignment(a_test) @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl index 1b5697d..4141cd3 100644 --- a/test/assignments/categorical_assignment.jl +++ b/test/assignments/categorical_assignment.jl @@ -12,12 +12,9 @@ import NetworkHistogram as NH n = 12 k = 3 dist = Categorical(p) - A = Symmetric(Random.rand(dist, n, n)) - # set the diagonal to 0 - for i in 1:n - A[i, i] = 0 - end - obs = NH.Observations(A, dist) + sbm = NH.initialize_sbm(ones(k) ./ k, dist) + A, _ = NH.sample(sbm, repeat(1:k, inner = n ÷ k)) + obs = NH.Observations(collect(A), dist) node_labels = repeat(1:k, inner = n ÷ k) a = NH.CategoricalAssignment(obs, NH.GroupSize(n, n ÷ k), node_labels) swap = NH.make_swap(a, (1, k + 1)) diff --git a/test/runtests.jl b/test/runtests.jl index b73af6e..cba3641 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,7 +14,6 @@ include("TestNetworkHistogram.jl") include("optimisation/config_rules/init_rule.jl") end - @testset "Aqua.jl for package quality" begin using NetworkHistogram Aqua.test_all(NetworkHistogram) From ad9d78c607c2da8da0256297470325ae1552b79c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 22 Oct 2024 17:58:45 +0200 Subject: [PATCH 033/266] format --- src/assignments/group_numbering.jl | 19 +++++++++---------- src/observations.jl | 6 ++++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index f56b16c..e4546f8 
100644 --- a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -38,19 +38,18 @@ Base.@propagate_inbounds function Base.getindex( return i < length(g) ? g.group_number[1] : g.group_number[2] end - -function check_compatiblity!(node_labels,g::GroupSize,) +function check_compatiblity!(node_labels, g::GroupSize) counts = StatsBase.countmap(node_labels) if length(counts) != g.number_groups || size(node_labels, 1) != sum(g) throw(ArgumentError("The vector of node labels is not compatible with the \ group size: $(length(counts)) != $(g.number_groups) or $(size(node_labels, 1)) \ != $(sum(g))")) end - unbalanced = any(((k,v),) -> v != g[k], counts) + unbalanced = any(((k, v),) -> v != g[k], counts) if unbalanced @info "The group size is unbalanced, trying to fix it" g, node_labels = try_fixing_group_size!(node_labels, g) - if any(((k,v),) -> v != g[k], StatsBase.countmap(node_labels)) + if any(((k, v),) -> v != g[k], StatsBase.countmap(node_labels)) throw(ArgumentError("Could not fix the group size")) else @info "Fixed the group size by moving nodes between groups" @@ -58,18 +57,18 @@ function check_compatiblity!(node_labels,g::GroupSize,) end end - -function try_fixing_group_size!(node_labels,g::GroupSize) +function try_fixing_group_size!(node_labels, g::GroupSize) counts = StatsBase.countmap(node_labels) - groups_too_small = filter(((k,v),) -> v < g[k], counts) - groups_too_large = filter(((k,v),) -> v > g[k], counts) + groups_too_small = filter(((k, v),) -> v < g[k], counts) + groups_too_large = filter(((k, v),) -> v > g[k], counts) amount_too_small = sum(g[k] - v for (k, v) in groups_too_small) amount_too_large = sum(v - g[k] for (k, v) in groups_too_large) if amount_too_small == amount_too_large nodes_to_move = [] - for (l,v) in groups_too_large + for (l, v) in groups_too_large number_nodes_to_move = v - g[l] - nodes_to_move = vcat(nodes_to_move, findall(x-> x == l,node_labels)[1:number_nodes_to_move]) + nodes_to_move = vcat(nodes_to_move, 
+ findall(x -> x == l, node_labels)[1:number_nodes_to_move]) end for (k, v) in groups_too_small number_nodes_to_move = g[k] - v diff --git a/src/observations.jl b/src/observations.jl index b1b9516..989bb45 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -89,10 +89,12 @@ function Metis.graph(g::Observations{<:AbstractMatrix, <:Bernoulli}) end function Metis.graph(g::Observations{<:AbstractMatrix, <:Categorical}) - return Metis.graph(adjacency_matrix(SimpleWeightedGraph(g.graph)), weights= true) + return Metis.graph( + adjacency_matrix(SimpleWeightedGraph(g.graph)), weights = true) end function Metis.graph(g::Observations{<:CategoricalMatrix, <:UnivariateFinite}) A, _ = categorical_matrix(g) - return Metis.graph(adjacency_matrix(SimpleWeightedGraph(A)), weights= true) + return Metis.graph( + adjacency_matrix(SimpleWeightedGraph(A)), weights = true) end From c3bd492a8a27a8da2b0c2ec973741f849b2f1f42 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 23 Oct 2024 17:28:29 +0200 Subject: [PATCH 034/266] bugfix: swap being overwritten when updating assignment --- src/assignments/Assignments.jl | 8 +- .../CategoricalAssignment/struct.jl | 68 +++++++----- src/assignments/CategoricalAssignment/swap.jl | 98 ++++++++++++----- src/assignments/group_numbering.jl | 2 +- src/observations.jl | 4 +- src/optimisation/config_rules/InitRule.jl | 2 +- src/optimisation/config_rules/accept_rule.jl | 2 +- src/optimisation/least_squares.jl | 12 +-- test/assignments/categorical_assignment.jl | 102 ++++++++++++++++-- 9 files changed, 226 insertions(+), 72 deletions(-) diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index b47be5a..c9028d9 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -74,8 +74,12 @@ function get_group_of_vertex(assignment::Assignment, vertex) end function get_edge_indices(a::Assignment, i, j) - return [(x, y) for x in get_vertex_in_group(a, i) - for y in get_vertex_in_group(a, j) if x < y] + if i 
== j + return get_edge_indices(a, i) + else + return [(x,y) for x in get_vertex_in_group(a, i) + for y in get_vertex_in_group(a, j)] + end end function get_edge_indices(a::Assignment, i) diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 0650203..e1da0ac 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -20,51 +20,64 @@ function make_assignment(g, h, init_rule::CategoricalInitRule) group_size, node_labels = initialize_node_labels( g, h, init_rule.starting_assignment_rule) - return CategoricalAssignment(g, group_size, node_labels) + a = CategoricalAssignment(deepcopy(g), group_size, node_labels) + @show a.additional_data.log_likelihood + ll_test = force_recompute_ll(a, g) + @show ll_test + return a end function make_categorical_data(g, node_labels, group_size) number_groups = length(group_size) - n = length(node_labels) A, num_categories = categorical_matrix(g) counts = zeros(Int, number_groups, number_groups) realized = [MVector{num_categories}(zeros(Int, num_categories)) for _ in 1:number_groups, _ in 1:number_groups] - # this is incorrect if the diagonal of the matrix is anything - # else than 0, and that no "categories" is represented by 0 - @inbounds for k in 1:number_groups - for l in k:number_groups - for m in 1:num_categories - if k == l - c = group_size[k] * (group_size[k] - 1) ÷ 2 - r = sum(A[node_labels .== k, node_labels .== l] .== m) ÷ 2 - else - c = group_size[k] * group_size[l] - r = sum(A[node_labels .== k, node_labels .== l] .== m) - end - realized[k, l][m] = r - realized[l, k][m] = r - counts[k, l] = c - counts[l, k] = c - end - end - end + _count_cat_occurences!(counts, realized, g, Assignment(group_size, node_labels)) estimated_theta = realized ./ counts ll = compute_log_likelihood(estimated_theta, realized) return CategoricalData(counts, realized, estimated_theta, A, ll) end +function 
_count_cat_occurences!(counts, realized, g, a_dummy) + @inbounds for k in 1:number_groups(a_dummy) + for l in k:number_groups(a_dummy) + counts_dict = StatsBase.countmap(get_obs.( + Ref(g), get_edge_indices(a_dummy, k, l))) + total = 0 + for (m, v) in counts_dict + realized[k, l][m] = v + realized[l, k][m] = v + total += v + end + counts[k, l] = total + counts[l, k] = total + end + end +end + + +function recount_occurences!(a) + _count_cat_occurences!(a.additional_data.counts, a.additional_data.realized, a.additional_data.A, a) + return nothing +end + function compute_log_likelihood( - estimated_theta::AbstractMatrix{MVector{M, T}}, counts::AbstractMatrix{F}) where { + estimated_theta::AbstractMatrix{MVector{M, T}}, realized::AbstractMatrix{F}) where { M, T, F} loglik = zero(T) number_groups = size(estimated_theta, 1) @inbounds for j in 1:number_groups - @simd for i in j:number_groups - c = counts[i, j] - loglik += sum(xlogx.(estimated_theta[i, j]) .* c) + for i in j:number_groups + for m in 1:length(estimated_theta[i, j]) + if realized[i, j][m] != 0 + loglik += realized[i, j][m] * log(estimated_theta[i, j][m]) + end + end + #loglik += sum(log.(estimated_theta[i, j]) .* realized[i, j]) + #loglik += sum(xlogy.(realized[i,j], estimated_theta[i, j]) ) end end return loglik @@ -99,4 +112,9 @@ function loglikelihood(a::CategoricalAssignment, g::Observations) return a.additional_data.log_likelihood end +function force_recompute_ll(a::CategoricalAssignment, g::Observations) + a_simple = Assignment(a.group_size, a.node_labels) + return loglikelihood(a_simple, g) +end + include("swap.jl") diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index a163f97..4549259 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -12,22 +12,39 @@ function make_swap(a::CategoricalAssignment, id::Tuple{Int, Int}) a.additional_data.log_likelihood) end +function 
deep_copy_matrix_of_vec!(container, source) + for i in eachindex(container) + container[i] = copy(source[i]) + end +end + +function copy_realized_and_theta!(a,b) + deep_copy_matrix_of_vec!(a.realized, b.realized) + deep_copy_matrix_of_vec!(a.estimated_theta, b.estimated_theta) + a.log_likelihood = b.log_likelihood + return nothing +end + function make_swap!( swap::CategoricalSwap{M, F}, a::CategoricalAssignment{T, M, F, C}, id::Tuple{Int, Int}) where {T, M, F, C} swap.index1, swap.index2 = id - copy!(swap.realized, a.additional_data.realized) - copy!(swap.estimated_theta, a.additional_data.estimated_theta) - swap.log_likelihood = a.additional_data.log_likelihood + copy_realized_and_theta!(swap, a.additional_data) + #copy!.(swap.realized, a.additional_data.realized) + #copy!.(swap.estimated_theta, a.additional_data.estimated_theta) + #swap.log_likelihood = a.additional_data.log_likelihood + #return nothing end function revert_swap!( a::CategoricalAssignment{T, M, F, C}, swap::CategoricalSwap{M, F}) where { T, M, F, C} swap_node_labels!(a, swap.index1, swap.index2) - copy!(a.additional_data.realized, swap.realized) - copy!(a.additional_data.estimated_theta, swap.estimated_theta) - a.additional_data.log_likelihood = swap.log_likelihood + copy_realized_and_theta!(a.additional_data, swap) + #copy!.(a.additional_data.realized, swap.realized) + #copy!.(a.additional_data.estimated_theta, swap.estimated_theta) + #a.additional_data.log_likelihood = swap.log_likelihood + #return nothing end function apply_swap!( @@ -39,7 +56,7 @@ end function update_ll!(a::CategoricalAssignment) a.additional_data.log_likelihood = compute_log_likelihood( - a.additional_data.estimated_theta, a.additional_data.counts) + a.additional_data.estimated_theta, a.additional_data.realized) return nothing end @@ -68,33 +85,42 @@ function update_observed_and_labels!( realized_g1 = @view a.additional_data.realized[:, g1] realized_g2 = @view a.additional_data.realized[:, g2] - @inbounds @fastmath for i in 
axes(a.additional_data.A, 1) - index_1 = adj_1[i] - index_2 = adj_2[i] - if i == swap.index1 || i == swap.index2 || index_1 == index_2 + for i in axes(a.additional_data.A, 1) + if i == swap.index1 || i == swap.index2 + continue + end + obs_1 = adj_1[i] + obs_2 = adj_2[i] + group_inter = get_group_of_vertex(a, i) + if obs_1 != obs_2 + _fast_update!!(a.additional_data.realized, g1, g2, obs_1, obs_2, group_inter) + end - else - group_inter = get_group_of_vertex(a, i) + # if i == swap.index1 || i == swap.index2 || obs_1 == obs_2 + # continue + # else - a_g1_g_inter = a.additional_data.realized[g1, group_inter] - a_g2_g_inter = a.additional_data.realized[g2, group_inter] - a_g_inter_g1 = realized_g1[group_inter] - a_g_inter_g2 = realized_g2[group_inter] - # send from group 1 to group 2 - a_g1_g_inter[index_1] -= 1 - a_g_inter_g1[index_1] -= 1 + # a_g1_g_inter = a.additional_data.realized[g1, group_inter] + # a_g2_g_inter = a.additional_data.realized[g2, group_inter] + # a_g_inter_g1 = realized_g1[group_inter] + # a_g_inter_g2 = realized_g2[group_inter] - a_g2_g_inter[index_2] += 1 - a_g_inter_g2[index_2] += 1 + # # send from group 1 to group 2 + # a_g1_g_inter[obs_1] -= 1 + # a_g_inter_g1[obs_1] = a_g1_g_inter[obs_1] - # send from group 2 to group 1 - a_g2_g_inter[index_2] -= 1 - a_g_inter_g2[index_2] -= 1 + # a_g2_g_inter[obs_1] += 1 + # a_g_inter_g2[obs_1] = a_g2_g_inter[obs_1] - a_g1_g_inter[index_1] += 1 - a_g_inter_g1[index_1] += 1 - end + + # # send from group 2 to group 1 + # a_g2_g_inter[obs_2] -= 1 + # a_g_inter_g2[obs_2] = a_g2_g_inter[obs_2] + + # a_g1_g_inter[obs_2] += 1 + # a_g_inter_g1[obs_2] = a_g1_g_inter[obs_2] + # end end _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, @@ -106,6 +132,22 @@ function update_observed_and_labels!( return nothing end + +function _fast_update!!(realized, g1, g2, obs_1, obs_2, g_inter) + realized[g1, g_inter][obs_1] -= 1 + realized[g_inter, g1][obs_1] = realized[g1, g_inter][obs_1] + + 
realized[g2,g_inter][obs_1] += 1 + realized[g_inter, g2][obs_1] = realized[g2,g_inter][obs_1] + + # send from group 2 to group 1 + realized[g2,g_inter][obs_2] -= 1 + realized[g_inter, g2][obs_2] = realized[g2,g_inter][obs_2] + + realized[g1, g_inter][obs_2] += 1 + realized[g_inter,g1][obs_2] = realized[g1, g_inter][obs_2] +end + function _fast_div!(theta, realized, counts) for j in axes(theta, 2) for i in axes(theta, 1) diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index e4546f8..49e489c 100644 --- a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -47,7 +47,7 @@ function check_compatiblity!(node_labels, g::GroupSize) end unbalanced = any(((k, v),) -> v != g[k], counts) if unbalanced - @info "The group size is unbalanced, trying to fix it" + @info "The group size is unbalanced, trying to fix it : $(counts)" g, node_labels = try_fixing_group_size!(node_labels, g) if any(((k, v),) -> v != g[k], StatsBase.countmap(node_labels)) throw(ArgumentError("Could not fix the group size")) diff --git a/src/observations.jl b/src/observations.jl index 989bb45..8a1c0e3 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -36,7 +36,7 @@ function density(g::AbstractGraph) return Graphs.density(g) end -function density(g::AbstractMatrix{Bool}) +function density(g::AbstractMatrix) return sum(g) / ((size(g, 1) * (size(g, 1) - 1))) end @@ -85,7 +85,7 @@ function Metis.graph(g::Observations{<:AbstractGraph, <:Bernoulli}) end function Metis.graph(g::Observations{<:AbstractMatrix, <:Bernoulli}) - return Metis.graph(g.graph) + return Metis.graph(SimpleGraph(g.graph)) end function Metis.graph(g::Observations{<:AbstractMatrix, <:Categorical}) diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 27aabd1..8c22b60 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -54,7 +54,7 @@ function initialize_node_labels(g, 
h, ::SpectralStart) laplacian = normalized_laplacian(g) _, eigenvectors = Arpack.eigs(laplacian, nev = 2, which = :LR) # get 2nd eigenvector, sort its components - indices = sortperm(eigenvectors[:, 1]) + indices = sortperm(real.(eigenvectors[:, 1])) # bin them into groups of correct size start = 1 for (i, group) in enumerate(group_size) diff --git a/src/optimisation/config_rules/accept_rule.jl b/src/optimisation/config_rules/accept_rule.jl index 4f864f7..81b0059 100644 --- a/src/optimisation/config_rules/accept_rule.jl +++ b/src/optimisation/config_rules/accept_rule.jl @@ -21,7 +21,7 @@ function accept_reject_update!(a::Assignment, swap::Swap, g, ::Strict) # calculate the score of the new assignment new_score = score(a, g) # if the new assignment is worse, revert the swap - if new_score < current_score + if new_score <= current_score revert_swap!(a, swap) end end diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index 703368d..0424ee9 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -1,11 +1,11 @@ include("config_rules/include.jl") -function optimize(g, h = select_bandwidth(g); - max_iter::Int = 1000, - initialise_rule::InitRule = InitRule(OrderedStart(), nothing), +function optimize(g, h = select_number_node_per_block(g, EstimatedDegrees()); + max_iter::Int = 10_000, + initialise_rule::InitRule = InitRule(RandomStart(), nothing), swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = PreviousBestValue(10), + stop_rule::StopRule = PreviousBestValue(1000), progress_bar::Bool = false ) a = make_assignment(g, h, initialise_rule) @@ -15,10 +15,10 @@ function optimize(g, h = select_bandwidth(g); return a end -function greedy_improve!(a::Assignment, g; max_iter::Int = 1000, +function greedy_improve!(a::Assignment, g; max_iter::Int = 10_000, swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = 
PreviousBestValue(10), + stop_rule::StopRule = PreviousBestValue(1000), progress_bar::Bool = false ) # swap memory allocation diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl index 4141cd3..8b669df 100644 --- a/test/assignments/categorical_assignment.jl +++ b/test/assignments/categorical_assignment.jl @@ -3,20 +3,110 @@ import NetworkHistogram as NH @testset "test conversion to categorical observations" begin end @testset "test Categorical swap" begin - using ..TestNetworkHistogram: test_swap_revertible + using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment using Distributions: Categorical using LinearAlgebra: Symmetric import Random - m = 5 + m = 2 p = ones(m) ./ m n = 12 - k = 3 + k = 4 dist = Categorical(p) sbm = NH.initialize_sbm(ones(k) ./ k, dist) A, _ = NH.sample(sbm, repeat(1:k, inner = n ÷ k)) - obs = NH.Observations(collect(A), dist) + g = NH.Observations(collect(A), dist) node_labels = repeat(1:k, inner = n ÷ k) - a = NH.CategoricalAssignment(obs, NH.GroupSize(n, n ÷ k), node_labels) + a = NH.CategoricalAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) swap = NH.make_swap(a, (1, k + 1)) - test_swap_revertible(a, swap, obs) + @test A[:, 1] != A[:, k + 1] + a_test = deepcopy(a) + NH.apply_swap!(a_test, swap) + @test NH.get_group_of_vertex(a, swap.index1) == + NH.get_group_of_vertex(a_test, swap.index2) + @test NH.get_group_of_vertex(a, swap.index2) == + NH.get_group_of_vertex(a_test, swap.index1) + # force recomputation of the log likelihood using default assignment + a_new = to_default_assignment(a_test) + @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) + @test a_test.additional_data.realized != a.additional_data.realized + @test a_test.additional_data.estimated_theta != a.additional_data.estimated_theta + @test a_test.additional_data.log_likelihood != a.additional_data.log_likelihood + # revert the swap and check if the assignment is the same as before + 
NH.revert_swap!(a_test, swap) + @test a == a_test + @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) +end + +@testset "fast update test" begin + using Distributions + realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; + [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + counts = [1 4 4 + 4 1 4 + 4 4 1] + A = [0 1 2 2 3 3 + 1 0 2 2 3 3 + 2 2 0 1 3 3 + 2 2 1 0 3 3 + 3 3 3 3 0 1 + 3 3 3 3 1 0] + groupsize = NH.GroupSize(6, 2) + node_labels = [1, 1, 2, 2, 3, 3] + g = NH.Observations(A, Categorical(3)) + a = NH.CategoricalAssignment(g, groupsize, node_labels) + for index in eachindex(realized) + @test all(realized[index] .== a.additional_data.realized[index]) + end + @test loglikelihood(a, g) ≈ 0 + @test a.additional_data.counts == counts + swap_id = (1, 3) + realized_after_swap = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; + [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + swap = NH.make_swap(a, swap_id) + NH.apply_swap!(a, swap) + for index in eachindex(realized_after_swap) + @test all(realized_after_swap[index] .== a.additional_data.realized[index]) + @test all(a.additional_data.estimated_theta[index] .≈ + realized_after_swap[index] ./ counts[index]) + end + @test loglikelihood(a, g) == 4 * log(0.5) +end + +#todo: test ll against categorical likelihood on basic assignment +@testset "test swap is not overwritten" begin + A = [0 4 4 2 1 2 2 3 4 2 3 1 4 1 1 3 4 4 3 3 + 4 0 4 2 4 2 1 1 1 3 3 1 1 1 3 3 4 2 1 4 + 4 4 0 1 2 4 2 2 1 3 2 3 1 2 3 2 3 4 1 1 + 2 2 1 0 2 1 2 2 2 3 1 1 3 3 3 3 3 1 1 2 + 1 4 2 2 0 4 1 4 3 2 4 3 4 3 1 3 1 1 1 3 + 2 2 4 1 4 0 2 3 1 3 1 4 3 3 1 3 1 3 3 3 + 2 1 2 2 1 2 0 3 2 2 1 1 1 3 3 1 1 3 1 1 + 3 1 2 2 4 3 3 0 4 3 2 3 1 1 1 1 1 3 2 1 + 4 1 1 2 3 1 2 4 0 3 1 1 1 3 2 1 3 1 4 1 + 2 3 3 3 2 3 2 3 3 0 1 3 1 1 3 1 3 1 1 4 + 3 3 2 1 4 1 1 2 1 1 0 2 3 2 2 1 2 2 1 3 + 1 1 3 1 3 4 1 3 1 3 2 0 4 4 2 2 2 3 1 1 + 4 1 1 3 4 3 1 1 1 1 3 4 0 2 2 1 2 1 1 3 + 1 1 2 3 3 3 3 1 3 1 2 4 2 0 1 2 1 2 1 1 + 1 3 3 3 
1 1 3 1 2 3 2 2 2 1 0 2 1 2 1 1 + 3 3 2 3 3 3 1 1 1 1 1 2 1 2 2 0 1 1 1 3 + 4 4 3 3 1 1 1 1 3 3 2 2 2 1 1 1 0 1 1 1 + 4 2 4 1 1 3 3 3 1 1 2 3 1 2 2 1 1 0 1 1 + 3 1 1 1 1 3 1 2 4 1 1 1 1 1 1 1 1 1 0 1 + 3 4 1 2 3 3 1 1 1 4 3 1 3 1 1 3 1 1 1 0] + g = NH.Observations(A, Categorical(4)) + h = 6 + a = NH.make_assignment(g, h, NH.InitRule(NH.OrderedStart(), Val{NH.CategoricalData}())) + a_ref = deepcopy(a) + swap_indices = [(18, 5), (15, 10), (5, 13)] + swap = NH.make_swap(a, swap_indices[1]) + for swap_index in swap_indices + NH.make_swap!(swap, a, swap_index) + NH.apply_swap!(a, swap) + @assert swap.realized == a_ref.additional_data.realized + @assert swap.estimated_theta == a_ref.additional_data.estimated_theta + NH.revert_swap!(a, swap) + end end From 0c4295509f5917b7af8020395de954c56b1f8686 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 23 Oct 2024 17:29:57 +0200 Subject: [PATCH 035/266] remove debug statement --- src/assignments/CategoricalAssignment/struct.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index e1da0ac..55860ef 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -21,9 +21,6 @@ function make_assignment(g, h, init_rule::CategoricalInitRule) node_labels = initialize_node_labels( g, h, init_rule.starting_assignment_rule) a = CategoricalAssignment(deepcopy(g), group_size, node_labels) - @show a.additional_data.log_likelihood - ll_test = force_recompute_ll(a, g) - @show ll_test return a end From 03470623d5bc5f413d9699fa51b24f34ab790c69 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 23 Oct 2024 19:32:40 +0200 Subject: [PATCH 036/266] fix speed and correcteness issue in categorical case --- src/NetworkHistogram.jl | 1 - src/assignments/Assignments.jl | 2 +- src/assignments/BernoulliAssignment/swap.jl | 4 +- .../CategoricalAssignment/struct.jl | 46 ++++++----- 
src/assignments/CategoricalAssignment/swap.jl | 78 ++++++++----------- src/optimisation/config_rules/accept_rule.jl | 4 +- src/optimisation/config_rules/swap_rule.jl | 7 +- src/optimisation/least_squares.jl | 4 - src/optimisation/swap.jl | 4 +- test/assignments/categorical_assignment.jl | 39 ++++++---- test/runtests.jl | 8 +- 11 files changed, 101 insertions(+), 96 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index c676b4c..82f3567 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -17,7 +17,6 @@ import IterativeSolvers import Clustering import StatsAPI: loglikelihood, fit using CategoricalArrays, CategoricalDistributions - export loglikelihood, fit include("assignments/Assignments.jl") diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index c9028d9..18b4023 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -77,7 +77,7 @@ function get_edge_indices(a::Assignment, i, j) if i == j return get_edge_indices(a, i) else - return [(x,y) for x in get_vertex_in_group(a, i) + return [(x, y) for x in get_vertex_in_group(a, i) for y in get_vertex_in_group(a, j)] end end diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl index 8a06f4f..edfd9a3 100644 --- a/src/assignments/BernoulliAssignment/swap.jl +++ b/src/assignments/BernoulliAssignment/swap.jl @@ -7,14 +7,14 @@ mutable struct BernoulliSwap{F} <: Swap end function make_swap( - a::BernoulliAssignment{T, F}, id::Tuple{Int, Int}) where {T, F} + a::BernoulliAssignment{T, F}, id) where {T, F} return BernoulliSwap(id[1], id[2], copy(a.additional_data.realized), copy(a.additional_data.estimated_theta), a.additional_data.log_likelihood) end function make_swap!(swap::BernoulliSwap{F}, a::BernoulliAssignment{T, F}, - id::Tuple{Int, Int}) where {T, F} + id) where {T, F} swap.index1, swap.index2 = id copy!(swap.realized, a.additional_data.realized) copy!(swap.estimated_theta, 
a.additional_data.estimated_theta) diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 55860ef..7232557 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -1,13 +1,13 @@ -mutable struct CategoricalData{M, F, C} +mutable struct CategoricalData{F, C} counts::Matrix{Int} - realized::Matrix{MVector{M, Int}} - estimated_theta::Matrix{MVector{M, F}} + realized::Array{Int, 3} + estimated_theta::Array{F, 3} A::Matrix{C} # possible use of CategoricalArrays.jl ? - log_likelihood::F + log_likelihood::F # need to remove this type end -const CategoricalAssignment{T, M, F, C} = Assignment{ - T, CategoricalData{M, F, C}} +const CategoricalAssignment{T, F, C} = Assignment{ + T, CategoricalData{F, C}} const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} function CategoricalAssignment( @@ -20,7 +20,7 @@ function make_assignment(g, h, init_rule::CategoricalInitRule) group_size, node_labels = initialize_node_labels( g, h, init_rule.starting_assignment_rule) - a = CategoricalAssignment(deepcopy(g), group_size, node_labels) + a = CategoricalAssignment(g, group_size, node_labels) return a end @@ -28,12 +28,15 @@ function make_categorical_data(g, node_labels, group_size) number_groups = length(group_size) A, num_categories = categorical_matrix(g) counts = zeros(Int, number_groups, number_groups) - realized = [MVector{num_categories}(zeros(Int, num_categories)) - for _ in 1:number_groups, _ in 1:number_groups] + realized = zeros(Int, num_categories, number_groups, number_groups) + estimated_theta = zeros( + Float64, num_categories, number_groups, number_groups) - _count_cat_occurences!(counts, realized, g, Assignment(group_size, node_labels)) + _count_cat_occurences!( + counts, realized, g, Assignment(group_size, node_labels)) + + _fast_div!(estimated_theta, realized, counts) - estimated_theta = realized ./ counts ll = 
compute_log_likelihood(estimated_theta, realized) return CategoricalData(counts, realized, estimated_theta, A, ll) end @@ -45,8 +48,8 @@ function _count_cat_occurences!(counts, realized, g, a_dummy) Ref(g), get_edge_indices(a_dummy, k, l))) total = 0 for (m, v) in counts_dict - realized[k, l][m] = v - realized[l, k][m] = v + realized[m, k, l] = v + realized[m, l, k] = v total += v end counts[k, l] = total @@ -55,22 +58,23 @@ function _count_cat_occurences!(counts, realized, g, a_dummy) end end - function recount_occurences!(a) - _count_cat_occurences!(a.additional_data.counts, a.additional_data.realized, a.additional_data.A, a) + _count_cat_occurences!( + a.additional_data.counts, a.additional_data.realized, a.additional_data.A, a) return nothing end function compute_log_likelihood( - estimated_theta::AbstractMatrix{MVector{M, T}}, realized::AbstractMatrix{F}) where { - M, T, F} + estimated_theta::Array{T, 3}, realized::Array{F, 3}) where { + T, F} loglik = zero(T) - number_groups = size(estimated_theta, 1) + number_groups = size(estimated_theta, 2) + number_decorations = size(estimated_theta, 1) @inbounds for j in 1:number_groups for i in j:number_groups - for m in 1:length(estimated_theta[i, j]) - if realized[i, j][m] != 0 - loglik += realized[i, j][m] * log(estimated_theta[i, j][m]) + for m in 1:number_decorations + if realized[m, i, j] != 0 + loglik += realized[m, i, j] * log(estimated_theta[m, i, j]) end end #loglik += sum(log.(estimated_theta[i, j]) .* realized[i, j]) diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 4549259..e949c15 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -1,33 +1,27 @@ -mutable struct CategoricalSwap{M, F} <: Swap +mutable struct CategoricalSwap{F} <: Swap index1::Int index2::Int - realized::Matrix{MVector{M, Int}} - estimated_theta::Matrix{MVector{M, F}} + realized::Array{Int, 3} + 
estimated_theta::Array{F, 3} log_likelihood::F end -function make_swap(a::CategoricalAssignment, id::Tuple{Int, Int}) +function make_swap(a::CategoricalAssignment, id) return CategoricalSwap(id[1], id[2], copy(a.additional_data.realized), copy(a.additional_data.estimated_theta), a.additional_data.log_likelihood) end -function deep_copy_matrix_of_vec!(container, source) - for i in eachindex(container) - container[i] = copy(source[i]) - end -end - -function copy_realized_and_theta!(a,b) - deep_copy_matrix_of_vec!(a.realized, b.realized) - deep_copy_matrix_of_vec!(a.estimated_theta, b.estimated_theta) +function copy_realized_and_theta!(a, b) + copy!(a.realized, b.realized) + copy!(a.estimated_theta, b.estimated_theta) a.log_likelihood = b.log_likelihood return nothing end function make_swap!( - swap::CategoricalSwap{M, F}, a::CategoricalAssignment{T, M, F, C}, - id::Tuple{Int, Int}) where {T, M, F, C} + swap::CategoricalSwap{F}, a::CategoricalAssignment{T, F, C}, + id) where {T, F, C} swap.index1, swap.index2 = id copy_realized_and_theta!(swap, a.additional_data) #copy!.(swap.realized, a.additional_data.realized) @@ -37,8 +31,8 @@ function make_swap!( end function revert_swap!( - a::CategoricalAssignment{T, M, F, C}, swap::CategoricalSwap{M, F}) where { - T, M, F, C} + a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { + T, F, C} swap_node_labels!(a, swap.index1, swap.index2) copy_realized_and_theta!(a.additional_data, swap) #copy!.(a.additional_data.realized, swap.realized) @@ -48,8 +42,8 @@ function revert_swap!( end function apply_swap!( - a::CategoricalAssignment{T, M, F, C}, swap::CategoricalSwap{M, F}) where { - T, M, F, C} + a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { + T, F, C} update_observed_and_labels!(a, swap) update_ll!(a) end @@ -61,13 +55,14 @@ function update_ll!(a::CategoricalAssignment) end function fit( - a::CategoricalAssignment{T, M, F, C}, g::Observations) where { - T, M, F, C} - dists = 
initialize_sbm(a.group_size, Categorical(ones(M) / M)) + a::CategoricalAssignment{T, F, C}, g::Observations) where { + T, F, C} + dists = initialize_sbm( + a.group_size, Categorical(length(support(g.dist_ref)))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) dists[group1, - group2] = Categorical(a.additional_data.estimated_theta[ + group2] = Categorical(a.additional_data.estimated_theta[:, group1, group2]) end end @@ -75,15 +70,13 @@ function fit( end function update_observed_and_labels!( - a::CategoricalAssignment{T, M, F, C}, swap::CategoricalSwap{M, F}) where { - T, M, F, C} + a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { + T, F, C} g1 = get_group_of_vertex(a, swap.index1) g2 = get_group_of_vertex(a, swap.index2) adj_1 = @view a.additional_data.A[:, swap.index1] adj_2 = @view a.additional_data.A[:, swap.index2] - realized_g1 = @view a.additional_data.realized[:, g1] - realized_g2 = @view a.additional_data.realized[:, g2] for i in axes(a.additional_data.A, 1) if i == swap.index1 || i == swap.index2 @@ -93,14 +86,14 @@ function update_observed_and_labels!( obs_2 = adj_2[i] group_inter = get_group_of_vertex(a, i) if obs_1 != obs_2 - _fast_update!!(a.additional_data.realized, g1, g2, obs_1, obs_2, group_inter) + _fast_update!!( + a.additional_data.realized, g1, g2, obs_1, obs_2, group_inter) end # if i == swap.index1 || i == swap.index2 || obs_1 == obs_2 # continue # else - # a_g1_g_inter = a.additional_data.realized[g1, group_inter] # a_g2_g_inter = a.additional_data.realized[g2, group_inter] # a_g_inter_g1 = realized_g1[group_inter] @@ -113,7 +106,6 @@ function update_observed_and_labels!( # a_g2_g_inter[obs_1] += 1 # a_g_inter_g2[obs_1] = a_g2_g_inter[obs_1] - # # send from group 2 to group 1 # a_g2_g_inter[obs_2] -= 1 # a_g_inter_g2[obs_2] = a_g2_g_inter[obs_2] @@ -132,28 +124,26 @@ function update_observed_and_labels!( return nothing end - function _fast_update!!(realized, g1, g2, obs_1, obs_2, g_inter) - realized[g1, 
g_inter][obs_1] -= 1 - realized[g_inter, g1][obs_1] = realized[g1, g_inter][obs_1] + realized[obs_1, g1, g_inter] -= 1 + realized[obs_1, g_inter, g1] = realized[obs_1, g1, g_inter] - realized[g2,g_inter][obs_1] += 1 - realized[g_inter, g2][obs_1] = realized[g2,g_inter][obs_1] + realized[obs_1, g2, g_inter] += 1 + realized[obs_1, g_inter, g2] = realized[obs_1, g2, g_inter] # send from group 2 to group 1 - realized[g2,g_inter][obs_2] -= 1 - realized[g_inter, g2][obs_2] = realized[g2,g_inter][obs_2] + realized[obs_2, g2, g_inter] -= 1 + realized[obs_2, g_inter, g2] = realized[obs_2, g2, g_inter] - realized[g1, g_inter][obs_2] += 1 - realized[g_inter,g1][obs_2] = realized[g1, g_inter][obs_2] + realized[obs_2, g1, g_inter] += 1 + realized[obs_2, g_inter, g1] = realized[obs_2, g1, g_inter] end function _fast_div!(theta, realized, counts) - for j in axes(theta, 2) - for i in axes(theta, 1) - t = theta[i, j] - for k in axes(t, 1) - theta[i, j][k] = realized[i, j][k] / counts[i, j] + for j in axes(theta, 3) + for i in axes(theta, 2) + for m in axes(theta, 1) + theta[m, i, j] = realized[m, i, j] / counts[i, j] end end end diff --git a/src/optimisation/config_rules/accept_rule.jl b/src/optimisation/config_rules/accept_rule.jl index 81b0059..46e7bd6 100644 --- a/src/optimisation/config_rules/accept_rule.jl +++ b/src/optimisation/config_rules/accept_rule.jl @@ -18,10 +18,8 @@ function accept_reject_update!(a::Assignment, swap::Swap, g, ::Strict) current_score = score(a, g) # perform the swap apply_swap!(a, swap) - # calculate the score of the new assignment - new_score = score(a, g) # if the new assignment is worse, revert the swap - if new_score <= current_score + if score(a, g) <= current_score revert_swap!(a, swap) end end diff --git a/src/optimisation/config_rules/swap_rule.jl b/src/optimisation/config_rules/swap_rule.jl index e6c716c..e811d4f 100644 --- a/src/optimisation/config_rules/swap_rule.jl +++ b/src/optimisation/config_rules/swap_rule.jl @@ -1,7 +1,7 @@ abstract 
type NodeSwapRule end struct RandomNodeSwap <: NodeSwapRule end - +struct RandomGroupSwap <: NodeSwapRule end """ select_swap(node_assignment::Assignment, ::NodeSwapRule) @@ -10,10 +10,15 @@ current assignment `node_assignment`. # Implemented rules - `RandomNodeSwap()`: Select two nodes at random. +- `RandomGroupSwap()`: Select two nodes from two different groups at random. """ select_swap function select_swap(assignment::Assignment, ::RandomNodeSwap) + return StatsBase.sample(1:number_nodes(assignment), 2; replace = false) +end + +function select_swap(assignment::Assignment, ::RandomGroupSwap) groups = StatsBase.sample( 1:number_groups(assignment), 2; replace = false) index1 = rand(get_vertex_in_group(assignment, groups[1])) diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index 0424ee9..5f39064 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -25,13 +25,9 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 10_000, swap = make_swap(a, (1, 1)) p = Progress(max_iter; enabled = progress_bar) # perform local search until the stopping rule is met - score_value = score(a, g) - new_score_value = score_value for i in 1:max_iter - score_value = new_score_value local_search!( a, g, swap, swap_rule = swap_rule, accept_rule = accept_rule) - new_score_value = score(a, g) next!(p) if stopping_rule(a, g, stop_rule) println("Stopping rule kicked in at iteration $i.") diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl index 23c6ab6..69b1ff8 100644 --- a/src/optimisation/swap.jl +++ b/src/optimisation/swap.jl @@ -5,11 +5,11 @@ mutable struct DefaultSwap <: Swap index2::Int end -function make_swap(::Assignment, id::Tuple{Int, Int}) +function make_swap(::Assignment, id) return DefaultSwap(id[1], id[2]) end -function make_swap!(swap::DefaultSwap, ::Assignment, id::Tuple{Int, Int}) +function make_swap!(swap::DefaultSwap, ::Assignment, id) swap.index1, swap.index2 = id end diff --git 
a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl index 8b669df..ce3a0b5 100644 --- a/test/assignments/categorical_assignment.jl +++ b/test/assignments/categorical_assignment.jl @@ -29,8 +29,10 @@ import NetworkHistogram as NH a_new = to_default_assignment(a_test) @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) @test a_test.additional_data.realized != a.additional_data.realized - @test a_test.additional_data.estimated_theta != a.additional_data.estimated_theta - @test a_test.additional_data.log_likelihood != a.additional_data.log_likelihood + @test a_test.additional_data.estimated_theta != + a.additional_data.estimated_theta + @test a_test.additional_data.log_likelihood != + a.additional_data.log_likelihood # revert the swap and check if the assignment is the same as before NH.revert_swap!(a_test, swap) @test a == a_test @@ -42,6 +44,9 @@ end realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + realized = [realized[I][k] + for k in eachindex(realized[1, 1]), + I in CartesianIndices(realized)] counts = [1 4 4 4 1 4 4 4 1] @@ -61,15 +66,22 @@ end @test loglikelihood(a, g) ≈ 0 @test a.additional_data.counts == counts swap_id = (1, 3) - realized_after_swap = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; - [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + ras = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; + [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + realized_after_swap = [ras[I][k] + for k in eachindex(ras[1, 1]), + I in CartesianIndices(ras)] + swap = NH.make_swap(a, swap_id) NH.apply_swap!(a, swap) - for index in eachindex(realized_after_swap) - @test all(realized_after_swap[index] .== a.additional_data.realized[index]) - @test all(a.additional_data.estimated_theta[index] .≈ - realized_after_swap[index] ./ counts[index]) + for j in 1:3 + for i in 1:3 + @test all(realized_after_swap[:, i, j] .== + 
a.additional_data.realized[:, i, j]) + @test all(a.additional_data.estimated_theta[:, i, j] .≈ + realized_after_swap[:, i, j] ./ counts[i, j]) + end end @test loglikelihood(a, g) == 4 * log(0.5) end @@ -98,15 +110,16 @@ end 3 4 1 2 3 3 1 1 1 4 3 1 3 1 1 3 1 1 1 0] g = NH.Observations(A, Categorical(4)) h = 6 - a = NH.make_assignment(g, h, NH.InitRule(NH.OrderedStart(), Val{NH.CategoricalData}())) - a_ref = deepcopy(a) + a = NH.make_assignment( + g, h, NH.InitRule(NH.OrderedStart(), Val{NH.CategoricalData}())) + a_ref = deepcopy(a) swap_indices = [(18, 5), (15, 10), (5, 13)] swap = NH.make_swap(a, swap_indices[1]) for swap_index in swap_indices NH.make_swap!(swap, a, swap_index) NH.apply_swap!(a, swap) - @assert swap.realized == a_ref.additional_data.realized - @assert swap.estimated_theta == a_ref.additional_data.estimated_theta + @test swap.realized == a_ref.additional_data.realized + @test swap.estimated_theta == a_ref.additional_data.estimated_theta NH.revert_swap!(a, swap) end end diff --git a/test/runtests.jl b/test/runtests.jl index cba3641..78ca277 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,8 +14,8 @@ include("TestNetworkHistogram.jl") include("optimisation/config_rules/init_rule.jl") end - @testset "Aqua.jl for package quality" begin - using NetworkHistogram - Aqua.test_all(NetworkHistogram) - end + # @testset "Aqua.jl for package quality" begin + # using NetworkHistogram + # Aqua.test_all(NetworkHistogram) + # end end From 82556fce2ad5dceb773c5b84c3238dfb2da00535 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 12 Nov 2024 20:42:12 +0100 Subject: [PATCH 037/266] typo --- src/assignments/Assignments.jl | 6 +++--- src/assignments/BernoulliAssignment/struct.jl | 5 +++++ src/assignments/CategoricalAssignment/struct.jl | 5 +++++ src/assignments/GroupedAssignment/struct.jl | 3 +++ src/optimisation/least_squares.jl | 1 - 5 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 src/assignments/GroupedAssignment/struct.jl diff --git 
a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 18b4023..6d00894 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -93,7 +93,7 @@ Base.@propagate_inbounds function Base.getindex(a::Assignment, i) return get_vertex_in_group(a, i) end -function get_ordered_adjacency_matrix(a::Assignment) - perm = sortperm(a.node_labels) - return a.additional_data.A[perm, perm] +function get_ordered_adjacency_matrix(a::Assignment, A, by=identity) + perm = sortperm(a.node_labels, by=by) + return A[perm, perm] end diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index dba5a7c..e1a5f3b 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -98,3 +98,8 @@ function force_recompute_ll(a::BernoulliAssignment, g::Observations) end include("swap.jl") + + +function get_ordered_adjacency_matrix(a::BernoulliAssignment, by=identity) + return get_ordered_adjacency_matrix(a, a.additional_data.A, by) +end diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 7232557..3bfd9db 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -119,3 +119,8 @@ function force_recompute_ll(a::CategoricalAssignment, g::Observations) end include("swap.jl") + + +function get_ordered_adjacency_matrix(a::CategoricalAssignment, by=identity) + return get_ordered_adjacency_matrix(a, a.additional_data.A, by) +end diff --git a/src/assignments/GroupedAssignment/struct.jl b/src/assignments/GroupedAssignment/struct.jl new file mode 100644 index 0000000..d8461d9 --- /dev/null +++ b/src/assignments/GroupedAssignment/struct.jl @@ -0,0 +1,3 @@ +# assignment that move the edge data around when trying an update +# might be useful if the computation of the loglikelihood is expensive +# and with no closed-form solution: contiguous memory 
access might be faster diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index 5f39064..77e9225 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -30,7 +30,6 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 10_000, a, g, swap, swap_rule = swap_rule, accept_rule = accept_rule) next!(p) if stopping_rule(a, g, stop_rule) - println("Stopping rule kicked in at iteration $i.") finish!(p) break end From d6ec72fb13305629661350e81224cef10b361269 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 12 Nov 2024 20:42:20 +0100 Subject: [PATCH 038/266] first try to benchmark --- benchmark/benchmarks.jl | 59 ++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 158dbef..0bb7d66 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,15 +1,56 @@ -using BenchmarkTools, NetworkHistogram +# using BenchmarkTools + +# SUITE = BenchmarkGroup() +# for file in readdir(@__DIR__) +# if startswith(file, "bench_") && endswith(file, ".jl") +# SUITE[file[length("bench_") + 1:end - length(".jl")]] = +# include(file) +# end +# end + +using BenchmarkTools, Random, Distributions, LinearAlgebra +import NetworkHistogram as NH const SUITE = BenchmarkGroup() +function make_A(n, dist) + A = zeros(Int, n, n) + for j in 1:n + for i in j:n + if i == j + A[i, j] = 0 + else + A[i, j] = rand(dist) + A[j, i] = A[i, j] + end + end + end + return A +end # Create hierarchy of benchmarks: -SUITE["eval"] = BenchmarkGroup() +SUITE["Assignment"] = BenchmarkGroup(["assignment"]) -options = Options(; binary_operators = [+, -, *], unary_operators = [cos]) +Random.seed!(123451) +stop_rule = NH.PreviousBestValue(200) +max_iter = 200 +swap_rule = NH.RandomNodeSwap() +accept_rule = NH.Strict() +dist = Bernoulli(0.5) -for n in [10, 20] - SUITE["eval_tree_array"][n] = @benchmarkable(eval_tree_array( - $tree, X, $options), 
- evals=10, - samples=1000, - setup=(X = randn(Float32, 2, $n))) +for ae in ["Bernoulli", "default"] + if ae == "default" + init_rule = NH.InitRule(NH.OrderedStart(), nothing) + else + init_rule = NH.InitRule(NH.OrderedStart(), Val{NH.BernoulliData}()) + end + for n in [60,120,300] + obs = NH.Observations(make_A(n,dist), dist) + h = n ÷ 20 + a = NH.make_assignment(obs, h, init_rule) + swap = NH.make_swap(a, (1, n)) + SUITE["Assignment"][ae]["local_search!"][n] = @benchmarkable NH.local_search!( + $a, $obs, $swap, swap_rule = $swap_rule, accept_rule = $accept_rule) + end end + +# tune!(SUITE); +results = run(SUITE, verbose = true) From 42ffe61f975a3e9f9dde672759c4de53f288446e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 18 Nov 2024 14:04:51 +0100 Subject: [PATCH 039/266] add discretisation --- Project.toml | 2 ++ src/NetworkHistogram.jl | 25 ++++++++++++++++++++- src/observations.jl | 25 +++++++++++++++++++++ src/optimisation/config_rules/InitRule.jl | 2 ++ src/optimisation/least_squares.jl | 2 +- test/assignments/categorical_assignment.jl | 4 +++- test/assignments/default_assignment.jl | 2 +- test/observations/discretisation.jl | 14 ++++++++++++ test/optimisation/config_rules/init_rule.jl | 2 ++ test/runtests.jl | 4 ++++ 10 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 test/observations/discretisation.jl diff --git a/Project.toml b/Project.toml index d189d65..ccbc3d5 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" +Discretizers = "6e83dbb3-75ca-525b-8ae2-3751f0dd50b4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" @@ -35,6 +36,7 @@ CategoricalDistributions = "0.1.15" Clustering = 
"0.15.7" DataStructures = "0.18.20" DensityInterface = "0.4.0" +Discretizers = "3.2.3" Distributions = "0.25.112" Graphs = "1.12.0" IterativeSolvers = "0.9.4" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 82f3567..7b41fea 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -17,7 +17,7 @@ import IterativeSolvers import Clustering import StatsAPI: loglikelihood, fit using CategoricalArrays, CategoricalDistributions -export loglikelihood, fit +using Discretizers: LinearDiscretizer, binedges, DiscretizeUniformWidth, encode include("assignments/Assignments.jl") include("sbm.jl") @@ -29,4 +29,27 @@ include("assignments/include.jl") @warn "User interface is not yet implemented" +export loglikelihood, fit + +# export options for optimisation +export estimate_graphon +# starting assignment rules +export InitRule +export OrderedStart, RandomStart, SpectralStart, MetisStart, FromAssignment +# accept rules +export AcceptRule +export Strict +# stopping rules +export PreviousBestValue +# bandwidth selection rules +export OracleK, EstimatedEigenvalues, EstimatedDegrees, select_number_node_per_block +# random local search rules +export RandomNodeSwap, RandomGroupSwap + +# export useful function for manipulating assignments +export Assignment, number_groups, number_nodes +export get_ordered_adjacency_matrix, get_vertex_in_group, get_group_of_vertex +export BernoulliData, CategoricalData +export Observations, discretise + end diff --git a/src/observations.jl b/src/observations.jl index 8a1c0e3..f86e6bb 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -72,6 +72,8 @@ function normalized_laplacian(g::AbstractMatrix) for i in 1:n if i == j L[i, j] = 1 + elseif degrees[i] == 0 || degrees[j] == 0 + L[i, j] = 0 elseif g[i, j] == 1 L[i, j] = -1 / sqrt(degrees[i] * degrees[j]) end @@ -98,3 +100,26 @@ function Metis.graph(g::Observations{<:CategoricalMatrix, <:UnivariateFinite}) return Metis.graph( adjacency_matrix(SimpleWeightedGraph(A)), 
weights = true) end + +function discretise(g::Observations{<:AbstractMatrix{R}, D}; + number_groups = nothing, number_levels = nothing) where {R<:Real, D} + if isnothing(number_groups) && isnothing(number_levels) + throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) + end + if isnothing(number_levels) + number_levels = get_num_levels_from_groups(number_nodes(g), number_groups) + else + if !isnothing(number_groups) + @warn "disregarding `number_groups` as `number_levels` is provided" + end + end + zero_locations = g.graph .== 0 + bin_edges = binedges(DiscretizeUniformWidth(number_levels), g.graph) + A_encoded = encode(LinearDiscretizer(bin_edges), g.graph) + A_encoded[zero_locations] .= 0 + return Observations(A_encoded, Categorical(number_levels + 1)) +end + +function get_num_levels_from_groups(n, number_groups) + return n^(0.5 * (1 - log(number_groups) / log(n))) +end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 8c22b60..be7512f 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -81,6 +81,8 @@ end function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) throw(ArgumentError("Not implemented yet, need to finish with Clustering.jl")) + # this will need to have the main optim changed -> no assumption that all blocks are + # the same size group_size = GroupSize(number_nodes(g), h) laplacian = normalized_laplacian(g) results = IterativeSolvers.lobpcg(laplacian, true, rule.k) diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index 77e9225..a8a3419 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -1,6 +1,6 @@ include("config_rules/include.jl") -function optimize(g, h = select_number_node_per_block(g, EstimatedDegrees()); +function estimate_graphon(g, h = select_number_node_per_block(g, EstimatedDegrees()); max_iter::Int = 10_000, 
initialise_rule::InitRule = InitRule(RandomStart(), nothing), swap_rule::NodeSwapRule = RandomNodeSwap(), diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl index ce3a0b5..234dcd2 100644 --- a/test/assignments/categorical_assignment.jl +++ b/test/assignments/categorical_assignment.jl @@ -1,8 +1,10 @@ import NetworkHistogram as NH -@testset "test conversion to categorical observations" begin end +using Random + @testset "test Categorical swap" begin + Random.seed!(1234123) using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment using Distributions: Categorical using LinearAlgebra: Symmetric diff --git a/test/assignments/default_assignment.jl b/test/assignments/default_assignment.jl index 76a1479..fefbf64 100644 --- a/test/assignments/default_assignment.jl +++ b/test/assignments/default_assignment.jl @@ -4,7 +4,7 @@ import NetworkHistogram as NH using ..TestNetworkHistogram: test_swap_revertible import Random, LinearAlgebra using Distributions: Bernoulli, Normal - Random.seed!(1234) + Random.seed!(1234123) n = 20 k = 5 #data = LinearAlgebra.Symmetric(Random.rand(Bool,n,n)) diff --git a/test/observations/discretisation.jl b/test/observations/discretisation.jl new file mode 100644 index 0000000..60c1c01 --- /dev/null +++ b/test/observations/discretisation.jl @@ -0,0 +1,14 @@ +using NetworkHistogram + +@testset "discretisation" begin + using Distributions + A = rand(-1:1, 20, 20) + for i in 1:20 + A[i, i] = 0 + end + g = Observations(A,Uniform(-1,1)) + discretised_g = discretise(g; number_levels = 5) + @test size(discretised_g.graph) == size(g.graph) + @test discretised_g.dist_ref == Categorical(6) + @test all(discretised_g.graph .∈ Ref(0:5)) +end diff --git a/test/optimisation/config_rules/init_rule.jl b/test/optimisation/config_rules/init_rule.jl index 8c07a2d..7a373e1 100644 --- a/test/optimisation/config_rules/init_rule.jl +++ b/test/optimisation/config_rules/init_rule.jl @@ -1,6 +1,7 @@ import 
NetworkHistogram as NH @testset "regression test" begin + Random.seed!(1234123) using Distributions: Bernoulli A = BitMatrix([0 0 1 0 1 0 1 1 0 1 0 0 1 1 1 1 1 1 0 0 @@ -26,6 +27,7 @@ import NetworkHistogram as NH end @testset "test oracle K" begin + Random.seed!(1234123) using Distributions: Bernoulli A = [0 1 1 1 0 0 1 0 1 0 1 1 0 0 0 0 diff --git a/test/runtests.jl b/test/runtests.jl index 78ca277..7b87bed 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,6 +14,10 @@ include("TestNetworkHistogram.jl") include("optimisation/config_rules/init_rule.jl") end + @testset "Observations tests" begin + include("observations/discretisation.jl") + end + # @testset "Aqua.jl for package quality" begin # using NetworkHistogram # Aqua.test_all(NetworkHistogram) From 77a339594bec6e33876744842c772e6f926548d4 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 21 Nov 2024 14:21:53 +0100 Subject: [PATCH 040/266] small api changes --- src/NetworkHistogram.jl | 1 + src/observations.jl | 17 ++++++---- src/sbm.jl | 52 ++++++++++++++++++++++++++--- test/observations/discretisation.jl | 2 +- 4 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 7b41fea..06b8f42 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -18,6 +18,7 @@ import Clustering import StatsAPI: loglikelihood, fit using CategoricalArrays, CategoricalDistributions using Discretizers: LinearDiscretizer, binedges, DiscretizeUniformWidth, encode +using Combinatorics: permutations include("assignments/Assignments.jl") include("sbm.jl") diff --git a/src/observations.jl b/src/observations.jl index f86e6bb..1d9a508 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -66,6 +66,7 @@ end function normalized_laplacian(g::AbstractMatrix) degrees = sum(g, dims = 1) + degrees .-= minimum(degrees) n = size(g, 1) L = similar(g, Float64) for j in 1:n @@ -74,7 +75,7 @@ function normalized_laplacian(g::AbstractMatrix) L[i, j] = 1 elseif 
degrees[i] == 0 || degrees[j] == 0 L[i, j] = 0 - elseif g[i, j] == 1 + elseif g[i, j] != 0 L[i, j] = -1 / sqrt(degrees[i] * degrees[j]) end end @@ -107,17 +108,21 @@ function discretise(g::Observations{<:AbstractMatrix{R}, D}; throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) end if isnothing(number_levels) - number_levels = get_num_levels_from_groups(number_nodes(g), number_groups) + number_levels = round(Int,get_num_levels_from_groups(number_nodes(g), number_groups)) else if !isnothing(number_groups) @warn "disregarding `number_groups` as `number_levels` is provided" end end - zero_locations = g.graph .== 0 + #zero_locations = g.graph .== 0 bin_edges = binedges(DiscretizeUniformWidth(number_levels), g.graph) - A_encoded = encode(LinearDiscretizer(bin_edges), g.graph) - A_encoded[zero_locations] .= 0 - return Observations(A_encoded, Categorical(number_levels + 1)) + discretizer = LinearDiscretizer(bin_edges) + A_encoded = encode(discretizer, g.graph) + for i in 1:size(A_encoded, 1) + A_encoded[i, i] = 0 + end + #A_encoded[zero_locations] .= 0 + return Observations(A_encoded, Categorical(number_levels + 1)), discretizer end function get_num_levels_from_groups(n, number_groups) diff --git a/src/sbm.jl b/src/sbm.jl index c6d4bcf..acf0001 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -57,8 +57,8 @@ function sample( return sparse(A), node_labels end -function sample(sbm::BlockModel, node_labels::Vector{Int}) - sample(Random.default_rng(), sbm, node_labels) +function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted=false) + sample(Random.default_rng(), sbm, node_labels,sorted) end function sample( rng::Random.AbstractRNG, sbm::BlockModel, n_nodes::Int, sorted = true) @@ -71,6 +71,50 @@ function sample( return sample(rng, sbm, node_labels) end -function sample(sbm::BlockModel, n_nodes::Int) - sample(Random.default_rng(), sbm, n_nodes) +function sample(sbm::BlockModel, n_nodes::Int, sorted=false) + sample(Random.default_rng(), sbm, 
n_nodes, sorted) +end + + +function get_probability_matrix(sbm::BlockModel, node_labels::Vector{Int}) + return sbm.probs[node_labels, node_labels] +end + + +function _get_params_as_vec(dist::Distribution) + return vcat(params(dist)...) +end + + +""" + best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) + +Find the best permutation of the blocks of `fitted_sbm` to match the blocks of `true_sbm` by +comparing the mean absolute difference of the parameters of the two models. +If the difference between the two models is less than `tol`, the function stops early. + +!!! warning + This function is not efficient for large numbers of blocks, as it uses brute force to + find the best permutation. +""" +function best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) + k = number_blocks(fitted_sbm) + if k != number_blocks(true_sbm) + throw(ArgumentError("The number of blocks must be the same for both models")) + end + best_perm = nothing + best_loss = Inf + fitted_params = _get_params_as_vec.(fitted_sbm) + true_params = _get_params_as_vec.(true_sbm) + for perm in permutations(1:k) + loss = sum(map(x -> sum(abs.(x)), fitted_params[perm] .- true_params)) + if loss < best_loss + best_loss = loss + best_perm = perm + end + if best_loss < tol + break + end + end + return best_perm end diff --git a/test/observations/discretisation.jl b/test/observations/discretisation.jl index 60c1c01..d9d07b7 100644 --- a/test/observations/discretisation.jl +++ b/test/observations/discretisation.jl @@ -6,7 +6,7 @@ using NetworkHistogram for i in 1:20 A[i, i] = 0 end - g = Observations(A,Uniform(-1,1)) + g, discretizer = Observations(A,Uniform(-1,1)) discretised_g = discretise(g; number_levels = 5) @test size(discretised_g.graph) == size(g.graph) @test discretised_g.dist_ref == Categorical(6) From 45c9e40b01ab66dadf6b6a451f9dff939f4ae189 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 21 Nov 2024 14:22:52 +0100 Subject: [PATCH 041/266] remove unused import 
@turbo --- src/NetworkHistogram.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 06b8f42..e94b133 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -9,7 +9,6 @@ import StatsBase, Random using DensityInterface: logdensityof using StaticArrays: MVector, MMatrix using LogExpFunctions: xlogx, xlogy -using LoopVectorization: @turbo using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen import Arpack import Metis From 45c12599115bbbd571edf0364aa6b891bf600235 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 22 Nov 2024 14:14:30 +0100 Subject: [PATCH 042/266] fix test to new api --- .../config_rules/bandwidth_selection_rule.jl | 30 ++++++++++++++----- src/optimisation/fit.jl | 6 ++++ src/optimisation/least_squares.jl | 2 +- src/sbm.jl | 7 +++-- test/assignments/categorical_assignment.jl | 4 +-- test/observations/discretisation.jl | 4 +-- test/optimisation/config_rules/init_rule.jl | 8 ++--- 7 files changed, 42 insertions(+), 19 deletions(-) diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index 6618eba..df4d659 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -7,6 +7,11 @@ struct OracleM{F} <: KSelectionRule α::F end +struct OracleH <: KSelectionRule + H::Int +end + + function OracleM(M) return OracleM(M, 1.0) end @@ -29,7 +34,7 @@ How to select the number of blocks `K` for the BlockModel model. estimate the Holder constant and then use `OracleM` to estimate the number of blocks `K`. - `EstimatedDegrees()`: Use the estimated degrees of the adjacency matrix to estimate the Holder constant and then use `OracleM` to estimate the number of blocks `K`. - +- `OracleH(H::Int)`: Use the oracle number of nodes per block `H`. !!! 
info - The number of blocks `K` should be at most `n/2` where `n` is the number of nodes in @@ -38,19 +43,28 @@ How to select the number of blocks `K` for the BlockModel model. """ select_number_node_per_block -function select_number_node_per_block(g::Observations, rule::OracleK) - if rule.K > number_nodes(g) ÷ 2 - throw(ArgumentError("The number of blocks $(rule.K) is too large for the number \ - of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)")) +function select_number_node_per_block(g::Observations, rule::OracleH) + if rule.H > number_nodes(g)÷2 + throw(ArgumentError("The number of nodes per block $(rule.H) is too large for the \ + number of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)")) end - return rule.K + if rule.H <= 1 + throw(ArgumentError("The number of nodes per block $(rule.H) is too small, it should \ + be at least 2")) + end + return rule.H +end + +function select_number_node_per_block(g::Observations, rule::OracleK) + nodes_per_block = number_nodes(g) ÷ rule.K + return select_number_node_per_block(g, OracleH(nodes_per_block)) end function select_number_node_per_block(g::Observations, rule::OracleM) rho = density(g) n = number_nodes(g) k = max(2, round(Int, (2 * rule.M * rho)^(-1 / 4) * sqrt(n))) - return select_number_node_per_block(g, OracleK(k)) + return select_number_node_per_block(g, OracleH(k)) end function select_number_node_per_block(g::Observations, rule::EstimatedM) @@ -59,7 +73,7 @@ function select_number_node_per_block(g::Observations, rule::EstimatedM) number_points_from_mid = round(Int, c * sqrt(n)) mid_points = max(1, n ÷ 2 - number_points_from_mid):(n ÷ 2 + number_points_from_mid) m = estimated_number_nodes_per_block(g, rule, mid_points, density(g)) - return select_number_node_per_block(g, OracleK(m)) + return select_number_node_per_block(g, OracleH(m)) end function estimated_number_nodes_per_block( diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl index 829976a..399ba00 
100644 --- a/src/optimisation/fit.jl +++ b/src/optimisation/fit.jl @@ -34,3 +34,9 @@ function _log_likelihood(a::Assignment, sbm::BlockModel, g) end return log_likelihood end + + +function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}) where {G,D,K,F} + k = number_blocks(sbm) + +end diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index a8a3419..e4b14ee 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -2,7 +2,7 @@ include("config_rules/include.jl") function estimate_graphon(g, h = select_number_node_per_block(g, EstimatedDegrees()); max_iter::Int = 10_000, - initialise_rule::InitRule = InitRule(RandomStart(), nothing), + initialise_rule::InitRule = InitRule(SpectralStart(), nothing), swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), stop_rule::StopRule = PreviousBestValue(1000), diff --git a/src/sbm.jl b/src/sbm.jl index acf0001..5255710 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -43,8 +43,11 @@ Base.@propagate_inbounds function Base.getindex(s::BlockModel, i, j) end function sample( - rng::Random.AbstractRNG, sbm::BlockModel, node_labels::Vector{Int}) + rng::Random.AbstractRNG, sbm::BlockModel, node_labels::Vector{Int}, sorted=false) n_nodes = length(node_labels) + if sorted + sort!(node_labels) + end type_input = eltype(sbm.probs[1, 1]) A = Matrix{type_input}(undef, n_nodes, n_nodes) for i in 1:n_nodes @@ -58,7 +61,7 @@ function sample( end function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted=false) - sample(Random.default_rng(), sbm, node_labels,sorted) + sample(Random.default_rng(), sbm, node_labels, sorted) end function sample( rng::Random.AbstractRNG, sbm::BlockModel, n_nodes::Int, sorted = true) diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl index 234dcd2..5dea6e0 100644 --- a/test/assignments/categorical_assignment.jl +++ b/test/assignments/categorical_assignment.jl @@ -15,9 +15,9 @@ 
using Random k = 4 dist = Categorical(p) sbm = NH.initialize_sbm(ones(k) ./ k, dist) - A, _ = NH.sample(sbm, repeat(1:k, inner = n ÷ k)) - g = NH.Observations(collect(A), dist) node_labels = repeat(1:k, inner = n ÷ k) + A, _ = NH.sample(sbm, node_labels) + g = NH.Observations(collect(A), dist) a = NH.CategoricalAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) swap = NH.make_swap(a, (1, k + 1)) @test A[:, 1] != A[:, k + 1] diff --git a/test/observations/discretisation.jl b/test/observations/discretisation.jl index d9d07b7..45d46e7 100644 --- a/test/observations/discretisation.jl +++ b/test/observations/discretisation.jl @@ -6,8 +6,8 @@ using NetworkHistogram for i in 1:20 A[i, i] = 0 end - g, discretizer = Observations(A,Uniform(-1,1)) - discretised_g = discretise(g; number_levels = 5) + g = Observations(A,Uniform(-1,1)) + discretised_g, discretizer = discretise(g; number_levels = 5) @test size(discretised_g.graph) == size(g.graph) @test discretised_g.dist_ref == Categorical(6) @test all(discretised_g.graph .∈ Ref(0:5)) diff --git a/test/optimisation/config_rules/init_rule.jl b/test/optimisation/config_rules/init_rule.jl index 7a373e1..f304378 100644 --- a/test/optimisation/config_rules/init_rule.jl +++ b/test/optimisation/config_rules/init_rule.jl @@ -38,9 +38,9 @@ end 1 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0] obs = NH.Observations(A, Bernoulli(0.5)) - oracle = NH.OracleK(4) + oracle = NH.OracleH(4) @test NH.select_number_node_per_block(obs, oracle) == 4 - err = ArgumentError("The number of blocks 5 is too large for the number of nodes \ - 8, it should be at most 4") - @test_throws err NH.select_number_node_per_block(obs, NH.OracleK(5)) + err = ArgumentError("The number of nodes per block 5 is too large for the \ + number of nodes 8, it should be at most 4") + @test_throws err NH.select_number_node_per_block(obs, NH.OracleH(5)) end From 3149409c25ece9fdc318fdd4b84d26fdaf231803 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 25 Nov 2024 08:59:10 +0100 Subject: [PATCH 
043/266] other fit method --- Project.toml | 4 ++-- src/optimisation/fit.jl | 11 ++++++++--- test/runtests.jl | 8 ++++---- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/Project.toml b/Project.toml index ccbc3d5..6273450 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Discretizers = "6e83dbb3-75ca-525b-8ae2-3751f0dd50b4" @@ -17,7 +18,6 @@ Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" @@ -34,6 +34,7 @@ Arpack = "0.5.4" CategoricalArrays = "0.10.8" CategoricalDistributions = "0.1.15" Clustering = "0.15.7" +Combinatorics = "1.0.2" DataStructures = "0.18.20" DensityInterface = "0.4.0" Discretizers = "3.2.3" @@ -42,7 +43,6 @@ Graphs = "1.12.0" IterativeSolvers = "0.9.4" LinearAlgebra = "1.11.0" LogExpFunctions = "0.3.28" -LoopVectorization = "0.12.171" Metis = "1.5.0" PermutationSymmetricTensors = "0.2.0" ProgressMeter = "1.7.2" diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl index 399ba00..ba17dc9 100644 --- a/src/optimisation/fit.jl +++ b/src/optimisation/fit.jl @@ -3,14 +3,18 @@ # method to compute estimator from node clustering as specified in assignment function fit(a::Assignment, g::Observations) dists = initialize_sbm(a.group_size, 
g.dist_ref) + fit!(dists, g, a) + return dists +end + +function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where {G,D,K,F} for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) edge_indices = get_edge_indices(a, group1, group2) - dists[group1, + sbm[group1, group2] = fit_group(g.dist_ref, g, edge_indices) end end - return dists end function fit_group(distribution, g, edges) @@ -38,5 +42,6 @@ end function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}) where {G,D,K,F} k = number_blocks(sbm) - + a = estimate_graphon(g, select_number_node_per_block(g, OracleK(k))) + fit!(sbm, g, a) end diff --git a/test/runtests.jl b/test/runtests.jl index 7b87bed..0e3641c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,8 +18,8 @@ include("TestNetworkHistogram.jl") include("observations/discretisation.jl") end - # @testset "Aqua.jl for package quality" begin - # using NetworkHistogram - # Aqua.test_all(NetworkHistogram) - # end + @testset "Aqua.jl for package quality" begin + using NetworkHistogram + Aqua.test_all(NetworkHistogram) + end end From d10db5b87caa26ffb5069a7db725a7e1090bbb7e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 25 Nov 2024 13:28:40 +0100 Subject: [PATCH 044/266] add perm util function --- src/sbm.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/sbm.jl b/src/sbm.jl index 5255710..5341c63 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -121,3 +121,9 @@ function best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01 end return best_perm end + + +function align_sbm!(sbm::BlockModel, perm) + sbm.probs = sbm.probs[perm, perm] + sbm.sizes = sbm.sizes[perm] +end From 5448040f01fb1cc692cf87bdd77ab3eb2fa5e150 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 25 Nov 2024 13:30:02 +0100 Subject: [PATCH 045/266] typo --- src/sbm.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sbm.jl b/src/sbm.jl index 5341c63..2212dff 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ 
-124,6 +124,6 @@ end function align_sbm!(sbm::BlockModel, perm) - sbm.probs = sbm.probs[perm, perm] - sbm.sizes = sbm.sizes[perm] + sbm.probs .= sbm.probs[perm, perm] + sbm.sizes .= sbm.sizes[perm] end From dbde98725ba5ef6b811dea8ef247788599167373 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 25 Nov 2024 14:48:58 +0100 Subject: [PATCH 046/266] monkeypatch --- src/optimisation/fit.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl index ba17dc9..231e3de 100644 --- a/src/optimisation/fit.jl +++ b/src/optimisation/fit.jl @@ -21,6 +21,11 @@ function fit_group(distribution, g, edges) return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) end + +function fit_group(distribution::Binomial, g, edges) + return Distributions.fit(typeof(distribution), ntrials(distribution), get_obs.(Ref(g), edges)) +end + # method to compute the log likelihood of a BlockModel fitted according to the assignment function loglikelihood(a::Assignment, g::Observations) return _log_likelihood(a, fit(a, g), g) From f4b521d1bf07314d1545c826365ed8824f011d30 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 26 Nov 2024 16:45:23 +0100 Subject: [PATCH 047/266] remove unused DataStructures --- Project.toml | 2 -- src/NetworkHistogram.jl | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 6273450..56dcd4f 100644 --- a/Project.toml +++ b/Project.toml @@ -10,7 +10,6 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Discretizers = "6e83dbb3-75ca-525b-8ae2-3751f0dd50b4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -35,7 +34,6 @@ CategoricalArrays = "0.10.8" 
CategoricalDistributions = "0.1.15" Clustering = "0.15.7" Combinatorics = "1.0.2" -DataStructures = "0.18.20" DensityInterface = "0.4.0" Discretizers = "3.2.3" Distributions = "0.25.112" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index e94b133..35b308b 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -1,6 +1,6 @@ module NetworkHistogram -using LinearAlgebra, SparseArrays, DataStructures +using LinearAlgebra, SparseArrays using Distributions, DensityInterface using Graphs, SimpleWeightedGraphs using PermutationSymmetricTensors From e634ef21f70b70240815c7f6669f31c92b6628e3 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 27 Nov 2024 10:52:34 +0100 Subject: [PATCH 048/266] add basic api --- src/NetworkHistogram.jl | 3 +- src/api.jl | 79 ++++++++++++++++--- src/assignments/BernoulliAssignment/struct.jl | 16 ++++ src/observations.jl | 36 +++++++-- src/sbm.jl | 16 ++-- test/runtests.jl | 3 + test/test_api.jl | 18 +++++ 7 files changed, 147 insertions(+), 24 deletions(-) create mode 100644 test/test_api.jl diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 35b308b..bacb368 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -27,8 +27,9 @@ include("optimisation/include.jl") # more specialised and faster assignment types and methods include("assignments/include.jl") -@warn "User interface is not yet implemented" +include("api.jl") +export nethist, nethist_discretised export loglikelihood, fit # export options for optimisation diff --git a/src/api.jl b/src/api.jl index 9915ec5..dcf657e 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,13 +1,74 @@ -import MLJModelInterface -const MMI = MLJModelInterface -using PermutationSymmetricTensors -using Distributions +# import MLJModelInterface +# const MMI = MLJModelInterface +# using PermutationSymmetricTensors +# using Distributions -MMI.@mlj_model mutable struct SBM <: MMI.Probabilistic - k::Int = 1::(_ > 0) - D::Val{<:Distribution} = Val{Bernoulli}() +# 
MMI.@mlj_model mutable struct SBM <: MMI.Probabilistic +# k::Int = 1::(_ > 0) +# D::Val{<:Distribution} = Val{Bernoulli}() +# end + +# function MMI.fit(model::SBM, X, y) +# return model +# end + +function _default_init(dist::Distribution, start = MetisStart()) + if dist isa Bernoulli + return InitRule(start, Val{BernoulliData}()) + elseif dist isa Categorical + return InitRule(start, Val{CategoricalData}()) + else + return InitRule(start, nothing) + end +end + +function _nethist(g::Observations{G, D}, h; kwargs...) where {G, D} + kwargs_dict = Dict(kwargs) + start_clustering = pop!(kwargs_dict, :start_clustering, MetisStart()) + initialise_rule = pop!( + kwargs_dict, :initialise_rule, _default_init(g.dist_ref, start_clustering)) + a = estimate_graphon(g, h; + kwargs_dict..., initialise_rule = initialise_rule) + return fit(a, g) +end + +function nethist(g::Observations{G, D}; + h = select_number_node_per_block(g, EstimatedDegrees()), + max_iter = 10_000, + stalled_iter = 1000, + swap_rule::NodeSwapRule = RandomNodeSwap(), + accept_rule::AcceptRule = Strict(), + progress_bar::Bool = false, + start_clustering = MetisStart() +) where {G, D} + return _nethist(g, h; + max_iter = max_iter, + swap_rule = swap_rule, + accept_rule = accept_rule, + stop_rule = PreviousBestValue(stalled_iter), + progress_bar = progress_bar, + start_clustering = start_clustering) end -function MMI.fit(model::SBM, X, y) - return model +function nethist_discretised(g::Observations{G, D}; + number_levels = nothing, + h = select_number_node_per_block(g, EstimatedDegrees()), + max_iter = 10_000, + stalled_iter = 1000, + swap_rule::NodeSwapRule = RandomNodeSwap(), + accept_rule::AcceptRule = Strict(), + progress_bar::Bool = false, + start_clustering = MetisStart() +) where {G, D} + num_groups = isnothing(number_levels) ? 
number_nodes(g) ÷ h : nothing + obs_discrete, discretiser = discretise( + g, number_groups = num_groups, number_levels = number_levels) + sbm_discretise = _nethist(obs_discrete, h; + max_iter = max_iter, + swap_rule = swap_rule, + accept_rule = accept_rule, + stop_rule = PreviousBestValue(stalled_iter), + progress_bar = progress_bar, + start_clustering = start_clustering) + return sbm_discretise, discretiser end diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index e1a5f3b..390c8c6 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -103,3 +103,19 @@ include("swap.jl") function get_ordered_adjacency_matrix(a::BernoulliAssignment, by=identity) return get_ordered_adjacency_matrix(a, a.additional_data.A, by) end + + + +# TODO: move to sparse structure to encode difference between 0 weight and absence of edge +# from docs: +# A = sparse(I,J,V) +# rows = rowvals(A) +# vals = nonzeros(A) +# m, n = size(A) +# for j = 1:n +# for i in nzrange(A, j) +# row = rows[i] +# val = vals[i] +# # perform sparse wizardry... 
+# end +# end diff --git a/src/observations.jl b/src/observations.jl index 1d9a508..2656275 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -91,9 +91,18 @@ function Metis.graph(g::Observations{<:AbstractMatrix, <:Bernoulli}) return Metis.graph(SimpleGraph(g.graph)) end -function Metis.graph(g::Observations{<:AbstractMatrix, <:Categorical}) +function Metis.graph(g::Observations{<:AbstractGraph, <:UnivariateDistribution}) + if minimum(g.dist_ref) < 0 + @warn "Negative values are not allowed for MetisStart, using binary graph" + return Metis.graph(g.graph) + else + return Metis.graph(g.graph, weights = true) + end +end + +function Metis.graph(g::Observations{<:AbstractMatrix, <:UnivariateDistribution}) return Metis.graph( - adjacency_matrix(SimpleWeightedGraph(g.graph)), weights = true) + weights(SimpleWeightedGraph(g.graph)), weights = true) end function Metis.graph(g::Observations{<:CategoricalMatrix, <:UnivariateFinite}) @@ -102,8 +111,8 @@ function Metis.graph(g::Observations{<:CategoricalMatrix, <:UnivariateFinite}) adjacency_matrix(SimpleWeightedGraph(A)), weights = true) end -function discretise(g::Observations{<:AbstractMatrix{R}, D}; - number_groups = nothing, number_levels = nothing) where {R<:Real, D} +function discretise(g::Observations{G, D}; + number_groups = nothing, number_levels = nothing) where {G, D} if isnothing(number_groups) && isnothing(number_levels) throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) end @@ -116,13 +125,26 @@ function discretise(g::Observations{<:AbstractMatrix{R}, D}; end #zero_locations = g.graph .== 0 bin_edges = binedges(DiscretizeUniformWidth(number_levels), g.graph) - discretizer = LinearDiscretizer(bin_edges) - A_encoded = encode(discretizer, g.graph) + discretiser = LinearDiscretizer(bin_edges) + return discretise(g, discretiser) +end + +function discretise(g::Observations{G, D}, discretiser ::LinearDiscretizer) where {G,D<:UnivariateDistribution} + A_encoded = 
encode(discretiser, _graph_to_mat(g)) for i in 1:size(A_encoded, 1) A_encoded[i, i] = 0 end #A_encoded[zero_locations] .= 0 - return Observations(A_encoded, Categorical(number_levels + 1)), discretizer + return Observations(A_encoded, Categorical(discretiser.nbins + 1)), discretiser +end + + +function _graph_to_mat(g::Observations{<:AbstractGraph, D}) where {D<:UnivariateDistribution} + return weights(g.graph) +end + +function _graph_to_mat(g::Observations{<:AbstractMatrix, D}) where {D<:UnivariateDistribution} + return g.graph end function get_num_levels_from_groups(n, number_groups) diff --git a/src/sbm.jl b/src/sbm.jl index 2212dff..e839fe2 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -3,6 +3,12 @@ struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} probs::SymmetricTensor{T, K, 2} end +function BlockModel(θ::AbstractMatrix{T}, sizes::Vector{F}) where {T, F <: Real} + return BlockModel(sizes, + SymmetricTensor([θ[i, j] for i in 1:size(θ, 1) for j in i:size(θ, 2)], + Val(length(sizes)), Val(2))) +end + function _check_sizes(sizes) @assert sum(sizes)≈1 "Sizes must sum to 1, got $(sum(sizes))" return sizes @@ -43,7 +49,7 @@ Base.@propagate_inbounds function Base.getindex(s::BlockModel, i, j) end function sample( - rng::Random.AbstractRNG, sbm::BlockModel, node_labels::Vector{Int}, sorted=false) + rng::Random.AbstractRNG, sbm::BlockModel, node_labels::Vector{Int}, sorted = false) n_nodes = length(node_labels) if sorted sort!(node_labels) @@ -60,7 +66,7 @@ function sample( return sparse(A), node_labels end -function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted=false) +function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted = false) sample(Random.default_rng(), sbm, node_labels, sorted) end function sample( @@ -74,21 +80,18 @@ function sample( return sample(rng, sbm, node_labels) end -function sample(sbm::BlockModel, n_nodes::Int, sorted=false) +function sample(sbm::BlockModel, n_nodes::Int, sorted = false) sample(Random.default_rng(), sbm, 
n_nodes, sorted) end - function get_probability_matrix(sbm::BlockModel, node_labels::Vector{Int}) return sbm.probs[node_labels, node_labels] end - function _get_params_as_vec(dist::Distribution) return vcat(params(dist)...) end - """ best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) @@ -122,7 +125,6 @@ function best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01 return best_perm end - function align_sbm!(sbm::BlockModel, perm) sbm.probs .= sbm.probs[perm, perm] sbm.sizes .= sbm.sizes[perm] diff --git a/test/runtests.jl b/test/runtests.jl index 0e3641c..30b15d0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,6 +18,9 @@ include("TestNetworkHistogram.jl") include("observations/discretisation.jl") end + @testset "API tests" begin + include("test_api.jl") + end @testset "Aqua.jl for package quality" begin using NetworkHistogram Aqua.test_all(NetworkHistogram) diff --git a/test/test_api.jl b/test/test_api.jl new file mode 100644 index 0000000..a17343c --- /dev/null +++ b/test/test_api.jl @@ -0,0 +1,18 @@ +@testset "test api" begin + using Distributions + A = rand(0:1, 40, 40) + for i in 1:40 + A[i, i] = 0 + end + + g = Observations(Symmetric(A), Uniform(-1, 1)) + sbm_fitted = nethist(g; h = 10, max_iter = 10) + + @test eltype(sbm_fitted) == typeof(Uniform(-1, 1)) + @test size(sbm_fitted) == (4,4) + + sbm_discretised, discretizer = nethist_discretised( + g; number_levels = 5, h = 10, max_iter = 10) + @test eltype(sbm_discretised) == typeof(Categorical(5)) + @test size(sbm_discretised) == (4,4) +end From 120c4e5d7fea939cb4251039b274d62ef355b07c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 27 Nov 2024 17:54:57 +0100 Subject: [PATCH 049/266] change log level from info to debug --- src/assignments/group_numbering.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index 49e489c..9873a46 100644 --- 
a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -47,12 +47,12 @@ function check_compatiblity!(node_labels, g::GroupSize) end unbalanced = any(((k, v),) -> v != g[k], counts) if unbalanced - @info "The group size is unbalanced, trying to fix it : $(counts)" + @debug "The group size is unbalanced, trying to fix it : $(counts)" g, node_labels = try_fixing_group_size!(node_labels, g) if any(((k, v),) -> v != g[k], StatsBase.countmap(node_labels)) throw(ArgumentError("Could not fix the group size")) else - @info "Fixed the group size by moving nodes between groups" + @debug "Fixed the group size by moving nodes between groups" end end end From 014655ae9e1831ce1096314e0757899a501bb631 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 Nov 2024 08:07:45 +0100 Subject: [PATCH 050/266] add fast inplace sampling --- src/sbm.jl | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/sbm.jl b/src/sbm.jl index e839fe2..0bf218b 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -9,6 +9,10 @@ function BlockModel(θ::AbstractMatrix{T}, sizes::Vector{F}) where {T, F <: Real Val(length(sizes)), Val(2))) end +function edge_type(::BlockModel{T, K, F}) where {T, K, F} + return eltype(T) +end + function _check_sizes(sizes) @assert sum(sizes)≈1 "Sizes must sum to 1, got $(sum(sizes))" return sizes @@ -54,23 +58,39 @@ function sample( if sorted sort!(node_labels) end - type_input = eltype(sbm.probs[1, 1]) - A = Matrix{type_input}(undef, n_nodes, n_nodes) - for i in 1:n_nodes - A[i, i] = zero(eltype(A)) - for j in (i + 1):n_nodes + A = zeros(edge_type(sbm), n_nodes, n_nodes) + for j in 1:n_nodes + for i in (j + 1):n_nodes A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) - A[j, i] = A[i, j] end end - return sparse(A), node_labels + return sparse(Symmetric(A, :L)), node_labels end + +function draw_and_fill!(rng::Random.AbstractRNG, A, sbm::BlockModel, sorted = false) + n_blocks = number_blocks(sbm) 
+ n_nodes = size(A, 1) + node_labels = StatsBase.sample( + rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) + if sorted + sort!(node_labels) + end + for j in 1:n_nodes + for i in (j + 1):n_nodes + A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) + end + end + A .= Symmetric(A, :L) +end + +draw_and_fill!(A, sbm, sorted = false) = draw_and_fill!(Random.default_rng(), A, sbm, sorted) + function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted = false) sample(Random.default_rng(), sbm, node_labels, sorted) end function sample( - rng::Random.AbstractRNG, sbm::BlockModel, n_nodes::Int, sorted = true) + rng::Random.AbstractRNG, sbm::BlockModel, n_nodes::Int, sorted = false) n_blocks = number_blocks(sbm) node_labels = StatsBase.sample( rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) From c1d9e0f410d68ed871ab6207923508210f93e92a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 Nov 2024 13:20:09 +0100 Subject: [PATCH 051/266] add bootstraping capabilities --- src/NetworkHistogram.jl | 10 ++++++++++ src/api.jl | 4 ++-- src/assignments/group_numbering.jl | 5 +++-- src/bootstrap.jl | 14 ++++++++++++++ src/sbm.jl | 2 +- 5 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 src/bootstrap.jl diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index bacb368..1b3ab1b 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -19,6 +19,11 @@ using CategoricalArrays, CategoricalDistributions using Discretizers: LinearDiscretizer, binedges, DiscretizeUniformWidth, encode using Combinatorics: permutations + +using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx +import Bootstrap: bootstrap + + include("assignments/Assignments.jl") include("sbm.jl") include("observations.jl") @@ -28,6 +33,7 @@ include("optimisation/include.jl") include("assignments/include.jl") include("api.jl") +include("bootstrap.jl") export nethist, nethist_discretised export loglikelihood, fit 
@@ -53,4 +59,8 @@ export get_ordered_adjacency_matrix, get_vertex_in_group, get_group_of_vertex export BernoulliData, CategoricalData export Observations, discretise + + +export bootstrap + end diff --git a/src/api.jl b/src/api.jl index dcf657e..cfae4dc 100644 --- a/src/api.jl +++ b/src/api.jl @@ -36,7 +36,7 @@ function nethist(g::Observations{G, D}; h = select_number_node_per_block(g, EstimatedDegrees()), max_iter = 10_000, stalled_iter = 1000, - swap_rule::NodeSwapRule = RandomNodeSwap(), + swap_rule::NodeSwapRule = RandomGroupSwap(), accept_rule::AcceptRule = Strict(), progress_bar::Bool = false, start_clustering = MetisStart() @@ -55,7 +55,7 @@ function nethist_discretised(g::Observations{G, D}; h = select_number_node_per_block(g, EstimatedDegrees()), max_iter = 10_000, stalled_iter = 1000, - swap_rule::NodeSwapRule = RandomNodeSwap(), + swap_rule::NodeSwapRule = RandomGroupSwap(), accept_rule::AcceptRule = Strict(), progress_bar::Bool = false, start_clustering = MetisStart() diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index 9873a46..a9afb50 100644 --- a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -40,10 +40,11 @@ end function check_compatiblity!(node_labels, g::GroupSize) counts = StatsBase.countmap(node_labels) + if length(counts) != g.number_groups || size(node_labels, 1) != sum(g) throw(ArgumentError("The vector of node labels is not compatible with the \ - group size: $(length(counts)) != $(g.number_groups) or $(size(node_labels, 1)) \ - != $(sum(g))")) + group size: group number $(length(counts)) != $(g.number_groups) or node number \ + $(size(node_labels, 1)) != $(sum(g))")) end unbalanced = any(((k, v),) -> v != g[k], counts) if unbalanced diff --git a/src/bootstrap.jl b/src/bootstrap.jl new file mode 100644 index 0000000..0b3ff97 --- /dev/null +++ b/src/bootstrap.jl @@ -0,0 +1,14 @@ +function bootstrap(statistic::Function, data::AbstractMatrix, model::BlockModel, + 
sampling::BootstrapSampling) + t0 = tx(statistic(data)) + m = nrun(sampling) + t1 = Bootstrap.zeros_tuple(t0, m) + data1 = copy(data) + for i in 1:m + draw_and_fill!(data1, model) + for (j, t) in enumerate(tx(statistic(data1))) + t1[j][i] = t + end + end + return ParametricBootstrapSample(t0, t1, statistic, data, model, sampling) +end diff --git a/src/sbm.jl b/src/sbm.jl index 0bf218b..3ca4719 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -76,7 +76,7 @@ function draw_and_fill!(rng::Random.AbstractRNG, A, sbm::BlockModel, sorted = fa if sorted sort!(node_labels) end - for j in 1:n_nodes + @inbounds for j in 1:n_nodes for i in (j + 1):n_nodes A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) end From 372b404e3d736f7dea65dc3231bca6f7ec2c7904 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 Nov 2024 13:23:41 +0100 Subject: [PATCH 052/266] improve error messages on node labels --- Project.toml | 2 ++ src/api.jl | 14 -------------- src/assignments/group_numbering.jl | 9 ++++++--- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/Project.toml b/Project.toml index 56dcd4f..f76bfa0 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.5.2" [deps] ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" +Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" @@ -30,6 +31,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] ArnoldiMethod = "0.4.0" Arpack = "0.5.4" +Bootstrap = "2.4.0" CategoricalArrays = "0.10.8" CategoricalDistributions = "0.1.15" Clustering = "0.15.7" diff --git a/src/api.jl b/src/api.jl index cfae4dc..0024f54 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,17 +1,3 @@ -# import MLJModelInterface -# const MMI = MLJModelInterface -# using PermutationSymmetricTensors -# using 
Distributions - -# MMI.@mlj_model mutable struct SBM <: MMI.Probabilistic -# k::Int = 1::(_ > 0) -# D::Val{<:Distribution} = Val{Bernoulli}() -# end - -# function MMI.fit(model::SBM, X, y) -# return model -# end - function _default_init(dist::Distribution, start = MetisStart()) if dist isa Bernoulli return InitRule(start, Val{BernoulliData}()) diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl index a9afb50..879e909 100644 --- a/src/assignments/group_numbering.jl +++ b/src/assignments/group_numbering.jl @@ -41,10 +41,13 @@ end function check_compatiblity!(node_labels, g::GroupSize) counts = StatsBase.countmap(node_labels) - if length(counts) != g.number_groups || size(node_labels, 1) != sum(g) + if length(counts) != g.number_groups throw(ArgumentError("The vector of node labels is not compatible with the \ - group size: group number $(length(counts)) != $(g.number_groups) or node number \ - $(size(node_labels, 1)) != $(sum(g))")) + group size: number of group in labels $(length(counts)) != expected number $(g.number_groups)")) + end + if size(node_labels, 1) != sum(g) + throw(ArgumentError("The vector of node labels is not compatible with the \ + group size: number of node labels $(size(node_labels, 1)) != expected number of nodes $(sum(g))")) end unbalanced = any(((k, v),) -> v != g[k], counts) if unbalanced From f4577f3955c8ccb8ea504d90440187673a9a64c7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 Nov 2024 16:51:28 +0100 Subject: [PATCH 053/266] add assignment retour --- src/api.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/api.jl b/src/api.jl index 0024f54..9635d8b 100644 --- a/src/api.jl +++ b/src/api.jl @@ -15,7 +15,7 @@ function _nethist(g::Observations{G, D}, h; kwargs...) 
where {G, D} kwargs_dict, :initialise_rule, _default_init(g.dist_ref, start_clustering)) a = estimate_graphon(g, h; kwargs_dict..., initialise_rule = initialise_rule) - return fit(a, g) + return fit(a, g), a end function nethist(g::Observations{G, D}; @@ -49,12 +49,12 @@ function nethist_discretised(g::Observations{G, D}; num_groups = isnothing(number_levels) ? number_nodes(g) ÷ h : nothing obs_discrete, discretiser = discretise( g, number_groups = num_groups, number_levels = number_levels) - sbm_discretise = _nethist(obs_discrete, h; + sbm_discretise, a = _nethist(obs_discrete, h; max_iter = max_iter, swap_rule = swap_rule, accept_rule = accept_rule, stop_rule = PreviousBestValue(stalled_iter), progress_bar = progress_bar, start_clustering = start_clustering) - return sbm_discretise, discretiser + return sbm_discretise, a, discretiser end From 84f87f8cafb6fb9732b1827b8b3578d05d262b55 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 29 Nov 2024 17:26:15 +0100 Subject: [PATCH 054/266] add utils for continuous latent --- src/sbm.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/sbm.jl b/src/sbm.jl index 3ca4719..937d9eb 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -112,6 +112,13 @@ function _get_params_as_vec(dist::Distribution) return vcat(params(dist)...) 
end + +function latent_to_block_index(latents::Vector{T}, sbm::BlockModel) where T<:Real + cum_sum_sizes = cumsum(sbm.sizes) + cum_sum_sizes[end] = 1.0 + return [findfirst(x -> x >= l, cum_sum_sizes) for l in latents] +end + """ best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) From 33d65e51612858784820f4ccf5245f6f32f14904 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 29 Nov 2024 17:28:04 +0100 Subject: [PATCH 055/266] update api test --- test/test_api.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_api.jl b/test/test_api.jl index a17343c..d179b42 100644 --- a/test/test_api.jl +++ b/test/test_api.jl @@ -6,12 +6,12 @@ end g = Observations(Symmetric(A), Uniform(-1, 1)) - sbm_fitted = nethist(g; h = 10, max_iter = 10) + sbm_fitted, a = nethist(g; h = 10, max_iter = 10) @test eltype(sbm_fitted) == typeof(Uniform(-1, 1)) @test size(sbm_fitted) == (4,4) - sbm_discretised, discretizer = nethist_discretised( + sbm_discretised, a, discretizer = nethist_discretised( g; number_levels = 5, h = 10, max_iter = 10) @test eltype(sbm_discretised) == typeof(Categorical(5)) @test size(sbm_discretised) == (4,4) From 9ede44b36d7c34491b13cf0a3a4ee44ced7f1a6c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 29 Nov 2024 17:37:55 +0100 Subject: [PATCH 056/266] remove over specification of argument type --- src/sbm.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sbm.jl b/src/sbm.jl index 937d9eb..bc9cd3e 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -113,10 +113,10 @@ function _get_params_as_vec(dist::Distribution) end -function latent_to_block_index(latents::Vector{T}, sbm::BlockModel) where T<:Real +function latent_to_block_index(latents_vec, sbm::BlockModel) cum_sum_sizes = cumsum(sbm.sizes) cum_sum_sizes[end] = 1.0 - return [findfirst(x -> x >= l, cum_sum_sizes) for l in latents] + return [findfirst(x -> x >= l, Ref(cum_sum_sizes)) for l in latents_vec] end """ From 
ea58864bb5dbfb7eb8eecae0196458838a137ebb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 29 Nov 2024 17:38:41 +0100 Subject: [PATCH 057/266] typo --- src/sbm.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sbm.jl b/src/sbm.jl index bc9cd3e..e7fe17b 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -116,7 +116,7 @@ end function latent_to_block_index(latents_vec, sbm::BlockModel) cum_sum_sizes = cumsum(sbm.sizes) cum_sum_sizes[end] = 1.0 - return [findfirst(x -> x >= l, Ref(cum_sum_sizes)) for l in latents_vec] + return [findfirst(x -> x >= l, cum_sum_sizes) for l in latents_vec] end """ From 38c0b0763d7c8d70546aae5c0225127984c77410 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 2 Dec 2024 16:29:30 +0100 Subject: [PATCH 058/266] change progress bar --- src/optimisation/least_squares.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index e4b14ee..f83af96 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -23,7 +23,7 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 10_000, ) # swap memory allocation swap = make_swap(a, (1, 1)) - p = Progress(max_iter; enabled = progress_bar) + p = Progress(max_iter; enabled = progress_bar, showspeed = true) # perform local search until the stopping rule is met for i in 1:max_iter local_search!( From a453dc5dd55d95489b0ab25a03b7ff992df77554 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 2 Dec 2024 18:11:12 +0100 Subject: [PATCH 059/266] backbone --- src/assignments/CategoricalAssignment/swap.jl | 4 ---- src/optimisation/config_rules/InitRule.jl | 10 ++++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index e949c15..4a40a4c 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -24,10 +24,6 @@ 
function make_swap!( id) where {T, F, C} swap.index1, swap.index2 = id copy_realized_and_theta!(swap, a.additional_data) - #copy!.(swap.realized, a.additional_data.realized) - #copy!.(swap.estimated_theta, a.additional_data.estimated_theta) - #swap.log_likelihood = a.additional_data.log_likelihood - #return nothing end function revert_swap!( diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index be7512f..2169073 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -3,6 +3,8 @@ struct OrderedStart <: StartingAssignment end struct RandomStart <: StartingAssignment end struct SpectralStart <: StartingAssignment end struct MetisStart <: StartingAssignment end +struct BiasAdjustedSoS <: StartingAssignment end + struct FromAssignment{A} <: StartingAssignment assignment::A end @@ -88,3 +90,11 @@ function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) results = IterativeSolvers.lobpcg(laplacian, true, rule.k) return group_size, node_labels end + + + +function initialize_node_labels(g, h, ::BiasAdjustedSoS) + # implement method from Bias-adjusted spectral clustering in multilayer stochastic block + # models + +end From 44287c4c13d56b859f2ca82fdc6e13143800b0e4 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 2 Dec 2024 21:38:58 +0100 Subject: [PATCH 060/266] test new update --- src/NetworkHistogram.jl | 2 +- src/api.jl | 4 +- .../CategoricalAssignment/struct.jl | 7 +- src/assignments/CategoricalAssignment/swap.jl | 80 +++++++++++++------ 4 files changed, 63 insertions(+), 30 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 1b3ab1b..ffce6c5 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -18,7 +18,7 @@ import StatsAPI: loglikelihood, fit using CategoricalArrays, CategoricalDistributions using Discretizers: LinearDiscretizer, binedges, DiscretizeUniformWidth, encode using Combinatorics: permutations - 
+using StaticArrays using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx import Bootstrap: bootstrap diff --git a/src/api.jl b/src/api.jl index 9635d8b..eeea918 100644 --- a/src/api.jl +++ b/src/api.jl @@ -20,7 +20,7 @@ end function nethist(g::Observations{G, D}; h = select_number_node_per_block(g, EstimatedDegrees()), - max_iter = 10_000, + max_iter = 100_000, stalled_iter = 1000, swap_rule::NodeSwapRule = RandomGroupSwap(), accept_rule::AcceptRule = Strict(), @@ -39,7 +39,7 @@ end function nethist_discretised(g::Observations{G, D}; number_levels = nothing, h = select_number_node_per_block(g, EstimatedDegrees()), - max_iter = 10_000, + max_iter = 100_000, stalled_iter = 1000, swap_rule::NodeSwapRule = RandomGroupSwap(), accept_rule::AcceptRule = Strict(), diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 3bfd9db..29b4835 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -4,6 +4,7 @@ mutable struct CategoricalData{F, C} estimated_theta::Array{F, 3} A::Matrix{C} # possible use of CategoricalArrays.jl ? 
log_likelihood::F # need to remove this type + scratch::Matrix{Int} end const CategoricalAssignment{T, F, C} = Assignment{ @@ -36,9 +37,10 @@ function make_categorical_data(g, node_labels, group_size) counts, realized, g, Assignment(group_size, node_labels)) _fast_div!(estimated_theta, realized, counts) + scratch = zeros(Int, num_categories, number_groups) ll = compute_log_likelihood(estimated_theta, realized) - return CategoricalData(counts, realized, estimated_theta, A, ll) + return CategoricalData(counts, realized, estimated_theta, A, ll, scratch) end function _count_cat_occurences!(counts, realized, g, a_dummy) @@ -120,7 +122,6 @@ end include("swap.jl") - -function get_ordered_adjacency_matrix(a::CategoricalAssignment, by=identity) +function get_ordered_adjacency_matrix(a::CategoricalAssignment, by = identity) return get_ordered_adjacency_matrix(a, a.additional_data.A, by) end diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 4a40a4c..4c541a9 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -41,6 +41,7 @@ function apply_swap!( a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { T, F, C} update_observed_and_labels!(a, swap) + #new_update_observed_and_labels!(a, swap) update_ll!(a) end @@ -65,6 +66,61 @@ function fit( return dists end +function _move_connection!(realized, group_origin, group_dest, scratch) + for group in axes(realized, 2) + for label in axes(realized, 1) + realized[label, group, group_origin] -= scratch[label, group] + realized[label, group, group_dest] += scratch[label, group] + realized[label, group_origin, group] = realized[label, group, group_origin] + realized[label, group_dest, group] = realized[label, group, group_dest] + end + end +end + +function new_update_observed_and_labels!( + a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { + T, F, C} + g1 = get_group_of_vertex(a, 
swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + if g1 == g2 + return nothing + end + + a.additional_data.scratch .= 0 + for i in axes(a.additional_data.A, 1) + if i == swap.index1 || i == swap.index2 + continue + end + obs = a.additional_data.A[i, swap.index1] + if obs != 0 + group_inter = get_group_of_vertex(a, i) + a.additional_data.scratch[obs, group_inter] += 1 + end + end + _move_connection!(a.additional_data.realized, g1, g2, a.additional_data.scratch) + + a.additional_data.scratch .= 0 + for i in axes(a.additional_data.A, 1) + if i == swap.index1 || i == swap.index2 + continue + end + obs = a.additional_data.A[i, swap.index2] + if obs != 0 + group_inter = get_group_of_vertex(a, i) + a.additional_data.scratch[obs, group_inter] += 1 + end + end + _move_connection!(a.additional_data.realized, g2, g1, a.additional_data.scratch) + + _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, + a.additional_data.counts) + + # swap of the labels should happen after the update of the realized and estimated_theta + # for the above loop to work correctly + swap_node_labels!(a, swap.index1, swap.index2) + return nothing +end + function update_observed_and_labels!( a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { T, F, C} @@ -85,30 +141,6 @@ function update_observed_and_labels!( _fast_update!!( a.additional_data.realized, g1, g2, obs_1, obs_2, group_inter) end - - # if i == swap.index1 || i == swap.index2 || obs_1 == obs_2 - # continue - # else - - # a_g1_g_inter = a.additional_data.realized[g1, group_inter] - # a_g2_g_inter = a.additional_data.realized[g2, group_inter] - # a_g_inter_g1 = realized_g1[group_inter] - # a_g_inter_g2 = realized_g2[group_inter] - - # # send from group 1 to group 2 - # a_g1_g_inter[obs_1] -= 1 - # a_g_inter_g1[obs_1] = a_g1_g_inter[obs_1] - - # a_g2_g_inter[obs_1] += 1 - # a_g_inter_g2[obs_1] = a_g2_g_inter[obs_1] - - # # send from group 2 to group 1 - # a_g2_g_inter[obs_2] -= 1 - # 
a_g_inter_g2[obs_2] = a_g2_g_inter[obs_2] - - # a_g1_g_inter[obs_2] += 1 - # a_g_inter_g1[obs_2] = a_g1_g_inter[obs_2] - # end end _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, From 9d315dc651358d1ac6083046b56a49889ca4d5b0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 2 Dec 2024 21:56:27 +0100 Subject: [PATCH 061/266] new method --- src/assignments/CategoricalAssignment/swap.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 4c541a9..36e8324 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -40,8 +40,8 @@ end function apply_swap!( a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { T, F, C} - update_observed_and_labels!(a, swap) - #new_update_observed_and_labels!(a, swap) + #update_observed_and_labels!(a, swap) + new_update_observed_and_labels!(a, swap) update_ll!(a) end From bdf1ddf8ab1b293a751aea7735e9240b19445b91 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 2 Dec 2024 21:59:54 +0100 Subject: [PATCH 062/266] new method --- src/assignments/CategoricalAssignment/swap.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 36e8324..4c541a9 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -40,8 +40,8 @@ end function apply_swap!( a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { T, F, C} - #update_observed_and_labels!(a, swap) - new_update_observed_and_labels!(a, swap) + update_observed_and_labels!(a, swap) + #new_update_observed_and_labels!(a, swap) update_ll!(a) end From 2037f33066b29d9db4340b8ce23de11908403744 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 2 Dec 2024 22:05:58 +0100 Subject: [PATCH 063/266] typo --- 
src/assignments/CategoricalAssignment/swap.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 4c541a9..36e8324 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -40,8 +40,8 @@ end function apply_swap!( a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { T, F, C} - update_observed_and_labels!(a, swap) - #new_update_observed_and_labels!(a, swap) + #update_observed_and_labels!(a, swap) + new_update_observed_and_labels!(a, swap) update_ll!(a) end From 016104b939d37961a39274bcbfb7a2db8f9c6339 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 2 Dec 2024 23:33:21 +0100 Subject: [PATCH 064/266] update progress bar --- src/optimisation/least_squares.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index f83af96..f68330e 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -23,7 +23,7 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 10_000, ) # swap memory allocation swap = make_swap(a, (1, 1)) - p = Progress(max_iter; enabled = progress_bar, showspeed = true) + p = ProgressUnknown(enabled = progress_bar, showspeed = true, desc = "Greedy search: ") # perform local search until the stopping rule is met for i in 1:max_iter local_search!( From c82d1b7c6bbc76c6b1d5c59f2c9ccddc3f22043a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 2 Dec 2024 23:36:07 +0100 Subject: [PATCH 065/266] update import --- src/NetworkHistogram.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index ffce6c5..48b3469 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -4,7 +4,7 @@ using LinearAlgebra, SparseArrays using Distributions, DensityInterface using Graphs, 
SimpleWeightedGraphs using PermutationSymmetricTensors -using ProgressMeter: Progress, next!, finish! +using ProgressMeter: Progress, next!, finish!, ProgressUnknown import StatsBase, Random using DensityInterface: logdensityof using StaticArrays: MVector, MMatrix From af02584a263897d675ae422cdc2aa0407ae89b14 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 3 Dec 2024 19:26:29 +0100 Subject: [PATCH 066/266] fix arpack version to avoid bug on 0.5.4 --- Project.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f76bfa0..a41150d 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Discretizers = "6e83dbb3-75ca-525b-8ae2-3751f0dd50b4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +EmpiricalDistributions = "0bbb1fad-0f24-45fe-94a4-415852c5cc3b" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -30,7 +31,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] ArnoldiMethod = "0.4.0" -Arpack = "0.5.4" +Arpack = "0.5.3" Bootstrap = "2.4.0" CategoricalArrays = "0.10.8" CategoricalDistributions = "0.1.15" @@ -39,6 +40,7 @@ Combinatorics = "1.0.2" DensityInterface = "0.4.0" Discretizers = "3.2.3" Distributions = "0.25.112" +EmpiricalDistributions = "0.3.8" Graphs = "1.12.0" IterativeSolvers = "0.9.4" LinearAlgebra = "1.11.0" From c8b207bf91a176518c521a634db6140dc03b1502 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 3 Dec 2024 19:52:05 +0100 Subject: [PATCH 067/266] remove Arpack --- src/NetworkHistogram.jl | 1 - .../CategoricalAssignment/struct.jl | 2 +- src/assignments/CategoricalAssignment/swap.jl | 9 ++- src/discretised_dist.jl | 66 +++++++++++++++++++ src/observations.jl | 3 + src/optimisation/config_rules/InitRule.jl | 5 +- 
.../config_rules/bandwidth_selection_rule.jl | 8 ++- 7 files changed, 84 insertions(+), 10 deletions(-) create mode 100644 src/discretised_dist.jl diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 48b3469..74c3e15 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -10,7 +10,6 @@ using DensityInterface: logdensityof using StaticArrays: MVector, MMatrix using LogExpFunctions: xlogx, xlogy using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen -import Arpack import Metis import IterativeSolvers import Clustering diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index 29b4835..b6ce477 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -12,7 +12,7 @@ const CategoricalAssignment{T, F, C} = Assignment{ const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} function CategoricalAssignment( - g, group_size::GroupSize, node_labels::Vector{Int}) + g::Observations{G,D}, group_size::GroupSize, node_labels::Vector{Int}) where {G,D} categorical_data = make_categorical_data(g, node_labels, group_size) return Assignment(group_size, node_labels, categorical_data) end diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 36e8324..5ac5811 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -67,7 +67,7 @@ function fit( end function _move_connection!(realized, group_origin, group_dest, scratch) - for group in axes(realized, 2) + @inbounds for group in axes(realized, 2) for label in axes(realized, 1) realized[label, group, group_origin] -= scratch[label, group] realized[label, group, group_dest] += scratch[label, group] @@ -77,6 +77,9 @@ function _move_connection!(realized, group_origin, group_dest, scratch) end end + +# need to rethink if want to use muli-threading +# check 
https://juliafolds.github.io/Transducers.jl/dev/tutorials/words/ function new_update_observed_and_labels!( a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { T, F, C} @@ -91,7 +94,7 @@ function new_update_observed_and_labels!( if i == swap.index1 || i == swap.index2 continue end - obs = a.additional_data.A[i, swap.index1] + @inbounds obs = a.additional_data.A[i, swap.index1] if obs != 0 group_inter = get_group_of_vertex(a, i) a.additional_data.scratch[obs, group_inter] += 1 @@ -104,7 +107,7 @@ function new_update_observed_and_labels!( if i == swap.index1 || i == swap.index2 continue end - obs = a.additional_data.A[i, swap.index2] + @inbounds obs = a.additional_data.A[i, swap.index2] if obs != 0 group_inter = get_group_of_vertex(a, i) a.additional_data.scratch[obs, group_inter] += 1 diff --git a/src/discretised_dist.jl b/src/discretised_dist.jl new file mode 100644 index 0000000..b48e8df --- /dev/null +++ b/src/discretised_dist.jl @@ -0,0 +1,66 @@ +using CategoricalArrays +using CategoricalDistributions +using Distributions + + +struct Encoder{F, S} + breaks::Vector{F} + labels::Vector{S} + extended::Bool + + function Encoder( + _breaks::AbstractVector{F}, labels = CategoricalArrays.default_formatter; + extend = missing) where {F} + breaks = sort(_breaks) + n = length(breaks) + from = breaks[1:(n - 1)] + to = breaks[2:n] + firstlevel = labels(from[1], to[1], 1, + leftclosed = breaks[1] != breaks[2], rightclosed = false) + levs = Vector{typeof(firstlevel)}(undef, n - 1) + levs[1] = firstlevel + for i in 2:(n - 2) + levs[i] = labels(from[i], to[i], i, + leftclosed = breaks[i] != breaks[i + 1], rightclosed = false) + end + levs[end] = labels(from[end], to[end], n - 1, + leftclosed = breaks[end - 1] != breaks[end], + rightclosed = coalesce(extend, false)) + + new{F, typeof(firstlevel)}(breaks, levs, coalesce(extend, false)) + end +end + +function convert(encoder::Encoder, x::T) where {T <: Real} + if x < encoder.breaks[1] || x > encoder.breaks[end] + 
throw(ArgumentError("Value $x out of bounds $(encoder.breaks[1]) - $(encoder.breaks[end])")) + end + if x == encoder.breaks[end] && encoder.extended + return encoder.labels[end] + end + if x == encoder.breaks[1] + return encoder.labels[1] + end + return encoder.labels[findlast(y-> y <= x, encoder.breaks)] +end + + +function convert(encoder::Encoder, x::String) + index = findfirst(l -> l == x, encoder.labels) + if isnothing(index) + throw(ArgumentError("Value $x not found in $(encoder.labels)")) + end + if index == 1 + return encoder.breaks[1], encoder.breaks[2] + elseif index == length(encoder.labels) + return encoder.breaks[end-1], encoder.breaks[end] + else + return encoder.breaks[index-1], encoder.breaks[index] + end +end + + +struct DiscretisedDist{S, F, L} + dist::UnivariateFinite{S} + encoding::Encoder{F,L} +end diff --git a/src/observations.jl b/src/observations.jl index 2656275..e6b10e5 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -64,6 +64,9 @@ function normalized_laplacian(g::AbstractGraph) return normalized_laplacian(Graphs.adjacency_matrix(g)) end + +normalized_laplacian(g::CategoricalArray) = normalized_laplacian(levelcode.(g)) + function normalized_laplacian(g::AbstractMatrix) degrees = sum(g, dims = 1) degrees .-= minimum(degrees) diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 2169073..7cc4329 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -54,9 +54,10 @@ function initialize_node_labels(g, h, ::SpectralStart) node_labels = zeros(Int, number_nodes(g)) laplacian = normalized_laplacian(g) - _, eigenvectors = Arpack.eigs(laplacian, nev = 2, which = :LR) + decomp, = partialschur(laplacian, nev=2, which=:LR) + # get 2nd eigenvector, sort its components - indices = sortperm(real.(eigenvectors[:, 1])) + indices = sortperm(real.(decomp.Q[:, 2])) # bin them into groups of correct size start = 1 for (i, group) in 
enumerate(group_size) diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index df4d659..a378dcb 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -78,8 +78,10 @@ end function estimated_number_nodes_per_block( g::Observations, ::EstimatedEigenvalues, points, rho) - λ, u = Arpack.eigs(get_adj(g), nev = 1, which = :LM) - return _approx_k_from_delta_f(u, λ[1], points, rho) + @warn "Check this method again" + decomp, = partialschur(get_adj(g), nev = 1, which = :LR) + u, λ = real.(decomp.Q), decomp.eigenvalues[1] + return _approx_k_from_delta_f(u, λ, points, rho) end function estimated_number_nodes_per_block( @@ -90,7 +92,7 @@ function estimated_number_nodes_per_block( end function _approx_k_from_delta_f(u, mult, midpoints, ρ, α = 1.0) - sort!(u, dims = 1) + sort!(u, dims=1) uMid = u[midpoints] β₀, β₁ = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid # from Olhede and Wolfe (2014), equation (11) From 572cfd52773f86735d8119f255371e12ab86dcf9 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 3 Dec 2024 19:52:21 +0100 Subject: [PATCH 068/266] update dependencies --- Project.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Project.toml b/Project.toml index a41150d..42ec043 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,6 @@ version = "0.5.2" [deps] ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" -Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" @@ -14,7 +13,6 @@ Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Discretizers = "6e83dbb3-75ca-525b-8ae2-3751f0dd50b4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" -EmpiricalDistributions = 
"0bbb1fad-0f24-45fe-94a4-415852c5cc3b" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -31,7 +29,6 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] ArnoldiMethod = "0.4.0" -Arpack = "0.5.3" Bootstrap = "2.4.0" CategoricalArrays = "0.10.8" CategoricalDistributions = "0.1.15" @@ -40,7 +37,6 @@ Combinatorics = "1.0.2" DensityInterface = "0.4.0" Discretizers = "3.2.3" Distributions = "0.25.112" -EmpiricalDistributions = "0.3.8" Graphs = "1.12.0" IterativeSolvers = "0.9.4" LinearAlgebra = "1.11.0" From 39a199909bdd72edb64926266c4bb72bdf6db3fe Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 6 Dec 2024 16:56:46 +0100 Subject: [PATCH 069/266] define convention for discretisation --- Project.toml | 2 - src/NetworkHistogram.jl | 5 +- src/discretised_dist.jl | 66 -------------- src/distributions/discretizer.jl | 127 +++++++++++++++++++++++++++ src/distributions/include.jl | 1 + src/observations.jl | 17 ++-- test/Project.toml | 1 + test/discretised_dist/discretizer.jl | 17 ++++ test/observations/discretisation.jl | 6 +- test/runtests.jl | 4 + test/test_api.jl | 2 +- 11 files changed, 163 insertions(+), 85 deletions(-) delete mode 100644 src/discretised_dist.jl create mode 100644 src/distributions/discretizer.jl create mode 100644 src/distributions/include.jl create mode 100644 test/discretised_dist/discretizer.jl diff --git a/Project.toml b/Project.toml index 42ec043..cbb639a 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,6 @@ CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" -Discretizers = "6e83dbb3-75ca-525b-8ae2-3751f0dd50b4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterativeSolvers = 
"42fd0dbc-a981-5370-80f2-aaf504508153" @@ -35,7 +34,6 @@ CategoricalDistributions = "0.1.15" Clustering = "0.15.7" Combinatorics = "1.0.2" DensityInterface = "0.4.0" -Discretizers = "3.2.3" Distributions = "0.25.112" Graphs = "1.12.0" IterativeSolvers = "0.9.4" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 74c3e15..73824bb 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -7,7 +7,6 @@ using PermutationSymmetricTensors using ProgressMeter: Progress, next!, finish!, ProgressUnknown import StatsBase, Random using DensityInterface: logdensityof -using StaticArrays: MVector, MMatrix using LogExpFunctions: xlogx, xlogy using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen import Metis @@ -15,14 +14,12 @@ import IterativeSolvers import Clustering import StatsAPI: loglikelihood, fit using CategoricalArrays, CategoricalDistributions -using Discretizers: LinearDiscretizer, binedges, DiscretizeUniformWidth, encode using Combinatorics: permutations using StaticArrays - using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx import Bootstrap: bootstrap - +include("distributions/include.jl") include("assignments/Assignments.jl") include("sbm.jl") include("observations.jl") diff --git a/src/discretised_dist.jl b/src/discretised_dist.jl deleted file mode 100644 index b48e8df..0000000 --- a/src/discretised_dist.jl +++ /dev/null @@ -1,66 +0,0 @@ -using CategoricalArrays -using CategoricalDistributions -using Distributions - - -struct Encoder{F, S} - breaks::Vector{F} - labels::Vector{S} - extended::Bool - - function Encoder( - _breaks::AbstractVector{F}, labels = CategoricalArrays.default_formatter; - extend = missing) where {F} - breaks = sort(_breaks) - n = length(breaks) - from = breaks[1:(n - 1)] - to = breaks[2:n] - firstlevel = labels(from[1], to[1], 1, - leftclosed = breaks[1] != breaks[2], rightclosed = false) - levs = Vector{typeof(firstlevel)}(undef, n - 1) - levs[1] = firstlevel - for i in 2:(n - 2) - levs[i] = 
labels(from[i], to[i], i, - leftclosed = breaks[i] != breaks[i + 1], rightclosed = false) - end - levs[end] = labels(from[end], to[end], n - 1, - leftclosed = breaks[end - 1] != breaks[end], - rightclosed = coalesce(extend, false)) - - new{F, typeof(firstlevel)}(breaks, levs, coalesce(extend, false)) - end -end - -function convert(encoder::Encoder, x::T) where {T <: Real} - if x < encoder.breaks[1] || x > encoder.breaks[end] - throw(ArgumentError("Value $x out of bounds $(encoder.breaks[1]) - $(encoder.breaks[end])")) - end - if x == encoder.breaks[end] && encoder.extended - return encoder.labels[end] - end - if x == encoder.breaks[1] - return encoder.labels[1] - end - return encoder.labels[findlast(y-> y <= x, encoder.breaks)] -end - - -function convert(encoder::Encoder, x::String) - index = findfirst(l -> l == x, encoder.labels) - if isnothing(index) - throw(ArgumentError("Value $x not found in $(encoder.labels)")) - end - if index == 1 - return encoder.breaks[1], encoder.breaks[2] - elseif index == length(encoder.labels) - return encoder.breaks[end-1], encoder.breaks[end] - else - return encoder.breaks[index-1], encoder.breaks[index] - end -end - - -struct DiscretisedDist{S, F, L} - dist::UnivariateFinite{S} - encoding::Encoder{F,L} -end diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl new file mode 100644 index 0000000..896dd2d --- /dev/null +++ b/src/distributions/discretizer.jl @@ -0,0 +1,127 @@ +abstract type Discretizer end + + +function encode(d::Discretizer, x::AbstractArray{<:Real}) + return [encode(d, u) for u in x] +end + +function decode(d::Discretizer, x::AbstractArray{<:Real}) + return [decode(d, u) for u in x] +end + +""" +Uniformly discretizes a continuous distribution into a fixed number of bins of equal width. 
+""" +struct RegularDiscretizer{F, T, L} <: Discretizer + n_bins::Int + lower_bound::F + upper_bound::F + bin_labels::MVector{L, T} + bin_width::F +end + + +function support_encoding(d::RegularDiscretizer, x::Real) + return d.lower_bound <= x <= d.upper_bound +end + +function encode(d::RegularDiscretizer, x::Real) + if !support_encoding(d, x) + throw(ArgumentError("Value $x is not supported by the discretizer")) + end + if x == d.upper_bound + return d.n_bins + end + return d.bin_labels[convert(Int, div(x - d.lower_bound, d.bin_width) + 1)] +end + +function decode(d::RegularDiscretizer, bin::Int) + return (d.lower_bound + (bin - 1) * d.bin_width, d.lower_bound + bin * d.bin_width) +end + +function encode(d::RegularDiscretizer, x::AbstractArray{Real}) + return [encode(d, u) for u in x] +end + + +function decode(d::RegularDiscretizer, x::AbstractArray{Real}) + return [decode(d, u) for u in x] +end + +function nlabels(d::RegularDiscretizer) + return d.n_bins +end + +""" +Maps a set of categories to a set of bins +""" +struct CategoryDiscretizer{F, T} + cat_to_bin::Dict{F, T} + bin_to_cat::Dict{T, F} +end + +function support_encoding(d::CategoryDiscretizer, x) + return haskey(d.cat_to_bin, x) +end + +function encode(d::CategoryDiscretizer, x) + return d.cat_to_bin[x] +end + +function decode(d::CategoryDiscretizer, label) + return d.bin_to_cat[label] +end + +function nlabels(d::CategoryDiscretizer) + return length(d.bin_to_cat) +end + +""" +Uniformly discretizes a continuous distribution into a fixed number of bins of equal width, +with additional bins for missing or special values. 
+""" +struct HybridDiscretizer{F, F2, T, L} <: Discretizer + lin::RegularDiscretizer{F, T, L} + cat::CategoryDiscretizer{F2, T} +end + +function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) + cat_to_bin = Dict(a => n_bins + i for (i, a) in enumerate(atoms)) + bin_to_cat = Dict(n_bins + i => a for (i, a) in enumerate(atoms)) + bin_width = (upper_bound - lower_bound) / n_bins + return HybridDiscretizer( + RegularDiscretizer{typeof(bin_width), Int, n_bins}( + n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), + (upper_bound - lower_bound) / n_bins), + CategoryDiscretizer(cat_to_bin, bin_to_cat) + ) +end + + +function support_encoding(d::HybridDiscretizer, x) + return support_encoding(d.lin, x) || support_encoding(d.cat, x) +end + +function nlabels(d::HybridDiscretizer) + return nlabels(d.lin) + nlabels(d.cat) +end + + +function encode(d::HybridDiscretizer, x::Real) + if !support_encoding(d, x) + throw(ArgumentError("Value $x is not supported by the discretizer")) + end + if haskey(d.cat.cat_to_bin, x) + return encode(d.cat, x) + else + return encode(d.lin, x) + end +end + +function decode(d::HybridDiscretizer, bin::Int) + if haskey(d.cat.bin_to_cat, bin) + return decode(d.cat, bin) + else + return decode(d.lin, bin) + end +end diff --git a/src/distributions/include.jl b/src/distributions/include.jl new file mode 100644 index 0000000..347c67d --- /dev/null +++ b/src/distributions/include.jl @@ -0,0 +1 @@ +include("discretizer.jl") diff --git a/src/observations.jl b/src/observations.jl index e6b10e5..231d1e1 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -114,6 +114,11 @@ function Metis.graph(g::Observations{<:CategoricalMatrix, <:UnivariateFinite}) adjacency_matrix(SimpleWeightedGraph(A)), weights = true) end + +""" +Assume that the diagonal is zero. +0 indicates no edge, while missing indicates no information about the edge. 
+""" function discretise(g::Observations{G, D}; number_groups = nothing, number_levels = nothing) where {G, D} if isnothing(number_groups) && isnothing(number_levels) @@ -126,19 +131,13 @@ function discretise(g::Observations{G, D}; @warn "disregarding `number_groups` as `number_levels` is provided" end end - #zero_locations = g.graph .== 0 - bin_edges = binedges(DiscretizeUniformWidth(number_levels), g.graph) - discretiser = LinearDiscretizer(bin_edges) + discretiser = HybridDiscretizer(number_levels-1, extrema(g.graph)..., 0.0) return discretise(g, discretiser) end -function discretise(g::Observations{G, D}, discretiser ::LinearDiscretizer) where {G,D<:UnivariateDistribution} +function discretise(g::Observations{G, D}, discretiser ::Discretizer) where {G,D<:UnivariateDistribution} A_encoded = encode(discretiser, _graph_to_mat(g)) - for i in 1:size(A_encoded, 1) - A_encoded[i, i] = 0 - end - #A_encoded[zero_locations] .= 0 - return Observations(A_encoded, Categorical(discretiser.nbins + 1)), discretiser + return Observations(A_encoded, Categorical(nlabels(discretiser))), discretiser end diff --git a/test/Project.toml b/test/Project.toml index b225864..bb072c9 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,4 +4,5 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/discretised_dist/discretizer.jl b/test/discretised_dist/discretizer.jl new file mode 100644 index 0000000..9094987 --- /dev/null +++ b/test/discretised_dist/discretizer.jl @@ -0,0 +1,17 @@ +using NetworkHistogram + +@testset "discretizer" begin + using StaticArrays + reg_disc = NetworkHistogram.RegularDiscretizer(10, 0.0, 1.0, MVector{10}(1:10), 1 / 10) + cat_disc = NetworkHistogram.CategoryDiscretizer( + Dict([0.0 => 
11]), Dict([11 => 0.0])) + hybrid_disc = NetworkHistogram.HybridDiscretizer( + reg_disc, cat_disc) + + @test NetworkHistogram.encode(reg_disc, 0.0) == 1 + @test NetworkHistogram.encode(cat_disc, 0.0) == 11 + @test NetworkHistogram.encode(hybrid_disc, 0.0) == 11 + @test NetworkHistogram.decode(hybrid_disc, 11) == 0.0 + @test all(NetworkHistogram.encode(reg_disc, 0.001:0.001:1.0) .== NetworkHistogram.encode(hybrid_disc, 0.001:0.001:1.0)) + @test all(NetworkHistogram.decode(hybrid_disc, 1:10) .== NetworkHistogram.decode(reg_disc, 1:10)) +end diff --git a/test/observations/discretisation.jl b/test/observations/discretisation.jl index 45d46e7..4b25863 100644 --- a/test/observations/discretisation.jl +++ b/test/observations/discretisation.jl @@ -6,9 +6,9 @@ using NetworkHistogram for i in 1:20 A[i, i] = 0 end - g = Observations(A,Uniform(-1,1)) - discretised_g, discretizer = discretise(g; number_levels = 5) + g = Observations(A, Uniform(-1, 1)) + discretised_g, discretizer = discretise(g; number_levels = 6) @test size(discretised_g.graph) == size(g.graph) @test discretised_g.dist_ref == Categorical(6) - @test all(discretised_g.graph .∈ Ref(0:5)) + @test all(discretised_g.graph .∈ Ref(0:6)) end diff --git a/test/runtests.jl b/test/runtests.jl index 30b15d0..c0b9d60 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,10 @@ using Aqua include("TestNetworkHistogram.jl") @testset "Tests" begin + + @testset "Discretizer tests" begin + include("discretised_dist/discretizer.jl") + end @testset "Assignment tests" begin include("assignments/default_assignment.jl") include("assignments/bernoulli_assignment.jl") diff --git a/test/test_api.jl b/test/test_api.jl index d179b42..24652d7 100644 --- a/test/test_api.jl +++ b/test/test_api.jl @@ -1,6 +1,6 @@ @testset "test api" begin using Distributions - A = rand(0:1, 40, 40) + A = rand(-1:1, 40, 40) for i in 1:40 A[i, i] = 0 end From 5bfc699122a2e8246f41a8e26616c779592e4e26 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 6 
Dec 2024 17:38:17 +0100 Subject: [PATCH 070/266] add basic discretized distribution --- src/NetworkHistogram.jl | 2 + src/distributions/discrete_dist.jl | 47 ++++++++++++++++++ src/distributions/discretizer.jl | 76 +++++++++++++++++++++++++----- src/distributions/include.jl | 1 + 4 files changed, 114 insertions(+), 12 deletions(-) create mode 100644 src/distributions/discrete_dist.jl diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 73824bb..be55bb4 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -18,6 +18,8 @@ using Combinatorics: permutations using StaticArrays using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx import Bootstrap: bootstrap +import Base.maximum, Base.minimum +import Random: rand include("distributions/include.jl") include("assignments/Assignments.jl") diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl new file mode 100644 index 0000000..3c6928a --- /dev/null +++ b/src/distributions/discrete_dist.jl @@ -0,0 +1,47 @@ +struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution where {D, L} + disc::D + probs::L +end + +function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) where {D} + disc = HybridDiscretizer(n_bins, support_bound..., 0.0) + # for now we keep track of the non-edges as well + probs = Distributions.Categorical(nlabels(discretizer)) + return DiscretizedDistribution(disc, probs) +end + +function pdf(d::DiscretizedDistribution, x::Real) + if !supports_encoding(d.discretizer, x) + return 0.0 + end + # for now suppose that the non-edges are encoded in the last bin + bin = encode(d.discretizer, x) + return pdf(d.probs, bin) / binwidth(d.discretizer) +end + +function logpdf(d::DiscretizedDistribution, x::Real) + if !supports_encoding(d.discretizer, x) + return -Inf + end + # for now suppose that the non-edges are encoded in the last bin + bin = encode(d.discretizer, x) + return log(pdf(d.probs, bin)) - 
log(binwidth(d.discretizer)) +end + + +function rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) + bin = rand(rng, d.probs) + return _decode_randomly(rng, d.discretizer, bin) +end + +function minimum(d::DiscretizedDistribution) + return minimum(d.discretizer) +end + +function maximum(d::DiscretizedDistribution) + return maximum(d.discretizer) +end + +function insupport(d::DiscretizedDistribution, x::Real) + return supports_encoding(d.discretizer, x) +end diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index 896dd2d..bed7eed 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -1,6 +1,7 @@ +# Inspired by Discretizer.jl but with the fast decoding function and built-in +# convention for discretizing continuous distributions. abstract type Discretizer end - function encode(d::Discretizer, x::AbstractArray{<:Real}) return [encode(d, u) for u in x] end @@ -20,21 +21,34 @@ struct RegularDiscretizer{F, T, L} <: Discretizer bin_width::F end - function support_encoding(d::RegularDiscretizer, x::Real) return d.lower_bound <= x <= d.upper_bound end +function minimum(d::RegularDiscretizer) + return d.lower_bound +end + +function maximum(d::RegularDiscretizer) + return d.upper_bound +end + function encode(d::RegularDiscretizer, x::Real) - if !support_encoding(d, x) - throw(ArgumentError("Value $x is not supported by the discretizer")) - end if x == d.upper_bound return d.n_bins end return d.bin_labels[convert(Int, div(x - d.lower_bound, d.bin_width) + 1)] end +function _decode_randomly(rng::Random.AbstractRNG, d::RegularDiscretizer, bin::Int) + hi,lo = decode(d, bin) + return lo + (hi - lo) * rand(rng) +end + +function binwidth(d::RegularDiscretizer) + return d.bin_width +end + function decode(d::RegularDiscretizer, bin::Int) return (d.lower_bound + (bin - 1) * d.bin_width, d.lower_bound + bin * d.bin_width) end @@ -43,7 +57,6 @@ function encode(d::RegularDiscretizer, x::AbstractArray{Real}) return 
[encode(d, u) for u in x] end - function decode(d::RegularDiscretizer, x::AbstractArray{Real}) return [decode(d, u) for u in x] end @@ -58,6 +71,14 @@ Maps a set of categories to a set of bins struct CategoryDiscretizer{F, T} cat_to_bin::Dict{F, T} bin_to_cat::Dict{T, F} + min_label::T + max_label::T +end + +function CategoryDiscretizer(cat_to_bin::Dict, bin_to_cat::Dict) + min_label = minimum(keys(bin_to_cat)) + max_label = maximum(keys(bin_to_cat)) + return CategoryDiscretizer(cat_to_bin, bin_to_cat, min_label, max_label) end function support_encoding(d::CategoryDiscretizer, x) @@ -76,6 +97,14 @@ function nlabels(d::CategoryDiscretizer) return length(d.bin_to_cat) end +function minimum(d::CategoryDiscretizer) + return d.min_label +end + +function maximum(d::CategoryDiscretizer) + return d.max_label +end + """ Uniformly discretizes a continuous distribution into a fixed number of bins of equal width, with additional bins for missing or special values. @@ -97,20 +126,24 @@ function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) ) end - function support_encoding(d::HybridDiscretizer, x) return support_encoding(d.lin, x) || support_encoding(d.cat, x) end + +function minimum(d::HybridDiscretizer) + return min(minimum(d.lin), minimum(d.cat)) +end + +function maximum(d::HybridDiscretizer) + return max(maximum(d.lin), maximum(d.cat)) +end + function nlabels(d::HybridDiscretizer) return nlabels(d.lin) + nlabels(d.cat) end - function encode(d::HybridDiscretizer, x::Real) - if !support_encoding(d, x) - throw(ArgumentError("Value $x is not supported by the discretizer")) - end if haskey(d.cat.cat_to_bin, x) return encode(d.cat, x) else @@ -119,9 +152,28 @@ function encode(d::HybridDiscretizer, x::Real) end function decode(d::HybridDiscretizer, bin::Int) - if haskey(d.cat.bin_to_cat, bin) + if haskey(d.cat.bin_to_cat, bin) return decode(d.cat, bin) else return decode(d.lin, bin) end end + + +function _decode_randomly(rng::Random.AbstractRNG, d::HybridDiscretizer, 
bin::Int) + if haskey(d.cat.bin_to_cat, bin) + return decode(d.cat, bin) + else + return _decode_randomly(rng, d.lin, bin) + end +end + + +function auto_nbins(data) + binwidth = 2iqr(data) / cbrt(n) + lo, hi = extrema(data) + nbins_fd = ceil(Int, (hi - lo) / binwidth) + nbins_sturges = ceil(Int, log(2, n)) + 1 + nbins = max(nbins_fd, nbins_sturges) + return nbins +end diff --git a/src/distributions/include.jl b/src/distributions/include.jl index 347c67d..d7998eb 100644 --- a/src/distributions/include.jl +++ b/src/distributions/include.jl @@ -1 +1,2 @@ include("discretizer.jl") +include("discrete_dist.jl") From f9ea503c2e9fae06a1eef26c07b2e9aa6f9feae5 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 6 Dec 2024 18:20:00 +0100 Subject: [PATCH 071/266] fix DiscretizedDistribution --- src/NetworkHistogram.jl | 4 ++- src/api.jl | 3 +- .../CategoricalAssignment/struct.jl | 2 +- src/assignments/CategoricalAssignment/swap.jl | 19 +++++++++++-- src/distributions/discrete_dist.jl | 28 +++++++++++++++++-- src/observations.jl | 5 ++-- test/observations/discretisation.jl | 3 +- test/test_api.jl | 3 +- 8 files changed, 54 insertions(+), 13 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index be55bb4..40d83e4 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -20,6 +20,8 @@ using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx import Bootstrap: bootstrap import Base.maximum, Base.minimum import Random: rand +import Base.convert +import Distributions: pdf,logpdf, ncategories include("distributions/include.jl") include("assignments/Assignments.jl") @@ -56,7 +58,7 @@ export Assignment, number_groups, number_nodes export get_ordered_adjacency_matrix, get_vertex_in_group, get_group_of_vertex export BernoulliData, CategoricalData export Observations, discretise - +export DiscretizedDistribution export bootstrap diff --git a/src/api.jl b/src/api.jl index eeea918..ee7125b 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,7 +1,8 @@ 
function _default_init(dist::Distribution, start = MetisStart()) if dist isa Bernoulli return InitRule(start, Val{BernoulliData}()) - elseif dist isa Categorical + elseif dist isa Categorical || dist isa CategoricalArray || + dist isa DiscretizedDistribution return InitRule(start, Val{CategoricalData}()) else return InitRule(start, nothing) diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index b6ce477..d44b5f7 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ b/src/assignments/CategoricalAssignment/struct.jl @@ -108,7 +108,7 @@ function categorical_matrix(A::AbstractMatrix{Int}) end function categorical_matrix(g::Observations) - return categorical_matrix(g.graph), length(support(g.dist_ref)) + return categorical_matrix(g.graph), ncategories(g.dist_ref) end function loglikelihood(a::CategoricalAssignment, g::Observations) diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 5ac5811..6b70eb6 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -55,7 +55,7 @@ function fit( a::CategoricalAssignment{T, F, C}, g::Observations) where { T, F, C} dists = initialize_sbm( - a.group_size, Categorical(length(support(g.dist_ref)))) + a.group_size, g.dist_ref) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) dists[group1, @@ -66,6 +66,22 @@ function fit( return dists end +function fit( + a::CategoricalAssignment{T, F, C}, g::Observations{G, <:DiscretizedDistribution}) where { + T, F, C, G} + dists = initialize_sbm( + a.group_size, g.dist_ref) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + set_params!( + dists[group1, + group2], a.additional_data.estimated_theta[:, + group1, group2]) + end + end + return dists +end + function _move_connection!(realized, group_origin, group_dest, scratch) @inbounds for group in axes(realized, 2) for label in 
axes(realized, 1) @@ -77,7 +93,6 @@ function _move_connection!(realized, group_origin, group_dest, scratch) end end - # need to rethink if want to use muli-threading # check https://juliafolds.github.io/Transducers.jl/dev/tutorials/words/ function new_update_observed_and_labels!( diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 3c6928a..358f2ed 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -1,5 +1,5 @@ -struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution where {D, L} - disc::D +mutable struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution where {D, L} + discretizer::D probs::L end @@ -10,6 +10,11 @@ function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) return DiscretizedDistribution(disc, probs) end +function DiscretizedDistribution(discretizer::Discretizer) + return DiscretizedDistribution( + discretizer, Distributions.Categorical(nlabels(discretizer))) +end + function pdf(d::DiscretizedDistribution, x::Real) if !supports_encoding(d.discretizer, x) return 0.0 @@ -28,7 +33,6 @@ function logpdf(d::DiscretizedDistribution, x::Real) return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) end - function rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) bin = rand(rng, d.probs) return _decode_randomly(rng, d.discretizer, bin) @@ -45,3 +49,21 @@ end function insupport(d::DiscretizedDistribution, x::Real) return supports_encoding(d.discretizer, x) end + +function Base.convert(::Type{DiscretizedDistribution}, d::D) where {D} + return DiscretizedDistribution(d, 10) +end + + +function Distributions.ncategories(d::DiscretizedDistribution) + return ncategories(d.probs) +end + + +function Distributions.fit(::Type{<:DiscretizedDistribution{D,L}},data) where {D,L} + return fit(L, data) +end + +function set_params!(d::DiscretizedDistribution{D, L}, params) where {D,L} + d.probs = L(params) +end diff --git 
a/src/observations.jl b/src/observations.jl index 231d1e1..332aeff 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -131,13 +131,12 @@ function discretise(g::Observations{G, D}; @warn "disregarding `number_groups` as `number_levels` is provided" end end - discretiser = HybridDiscretizer(number_levels-1, extrema(g.graph)..., 0.0) - return discretise(g, discretiser) + return discretise(g, HybridDiscretizer(number_levels-1, extrema(g.graph)..., 0.0)) end function discretise(g::Observations{G, D}, discretiser ::Discretizer) where {G,D<:UnivariateDistribution} A_encoded = encode(discretiser, _graph_to_mat(g)) - return Observations(A_encoded, Categorical(nlabels(discretiser))), discretiser + return Observations(A_encoded, DiscretizedDistribution(discretiser)), discretiser end diff --git a/test/observations/discretisation.jl b/test/observations/discretisation.jl index 4b25863..49eb959 100644 --- a/test/observations/discretisation.jl +++ b/test/observations/discretisation.jl @@ -9,6 +9,7 @@ using NetworkHistogram g = Observations(A, Uniform(-1, 1)) discretised_g, discretizer = discretise(g; number_levels = 6) @test size(discretised_g.graph) == size(g.graph) - @test discretised_g.dist_ref == Categorical(6) + @test discretised_g.dist_ref isa NetworkHistogram.DiscretizedDistribution + @test ncategories(discretised_g.dist_ref) == 6 @test all(discretised_g.graph .∈ Ref(0:6)) end diff --git a/test/test_api.jl b/test/test_api.jl index 24652d7..db1a77a 100644 --- a/test/test_api.jl +++ b/test/test_api.jl @@ -13,6 +13,7 @@ sbm_discretised, a, discretizer = nethist_discretised( g; number_levels = 5, h = 10, max_iter = 10) - @test eltype(sbm_discretised) == typeof(Categorical(5)) + @test sbm_discretised[1,1] isa DiscretizedDistribution + @test ncategories(sbm_discretised[1,1]) == 5 @test size(sbm_discretised) == (4,4) end From 0d08ee5555f9c214e033d9be2741eba2fd80e85c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 6 Dec 2024 20:48:30 +0100 Subject: [PATCH 072/266] 
first try of sparse assignments --- src/assignments/GroupedAssignment/struct.jl | 3 - src/assignments/SparseAssignment/struct.jl | 103 ++++++++++++++++++++ src/assignments/SparseAssignment/swap.jl | 96 ++++++++++++++++++ src/assignments/include.jl | 1 + src/distributions/discretizer.jl | 15 +++ test/Project.toml | 1 + test/assignments/sparse_assignment.jl | 92 +++++++++++++++++ test/runtests.jl | 3 +- 8 files changed, 310 insertions(+), 4 deletions(-) delete mode 100644 src/assignments/GroupedAssignment/struct.jl create mode 100644 src/assignments/SparseAssignment/struct.jl create mode 100644 src/assignments/SparseAssignment/swap.jl create mode 100644 test/assignments/sparse_assignment.jl diff --git a/src/assignments/GroupedAssignment/struct.jl b/src/assignments/GroupedAssignment/struct.jl deleted file mode 100644 index d8461d9..0000000 --- a/src/assignments/GroupedAssignment/struct.jl +++ /dev/null @@ -1,3 +0,0 @@ -# assignment that move the edge data around when trying an update -# might be useful if the computation of the loglikelihood is expensive -# and with no closed-form solution: contiguous memory access might be faster diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl new file mode 100644 index 0000000..91bebb0 --- /dev/null +++ b/src/assignments/SparseAssignment/struct.jl @@ -0,0 +1,103 @@ +mutable struct SparseData{F, C} + counts::Matrix{Int} + realized::Array{Int, 3} + estimated_theta::Array{F, 3} + A::SparseMatrixCSC{C, Int} + scratch_count::Matrix{Int} + scratch_missing::Vector{Int} + log_likelihood::F +end + + +const SparseAssignment{T, F, C} = Assignment{ + T, SparseData{F, C}} +const SparseInitRule{S, F} = InitRule{S, Val{SparseData}} + +function SparseAssignment( g::Observations{G,D}, group_size::GroupSize, node_labels::Vector{Int}) where {G,D} + A = issparse(g.graph) ? 
g.graph : sparse(g.graph) + num_levels = length(unique(A)) -1 + sparse_data = SparseData(A, size(group_size, 1), num_levels, group_size, node_labels) + return Assignment(group_size, node_labels, sparse_data) +end + + +function SparseData(A::SparseMatrixCSC{T, Int}, k::Int, + level_count::Int, group_size, node_labels) where {T} + n = size(A, 1) + data = SparseData(zeros(Int, k, k), zeros(Int, level_count, k, k), + zeros(Float64, level_count, k, k), dropzeros!(A), zeros(Int, level_count, k), zeros(Int, k), 0.0) + _count_possible_occurences!(data, group_size) + _count_occurences!(data, node_labels) + _fast_div!(data.estimated_theta, data.realized, data.counts) + println(data.estimated_theta) + println(data.realized) + println(data.counts) + data.log_likelihood = compute_log_likelihood_without_0(data.estimated_theta, data.realized, data.counts) + return data +end + + +function _count_possible_occurences!(data, group_size) + k = size(group_size, 1) + for j in 1:k + data.counts[j, j] = group_size[j] * (group_size[j] - 1) ÷ 2 + for i in j+1:k + data.counts[i, j] = group_size[i] * group_size[j] + data.counts[j, i] = group_size[i] * group_size[j] + end + end +end + +function _count_occurences!(data, node_labels) + rows = rowvals(data.A) + vals = nonzeros(data.A) + m, n = size(data.A) + for j in 1:n + groupj = node_labels[j] + for i in nzrange(data.A, j) + row = rows[i] + val = vals[i] + groupi = node_labels[row] + if ismissing(val) + data.counts[groupj, groupj] -= 1 + if groupj != groupj + data.counts[groupj, groupj] -= 1 + end + else + data.realized[val, groupi, groupj] += 1 + if groupi != groupj + data.realized[val, groupj, groupi] += 1 + end + end + end + end +end + + + +function compute_log_likelihood_without_0( + estimated_theta::Array{T, 3}, realized::Array{F, 3}, counts) where { + T, F} + loglik = zero(T) + number_groups = size(estimated_theta, 2) + number_decorations = size(estimated_theta, 1) + @inbounds for j in 1:number_groups + for i in j:number_groups + 
prob_absent = one(T) + total_decorations = counts[i, j] + for m in 1:number_decorations + if realized[m, i, j] != 0 + prob_absent -= estimated_theta[m, i, j] + total_decorations -= realized[m, i, j] + loglik += realized[m, i, j] * log(estimated_theta[m, i, j]) + end + end + println(total_decorations, prob_absent) + loglik += total_decorations * log(prob_absent) + end + end + return loglik +end + + +include("swap.jl") diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl new file mode 100644 index 0000000..283e2da --- /dev/null +++ b/src/assignments/SparseAssignment/swap.jl @@ -0,0 +1,96 @@ +mutable struct SparseSwap{F} <: Swap + index1::Int + index2::Int + realized::Array{Int, 3} + counts::Matrix{Int} + estimated_theta::Array{F, 3} + log_likelihood::F +end + + +function make_swap(a::SparseAssignment, id) + return SparseSwap(id[1], id[2], copy(a.additional_data.realized), + copy(a.additional_data.estimated_theta),copy(a.additional_data.counts), + a.additional_data.log_likelihood) +end + +function copy_addtional!(a, b) + copy!(a.realized, b.realized) + copy!(a.estimated_theta, b.estimated_theta) + copy!(a.counts, b.counts) + a.log_likelihood = b.log_likelihood + return nothing +end + +function make_swap!( + swap::SparseSwap{F}, a::SparseAssignment{T, F}, + id) where {T, F} + swap.index1, swap.index2 = id + copy_addtional!(swap, a.additional_data) +end + + +function revert_swap!( + a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} + swap_node_labels!(a, swap.index1, swap.index2) + copy_addtional!(a.additional_data, swap) + return nothing +end + +function apply_swap!( + a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} + update_observed_and_labels!(a, swap) + update_ll!(a) +end + + +function update_ll!(a::SparseAssignment) + a.additional_data.log_likelihood = compute_log_likelihood_without_0( + a.additional_data.estimated_theta, a.additional_data.realized, a.additional_data.counts) + return nothing +end + 
+function update_observed_and_labels!( + a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + + rows = rowvals(a.addtional_data.A) + vals = nonzeros(a.addtional_data.A) + m, n = size(a.addtional_data.A) + @inbounds for j in [swap.index1, swap.index2] + a.additional_data.scratch_count .= 0 + a.additional_data.scratch_missing .= 0 + g_from = swap.index1 == j ? g1 : g2 + g_to = swap.index1 == j ? g2 : g1 + for i_index in nzrange(a.additional_data.A, j) + row = rows[i_index] + val = vals[i_index] + groupi = get_group_of_vertex(a, row) + if ismissing(val) + a.additional_data.scratch_missing[groupi] += 1 + end + + a.additional_data.scratch[val, groupi] += 1 + end + move_connection!( + a.additional_data.realized, g_from, g_to, a.additional_data.scratch) + _update_counts!( + a.additional_data.counts, g_from, g_to, a.additional_data.scratch_missing) + end + + _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, + a.additional_data.counts) + + # swap of the labels should happen after the update of the realized and estimated_theta + # for the above loop to work correctly + swap_node_labels!(a, swap.index1, swap.index2) + return nothing +end + +function _update_counts!(counts, g_from, g_to, missing_update) + for i in 1:axes(counts, 1) + counts[i, g_to] += missing_update[i] + counts[i, g_from] -= missing_update[i] + end +end diff --git a/src/assignments/include.jl b/src/assignments/include.jl index fd238a3..d9395c0 100644 --- a/src/assignments/include.jl +++ b/src/assignments/include.jl @@ -1,2 +1,3 @@ include("BernoulliAssignment/struct.jl") include("CategoricalAssignment/struct.jl") +include("SparseAssignment/struct.jl") diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index bed7eed..fda2b82 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -114,6 +114,8 @@ struct HybridDiscretizer{F, F2, T, 
L} <: Discretizer cat::CategoryDiscretizer{F2, T} end + +# change so that atoms can be packed together if wanted function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) cat_to_bin = Dict(a => n_bins + i for (i, a) in enumerate(atoms)) bin_to_cat = Dict(n_bins + i => a for (i, a) in enumerate(atoms)) @@ -126,6 +128,19 @@ function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) ) end + +function DiscretizerZeroToZero(n_bins, lower_bound, upper_bound) + cat_to_bin = Dict([0 => 0]) + bin_to_cat = Dict([0 => 0]) + bin_width = (upper_bound - lower_bound) / n_bins + return HybridDiscretizer( + RegularDiscretizer{typeof(bin_width), Int, n_bins}( + n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), + (upper_bound - lower_bound) / n_bins), + CategoryDiscretizer(cat_to_bin, bin_to_cat) + ) +end + function support_encoding(d::HybridDiscretizer, x) return support_encoding(d.lin, x) || support_encoding(d.cat, x) end diff --git a/test/Project.toml b/test/Project.toml index bb072c9..a7b675e 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,5 +4,6 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/assignments/sparse_assignment.jl b/test/assignments/sparse_assignment.jl new file mode 100644 index 0000000..5062482 --- /dev/null +++ b/test/assignments/sparse_assignment.jl @@ -0,0 +1,92 @@ +import NetworkHistogram as NH + +using Random + + +@testset "test sparse swap" begin + Random.seed!(1234123) + using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment + using Distributions: Categorical + using LinearAlgebra: Symmetric + import Random + m = 4 + p = ones(m) ./ m + n = 12 + k = 4 + dist = 
Categorical(p) + sbm = NH.initialize_sbm(ones(k) ./ k, dist) + node_labels = repeat(1:k, inner = n ÷ k) + A = sparse(first(NH.sample(sbm, node_labels))) + g = NH.Observations(A, dist) + a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) + swap = NH.make_swap(a, (1, k + 1)) + @test A[:, 1] != A[:, k + 1] + a_test = deepcopy(a) + NH.apply_swap!(a_test, swap) + @test NH.get_group_of_vertex(a, swap.index1) == + NH.get_group_of_vertex(a_test, swap.index2) + @test NH.get_group_of_vertex(a, swap.index2) == + NH.get_group_of_vertex(a_test, swap.index1) + # force recomputation of the log likelihood using default assignment + a_new = to_default_assignment(a_test) + @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) + @test a_test.additional_data.realized != a.additional_data.realized + @test a_test.additional_data.estimated_theta != + a.additional_data.estimated_theta + @test a_test.additional_data.log_likelihood != + a.additional_data.log_likelihood + # revert the swap and check if the assignment is the same as before + NH.revert_swap!(a_test, swap) + @test a == a_test + @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) +end + +@testset "fast sparse update test" begin + using Distributions + realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; + [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + realized = [realized[I][k] + for k in eachindex(realized[1, 1]), + I in CartesianIndices(realized)] + counts = [1 4 4 + 4 1 4 + 4 4 1] + A = sparse([0 1 2 2 3 3 + 1 0 2 2 3 3 + 2 2 0 1 3 3 + 2 2 1 0 3 3 + 3 3 3 3 0 1 + 3 3 3 3 1 0]) + groupsize = NH.GroupSize(6, 2) + node_labels = [1, 1, 2, 2, 3, 3] + g = NH.Observations(A, Categorical(3)) + k = 3 + m = 3 + n = size(A,1) + a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) + for index in eachindex(realized) + @test all(realized[index] .== a.additional_data.realized[index]) + end + @test loglikelihood(a, g) ≈ 0 + @test a.additional_data.counts == counts + swap_id = 
(1, 3) + ras = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; + [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + realized_after_swap = [ras[I][k] + for k in eachindex(ras[1, 1]), + I in CartesianIndices(ras)] + + swap = NH.make_swap(a, swap_id) + NH.apply_swap!(a, swap) + for j in 1:3 + for i in 1:3 + @test all(realized_after_swap[:, i, j] .== + a.additional_data.realized[:, i, j]) + @test all(a.additional_data.estimated_theta[:, i, j] .≈ + realized_after_swap[:, i, j] ./ counts[i, j]) + end + end + @test loglikelihood(a, g) == 4 * log(0.5) +end diff --git a/test/runtests.jl b/test/runtests.jl index c0b9d60..3b39327 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,6 @@ using Test using Aqua - +using SparseArrays include("TestNetworkHistogram.jl") @testset "Tests" begin @@ -12,6 +12,7 @@ include("TestNetworkHistogram.jl") include("assignments/default_assignment.jl") include("assignments/bernoulli_assignment.jl") include("assignments/categorical_assignment.jl") + include("assignments/sparse_assignment.jl") end @testset "Rule optimization tests" begin From 5e022bb772e94db933608c21c0bbac45346e2114 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 6 Dec 2024 21:25:19 +0100 Subject: [PATCH 073/266] partially solved, still issues --- src/assignments/SparseAssignment/struct.jl | 15 +++----- src/assignments/SparseAssignment/swap.jl | 16 ++++----- test/assignments/sparse_assignment.jl | 40 ++++++++++++++++++---- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl index 91bebb0..d0b85b5 100644 --- a/src/assignments/SparseAssignment/struct.jl +++ b/src/assignments/SparseAssignment/struct.jl @@ -25,13 +25,10 @@ function SparseData(A::SparseMatrixCSC{T, Int}, k::Int, level_count::Int, group_size, node_labels) where {T} n = size(A, 1) data = SparseData(zeros(Int, k, k), zeros(Int, level_count, k, k), - zeros(Float64, level_count, k, k), 
dropzeros!(A), zeros(Int, level_count, k), zeros(Int, k), 0.0) + zeros(Float64, level_count, k, k), dropzeros(A), zeros(Int, level_count, k), zeros(Int, k), 0.0) _count_possible_occurences!(data, group_size) _count_occurences!(data, node_labels) _fast_div!(data.estimated_theta, data.realized, data.counts) - println(data.estimated_theta) - println(data.realized) - println(data.counts) data.log_likelihood = compute_log_likelihood_without_0(data.estimated_theta, data.realized, data.counts) return data end @@ -60,17 +57,14 @@ function _count_occurences!(data, node_labels) groupi = node_labels[row] if ismissing(val) data.counts[groupj, groupj] -= 1 - if groupj != groupj - data.counts[groupj, groupj] -= 1 - end else data.realized[val, groupi, groupj] += 1 - if groupi != groupj - data.realized[val, groupj, groupi] += 1 - end end end end + for k in axes(data.realized, 2) + data.realized[:,k,k] ./= 2 + end end @@ -92,7 +86,6 @@ function compute_log_likelihood_without_0( loglik += realized[m, i, j] * log(estimated_theta[m, i, j]) end end - println(total_decorations, prob_absent) loglik += total_decorations * log(prob_absent) end end diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index 283e2da..3e3c2b1 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -2,8 +2,8 @@ mutable struct SparseSwap{F} <: Swap index1::Int index2::Int realized::Array{Int, 3} - counts::Matrix{Int} estimated_theta::Array{F, 3} + counts::Matrix{Int} log_likelihood::F end @@ -55,9 +55,9 @@ function update_observed_and_labels!( g1 = get_group_of_vertex(a, swap.index1) g2 = get_group_of_vertex(a, swap.index2) - rows = rowvals(a.addtional_data.A) - vals = nonzeros(a.addtional_data.A) - m, n = size(a.addtional_data.A) + rows = rowvals(a.additional_data.A) + vals = nonzeros(a.additional_data.A) + m, n = size(a.additional_data.A) @inbounds for j in [swap.index1, swap.index2] a.additional_data.scratch_count .= 0 
a.additional_data.scratch_missing .= 0 @@ -71,10 +71,10 @@ function update_observed_and_labels!( a.additional_data.scratch_missing[groupi] += 1 end - a.additional_data.scratch[val, groupi] += 1 + a.additional_data.scratch_count[val, groupi] += 1 end - move_connection!( - a.additional_data.realized, g_from, g_to, a.additional_data.scratch) + _move_connection!( + a.additional_data.realized, g_from, g_to, a.additional_data.scratch_count) _update_counts!( a.additional_data.counts, g_from, g_to, a.additional_data.scratch_missing) end @@ -89,7 +89,7 @@ function update_observed_and_labels!( end function _update_counts!(counts, g_from, g_to, missing_update) - for i in 1:axes(counts, 1) + for i in axes(counts, 1) counts[i, g_to] += missing_update[i] counts[i, g_from] -= missing_update[i] end diff --git a/test/assignments/sparse_assignment.jl b/test/assignments/sparse_assignment.jl index 5062482..6832f2d 100644 --- a/test/assignments/sparse_assignment.jl +++ b/test/assignments/sparse_assignment.jl @@ -2,6 +2,33 @@ import NetworkHistogram as NH using Random +@testset "test sparse give the same as categorical" begin + using Distributions, LinearAlgebra, SparseArrays + k = 2 + m = 5 + level_count = 4 + n = 20 + tau = [0.8, 0.1, 0.1, 0.1, 0.1] + sbm = NH.initialize_sbm(ones(k) ./ k, Categorical(tau ./ sum(tau))) + A, _ = NH.sample(sbm, n) + A_dense = collect(A) + A = sparse(A_dense .- 1) + for i in 1:n + A[i, i] = 0 + end + dropzeros!(A) + g = NH.Observations(A_dense, Categorical(m)) + sbm_fitted, a = nethist(g; h = n ÷ k, max_iter = 10) + sparse_a = NH.SparseAssignment( + NH.Observations(A, Categorical(m)), a.group_size, a.node_labels) + @test a.additional_data.counts == sparse_a.additional_data.counts + for (l,m_index) in enumerate(2:m) + @test a.additional_data.realized[m_index, :, :] == sparse_a.additional_data.realized[l, :, :] + @test a.additional_data.estimated_theta[m_index, :, :] == + sparse_a.additional_data.estimated_theta[l, :, :] + end + @test 
a.additional_data.log_likelihood ≈ sparse_a.additional_data.log_likelihood +end @testset "test sparse swap" begin Random.seed!(1234123) @@ -30,6 +57,7 @@ using Random # force recomputation of the log likelihood using default assignment a_new = to_default_assignment(a_test) @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) + println(typeof(a), typeof(a_test)) @test a_test.additional_data.realized != a.additional_data.realized @test a_test.additional_data.estimated_theta != a.additional_data.estimated_theta @@ -53,17 +81,17 @@ end 4 1 4 4 4 1] A = sparse([0 1 2 2 3 3 - 1 0 2 2 3 3 - 2 2 0 1 3 3 - 2 2 1 0 3 3 - 3 3 3 3 0 1 - 3 3 3 3 1 0]) + 1 0 2 2 3 3 + 2 2 0 1 3 3 + 2 2 1 0 3 3 + 3 3 3 3 0 1 + 3 3 3 3 1 0]) groupsize = NH.GroupSize(6, 2) node_labels = [1, 1, 2, 2, 3, 3] g = NH.Observations(A, Categorical(3)) k = 3 m = 3 - n = size(A,1) + n = size(A, 1) a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) for index in eachindex(realized) @test all(realized[index] .== a.additional_data.realized[index]) From 1b7ffd18631558d40f81877a0cf9b55f7fc4e4d8 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 6 Dec 2024 23:20:22 +0100 Subject: [PATCH 074/266] =?UTF-8?q?fixed=20it=20=F0=9F=92=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/assignments/SparseAssignment/struct.jl | 13 +++++-------- src/assignments/SparseAssignment/swap.jl | 19 ++++++++++++------- test/assignments/sparse_assignment.jl | 1 - 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl index d0b85b5..a4e1650 100644 --- a/src/assignments/SparseAssignment/struct.jl +++ b/src/assignments/SparseAssignment/struct.jl @@ -56,7 +56,7 @@ function _count_occurences!(data, node_labels) val = vals[i] groupi = node_labels[row] if ismissing(val) - data.counts[groupj, groupj] -= 1 + data.counts[groupi, groupj] -= 1 else data.realized[val, groupi, 
groupj] += 1 end @@ -77,16 +77,13 @@ function compute_log_likelihood_without_0( number_decorations = size(estimated_theta, 1) @inbounds for j in 1:number_groups for i in j:number_groups - prob_absent = one(T) total_decorations = counts[i, j] + loglik -= xlogx(total_decorations) for m in 1:number_decorations - if realized[m, i, j] != 0 - prob_absent -= estimated_theta[m, i, j] - total_decorations -= realized[m, i, j] - loglik += realized[m, i, j] * log(estimated_theta[m, i, j]) - end + loglik += xlogx(realized[m,i,j]) + total_decorations -= realized[m,i,j] end - loglik += total_decorations * log(prob_absent) + loglik += xlogx(total_decorations) end end return loglik diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index 3e3c2b1..d708f37 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -7,10 +7,9 @@ mutable struct SparseSwap{F} <: Swap log_likelihood::F end - function make_swap(a::SparseAssignment, id) return SparseSwap(id[1], id[2], copy(a.additional_data.realized), - copy(a.additional_data.estimated_theta),copy(a.additional_data.counts), + copy(a.additional_data.estimated_theta), copy(a.additional_data.counts), a.additional_data.log_likelihood) end @@ -29,7 +28,6 @@ function make_swap!( copy_addtional!(swap, a.additional_data) end - function revert_swap!( a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} swap_node_labels!(a, swap.index1, swap.index2) @@ -43,7 +41,6 @@ function apply_swap!( update_ll!(a) end - function update_ll!(a::SparseAssignment) a.additional_data.log_likelihood = compute_log_likelihood_without_0( a.additional_data.estimated_theta, a.additional_data.realized, a.additional_data.counts) @@ -55,26 +52,34 @@ function update_observed_and_labels!( g1 = get_group_of_vertex(a, swap.index1) g2 = get_group_of_vertex(a, swap.index2) + if g1 == g2 + return nothing + end + rows = rowvals(a.additional_data.A) vals = nonzeros(a.additional_data.A) 
m, n = size(a.additional_data.A) - @inbounds for j in [swap.index1, swap.index2] + for j in [swap.index1, swap.index2] a.additional_data.scratch_count .= 0 a.additional_data.scratch_missing .= 0 g_from = swap.index1 == j ? g1 : g2 g_to = swap.index1 == j ? g2 : g1 for i_index in nzrange(a.additional_data.A, j) row = rows[i_index] + if row == swap.index1 || row == swap.index2 + continue + end val = vals[i_index] groupi = get_group_of_vertex(a, row) if ismissing(val) a.additional_data.scratch_missing[groupi] += 1 + else + a.additional_data.scratch_count[val, groupi] += 1 end - - a.additional_data.scratch_count[val, groupi] += 1 end _move_connection!( a.additional_data.realized, g_from, g_to, a.additional_data.scratch_count) + _update_counts!( a.additional_data.counts, g_from, g_to, a.additional_data.scratch_missing) end diff --git a/test/assignments/sparse_assignment.jl b/test/assignments/sparse_assignment.jl index 6832f2d..ac8a25c 100644 --- a/test/assignments/sparse_assignment.jl +++ b/test/assignments/sparse_assignment.jl @@ -57,7 +57,6 @@ end # force recomputation of the log likelihood using default assignment a_new = to_default_assignment(a_test) @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) - println(typeof(a), typeof(a_test)) @test a_test.additional_data.realized != a.additional_data.realized @test a_test.additional_data.estimated_theta != a.additional_data.estimated_theta From 3e2ac10ea3db0032ddb912df9fc5c1fc8ed0990b Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 8 Dec 2024 18:23:44 +0100 Subject: [PATCH 075/266] add custom zero-inflated cat dist --- src/NetworkHistogram.jl | 4 +- src/api.jl | 5 +- src/assignments/SparseAssignment/struct.jl | 22 ++++++- src/assignments/SparseAssignment/swap.jl | 27 +++++++++ src/distributions/categorical_with_0.jl | 68 ++++++++++++++++++++++ src/distributions/discrete_dist.jl | 20 +++---- src/distributions/discretizer.jl | 17 ++++++ src/distributions/include.jl | 1 + src/observations.jl | 4 +- 
src/optimisation/fit.jl | 5 ++ test/assignments/sparse_assignment.jl | 7 +-- 11 files changed, 157 insertions(+), 23 deletions(-) create mode 100644 src/distributions/categorical_with_0.jl diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 40d83e4..feb54e0 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -21,7 +21,7 @@ import Bootstrap: bootstrap import Base.maximum, Base.minimum import Random: rand import Base.convert -import Distributions: pdf,logpdf, ncategories +import Distributions: pdf,logpdf, ncategories, cdf, rand include("distributions/include.jl") include("assignments/Assignments.jl") @@ -36,7 +36,7 @@ include("api.jl") include("bootstrap.jl") export nethist, nethist_discretised -export loglikelihood, fit +export loglikelihood, fit, cdf, pdf # export options for optimisation export estimate_graphon diff --git a/src/api.jl b/src/api.jl index ee7125b..57826b3 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,9 +1,10 @@ function _default_init(dist::Distribution, start = MetisStart()) if dist isa Bernoulli return InitRule(start, Val{BernoulliData}()) - elseif dist isa Categorical || dist isa CategoricalArray || - dist isa DiscretizedDistribution + elseif dist isa Categorical return InitRule(start, Val{CategoricalData}()) + elseif dist isa DiscretizedDistribution || dist isa ZeroInflatedCategorical + return InitRule(start, Val{SparseData}()) else return InitRule(start, nothing) end diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl index a4e1650..26daf0a 100644 --- a/src/assignments/SparseAssignment/struct.jl +++ b/src/assignments/SparseAssignment/struct.jl @@ -9,18 +9,28 @@ mutable struct SparseData{F, C} end + const SparseAssignment{T, F, C} = Assignment{ T, SparseData{F, C}} const SparseInitRule{S, F} = InitRule{S, Val{SparseData}} -function SparseAssignment( g::Observations{G,D}, group_size::GroupSize, node_labels::Vector{Int}) where {G,D} +function 
SparseAssignment(g::Observations{G,D}, group_size::GroupSize, node_labels::Vector{Int}) where {G,D} A = issparse(g.graph) ? g.graph : sparse(g.graph) - num_levels = length(unique(A)) -1 + num_levels = ncategories(g.dist_ref) sparse_data = SparseData(A, size(group_size, 1), num_levels, group_size, node_labels) return Assignment(group_size, node_labels, sparse_data) end + +function make_assignment(g, h, init_rule::SparseInitRule) + group_size, + node_labels = initialize_node_labels( + g, h, init_rule.starting_assignment_rule) + return SparseAssignment(g, group_size, node_labels) +end + + function SparseData(A::SparseMatrixCSC{T, Int}, k::Int, level_count::Int, group_size, node_labels) where {T} n = size(A, 1) @@ -90,4 +100,12 @@ function compute_log_likelihood_without_0( end +function _n_decorations_with_0(a::SparseAssignment) + return size(a.additional_data.estimated_theta, 1) + 1 +end + +function loglikelihood(assignment::SparseAssignment, g::Observations) + return assignment.additional_data.log_likelihood +end + include("swap.jl") diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index d708f37..b55a5ed 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -99,3 +99,30 @@ function _update_counts!(counts, g_from, g_to, missing_update) counts[i, g_from] -= missing_update[i] end end + +function fit(a::SparseAssignment, g::Observations) + dists = initialize_sbm(a.group_size, ZeroInflatedCategorical(_n_decorations_with_0(a))) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + theta = a.additional_data.estimated_theta[:, group1, group2] + dists[group1, + group2] = ZeroInflatedCategorical(1 - sum(theta), theta) + end + end + return dists +end + +function fit(a::SparseAssignment, g::Observations{G, <:DiscretizedDistribution}) where {G} + dists = initialize_sbm(a.group_size, + DiscretizedDistribution( + g.dist_ref.discretizer, 
ZeroInflatedCategorical(_n_decorations_with_0(a)))) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + theta = a.additional_data.estimated_theta[:, group1, group2] + dists[group1, + group2] = DiscretizedDistribution( + g.dist_ref.discretizer, ZeroInflatedCategorical(1 - sum(theta), theta)) + end + end + return dists +end diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl new file mode 100644 index 0000000..7054c8b --- /dev/null +++ b/src/distributions/categorical_with_0.jl @@ -0,0 +1,68 @@ + +struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution + proba_zero::B + dist::D +end + +_dirac_delta(x) = x == 0 ? one(x) : zero(x) + +function ZeroInflatedCategorical(p::Real, dist::D) where {D} + return ZeroInflatedCategorical(Bernoulli(1 - p), dist) +end + +function ZeroInflatedCategorical(p::Real, probs::AbstractVector) + if sum(probs) == 0 + probs_ = ones(length(probs)) / length(probs) + else + probs_ = probs / sum(probs) + end + return ZeroInflatedCategorical(Bernoulli(1 - p), Categorical(probs_)) +end + +function ZeroInflatedCategorical(vec_probs::AbstractVector) + ZeroInflatedCategorical(vec_probs[1], vec_probs[2:end]) +end + +ZeroInflatedCategorical(k::Int) = ZeroInflatedCategorical(ones(k+1) ./ (k+1)) + +function Distributions.pdf(d::ZeroInflatedCategorical, x::Real) + return pdf(d.proba_zero, 0) * _dirac_delta(x) + pdf(d.proba_zero, 1) * pdf(d.dist, x) +end + +function rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) + return rand(rng, d.proba_zero) * rand(rng, d.dist) +end + +logpdf(d::ZeroInflatedCategorical, x::Real) = log(pdf(d, x)) + +minimum(d::ZeroInflatedCategorical) = min(minimum(d.dist), 0) + +maximum(d::ZeroInflatedCategorical) = max(maximum(d.dist), 0) + +insupport(d::ZeroInflatedCategorical, x::Real) = x == 0 || insupport(d.dist, x) + +function Distributions.cdf(d::ZeroInflatedCategorical, x::Real) + return pdf(d.proba_zero, 0) * _dirac_delta(x) + 
pdf(d.proba_zero, 1) * cdf(d.dist, x) +end + +function Distributions.params(d::ZeroInflatedCategorical) + (first(params(d.proba_zero)), params(d.dist)...) +end + +ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) + +function Distributions.suffstats(::Type{ZeroInflatedCategorical{B, D}}, data) where {B, D} + return Distributions.suffstats(D, data) +end + +function Distributions.fit( + ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where {B, D} + indices_0 = findall(x -> x == 0, data) + p = length(indices_0) / length(data) + if p != 1 + dist = fit(D, data[setdiff(1:end, indices_0)]) + return ZeroInflatedCategorical(p, dist) + else + return ZeroInflatedCategorical(1.0, zeros(n_cat)) + end +end diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 358f2ed..3118033 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -1,25 +1,24 @@ -mutable struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution where {D, L} +mutable struct DiscretizedDistribution{D, L} <: + ContinuousUnivariateDistribution where {D, L} discretizer::D probs::L end function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) where {D} - disc = HybridDiscretizer(n_bins, support_bound..., 0.0) - # for now we keep track of the non-edges as well - probs = Distributions.Categorical(nlabels(discretizer)) + disc = ZeroToZeroDiscretizer(n_bins, support_bound...) 
+ probs = ZeroInflatedCategorical(non_zero_labels_counts(disc)) return DiscretizedDistribution(disc, probs) end function DiscretizedDistribution(discretizer::Discretizer) return DiscretizedDistribution( - discretizer, Distributions.Categorical(nlabels(discretizer))) + discretizer, ZeroInflatedCategorical(non_zero_labels_counts(discretizer))) end function pdf(d::DiscretizedDistribution, x::Real) if !supports_encoding(d.discretizer, x) return 0.0 end - # for now suppose that the non-edges are encoded in the last bin bin = encode(d.discretizer, x) return pdf(d.probs, bin) / binwidth(d.discretizer) end @@ -28,7 +27,6 @@ function logpdf(d::DiscretizedDistribution, x::Real) if !supports_encoding(d.discretizer, x) return -Inf end - # for now suppose that the non-edges are encoded in the last bin bin = encode(d.discretizer, x) return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) end @@ -54,16 +52,14 @@ function Base.convert(::Type{DiscretizedDistribution}, d::D) where {D} return DiscretizedDistribution(d, 10) end - function Distributions.ncategories(d::DiscretizedDistribution) return ncategories(d.probs) end - -function Distributions.fit(::Type{<:DiscretizedDistribution{D,L}},data) where {D,L} +function Distributions.fit(::Type{<:DiscretizedDistribution{D, L}}, data) where {D, L} return fit(L, data) end -function set_params!(d::DiscretizedDistribution{D, L}, params) where {D,L} - d.probs = L(params) +function set_params!(d::DiscretizedDistribution{D, L}, params) where {D, L} + d.probs = L(params...) 
end diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index fda2b82..70ee996 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -65,6 +65,9 @@ function nlabels(d::RegularDiscretizer) return d.n_bins end +non_zero_labels_counts(d::RegularDiscretizer) = nlabels(d) + + """ Maps a set of categories to a set of bins """ @@ -97,6 +100,16 @@ function nlabels(d::CategoryDiscretizer) return length(d.bin_to_cat) end + + +function non_zero_labels_counts(d::CategoryDiscretizer) + if 0 ∈ keys(d.bin_to_cat) + return length(d.bin_to_cat) - 1 + else + return length(d.bin_to_cat) + end +end + function minimum(d::CategoryDiscretizer) return d.min_label end @@ -158,6 +171,10 @@ function nlabels(d::HybridDiscretizer) return nlabels(d.lin) + nlabels(d.cat) end +function non_zero_labels_counts(d::HybridDiscretizer) + return non_zero_labels_counts(d.lin) + non_zero_labels_counts(d.cat) +end + function encode(d::HybridDiscretizer, x::Real) if haskey(d.cat.cat_to_bin, x) return encode(d.cat, x) diff --git a/src/distributions/include.jl b/src/distributions/include.jl index d7998eb..9fafb24 100644 --- a/src/distributions/include.jl +++ b/src/distributions/include.jl @@ -1,2 +1,3 @@ +include("categorical_with_0.jl") include("discretizer.jl") include("discrete_dist.jl") diff --git a/src/observations.jl b/src/observations.jl index 332aeff..6d5e4ab 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -118,6 +118,8 @@ end """ Assume that the diagonal is zero. 0 indicates no edge, while missing indicates no information about the edge. +By default maps 0 to 0. If you want another behaviour use the function where you +pass a `Discretizer` object. 
""" function discretise(g::Observations{G, D}; number_groups = nothing, number_levels = nothing) where {G, D} @@ -131,7 +133,7 @@ function discretise(g::Observations{G, D}; @warn "disregarding `number_groups` as `number_levels` is provided" end end - return discretise(g, HybridDiscretizer(number_levels-1, extrema(g.graph)..., 0.0)) + return discretise(g, DiscretizerZeroToZero(number_levels, extrema(g.graph)...)) end function discretise(g::Observations{G, D}, discretiser ::Discretizer) where {G,D<:UnivariateDistribution} diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl index 231e3de..547e925 100644 --- a/src/optimisation/fit.jl +++ b/src/optimisation/fit.jl @@ -17,6 +17,11 @@ function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where end end + +function fit_group(d::ZeroInflatedCategorical, g, edges) + return Distributions.fit(typeof(d), get_obs.(Ref(g), edges), ncategories(g.dist_ref)) +end + function fit_group(distribution, g, edges) return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) end diff --git a/test/assignments/sparse_assignment.jl b/test/assignments/sparse_assignment.jl index ac8a25c..c7186ac 100644 --- a/test/assignments/sparse_assignment.jl +++ b/test/assignments/sparse_assignment.jl @@ -16,7 +16,6 @@ using Random for i in 1:n A[i, i] = 0 end - dropzeros!(A) g = NH.Observations(A_dense, Categorical(m)) sbm_fitted, a = nethist(g; h = n ÷ k, max_iter = 10) sparse_a = NH.SparseAssignment( @@ -33,14 +32,14 @@ end @testset "test sparse swap" begin Random.seed!(1234123) using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment - using Distributions: Categorical + using Distributions: DiscreteNonParametric using LinearAlgebra: Symmetric import Random m = 4 p = ones(m) ./ m n = 12 k = 4 - dist = Categorical(p) + dist = NH.ZeroInflatedCategorical(p) sbm = NH.initialize_sbm(ones(k) ./ k, dist) node_labels = repeat(1:k, inner = n ÷ k) A = sparse(first(NH.sample(sbm, node_labels))) @@ -115,5 
+114,5 @@ end realized_after_swap[:, i, j] ./ counts[i, j]) end end - @test loglikelihood(a, g) == 4 * log(0.5) + @test loglikelihood(a, g) ≈ 4 * log(0.5) end From 09e0a0fcda250660c23cf921f69b12521acde707 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 8 Dec 2024 18:35:53 +0100 Subject: [PATCH 076/266] typo --- src/distributions/discrete_dist.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 3118033..9f99406 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -16,7 +16,7 @@ function DiscretizedDistribution(discretizer::Discretizer) end function pdf(d::DiscretizedDistribution, x::Real) - if !supports_encoding(d.discretizer, x) + if !support_encoding(d.discretizer, x) return 0.0 end bin = encode(d.discretizer, x) @@ -24,7 +24,7 @@ function pdf(d::DiscretizedDistribution, x::Real) end function logpdf(d::DiscretizedDistribution, x::Real) - if !supports_encoding(d.discretizer, x) + if !support_encoding(d.discretizer, x) return -Inf end bin = encode(d.discretizer, x) @@ -45,7 +45,7 @@ function maximum(d::DiscretizedDistribution) end function insupport(d::DiscretizedDistribution, x::Real) - return supports_encoding(d.discretizer, x) + return support_encoding(d.discretizer, x) end function Base.convert(::Type{DiscretizedDistribution}, d::D) where {D} From e5fd356448813a7809029a3ca4ed835c13b47ca2 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 8 Dec 2024 18:38:28 +0100 Subject: [PATCH 077/266] another typo --- src/distributions/discretizer.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index 70ee996..e86c717 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -175,6 +175,8 @@ function non_zero_labels_counts(d::HybridDiscretizer) return non_zero_labels_counts(d.lin) + non_zero_labels_counts(d.cat) end 
+binwidth(d::HybridDiscretizer) = binwidth(d.lin) + function encode(d::HybridDiscretizer, x::Real) if haskey(d.cat.cat_to_bin, x) return encode(d.cat, x) From 7ca23a78edae53a4c2fa13f6df667ac9f47915e1 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 8 Dec 2024 18:50:12 +0100 Subject: [PATCH 078/266] corner cases --- src/distributions/categorical_with_0.jl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index 7054c8b..fc4521a 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -1,6 +1,6 @@ struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution - proba_zero::B + edge_proba::B dist::D end @@ -16,6 +16,11 @@ function ZeroInflatedCategorical(p::Real, probs::AbstractVector) else probs_ = probs / sum(probs) end + if p ≈ 0 + p = 0 + elseif p ≈ 1 + p = 1 + end return ZeroInflatedCategorical(Bernoulli(1 - p), Categorical(probs_)) end @@ -26,11 +31,11 @@ end ZeroInflatedCategorical(k::Int) = ZeroInflatedCategorical(ones(k+1) ./ (k+1)) function Distributions.pdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.proba_zero, 0) * _dirac_delta(x) + pdf(d.proba_zero, 1) * pdf(d.dist, x) + return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * pdf(d.dist, x) end function rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) - return rand(rng, d.proba_zero) * rand(rng, d.dist) + return rand(rng, d.edge_proba) * rand(rng, d.dist) end logpdf(d::ZeroInflatedCategorical, x::Real) = log(pdf(d, x)) @@ -42,11 +47,11 @@ maximum(d::ZeroInflatedCategorical) = max(maximum(d.dist), 0) insupport(d::ZeroInflatedCategorical, x::Real) = x == 0 || insupport(d.dist, x) function Distributions.cdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.proba_zero, 0) * _dirac_delta(x) + pdf(d.proba_zero, 1) * cdf(d.dist, x) + return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * 
cdf(d.dist, x) end function Distributions.params(d::ZeroInflatedCategorical) - (first(params(d.proba_zero)), params(d.dist)...) + (first(params(d.edge_proba)), params(d.dist)...) end ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) From a9250903f0bf7a836d4cfe84b5a0ac267daf5107 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 8 Dec 2024 18:50:58 +0100 Subject: [PATCH 079/266] type stable clamp --- src/distributions/categorical_with_0.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index fc4521a..8af1552 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -17,9 +17,9 @@ function ZeroInflatedCategorical(p::Real, probs::AbstractVector) probs_ = probs / sum(probs) end if p ≈ 0 - p = 0 + p = zero(p) elseif p ≈ 1 - p = 1 + p = one(p) end return ZeroInflatedCategorical(Bernoulli(1 - p), Categorical(probs_)) end From b5eade9c08446c85755813d6538e9077a73437b8 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 8 Dec 2024 18:52:04 +0100 Subject: [PATCH 080/266] typo --- src/distributions/categorical_with_0.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index 8af1552..f52dbc3 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -21,7 +21,7 @@ function ZeroInflatedCategorical(p::Real, probs::AbstractVector) elseif p ≈ 1 p = one(p) end - return ZeroInflatedCategorical(Bernoulli(1 - p), Categorical(probs_)) + return ZeroInflatedCategorical(p, Categorical(probs_)) end function ZeroInflatedCategorical(vec_probs::AbstractVector) From a3dd9c1ac0aaf5bec9010c2c2b5ef618d25d0faf Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 8 Dec 2024 18:54:14 +0100 Subject: [PATCH 081/266] approx error --- src/assignments/SparseAssignment/swap.jl | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index b55a5ed..96825f5 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -119,9 +119,10 @@ function fit(a::SparseAssignment, g::Observations{G, <:DiscretizedDistribution}) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = a.additional_data.estimated_theta[:, group1, group2] + p = clamp(1 - sum(theta),0,1) dists[group1, group2] = DiscretizedDistribution( - g.dist_ref.discretizer, ZeroInflatedCategorical(1 - sum(theta), theta)) + g.dist_ref.discretizer, ZeroInflatedCategorical(p, theta)) end end return dists From 1a58affdd6b91bd8f4b0b7a13ca98825cfed725f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 9 Dec 2024 08:01:31 +0100 Subject: [PATCH 082/266] add TODO and basic zero inflated cont dist --- src/distributions/categorical_with_0.jl | 4 --- src/distributions/include.jl | 1 + src/distributions/zero_inflated.jl | 44 +++++++++++++++++++++++++ src/observations.jl | 4 ++- src/sbm.jl | 3 ++ 5 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 src/distributions/zero_inflated.jl diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index f52dbc3..fbad765 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -1,4 +1,3 @@ - struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution edge_proba::B dist::D @@ -56,9 +55,6 @@ end ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) -function Distributions.suffstats(::Type{ZeroInflatedCategorical{B, D}}, data) where {B, D} - return Distributions.suffstats(D, data) -end function Distributions.fit( ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where {B, D} diff --git a/src/distributions/include.jl b/src/distributions/include.jl index 9fafb24..be0e899 100644 --- 
a/src/distributions/include.jl +++ b/src/distributions/include.jl @@ -1,3 +1,4 @@ include("categorical_with_0.jl") include("discretizer.jl") include("discrete_dist.jl") +include("zero_inflated.jl") diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl new file mode 100644 index 0000000..00d5418 --- /dev/null +++ b/src/distributions/zero_inflated.jl @@ -0,0 +1,44 @@ +struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution + edge_proba::B + dist::D +end + +function ZeroInflated(p::Real, dist::D) where {D} + return ZeroInflated(Bernoulli(1 - p), dist) +end + +function Distributions.pdf(d::ZeroInflated, x::Real) + return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * pdf(d.dist, x) +end + +function rand(rng::Random.AbstractRNG, d::ZeroInflated) + return rand(rng, d.edge_proba) * rand(rng, d.dist) +end + +logpdf(d::ZeroInflated, x::Real) = log(pdf(d, x)) + +minimum(d::ZeroInflated) = min(minimum(d.dist), 0) + +maximum(d::ZeroInflated) = max(maximum(d.dist), 0) + +insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) + +function Distributions.cdf(d::ZeroInflated, x::Real) + return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * cdf(d.dist, x) +end + +function Distributions.params(d::ZeroInflated) + (first(params(d.edge_proba)), params(d.dist)...) +end + + +function Distributions.fit( + ::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) where {B, D} + indices_0 = findall(x -> x == 0, data) + p = length(indices_0) / length(data) + if p != 1 + return ZeroInflated(p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) + else + return ZeroInflated(1.0, D()) + end +end diff --git a/src/observations.jl b/src/observations.jl index 6d5e4ab..e7f52eb 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -1,4 +1,4 @@ -# switch to MetaGraphsNext.jl ? +# remove all references to graphs, and only use sparse matrices ? 
struct Observations{G, D} graph::G dist_ref::D @@ -120,6 +120,8 @@ Assume that the diagonal is zero. 0 indicates no edge, while missing indicates no information about the edge. By default maps 0 to 0. If you want another behaviour use the function where you pass a `Discretizer` object. + +number_levels will be the number of levels in the discretized distribution (excluding 0). """ function discretise(g::Observations{G, D}; number_groups = nothing, number_levels = nothing) where {G, D} diff --git a/src/sbm.jl b/src/sbm.jl index e7fe17b..ad84c72 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -1,3 +1,6 @@ +# TODO: remove BlockModel being a subtype of AbstractMatrix +# this was fun but useless and actually harmful + struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} sizes::Vector{F} probs::SymmetricTensor{T, K, 2} From 760c7e15f8d8c565f1790dcadc0744e002b626eb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 9 Dec 2024 09:50:50 +0100 Subject: [PATCH 083/266] try to manage floating point error --- src/distributions/categorical_with_0.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index fbad765..a4774dd 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -6,6 +6,11 @@ end _dirac_delta(x) = x == 0 ? 
one(x) : zero(x) function ZeroInflatedCategorical(p::Real, dist::D) where {D} + if p < 0 + p = zero(p) + elseif p > 1 + p = one(p) + end return ZeroInflatedCategorical(Bernoulli(1 - p), dist) end @@ -15,9 +20,9 @@ function ZeroInflatedCategorical(p::Real, probs::AbstractVector) else probs_ = probs / sum(probs) end - if p ≈ 0 + if p < 0 p = zero(p) - elseif p ≈ 1 + elseif p > 1 p = one(p) end return ZeroInflatedCategorical(p, Categorical(probs_)) @@ -27,7 +32,7 @@ function ZeroInflatedCategorical(vec_probs::AbstractVector) ZeroInflatedCategorical(vec_probs[1], vec_probs[2:end]) end -ZeroInflatedCategorical(k::Int) = ZeroInflatedCategorical(ones(k+1) ./ (k+1)) +ZeroInflatedCategorical(k::Int) = ZeroInflatedCategorical(ones(k + 1) ./ (k + 1)) function Distributions.pdf(d::ZeroInflatedCategorical, x::Real) return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * pdf(d.dist, x) @@ -55,7 +60,6 @@ end ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) - function Distributions.fit( ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where {B, D} indices_0 = findall(x -> x == 0, data) From 5900676167a3bc3c501474795c14281900ea099e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 9 Dec 2024 10:13:04 +0100 Subject: [PATCH 084/266] monkey patch of convert zero inflated to cat --- src/distributions/categorical_with_0.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index a4774dd..51c79c8 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -71,3 +71,10 @@ function Distributions.fit( return ZeroInflatedCategorical(1.0, zeros(n_cat)) end end + + +function get_params_cat_like(dist::ZeroInflatedCategorical) + p = first(params(dist.edge_proba)) + probs = params(dist.dist) + return vcat(p, probs .* (1-p)) +end From da86e77376310e10067b7a137216ca15a83b6ffc Mon Sep 17 00:00:00 2001 From: dufourc1 Date: 
Mon, 9 Dec 2024 10:15:51 +0100 Subject: [PATCH 085/266] forgot to take first of tuple params --- src/distributions/categorical_with_0.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index 51c79c8..f5370b3 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -75,6 +75,6 @@ end function get_params_cat_like(dist::ZeroInflatedCategorical) p = first(params(dist.edge_proba)) - probs = params(dist.dist) + probs = vcat(params(dist.dist)...) return vcat(p, probs .* (1-p)) end From 0d9fa128a21b4670a997f0a20f073c24a78f9cfb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 9 Dec 2024 10:20:35 +0100 Subject: [PATCH 086/266] p vs 1-p is a pain --- src/distributions/categorical_with_0.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index f5370b3..0387ae3 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -76,5 +76,5 @@ end function get_params_cat_like(dist::ZeroInflatedCategorical) p = first(params(dist.edge_proba)) probs = vcat(params(dist.dist)...) 
- return vcat(p, probs .* (1-p)) + return vcat(1-p, probs .* p) end From d36d7338c7c7c1f98c84d563049f8a479ecc9a24 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 9 Dec 2024 12:27:53 +0100 Subject: [PATCH 087/266] hack: remove compat --- Project.toml | 24 ------------------------ src/distributions/categorical_with_0.jl | 10 ++++++++++ 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/Project.toml b/Project.toml index cbb639a..57a5613 100644 --- a/Project.toml +++ b/Project.toml @@ -26,30 +26,6 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -[compat] -ArnoldiMethod = "0.4.0" -Bootstrap = "2.4.0" -CategoricalArrays = "0.10.8" -CategoricalDistributions = "0.1.15" -Clustering = "0.15.7" -Combinatorics = "1.0.2" -DensityInterface = "0.4.0" -Distributions = "0.25.112" -Graphs = "1.12.0" -IterativeSolvers = "0.9.4" -LinearAlgebra = "1.11.0" -LogExpFunctions = "0.3.28" -Metis = "1.5.0" -PermutationSymmetricTensors = "0.2.0" -ProgressMeter = "1.7.2" -Random = "1.11.0" -SimpleWeightedGraphs = "1.4.0" -SparseArrays = "1.11.0" -StaticArrays = "1.9.7" -StatsAPI = "1.7.0" -StatsBase = "0.34.3" -Test = "1.11.0" -julia = "1.8" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index 0387ae3..3d67204 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -78,3 +78,13 @@ function get_params_cat_like(dist::ZeroInflatedCategorical) probs = vcat(params(dist.dist)...) 
return vcat(1-p, probs .* p) end + + +function Base.convert(::Type{<:ZeroInflatedCategorical}, d::D) where {D} + return ZeroInflatedCategorical(1.0, d) +end + + +function Base.convert(T::Type{<:Categorical}, d::ZeroInflatedCategorical) + return T(get_params_cat_like(d)) +end From 7860d739dff09abc68bc1c076f72041a1600becc Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 9 Dec 2024 14:51:55 +0100 Subject: [PATCH 088/266] remove sbm subtype matrix --- src/sbm.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sbm.jl b/src/sbm.jl index ad84c72..35c3321 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -1,7 +1,7 @@ # TODO: remove BlockModel being a subtype of AbstractMatrix # this was fun but useless and actually harmful -struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} +struct BlockModel{T, K, F <: Real} sizes::Vector{F} probs::SymmetricTensor{T, K, 2} end From 91d5164ed1b1894865e05886493651bfb139b280 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 10 Dec 2024 11:18:37 +0100 Subject: [PATCH 089/266] Revert "remove sbm subtype matrix" This reverts commit 7860d739dff09abc68bc1c076f72041a1600becc. 
--- src/sbm.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sbm.jl b/src/sbm.jl index 35c3321..ad84c72 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -1,7 +1,7 @@ # TODO: remove BlockModel being a subtype of AbstractMatrix # this was fun but useless and actually harmful -struct BlockModel{T, K, F <: Real} +struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} sizes::Vector{F} probs::SymmetricTensor{T, K, 2} end From 1e2b692c5654932fc0fc966e8a341362846ac76a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 10:52:33 +0100 Subject: [PATCH 090/266] typo in name --- src/distributions/discrete_dist.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 9f99406..33b1301 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -5,7 +5,7 @@ mutable struct DiscretizedDistribution{D, L} <: end function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) where {D} - disc = ZeroToZeroDiscretizer(n_bins, support_bound...) + disc = DiscretizerZeroToZero(n_bins, support_bound...) probs = ZeroInflatedCategorical(non_zero_labels_counts(disc)) return DiscretizedDistribution(disc, probs) end From 984f31012b77525a69f56d1b08b9b628b197417c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 13:10:33 +0100 Subject: [PATCH 091/266] correctly initialize discretizedDist --- src/distributions/discrete_dist.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 33b1301..e674330 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -6,7 +6,12 @@ end function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) where {D} disc = DiscretizerZeroToZero(n_bins, support_bound...) 
- probs = ZeroInflatedCategorical(non_zero_labels_counts(disc)) + ps = zeros(non_zero_labels_counts(disc)) + for i in 1:non_zero_labels_counts(disc) + lb, ub = NetworkHistogram.decode(disc, i) + ps[i] = cdf(dist, ub) - cdf(dist, lb) + end + probs = ZeroInflatedCategorical(0.0, ps) return DiscretizedDistribution(disc, probs) end From 3149c5539c3d71e14c526dd5ef31a394370e5a25 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 13:11:46 +0100 Subject: [PATCH 092/266] add case of zero inflated discretisation --- src/distributions/discrete_dist.jl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index e674330..dfeb507 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -15,6 +15,18 @@ function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) return DiscretizedDistribution(disc, probs) end + +function DiscretizedDistribution(d::ZeroInflated, n_bins::Int, support_bound = extrema(d)) + disc = DiscretizerZeroToZero(n_bins, support_bound...) 
+ ps = zeros(non_zero_labels_counts(disc)) + for i in 1:non_zero_labels_counts(disc) + lb, ub = NetworkHistogram.decode(disc, i) + ps[i] = cdf(dist, ub) - cdf(dist, lb) + end + probs = ZeroInflatedCategorical(pdf(d,0.0), ps) + return DiscretizedDistribution(disc, probs) +end + function DiscretizedDistribution(discretizer::Discretizer) return DiscretizedDistribution( discretizer, ZeroInflatedCategorical(non_zero_labels_counts(discretizer))) From ee80912db334a151d9fe524b906f71d6e5d23357 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 13:13:18 +0100 Subject: [PATCH 093/266] change order include --- src/distributions/include.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distributions/include.jl b/src/distributions/include.jl index be0e899..875f028 100644 --- a/src/distributions/include.jl +++ b/src/distributions/include.jl @@ -1,4 +1,4 @@ include("categorical_with_0.jl") include("discretizer.jl") -include("discrete_dist.jl") include("zero_inflated.jl") +include("discrete_dist.jl") From a66b08c4b3751f0676df2182c8bf8a7fa466c050 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 13:15:27 +0100 Subject: [PATCH 094/266] typo --- src/distributions/discrete_dist.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index dfeb507..f603e33 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -9,7 +9,7 @@ function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) ps = zeros(non_zero_labels_counts(disc)) for i in 1:non_zero_labels_counts(disc) lb, ub = NetworkHistogram.decode(disc, i) - ps[i] = cdf(dist, ub) - cdf(dist, lb) + ps[i] = cdf(d, ub) - cdf(d, lb) end probs = ZeroInflatedCategorical(0.0, ps) return DiscretizedDistribution(disc, probs) @@ -21,7 +21,7 @@ function DiscretizedDistribution(d::ZeroInflated, n_bins::Int, support_bound = e ps = 
zeros(non_zero_labels_counts(disc)) for i in 1:non_zero_labels_counts(disc) lb, ub = NetworkHistogram.decode(disc, i) - ps[i] = cdf(dist, ub) - cdf(dist, lb) + ps[i] = cdf(d, ub) - cdf(d, lb) end probs = ZeroInflatedCategorical(pdf(d,0.0), ps) return DiscretizedDistribution(disc, probs) From 4cd29621c8b4db80748938eb377f85e5bb24df03 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 13:30:15 +0100 Subject: [PATCH 095/266] change rescaling issue on 0 --- src/distributions/categorical_with_0.jl | 6 ++---- src/distributions/discrete_dist.jl | 5 ++++- src/distributions/zero_inflated.jl | 5 ++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index 3d67204..5c0982a 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -4,6 +4,7 @@ struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution end _dirac_delta(x) = x == 0 ? one(x) : zero(x) +_dirac_delta(x, lb, ub) = lb <= x <= ub ? one(x) : zero(x) function ZeroInflatedCategorical(p::Real, dist::D) where {D} if p < 0 @@ -72,19 +73,16 @@ function Distributions.fit( end end - function get_params_cat_like(dist::ZeroInflatedCategorical) p = first(params(dist.edge_proba)) probs = vcat(params(dist.dist)...) - return vcat(1-p, probs .* p) + return vcat(1 - p, probs .* p) end - function Base.convert(::Type{<:ZeroInflatedCategorical}, d::D) where {D} return ZeroInflatedCategorical(1.0, d) end - function Base.convert(T::Type{<:Categorical}, d::ZeroInflatedCategorical) return T(get_params_cat_like(d)) end diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index f603e33..a572e5c 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -20,7 +20,7 @@ function DiscretizedDistribution(d::ZeroInflated, n_bins::Int, support_bound = e disc = DiscretizerZeroToZero(n_bins, support_bound...) 
ps = zeros(non_zero_labels_counts(disc)) for i in 1:non_zero_labels_counts(disc) - lb, ub = NetworkHistogram.decode(disc, i) + lb, ub = decode(disc, i) ps[i] = cdf(d, ub) - cdf(d, lb) end probs = ZeroInflatedCategorical(pdf(d,0.0), ps) @@ -36,6 +36,9 @@ function pdf(d::DiscretizedDistribution, x::Real) if !support_encoding(d.discretizer, x) return 0.0 end + if x == 0 + return pdf(d.probs, x) + end bin = encode(d.discretizer, x) return pdf(d.probs, bin) / binwidth(d.discretizer) end diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 00d5418..2e70353 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -24,20 +24,19 @@ maximum(d::ZeroInflated) = max(maximum(d.dist), 0) insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) function Distributions.cdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * cdf(d.dist, x) + return pdf(d.edge_proba, 0) * _dirac_delta(x, 0, Inf) + cdf(d.dist, x) end function Distributions.params(d::ZeroInflated) (first(params(d.edge_proba)), params(d.dist)...) end - function Distributions.fit( ::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) where {B, D} indices_0 = findall(x -> x == 0, data) p = length(indices_0) / length(data) if p != 1 - return ZeroInflated(p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) + return ZeroInflated(p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) else return ZeroInflated(1.0, D()) end From 230fdd400a405b894e90b04288ba38b0e3106a57 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 13:40:16 +0100 Subject: [PATCH 096/266] hack for fast bin checking. 
Will fail if discretizer has other bins than 0 --- src/distributions/discrete_dist.jl | 10 +++++++--- src/distributions/discretizer.jl | 22 +++++++++++++--------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index a572e5c..e0f649b 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -32,13 +32,14 @@ function DiscretizedDistribution(discretizer::Discretizer) discretizer, ZeroInflatedCategorical(non_zero_labels_counts(discretizer))) end +# fast trick, will fail if discretizer put other categorical bins.... function pdf(d::DiscretizedDistribution, x::Real) + if x == 0 + return pdf(d.probs, 0) + end if !support_encoding(d.discretizer, x) return 0.0 end - if x == 0 - return pdf(d.probs, x) - end bin = encode(d.discretizer, x) return pdf(d.probs, bin) / binwidth(d.discretizer) end @@ -47,6 +48,9 @@ function logpdf(d::DiscretizedDistribution, x::Real) if !support_encoding(d.discretizer, x) return -Inf end + if x == 0 + return log(pdf(d.probs, 0)) + end bin = encode(d.discretizer, x) return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) end diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index e86c717..a5ae37d 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -41,7 +41,7 @@ function encode(d::RegularDiscretizer, x::Real) end function _decode_randomly(rng::Random.AbstractRNG, d::RegularDiscretizer, bin::Int) - hi,lo = decode(d, bin) + hi, lo = decode(d, bin) return lo + (hi - lo) * rand(rng) end @@ -67,7 +67,6 @@ end non_zero_labels_counts(d::RegularDiscretizer) = nlabels(d) - """ Maps a set of categories to a set of bins """ @@ -100,7 +99,9 @@ function nlabels(d::CategoryDiscretizer) return length(d.bin_to_cat) end - +function binwidth(d::CategoryDiscretizer{F,T}, x::T) where {F,T} + return length(d.bin_to_cat[x]) +end function 
non_zero_labels_counts(d::CategoryDiscretizer) if 0 ∈ keys(d.bin_to_cat) @@ -127,7 +128,6 @@ struct HybridDiscretizer{F, F2, T, L} <: Discretizer cat::CategoryDiscretizer{F2, T} end - # change so that atoms can be packed together if wanted function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) cat_to_bin = Dict(a => n_bins + i for (i, a) in enumerate(atoms)) @@ -141,7 +141,6 @@ function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) ) end - function DiscretizerZeroToZero(n_bins, lower_bound, upper_bound) cat_to_bin = Dict([0 => 0]) bin_to_cat = Dict([0 => 0]) @@ -158,7 +157,6 @@ function support_encoding(d::HybridDiscretizer, x) return support_encoding(d.lin, x) || support_encoding(d.cat, x) end - function minimum(d::HybridDiscretizer) return min(minimum(d.lin), minimum(d.cat)) end @@ -177,6 +175,14 @@ end binwidth(d::HybridDiscretizer) = binwidth(d.lin) +function binwidth(d::HybridDiscretizer, bin) + if haskey(d.cat.cat_to_bin, bin) + return binwidth(d.cat, bin) + else + return binwidth(d.lin) + end +end + function encode(d::HybridDiscretizer, x::Real) if haskey(d.cat.cat_to_bin, x) return encode(d.cat, x) @@ -193,16 +199,14 @@ function decode(d::HybridDiscretizer, bin::Int) end end - function _decode_randomly(rng::Random.AbstractRNG, d::HybridDiscretizer, bin::Int) - if haskey(d.cat.bin_to_cat, bin) + if haskey(d.cat.bin_to_cat, bin) return decode(d.cat, bin) else return _decode_randomly(rng, d.lin, bin) end end - function auto_nbins(data) binwidth = 2iqr(data) / cbrt(n) lo, hi = extrema(data) From cf5181d09f1e7bfb4d4d4503d3c17ff55e30e212 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 14:00:09 +0100 Subject: [PATCH 097/266] add hacky cdf computation --- src/distributions/discrete_dist.jl | 55 ++++++++++++++++-------------- src/distributions/zero_inflated.jl | 3 +- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index e0f649b..7fbe144 
100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -15,7 +15,6 @@ function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) return DiscretizedDistribution(disc, probs) end - function DiscretizedDistribution(d::ZeroInflated, n_bins::Int, support_bound = extrema(d)) disc = DiscretizerZeroToZero(n_bins, support_bound...) ps = zeros(non_zero_labels_counts(disc)) @@ -23,7 +22,7 @@ function DiscretizedDistribution(d::ZeroInflated, n_bins::Int, support_bound = e lb, ub = decode(disc, i) ps[i] = cdf(d, ub) - cdf(d, lb) end - probs = ZeroInflatedCategorical(pdf(d,0.0), ps) + probs = ZeroInflatedCategorical(pdf(d, 0.0), ps) return DiscretizedDistribution(disc, probs) end @@ -32,29 +31,6 @@ function DiscretizedDistribution(discretizer::Discretizer) discretizer, ZeroInflatedCategorical(non_zero_labels_counts(discretizer))) end -# fast trick, will fail if discretizer put other categorical bins.... -function pdf(d::DiscretizedDistribution, x::Real) - if x == 0 - return pdf(d.probs, 0) - end - if !support_encoding(d.discretizer, x) - return 0.0 - end - bin = encode(d.discretizer, x) - return pdf(d.probs, bin) / binwidth(d.discretizer) -end - -function logpdf(d::DiscretizedDistribution, x::Real) - if !support_encoding(d.discretizer, x) - return -Inf - end - if x == 0 - return log(pdf(d.probs, 0)) - end - bin = encode(d.discretizer, x) - return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) -end - function rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) bin = rand(rng, d.probs) return _decode_randomly(rng, d.discretizer, bin) @@ -87,3 +63,32 @@ end function set_params!(d::DiscretizedDistribution{D, L}, params) where {D, L} d.probs = L(params...) end + +# fast trick, will fail if discretizer put other categorical bins.... 
+function pdf(d::DiscretizedDistribution, x::Real) + if x == 0 + return pdf(d.probs, 0) + end + if !support_encoding(d.discretizer, x) + return 0.0 + end + bin = encode(d.discretizer, x) + return pdf(d.probs, bin) / binwidth(d.discretizer) +end + +function logpdf(d::DiscretizedDistribution, x::Real) + if !support_encoding(d.discretizer, x) + return -Inf + end + if x == 0 + return log(pdf(d.probs, 0)) + end + bin = encode(d.discretizer, x) + return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) +end + +#lazy cdf computation, not efficient +function Distributions.cdf(d::DiscretizedDistribution, x::Real; step::Real = 0.01) + return mean(pdf(d, minimum(d):step:x)) * (x - minimum(d)) + + pdf(dist, 0) * _dirac_delta(x, 0.0, Inf) +end diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 2e70353..610f510 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -24,7 +24,8 @@ maximum(d::ZeroInflated) = max(maximum(d.dist), 0) insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) function Distributions.cdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, 0) * _dirac_delta(x, 0, Inf) + cdf(d.dist, x) + return pdf(d.edge_proba, 0) * _dirac_delta(x, 0, Inf) + + cdf(d.dist, x) * pdf(d.edge_proba, 1) end function Distributions.params(d::ZeroInflated) From e23fdf59e4f067009923fd40d05b19eb3141fa70 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 14:01:28 +0100 Subject: [PATCH 098/266] typo.... 
--- src/distributions/discrete_dist.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 7fbe144..a1e2b24 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -90,5 +90,5 @@ end #lazy cdf computation, not efficient function Distributions.cdf(d::DiscretizedDistribution, x::Real; step::Real = 0.01) return mean(pdf(d, minimum(d):step:x)) * (x - minimum(d)) + - pdf(dist, 0) * _dirac_delta(x, 0.0, Inf) + pdf(d, 0) * _dirac_delta(x, 0.0, Inf) end From d44797992eb339a25ece9b9ee557d5895bd9ed1e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 18:16:17 +0100 Subject: [PATCH 099/266] update import for bootstrap --- src/NetworkHistogram.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index feb54e0..61cc7bb 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -16,8 +16,7 @@ import StatsAPI: loglikelihood, fit using CategoricalArrays, CategoricalDistributions using Combinatorics: permutations using StaticArrays -using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx -import Bootstrap: bootstrap +using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx, bootstrap, nrun import Base.maximum, Base.minimum import Random: rand import Base.convert From b7df926b6f52d9dae6df746557612b8476ae119f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 11 Dec 2024 18:19:01 +0100 Subject: [PATCH 100/266] fix imports for bootstrap --- src/NetworkHistogram.jl | 3 ++- src/bootstrap.jl | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 61cc7bb..3746827 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -16,7 +16,8 @@ import StatsAPI: loglikelihood, fit using CategoricalArrays, CategoricalDistributions using Combinatorics: permutations using StaticArrays 
-using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx, bootstrap, nrun +using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx, nrun, zeros_tuple +import Bootstrap: bootstrap import Base.maximum, Base.minimum import Random: rand import Base.convert diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 0b3ff97..2cb7fc5 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -2,7 +2,7 @@ function bootstrap(statistic::Function, data::AbstractMatrix, model::BlockModel, sampling::BootstrapSampling) t0 = tx(statistic(data)) m = nrun(sampling) - t1 = Bootstrap.zeros_tuple(t0, m) + t1 = zeros_tuple(t0, m) data1 = copy(data) for i in 1:m draw_and_fill!(data1, model) From 9c5e454a34b62b3355f6a80999e5fa20605860ed Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 13 Dec 2024 08:42:29 +0100 Subject: [PATCH 101/266] add helper for alignment --- src/sbm.jl | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/sbm.jl b/src/sbm.jl index ad84c72..66e0f61 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -159,3 +159,26 @@ function align_sbm!(sbm::BlockModel, perm) sbm.probs .= sbm.probs[perm, perm] sbm.sizes .= sbm.sizes[perm] end + + +""" + order_groups(a::Assignment, latents::AbstractVector) + +Order the groups of an assignment according to the true latents. This is an heuristic +approach, which is not guaranteed to find the true ordering of the groups. 
+""" +function order_groups(a::Assignment, latents::AbstractVector) + n = number_nodes(a) + k = number_groups(a) + sort_perm = sortperm(latents) + sorted_group_labels = a.node_labels[sort_perm] + dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] + counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) + for group in 1:k) + return sort(1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) +end + + +function align_sbm_true_latents!(sbm::BlockModel, a::Assignment, latents) + align_sbm!(sbm, order_groups(a, latents)) +end From b341676b4d37fa76eb1ccd39ad0eb238c8c5d0f3 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 13 Dec 2024 21:01:34 +0100 Subject: [PATCH 102/266] minor fixes to cdf and fit of ZeroInflatedCategorical --- src/assignments/SparseAssignment/struct.jl | 5 +++++ src/assignments/SparseAssignment/swap.jl | 4 ++-- src/distributions/categorical_with_0.jl | 8 +++++--- src/distributions/discrete_dist.jl | 13 ++++++++++--- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl index 26daf0a..9660aca 100644 --- a/src/assignments/SparseAssignment/struct.jl +++ b/src/assignments/SparseAssignment/struct.jl @@ -104,6 +104,11 @@ function _n_decorations_with_0(a::SparseAssignment) return size(a.additional_data.estimated_theta, 1) + 1 end + +function _n_decorations_not_0(a::SparseAssignment) + return size(a.additional_data.estimated_theta, 1) +end + function loglikelihood(assignment::SparseAssignment, g::Observations) return assignment.additional_data.log_likelihood end diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index 96825f5..637fff8 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -101,7 +101,7 @@ function _update_counts!(counts, g_from, g_to, missing_update) end function fit(a::SparseAssignment, g::Observations) - 
dists = initialize_sbm(a.group_size, ZeroInflatedCategorical(_n_decorations_with_0(a))) + dists = initialize_sbm(a.group_size, ZeroInflatedCategorical(_n_decorations_not_0(a))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = a.additional_data.estimated_theta[:, group1, group2] @@ -115,7 +115,7 @@ end function fit(a::SparseAssignment, g::Observations{G, <:DiscretizedDistribution}) where {G} dists = initialize_sbm(a.group_size, DiscretizedDistribution( - g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_with_0(a)))) + g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_not_0(a)))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = a.additional_data.estimated_theta[:, group1, group2] diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index 5c0982a..dc20d5c 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -52,7 +52,8 @@ maximum(d::ZeroInflatedCategorical) = max(maximum(d.dist), 0) insupport(d::ZeroInflatedCategorical, x::Real) = x == 0 || insupport(d.dist, x) function Distributions.cdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * cdf(d.dist, x) + return pdf(d.edge_proba, 0) * _dirac_delta(x, 0, Inf) + + pdf(d.edge_proba, 1) * cdf(d.dist, x) end function Distributions.params(d::ZeroInflatedCategorical) @@ -62,11 +63,12 @@ end ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) function Distributions.fit( - ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where {B, D} + ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where { + B, D <: Categorical} indices_0 = findall(x -> x == 0, data) p = length(indices_0) / length(data) if p != 1 - dist = fit(D, data[setdiff(1:end, indices_0)]) + dist = fit_mle(Categorical, n_cat, data[setdiff(1:end, indices_0)]) return ZeroInflatedCategorical(p, dist) else 
return ZeroInflatedCategorical(1.0, zeros(n_cat)) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index a1e2b24..90600f8 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -88,7 +88,14 @@ function logpdf(d::DiscretizedDistribution, x::Real) end #lazy cdf computation, not efficient -function Distributions.cdf(d::DiscretizedDistribution, x::Real; step::Real = 0.01) - return mean(pdf(d, minimum(d):step:x)) * (x - minimum(d)) + - pdf(d, 0) * _dirac_delta(x, 0.0, Inf) +function Distributions.cdf( + d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} + bin = encode(d.discretizer, x) + result = (x == 0) * cdf(d.probs, 0) + if bin != 0 + lb, ub = decode(d.discretizer, bin) + result += cdf(d.probs, bin - 1) + + (cdf(d.probs, bin) - cdf(d.probs, bin - 1)) * (x - lb) / (ub - lb) + end + return result end From a099721bfb13d2b356243dde9168c1f2fc7d01bc Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 13 Dec 2024 21:13:50 +0100 Subject: [PATCH 103/266] ? 
--- src/assignments/SparseAssignment/swap.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index 637fff8..96825f5 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -101,7 +101,7 @@ function _update_counts!(counts, g_from, g_to, missing_update) end function fit(a::SparseAssignment, g::Observations) - dists = initialize_sbm(a.group_size, ZeroInflatedCategorical(_n_decorations_not_0(a))) + dists = initialize_sbm(a.group_size, ZeroInflatedCategorical(_n_decorations_with_0(a))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = a.additional_data.estimated_theta[:, group1, group2] @@ -115,7 +115,7 @@ end function fit(a::SparseAssignment, g::Observations{G, <:DiscretizedDistribution}) where {G} dists = initialize_sbm(a.group_size, DiscretizedDistribution( - g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_not_0(a)))) + g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_with_0(a)))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = a.additional_data.estimated_theta[:, group1, group2] From 93c30637719ac3a8c260e31f52945563e6e9049f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 13 Dec 2024 21:15:41 +0100 Subject: [PATCH 104/266] test encoding max bin --- src/assignments/SparseAssignment/swap.jl | 4 ++-- src/distributions/discretizer.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index 96825f5..637fff8 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -101,7 +101,7 @@ function _update_counts!(counts, g_from, g_to, missing_update) end function fit(a::SparseAssignment, g::Observations) - dists = initialize_sbm(a.group_size, ZeroInflatedCategorical(_n_decorations_with_0(a))) + dists = 
initialize_sbm(a.group_size, ZeroInflatedCategorical(_n_decorations_not_0(a))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = a.additional_data.estimated_theta[:, group1, group2] @@ -115,7 +115,7 @@ end function fit(a::SparseAssignment, g::Observations{G, <:DiscretizedDistribution}) where {G} dists = initialize_sbm(a.group_size, DiscretizedDistribution( - g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_with_0(a)))) + g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_not_0(a)))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = a.additional_data.estimated_theta[:, group1, group2] diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index a5ae37d..860abfe 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -34,7 +34,7 @@ function maximum(d::RegularDiscretizer) end function encode(d::RegularDiscretizer, x::Real) - if x == d.upper_bound + if x >= d.upper_bound return d.n_bins end return d.bin_labels[convert(Int, div(x - d.lower_bound, d.bin_width) + 1)] From d22f08aaa90d1c7aac5859103e4d59d9e471b893 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 14 Dec 2024 18:45:22 +0100 Subject: [PATCH 105/266] correct bin numbers --- src/observations.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/observations.jl b/src/observations.jl index e7f52eb..93d6fc7 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -153,5 +153,5 @@ function _graph_to_mat(g::Observations{<:AbstractMatrix, D}) where {D<:Univariat end function get_num_levels_from_groups(n, number_groups) - return n^(0.5 * (1 - log(number_groups) / log(n))) + return ceil(Int, n / number_groups) end From 105f237ed1e90dcba49b26c8b468e3779cac4ae1 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 14 Dec 2024 19:00:16 +0100 Subject: [PATCH 106/266] throw on edge cases for number of bins --- src/observations.jl | 2 +- 
.../config_rules/bandwidth_selection_rule.jl | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/observations.jl b/src/observations.jl index 93d6fc7..59a9ecf 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -153,5 +153,5 @@ function _graph_to_mat(g::Observations{<:AbstractMatrix, D}) where {D<:Univariat end function get_num_levels_from_groups(n, number_groups) - return ceil(Int, n / number_groups) + return max(1, ceil(Int, n / number_groups)) end diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl index a378dcb..efa4da2 100644 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ b/src/optimisation/config_rules/bandwidth_selection_rule.jl @@ -11,7 +11,6 @@ struct OracleH <: KSelectionRule H::Int end - function OracleM(M) return OracleM(M, 1.0) end @@ -44,7 +43,7 @@ How to select the number of blocks `K` for the BlockModel model. select_number_node_per_block function select_number_node_per_block(g::Observations, rule::OracleH) - if rule.H > number_nodes(g)÷2 + if rule.H > number_nodes(g) ÷ 2 throw(ArgumentError("The number of nodes per block $(rule.H) is too large for the \ number of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)")) end @@ -63,8 +62,8 @@ end function select_number_node_per_block(g::Observations, rule::OracleM) rho = density(g) n = number_nodes(g) - k = max(2, round(Int, (2 * rule.M * rho)^(-1 / 4) * sqrt(n))) - return select_number_node_per_block(g, OracleH(k)) + h = min(max(2, round(Int, (2 * rule.M * rho)^(-1 / 4) * sqrt(n))), n ÷ 2) + return select_number_node_per_block(g, OracleH(h)) end function select_number_node_per_block(g::Observations, rule::EstimatedM) @@ -92,7 +91,7 @@ function estimated_number_nodes_per_block( end function _approx_k_from_delta_f(u, mult, midpoints, ρ, α = 1.0) - sort!(u, dims=1) + sort!(u, dims = 1) uMid = u[midpoints] β₀, β₁ = hcat(ones(length(uMid)), 1:length(uMid)) 
\ uMid # from Olhede and Wolfe (2014), equation (11) From e271a5c41666816026bf357f9dd5f807377463f6 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 14 Dec 2024 20:45:56 +0100 Subject: [PATCH 107/266] revert to correct number of bins given k --- src/observations.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/observations.jl b/src/observations.jl index 59a9ecf..4e23916 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -152,6 +152,10 @@ function _graph_to_mat(g::Observations{<:AbstractMatrix, D}) where {D<:Univariat return g.graph end + +""" +Get the number of levels for the discretized distribution given n and k. +""" function get_num_levels_from_groups(n, number_groups) - return max(1, ceil(Int, n / number_groups)) + return max(1, n^(0.5 * (1 - log(number_groups) / log(n)))) end From e1cda212bf915321f884b9d0b2915d5b89042d44 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 14 Dec 2024 21:08:48 +0100 Subject: [PATCH 108/266] bound check for cdf support --- src/distributions/discrete_dist.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 90600f8..fad8f11 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -90,6 +90,7 @@ end #lazy cdf computation, not efficient function Distributions.cdf( d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} + !insupport(d, x) && return 0.0 bin = encode(d.discretizer, x) result = (x == 0) * cdf(d.probs, 0) if bin != 0 From 6bfef277364cd2bebefc016f75a8fb3bfb9604dc Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 14 Dec 2024 21:15:37 +0100 Subject: [PATCH 109/266] try to circumvent pdf(x,0)=Inf --- src/distributions/discrete_dist.jl | 4 ++-- src/distributions/zero_inflated.jl | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index fad8f11..50a94ab 
100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -8,7 +8,7 @@ function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) disc = DiscretizerZeroToZero(n_bins, support_bound...) ps = zeros(non_zero_labels_counts(disc)) for i in 1:non_zero_labels_counts(disc) - lb, ub = NetworkHistogram.decode(disc, i) + lb, ub = decode(disc, i) ps[i] = cdf(d, ub) - cdf(d, lb) end probs = ZeroInflatedCategorical(0.0, ps) @@ -22,7 +22,7 @@ function DiscretizedDistribution(d::ZeroInflated, n_bins::Int, support_bound = e lb, ub = decode(disc, i) ps[i] = cdf(d, ub) - cdf(d, lb) end - probs = ZeroInflatedCategorical(pdf(d, 0.0), ps) + probs = ZeroInflatedCategorical(get_proba_zero(d), ps) return DiscretizedDistribution(disc, probs) end diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 610f510..630400f 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -11,6 +11,10 @@ function Distributions.pdf(d::ZeroInflated, x::Real) return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * pdf(d.dist, x) end +function get_proba_zero(d::ZeroInflated) + return pdf(d.edge_proba, 0) +end + function rand(rng::Random.AbstractRNG, d::ZeroInflated) return rand(rng, d.edge_proba) * rand(rng, d.dist) end From e220bf114fcca4b35e7981a1000b9912ef818526 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 15 Dec 2024 22:40:29 +0100 Subject: [PATCH 110/266] need to take care of type generate by mixture --- src/distributions/discrete_dist.jl | 3 ++- src/distributions/discretizer.jl | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 50a94ab..4bb2bda 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -90,7 +90,8 @@ end #lazy cdf computation, not efficient function Distributions.cdf( d::DiscretizedDistribution{D, P}, x::Real) where 
{D, P <: ZeroInflatedCategorical} - !insupport(d, x) && return 0.0 + x < minimum(d) && return 0.0 + x > maximum(d) && return 1.0 bin = encode(d.discretizer, x) result = (x == 0) * cdf(d.probs, 0) if bin != 0 diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index 860abfe..8ae1404 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -142,8 +142,8 @@ function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) end function DiscretizerZeroToZero(n_bins, lower_bound, upper_bound) - cat_to_bin = Dict([0 => 0]) - bin_to_cat = Dict([0 => 0]) + cat_to_bin = Dict([0.0 => 0]) + bin_to_cat = Dict([0 => 0.0]) bin_width = (upper_bound - lower_bound) / n_bins return HybridDiscretizer( RegularDiscretizer{typeof(bin_width), Int, n_bins}( From 43f158ff94f7b0c2ba1ca24f4d37999d1373631d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 15 Dec 2024 22:51:43 +0100 Subject: [PATCH 111/266] force same type for both discretizers --- src/distributions/discretizer.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index 8ae1404..98c4708 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -123,9 +123,9 @@ end Uniformly discretizes a continuous distribution into a fixed number of bins of equal width, with additional bins for missing or special values. 
""" -struct HybridDiscretizer{F, F2, T, L} <: Discretizer +struct HybridDiscretizer{F, T, L} <: Discretizer lin::RegularDiscretizer{F, T, L} - cat::CategoryDiscretizer{F2, T} + cat::CategoryDiscretizer{F, T} end # change so that atoms can be packed together if wanted From f2b35f92d365cd91a737f571316de0887540be2a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 15 Dec 2024 22:56:38 +0100 Subject: [PATCH 112/266] return 0 and one of same type of x --- src/distributions/categorical_with_0.jl | 7 ++++--- src/distributions/discrete_dist.jl | 14 ++++++-------- src/distributions/zero_inflated.jl | 7 ++++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index dc20d5c..d327509 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -36,7 +36,8 @@ end ZeroInflatedCategorical(k::Int) = ZeroInflatedCategorical(ones(k + 1) ./ (k + 1)) function Distributions.pdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * pdf(d.dist, x) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + + pdf(d.edge_proba, one(x)) * pdf(d.dist, x) end function rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) @@ -52,8 +53,8 @@ maximum(d::ZeroInflatedCategorical) = max(maximum(d.dist), 0) insupport(d::ZeroInflatedCategorical, x::Real) = x == 0 || insupport(d.dist, x) function Distributions.cdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.edge_proba, 0) * _dirac_delta(x, 0, Inf) + - pdf(d.edge_proba, 1) * cdf(d.dist, x) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, 0, Inf) + + pdf(d.edge_proba, one(x)) * cdf(d.dist, x) end function Distributions.params(d::ZeroInflatedCategorical) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 4bb2bda..201f7b7 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ 
-67,10 +67,10 @@ end # fast trick, will fail if discretizer put other categorical bins.... function pdf(d::DiscretizedDistribution, x::Real) if x == 0 - return pdf(d.probs, 0) + return pdf(d.probs, zero(x)) end if !support_encoding(d.discretizer, x) - return 0.0 + return zero(x) end bin = encode(d.discretizer, x) return pdf(d.probs, bin) / binwidth(d.discretizer) @@ -80,9 +80,7 @@ function logpdf(d::DiscretizedDistribution, x::Real) if !support_encoding(d.discretizer, x) return -Inf end - if x == 0 - return log(pdf(d.probs, 0)) - end + x == 0 && return log(pdf(d.probs, x)) bin = encode(d.discretizer, x) return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) end @@ -90,10 +88,10 @@ end #lazy cdf computation, not efficient function Distributions.cdf( d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} - x < minimum(d) && return 0.0 - x > maximum(d) && return 1.0 + x < minimum(d) && return zero(x) + x > maximum(d) && return one(x) bin = encode(d.discretizer, x) - result = (x == 0) * cdf(d.probs, 0) + result = (x == 0) * cdf(d.probs, x) if bin != 0 lb, ub = decode(d.discretizer, bin) result += cdf(d.probs, bin - 1) + diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 630400f..90f7202 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -8,7 +8,8 @@ function ZeroInflated(p::Real, dist::D) where {D} end function Distributions.pdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, 0) * _dirac_delta(x) + pdf(d.edge_proba, 1) * pdf(d.dist, x) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + + pdf(d.edge_proba, one(x)) * pdf(d.dist, x) end function get_proba_zero(d::ZeroInflated) @@ -28,8 +29,8 @@ maximum(d::ZeroInflated) = max(maximum(d.dist), 0) insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) function Distributions.cdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, 0) * _dirac_delta(x, 0, Inf) + - cdf(d.dist, x) * 
pdf(d.edge_proba, 1) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, zero(x), Inf) + + cdf(d.dist, x) * pdf(d.edge_proba, one(x)) end function Distributions.params(d::ZeroInflated) From 322380effd0bf2f9493f7e4f88aba12fe1bbba2d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 15 Dec 2024 23:04:51 +0100 Subject: [PATCH 113/266] remove type instability --- src/distributions/discrete_dist.jl | 4 ++-- src/distributions/discretizer.jl | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 201f7b7..be7159f 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -93,9 +93,9 @@ function Distributions.cdf( bin = encode(d.discretizer, x) result = (x == 0) * cdf(d.probs, x) if bin != 0 - lb, ub = decode(d.discretizer, bin) result += cdf(d.probs, bin - 1) + - (cdf(d.probs, bin) - cdf(d.probs, bin - 1)) * (x - lb) / (ub - lb) + (cdf(d.probs, bin) - cdf(d.probs, bin - 1)) * + progress_in_bin(d.discretizer, x, bin) end return result end diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index 98c4708..6d93102 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -215,3 +215,23 @@ function auto_nbins(data) nbins = max(nbins_fd, nbins_sturges) return nbins end + + +function progress_in_bin(d::CategoryDiscretizer, x::Real, bin) + return one(x) +end + + +function progress_in_bin(d::RegularDiscretizer, x::Real, bin) + lo, hi = decode(d, bin) + return (x - lo) / (hi - lo) +end + + +function progress_in_bin(d::HybridDiscretizer, x::Real, bin) + if haskey(d.cat.bin_to_cat, bin) + return progress_in_bin(d.cat, x, bin) + else + return progress_in_bin(d.lin, x, bin) + end +end From 601935d466e07cf7a4dc706fbda1d67e3c432801 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 19 Dec 2024 19:31:44 +0100 Subject: [PATCH 114/266] correct counting with missing data --- Project.toml | 
3 ++ src/assignments/Assignments.jl | 6 +++ src/assignments/SparseAssignment/struct.jl | 57 ++++++++++---------- src/assignments/SparseAssignment/swap.jl | 1 + src/optimisation/config_rules/accept_rule.jl | 21 ++++++++ 5 files changed, 58 insertions(+), 30 deletions(-) diff --git a/Project.toml b/Project.toml index 57a5613..2412ed8 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" @@ -26,6 +27,8 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +[compat] +LossFunctions = "1.0.1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 6d00894..9645ece 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -97,3 +97,9 @@ function get_ordered_adjacency_matrix(a::Assignment, A, by=identity) perm = sortperm(a.node_labels, by=by) return A[perm, perm] end + + + +function Base.deepcopy(a::Assignment) + return Assignment(a.group_size, copy(a.node_labels), deepcopy(a.additional_data)) +end diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl index 9660aca..ca816d0 100644 --- a/src/assignments/SparseAssignment/struct.jl +++ b/src/assignments/SparseAssignment/struct.jl @@ -8,21 +8,18 @@ mutable struct SparseData{F, C} log_likelihood::F end - - const SparseAssignment{T, F, C} = Assignment{ T, SparseData{F, C}} const SparseInitRule{S, F} = InitRule{S, Val{SparseData}} 
-function SparseAssignment(g::Observations{G,D}, group_size::GroupSize, node_labels::Vector{Int}) where {G,D} +function SparseAssignment( + g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where {G, D} A = issparse(g.graph) ? g.graph : sparse(g.graph) num_levels = ncategories(g.dist_ref) sparse_data = SparseData(A, size(group_size, 1), num_levels, group_size, node_labels) return Assignment(group_size, node_labels, sparse_data) end - - function make_assignment(g, h, init_rule::SparseInitRule) group_size, node_labels = initialize_node_labels( @@ -30,25 +27,25 @@ function make_assignment(g, h, init_rule::SparseInitRule) return SparseAssignment(g, group_size, node_labels) end - function SparseData(A::SparseMatrixCSC{T, Int}, k::Int, level_count::Int, group_size, node_labels) where {T} n = size(A, 1) data = SparseData(zeros(Int, k, k), zeros(Int, level_count, k, k), - zeros(Float64, level_count, k, k), dropzeros(A), zeros(Int, level_count, k), zeros(Int, k), 0.0) + zeros(Float64, level_count, k, k), dropzeros(A), zeros(Int, level_count, k), zeros( + Int, k), 0.0) _count_possible_occurences!(data, group_size) _count_occurences!(data, node_labels) _fast_div!(data.estimated_theta, data.realized, data.counts) - data.log_likelihood = compute_log_likelihood_without_0(data.estimated_theta, data.realized, data.counts) + data.log_likelihood = compute_log_likelihood_without_0( + data.estimated_theta, data.realized, data.counts) return data end - function _count_possible_occurences!(data, group_size) k = size(group_size, 1) for j in 1:k data.counts[j, j] = group_size[j] * (group_size[j] - 1) ÷ 2 - for i in j+1:k + for i in (j + 1):k data.counts[i, j] = group_size[i] * group_size[j] data.counts[j, i] = group_size[i] * group_size[j] end @@ -56,42 +53,44 @@ function _count_possible_occurences!(data, group_size) end function _count_occurences!(data, node_labels) - rows = rowvals(data.A) - vals = nonzeros(data.A) m, n = size(data.A) - for j in 1:n - groupj = 
node_labels[j] - for i in nzrange(data.A, j) - row = rows[i] - val = vals[i] - groupi = node_labels[row] - if ismissing(val) - data.counts[groupi, groupj] -= 1 + for k in 1:length(unique(node_labels)) + for l in k:length(unique(node_labels)) + node_group_k = findall(x -> x == k, node_labels) + node_group_l = findall(x -> x == l, node_labels) + if k != l + counts = StatsBase.countmap(data.A[i,j] for i in node_group_k for j in node_group_l if i != j) else - data.realized[val, groupi, groupj] += 1 + counts = StatsBase.countmap(data.A[i,j] for i in node_group_k for j in node_group_l if i < j) end + for m in 1:size(data.realized, 1) + data.realized[m, k, l] = get(counts, m, 0) + data.realized[m, l, k] = get(counts, m, 0) + end + total_witouth_missing = sum(values(counts)) - get(counts, missing, 0) + data.counts[k, l] = total_witouth_missing + data.counts[l, k] = total_witouth_missing end end - for k in axes(data.realized, 2) - data.realized[:,k,k] ./= 2 - end end - function compute_log_likelihood_without_0( estimated_theta::Array{T, 3}, realized::Array{F, 3}, counts) where { T, F} loglik = zero(T) number_groups = size(estimated_theta, 2) number_decorations = size(estimated_theta, 1) - @inbounds for j in 1:number_groups + for j in 1:number_groups for i in j:number_groups total_decorations = counts[i, j] + if total_decorations < sum(realized[:, i, j]) + total_decorations = sum(realized[:, i, j]) + end loglik -= xlogx(total_decorations) for m in 1:number_decorations - loglik += xlogx(realized[m,i,j]) - total_decorations -= realized[m,i,j] + loglik += xlogx(realized[m, i, j]) + total_decorations -= realized[m, i, j] end loglik += xlogx(total_decorations) end @@ -99,12 +98,10 @@ function compute_log_likelihood_without_0( return loglik end - function _n_decorations_with_0(a::SparseAssignment) return size(a.additional_data.estimated_theta, 1) + 1 end - function _n_decorations_not_0(a::SparseAssignment) return size(a.additional_data.estimated_theta, 1) end diff --git 
a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index 637fff8..796d8ce 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -35,6 +35,7 @@ function revert_swap!( return nothing end +# this function fails in presence of missing values function apply_swap!( a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} update_observed_and_labels!(a, swap) diff --git a/src/optimisation/config_rules/accept_rule.jl b/src/optimisation/config_rules/accept_rule.jl index 46e7bd6..1029f0b 100644 --- a/src/optimisation/config_rules/accept_rule.jl +++ b/src/optimisation/config_rules/accept_rule.jl @@ -13,11 +13,32 @@ be updated in place if the swap is accepted. """ accept_reject_update! +function slow_swap!(a, swap) + swap_node_labels!(a, swap.index1, swap.index2) + _count_occurences!(a.additional_data, a.node_labels) + update_ll!(a) +end + function accept_reject_update!(a::Assignment, swap::Swap, g, ::Strict) # calculate the score of the current assignment current_score = score(a, g) # perform the swap + #a_star = deepcopy(a) + #swap_star = deepcopy(swap) apply_swap!(a, swap) + #slow_swap!(a_star, swap_star) + # if any(a_star.additional_data.realized .!= a.additional_data.realized) + # println("The slow and fast swap functions do not agree on realized") + # if any(a_star.additional_data.counts .!= a.additional_data.counts) + # println("The slow and fast swap functions do not agree on counts") + # end + # println("a_star.additional_data.realized: ", a_star.additional_data.realized) + # println("a.additional_data.realized: ", a.additional_data.realized) + # println("a_star.additional_data.counts: ", a_star.additional_data.counts) + # println("a.additional_data.counts: ", a.additional_data.counts) + # error("The slow and fast swap functions do not agree after swapping labels ", swap.index1, " and ", swap.index2) + # end + # if the new assignment is worse, revert the swap if score(a, g) <= 
current_score revert_swap!(a, swap) From 79fa741fae95786d35e478a83b58fa06e8937486 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 20 Dec 2024 17:55:37 +0100 Subject: [PATCH 115/266] add docstrings and take care of edge cases --- src/distributions/categorical_with_0.jl | 48 +++++++++++++++++++ src/distributions/discrete_dist.jl | 63 +++++++++++++++++++------ src/distributions/discretizer.jl | 10 ++++ src/distributions/zero_inflated.jl | 43 +++++++++++++++++ 4 files changed, 150 insertions(+), 14 deletions(-) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index d327509..d76f046 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -1,3 +1,21 @@ +""" + struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution + +A zero-inflated categorical distribution that combines a Bernoulli distribution with a categorical distribution. + +# Fields +- `edge_proba::B`: The Bernoulli distribution representing the probability of zero. +- `dist::D`: The categorical distribution. + +# Constructors +- `ZeroInflatedCategorical(p::Real, dist::D)`: Creates a zero-inflated categorical distribution with probability `p` of zero and categorical distribution `dist`. + +# Mathematical Explanation +The zero-inflated categorical distribution modifies the original categorical distribution by introducing a probability `p` of zero. The `pmf` and `cdf` are adjusted accordingly: +- `pdf(x) = p * δ(x) + (1 - p) * pmf_original(x)` +- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` +where `δ(x)` is the Dirac delta function. +""" struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution edge_proba::B dist::D @@ -35,11 +53,26 @@ end ZeroInflatedCategorical(k::Int) = ZeroInflatedCategorical(ones(k + 1) ./ (k + 1)) +""" + Distributions.pdf(d::ZeroInflatedCategorical, x::Real) + +Computes the probability mass function (pmf) of the zero-inflated categorical distribution `d` at `x`. 
+ +# Mathematical Explanation +The `pmf` of the zero-inflated categorical distribution is given by: +- `pmf(x) = p * δ(x) + (1 - p) * pmf_original(x)` +where `p` is the probability of zero, `δ(x)` is the Dirac delta function, and `pmf_original(x)` is the pmf of the original categorical distribution. +""" function Distributions.pdf(d::ZeroInflatedCategorical, x::Real) return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + pdf(d.edge_proba, one(x)) * pdf(d.dist, x) end +""" + rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) + +Generates a random sample from the zero-inflated categorical distribution `d` using the random number generator `rng`. +""" function rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) return rand(rng, d.edge_proba) * rand(rng, d.dist) end @@ -52,6 +85,16 @@ maximum(d::ZeroInflatedCategorical) = max(maximum(d.dist), 0) insupport(d::ZeroInflatedCategorical, x::Real) = x == 0 || insupport(d.dist, x) +""" + Distributions.cdf(d::ZeroInflatedCategorical, x::Real) + +Computes the cumulative distribution function (cdf) of the zero-inflated categorical distribution `d` at `x`. + +# Mathematical Explanation +The `cdf` of the zero-inflated categorical distribution is given by: +- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` +where `p` is the probability of zero, `δ(x)` is the Dirac delta function, and `cdf_original(x)` is the cdf of the original categorical distribution. +""" function Distributions.cdf(d::ZeroInflatedCategorical, x::Real) return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, 0, Inf) + pdf(d.edge_proba, one(x)) * cdf(d.dist, x) @@ -63,6 +106,11 @@ end ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) +""" + Distributions.fit(::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) + +Fits a zero-inflated categorical distribution to the given data. 
+""" function Distributions.fit( ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where { B, D <: Categorical} diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index be7159f..75fe1ee 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -1,3 +1,20 @@ +""" + struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution + +A discretized distribution that combines a discretizer with a zero-inflated categorical distribution. + +# Fields +- `discretizer::D`: The discretizer used to discretize the continuous distribution. +- `probs::L`: The zero-inflated categorical distribution representing the discretized probabilities. + +# Constructors +- `DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d))`: Creates a discretized distribution with `n_bins` bins and support bound `support_bound`. + +# Mathematical Explanation +The discretized distribution modifies the original continuous distribution by dividing it into `n_bins` bins. The `pdf` and `cdf` are adjusted accordingly: +- `pdf(x) = pdf_discretized(bin) / bin_width` +- `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` +""" mutable struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution where {D, L} discretizer::D @@ -31,30 +48,27 @@ function DiscretizedDistribution(discretizer::Discretizer) discretizer, ZeroInflatedCategorical(non_zero_labels_counts(discretizer))) end +""" + rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) + +Generates a random sample from the discretized distribution `d` using the random number generator `rng`. 
+""" function rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) bin = rand(rng, d.probs) return _decode_randomly(rng, d.discretizer, bin) end -function minimum(d::DiscretizedDistribution) - return minimum(d.discretizer) -end +minimum(d::DiscretizedDistribution) = minimum(d.discretizer) -function maximum(d::DiscretizedDistribution) - return maximum(d.discretizer) -end +maximum(d::DiscretizedDistribution) = maximum(d.discretizer) -function insupport(d::DiscretizedDistribution, x::Real) - return support_encoding(d.discretizer, x) -end +insupport(d::DiscretizedDistribution, x::Real) = support_encoding(d.discretizer, x) function Base.convert(::Type{DiscretizedDistribution}, d::D) where {D} return DiscretizedDistribution(d, 10) end -function Distributions.ncategories(d::DiscretizedDistribution) - return ncategories(d.probs) -end +ncategories(d::DiscretizedDistribution) = ncategories(d.probs) function Distributions.fit(::Type{<:DiscretizedDistribution{D, L}}, data) where {D, L} return fit(L, data) @@ -64,7 +78,15 @@ function set_params!(d::DiscretizedDistribution{D, L}, params) where {D, L} d.probs = L(params...) end -# fast trick, will fail if discretizer put other categorical bins.... +""" + Distributions.pdf(d::DiscretizedDistribution, x::Real) + +Computes the probability density function (pdf) of the discretized distribution `d` at `x`. + +# Mathematical Explanation +The `pdf` of the discretized distribution is computed as: +- `pdf(x) = pdf_discretized(bin) / bin_width` +""" function pdf(d::DiscretizedDistribution, x::Real) if x == 0 return pdf(d.probs, zero(x)) @@ -76,6 +98,11 @@ function pdf(d::DiscretizedDistribution, x::Real) return pdf(d.probs, bin) / binwidth(d.discretizer) end +""" + Distributions.logpdf(d::DiscretizedDistribution, x::Real) + +Computes the log of the probability density function (logpdf) of the discretized distribution `d` at `x`. 
+""" function logpdf(d::DiscretizedDistribution, x::Real) if !support_encoding(d.discretizer, x) return -Inf @@ -85,7 +112,15 @@ function logpdf(d::DiscretizedDistribution, x::Real) return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) end -#lazy cdf computation, not efficient +""" + Distributions.cdf(d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} + +Computes the cumulative distribution function (cdf) of the discretized distribution `d` at `x`. + +# Mathematical Explanation +The `cdf` of the discretized distribution is computed as: +- `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` +""" function Distributions.cdf( d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} x < minimum(d) && return zero(x) diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index 6d93102..b27867f 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -21,6 +21,16 @@ struct RegularDiscretizer{F, T, L} <: Discretizer bin_width::F end +function RegularDiscretizer(n_bins::Int, lower_bound::F, upper_bound::F) where {F} + if !isfinite(lower_bound) || !isfinite(upper_bound) + throw(ArgumentError("RegularDiscretizer requires finite lower and upper bounds.")) + end + bin_width = (upper_bound - lower_bound) / n_bins + return RegularDiscretizer( + n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), bin_width + ) +end + function support_encoding(d::RegularDiscretizer, x::Real) return d.lower_bound <= x <= d.upper_bound end diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 90f7202..260fe74 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -1,3 +1,21 @@ +""" + struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution + +A zero-inflated distribution that combines a Bernoulli distribution with a continuous distribution. 
+ +# Fields +- `edge_proba::B`: The Bernoulli distribution representing the probability of zero. +- `dist::D`: The continuous distribution. + +# Constructors +- `ZeroInflated(p::Real, dist::D)`: Creates a zero-inflated distribution with probability `p` of zero and continuous distribution `dist`. + +# Mathematical Explanation +The zero-inflated distribution modifies the original distribution by introducing a probability `p` of zero. The `pdf` and `cdf` are adjusted accordingly: +- `pdf(x) = p * δ(x) + (1 - p) * pdf_original(x)` +- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` +where `δ(x)` is the Dirac delta function. +""" struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution edge_proba::B dist::D @@ -7,15 +25,30 @@ function ZeroInflated(p::Real, dist::D) where {D} return ZeroInflated(Bernoulli(1 - p), dist) end +""" + Distributions.pdf(d::ZeroInflated, x::Real) + +Computes the probability density function (pdf) of the zero-inflated distribution `d` at `x`. +""" function Distributions.pdf(d::ZeroInflated, x::Real) return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + pdf(d.edge_proba, one(x)) * pdf(d.dist, x) end +""" + get_proba_zero(d::ZeroInflated) + +Returns the probability of zero for the zero-inflated distribution `d`. +""" function get_proba_zero(d::ZeroInflated) return pdf(d.edge_proba, 0) end +""" + rand(rng::Random.AbstractRNG, d::ZeroInflated) + +Generates a random sample from the zero-inflated distribution `d` using the random number generator `rng`. +""" function rand(rng::Random.AbstractRNG, d::ZeroInflated) return rand(rng, d.edge_proba) * rand(rng, d.dist) end @@ -28,6 +61,11 @@ maximum(d::ZeroInflated) = max(maximum(d.dist), 0) insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) +""" + Distributions.cdf(d::ZeroInflated, x::Real) + +Computes the cumulative distribution function (cdf) of the zero-inflated distribution `d` at `x`. 
+""" function Distributions.cdf(d::ZeroInflated, x::Real) return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, zero(x), Inf) + cdf(d.dist, x) * pdf(d.edge_proba, one(x)) @@ -37,6 +75,11 @@ function Distributions.params(d::ZeroInflated) (first(params(d.edge_proba)), params(d.dist)...) end +""" + Distributions.fit(::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) + +Fits a zero-inflated distribution to the given data. +""" function Distributions.fit( ::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) where {B, D} indices_0 = findall(x -> x == 0, data) From 501f4194bac5d8fce56a1409026803094fabf597 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 20 Dec 2024 17:57:16 +0100 Subject: [PATCH 116/266] add tests --- test/generated_tests/all.jl | 2 + test/generated_tests/test_distribution.jl | 81 ++++++++++++++++++ test/generated_tests/test_zero_inflated.jl | 96 ++++++++++++++++++++++ test/runtests.jl | 12 ++- 4 files changed, 187 insertions(+), 4 deletions(-) create mode 100644 test/generated_tests/all.jl create mode 100644 test/generated_tests/test_distribution.jl create mode 100644 test/generated_tests/test_zero_inflated.jl diff --git a/test/generated_tests/all.jl b/test/generated_tests/all.jl new file mode 100644 index 0000000..3a6cd57 --- /dev/null +++ b/test/generated_tests/all.jl @@ -0,0 +1,2 @@ +include("test_zero_inflated.jl") +include("test_distribution.jl") diff --git a/test/generated_tests/test_distribution.jl b/test/generated_tests/test_distribution.jl new file mode 100644 index 0000000..c9133df --- /dev/null +++ b/test/generated_tests/test_distribution.jl @@ -0,0 +1,81 @@ +using NetworkHistogram: ZeroInflated, DiscretizedDistribution, ZeroInflatedCategorical, + ncategories, Discretizer, encode, decode, binwidth, RegularDiscretizer, + CategoryDiscretizer, HybridDiscretizer, DiscretizerZeroToZero, nlabels +using Distributions +using Test + +@testset "ZeroInflated" begin + dist = ZeroInflated(0.3, truncated(Normal(0, 1), -3, 3)) + @test pdf(dist, 0) ≈ 
0.3 + 0.7 * pdf(truncated(Normal(0, 1), -3, 3), 0) + @test pdf(dist, 1) ≈ 0.7 * pdf(truncated(Normal(0, 1), -3, 3), 1) + @test cdf(dist, 0) ≈ 0.3 + 0.7 * cdf(truncated(Normal(0, 1), -3, 3), 0) + @test cdf(dist, 1) ≈ 0.3 + 0.7 * cdf(truncated(Normal(0, 1), -3, 3), 1) +end + +@testset "DiscretizedDistribution" begin + dist = DiscretizedDistribution(truncated(Normal(0, 1), -3, 3), 10) + @test ncategories(dist) == 10 + @test pdf(dist, 0) >= 0 + @test cdf(dist, 0) >= 0 +end + +@testset "ZeroInflatedCategorical" begin + dist = ZeroInflatedCategorical(0.3, Categorical([0.2, 0.3, 0.5])) + @test pdf(dist, 0) ≈ 0.3 + @test pdf(dist, 1) ≈ 0.7 * 0.2 + @test cdf(dist, 0) ≈ 0.3 + @test cdf(dist, 1) ≈ 0.3 + 0.7 * 0.2 +end + +@testset "ZeroInflatedDiscretizedDistribution" begin + dist = ZeroInflated(0.3, truncated(Normal(0, 1), -3, 3)) + disc_dist = DiscretizedDistribution(dist, 10) + @test ncategories(disc_dist) == 10 + @test pdf(disc_dist, 0) >= 0 + @test cdf(disc_dist, 0) >= 0 +end + +@testset "DiscretizedZeroInflatedCategorical" begin + dist = ZeroInflatedCategorical(0.3, Categorical([0.2, 0.3, 0.5])) + disc_dist = DiscretizedDistribution(dist, 10) + @test ncategories(disc_dist) == 10 + @test pdf(disc_dist, 0) >= 0 + @test cdf(disc_dist, 0) >= 0 +end + +@testset "Discretizer" begin + using Distributions + disc = RegularDiscretizer(10, 0.0, 1.0) + @test encode(disc, 0.05) == 1 + @test decode(disc, 1) == (0.0, 0.1) + @test binwidth(disc) == 0.1 + @test nlabels(disc) == 10 +end + +@testset "CategoryDiscretizer" begin + cat_to_bin = Dict("a" => 1, "b" => 2, "c" => 3) + bin_to_cat = Dict(1 => "a", 2 => "b", 3 => "c") + disc = CategoryDiscretizer(cat_to_bin, bin_to_cat) + @test encode(disc, "a") == 1 + @test decode(disc, 1) == "a" + @test nlabels(disc) == 3 +end + +@testset "HybridDiscretizer" begin + atoms = [0.0, 1.0] + disc = HybridDiscretizer(10, -1.0, 1.0, atoms) + @test encode(disc, 0.0) == 11 + @test encode(disc, 0.5) == 8 + @test decode(disc, 11) == 0.0 + @test 
all(isapprox.(decode(disc, 8), (0.4, 0.6); atol=1e-2)) + @test nlabels(disc) == 12 +end + +@testset "DiscretizerZeroToZero" begin + disc = DiscretizerZeroToZero(10, -1.0, 1.0) + @test encode(disc, 0.0) == 0 + @test encode(disc, 0.5) == 8 + @test decode(disc, 0) == 0.0 + @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol=1e-2)) + @test nlabels(disc) == 11 +end diff --git a/test/generated_tests/test_zero_inflated.jl b/test/generated_tests/test_zero_inflated.jl new file mode 100644 index 0000000..523b89c --- /dev/null +++ b/test/generated_tests/test_zero_inflated.jl @@ -0,0 +1,96 @@ +using Test +using Distributions +using Random +using NetworkHistogram: ZeroInflated, get_proba_zero + +@testset "ZeroInflated Distribution Tests" begin + @testset "continuous distribution" begin + # Test construction + dist = Normal(0, 1) + zero_inflated_dist = ZeroInflated(0.5, dist) + @test zero_inflated_dist.edge_proba == Bernoulli(0.5) + @test zero_inflated_dist.dist == dist + + # Test pdf + @test pdf(zero_inflated_dist, 0) ≈ 0.5 + 0.5 * pdf(dist, 0) + @test pdf(zero_inflated_dist, 1) ≈ 0.5 * pdf(dist, 1) + + # Test get_proba_zero + @test get_proba_zero(zero_inflated_dist) == 0.5 + + # Test rand + rng = MersenneTwister(1234) + sample = rand(rng, zero_inflated_dist) + @test sample == 0 || insupport(dist, sample) + + # Test logpdf + @test logpdf(zero_inflated_dist, 0) ≈ log(0.5* (1 + pdf(dist, 0))) + @test logpdf(zero_inflated_dist, 1) ≈ log(0.5 * pdf(dist, 1)) + + # Test minimum and maximum + @test minimum(zero_inflated_dist) == minimum(dist) + @test maximum(zero_inflated_dist) == maximum(dist) + + # Test insupport + @test insupport(zero_inflated_dist, 0) + @test insupport(zero_inflated_dist, 1) == insupport(dist, 1) + + # Test cdf + @test cdf(zero_inflated_dist, 0) ≈ 0.5 + 0.5 * cdf(dist, 0) + @test cdf(zero_inflated_dist, 1) ≈ 0.5 + 0.5 * cdf(dist, 1) + + # Test params + @test params(zero_inflated_dist) == (0.5, params(dist)...) 
+ + # Test fit + data = [0, 0, 1, 2, 3] + fitted_dist = fit(ZeroInflated{Bernoulli, Normal}, data, 2) + @test fitted_dist.edge_proba == Bernoulli(0.6) + @test fitted_dist.dist isa Normal + end + + @testset "discrete distribution" begin + # Test construction with discrete distribution + dist_disc = Poisson(3) + zero_inflated_dist_disc = ZeroInflated(0.5, dist_disc) + @test zero_inflated_dist_disc.edge_proba == Bernoulli(0.5) + @test zero_inflated_dist_disc.dist == dist_disc + + # Test pdf with discrete distribution + @test pdf(zero_inflated_dist_disc, 0) ≈ 0.5 + 0.5 * pdf(dist_disc, 0) + @test pdf(zero_inflated_dist_disc, 1) ≈ 0.5 * pdf(dist_disc, 1) + + # Test get_proba_zero with discrete distribution + @test get_proba_zero(zero_inflated_dist_disc) == 0.5 + + # Test rand with discrete distribution + rng = MersenneTwister(1234) + sample_disc = rand(rng, zero_inflated_dist_disc) + @test sample_disc == 0 || insupport(dist_disc, sample_disc) + + # Test logpdf with discrete distribution + @test logpdf(zero_inflated_dist_disc, 0) ≈ log(0.5 * (1 + pdf(dist_disc, 0))) + @test logpdf(zero_inflated_dist_disc, 1) ≈ log(0.5 * pdf(dist_disc, 1)) + + # Test minimum and maximum with discrete distribution + @test minimum(zero_inflated_dist_disc) == minimum(dist_disc) + @test maximum(zero_inflated_dist_disc) == maximum(dist_disc) + + # Test insupport with discrete distribution + @test insupport(zero_inflated_dist_disc, 0) + @test insupport(zero_inflated_dist_disc, 1) == insupport(dist_disc, 1) + + # Test cdf with discrete distribution + @test cdf(zero_inflated_dist_disc, 0) ≈ 0.5 + 0.5 * cdf(dist_disc, 0) + @test cdf(zero_inflated_dist_disc, 1) ≈ 0.5 + 0.5 * cdf(dist_disc, 1) + + # Test params with discrete distribution + @test params(zero_inflated_dist_disc) == (0.5, params(dist_disc)...) 
+ + # Test fit with discrete distribution + data_disc = [0, 0, 1, 2, 3] + fitted_dist_disc = fit(ZeroInflated{Bernoulli, Poisson}, data_disc, 2) + @test fitted_dist_disc.edge_proba == Bernoulli(0.6) + @test fitted_dist_disc.dist isa Poisson + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 3b39327..9f88662 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,6 @@ using SparseArrays include("TestNetworkHistogram.jl") @testset "Tests" begin - @testset "Discretizer tests" begin include("discretised_dist/discretizer.jl") end @@ -26,8 +25,13 @@ include("TestNetworkHistogram.jl") @testset "API tests" begin include("test_api.jl") end - @testset "Aqua.jl for package quality" begin - using NetworkHistogram - Aqua.test_all(NetworkHistogram) + + @testset "Generated tests" begin + include("generated_tests/all.jl") end + + # @testset "Aqua.jl for package quality" begin + # using NetworkHistogram + # Aqua.test_all(NetworkHistogram) + # end end From 994b942b3a9b824aba527eb3c600d9e51e34d181 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 20 Dec 2024 18:15:41 +0100 Subject: [PATCH 117/266] improve api --- benchmark/benchmarks.jl | 2 +- src/NetworkHistogram.jl | 1 + src/api.jl | 74 +++++++++++++- src/observations.jl | 138 +++++++++++++++++++------- src/optimisation/fit.jl | 52 ++++++++-- src/optimisation/least_squares.jl | 67 ++++++++++--- test/assignments/sparse_assignment.jl | 2 +- test/test_api.jl | 4 +- 8 files changed, 279 insertions(+), 61 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 0bb7d66..cc8828e 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -31,7 +31,7 @@ SUITE["Assignment"] = BenchmarkGroup(["assignment"]) Random.seed!(123451) stop_rule = NH.PreviousBestValue(200) -max_iter = 200 +iterations = 200 swap_rule = NH.RandomNodeSwap() accept_rule = NH.Strict() dist = Bernoulli(0.5) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 3746827..a67d34c 100644 --- 
a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -60,6 +60,7 @@ export BernoulliData, CategoricalData export Observations, discretise export DiscretizedDistribution +export Observations, estimate_graphon, nethist, nethist_discretised export bootstrap diff --git a/src/api.jl b/src/api.jl index 57826b3..e1024d5 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,3 +1,15 @@ +""" + _default_init(dist::Distribution, start = MetisStart()) + +Initialize the distribution with a default rule. + +# Arguments +- `dist::Distribution`: The distribution to initialize. +- `start`: The starting method. + +# Returns +- `InitRule`: The initialization rule. +""" function _default_init(dist::Distribution, start = MetisStart()) if dist isa Bernoulli return InitRule(start, Val{BernoulliData}()) @@ -10,6 +22,20 @@ function _default_init(dist::Distribution, start = MetisStart()) end end +""" + _nethist(g::Observations{G, D}, h; kwargs...) + +Estimate the graphon and fit the model to the given graph observations. + +# Arguments +- `g::Observations{G, D}`: The graph observations. +- `h`: Number of nodes per block. +- `kwargs...`: Additional keyword arguments. + +# Returns +- `fit_model`: The fitted model. +- `a`: The assignment of nodes to blocks. +""" function _nethist(g::Observations{G, D}, h; kwargs...) where {G, D} kwargs_dict = Dict(kwargs) start_clustering = pop!(kwargs_dict, :start_clustering, MetisStart()) @@ -20,9 +46,28 @@ function _nethist(g::Observations{G, D}, h; kwargs...) where {G, D} return fit(a, g), a end +""" + nethist(g::Observations{G, D}; h, iterations, stalled_iter, swap_rule, accept_rule, progress_bar, start_clustering) + +Fit a Stochastic Block Model (SBM) to the given graph observations. + +# Arguments +- `g::Observations{G, D}`: The graph observations. +- `h`: Number of nodes per block. +- `iterations`: Maximum number of iterations. +- `stalled_iter`: Number of stalled iterations before stopping. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. 
+- `accept_rule::AcceptRule`: Rule for accepting swaps. +- `progress_bar::Bool`: Whether to show a progress bar. +- `start_clustering`: Initial clustering method. + +# Returns +- `sbm`: The fitted SBM. +- `a`: The assignment of nodes to blocks. +""" function nethist(g::Observations{G, D}; h = select_number_node_per_block(g, EstimatedDegrees()), - max_iter = 100_000, + iterations = 100_000, stalled_iter = 1000, swap_rule::NodeSwapRule = RandomGroupSwap(), accept_rule::AcceptRule = Strict(), @@ -30,7 +75,7 @@ function nethist(g::Observations{G, D}; start_clustering = MetisStart() ) where {G, D} return _nethist(g, h; - max_iter = max_iter, + iterations = iterations, swap_rule = swap_rule, accept_rule = accept_rule, stop_rule = PreviousBestValue(stalled_iter), @@ -38,10 +83,31 @@ function nethist(g::Observations{G, D}; start_clustering = start_clustering) end +""" + nethist_discretised(g::Observations{G, D}; number_levels, h, iterations, stalled_iter, swap_rule, accept_rule, progress_bar, start_clustering) + +Fit a discretised Stochastic Block Model (SBM) to the given graph observations. + +# Arguments +- `g::Observations{G, D}`: The graph observations. +- `number_levels`: Number of levels for discretisation. +- `h`: Number of nodes per block. +- `iterations`: Maximum number of iterations. +- `stalled_iter`: Number of stalled iterations before stopping. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. +- `progress_bar::Bool`: Whether to show a progress bar. +- `start_clustering`: Initial clustering method. + +# Returns +- `sbm_discretise`: The fitted discretised SBM. +- `a`: The assignment of nodes to blocks. +- `discretiser`: The discretiser used. 
+""" function nethist_discretised(g::Observations{G, D}; number_levels = nothing, h = select_number_node_per_block(g, EstimatedDegrees()), - max_iter = 100_000, + iterations = 100_000, stalled_iter = 1000, swap_rule::NodeSwapRule = RandomGroupSwap(), accept_rule::AcceptRule = Strict(), @@ -52,7 +118,7 @@ function nethist_discretised(g::Observations{G, D}; obs_discrete, discretiser = discretise( g, number_groups = num_groups, number_levels = number_levels) sbm_discretise, a = _nethist(obs_discrete, h; - max_iter = max_iter, + iterations = iterations, swap_rule = swap_rule, accept_rule = accept_rule, stop_rule = PreviousBestValue(stalled_iter), diff --git a/src/observations.jl b/src/observations.jl index 4e23916..0155a37 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -4,20 +4,43 @@ struct Observations{G, D} dist_ref::D end -function number_nodes(g::Observations{AbstractGraph, D}) where {D} - return nv(g.graph) +""" + number_nodes(graph::Observations) + +Get the number of nodes in the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `num_nodes`: The number of nodes. +""" +function number_nodes(graph::Observations{AbstractGraph, D}) where {D} + return nv(graph.graph) end -function number_nodes(g::Observations) - return size(g.graph, 1) +function number_nodes(graph::Observations) + return size(graph.graph, 1) end -function get_obs(g::Observations, x::Tuple) - return get_obs(g, x[1], x[2]) +""" + get_obs(graph::Observations, x::Tuple) + +Get the observation for the given tuple of nodes. + +# Arguments +- `graph::Observations`: The graph observations. +- `x::Tuple`: The tuple of nodes. + +# Returns +- `obs`: The observation. 
+""" +function get_obs(graph::Observations, x::Tuple) + return get_obs(graph, x[1], x[2]) end -function get_obs(g::Observations, i::Int, j::Int) - return get_obs(g.graph, i, j) +function get_obs(graph::Observations, i::Int, j::Int) + return get_obs(graph.graph, i, j) end function get_obs(g::SimpleGraph, x::Tuple) @@ -31,7 +54,18 @@ end get_obs(g::AbstractArray, x) = get_obs(g, x[1], x[2]) get_obs(g::AbstractArray, i, j) = g[i, j] -density(g::Observations) = density(g.graph) +""" + density(graph::Observations) + +Get the density of the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `density`: The density of the graph. +""" +density(graph::Observations) = density(graph.graph) function density(g::AbstractGraph) return Graphs.density(g) end @@ -40,24 +74,46 @@ function density(g::AbstractMatrix) return sum(g) / ((size(g, 1) * (size(g, 1) - 1))) end -function get_degree(g::Observations{AbstractGraph, D}) where {D} - Graphs.degree(g.graph) +""" + get_degree(graph::Observations) + +Get the degree of each node in the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `degrees`: The degrees of the nodes. +""" +function get_degree(graph::Observations{AbstractGraph, D}) where {D} + Graphs.degree(graph.graph) end -function get_degree(g) - return sum(g.graph, dims = 2) +function get_degree(graph) + return sum(graph.graph, dims = 2) end -function get_adj(g::Observations{AbstractGraph, D}) where {D} - return Graphs.adjacency_matrix(g.graph) +""" + get_adj(graph::Observations) + +Get the adjacency matrix of the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `adj_matrix`: The adjacency matrix. 
+""" +function get_adj(graph::Observations{AbstractGraph, D}) where {D} + return Graphs.adjacency_matrix(graph.graph) end -function get_adj(g::Observations) - return g.graph +function get_adj(graph::Observations) + return graph.graph end -function normalized_laplacian(g::Observations) - return normalized_laplacian(g.graph) +function normalized_laplacian(graph::Observations) + return normalized_laplacian(graph.graph) end function normalized_laplacian(g::AbstractGraph) @@ -86,20 +142,20 @@ function normalized_laplacian(g::AbstractMatrix) return L end -function Metis.graph(g::Observations{<:AbstractGraph, <:Bernoulli}) - return Metis.graph(g.graph) +function Metis.graph(graph::Observations{<:AbstractGraph, <:Bernoulli}) + return Metis.graph(graph.graph) end function Metis.graph(g::Observations{<:AbstractMatrix, <:Bernoulli}) return Metis.graph(SimpleGraph(g.graph)) end -function Metis.graph(g::Observations{<:AbstractGraph, <:UnivariateDistribution}) - if minimum(g.dist_ref) < 0 +function Metis.graph(graph::Observations{<:AbstractGraph, <:UnivariateDistribution}) + if minimum(graph.dist_ref) < 0 @warn "Negative values are not allowed for MetisStart, using binary graph" - return Metis.graph(g.graph) + return Metis.graph(graph.graph) else - return Metis.graph(g.graph, weights = true) + return Metis.graph(graph.graph, weights = true) end end @@ -116,6 +172,20 @@ end """ + discretise(graph::Observations; number_groups, number_levels) + +Discretise the graph observations. + +# Arguments +- `graph::Observations`: The graph observations. +- `number_groups`: Number of groups for discretisation. +- `number_levels`: Number of levels for discretisation. + +# Returns +- `discretised_graph`: The discretised graph observations. +- `discretiser`: The discretiser used. + + Assume that the diagonal is zero. 0 indicates no edge, while missing indicates no information about the edge. By default maps 0 to 0. 
If you want another behaviour use the function where you @@ -123,33 +193,33 @@ pass a `Discretizer` object. number_levels will be the number of levels in the discretized distribution (excluding 0). """ -function discretise(g::Observations{G, D}; +function discretise(graph::Observations{G, D}; number_groups = nothing, number_levels = nothing) where {G, D} if isnothing(number_groups) && isnothing(number_levels) throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) end if isnothing(number_levels) - number_levels = round(Int,get_num_levels_from_groups(number_nodes(g), number_groups)) + number_levels = round(Int,get_num_levels_from_groups(number_nodes(graph), number_groups)) else if !isnothing(number_groups) @warn "disregarding `number_groups` as `number_levels` is provided" end end - return discretise(g, DiscretizerZeroToZero(number_levels, extrema(g.graph)...)) + return discretise(graph, DiscretizerZeroToZero(number_levels, extrema(graph.graph)...)) end -function discretise(g::Observations{G, D}, discretiser ::Discretizer) where {G,D<:UnivariateDistribution} - A_encoded = encode(discretiser, _graph_to_mat(g)) +function discretise(graph::Observations{G, D}, discretiser ::Discretizer) where {G,D<:UnivariateDistribution} + A_encoded = encode(discretiser, _graph_to_mat(graph)) return Observations(A_encoded, DiscretizedDistribution(discretiser)), discretiser end -function _graph_to_mat(g::Observations{<:AbstractGraph, D}) where {D<:UnivariateDistribution} - return weights(g.graph) +function _graph_to_mat(graph::Observations{<:AbstractGraph, D}) where {D<:UnivariateDistribution} + return weights(graph.graph) end -function _graph_to_mat(g::Observations{<:AbstractMatrix, D}) where {D<:UnivariateDistribution} - return g.graph +function _graph_to_mat(graph::Observations{<:AbstractMatrix, D}) where {D<:UnivariateDistribution} + return graph.graph end diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl index 547e925..7c8c29d 100644 --- 
a/src/optimisation/fit.jl +++ b/src/optimisation/fit.jl @@ -1,23 +1,43 @@ # Slow fallback methods for the Assignment type -# speed up by implementing specialized methods for the BernoulliAssignment type and others -# method to compute estimator from node clustering as specified in assignment +# Speed up by implementing specialized methods for the BernoulliAssignment type and others + +""" + fit(a::Assignment, g::Observations) + +Compute the estimator from node clustering as specified in the assignment. + +# Arguments +- `a::Assignment`: The assignment of nodes to blocks. +- `g::Observations`: The graph observations. + +# Returns +- `dists`: The fitted distributions. +""" function fit(a::Assignment, g::Observations) dists = initialize_sbm(a.group_size, g.dist_ref) fit!(dists, g, a) return dists end +""" + fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where {G,D,K,F} + +Fit the SBM to the given graph observations and assignment. + +# Arguments +- `sbm::BlockModel{D,K,F}`: The block model to fit. +- `g::Observations{G,D}`: The graph observations. +- `a::Assignment`: The assignment of nodes to blocks. 
+""" function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where {G,D,K,F} for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) edge_indices = get_edge_indices(a, group1, group2) - sbm[group1, - group2] = fit_group(g.dist_ref, g, edge_indices) + sbm[group1, group2] = fit_group(g.dist_ref, g, edge_indices) end end end - function fit_group(d::ZeroInflatedCategorical, g, edges) return Distributions.fit(typeof(d), get_obs.(Ref(g), edges), ncategories(g.dist_ref)) end @@ -26,12 +46,22 @@ function fit_group(distribution, g, edges) return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) end - function fit_group(distribution::Binomial, g, edges) return Distributions.fit(typeof(distribution), ntrials(distribution), get_obs.(Ref(g), edges)) end -# method to compute the log likelihood of a BlockModel fitted according to the assignment +""" + loglikelihood(a::Assignment, g::Observations) + +Compute the log likelihood of a BlockModel fitted according to the assignment. + +# Arguments +- `a::Assignment`: The assignment of nodes to blocks. +- `g::Observations`: The graph observations. + +# Returns +- `log_likelihood`: The log likelihood of the fitted model. +""" function loglikelihood(a::Assignment, g::Observations) return _log_likelihood(a, fit(a, g), g) end @@ -49,7 +79,15 @@ function _log_likelihood(a::Assignment, sbm::BlockModel, g) return log_likelihood end +""" + fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}) where {G,D,K,F} + +Fit the SBM to the given graph observations. +# Arguments +- `sbm::BlockModel{D,K,F}`: The block model to fit. +- `g::Observations{G,D}`: The graph observations. 
+""" function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}) where {G,D,K,F} k = number_blocks(sbm) a = estimate_graphon(g, select_number_node_per_block(g, OracleK(k))) diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index f68330e..0fd04a6 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -1,21 +1,53 @@ include("config_rules/include.jl") -function estimate_graphon(g, h = select_number_node_per_block(g, EstimatedDegrees()); - max_iter::Int = 10_000, +""" + estimate_graphon(graph, h; iterations, initialise_rule, swap_rule, accept_rule, stop_rule, progress_bar) + +Estimate the graphon for the given graph. + +# Arguments +- `graph`: The input graph. +- `h`: Number of nodes per block. +- `iterations`: Maximum number of iterations. +- `initialise_rule::InitRule`: Rule for initializing the assignment. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. +- `stop_rule::StopRule`: Rule for stopping the iterations. +- `progress_bar::Bool`: Whether to show a progress bar. + +# Returns +- `a`: The assignment of nodes to blocks. 
+""" +function estimate_graphon(graph, h = select_number_node_per_block(graph, EstimatedDegrees()); + iterations::Int = 10_000, initialise_rule::InitRule = InitRule(SpectralStart(), nothing), swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), stop_rule::StopRule = PreviousBestValue(1000), progress_bar::Bool = false ) - a = make_assignment(g, h, initialise_rule) - initialise_stop_rule!(stop_rule, a, g) + a = make_assignment(graph, h, initialise_rule) + initialise_stop_rule!(stop_rule, a, graph) greedy_improve!( - a, g; max_iter, swap_rule, accept_rule, stop_rule, progress_bar) + a, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) return a end -function greedy_improve!(a::Assignment, g; max_iter::Int = 10_000, +""" + greedy_improve!(a::Assignment, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) + +Perform greedy improvement on the assignment. + +# Arguments +- `a::Assignment`: The assignment of nodes to blocks. +- `graph`: The input graph. +- `iterations`: Maximum number of iterations. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. +- `stop_rule::StopRule`: Rule for stopping the iterations. +- `progress_bar::Bool`: Whether to show a progress bar. 
+""" +function greedy_improve!(a::Assignment, graph; iterations::Int = 10_000, swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict(), stop_rule::StopRule = PreviousBestValue(1000), @@ -25,25 +57,36 @@ function greedy_improve!(a::Assignment, g; max_iter::Int = 10_000, swap = make_swap(a, (1, 1)) p = ProgressUnknown(enabled = progress_bar, showspeed = true, desc = "Greedy search: ") # perform local search until the stopping rule is met - for i in 1:max_iter + for i in 1:iterations local_search!( - a, g, swap, swap_rule = swap_rule, accept_rule = accept_rule) + a, graph, swap, swap_rule = swap_rule, accept_rule = accept_rule) next!(p) - if stopping_rule(a, g, stop_rule) + if stopping_rule(a, graph, stop_rule) finish!(p) break end end end -# perform local search by trying a swap and accepting it if it improves the likelihood +""" + local_search!(a::Assignment, graph, swap; swap_rule, accept_rule) + +Perform local search by trying a swap and accepting it if it improves the likelihood. + +# Arguments +- `a::Assignment`: The assignment of nodes to blocks. +- `graph`: The input graph. +- `swap`: The swap object. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. 
+""" function local_search!( - a::Assignment, g, swap::Swap = make_swap(a, (1, 1)); + a::Assignment, graph, swap::Swap = make_swap(a, (1, 1)); swap_rule::NodeSwapRule = RandomNodeSwap(), accept_rule::AcceptRule = Strict() ) # select two nodes to swap and build the swap object make_swap!(swap, a, select_swap(a, swap_rule)) # perform the swap and accept it if it improves the likelihood - accept_reject_update!(a, swap, g, accept_rule) + accept_reject_update!(a, swap, graph, accept_rule) end diff --git a/test/assignments/sparse_assignment.jl b/test/assignments/sparse_assignment.jl index c7186ac..b23a988 100644 --- a/test/assignments/sparse_assignment.jl +++ b/test/assignments/sparse_assignment.jl @@ -17,7 +17,7 @@ using Random A[i, i] = 0 end g = NH.Observations(A_dense, Categorical(m)) - sbm_fitted, a = nethist(g; h = n ÷ k, max_iter = 10) + sbm_fitted, a = nethist(g; h = n ÷ k, iterations = 10) sparse_a = NH.SparseAssignment( NH.Observations(A, Categorical(m)), a.group_size, a.node_labels) @test a.additional_data.counts == sparse_a.additional_data.counts diff --git a/test/test_api.jl b/test/test_api.jl index db1a77a..e68b72b 100644 --- a/test/test_api.jl +++ b/test/test_api.jl @@ -6,13 +6,13 @@ end g = Observations(Symmetric(A), Uniform(-1, 1)) - sbm_fitted, a = nethist(g; h = 10, max_iter = 10) + sbm_fitted, a = nethist(g; h = 10, iterations = 10) @test eltype(sbm_fitted) == typeof(Uniform(-1, 1)) @test size(sbm_fitted) == (4,4) sbm_discretised, a, discretizer = nethist_discretised( - g; number_levels = 5, h = 10, max_iter = 10) + g; number_levels = 5, h = 10, iterations = 10) @test sbm_discretised[1,1] isa DiscretizedDistribution @test ncategories(sbm_discretised[1,1]) == 5 @test size(sbm_discretised) == (4,4) From 90cabac56c814dc86049a2ca451caeb55c9f7867 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 20 Dec 2024 18:43:02 +0100 Subject: [PATCH 118/266] clean observations --- src/observations.jl | 146 ++++++++++++++++++++++---------------------- 1 file changed, 
72 insertions(+), 74 deletions(-) diff --git a/src/observations.jl b/src/observations.jl index 0155a37..096fb55 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -1,4 +1,13 @@ -# remove all references to graphs, and only use sparse matrices ? +""" + Observations{G, D} + +A struct to hold observations for a network. The type parameter `G` represents the network + structure and must support indexing and the `size` function. + +# Fields +- `graph::G`: The network structure (e.g. adjacency matrix). +- `dist_ref::D`: distribution of the observations (used for getting support, type of elements, etc.) +""" struct Observations{G, D} graph::G dist_ref::D @@ -15,10 +24,6 @@ Get the number of nodes in the graph. # Returns - `num_nodes`: The number of nodes. """ -function number_nodes(graph::Observations{AbstractGraph, D}) where {D} - return nv(graph.graph) -end - function number_nodes(graph::Observations) return size(graph.graph, 1) end @@ -39,20 +44,22 @@ function get_obs(graph::Observations, x::Tuple) return get_obs(graph, x[1], x[2]) end -function get_obs(graph::Observations, i::Int, j::Int) - return get_obs(graph.graph, i, j) -end +""" + get_obs(graph::Observations, i::Int, j::Int) -function get_obs(g::SimpleGraph, x::Tuple) - return get_obs(g, x[1], x[2]) -end +Get the observation for the given pair of nodes. -function get_obs(g::SimpleGraph, i::Int, j::Int) - return convert(Bool, has_edge(g, i, j)) -end +# Arguments +- `graph::Observations`: The graph observations. +- `i::Int`: The first node. +- `j::Int`: The second node. -get_obs(g::AbstractArray, x) = get_obs(g, x[1], x[2]) -get_obs(g::AbstractArray, i, j) = g[i, j] +# Returns +- `obs`: The observation. +""" +function get_obs(graph::Observations, i::Int, j::Int) + return graph.graph[i, j] +end """ density(graph::Observations) @@ -65,13 +72,8 @@ Get the density of the graph. # Returns - `density`: The density of the graph. 
""" -density(graph::Observations) = density(graph.graph) -function density(g::AbstractGraph) - return Graphs.density(g) -end - -function density(g::AbstractMatrix) - return sum(g) / ((size(g, 1) * (size(g, 1) - 1))) +function density(graph::Observations) + return sum(graph.graph) / ((size(graph.graph, 1) * (size(graph.graph, 1) - 1))) end """ @@ -85,11 +87,7 @@ Get the degree of each node in the graph. # Returns - `degrees`: The degrees of the nodes. """ -function get_degree(graph::Observations{AbstractGraph, D}) where {D} - Graphs.degree(graph.graph) -end - -function get_degree(graph) +function get_degree(graph::Observations) return sum(graph.graph, dims = 2) end @@ -104,14 +102,12 @@ Get the adjacency matrix of the graph. # Returns - `adj_matrix`: The adjacency matrix. """ -function get_adj(graph::Observations{AbstractGraph, D}) where {D} - return Graphs.adjacency_matrix(graph.graph) -end - function get_adj(graph::Observations) return graph.graph end + + function normalized_laplacian(graph::Observations) return normalized_laplacian(graph.graph) end @@ -123,18 +119,29 @@ end normalized_laplacian(g::CategoricalArray) = normalized_laplacian(levelcode.(g)) -function normalized_laplacian(g::AbstractMatrix) - degrees = sum(g, dims = 1) +""" + normalized_laplacian(graph::Observations) + +Get the normalized Laplacian of the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `L`: The normalized Laplacian matrix. 
+""" +function normalized_laplacian(graph::AbstractMatrix) + degrees = sum(graph, dims = 1) degrees .-= minimum(degrees) - n = size(g, 1) - L = similar(g, Float64) + n = size(graph, 1) + L = similar(graph, Float64) for j in 1:n for i in 1:n if i == j L[i, j] = 1 elseif degrees[i] == 0 || degrees[j] == 0 L[i, j] = 0 - elseif g[i, j] != 0 + elseif graph[i, j] != 0 L[i, j] = -1 / sqrt(degrees[i] * degrees[j]) end end @@ -142,32 +149,13 @@ function normalized_laplacian(g::AbstractMatrix) return L end -function Metis.graph(graph::Observations{<:AbstractGraph, <:Bernoulli}) - return Metis.graph(graph.graph) -end - -function Metis.graph(g::Observations{<:AbstractMatrix, <:Bernoulli}) - return Metis.graph(SimpleGraph(g.graph)) -end - -function Metis.graph(graph::Observations{<:AbstractGraph, <:UnivariateDistribution}) +function Metis.graph(graph::Observations{G, <:UnivariateDistribution}) where {G} + use_weights = true if minimum(graph.dist_ref) < 0 @warn "Negative values are not allowed for MetisStart, using binary graph" - return Metis.graph(graph.graph) - else - return Metis.graph(graph.graph, weights = true) + use_weights = false end -end - -function Metis.graph(g::Observations{<:AbstractMatrix, <:UnivariateDistribution}) - return Metis.graph( - weights(SimpleWeightedGraph(g.graph)), weights = true) -end - -function Metis.graph(g::Observations{<:CategoricalMatrix, <:UnivariateFinite}) - A, _ = categorical_matrix(g) - return Metis.graph( - adjacency_matrix(SimpleWeightedGraph(A)), weights = true) + return Metis.graph(sparse(graph.graph), weights = use_weights) end @@ -185,7 +173,6 @@ Discretise the graph observations. - `discretised_graph`: The discretised graph observations. - `discretiser`: The discretiser used. - Assume that the diagonal is zero. 0 indicates no edge, while missing indicates no information about the edge. By default maps 0 to 0. If you want another behaviour use the function where you @@ -193,13 +180,12 @@ pass a `Discretizer` object. 
number_levels will be the number of levels in the discretized distribution (excluding 0). """ -function discretise(graph::Observations{G, D}; - number_groups = nothing, number_levels = nothing) where {G, D} +function discretise(graph::Observations; number_groups = nothing, number_levels = nothing) if isnothing(number_groups) && isnothing(number_levels) throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) end if isnothing(number_levels) - number_levels = round(Int,get_num_levels_from_groups(number_nodes(graph), number_groups)) + number_levels = round(Int, get_num_levels_from_groups(number_nodes(graph), number_groups)) else if !isnothing(number_groups) @warn "disregarding `number_groups` as `number_levels` is provided" @@ -208,23 +194,35 @@ function discretise(graph::Observations{G, D}; return discretise(graph, DiscretizerZeroToZero(number_levels, extrema(graph.graph)...)) end -function discretise(graph::Observations{G, D}, discretiser ::Discretizer) where {G,D<:UnivariateDistribution} - A_encoded = encode(discretiser, _graph_to_mat(graph)) - return Observations(A_encoded, DiscretizedDistribution(discretiser)), discretiser -end +""" + discretise(graph::Observations, discretiser::Discretizer) +Discretise the graph observations using the given discretiser. -function _graph_to_mat(graph::Observations{<:AbstractGraph, D}) where {D<:UnivariateDistribution} - return weights(graph.graph) -end +# Arguments +- `graph::Observations`: The graph observations. +- `discretiser::Discretizer`: The discretiser to use. -function _graph_to_mat(graph::Observations{<:AbstractMatrix, D}) where {D<:UnivariateDistribution} - return graph.graph +# Returns +- `discretised_graph`: The discretised graph observations. +- `discretiser`: The discretiser used. 
+""" +function discretise(graph::Observations, discretiser::Discretizer) + A_encoded = encode(discretiser, graph.graph) + return Observations(A_encoded, DiscretizedDistribution(discretiser)), discretiser end - """ + get_num_levels_from_groups(n, number_groups) + Get the number of levels for the discretized distribution given n and k. + +# Arguments +- `n`: The number of nodes. +- `number_groups`: The number of groups. + +# Returns +- `num_levels`: The number of levels. """ function get_num_levels_from_groups(n, number_groups) return max(1, n^(0.5 * (1 - log(number_groups) / log(n)))) From 49671e2845029f1cfc8c256c944b19501109b91b Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 20 Jan 2025 11:09:10 +0100 Subject: [PATCH 119/266] add julia compat --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 2412ed8..a5d2d0c 100644 --- a/Project.toml +++ b/Project.toml @@ -29,6 +29,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] LossFunctions = "1.0.1" +julia = "1.11" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" From 64266288b9201ec4ab5d6e30486c971c1247f9dc Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 20 Jan 2025 11:14:08 +0100 Subject: [PATCH 120/266] format --- src/NetworkHistogram.jl | 8 +++--- src/api.jl | 5 ++-- src/assignments/Assignments.jl | 9 +++---- src/assignments/BernoulliAssignment/struct.jl | 5 +--- .../CategoricalAssignment/struct.jl | 3 ++- src/assignments/CategoricalAssignment/swap.jl | 15 +++++++---- src/assignments/SparseAssignment/struct.jl | 19 +++++++++----- src/assignments/SparseAssignment/swap.jl | 8 +++--- src/bootstrap.jl | 3 ++- src/distributions/categorical_with_0.jl | 4 ++- src/distributions/discrete_dist.jl | 16 ++++++++---- src/distributions/discretizer.jl | 19 +++++++------- src/distributions/zero_inflated.jl | 3 ++- src/observations.jl | 26 ++++++++++--------- src/optimisation/config_rules/InitRule.jl | 4 +-- src/optimisation/fit.jl | 12 ++++++--- 
src/optimisation/least_squares.jl | 6 +++-- src/sbm.jl | 20 +++++++------- 18 files changed, 108 insertions(+), 77 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index a67d34c..7bb5e0f 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -16,12 +16,13 @@ import StatsAPI: loglikelihood, fit using CategoricalArrays, CategoricalDistributions using Combinatorics: permutations using StaticArrays -using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx, nrun, zeros_tuple +using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx, nrun, + zeros_tuple import Bootstrap: bootstrap import Base.maximum, Base.minimum import Random: rand import Base.convert -import Distributions: pdf,logpdf, ncategories, cdf, rand +import Distributions: pdf, logpdf, ncategories, cdf, rand include("distributions/include.jl") include("assignments/Assignments.jl") @@ -49,7 +50,8 @@ export Strict # stopping rules export PreviousBestValue # bandwidth selection rules -export OracleK, EstimatedEigenvalues, EstimatedDegrees, select_number_node_per_block +export OracleK, EstimatedEigenvalues, EstimatedDegrees, + select_number_node_per_block # random local search rules export RandomNodeSwap, RandomGroupSwap diff --git a/src/api.jl b/src/api.jl index e1024d5..01b783e 100644 --- a/src/api.jl +++ b/src/api.jl @@ -15,7 +15,7 @@ function _default_init(dist::Distribution, start = MetisStart()) return InitRule(start, Val{BernoulliData}()) elseif dist isa Categorical return InitRule(start, Val{CategoricalData}()) - elseif dist isa DiscretizedDistribution || dist isa ZeroInflatedCategorical + elseif dist isa DiscretizedDistribution || dist isa ZeroInflatedCategorical return InitRule(start, Val{SparseData}()) else return InitRule(start, nothing) @@ -40,7 +40,8 @@ function _nethist(g::Observations{G, D}, h; kwargs...) 
where {G, D} kwargs_dict = Dict(kwargs) start_clustering = pop!(kwargs_dict, :start_clustering, MetisStart()) initialise_rule = pop!( - kwargs_dict, :initialise_rule, _default_init(g.dist_ref, start_clustering)) + kwargs_dict, :initialise_rule, _default_init( + g.dist_ref, start_clustering)) a = estimate_graphon(g, h; kwargs_dict..., initialise_rule = initialise_rule) return fit(a, g), a diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl index 9645ece..c312324 100644 --- a/src/assignments/Assignments.jl +++ b/src/assignments/Assignments.jl @@ -93,13 +93,12 @@ Base.@propagate_inbounds function Base.getindex(a::Assignment, i) return get_vertex_in_group(a, i) end -function get_ordered_adjacency_matrix(a::Assignment, A, by=identity) - perm = sortperm(a.node_labels, by=by) +function get_ordered_adjacency_matrix(a::Assignment, A, by = identity) + perm = sortperm(a.node_labels, by = by) return A[perm, perm] end - - function Base.deepcopy(a::Assignment) - return Assignment(a.group_size, copy(a.node_labels), deepcopy(a.additional_data)) + return Assignment( + a.group_size, copy(a.node_labels), deepcopy(a.additional_data)) end diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl index 390c8c6..73fab50 100644 --- a/src/assignments/BernoulliAssignment/struct.jl +++ b/src/assignments/BernoulliAssignment/struct.jl @@ -99,13 +99,10 @@ end include("swap.jl") - -function get_ordered_adjacency_matrix(a::BernoulliAssignment, by=identity) +function get_ordered_adjacency_matrix(a::BernoulliAssignment, by = identity) return get_ordered_adjacency_matrix(a, a.additional_data.A, by) end - - # TODO: move to sparse structure to encode difference between 0 weight and absence of edge # from docs: # A = sparse(I,J,V) diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl index d44b5f7..5ccd175 100644 --- a/src/assignments/CategoricalAssignment/struct.jl +++ 
b/src/assignments/CategoricalAssignment/struct.jl @@ -12,7 +12,8 @@ const CategoricalAssignment{T, F, C} = Assignment{ const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} function CategoricalAssignment( - g::Observations{G,D}, group_size::GroupSize, node_labels::Vector{Int}) where {G,D} + g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where { + G, D} categorical_data = make_categorical_data(g, node_labels, group_size) return Assignment(group_size, node_labels, categorical_data) end diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl index 6b70eb6..43b6cd9 100644 --- a/src/assignments/CategoricalAssignment/swap.jl +++ b/src/assignments/CategoricalAssignment/swap.jl @@ -67,7 +67,8 @@ function fit( end function fit( - a::CategoricalAssignment{T, F, C}, g::Observations{G, <:DiscretizedDistribution}) where { + a::CategoricalAssignment{T, F, C}, g::Observations{ + G, <:DiscretizedDistribution}) where { T, F, C, G} dists = initialize_sbm( a.group_size, g.dist_ref) @@ -87,8 +88,10 @@ function _move_connection!(realized, group_origin, group_dest, scratch) for label in axes(realized, 1) realized[label, group, group_origin] -= scratch[label, group] realized[label, group, group_dest] += scratch[label, group] - realized[label, group_origin, group] = realized[label, group, group_origin] - realized[label, group_dest, group] = realized[label, group, group_dest] + realized[label, group_origin, group] = realized[ + label, group, group_origin] + realized[label, group_dest, group] = realized[ + label, group, group_dest] end end end @@ -115,7 +118,8 @@ function new_update_observed_and_labels!( a.additional_data.scratch[obs, group_inter] += 1 end end - _move_connection!(a.additional_data.realized, g1, g2, a.additional_data.scratch) + _move_connection!( + a.additional_data.realized, g1, g2, a.additional_data.scratch) a.additional_data.scratch .= 0 for i in axes(a.additional_data.A, 1) @@ 
-128,7 +132,8 @@ function new_update_observed_and_labels!( a.additional_data.scratch[obs, group_inter] += 1 end end - _move_connection!(a.additional_data.realized, g2, g1, a.additional_data.scratch) + _move_connection!( + a.additional_data.realized, g2, g1, a.additional_data.scratch) _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, a.additional_data.counts) diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl index ca816d0..5f6dcce 100644 --- a/src/assignments/SparseAssignment/struct.jl +++ b/src/assignments/SparseAssignment/struct.jl @@ -13,10 +13,12 @@ const SparseAssignment{T, F, C} = Assignment{ const SparseInitRule{S, F} = InitRule{S, Val{SparseData}} function SparseAssignment( - g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where {G, D} + g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where { + G, D} A = issparse(g.graph) ? g.graph : sparse(g.graph) num_levels = ncategories(g.dist_ref) - sparse_data = SparseData(A, size(group_size, 1), num_levels, group_size, node_labels) + sparse_data = SparseData( + A, size(group_size, 1), num_levels, group_size, node_labels) return Assignment(group_size, node_labels, sparse_data) end @@ -31,7 +33,8 @@ function SparseData(A::SparseMatrixCSC{T, Int}, k::Int, level_count::Int, group_size, node_labels) where {T} n = size(A, 1) data = SparseData(zeros(Int, k, k), zeros(Int, level_count, k, k), - zeros(Float64, level_count, k, k), dropzeros(A), zeros(Int, level_count, k), zeros( + zeros(Float64, level_count, k, k), dropzeros(A), zeros( + Int, level_count, k), zeros( Int, k), 0.0) _count_possible_occurences!(data, group_size) _count_occurences!(data, node_labels) @@ -59,22 +62,24 @@ function _count_occurences!(data, node_labels) node_group_k = findall(x -> x == k, node_labels) node_group_l = findall(x -> x == l, node_labels) if k != l - counts = StatsBase.countmap(data.A[i,j] for i in node_group_k for j in 
node_group_l if i != j) + counts = StatsBase.countmap(data.A[i, j] for i in node_group_k + for j in node_group_l if i != j) else - counts = StatsBase.countmap(data.A[i,j] for i in node_group_k for j in node_group_l if i < j) + counts = StatsBase.countmap(data.A[i, j] for i in node_group_k + for j in node_group_l if i < j) end for m in 1:size(data.realized, 1) data.realized[m, k, l] = get(counts, m, 0) data.realized[m, l, k] = get(counts, m, 0) end - total_witouth_missing = sum(values(counts)) - get(counts, missing, 0) + total_witouth_missing = sum(values(counts)) - + get(counts, missing, 0) data.counts[k, l] = total_witouth_missing data.counts[l, k] = total_witouth_missing end end end - function compute_log_likelihood_without_0( estimated_theta::Array{T, 3}, realized::Array{F, 3}, counts) where { T, F} diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index 796d8ce..be7b78f 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -102,7 +102,8 @@ function _update_counts!(counts, g_from, g_to, missing_update) end function fit(a::SparseAssignment, g::Observations) - dists = initialize_sbm(a.group_size, ZeroInflatedCategorical(_n_decorations_not_0(a))) + dists = initialize_sbm( + a.group_size, ZeroInflatedCategorical(_n_decorations_not_0(a))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = a.additional_data.estimated_theta[:, group1, group2] @@ -113,14 +114,15 @@ function fit(a::SparseAssignment, g::Observations) return dists end -function fit(a::SparseAssignment, g::Observations{G, <:DiscretizedDistribution}) where {G} +function fit(a::SparseAssignment, + g::Observations{G, <:DiscretizedDistribution}) where {G} dists = initialize_sbm(a.group_size, DiscretizedDistribution( g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_not_0(a)))) for group1 in 1:number_groups(a) for group2 in 1:number_groups(a) theta = 
a.additional_data.estimated_theta[:, group1, group2] - p = clamp(1 - sum(theta),0,1) + p = clamp(1 - sum(theta), 0, 1) dists[group1, group2] = DiscretizedDistribution( g.dist_ref.discretizer, ZeroInflatedCategorical(p, theta)) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 2cb7fc5..77086dd 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -1,4 +1,5 @@ -function bootstrap(statistic::Function, data::AbstractMatrix, model::BlockModel, +function bootstrap( + statistic::Function, data::AbstractMatrix, model::BlockModel, sampling::BootstrapSampling) t0 = tx(statistic(data)) m = nrun(sampling) diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl index d76f046..045d45b 100644 --- a/src/distributions/categorical_with_0.jl +++ b/src/distributions/categorical_with_0.jl @@ -51,7 +51,9 @@ function ZeroInflatedCategorical(vec_probs::AbstractVector) ZeroInflatedCategorical(vec_probs[1], vec_probs[2:end]) end -ZeroInflatedCategorical(k::Int) = ZeroInflatedCategorical(ones(k + 1) ./ (k + 1)) +function ZeroInflatedCategorical(k::Int) + ZeroInflatedCategorical(ones(k + 1) ./ (k + 1)) +end """ Distributions.pdf(d::ZeroInflatedCategorical, x::Real) diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl index 75fe1ee..55af36b 100644 --- a/src/distributions/discrete_dist.jl +++ b/src/distributions/discrete_dist.jl @@ -21,7 +21,8 @@ mutable struct DiscretizedDistribution{D, L} <: probs::L end -function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) where {D} +function DiscretizedDistribution( + d::D, n_bins::Int, support_bound = extrema(d)) where {D} disc = DiscretizerZeroToZero(n_bins, support_bound...) 
ps = zeros(non_zero_labels_counts(disc)) for i in 1:non_zero_labels_counts(disc) @@ -32,7 +33,8 @@ function DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d)) return DiscretizedDistribution(disc, probs) end -function DiscretizedDistribution(d::ZeroInflated, n_bins::Int, support_bound = extrema(d)) +function DiscretizedDistribution( + d::ZeroInflated, n_bins::Int, support_bound = extrema(d)) disc = DiscretizerZeroToZero(n_bins, support_bound...) ps = zeros(non_zero_labels_counts(disc)) for i in 1:non_zero_labels_counts(disc) @@ -62,7 +64,9 @@ minimum(d::DiscretizedDistribution) = minimum(d.discretizer) maximum(d::DiscretizedDistribution) = maximum(d.discretizer) -insupport(d::DiscretizedDistribution, x::Real) = support_encoding(d.discretizer, x) +function insupport(d::DiscretizedDistribution, x::Real) + support_encoding(d.discretizer, x) +end function Base.convert(::Type{DiscretizedDistribution}, d::D) where {D} return DiscretizedDistribution(d, 10) @@ -70,7 +74,8 @@ end ncategories(d::DiscretizedDistribution) = ncategories(d.probs) -function Distributions.fit(::Type{<:DiscretizedDistribution{D, L}}, data) where {D, L} +function Distributions.fit( + ::Type{<:DiscretizedDistribution{D, L}}, data) where {D, L} return fit(L, data) end @@ -122,7 +127,8 @@ The `cdf` of the discretized distribution is computed as: - `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` """ function Distributions.cdf( - d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} + d::DiscretizedDistribution{D, P}, x::Real) where { + D, P <: ZeroInflatedCategorical} x < minimum(d) && return zero(x) x > maximum(d) && return one(x) bin = encode(d.discretizer, x) diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl index b27867f..e039930 100644 --- a/src/distributions/discretizer.jl +++ b/src/distributions/discretizer.jl @@ -21,7 +21,8 @@ struct RegularDiscretizer{F, T, L} 
<: Discretizer bin_width::F end -function RegularDiscretizer(n_bins::Int, lower_bound::F, upper_bound::F) where {F} +function RegularDiscretizer( + n_bins::Int, lower_bound::F, upper_bound::F) where {F} if !isfinite(lower_bound) || !isfinite(upper_bound) throw(ArgumentError("RegularDiscretizer requires finite lower and upper bounds.")) end @@ -50,7 +51,8 @@ function encode(d::RegularDiscretizer, x::Real) return d.bin_labels[convert(Int, div(x - d.lower_bound, d.bin_width) + 1)] end -function _decode_randomly(rng::Random.AbstractRNG, d::RegularDiscretizer, bin::Int) +function _decode_randomly( + rng::Random.AbstractRNG, d::RegularDiscretizer, bin::Int) hi, lo = decode(d, bin) return lo + (hi - lo) * rand(rng) end @@ -60,7 +62,8 @@ function binwidth(d::RegularDiscretizer) end function decode(d::RegularDiscretizer, bin::Int) - return (d.lower_bound + (bin - 1) * d.bin_width, d.lower_bound + bin * d.bin_width) + return (d.lower_bound + (bin - 1) * d.bin_width, + d.lower_bound + bin * d.bin_width) end function encode(d::RegularDiscretizer, x::AbstractArray{Real}) @@ -109,7 +112,7 @@ function nlabels(d::CategoryDiscretizer) return length(d.bin_to_cat) end -function binwidth(d::CategoryDiscretizer{F,T}, x::T) where {F,T} +function binwidth(d::CategoryDiscretizer{F, T}, x::T) where {F, T} return length(d.bin_to_cat[x]) end @@ -209,7 +212,8 @@ function decode(d::HybridDiscretizer, bin::Int) end end -function _decode_randomly(rng::Random.AbstractRNG, d::HybridDiscretizer, bin::Int) +function _decode_randomly( + rng::Random.AbstractRNG, d::HybridDiscretizer, bin::Int) if haskey(d.cat.bin_to_cat, bin) return decode(d.cat, bin) else @@ -226,18 +230,15 @@ function auto_nbins(data) return nbins end - function progress_in_bin(d::CategoryDiscretizer, x::Real, bin) - return one(x) + return one(x) end - function progress_in_bin(d::RegularDiscretizer, x::Real, bin) lo, hi = decode(d, bin) return (x - lo) / (hi - lo) end - function progress_in_bin(d::HybridDiscretizer, x::Real, bin) if 
haskey(d.cat.bin_to_cat, bin) return progress_in_bin(d.cat, x, bin) diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 260fe74..9cc4720 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -85,7 +85,8 @@ function Distributions.fit( indices_0 = findall(x -> x == 0, data) p = length(indices_0) / length(data) if p != 1 - return ZeroInflated(p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) + return ZeroInflated( + p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) else return ZeroInflated(1.0, D()) end diff --git a/src/observations.jl b/src/observations.jl index 096fb55..120642c 100644 --- a/src/observations.jl +++ b/src/observations.jl @@ -73,7 +73,8 @@ Get the density of the graph. - `density`: The density of the graph. """ function density(graph::Observations) - return sum(graph.graph) / ((size(graph.graph, 1) * (size(graph.graph, 1) - 1))) + return sum(graph.graph) / + ((size(graph.graph, 1) * (size(graph.graph, 1) - 1))) end """ @@ -106,8 +107,6 @@ function get_adj(graph::Observations) return graph.graph end - - function normalized_laplacian(graph::Observations) return normalized_laplacian(graph.graph) end @@ -116,7 +115,6 @@ function normalized_laplacian(g::AbstractGraph) return normalized_laplacian(Graphs.adjacency_matrix(g)) end - normalized_laplacian(g::CategoricalArray) = normalized_laplacian(levelcode.(g)) """ @@ -149,16 +147,16 @@ function normalized_laplacian(graph::AbstractMatrix) return L end -function Metis.graph(graph::Observations{G, <:UnivariateDistribution}) where {G} +function Metis.graph(graph::Observations{ + G, <:UnivariateDistribution}) where {G} use_weights = true if minimum(graph.dist_ref) < 0 @warn "Negative values are not allowed for MetisStart, using binary graph" use_weights = false end - return Metis.graph(sparse(graph.graph), weights = use_weights) + return Metis.graph(sparse(graph.graph), weights = use_weights) end - """ 
discretise(graph::Observations; number_groups, number_levels) @@ -180,18 +178,21 @@ pass a `Discretizer` object. number_levels will be the number of levels in the discretized distribution (excluding 0). """ -function discretise(graph::Observations; number_groups = nothing, number_levels = nothing) +function discretise( + graph::Observations; number_groups = nothing, number_levels = nothing) if isnothing(number_groups) && isnothing(number_levels) throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) end if isnothing(number_levels) - number_levels = round(Int, get_num_levels_from_groups(number_nodes(graph), number_groups)) + number_levels = round(Int, + get_num_levels_from_groups(number_nodes(graph), number_groups)) else if !isnothing(number_groups) @warn "disregarding `number_groups` as `number_levels` is provided" end end - return discretise(graph, DiscretizerZeroToZero(number_levels, extrema(graph.graph)...)) + return discretise( + graph, DiscretizerZeroToZero(number_levels, extrema(graph.graph)...)) end """ @@ -209,7 +210,8 @@ Discretise the graph observations using the given discretiser. """ function discretise(graph::Observations, discretiser::Discretizer) A_encoded = encode(discretiser, graph.graph) - return Observations(A_encoded, DiscretizedDistribution(discretiser)), discretiser + return Observations(A_encoded, DiscretizedDistribution(discretiser)), + discretiser end """ @@ -225,5 +227,5 @@ Get the number of levels for the discretized distribution given n and k. - `num_levels`: The number of levels. 
""" function get_num_levels_from_groups(n, number_groups) - return max(1, n^(0.5 * (1 - log(number_groups) / log(n)))) + return max(1, n^(0.5 * (1 - log(number_groups) / log(n)))) end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl index 7cc4329..c60e144 100644 --- a/src/optimisation/config_rules/InitRule.jl +++ b/src/optimisation/config_rules/InitRule.jl @@ -54,7 +54,7 @@ function initialize_node_labels(g, h, ::SpectralStart) node_labels = zeros(Int, number_nodes(g)) laplacian = normalized_laplacian(g) - decomp, = partialschur(laplacian, nev=2, which=:LR) + decomp, = partialschur(laplacian, nev = 2, which = :LR) # get 2nd eigenvector, sort its components indices = sortperm(real.(decomp.Q[:, 2])) @@ -92,8 +92,6 @@ function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) return group_size, node_labels end - - function initialize_node_labels(g, h, ::BiasAdjustedSoS) # implement method from Bias-adjusted spectral clustering in multilayer stochastic block # models diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl index 7c8c29d..8e5985a 100644 --- a/src/optimisation/fit.jl +++ b/src/optimisation/fit.jl @@ -29,7 +29,8 @@ Fit the SBM to the given graph observations and assignment. - `g::Observations{G,D}`: The graph observations. - `a::Assignment`: The assignment of nodes to blocks. 
""" -function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where {G,D,K,F} +function fit!(sbm::BlockModel{D, K, F}, g::Observations{G, D}, + a::Assignment) where {G, D, K, F} for group1 in 1:number_groups(a) for group2 in group1:number_groups(a) edge_indices = get_edge_indices(a, group1, group2) @@ -39,7 +40,8 @@ function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where end function fit_group(d::ZeroInflatedCategorical, g, edges) - return Distributions.fit(typeof(d), get_obs.(Ref(g), edges), ncategories(g.dist_ref)) + return Distributions.fit( + typeof(d), get_obs.(Ref(g), edges), ncategories(g.dist_ref)) end function fit_group(distribution, g, edges) @@ -47,7 +49,8 @@ function fit_group(distribution, g, edges) end function fit_group(distribution::Binomial, g, edges) - return Distributions.fit(typeof(distribution), ntrials(distribution), get_obs.(Ref(g), edges)) + return Distributions.fit( + typeof(distribution), ntrials(distribution), get_obs.(Ref(g), edges)) end """ @@ -88,7 +91,8 @@ Fit the SBM to the given graph observations. - `sbm::BlockModel{D,K,F}`: The block model to fit. - `g::Observations{G,D}`: The graph observations. """ -function fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}) where {G,D,K,F} +function fit!( + sbm::BlockModel{D, K, F}, g::Observations{G, D}) where {G, D, K, F} k = number_blocks(sbm) a = estimate_graphon(g, select_number_node_per_block(g, OracleK(k))) fit!(sbm, g, a) diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index 0fd04a6..9369333 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -18,7 +18,8 @@ Estimate the graphon for the given graph. # Returns - `a`: The assignment of nodes to blocks. 
""" -function estimate_graphon(graph, h = select_number_node_per_block(graph, EstimatedDegrees()); +function estimate_graphon( + graph, h = select_number_node_per_block(graph, EstimatedDegrees()); iterations::Int = 10_000, initialise_rule::InitRule = InitRule(SpectralStart(), nothing), swap_rule::NodeSwapRule = RandomNodeSwap(), @@ -55,7 +56,8 @@ function greedy_improve!(a::Assignment, graph; iterations::Int = 10_000, ) # swap memory allocation swap = make_swap(a, (1, 1)) - p = ProgressUnknown(enabled = progress_bar, showspeed = true, desc = "Greedy search: ") + p = ProgressUnknown( + enabled = progress_bar, showspeed = true, desc = "Greedy search: ") # perform local search until the stopping rule is met for i in 1:iterations local_search!( diff --git a/src/sbm.jl b/src/sbm.jl index 66e0f61..1fad688 100644 --- a/src/sbm.jl +++ b/src/sbm.jl @@ -6,7 +6,8 @@ struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} probs::SymmetricTensor{T, K, 2} end -function BlockModel(θ::AbstractMatrix{T}, sizes::Vector{F}) where {T, F <: Real} +function BlockModel( + θ::AbstractMatrix{T}, sizes::Vector{F}) where {T, F <: Real} return BlockModel(sizes, SymmetricTensor([θ[i, j] for i in 1:size(θ, 1) for j in i:size(θ, 2)], Val(length(sizes)), Val(2))) @@ -70,8 +71,8 @@ function sample( return sparse(Symmetric(A, :L)), node_labels end - -function draw_and_fill!(rng::Random.AbstractRNG, A, sbm::BlockModel, sorted = false) +function draw_and_fill!( + rng::Random.AbstractRNG, A, sbm::BlockModel, sorted = false) n_blocks = number_blocks(sbm) n_nodes = size(A, 1) node_labels = StatsBase.sample( @@ -87,7 +88,9 @@ function draw_and_fill!(rng::Random.AbstractRNG, A, sbm::BlockModel, sorted = fa A .= Symmetric(A, :L) end -draw_and_fill!(A, sbm, sorted = false) = draw_and_fill!(Random.default_rng(), A, sbm, sorted) +function draw_and_fill!(A, sbm, sorted = false) + draw_and_fill!(Random.default_rng(), A, sbm, sorted) +end function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted = false) 
sample(Random.default_rng(), sbm, node_labels, sorted) @@ -115,7 +118,6 @@ function _get_params_as_vec(dist::Distribution) return vcat(params(dist)...) end - function latent_to_block_index(latents_vec, sbm::BlockModel) cum_sum_sizes = cumsum(sbm.sizes) cum_sum_sizes[end] = 1.0 @@ -133,7 +135,8 @@ If the difference between the two models is less than `tol`, the function stops This function is not efficient for large numbers of blocks, as it uses brute force to find the best permutation. """ -function best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) +function best_alignment( + fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) k = number_blocks(fitted_sbm) if k != number_blocks(true_sbm) throw(ArgumentError("The number of blocks must be the same for both models")) @@ -160,7 +163,6 @@ function align_sbm!(sbm::BlockModel, perm) sbm.sizes .= sbm.sizes[perm] end - """ order_groups(a::Assignment, latents::AbstractVector) @@ -175,10 +177,10 @@ function order_groups(a::Assignment, latents::AbstractVector) dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) for group in 1:k) - return sort(1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) + return sort( + 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) end - function align_sbm_true_latents!(sbm::BlockModel, a::Assignment, latents) align_sbm!(sbm, order_groups(a, latents)) end From 9ded3cc17331fc2bd710ab78bf1c99b310e627e7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 20 Jan 2025 11:15:36 +0100 Subject: [PATCH 121/266] update julia version in github tests --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3d4b576..ab34529 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: version: - - '1.8' + - '1.11' 
# - 'nightly' os: - ubuntu-latest From 4e4c4cdd380ebdb24ecf766f2e2cdb408edefffb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 20 Jan 2025 11:16:17 +0100 Subject: [PATCH 122/266] format tests --- test/assignments/categorical_assignment.jl | 1 - test/assignments/sparse_assignment.jl | 10 ++++++---- test/discretised_dist/discretizer.jl | 9 ++++++--- test/generated_tests/test_distribution.jl | 13 ++++++++----- test/generated_tests/test_zero_inflated.jl | 5 +++-- test/test_api.jl | 8 ++++---- 6 files changed, 27 insertions(+), 19 deletions(-) diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl index 5dea6e0..bd4c4db 100644 --- a/test/assignments/categorical_assignment.jl +++ b/test/assignments/categorical_assignment.jl @@ -2,7 +2,6 @@ import NetworkHistogram as NH using Random - @testset "test Categorical swap" begin Random.seed!(1234123) using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment diff --git a/test/assignments/sparse_assignment.jl b/test/assignments/sparse_assignment.jl index b23a988..a7b56ac 100644 --- a/test/assignments/sparse_assignment.jl +++ b/test/assignments/sparse_assignment.jl @@ -21,12 +21,14 @@ using Random sparse_a = NH.SparseAssignment( NH.Observations(A, Categorical(m)), a.group_size, a.node_labels) @test a.additional_data.counts == sparse_a.additional_data.counts - for (l,m_index) in enumerate(2:m) - @test a.additional_data.realized[m_index, :, :] == sparse_a.additional_data.realized[l, :, :] + for (l, m_index) in enumerate(2:m) + @test a.additional_data.realized[m_index, :, :] == + sparse_a.additional_data.realized[l, :, :] @test a.additional_data.estimated_theta[m_index, :, :] == - sparse_a.additional_data.estimated_theta[l, :, :] + sparse_a.additional_data.estimated_theta[l, :, :] end - @test a.additional_data.log_likelihood ≈ sparse_a.additional_data.log_likelihood + @test a.additional_data.log_likelihood ≈ + sparse_a.additional_data.log_likelihood end @testset "test 
sparse swap" begin diff --git a/test/discretised_dist/discretizer.jl b/test/discretised_dist/discretizer.jl index 9094987..17c90fa 100644 --- a/test/discretised_dist/discretizer.jl +++ b/test/discretised_dist/discretizer.jl @@ -2,7 +2,8 @@ using NetworkHistogram @testset "discretizer" begin using StaticArrays - reg_disc = NetworkHistogram.RegularDiscretizer(10, 0.0, 1.0, MVector{10}(1:10), 1 / 10) + reg_disc = NetworkHistogram.RegularDiscretizer( + 10, 0.0, 1.0, MVector{10}(1:10), 1 / 10) cat_disc = NetworkHistogram.CategoryDiscretizer( Dict([0.0 => 11]), Dict([11 => 0.0])) hybrid_disc = NetworkHistogram.HybridDiscretizer( @@ -12,6 +13,8 @@ using NetworkHistogram @test NetworkHistogram.encode(cat_disc, 0.0) == 11 @test NetworkHistogram.encode(hybrid_disc, 0.0) == 11 @test NetworkHistogram.decode(hybrid_disc, 11) == 0.0 - @test all(NetworkHistogram.encode(reg_disc, 0.001:0.001:1.0) .== NetworkHistogram.encode(hybrid_disc, 0.001:0.001:1.0)) - @test all(NetworkHistogram.decode(hybrid_disc, 1:10) .== NetworkHistogram.decode(reg_disc, 1:10)) + @test all(NetworkHistogram.encode(reg_disc, 0.001:0.001:1.0) .== + NetworkHistogram.encode(hybrid_disc, 0.001:0.001:1.0)) + @test all(NetworkHistogram.decode(hybrid_disc, 1:10) .== + NetworkHistogram.decode(reg_disc, 1:10)) end diff --git a/test/generated_tests/test_distribution.jl b/test/generated_tests/test_distribution.jl index c9133df..9389428 100644 --- a/test/generated_tests/test_distribution.jl +++ b/test/generated_tests/test_distribution.jl @@ -1,6 +1,9 @@ -using NetworkHistogram: ZeroInflated, DiscretizedDistribution, ZeroInflatedCategorical, - ncategories, Discretizer, encode, decode, binwidth, RegularDiscretizer, - CategoryDiscretizer, HybridDiscretizer, DiscretizerZeroToZero, nlabels +using NetworkHistogram: ZeroInflated, DiscretizedDistribution, + ZeroInflatedCategorical, + ncategories, Discretizer, encode, decode, binwidth, + RegularDiscretizer, + CategoryDiscretizer, HybridDiscretizer, + DiscretizerZeroToZero, 
nlabels using Distributions using Test @@ -67,7 +70,7 @@ end @test encode(disc, 0.0) == 11 @test encode(disc, 0.5) == 8 @test decode(disc, 11) == 0.0 - @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol=1e-2)) + @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol = 1e-2)) @test nlabels(disc) == 12 end @@ -76,6 +79,6 @@ end @test encode(disc, 0.0) == 0 @test encode(disc, 0.5) == 8 @test decode(disc, 0) == 0.0 - @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol=1e-2)) + @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol = 1e-2)) @test nlabels(disc) == 11 end diff --git a/test/generated_tests/test_zero_inflated.jl b/test/generated_tests/test_zero_inflated.jl index 523b89c..380e80c 100644 --- a/test/generated_tests/test_zero_inflated.jl +++ b/test/generated_tests/test_zero_inflated.jl @@ -24,7 +24,7 @@ using NetworkHistogram: ZeroInflated, get_proba_zero @test sample == 0 || insupport(dist, sample) # Test logpdf - @test logpdf(zero_inflated_dist, 0) ≈ log(0.5* (1 + pdf(dist, 0))) + @test logpdf(zero_inflated_dist, 0) ≈ log(0.5 * (1 + pdf(dist, 0))) @test logpdf(zero_inflated_dist, 1) ≈ log(0.5 * pdf(dist, 1)) # Test minimum and maximum @@ -69,7 +69,8 @@ using NetworkHistogram: ZeroInflated, get_proba_zero @test sample_disc == 0 || insupport(dist_disc, sample_disc) # Test logpdf with discrete distribution - @test logpdf(zero_inflated_dist_disc, 0) ≈ log(0.5 * (1 + pdf(dist_disc, 0))) + @test logpdf(zero_inflated_dist_disc, 0) ≈ + log(0.5 * (1 + pdf(dist_disc, 0))) @test logpdf(zero_inflated_dist_disc, 1) ≈ log(0.5 * pdf(dist_disc, 1)) # Test minimum and maximum with discrete distribution diff --git a/test/test_api.jl b/test/test_api.jl index e68b72b..48fdefb 100644 --- a/test/test_api.jl +++ b/test/test_api.jl @@ -9,11 +9,11 @@ sbm_fitted, a = nethist(g; h = 10, iterations = 10) @test eltype(sbm_fitted) == typeof(Uniform(-1, 1)) - @test size(sbm_fitted) == (4,4) + @test size(sbm_fitted) == (4, 4) sbm_discretised, a, discretizer = nethist_discretised( g; 
number_levels = 5, h = 10, iterations = 10) - @test sbm_discretised[1,1] isa DiscretizedDistribution - @test ncategories(sbm_discretised[1,1]) == 5 - @test size(sbm_discretised) == (4,4) + @test sbm_discretised[1, 1] isa DiscretizedDistribution + @test ncategories(sbm_discretised[1, 1]) == 5 + @test size(sbm_discretised) == (4, 4) end From 9bcd4855bd92799496cb92712fa4f9f1a8b6d89f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 20 Jan 2025 11:18:15 +0100 Subject: [PATCH 123/266] remove benchmarks folder --- benchmark/benchmarks.jl | 56 ----------------------------------------- 1 file changed, 56 deletions(-) delete mode 100644 benchmark/benchmarks.jl diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl deleted file mode 100644 index cc8828e..0000000 --- a/benchmark/benchmarks.jl +++ /dev/null @@ -1,56 +0,0 @@ -# using BenchmarkTools - -# SUITE = BenchmarkGroup() -# for file in readdir(@__DIR__) -# if startswith(file, "bench_") && endswith(file, ".jl") -# SUITE[file[length("bench_") + 1:end - length(".jl")]] = -# include(file) -# end -# end - -using BenchmarkTools, Random, Distributions, LinearAlgebra -import NetworkHistogram as NH -const SUITE = BenchmarkGroup() - -function make_A(n, dist) - A = zeros(Int, n, n) - for j in 1:n - for i in j:n - if i == j - A[i, j] = 0 - else - A[i, j] = rand(dist) - A[j, i] = A[i, j] - end - end - end - return A -end -# Create hierarchy of benchmarks: -SUITE["Assignment"] = BenchmarkGroup(["assignment"]) - -Random.seed!(123451) -stop_rule = NH.PreviousBestValue(200) -iterations = 200 -swap_rule = NH.RandomNodeSwap() -accept_rule = NH.Strict() -dist = Bernoulli(0.5) - -for ae in ["Bernoulli", "default"] - if ae == "default" - init_rule = NH.InitRule(NH.OrderedStart(), nothing) - else - init_rule = NH.InitRule(NH.OrderedStart(), Val{NH.BernoulliData}()) - end - for n in [60,120,300] - obs = NH.Observations(make_A(n,dist), dist) - h = n ÷ 20 - a = NH.make_assignment(obs, h, init_rule) - swap = NH.make_swap(a, (1, n)) - 
SUITE["Assignment"][ae]["local_search!"][n] = @benchmarkable NH.local_search!( - $a, $obs, $swap, swap_rule = $swap_rule, accept_rule = $accept_rule) - end -end - -# tune!(SUITE); -results = run(SUITE, verbose = true) From 5086208e7abdeddad1452382eaf432fa88ac1806 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 20 Jan 2025 11:32:37 +0100 Subject: [PATCH 124/266] remove dubious check and modification --- src/assignments/SparseAssignment/struct.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl index 5f6dcce..8251c25 100644 --- a/src/assignments/SparseAssignment/struct.jl +++ b/src/assignments/SparseAssignment/struct.jl @@ -89,9 +89,6 @@ function compute_log_likelihood_without_0( for j in 1:number_groups for i in j:number_groups total_decorations = counts[i, j] - if total_decorations < sum(realized[:, i, j]) - total_decorations = sum(realized[:, i, j]) - end loglik -= xlogx(total_decorations) for m in 1:number_decorations loglik += xlogx(realized[m, i, j]) From 4daec0381f797263bcdb35a1394b45ddcb2fa82d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 20 Jan 2025 14:34:24 +0100 Subject: [PATCH 125/266] remove unused dependence --- Project.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Project.toml b/Project.toml index a5d2d0c..6e4843e 100644 --- a/Project.toml +++ b/Project.toml @@ -16,7 +16,6 @@ Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" @@ -28,7 +27,6 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] 
-LossFunctions = "1.0.1" julia = "1.11" [extras] From 07f158064fedfc8f460d0917a012458d539b4ee7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 21 Jan 2025 16:25:17 +0100 Subject: [PATCH 126/266] correct missing update --- src/assignments/SparseAssignment/swap.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl index be7b78f..cdfda51 100644 --- a/src/assignments/SparseAssignment/swap.jl +++ b/src/assignments/SparseAssignment/swap.jl @@ -96,8 +96,10 @@ end function _update_counts!(counts, g_from, g_to, missing_update) for i in axes(counts, 1) - counts[i, g_to] += missing_update[i] - counts[i, g_from] -= missing_update[i] + counts[i, g_to] = counts[i, g_to] - missing_update[i] + counts[i, g_from] = counts[i, g_from] + missing_update[i] + counts[g_to, i] = counts[i, g_to] + counts[g_from, i] = counts[i, g_from] end end From ce4f1bffba231dd31f2392a17b27fc48933699b1 Mon Sep 17 00:00:00 2001 From: Charles Dufour <34485907+dufourc1@users.noreply.github.com> Date: Thu, 6 Feb 2025 09:54:20 +0100 Subject: [PATCH 127/266] Update codecov action --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ab34529..56a8717 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -35,7 +35,7 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v3 + - uses: codecov/codecov-action@v5 with: files: lcov.info token: ${{ secrets.CODECOV_TOKEN }} From 7acde37ff09aab99972e26c5cde05c23e26617bb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 14 Apr 2025 16:57:35 +0200 Subject: [PATCH 128/266] update docs --- docs/Project.toml | 2 ++ docs/make.jl | 2 +- docs/src/api.md | 1 + docs/src/internal.md | 12 ------------ docs/src/internals/assignments.md | 20 
++++++++++++++++++++ docs/src/internals/distributions.md | 14 ++++++++++++++ 6 files changed, 38 insertions(+), 13 deletions(-) create mode 100644 docs/src/internals/assignments.md create mode 100644 docs/src/internals/distributions.md diff --git a/docs/Project.toml b/docs/Project.toml index dfa65cd..e24d6b4 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,2 +1,4 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" +NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" diff --git a/docs/make.jl b/docs/make.jl index 29d001c..c22d3d3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,7 +20,7 @@ makedocs(; "API Reference" => "api.md", "Optimization hyperparameters" => "rules.md", "Examples" => "examples.md", - "Internal" => "internal.md" + "Internal" => ["internals/assignments.md","internals/distributions.md"] ], checkdocs = :none) diff --git a/docs/src/api.md b/docs/src/api.md index d67e6ad..e0cc4b3 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -5,5 +5,6 @@ Depth = 2 ```@autodocs Modules = [NetworkHistogram] +Pages = ["api.jl"] Private = false ``` diff --git a/docs/src/internal.md b/docs/src/internal.md index 307a222..df748f0 100644 --- a/docs/src/internal.md +++ b/docs/src/internal.md @@ -6,15 +6,3 @@ Depth = 2 # Assignments and group sizes - -```@autodocs -Modules = [NetworkHistogram] -Pages = ["Assignments.jl", "group_numbering.jl"] -Private = true -``` - -## How to specialize the `Assignment` type for faster performance - -```@docs -NetworkHistogram.BernoulliData -``` \ No newline at end of file diff --git a/docs/src/internals/assignments.md b/docs/src/internals/assignments.md new file mode 100644 index 0000000..29c717a --- /dev/null +++ b/docs/src/internals/assignments.md @@ -0,0 +1,20 @@ +```@contents +Pages = ["assignments.md"] +Depth = 1 +``` + + +# Assignments and group sizes + + +```@autodocs +Modules = [NetworkHistogram] +Pages = ["Assignments.jl", "group_numbering.jl"] 
+Private = true +``` + +## How to specialize the `Assignment` type for faster performance + +```@docs +NetworkHistogram.BernoulliData +``` \ No newline at end of file diff --git a/docs/src/internals/distributions.md b/docs/src/internals/distributions.md new file mode 100644 index 0000000..73c2cd4 --- /dev/null +++ b/docs/src/internals/distributions.md @@ -0,0 +1,14 @@ +```@contents +Pages = ["distributions.md"] +Depth = 0 +``` + + +# Distributions + + +```@autodocs +Modules = [NetworkHistogram] +Pages = ["categorical_with_0.jl", "discrete_dist.jl","discretizer.jl", "zero_inflated.jl"] +Private = true +``` \ No newline at end of file From c39ee211c8d2b25b8d8a36cb0f89e9373e077204 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 14 Apr 2025 17:03:18 +0200 Subject: [PATCH 129/266] format make.jl --- docs/make.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index c22d3d3..3c9567d 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,7 +20,8 @@ makedocs(; "API Reference" => "api.md", "Optimization hyperparameters" => "rules.md", "Examples" => "examples.md", - "Internal" => ["internals/assignments.md","internals/distributions.md"] + "Internal" => [ + "internals/assignments.md", "internals/distributions.md"] ], checkdocs = :none) From 9cc573c11b39573aeb22baaffcc32b9c7cc15f6d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 23 Apr 2025 17:44:53 +0200 Subject: [PATCH 130/266] add implementation for MC --- src/assignments/SumAssignment/struct.jl | 81 ++++++++++++++++++ src/assignments/SumAssignment/swap.jl | 103 +++++++++++++++++++++++ src/assignments/include.jl | 1 + src/distributions/include.jl | 1 + src/distributions/markov_chain.jl | 105 ++++++++++++++++++++++++ test/assignments/sum_assignment.jl | 10 +++ test/runtests.jl | 1 + 7 files changed, 302 insertions(+) create mode 100644 src/assignments/SumAssignment/struct.jl create mode 100644 src/assignments/SumAssignment/swap.jl create mode 100644 
src/distributions/markov_chain.jl create mode 100644 test/assignments/sum_assignment.jl diff --git a/src/assignments/SumAssignment/struct.jl b/src/assignments/SumAssignment/struct.jl new file mode 100644 index 0000000..4b3aa77 --- /dev/null +++ b/src/assignments/SumAssignment/struct.jl @@ -0,0 +1,81 @@ + +# type F needs to be a vector field! + +struct SumData{F, C} + λ::SparseMatrixCSC{F, Int} + θ::Dict{Tuple{Int, Int}, F} + A::SparseMatrixCSC{C, Int} + counts::Dict{Tuple{Int, Int}, Int} + log_likelihood_per_group::Dict{Tuple{Int, Int}, Float64} + log_likelihood::Float64 +end + +const SumAssignment{T, F, C} = Assignment{T, SumData{F, C}} +const SumInitRule{S} = InitRule{S, Val{SumData}} + +function SumAssignment( + A::SparseMatrixCSC{C, Int}, + λ::SparseMatrixCSC{F, Int}, group_size::GroupSize, node_labels::Vector{Int}) where { + F, C} + k = size(group_size, 1) + θ = Dict{Tuple{Int, Int}, F}() + counts = Dict{Tuple{Int, Int}, Int}() + + rows = rowvals(λ) + vals = nonzeros(λ) + m, n = size(λ) + for u in 1:n + for v in rows[nzrange(λ, u)] + if u >= v + break # check that this isn't a mistake trying to be fast + continue + end + key_groups = minmax(node_labels[u], node_labels[v]) + param = vals[i] + if haskey(θ, key_groups) + θ[key_groups] += param + else + θ[key_groups] = param + end + if haskey(counts, key_groups) + counts[key_groups] += 1 + else + counts[key_groups] = 1 + end + end + end + for i in 1:k + for j in i:k + θ[minmax(i, j)] ./= counts[minmax(i, j)] + end + end + for i in 1:k + counts[(i, i)] ./= 2 + end + ll_sum = 0.0 + ll = Dict{Tuple{Int, Int}, Float64}() + for i in 1:k + for j in i:k + ll[(i, j)] = 0.0 + end + end + for i in 1:n + for v in nzrange(λ, j) + u = rows[i] + if u >= v + continue + end + key_groups = minmax(node_labels[u], node_labels[v]) + ll[minmax( + node_labels[u], node_labels[v])] += loglikelihood(θ[key_groups], A[u, v]) + end + end + ll_sum = sum(values(ll)) + return Assignment(group_size, node_labels, SumData(λ, θ, A, counts, ll, 
ll_sum)) +end + +function loglikelihood(assignment::SumAssignment, g::Observations) + return sum(values(assignment.additional_data.log_likelihood)) +end + +include("swap.jl") diff --git a/src/assignments/SumAssignment/swap.jl b/src/assignments/SumAssignment/swap.jl new file mode 100644 index 0000000..bac2c07 --- /dev/null +++ b/src/assignments/SumAssignment/swap.jl @@ -0,0 +1,103 @@ +mutable struct SumSwap{F} <: Swap + index1::Int + index2::Int + θ::Dict{Tuple{Int, Int}, F} + counts::Dict{Tuple{Int, Int}, Int} + log_likelihood_per_group::Dict{Tuple{Int, Int}, Float64} + log_likelihood::Float64 +end + +function make_swap(a::SumAssignment, id) + return SumSwap(id[1], id[2], deepcopy(a.additional_data.θ), + deepcopy(a.additional_data.counts), deepcopy(a.additional_data.log_likelihood_per_group), a.additional_data.log_likelihood) +end + + +function make_swap!(swap::SumSwap{F}, a::SumAssignment{T, F}, id) where {T, F} + swap.index1, swap.index2 = id + swap.θ = deepcopy(a.additional_data.θ) + swap.counts = deepcopy(a.additional_data.counts) + swap.log_likelihood_per_group = deepcopy(a.additional_data.log_likelihood_per_group) + swap.log_likelihood = a.additional_data.log_likelihood +end + +function revert_swap!( + a::SumAssignment{T, F}, swap::SumSwap{F}) where {T, F} + swap_node_labels!(a, swap.index1, swap.index2) + a.additional_data.θ = deepcopy(swap.θ) + a.additional_data.counts = deepcopy(swap.counts) + a.additional_data.log_likelihood_per_group = deepcopy(swap.log_likelihood_per_group) + a.additional_data.log_likelihood = swap.log_likelihood +end + +function apply_swap!( + a::SumAssignment{T, F}, swap::SumSwap{F}) where {T, F} + λ = a.additional_data.λ + rows = rowvals(λ) + vals = nonzeros(λ) + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + if g1 == g2 + return nothing + end + + for v in rows[nzrange(λ, swap.index1)] + key_old_groups = minmax(g1, a.node_labels[v]) + key_new_groups = minmax(g2, a.node_labels[v]) + c_og = 
a.counts[key_old_groups] + c_ng = a.counts[key_new_groups] + param = vals[i] + a.θ[key_old_groups] = (a.θ[key_old_groups]*c_og - param)/(c_og - 1) + a.θ[key_new_groups] = (a.θ[key_new_groups]*c_ng + param)/(c_ng + 1) + a.counts[key_old_groups] -= 1 + a.counts[key_new_groups] += 1 + end + + for v in rows[nzrange(λ, swap.index2)] + key_old_groups = minmax(g2, a.node_labels[v]) + key_new_groups = minmax(g1, a.node_labels[v]) + c_og = a.counts[key_old_groups] + c_ng = a.counts[key_new_groups] + param = vals[i] + a.θ[key_old_groups] = (a.θ[key_old_groups]*c_og - param)/(c_og - 1) + a.θ[key_new_groups] = (a.θ[key_new_groups]*c_ng + param)/(c_ng + 1) + a.counts[key_old_groups] -= 1 + a.counts[key_new_groups] += 1 + end + + swap_node_labels!(a, swap.index1, swap.index2) + fast_update_ll!(a, swap) +end + +function fast_update_ll(a::SumAssignment, swap::SumSwap) + k = size(a.group_size, 1) + for i in 1:k + for j in i:k + index_group = (i, j) + if swap.θ[index_group] != a.θ[index_group] + _update_ll_one_group!(a, index_group) + end + end + end + a.additional_data.log_likelihood = sum(values(a.additional_data.log_likelihood_per_group)) +end + +function _update_ll_one_group!(a::SumAssignment, group) + k = size(a.group_size, 1) + nodes_1 = findall(x -> x == group[1], a.node_labels) + nodes_2 = findall(x -> x == group[2], a.node_labels) + ll = 0.0 + rows = rowvals(a.additional_data.λ) + vals = nonzeros(a.additional_data.λ) + for i in nodes_1 + for u in nodes_1 + for v in rows[nzrange(a.additional_data.λ, u)] + if v ∈ nodes_2 + ll += loglikelihood(a.θ[group], a.additional_data.A[u, v]) + end + end + end + end + a.log_likelihood_per_group[group] = ll + return nothing +end diff --git a/src/assignments/include.jl b/src/assignments/include.jl index d9395c0..5829223 100644 --- a/src/assignments/include.jl +++ b/src/assignments/include.jl @@ -1,3 +1,4 @@ include("BernoulliAssignment/struct.jl") include("CategoricalAssignment/struct.jl") include("SparseAssignment/struct.jl") 
+include("SumAssignment/struct.jl") diff --git a/src/distributions/include.jl b/src/distributions/include.jl index 875f028..cc5c557 100644 --- a/src/distributions/include.jl +++ b/src/distributions/include.jl @@ -2,3 +2,4 @@ include("categorical_with_0.jl") include("discretizer.jl") include("zero_inflated.jl") include("discrete_dist.jl") +include("markov_chain.jl") diff --git a/src/distributions/markov_chain.jl b/src/distributions/markov_chain.jl new file mode 100644 index 0000000..4368458 --- /dev/null +++ b/src/distributions/markov_chain.jl @@ -0,0 +1,105 @@ +# if S is Int, assume the states are ordered and sequential +struct DiscreteMarkovChain{S, T} + states::Vector{S} + transitions::Matrix{T} +end + +struct SampleChain{S} + states::Vector{S} + indices::Vector{Int} + transitions::Matrix{Int} +end + +function state_index(mc::DiscreteMarkovChain{S, T}, state::S) where {S, T} + findfirst(isequal(state), mc.states) +end + +state_space(mc::DiscreteMarkovChain) = mc.states +transition_matrix(mc::DiscreteMarkovChain) = mc.transitions + +function stationary_dist(mc::DiscreteMarkovChain) + T = transition_matrix(mc) + n = length(state_space(mc)) + F = eigen(T') + tol = 1e-8 + idx = findfirst(abs.(F.values .- 1) .< tol) + if idx === nothing + error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") + end + + # Extract the corresponding eigenvector and normalize it to sum to 1. 
+ pi = real(F.vectors[:, idx]) + return pi ./ sum(pi) +end + +function sample_indices(mc::DiscreteMarkovChain{S, T}, t::Int) where {S, T} + indices = Vector{Int}(undef, t) + indices[1] = rand(Categorical(stationary_dist(mc))) + tr_transposed = transpose(mc.transitions) + for i in 2:t + indices[i] = rand(Categorical(tr_transposed[:, indices[i - 1]])) + end + return indices +end + +function sample(mc::DiscreteMarkovChain{S, T}, t::Int) where {S, T} + indices = sample_indices(mc, t) + states = mc.states[indices] + counts = zeros(Int, length(mc.states), length(mc.states)) + for i in 1:(length(indices) - 1) + counts[indices[i], indices[i + 1]] += 1 + end + return SampleChain(states, indices, counts) +end + + +## yes I know this is awful and does not return a proper chain, but... +function Base.:+(a::DiscreteMarkovChain, b::DiscreteMarkovChain) + return DiscreteMarkovChain( + a.states, + a.transitions .+ b.transitions) +end + +function Base.:-(a::DiscreteMarkovChain, b::DiscreteMarkovChain) + return DiscreteMarkovChain( + a.states, + a.transitions .- b.transitions) +end + +function Base.:*(a::DiscreteMarkovChain, c::Real) + return DiscreteMarkovChain( + a.states, + a.transitions .* c) +end + +Base.:*(c::Real, a::DiscreteMarkovChain) = a * c + +function Base.:/(a::DiscreteMarkovChain, c::Real) + return DiscreteMarkovChain( + a.states, + a.transitions ./ c) +end + +function loglikelihood(mc::DiscreteMarkovChain{S, T}, chain::Vector{Int}) where {S, T} + Tr = transition_matrix(mc) + loglik = log(stationary_dist(mc)[chain[1]]) + for i in 1:(length(chain) - 1) + loglik += log(Tr[chain[i], chain[i + 1]]) + end + return loglik +end + +function loglikelihood(mc::DiscreteMarkovChain{S, T}, chain::Vector{S}) where {S, T} + return loglikelihood(mc, state_index.(Ref(mc), chain)) +end + +function loglikelihood(mc::DiscreteMarkovChain{S, T}, chain::SampleChain{S}) where {S, T} + return sum(xlogy.(chain.transitions, mc.transitions)) + + log(stationary_dist(mc)[chain.indices[1]]) +end + 
+ +# user responsability to have the same states... +function fit(mc::DiscreteMarkovChain{S, T}, chain::SampleChain{S}) where {S, T} + return DiscreteMarkovChain(mc.states, chain.transitions ./ sum(chain.transitions, dims=2)) +end diff --git a/test/assignments/sum_assignment.jl b/test/assignments/sum_assignment.jl new file mode 100644 index 0000000..93ecbc6 --- /dev/null +++ b/test/assignments/sum_assignment.jl @@ -0,0 +1,10 @@ +import NetworkHistogram as NH + +using Random + + +@testset "test sum assignment" begin + using Distributions, LinearAlgebra, SparseArrays + @test_broken 1 == 2 + @error "This test is not implemented yet" +end diff --git a/test/runtests.jl b/test/runtests.jl index 9f88662..8c1ee77 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,6 +12,7 @@ include("TestNetworkHistogram.jl") include("assignments/bernoulli_assignment.jl") include("assignments/categorical_assignment.jl") include("assignments/sparse_assignment.jl") + include("assignments/sum_assignment.jl") end @testset "Rule optimization tests" begin From ad1448d5993f4cfdfa85aaa432e4f292e4f95fc7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 23 Apr 2025 17:45:47 +0200 Subject: [PATCH 131/266] typo --- test/assignments/sum_assignment.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/assignments/sum_assignment.jl b/test/assignments/sum_assignment.jl index 93ecbc6..345c95c 100644 --- a/test/assignments/sum_assignment.jl +++ b/test/assignments/sum_assignment.jl @@ -5,6 +5,5 @@ using Random @testset "test sum assignment" begin using Distributions, LinearAlgebra, SparseArrays - @test_broken 1 == 2 @error "This test is not implemented yet" end From a39656d9ca419169ca0160f2ce4ea658696f4338 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 23 Apr 2025 17:46:08 +0200 Subject: [PATCH 132/266] typo --- test/assignments/sum_assignment.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/assignments/sum_assignment.jl b/test/assignments/sum_assignment.jl index 
345c95c..1a80856 100644 --- a/test/assignments/sum_assignment.jl +++ b/test/assignments/sum_assignment.jl @@ -5,5 +5,5 @@ using Random @testset "test sum assignment" begin using Distributions, LinearAlgebra, SparseArrays - @error "This test is not implemented yet" + @test 1 == 2 end From 2ebd12e9b382e8b82cae796d575d5c10feba6c40 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 25 Apr 2025 15:03:59 +0200 Subject: [PATCH 133/266] add mc --- Project.toml | 2 + src/NetworkHistogram.jl | 1 + src/api.jl | 26 +++++++++ src/assignments/SumAssignment/struct.jl | 33 +++++++---- src/assignments/SumAssignment/swap.jl | 72 ++++++++++++++--------- src/distributions/markov_chain.jl | 77 +++++++++++++++++++------ src/distributions/utils.jl | 19 ++++++ src/optimisation/least_squares.jl | 1 + 8 files changed, 174 insertions(+), 57 deletions(-) create mode 100644 src/distributions/utils.jl diff --git a/Project.toml b/Project.toml index 6e4843e..2d93d76 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" +KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" @@ -27,6 +28,7 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] +KrylovKit = "0.9.5" julia = "1.11" [extras] diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 7bb5e0f..36ba276 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -9,6 +9,7 @@ import StatsBase, Random using DensityInterface: logdensityof using LogExpFunctions: xlogx, xlogy using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen +using KrylovKit: eigsolve import Metis import 
IterativeSolvers import Clustering diff --git a/src/api.jl b/src/api.jl index 01b783e..d09a6f3 100644 --- a/src/api.jl +++ b/src/api.jl @@ -22,6 +22,10 @@ function _default_init(dist::Distribution, start = MetisStart()) end end +function _default_init(::DiscreteMarkovChain, start = RandomStart()) + return InitRule(start, Val{SumData}()) +end + """ _nethist(g::Observations{G, D}, h; kwargs...) @@ -127,3 +131,25 @@ function nethist_discretised(g::Observations{G, D}; start_clustering = start_clustering) return sbm_discretise, a, discretiser end + + + +function nethist_mc(g::Observations{G, <:DiscreteMarkovChain}; + h = number_nodes(g) ÷ 2, + iterations = 100_000, + stalled_iter = 1000, + swap_rule::NodeSwapRule = RandomGroupSwap(), + accept_rule::AcceptRule = Strict(), + progress_bar::Bool = true, + start_clustering = RandomStart() +) where {G} + initialise_rule = _default_init(g.dist_ref, start_clustering) + a = estimate_graphon(g, h; + iterations = iterations, + initialise_rule = initialise_rule, + swap_rule = swap_rule, + accept_rule = accept_rule, + stop_rule = PreviousBestValue(stalled_iter), + progress_bar = progress_bar) + return fit(a, g), a +end diff --git a/src/assignments/SumAssignment/struct.jl b/src/assignments/SumAssignment/struct.jl index 4b3aa77..8451eba 100644 --- a/src/assignments/SumAssignment/struct.jl +++ b/src/assignments/SumAssignment/struct.jl @@ -1,7 +1,7 @@ # type F needs to be a vector field! 
-struct SumData{F, C} +mutable struct SumData{F, C} λ::SparseMatrixCSC{F, Int} θ::Dict{Tuple{Int, Int}, F} A::SparseMatrixCSC{C, Int} @@ -13,6 +13,21 @@ end const SumAssignment{T, F, C} = Assignment{T, SumData{F, C}} const SumInitRule{S} = InitRule{S, Val{SumData}} + + +function make_assignment(g, h, init_rule::SumInitRule) + group_size, + node_labels = initialize_node_labels( + g, h, init_rule.starting_assignment_rule) + return SumAssignment(g, group_size, node_labels) +end + +function SumAssignment(g::Observations, group_size::GroupSize, node_labels) + A = issparse(g.graph) ? g.graph : sparse(g.graph) + λ = fit.(Ref(g.dist_ref), A) + return SumAssignment(A, λ, group_size, node_labels) +end + function SumAssignment( A::SparseMatrixCSC{C, Int}, λ::SparseMatrixCSC{F, Int}, group_size::GroupSize, node_labels::Vector{Int}) where { @@ -25,9 +40,10 @@ function SumAssignment( vals = nonzeros(λ) m, n = size(λ) for u in 1:n - for v in rows[nzrange(λ, u)] + for i in nzrange(λ,u) + v = rows[i] if u >= v - break # check that this isn't a mistake trying to be fast + # break # check that this isn't a mistake trying to be fast continue end key_groups = minmax(node_labels[u], node_labels[v]) @@ -46,12 +62,9 @@ function SumAssignment( end for i in 1:k for j in i:k - θ[minmax(i, j)] ./= counts[minmax(i, j)] + θ[minmax(i, j)] /= counts[minmax(i, j)] end end - for i in 1:k - counts[(i, i)] ./= 2 - end ll_sum = 0.0 ll = Dict{Tuple{Int, Int}, Float64}() for i in 1:k @@ -59,9 +72,9 @@ function SumAssignment( ll[(i, j)] = 0.0 end end - for i in 1:n - for v in nzrange(λ, j) - u = rows[i] + for u in 1:n + for i in nzrange(λ, u) + v = rows[i] if u >= v continue end diff --git a/src/assignments/SumAssignment/swap.jl b/src/assignments/SumAssignment/swap.jl index bac2c07..4bd46de 100644 --- a/src/assignments/SumAssignment/swap.jl +++ b/src/assignments/SumAssignment/swap.jl @@ -9,10 +9,10 @@ end function make_swap(a::SumAssignment, id) return SumSwap(id[1], id[2], deepcopy(a.additional_data.θ), 
- deepcopy(a.additional_data.counts), deepcopy(a.additional_data.log_likelihood_per_group), a.additional_data.log_likelihood) + deepcopy(a.additional_data.counts), deepcopy(a.additional_data.log_likelihood_per_group), + a.additional_data.log_likelihood) end - function make_swap!(swap::SumSwap{F}, a::SumAssignment{T, F}, id) where {T, F} swap.index1, swap.index2 = id swap.θ = deepcopy(a.additional_data.θ) @@ -41,40 +41,46 @@ function apply_swap!( return nothing end - for v in rows[nzrange(λ, swap.index1)] + for i in nzrange(λ, swap.index1) + v = rows[i] key_old_groups = minmax(g1, a.node_labels[v]) key_new_groups = minmax(g2, a.node_labels[v]) - c_og = a.counts[key_old_groups] - c_ng = a.counts[key_new_groups] + c_og = a.additional_data.counts[key_old_groups] + c_ng = a.additional_data.counts[key_new_groups] param = vals[i] - a.θ[key_old_groups] = (a.θ[key_old_groups]*c_og - param)/(c_og - 1) - a.θ[key_new_groups] = (a.θ[key_new_groups]*c_ng + param)/(c_ng + 1) - a.counts[key_old_groups] -= 1 - a.counts[key_new_groups] += 1 + a.additional_data.θ[key_old_groups] = (a.additional_data.θ[key_old_groups]*c_og - + param)/(c_og - 1) + a.additional_data.θ[key_new_groups] = (a.additional_data.θ[key_new_groups]*c_ng + + param)/(c_ng + 1) + a.additional_data.counts[key_old_groups] -= 1 + a.additional_data.counts[key_new_groups] += 1 end - for v in rows[nzrange(λ, swap.index2)] + for i in nzrange(λ, swap.index2) + v = rows[i] key_old_groups = minmax(g2, a.node_labels[v]) key_new_groups = minmax(g1, a.node_labels[v]) - c_og = a.counts[key_old_groups] - c_ng = a.counts[key_new_groups] + c_og = a.additional_data.counts[key_old_groups] + c_ng = a.additional_data.counts[key_new_groups] param = vals[i] - a.θ[key_old_groups] = (a.θ[key_old_groups]*c_og - param)/(c_og - 1) - a.θ[key_new_groups] = (a.θ[key_new_groups]*c_ng + param)/(c_ng + 1) - a.counts[key_old_groups] -= 1 - a.counts[key_new_groups] += 1 + a.additional_data.θ[key_old_groups] = (a.additional_data.θ[key_old_groups]*c_og 
- + param)/(c_og - 1) + a.additional_data.θ[key_new_groups] = (a.additional_data.θ[key_new_groups]*c_ng + + param)/(c_ng + 1) + a.additional_data.counts[key_old_groups] -= 1 + a.additional_data.counts[key_new_groups] += 1 end swap_node_labels!(a, swap.index1, swap.index2) fast_update_ll!(a, swap) end -function fast_update_ll(a::SumAssignment, swap::SumSwap) +function fast_update_ll!(a::SumAssignment, swap::SumSwap) k = size(a.group_size, 1) for i in 1:k for j in i:k index_group = (i, j) - if swap.θ[index_group] != a.θ[index_group] + if swap.θ[index_group] != a.additional_data.θ[index_group] _update_ll_one_group!(a, index_group) end end @@ -83,21 +89,31 @@ function fast_update_ll(a::SumAssignment, swap::SumSwap) end function _update_ll_one_group!(a::SumAssignment, group) - k = size(a.group_size, 1) nodes_1 = findall(x -> x == group[1], a.node_labels) nodes_2 = findall(x -> x == group[2], a.node_labels) ll = 0.0 rows = rowvals(a.additional_data.λ) - vals = nonzeros(a.additional_data.λ) - for i in nodes_1 - for u in nodes_1 - for v in rows[nzrange(a.additional_data.λ, u)] - if v ∈ nodes_2 - ll += loglikelihood(a.θ[group], a.additional_data.A[u, v]) - end - end + for u in nodes_1 + for v in intersect(rows[nzrange(a.additional_data.λ, u)], nodes_2) + ll += loglikelihood( + a.additional_data.θ[group], a.additional_data.A[u, v]) end end - a.log_likelihood_per_group[group] = ll + a.additional_data.log_likelihood_per_group[group] = ll return nothing end + +function fit( + a::SumAssignment{T, F, C}, g::Observations{ + G, <:DiscreteMarkovChain}) where { + T, F, C, G} + dists = initialize_sbm( + a.group_size, g.dist_ref) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + dists[ + group1, group2] = a.additional_data.θ[minmax(group1, group2)] + end + end + return dists +end diff --git a/src/distributions/markov_chain.jl b/src/distributions/markov_chain.jl index 4368458..28a21b8 100644 --- a/src/distributions/markov_chain.jl +++ 
b/src/distributions/markov_chain.jl @@ -1,16 +1,21 @@ # if S is Int, assume the states are ordered and sequential -struct DiscreteMarkovChain{S, T} +# should store everything in transpose, will be faster but way more +# complicated to read +struct DiscreteMarkovChain{S, M <: AbstractMatrix} states::Vector{S} - transitions::Matrix{T} + transitions::M end -struct SampleChain{S} +struct SampleChain{S, M <: AbstractMatrix} states::Vector{S} indices::Vector{Int} - transitions::Matrix{Int} + transitions::M end -function state_index(mc::DiscreteMarkovChain{S, T}, state::S) where {S, T} +Base.zero(::DiscreteMarkovChain) = DiscreteMarkovChain(Int[], zeros(Int, 0, 0)) +Base.zero(::SampleChain{S}) where {S} = SampleChain(S[], Int[], zeros(Int, 1, 1)) + +function state_index(mc::DiscreteMarkovChain{S}, state::S) where {S} findfirst(isequal(state), mc.states) end @@ -19,20 +24,32 @@ transition_matrix(mc::DiscreteMarkovChain) = mc.transitions function stationary_dist(mc::DiscreteMarkovChain) T = transition_matrix(mc) - n = length(state_space(mc)) F = eigen(T') tol = 1e-8 idx = findfirst(abs.(F.values .- 1) .< tol) if idx === nothing error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") end - # Extract the corresponding eigenvector and normalize it to sum to 1. pi = real(F.vectors[:, idx]) return pi ./ sum(pi) end -function sample_indices(mc::DiscreteMarkovChain{S, T}, t::Int) where {S, T} +function stationary_dist(mc::DiscreteMarkovChain{S, <:SparseMatrixCSC}) where {S} + T = transition_matrix(mc) + vals, vecs, _ = eigsolve(T') + tol = 1e-8 + idx = findfirst(abs.(vals .- 1) .< tol) + if idx === nothing + error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") + end + # Extract the corresponding eigenvector and normalize it to sum to 1. 
+ pi = Real.(vecs[idx]) + result = pi ./ sum(pi) + return result +end + +function sample_indices(mc::DiscreteMarkovChain, t::Int) indices = Vector{Int}(undef, t) indices[1] = rand(Categorical(stationary_dist(mc))) tr_transposed = transpose(mc.transitions) @@ -42,7 +59,7 @@ function sample_indices(mc::DiscreteMarkovChain{S, T}, t::Int) where {S, T} return indices end -function sample(mc::DiscreteMarkovChain{S, T}, t::Int) where {S, T} +function sample(mc::DiscreteMarkovChain, t::Int) indices = sample_indices(mc, t) states = mc.states[indices] counts = zeros(Int, length(mc.states), length(mc.states)) @@ -52,6 +69,15 @@ function sample(mc::DiscreteMarkovChain{S, T}, t::Int) where {S, T} return SampleChain(states, indices, counts) end +function sample(mc::DiscreteMarkovChain{S, <:SparseMatrixCSC}, t::Int) where {S} + indices = sample_indices(mc, t) + states = mc.states[indices] + counts = zeros(Int, length(mc.states), length(mc.states)) + for i in 1:(length(indices) - 1) + counts[indices[i], indices[i + 1]] += 1 + end + return SampleChain(states, indices, sparse(counts)) +end ## yes I know this is awful and does not return a proper chain, but... 
function Base.:+(a::DiscreteMarkovChain, b::DiscreteMarkovChain) @@ -80,26 +106,39 @@ function Base.:/(a::DiscreteMarkovChain, c::Real) a.transitions ./ c) end -function loglikelihood(mc::DiscreteMarkovChain{S, T}, chain::Vector{Int}) where {S, T} +function loglikelihood(mc::DiscreteMarkovChain{S, M}, chain::Vector{Int}) where {S, M} Tr = transition_matrix(mc) - loglik = log(stationary_dist(mc)[chain[1]]) + probas = Vector{Float64}(undef, length(chain)) + probas[1] = stationary_dist(mc)[chain[1]] for i in 1:(length(chain) - 1) - loglik += log(Tr[chain[i], chain[i + 1]]) + probas[i + 1] = Tr[chain[i], chain[i + 1]] end - return loglik + return sum(log, probas) end -function loglikelihood(mc::DiscreteMarkovChain{S, T}, chain::Vector{S}) where {S, T} +function loglikelihood( + mc::DiscreteMarkovChain{S, M1}, chain::Vector{S}) where {S, M1} return loglikelihood(mc, state_index.(Ref(mc), chain)) end -function loglikelihood(mc::DiscreteMarkovChain{S, T}, chain::SampleChain{S}) where {S, T} - return sum(xlogy.(chain.transitions, mc.transitions)) + - log(stationary_dist(mc)[chain.indices[1]]) +#without the first state, huge computational speedup +function loglikelihood( + mc::DiscreteMarkovChain{S, M1}, chain::SampleChain{S, M2}) where {S, M1, M2} + return sum(map(xlogy, chain.transitions, mc.transitions)) #+log(stationary_dist(mc)[chain.indices[1]]) end + # user responsability to have the same states... -function fit(mc::DiscreteMarkovChain{S, T}, chain::SampleChain{S}) where {S, T} - return DiscreteMarkovChain(mc.states, chain.transitions ./ sum(chain.transitions, dims=2)) +function fit( + mc::DiscreteMarkovChain{S, M1}, chain::SampleChain{S, M2}) where {S, M1, M2} + return DiscreteMarkovChain( + mc.states, make_row_stochastic(chain.transitions)) +end + + + +function make_row_stochastic(A::M) where {M <: AbstractMatrix} + f(row) = sum(row) == 0 ? 
ones(length(row)) / length(row) : row ./ sum(row) + return mapslices(f, A, dims = 2) end diff --git a/src/distributions/utils.jl b/src/distributions/utils.jl new file mode 100644 index 0000000..d30daf7 --- /dev/null +++ b/src/distributions/utils.jl @@ -0,0 +1,19 @@ +const logtwo = log(2.0) + +sumlog(x::AbstractArray{<:Real}) = sum(log,x) + +function sumlog(x::AbstractArray{<:AbstractFloat}) + sig = one(T) + ex = zero(exponent(one(T))) + bound = floatmax(T) / 2 + for xj in x + sig *= significand(xj) + ex += exponent(xj) + if sig > bound + (a, b) = (significand(sig), exponent(sig)) + sig = a + ex += b + end + end + log(sig) + logtwo * ex +end diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl index 9369333..9aae0db 100644 --- a/src/optimisation/least_squares.jl +++ b/src/optimisation/least_squares.jl @@ -28,6 +28,7 @@ function estimate_graphon( progress_bar::Bool = false ) a = make_assignment(graph, h, initialise_rule) + @debug a initialise_stop_rule!(stop_rule, a, graph) greedy_improve!( a, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) From 6c615e351c99833d6776d55a6dd3918a121c5382 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 28 Apr 2025 16:49:18 +0200 Subject: [PATCH 134/266] rewrite and condense --- src/NetworkHistogram.jl | 72 +---- src/api.jl | 155 +---------- src/assignments/Assignments.jl | 104 -------- src/assignments/BernoulliAssignment/struct.jl | 118 --------- src/assignments/BernoulliAssignment/swap.jl | 99 ------- .../CategoricalAssignment/struct.jl | 128 --------- src/assignments/CategoricalAssignment/swap.jl | 201 -------------- src/assignments/SparseAssignment/struct.jl | 115 -------- src/assignments/SparseAssignment/swap.jl | 134 ---------- src/assignments/SumAssignment/struct.jl | 94 ------- src/assignments/SumAssignment/swap.jl | 119 --------- src/assignments/group_numbering.jl | 86 ------ src/assignments/include.jl | 4 - src/bootstrap.jl | 15 -- src/distributions/categorical_with_0.jl | 
141 ---------- src/distributions/discrete_dist.jl | 142 ---------- src/distributions/discretizer.jl | 248 ------------------ src/distributions/include.jl | 5 - src/distributions/markov_chain.jl | 144 ---------- src/distributions/utils.jl | 19 -- src/distributions/zero_inflated.jl | 93 ------- src/observations.jl | 231 ---------------- src/optimisation/config_rules/InitRule.jl | 99 ------- src/optimisation/config_rules/accept_rule.jl | 46 ---- .../config_rules/bandwidth_selection_rule.jl | 101 ------- src/optimisation/config_rules/include.jl | 5 - src/optimisation/config_rules/stop_rule.jl | 49 ---- src/optimisation/config_rules/swap_rule.jl | 27 -- src/optimisation/fit.jl | 99 ------- src/optimisation/include.jl | 3 - src/optimisation/least_squares.jl | 95 ------- src/optimisation/swap.jl | 26 -- src/sbm.jl | 186 ------------- test/TestNetworkHistogram.jl | 30 --- test/assignments/bernoulli_assignment.jl | 42 --- test/assignments/categorical_assignment.jl | 126 --------- test/assignments/default_assignment.jl | 17 -- test/assignments/sparse_assignment.jl | 120 --------- test/assignments/sum_assignment.jl | 9 - test/discretised_dist/discretizer.jl | 20 -- test/generated_tests/all.jl | 2 - test/generated_tests/test_distribution.jl | 84 ------ test/generated_tests/test_zero_inflated.jl | 97 ------- test/observations/discretisation.jl | 15 -- test/optimisation/config_rules/init_rule.jl | 46 ---- test/runtests.jl | 46 ++-- test/test_api.jl | 19 -- 47 files changed, 38 insertions(+), 3838 deletions(-) delete mode 100644 src/assignments/Assignments.jl delete mode 100644 src/assignments/BernoulliAssignment/struct.jl delete mode 100644 src/assignments/BernoulliAssignment/swap.jl delete mode 100644 src/assignments/CategoricalAssignment/struct.jl delete mode 100644 src/assignments/CategoricalAssignment/swap.jl delete mode 100644 src/assignments/SparseAssignment/struct.jl delete mode 100644 src/assignments/SparseAssignment/swap.jl delete mode 100644 
src/assignments/SumAssignment/struct.jl delete mode 100644 src/assignments/SumAssignment/swap.jl delete mode 100644 src/assignments/group_numbering.jl delete mode 100644 src/assignments/include.jl delete mode 100644 src/bootstrap.jl delete mode 100644 src/distributions/categorical_with_0.jl delete mode 100644 src/distributions/discrete_dist.jl delete mode 100644 src/distributions/discretizer.jl delete mode 100644 src/distributions/include.jl delete mode 100644 src/distributions/markov_chain.jl delete mode 100644 src/distributions/utils.jl delete mode 100644 src/distributions/zero_inflated.jl delete mode 100644 src/observations.jl delete mode 100644 src/optimisation/config_rules/InitRule.jl delete mode 100644 src/optimisation/config_rules/accept_rule.jl delete mode 100644 src/optimisation/config_rules/bandwidth_selection_rule.jl delete mode 100644 src/optimisation/config_rules/include.jl delete mode 100644 src/optimisation/config_rules/stop_rule.jl delete mode 100644 src/optimisation/config_rules/swap_rule.jl delete mode 100644 src/optimisation/fit.jl delete mode 100644 src/optimisation/include.jl delete mode 100644 src/optimisation/least_squares.jl delete mode 100644 src/optimisation/swap.jl delete mode 100644 src/sbm.jl delete mode 100644 test/TestNetworkHistogram.jl delete mode 100644 test/assignments/bernoulli_assignment.jl delete mode 100644 test/assignments/categorical_assignment.jl delete mode 100644 test/assignments/default_assignment.jl delete mode 100644 test/assignments/sparse_assignment.jl delete mode 100644 test/assignments/sum_assignment.jl delete mode 100644 test/discretised_dist/discretizer.jl delete mode 100644 test/generated_tests/all.jl delete mode 100644 test/generated_tests/test_distribution.jl delete mode 100644 test/generated_tests/test_zero_inflated.jl delete mode 100644 test/observations/discretisation.jl delete mode 100644 test/optimisation/config_rules/init_rule.jl delete mode 100644 test/test_api.jl diff --git a/src/NetworkHistogram.jl 
b/src/NetworkHistogram.jl index 36ba276..fc172f3 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -1,70 +1,18 @@ module NetworkHistogram - -using LinearAlgebra, SparseArrays -using Distributions, DensityInterface -using Graphs, SimpleWeightedGraphs -using PermutationSymmetricTensors -using ProgressMeter: Progress, next!, finish!, ProgressUnknown -import StatsBase, Random -using DensityInterface: logdensityof -using LogExpFunctions: xlogx, xlogy -using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen -using KrylovKit: eigsolve -import Metis -import IterativeSolvers -import Clustering -import StatsAPI: loglikelihood, fit -using CategoricalArrays, CategoricalDistributions -using Combinatorics: permutations +using StatsBase using StaticArrays -using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx, nrun, - zeros_tuple -import Bootstrap: bootstrap -import Base.maximum, Base.minimum -import Random: rand -import Base.convert -import Distributions: pdf, logpdf, ncategories, cdf, rand - -include("distributions/include.jl") -include("assignments/Assignments.jl") -include("sbm.jl") -include("observations.jl") -include("optimisation/include.jl") - -# more specialised and faster assignment types and methods -include("assignments/include.jl") - -include("api.jl") -include("bootstrap.jl") - -export nethist, nethist_discretised -export loglikelihood, fit, cdf, pdf -# export options for optimisation -export estimate_graphon -# starting assignment rules -export InitRule -export OrderedStart, RandomStart, SpectralStart, MetisStart, FromAssignment -# accept rules -export AcceptRule -export Strict -# stopping rules -export PreviousBestValue -# bandwidth selection rules -export OracleK, EstimatedEigenvalues, EstimatedDegrees, - select_number_node_per_block -# random local search rules -export RandomNodeSwap, RandomGroupSwap +include("utils/include.jl") +using .FastSymArray -# export useful function for manipulating assignments -export Assignment, 
number_groups, number_nodes -export get_ordered_adjacency_matrix, get_vertex_in_group, get_group_of_vertex -export BernoulliData, CategoricalData -export Observations, discretise -export DiscretizedDistribution +include("distributions_type.jl") +include("block_model.jl") +include("EdgeList.jl") +include("assignment.jl") +include("optimization/greedy.jl") -export Observations, estimate_graphon, nethist, nethist_discretised -export bootstrap +export EdgeList, neighbors, nodes +#include("include_old.jl") end diff --git a/src/api.jl b/src/api.jl index d09a6f3..e77b753 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,155 +1,20 @@ -""" - _default_init(dist::Distribution, start = MetisStart()) +function nethist(data_input, dist_user, initial_node_labels, params::GreedyParams) -Initialize the distribution with a default rule. + dist = Dist(dist_user) + g = preprocess_data(data_input, dist) -# Arguments -- `dist::Distribution`: The distribution to initialize. -- `start`: The starting method. + out = greedy_optimize(g, initial_node_labels, params) -# Returns -- `InitRule`: The initialization rule. -""" -function _default_init(dist::Distribution, start = MetisStart()) - if dist isa Bernoulli - return InitRule(start, Val{BernoulliData}()) - elseif dist isa Categorical - return InitRule(start, Val{CategoricalData}()) - elseif dist isa DiscretizedDistribution || dist isa ZeroInflatedCategorical - return InitRule(start, Val{SparseData}()) - else - return InitRule(start, nothing) - end + return postprocess(out) end -function _default_init(::DiscreteMarkovChain, start = RandomStart()) - return InitRule(start, Val{SumData}()) -end - -""" - _nethist(g::Observations{G, D}, h; kwargs...) - -Estimate the graphon and fit the model to the given graph observations. - -# Arguments -- `g::Observations{G, D}`: The graph observations. -- `h`: Number of nodes per block. -- `kwargs...`: Additional keyword arguments. -# Returns -- `fit_model`: The fitted model. 
-- `a`: The assignment of nodes to blocks. -""" -function _nethist(g::Observations{G, D}, h; kwargs...) where {G, D} - kwargs_dict = Dict(kwargs) - start_clustering = pop!(kwargs_dict, :start_clustering, MetisStart()) - initialise_rule = pop!( - kwargs_dict, :initialise_rule, _default_init( - g.dist_ref, start_clustering)) - a = estimate_graphon(g, h; - kwargs_dict..., initialise_rule = initialise_rule) - return fit(a, g), a +function preprocess_data(data, dist) + A = _fast_compressed_g.(dist, data) + return A, dist end -""" - nethist(g::Observations{G, D}; h, iterations, stalled_iter, swap_rule, accept_rule, progress_bar, start_clustering) - -Fit a Stochastic Block Model (SBM) to the given graph observations. - -# Arguments -- `g::Observations{G, D}`: The graph observations. -- `h`: Number of nodes per block. -- `iterations`: Maximum number of iterations. -- `stalled_iter`: Number of stalled iterations before stopping. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -- `progress_bar::Bool`: Whether to show a progress bar. -- `start_clustering`: Initial clustering method. - -# Returns -- `sbm`: The fitted SBM. -- `a`: The assignment of nodes to blocks. 
-""" -function nethist(g::Observations{G, D}; - h = select_number_node_per_block(g, EstimatedDegrees()), - iterations = 100_000, - stalled_iter = 1000, - swap_rule::NodeSwapRule = RandomGroupSwap(), - accept_rule::AcceptRule = Strict(), - progress_bar::Bool = false, - start_clustering = MetisStart() -) where {G, D} - return _nethist(g, h; - iterations = iterations, - swap_rule = swap_rule, - accept_rule = accept_rule, - stop_rule = PreviousBestValue(stalled_iter), - progress_bar = progress_bar, - start_clustering = start_clustering) -end - -""" - nethist_discretised(g::Observations{G, D}; number_levels, h, iterations, stalled_iter, swap_rule, accept_rule, progress_bar, start_clustering) - -Fit a discretised Stochastic Block Model (SBM) to the given graph observations. - -# Arguments -- `g::Observations{G, D}`: The graph observations. -- `number_levels`: Number of levels for discretisation. -- `h`: Number of nodes per block. -- `iterations`: Maximum number of iterations. -- `stalled_iter`: Number of stalled iterations before stopping. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -- `progress_bar::Bool`: Whether to show a progress bar. -- `start_clustering`: Initial clustering method. - -# Returns -- `sbm_discretise`: The fitted discretised SBM. -- `a`: The assignment of nodes to blocks. -- `discretiser`: The discretiser used. -""" -function nethist_discretised(g::Observations{G, D}; - number_levels = nothing, - h = select_number_node_per_block(g, EstimatedDegrees()), - iterations = 100_000, - stalled_iter = 1000, - swap_rule::NodeSwapRule = RandomGroupSwap(), - accept_rule::AcceptRule = Strict(), - progress_bar::Bool = false, - start_clustering = MetisStart() -) where {G, D} - num_groups = isnothing(number_levels) ? 
number_nodes(g) ÷ h : nothing - obs_discrete, discretiser = discretise( - g, number_groups = num_groups, number_levels = number_levels) - sbm_discretise, a = _nethist(obs_discrete, h; - iterations = iterations, - swap_rule = swap_rule, - accept_rule = accept_rule, - stop_rule = PreviousBestValue(stalled_iter), - progress_bar = progress_bar, - start_clustering = start_clustering) - return sbm_discretise, a, discretiser -end - - -function nethist_mc(g::Observations{G, <:DiscreteMarkovChain}; - h = number_nodes(g) ÷ 2, - iterations = 100_000, - stalled_iter = 1000, - swap_rule::NodeSwapRule = RandomGroupSwap(), - accept_rule::AcceptRule = Strict(), - progress_bar::Bool = true, - start_clustering = RandomStart() -) where {G} - initialise_rule = _default_init(g.dist_ref, start_clustering) - a = estimate_graphon(g, h; - iterations = iterations, - initialise_rule = initialise_rule, - swap_rule = swap_rule, - accept_rule = accept_rule, - stop_rule = PreviousBestValue(stalled_iter), - progress_bar = progress_bar) - return fit(a, g), a +function postprocess(out) + return BlockModel(optimal_a) end diff --git a/src/assignments/Assignments.jl b/src/assignments/Assignments.jl deleted file mode 100644 index c312324..0000000 --- a/src/assignments/Assignments.jl +++ /dev/null @@ -1,104 +0,0 @@ -include("group_numbering.jl") - -""" - struct Assignment{T, B} <: AbstractVector{Vector{Int}} - -A structure representing an assignment of nodes to groups. - -# Fields -- `group_size::GroupSize{T}`: The size of each group. -- `node_labels::Vector{Int}`: A vector of node labels. -- `additional_data::B`: Additional data associated with the assignment. - -# Constructor - Assignment(group_size::GroupSize{T}, node_labels, additional_data::B) where {T, B} - -Creates a new `Assignment` instance. - -# Arguments -- `group_size::GroupSize{T}`: The size of each group. -- `node_labels::Vector{Int}`: A vector of node labels. The length of this vector must be equal to the sum of `group_size`. 
-- `additional_data::B`: Additional data associated with the assignment. - -# Throws -- `ArgumentError`: If the length of `node_labels` is not equal to the sum of `group_size`. -""" -struct Assignment{T, B} <: AbstractVector{Vector{Int}} - group_size::GroupSize{T} - node_labels::Vector{Int} - additional_data::B - - function Assignment(group_size::GroupSize{T}, node_labels, - additional_data::B) where {T, B} - if length(node_labels) != sum(group_size) - throw(ArgumentError("The length of `node_labels` must be equal to the sum of \ - `group_size`")) - end - return new{T, B}(group_size, node_labels, additional_data) - end -end - -function Assignment(group_size::GroupSize, node_labels) - if length(node_labels) != sum(group_size) - throw(ArgumentError("The length of `node_labels` $(length(node_labels)) must be \ - equal to the sum of `group_size` $(sum(group_size))")) - end - c = StatsBase.countmap(node_labels) - if length(c) != length(group_size) - throw(ArgumentError("The number of unique elements in `node_labels` $(length(c)) \ - must be equal to the length of `group_size` $(length(group_size))")) - end - for (k, v) in c - if v != group_size[k] - throw(ArgumentError("The number of elements in `node_labels` $(v) for group \ - $(k) must be equal to the size of the group $(group_size[k])")) - end - end - return Assignment(group_size, node_labels, nothing) -end - -function number_groups(assignment::Assignment) - return length(assignment.group_size) -end - -function number_nodes(assignment::Assignment) - return length(assignment.node_labels) -end - -function get_vertex_in_group(assignment::Assignment, group) - return findall(assignment.node_labels .== group) -end - -function get_group_of_vertex(assignment::Assignment, vertex) - return assignment.node_labels[vertex] -end - -function get_edge_indices(a::Assignment, i, j) - if i == j - return get_edge_indices(a, i) - else - return [(x, y) for x in get_vertex_in_group(a, i) - for y in get_vertex_in_group(a, j)] - end -end - 
-function get_edge_indices(a::Assignment, i) - nodes_i = get_vertex_in_group(a, i) - return [(x, y) for x in nodes_i for y in nodes_i if x < y] -end - -Base.size(a::Assignment) = (number_groups(a),) -Base.@propagate_inbounds function Base.getindex(a::Assignment, i) - @boundscheck checkbounds(a, i) - return get_vertex_in_group(a, i) -end - -function get_ordered_adjacency_matrix(a::Assignment, A, by = identity) - perm = sortperm(a.node_labels, by = by) - return A[perm, perm] -end - -function Base.deepcopy(a::Assignment) - return Assignment( - a.group_size, copy(a.node_labels), deepcopy(a.additional_data)) -end diff --git a/src/assignments/BernoulliAssignment/struct.jl b/src/assignments/BernoulliAssignment/struct.jl deleted file mode 100644 index 73fab50..0000000 --- a/src/assignments/BernoulliAssignment/struct.jl +++ /dev/null @@ -1,118 +0,0 @@ -""" - mutable struct BernoulliData{F} - -A data structure to store information related to a Bernoulli assignment in a network. - -# Fields -- `counts::Matrix{Int}`: A matrix representing the maximum number of edges between groups. -- `realized::Matrix{Int}`: A matrix representing the number of edges between groups. -- `estimated_theta::Matrix{F}`: A matrix of estimated parameters (theta). -- `A::BitMatrix`: An adjacency matrix representing the network structure. 
-- `log_likelihood::F`: -""" -mutable struct BernoulliData{F} - counts::Matrix{Int} - realized::Matrix{Int} - estimated_theta::Matrix{F} - A::BitMatrix # possible improvement by using an adjacency list - log_likelihood::F -end - -const BernoulliAssignment{T, F} = Assignment{T, BernoulliData{F}} -const BernoulliInitRule{S, F} = InitRule{S, Val{BernoulliData}} - -function BernoulliAssignment( - g, group_size::GroupSize, node_labels::Vector{Int}) - bernoulli_data = make_bernoulli_data(g, node_labels, group_size) - return Assignment(group_size, node_labels, bernoulli_data) -end - -function make_assignment(g, h, init_rule::BernoulliInitRule) - group_size, - node_labels = initialize_node_labels( - g, h, init_rule.starting_assignment_rule) - return BernoulliAssignment(g, group_size, node_labels) -end - -# might be worth using graph accessors instead of the adjacency matrix ? -function make_bernoulli_data(g, node_labels, group_size) - number_groups = length(group_size) - n = length(node_labels) - counts = zeros(Int, number_groups, number_groups) - realized = zeros(Int, number_groups, number_groups) - A = convert_bitmatrix(g) - - # below needs to be abstracted: not sure how diagonal is handled if nonzero - # addtioally, we should be able to deal with missing values ! 
- # This concerns the counts matrix above as well - @inbounds @simd for k in 1:number_groups - for l in k:number_groups - realized[k, l] = sum(A[node_labels .== k, node_labels .== l]) - realized[l, k] = realized[k, l] - counts[k, l] = group_size[k] * group_size[l] - counts[l, k] = counts[k, l] - end - end - - @inbounds @simd for k in 1:number_groups - counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2 - realized[k, k] = sum(A[node_labels .== k, node_labels .== k]) ÷ 2 - end - - estimated_theta = realized ./ counts - ll = compute_log_likelihood(estimated_theta, counts) - return BernoulliData(counts, realized, estimated_theta, A, ll) -end - -function convert_bitmatrix(g::Observations{<:AbstractGraph, D}) where {D} - A = collect(adjacency_matrix(g.graph)) - return convert(BitMatrix, collect(adjacency_matrix(g.graph))) -end - -function convert_bitmatrix(g::Observations{<:AbstractMatrix, D}) where {D} - return convert(BitMatrix, g.graph) -end - -function compute_log_likelihood(estimated_theta::AbstractMatrix{F}, - counts::AbstractMatrix{T}) where {F <: Real, T <: Real} - number_groups = size(estimated_theta, 1) - loglik = zero(eltype(estimated_theta)) - @inbounds for j in 1:number_groups - @simd for i in j:number_groups - θ = estimated_theta[i, j] - loglik += (xlogx(θ) + xlogx(1 - θ)) * counts[i, j] - end - end - return loglik -end - -function loglikelihood(assignment::BernoulliAssignment) - return assignment.additional_data.log_likelihood -end - -loglikelihood(a::BernoulliAssignment, g::Observations) = loglikelihood(a) - -function force_recompute_ll(a::BernoulliAssignment, g::Observations) - a_simple = Assignment(a.group_size, a.node_labels) - return loglikelihood(a_simple, g) -end - -include("swap.jl") - -function get_ordered_adjacency_matrix(a::BernoulliAssignment, by = identity) - return get_ordered_adjacency_matrix(a, a.additional_data.A, by) -end - -# TODO: move to sparse structure to encode difference between 0 weight and absence of edge -# from docs: -# A = 
sparse(I,J,V) -# rows = rowvals(A) -# vals = nonzeros(A) -# m, n = size(A) -# for j = 1:n -# for i in nzrange(A, j) -# row = rows[i] -# val = vals[i] -# # perform sparse wizardry... -# end -# end diff --git a/src/assignments/BernoulliAssignment/swap.jl b/src/assignments/BernoulliAssignment/swap.jl deleted file mode 100644 index edfd9a3..0000000 --- a/src/assignments/BernoulliAssignment/swap.jl +++ /dev/null @@ -1,99 +0,0 @@ -mutable struct BernoulliSwap{F} <: Swap - index1::Int - index2::Int - realized::Matrix{Int} - estimated_theta::Matrix{F} - log_likelihood::F -end - -function make_swap( - a::BernoulliAssignment{T, F}, id) where {T, F} - return BernoulliSwap(id[1], id[2], copy(a.additional_data.realized), - copy(a.additional_data.estimated_theta), - a.additional_data.log_likelihood) -end - -function make_swap!(swap::BernoulliSwap{F}, a::BernoulliAssignment{T, F}, - id) where {T, F} - swap.index1, swap.index2 = id - copy!(swap.realized, a.additional_data.realized) - copy!(swap.estimated_theta, a.additional_data.estimated_theta) - swap.log_likelihood = a.additional_data.log_likelihood -end - -function revert_swap!( - a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - swap_node_labels!(a, swap.index1, swap.index2) - copy!(a.additional_data.realized, swap.realized) - copy!(a.additional_data.estimated_theta, swap.estimated_theta) - a.additional_data.log_likelihood = swap.log_likelihood -end - -function apply_swap!( - a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - update_observed_and_labels!(a, swap) - update_ll!(a) -end - -function update_observed_and_labels!( - a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - - for i in axes(a.additional_data.A, 2) - if i == swap.index1 || i == swap.index2 || - a.additional_data.A[swap.index1, i] == - a.additional_data.A[swap.index2, i] - continue - end - group_inter = 
get_group_of_vertex(a, i) - if a.additional_data.A[swap.index1, i] - a.additional_data.realized[g1, group_inter] -= 1 - a.additional_data.realized[ - group_inter, g1] = a.additional_data.realized[ - g1, group_inter] - - a.additional_data.realized[g2, group_inter] += 1 - a.additional_data.realized[ - group_inter, g2] = a.additional_data.realized[ - g2, group_inter] - end - if a.additional_data.A[swap.index2, i] - a.additional_data.realized[g2, group_inter] -= 1 - a.additional_data.realized[ - group_inter, g2] = a.additional_data.realized[ - g2, group_inter] - - a.additional_data.realized[g1, group_inter] += 1 - a.additional_data.realized[ - group_inter, g1] = a.additional_data.realized[ - g1, group_inter] - end - end - - @. a.additional_data.estimated_theta = a.additional_data.realized / - a.additional_data.counts - - # swap of the labels should happen after the update of the realized and estimated_theta - # for the above loop to work correctly - swap_node_labels!(a, swap.index1, swap.index2) - return nothing -end - -function update_ll!(a::BernoulliAssignment) - a.additional_data.log_likelihood = compute_log_likelihood( - a.additional_data.estimated_theta, a.additional_data.counts) - return nothing -end - -function fit(a::BernoulliAssignment, g::Observations) - dists = initialize_sbm(a.group_size, Bernoulli(0.5)) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - dists[group1, - group2] = Bernoulli(a.additional_data.estimated_theta[ - group1, group2]) - end - end - return dists -end diff --git a/src/assignments/CategoricalAssignment/struct.jl b/src/assignments/CategoricalAssignment/struct.jl deleted file mode 100644 index 5ccd175..0000000 --- a/src/assignments/CategoricalAssignment/struct.jl +++ /dev/null @@ -1,128 +0,0 @@ -mutable struct CategoricalData{F, C} - counts::Matrix{Int} - realized::Array{Int, 3} - estimated_theta::Array{F, 3} - A::Matrix{C} # possible use of CategoricalArrays.jl ? 
- log_likelihood::F # need to remove this type - scratch::Matrix{Int} -end - -const CategoricalAssignment{T, F, C} = Assignment{ - T, CategoricalData{F, C}} -const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} - -function CategoricalAssignment( - g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where { - G, D} - categorical_data = make_categorical_data(g, node_labels, group_size) - return Assignment(group_size, node_labels, categorical_data) -end - -function make_assignment(g, h, init_rule::CategoricalInitRule) - group_size, - node_labels = initialize_node_labels( - g, h, init_rule.starting_assignment_rule) - a = CategoricalAssignment(g, group_size, node_labels) - return a -end - -function make_categorical_data(g, node_labels, group_size) - number_groups = length(group_size) - A, num_categories = categorical_matrix(g) - counts = zeros(Int, number_groups, number_groups) - realized = zeros(Int, num_categories, number_groups, number_groups) - estimated_theta = zeros( - Float64, num_categories, number_groups, number_groups) - - _count_cat_occurences!( - counts, realized, g, Assignment(group_size, node_labels)) - - _fast_div!(estimated_theta, realized, counts) - scratch = zeros(Int, num_categories, number_groups) - - ll = compute_log_likelihood(estimated_theta, realized) - return CategoricalData(counts, realized, estimated_theta, A, ll, scratch) -end - -function _count_cat_occurences!(counts, realized, g, a_dummy) - @inbounds for k in 1:number_groups(a_dummy) - for l in k:number_groups(a_dummy) - counts_dict = StatsBase.countmap(get_obs.( - Ref(g), get_edge_indices(a_dummy, k, l))) - total = 0 - for (m, v) in counts_dict - realized[m, k, l] = v - realized[m, l, k] = v - total += v - end - counts[k, l] = total - counts[l, k] = total - end - end -end - -function recount_occurences!(a) - _count_cat_occurences!( - a.additional_data.counts, a.additional_data.realized, a.additional_data.A, a) - return nothing -end - -function 
compute_log_likelihood( - estimated_theta::Array{T, 3}, realized::Array{F, 3}) where { - T, F} - loglik = zero(T) - number_groups = size(estimated_theta, 2) - number_decorations = size(estimated_theta, 1) - @inbounds for j in 1:number_groups - for i in j:number_groups - for m in 1:number_decorations - if realized[m, i, j] != 0 - loglik += realized[m, i, j] * log(estimated_theta[m, i, j]) - end - end - #loglik += sum(log.(estimated_theta[i, j]) .* realized[i, j]) - #loglik += sum(xlogy.(realized[i,j], estimated_theta[i, j]) ) - end - end - return loglik -end - -function categorical_matrix(A::CategoricalMatrix) - @info "Converting CategoricalMatrix to matrix" - categories = levels(A) - return levelcode.(recode( - A, [l => i for (i, l) in enumerate(categories)]..., missing => 0)) -end - -# to update, just for test now -function categorical_matrix(A::AbstractMatrix{Int}) - min_A = minimum(A) - if min_A > 1 - A_inter = A .- min_A .+ 1 - else - A_inter = copy(A) - end - for i in 1:size(A_inter, 1) - A_inter[i, i] = 0 - end - return A_inter -end - -function categorical_matrix(g::Observations) - return categorical_matrix(g.graph), ncategories(g.dist_ref) -end - -function loglikelihood(a::CategoricalAssignment, g::Observations) - return a.additional_data.log_likelihood -end - -function force_recompute_ll(a::CategoricalAssignment, g::Observations) - a_simple = Assignment(a.group_size, a.node_labels) - return loglikelihood(a_simple, g) -end - -include("swap.jl") - -function get_ordered_adjacency_matrix(a::CategoricalAssignment, by = identity) - return get_ordered_adjacency_matrix(a, a.additional_data.A, by) -end diff --git a/src/assignments/CategoricalAssignment/swap.jl b/src/assignments/CategoricalAssignment/swap.jl deleted file mode 100644 index 43b6cd9..0000000 --- a/src/assignments/CategoricalAssignment/swap.jl +++ /dev/null @@ -1,201 +0,0 @@ -mutable struct CategoricalSwap{F} <: Swap - index1::Int - index2::Int - realized::Array{Int, 3} - estimated_theta::Array{F, 3} - 
log_likelihood::F -end - -function make_swap(a::CategoricalAssignment, id) - return CategoricalSwap(id[1], id[2], copy(a.additional_data.realized), - copy(a.additional_data.estimated_theta), - a.additional_data.log_likelihood) -end - -function copy_realized_and_theta!(a, b) - copy!(a.realized, b.realized) - copy!(a.estimated_theta, b.estimated_theta) - a.log_likelihood = b.log_likelihood - return nothing -end - -function make_swap!( - swap::CategoricalSwap{F}, a::CategoricalAssignment{T, F, C}, - id) where {T, F, C} - swap.index1, swap.index2 = id - copy_realized_and_theta!(swap, a.additional_data) -end - -function revert_swap!( - a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { - T, F, C} - swap_node_labels!(a, swap.index1, swap.index2) - copy_realized_and_theta!(a.additional_data, swap) - #copy!.(a.additional_data.realized, swap.realized) - #copy!.(a.additional_data.estimated_theta, swap.estimated_theta) - #a.additional_data.log_likelihood = swap.log_likelihood - #return nothing -end - -function apply_swap!( - a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { - T, F, C} - #update_observed_and_labels!(a, swap) - new_update_observed_and_labels!(a, swap) - update_ll!(a) -end - -function update_ll!(a::CategoricalAssignment) - a.additional_data.log_likelihood = compute_log_likelihood( - a.additional_data.estimated_theta, a.additional_data.realized) - return nothing -end - -function fit( - a::CategoricalAssignment{T, F, C}, g::Observations) where { - T, F, C} - dists = initialize_sbm( - a.group_size, g.dist_ref) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - dists[group1, - group2] = Categorical(a.additional_data.estimated_theta[:, - group1, group2]) - end - end - return dists -end - -function fit( - a::CategoricalAssignment{T, F, C}, g::Observations{ - G, <:DiscretizedDistribution}) where { - T, F, C, G} - dists = initialize_sbm( - a.group_size, g.dist_ref) - for group1 in 1:number_groups(a) - for group2 
in 1:number_groups(a) - set_params!( - dists[group1, - group2], a.additional_data.estimated_theta[:, - group1, group2]) - end - end - return dists -end - -function _move_connection!(realized, group_origin, group_dest, scratch) - @inbounds for group in axes(realized, 2) - for label in axes(realized, 1) - realized[label, group, group_origin] -= scratch[label, group] - realized[label, group, group_dest] += scratch[label, group] - realized[label, group_origin, group] = realized[ - label, group, group_origin] - realized[label, group_dest, group] = realized[ - label, group, group_dest] - end - end -end - -# need to rethink if want to use muli-threading -# check https://juliafolds.github.io/Transducers.jl/dev/tutorials/words/ -function new_update_observed_and_labels!( - a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { - T, F, C} - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - if g1 == g2 - return nothing - end - - a.additional_data.scratch .= 0 - for i in axes(a.additional_data.A, 1) - if i == swap.index1 || i == swap.index2 - continue - end - @inbounds obs = a.additional_data.A[i, swap.index1] - if obs != 0 - group_inter = get_group_of_vertex(a, i) - a.additional_data.scratch[obs, group_inter] += 1 - end - end - _move_connection!( - a.additional_data.realized, g1, g2, a.additional_data.scratch) - - a.additional_data.scratch .= 0 - for i in axes(a.additional_data.A, 1) - if i == swap.index1 || i == swap.index2 - continue - end - @inbounds obs = a.additional_data.A[i, swap.index2] - if obs != 0 - group_inter = get_group_of_vertex(a, i) - a.additional_data.scratch[obs, group_inter] += 1 - end - end - _move_connection!( - a.additional_data.realized, g2, g1, a.additional_data.scratch) - - _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, - a.additional_data.counts) - - # swap of the labels should happen after the update of the realized and estimated_theta - # for the above loop to work 
correctly - swap_node_labels!(a, swap.index1, swap.index2) - return nothing -end - -function update_observed_and_labels!( - a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { - T, F, C} - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - - adj_1 = @view a.additional_data.A[:, swap.index1] - adj_2 = @view a.additional_data.A[:, swap.index2] - - for i in axes(a.additional_data.A, 1) - if i == swap.index1 || i == swap.index2 - continue - end - obs_1 = adj_1[i] - obs_2 = adj_2[i] - group_inter = get_group_of_vertex(a, i) - if obs_1 != obs_2 - _fast_update!!( - a.additional_data.realized, g1, g2, obs_1, obs_2, group_inter) - end - end - - _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, - a.additional_data.counts) - - # swap of the labels should happen after the update of the realized and estimated_theta - # for the above loop to work correctly - swap_node_labels!(a, swap.index1, swap.index2) - return nothing -end - -function _fast_update!!(realized, g1, g2, obs_1, obs_2, g_inter) - realized[obs_1, g1, g_inter] -= 1 - realized[obs_1, g_inter, g1] = realized[obs_1, g1, g_inter] - - realized[obs_1, g2, g_inter] += 1 - realized[obs_1, g_inter, g2] = realized[obs_1, g2, g_inter] - - # send from group 2 to group 1 - realized[obs_2, g2, g_inter] -= 1 - realized[obs_2, g_inter, g2] = realized[obs_2, g2, g_inter] - - realized[obs_2, g1, g_inter] += 1 - realized[obs_2, g_inter, g1] = realized[obs_2, g1, g_inter] -end - -function _fast_div!(theta, realized, counts) - for j in axes(theta, 3) - for i in axes(theta, 2) - for m in axes(theta, 1) - theta[m, i, j] = realized[m, i, j] / counts[i, j] - end - end - end -end diff --git a/src/assignments/SparseAssignment/struct.jl b/src/assignments/SparseAssignment/struct.jl deleted file mode 100644 index 8251c25..0000000 --- a/src/assignments/SparseAssignment/struct.jl +++ /dev/null @@ -1,115 +0,0 @@ -mutable struct SparseData{F, C} - counts::Matrix{Int} - 
realized::Array{Int, 3} - estimated_theta::Array{F, 3} - A::SparseMatrixCSC{C, Int} - scratch_count::Matrix{Int} - scratch_missing::Vector{Int} - log_likelihood::F -end - -const SparseAssignment{T, F, C} = Assignment{ - T, SparseData{F, C}} -const SparseInitRule{S, F} = InitRule{S, Val{SparseData}} - -function SparseAssignment( - g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where { - G, D} - A = issparse(g.graph) ? g.graph : sparse(g.graph) - num_levels = ncategories(g.dist_ref) - sparse_data = SparseData( - A, size(group_size, 1), num_levels, group_size, node_labels) - return Assignment(group_size, node_labels, sparse_data) -end - -function make_assignment(g, h, init_rule::SparseInitRule) - group_size, - node_labels = initialize_node_labels( - g, h, init_rule.starting_assignment_rule) - return SparseAssignment(g, group_size, node_labels) -end - -function SparseData(A::SparseMatrixCSC{T, Int}, k::Int, - level_count::Int, group_size, node_labels) where {T} - n = size(A, 1) - data = SparseData(zeros(Int, k, k), zeros(Int, level_count, k, k), - zeros(Float64, level_count, k, k), dropzeros(A), zeros( - Int, level_count, k), zeros( - Int, k), 0.0) - _count_possible_occurences!(data, group_size) - _count_occurences!(data, node_labels) - _fast_div!(data.estimated_theta, data.realized, data.counts) - data.log_likelihood = compute_log_likelihood_without_0( - data.estimated_theta, data.realized, data.counts) - return data -end - -function _count_possible_occurences!(data, group_size) - k = size(group_size, 1) - for j in 1:k - data.counts[j, j] = group_size[j] * (group_size[j] - 1) ÷ 2 - for i in (j + 1):k - data.counts[i, j] = group_size[i] * group_size[j] - data.counts[j, i] = group_size[i] * group_size[j] - end - end -end - -function _count_occurences!(data, node_labels) - m, n = size(data.A) - for k in 1:length(unique(node_labels)) - for l in k:length(unique(node_labels)) - node_group_k = findall(x -> x == k, node_labels) - node_group_l = 
findall(x -> x == l, node_labels) - if k != l - counts = StatsBase.countmap(data.A[i, j] for i in node_group_k - for j in node_group_l if i != j) - else - counts = StatsBase.countmap(data.A[i, j] for i in node_group_k - for j in node_group_l if i < j) - end - for m in 1:size(data.realized, 1) - data.realized[m, k, l] = get(counts, m, 0) - data.realized[m, l, k] = get(counts, m, 0) - end - total_witouth_missing = sum(values(counts)) - - get(counts, missing, 0) - data.counts[k, l] = total_witouth_missing - data.counts[l, k] = total_witouth_missing - end - end -end - -function compute_log_likelihood_without_0( - estimated_theta::Array{T, 3}, realized::Array{F, 3}, counts) where { - T, F} - loglik = zero(T) - number_groups = size(estimated_theta, 2) - number_decorations = size(estimated_theta, 1) - for j in 1:number_groups - for i in j:number_groups - total_decorations = counts[i, j] - loglik -= xlogx(total_decorations) - for m in 1:number_decorations - loglik += xlogx(realized[m, i, j]) - total_decorations -= realized[m, i, j] - end - loglik += xlogx(total_decorations) - end - end - return loglik -end - -function _n_decorations_with_0(a::SparseAssignment) - return size(a.additional_data.estimated_theta, 1) + 1 -end - -function _n_decorations_not_0(a::SparseAssignment) - return size(a.additional_data.estimated_theta, 1) -end - -function loglikelihood(assignment::SparseAssignment, g::Observations) - return assignment.additional_data.log_likelihood -end - -include("swap.jl") diff --git a/src/assignments/SparseAssignment/swap.jl b/src/assignments/SparseAssignment/swap.jl deleted file mode 100644 index cdfda51..0000000 --- a/src/assignments/SparseAssignment/swap.jl +++ /dev/null @@ -1,134 +0,0 @@ -mutable struct SparseSwap{F} <: Swap - index1::Int - index2::Int - realized::Array{Int, 3} - estimated_theta::Array{F, 3} - counts::Matrix{Int} - log_likelihood::F -end - -function make_swap(a::SparseAssignment, id) - return SparseSwap(id[1], id[2], 
copy(a.additional_data.realized), - copy(a.additional_data.estimated_theta), copy(a.additional_data.counts), - a.additional_data.log_likelihood) -end - -function copy_addtional!(a, b) - copy!(a.realized, b.realized) - copy!(a.estimated_theta, b.estimated_theta) - copy!(a.counts, b.counts) - a.log_likelihood = b.log_likelihood - return nothing -end - -function make_swap!( - swap::SparseSwap{F}, a::SparseAssignment{T, F}, - id) where {T, F} - swap.index1, swap.index2 = id - copy_addtional!(swap, a.additional_data) -end - -function revert_swap!( - a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} - swap_node_labels!(a, swap.index1, swap.index2) - copy_addtional!(a.additional_data, swap) - return nothing -end - -# this function fails in presence of missing values -function apply_swap!( - a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} - update_observed_and_labels!(a, swap) - update_ll!(a) -end - -function update_ll!(a::SparseAssignment) - a.additional_data.log_likelihood = compute_log_likelihood_without_0( - a.additional_data.estimated_theta, a.additional_data.realized, a.additional_data.counts) - return nothing -end - -function update_observed_and_labels!( - a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - - if g1 == g2 - return nothing - end - - rows = rowvals(a.additional_data.A) - vals = nonzeros(a.additional_data.A) - m, n = size(a.additional_data.A) - for j in [swap.index1, swap.index2] - a.additional_data.scratch_count .= 0 - a.additional_data.scratch_missing .= 0 - g_from = swap.index1 == j ? g1 : g2 - g_to = swap.index1 == j ? 
g2 : g1 - for i_index in nzrange(a.additional_data.A, j) - row = rows[i_index] - if row == swap.index1 || row == swap.index2 - continue - end - val = vals[i_index] - groupi = get_group_of_vertex(a, row) - if ismissing(val) - a.additional_data.scratch_missing[groupi] += 1 - else - a.additional_data.scratch_count[val, groupi] += 1 - end - end - _move_connection!( - a.additional_data.realized, g_from, g_to, a.additional_data.scratch_count) - - _update_counts!( - a.additional_data.counts, g_from, g_to, a.additional_data.scratch_missing) - end - - _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, - a.additional_data.counts) - - # swap of the labels should happen after the update of the realized and estimated_theta - # for the above loop to work correctly - swap_node_labels!(a, swap.index1, swap.index2) - return nothing -end - -function _update_counts!(counts, g_from, g_to, missing_update) - for i in axes(counts, 1) - counts[i, g_to] = counts[i, g_to] - missing_update[i] - counts[i, g_from] = counts[i, g_from] + missing_update[i] - counts[g_to, i] = counts[i, g_to] - counts[g_from, i] = counts[i, g_from] - end -end - -function fit(a::SparseAssignment, g::Observations) - dists = initialize_sbm( - a.group_size, ZeroInflatedCategorical(_n_decorations_not_0(a))) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - theta = a.additional_data.estimated_theta[:, group1, group2] - dists[group1, - group2] = ZeroInflatedCategorical(1 - sum(theta), theta) - end - end - return dists -end - -function fit(a::SparseAssignment, - g::Observations{G, <:DiscretizedDistribution}) where {G} - dists = initialize_sbm(a.group_size, - DiscretizedDistribution( - g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_not_0(a)))) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - theta = a.additional_data.estimated_theta[:, group1, group2] - p = clamp(1 - sum(theta), 0, 1) - dists[group1, - group2] = DiscretizedDistribution( - 
g.dist_ref.discretizer, ZeroInflatedCategorical(p, theta)) - end - end - return dists -end diff --git a/src/assignments/SumAssignment/struct.jl b/src/assignments/SumAssignment/struct.jl deleted file mode 100644 index 8451eba..0000000 --- a/src/assignments/SumAssignment/struct.jl +++ /dev/null @@ -1,94 +0,0 @@ - -# type F needs to be a vector field! - -mutable struct SumData{F, C} - λ::SparseMatrixCSC{F, Int} - θ::Dict{Tuple{Int, Int}, F} - A::SparseMatrixCSC{C, Int} - counts::Dict{Tuple{Int, Int}, Int} - log_likelihood_per_group::Dict{Tuple{Int, Int}, Float64} - log_likelihood::Float64 -end - -const SumAssignment{T, F, C} = Assignment{T, SumData{F, C}} -const SumInitRule{S} = InitRule{S, Val{SumData}} - - - -function make_assignment(g, h, init_rule::SumInitRule) - group_size, - node_labels = initialize_node_labels( - g, h, init_rule.starting_assignment_rule) - return SumAssignment(g, group_size, node_labels) -end - -function SumAssignment(g::Observations, group_size::GroupSize, node_labels) - A = issparse(g.graph) ? 
g.graph : sparse(g.graph) - λ = fit.(Ref(g.dist_ref), A) - return SumAssignment(A, λ, group_size, node_labels) -end - -function SumAssignment( - A::SparseMatrixCSC{C, Int}, - λ::SparseMatrixCSC{F, Int}, group_size::GroupSize, node_labels::Vector{Int}) where { - F, C} - k = size(group_size, 1) - θ = Dict{Tuple{Int, Int}, F}() - counts = Dict{Tuple{Int, Int}, Int}() - - rows = rowvals(λ) - vals = nonzeros(λ) - m, n = size(λ) - for u in 1:n - for i in nzrange(λ,u) - v = rows[i] - if u >= v - # break # check that this isn't a mistake trying to be fast - continue - end - key_groups = minmax(node_labels[u], node_labels[v]) - param = vals[i] - if haskey(θ, key_groups) - θ[key_groups] += param - else - θ[key_groups] = param - end - if haskey(counts, key_groups) - counts[key_groups] += 1 - else - counts[key_groups] = 1 - end - end - end - for i in 1:k - for j in i:k - θ[minmax(i, j)] /= counts[minmax(i, j)] - end - end - ll_sum = 0.0 - ll = Dict{Tuple{Int, Int}, Float64}() - for i in 1:k - for j in i:k - ll[(i, j)] = 0.0 - end - end - for u in 1:n - for i in nzrange(λ, u) - v = rows[i] - if u >= v - continue - end - key_groups = minmax(node_labels[u], node_labels[v]) - ll[minmax( - node_labels[u], node_labels[v])] += loglikelihood(θ[key_groups], A[u, v]) - end - end - ll_sum = sum(values(ll)) - return Assignment(group_size, node_labels, SumData(λ, θ, A, counts, ll, ll_sum)) -end - -function loglikelihood(assignment::SumAssignment, g::Observations) - return sum(values(assignment.additional_data.log_likelihood)) -end - -include("swap.jl") diff --git a/src/assignments/SumAssignment/swap.jl b/src/assignments/SumAssignment/swap.jl deleted file mode 100644 index 4bd46de..0000000 --- a/src/assignments/SumAssignment/swap.jl +++ /dev/null @@ -1,119 +0,0 @@ -mutable struct SumSwap{F} <: Swap - index1::Int - index2::Int - θ::Dict{Tuple{Int, Int}, F} - counts::Dict{Tuple{Int, Int}, Int} - log_likelihood_per_group::Dict{Tuple{Int, Int}, Float64} - log_likelihood::Float64 -end - 
-function make_swap(a::SumAssignment, id) - return SumSwap(id[1], id[2], deepcopy(a.additional_data.θ), - deepcopy(a.additional_data.counts), deepcopy(a.additional_data.log_likelihood_per_group), - a.additional_data.log_likelihood) -end - -function make_swap!(swap::SumSwap{F}, a::SumAssignment{T, F}, id) where {T, F} - swap.index1, swap.index2 = id - swap.θ = deepcopy(a.additional_data.θ) - swap.counts = deepcopy(a.additional_data.counts) - swap.log_likelihood_per_group = deepcopy(a.additional_data.log_likelihood_per_group) - swap.log_likelihood = a.additional_data.log_likelihood -end - -function revert_swap!( - a::SumAssignment{T, F}, swap::SumSwap{F}) where {T, F} - swap_node_labels!(a, swap.index1, swap.index2) - a.additional_data.θ = deepcopy(swap.θ) - a.additional_data.counts = deepcopy(swap.counts) - a.additional_data.log_likelihood_per_group = deepcopy(swap.log_likelihood_per_group) - a.additional_data.log_likelihood = swap.log_likelihood -end - -function apply_swap!( - a::SumAssignment{T, F}, swap::SumSwap{F}) where {T, F} - λ = a.additional_data.λ - rows = rowvals(λ) - vals = nonzeros(λ) - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - if g1 == g2 - return nothing - end - - for i in nzrange(λ, swap.index1) - v = rows[i] - key_old_groups = minmax(g1, a.node_labels[v]) - key_new_groups = minmax(g2, a.node_labels[v]) - c_og = a.additional_data.counts[key_old_groups] - c_ng = a.additional_data.counts[key_new_groups] - param = vals[i] - a.additional_data.θ[key_old_groups] = (a.additional_data.θ[key_old_groups]*c_og - - param)/(c_og - 1) - a.additional_data.θ[key_new_groups] = (a.additional_data.θ[key_new_groups]*c_ng + - param)/(c_ng + 1) - a.additional_data.counts[key_old_groups] -= 1 - a.additional_data.counts[key_new_groups] += 1 - end - - for i in nzrange(λ, swap.index2) - v = rows[i] - key_old_groups = minmax(g2, a.node_labels[v]) - key_new_groups = minmax(g1, a.node_labels[v]) - c_og = 
a.additional_data.counts[key_old_groups] - c_ng = a.additional_data.counts[key_new_groups] - param = vals[i] - a.additional_data.θ[key_old_groups] = (a.additional_data.θ[key_old_groups]*c_og - - param)/(c_og - 1) - a.additional_data.θ[key_new_groups] = (a.additional_data.θ[key_new_groups]*c_ng + - param)/(c_ng + 1) - a.additional_data.counts[key_old_groups] -= 1 - a.additional_data.counts[key_new_groups] += 1 - end - - swap_node_labels!(a, swap.index1, swap.index2) - fast_update_ll!(a, swap) -end - -function fast_update_ll!(a::SumAssignment, swap::SumSwap) - k = size(a.group_size, 1) - for i in 1:k - for j in i:k - index_group = (i, j) - if swap.θ[index_group] != a.additional_data.θ[index_group] - _update_ll_one_group!(a, index_group) - end - end - end - a.additional_data.log_likelihood = sum(values(a.additional_data.log_likelihood_per_group)) -end - -function _update_ll_one_group!(a::SumAssignment, group) - nodes_1 = findall(x -> x == group[1], a.node_labels) - nodes_2 = findall(x -> x == group[2], a.node_labels) - ll = 0.0 - rows = rowvals(a.additional_data.λ) - for u in nodes_1 - for v in intersect(rows[nzrange(a.additional_data.λ, u)], nodes_2) - ll += loglikelihood( - a.additional_data.θ[group], a.additional_data.A[u, v]) - end - end - a.additional_data.log_likelihood_per_group[group] = ll - return nothing -end - -function fit( - a::SumAssignment{T, F, C}, g::Observations{ - G, <:DiscreteMarkovChain}) where { - T, F, C, G} - dists = initialize_sbm( - a.group_size, g.dist_ref) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - dists[ - group1, group2] = a.additional_data.θ[minmax(group1, group2)] - end - end - return dists -end diff --git a/src/assignments/group_numbering.jl b/src/assignments/group_numbering.jl deleted file mode 100644 index 879e909..0000000 --- a/src/assignments/group_numbering.jl +++ /dev/null @@ -1,86 +0,0 @@ -""" -Array-like storage for the number of nodes in each group. 
Try to split the number of nodes -into equal groups, but if it is not possible, the last group may have more nodes. -""" -struct GroupSize{T} <: AbstractVector{Int} - group_number::T - number_groups::Int - - function GroupSize(number_nodes, h::Real) - @assert 0 < h < 1 - standard_group = floor(Int, number_nodes * h) - GroupSize(number_nodes, standard_group) - end - - function GroupSize(number_nodes, standard_group::Integer) - @assert 1 < standard_group <= number_nodes - number_groups = number_nodes ÷ standard_group # number of standard groups! - if number_groups * standard_group == number_nodes - new{Int}(standard_group, number_groups) - else - remainder_group = standard_group + - mod(number_nodes, standard_group) - new{Tuple{Int, Int}}( - (standard_group, remainder_group), number_groups) - end - end -end - -Base.size(g::GroupSize) = (g.number_groups,) -Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) - @boundscheck checkbounds(g, i) - return g.group_number -end - -Base.@propagate_inbounds function Base.getindex( - g::GroupSize{Tuple{Int, Int}}, i::Int) - @boundscheck checkbounds(g, i) - return i < length(g) ? 
g.group_number[1] : g.group_number[2] -end - -function check_compatiblity!(node_labels, g::GroupSize) - counts = StatsBase.countmap(node_labels) - - if length(counts) != g.number_groups - throw(ArgumentError("The vector of node labels is not compatible with the \ - group size: number of group in labels $(length(counts)) != expected number $(g.number_groups)")) - end - if size(node_labels, 1) != sum(g) - throw(ArgumentError("The vector of node labels is not compatible with the \ - group size: number of node labels $(size(node_labels, 1)) != expected number of nodes $(sum(g))")) - end - unbalanced = any(((k, v),) -> v != g[k], counts) - if unbalanced - @debug "The group size is unbalanced, trying to fix it : $(counts)" - g, node_labels = try_fixing_group_size!(node_labels, g) - if any(((k, v),) -> v != g[k], StatsBase.countmap(node_labels)) - throw(ArgumentError("Could not fix the group size")) - else - @debug "Fixed the group size by moving nodes between groups" - end - end -end - -function try_fixing_group_size!(node_labels, g::GroupSize) - counts = StatsBase.countmap(node_labels) - groups_too_small = filter(((k, v),) -> v < g[k], counts) - groups_too_large = filter(((k, v),) -> v > g[k], counts) - amount_too_small = sum(g[k] - v for (k, v) in groups_too_small) - amount_too_large = sum(v - g[k] for (k, v) in groups_too_large) - if amount_too_small == amount_too_large - nodes_to_move = [] - for (l, v) in groups_too_large - number_nodes_to_move = v - g[l] - nodes_to_move = vcat(nodes_to_move, - findall(x -> x == l, node_labels)[1:number_nodes_to_move]) - end - for (k, v) in groups_too_small - number_nodes_to_move = g[k] - v - for i in 1:number_nodes_to_move - index = popfirst!(nodes_to_move) - node_labels[index] = k - end - end - end - return g, node_labels -end diff --git a/src/assignments/include.jl b/src/assignments/include.jl deleted file mode 100644 index 5829223..0000000 --- a/src/assignments/include.jl +++ /dev/null @@ -1,4 +0,0 @@ 
-include("BernoulliAssignment/struct.jl") -include("CategoricalAssignment/struct.jl") -include("SparseAssignment/struct.jl") -include("SumAssignment/struct.jl") diff --git a/src/bootstrap.jl b/src/bootstrap.jl deleted file mode 100644 index 77086dd..0000000 --- a/src/bootstrap.jl +++ /dev/null @@ -1,15 +0,0 @@ -function bootstrap( - statistic::Function, data::AbstractMatrix, model::BlockModel, - sampling::BootstrapSampling) - t0 = tx(statistic(data)) - m = nrun(sampling) - t1 = zeros_tuple(t0, m) - data1 = copy(data) - for i in 1:m - draw_and_fill!(data1, model) - for (j, t) in enumerate(tx(statistic(data1))) - t1[j][i] = t - end - end - return ParametricBootstrapSample(t0, t1, statistic, data, model, sampling) -end diff --git a/src/distributions/categorical_with_0.jl b/src/distributions/categorical_with_0.jl deleted file mode 100644 index 045d45b..0000000 --- a/src/distributions/categorical_with_0.jl +++ /dev/null @@ -1,141 +0,0 @@ -""" - struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution - -A zero-inflated categorical distribution that combines a Bernoulli distribution with a categorical distribution. - -# Fields -- `edge_proba::B`: The Bernoulli distribution representing the probability of zero. -- `dist::D`: The categorical distribution. - -# Constructors -- `ZeroInflatedCategorical(p::Real, dist::D)`: Creates a zero-inflated categorical distribution with probability `p` of zero and categorical distribution `dist`. - -# Mathematical Explanation -The zero-inflated categorical distribution modifies the original categorical distribution by introducing a probability `p` of zero. The `pmf` and `cdf` are adjusted accordingly: -- `pdf(x) = p * δ(x) + (1 - p) * pmf_original(x)` -- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` -where `δ(x)` is the Dirac delta function. -""" -struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution - edge_proba::B - dist::D -end - -_dirac_delta(x) = x == 0 ? 
one(x) : zero(x) -_dirac_delta(x, lb, ub) = lb <= x <= ub ? one(x) : zero(x) - -function ZeroInflatedCategorical(p::Real, dist::D) where {D} - if p < 0 - p = zero(p) - elseif p > 1 - p = one(p) - end - return ZeroInflatedCategorical(Bernoulli(1 - p), dist) -end - -function ZeroInflatedCategorical(p::Real, probs::AbstractVector) - if sum(probs) == 0 - probs_ = ones(length(probs)) / length(probs) - else - probs_ = probs / sum(probs) - end - if p < 0 - p = zero(p) - elseif p > 1 - p = one(p) - end - return ZeroInflatedCategorical(p, Categorical(probs_)) -end - -function ZeroInflatedCategorical(vec_probs::AbstractVector) - ZeroInflatedCategorical(vec_probs[1], vec_probs[2:end]) -end - -function ZeroInflatedCategorical(k::Int) - ZeroInflatedCategorical(ones(k + 1) ./ (k + 1)) -end - -""" - Distributions.pdf(d::ZeroInflatedCategorical, x::Real) - -Computes the probability mass function (pmf) of the zero-inflated categorical distribution `d` at `x`. - -# Mathematical Explanation -The `pmf` of the zero-inflated categorical distribution is given by: -- `pmf(x) = p * δ(x) + (1 - p) * pmf_original(x)` -where `p` is the probability of zero, `δ(x)` is the Dirac delta function, and `pmf_original(x)` is the pmf of the original categorical distribution. -""" -function Distributions.pdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + - pdf(d.edge_proba, one(x)) * pdf(d.dist, x) -end - -""" - rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) - -Generates a random sample from the zero-inflated categorical distribution `d` using the random number generator `rng`. 
-""" -function rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) - return rand(rng, d.edge_proba) * rand(rng, d.dist) -end - -logpdf(d::ZeroInflatedCategorical, x::Real) = log(pdf(d, x)) - -minimum(d::ZeroInflatedCategorical) = min(minimum(d.dist), 0) - -maximum(d::ZeroInflatedCategorical) = max(maximum(d.dist), 0) - -insupport(d::ZeroInflatedCategorical, x::Real) = x == 0 || insupport(d.dist, x) - -""" - Distributions.cdf(d::ZeroInflatedCategorical, x::Real) - -Computes the cumulative distribution function (cdf) of the zero-inflated categorical distribution `d` at `x`. - -# Mathematical Explanation -The `cdf` of the zero-inflated categorical distribution is given by: -- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` -where `p` is the probability of zero, `δ(x)` is the Dirac delta function, and `cdf_original(x)` is the cdf of the original categorical distribution. -""" -function Distributions.cdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, 0, Inf) + - pdf(d.edge_proba, one(x)) * cdf(d.dist, x) -end - -function Distributions.params(d::ZeroInflatedCategorical) - (first(params(d.edge_proba)), params(d.dist)...) -end - -ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) - -""" - Distributions.fit(::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) - -Fits a zero-inflated categorical distribution to the given data. -""" -function Distributions.fit( - ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where { - B, D <: Categorical} - indices_0 = findall(x -> x == 0, data) - p = length(indices_0) / length(data) - if p != 1 - dist = fit_mle(Categorical, n_cat, data[setdiff(1:end, indices_0)]) - return ZeroInflatedCategorical(p, dist) - else - return ZeroInflatedCategorical(1.0, zeros(n_cat)) - end -end - -function get_params_cat_like(dist::ZeroInflatedCategorical) - p = first(params(dist.edge_proba)) - probs = vcat(params(dist.dist)...) 
- return vcat(1 - p, probs .* p) -end - -function Base.convert(::Type{<:ZeroInflatedCategorical}, d::D) where {D} - return ZeroInflatedCategorical(1.0, d) -end - -function Base.convert(T::Type{<:Categorical}, d::ZeroInflatedCategorical) - return T(get_params_cat_like(d)) -end diff --git a/src/distributions/discrete_dist.jl b/src/distributions/discrete_dist.jl deleted file mode 100644 index 55af36b..0000000 --- a/src/distributions/discrete_dist.jl +++ /dev/null @@ -1,142 +0,0 @@ -""" - struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution - -A discretized distribution that combines a discretizer with a zero-inflated categorical distribution. - -# Fields -- `discretizer::D`: The discretizer used to discretize the continuous distribution. -- `probs::L`: The zero-inflated categorical distribution representing the discretized probabilities. - -# Constructors -- `DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d))`: Creates a discretized distribution with `n_bins` bins and support bound `support_bound`. - -# Mathematical Explanation -The discretized distribution modifies the original continuous distribution by dividing it into `n_bins` bins. The `pdf` and `cdf` are adjusted accordingly: -- `pdf(x) = pdf_discretized(bin) / bin_width` -- `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` -""" -mutable struct DiscretizedDistribution{D, L} <: - ContinuousUnivariateDistribution where {D, L} - discretizer::D - probs::L -end - -function DiscretizedDistribution( - d::D, n_bins::Int, support_bound = extrema(d)) where {D} - disc = DiscretizerZeroToZero(n_bins, support_bound...) 
- ps = zeros(non_zero_labels_counts(disc)) - for i in 1:non_zero_labels_counts(disc) - lb, ub = decode(disc, i) - ps[i] = cdf(d, ub) - cdf(d, lb) - end - probs = ZeroInflatedCategorical(0.0, ps) - return DiscretizedDistribution(disc, probs) -end - -function DiscretizedDistribution( - d::ZeroInflated, n_bins::Int, support_bound = extrema(d)) - disc = DiscretizerZeroToZero(n_bins, support_bound...) - ps = zeros(non_zero_labels_counts(disc)) - for i in 1:non_zero_labels_counts(disc) - lb, ub = decode(disc, i) - ps[i] = cdf(d, ub) - cdf(d, lb) - end - probs = ZeroInflatedCategorical(get_proba_zero(d), ps) - return DiscretizedDistribution(disc, probs) -end - -function DiscretizedDistribution(discretizer::Discretizer) - return DiscretizedDistribution( - discretizer, ZeroInflatedCategorical(non_zero_labels_counts(discretizer))) -end - -""" - rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) - -Generates a random sample from the discretized distribution `d` using the random number generator `rng`. -""" -function rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) - bin = rand(rng, d.probs) - return _decode_randomly(rng, d.discretizer, bin) -end - -minimum(d::DiscretizedDistribution) = minimum(d.discretizer) - -maximum(d::DiscretizedDistribution) = maximum(d.discretizer) - -function insupport(d::DiscretizedDistribution, x::Real) - support_encoding(d.discretizer, x) -end - -function Base.convert(::Type{DiscretizedDistribution}, d::D) where {D} - return DiscretizedDistribution(d, 10) -end - -ncategories(d::DiscretizedDistribution) = ncategories(d.probs) - -function Distributions.fit( - ::Type{<:DiscretizedDistribution{D, L}}, data) where {D, L} - return fit(L, data) -end - -function set_params!(d::DiscretizedDistribution{D, L}, params) where {D, L} - d.probs = L(params...) -end - -""" - Distributions.pdf(d::DiscretizedDistribution, x::Real) - -Computes the probability density function (pdf) of the discretized distribution `d` at `x`. 
- -# Mathematical Explanation -The `pdf` of the discretized distribution is computed as: -- `pdf(x) = pdf_discretized(bin) / bin_width` -""" -function pdf(d::DiscretizedDistribution, x::Real) - if x == 0 - return pdf(d.probs, zero(x)) - end - if !support_encoding(d.discretizer, x) - return zero(x) - end - bin = encode(d.discretizer, x) - return pdf(d.probs, bin) / binwidth(d.discretizer) -end - -""" - Distributions.logpdf(d::DiscretizedDistribution, x::Real) - -Computes the log of the probability density function (logpdf) of the discretized distribution `d` at `x`. -""" -function logpdf(d::DiscretizedDistribution, x::Real) - if !support_encoding(d.discretizer, x) - return -Inf - end - x == 0 && return log(pdf(d.probs, x)) - bin = encode(d.discretizer, x) - return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) -end - -""" - Distributions.cdf(d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} - -Computes the cumulative distribution function (cdf) of the discretized distribution `d` at `x`. 
- -# Mathematical Explanation -The `cdf` of the discretized distribution is computed as: -- `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` -""" -function Distributions.cdf( - d::DiscretizedDistribution{D, P}, x::Real) where { - D, P <: ZeroInflatedCategorical} - x < minimum(d) && return zero(x) - x > maximum(d) && return one(x) - bin = encode(d.discretizer, x) - result = (x == 0) * cdf(d.probs, x) - if bin != 0 - result += cdf(d.probs, bin - 1) + - (cdf(d.probs, bin) - cdf(d.probs, bin - 1)) * - progress_in_bin(d.discretizer, x, bin) - end - return result -end diff --git a/src/distributions/discretizer.jl b/src/distributions/discretizer.jl deleted file mode 100644 index e039930..0000000 --- a/src/distributions/discretizer.jl +++ /dev/null @@ -1,248 +0,0 @@ -# Inspired by Discretizer.jl but with the fast decoding function and built-in -# convention for discretizing continuous distributions. -abstract type Discretizer end - -function encode(d::Discretizer, x::AbstractArray{<:Real}) - return [encode(d, u) for u in x] -end - -function decode(d::Discretizer, x::AbstractArray{<:Real}) - return [decode(d, u) for u in x] -end - -""" -Uniformly discretizes a continuous distribution into a fixed number of bins of equal width. 
-""" -struct RegularDiscretizer{F, T, L} <: Discretizer - n_bins::Int - lower_bound::F - upper_bound::F - bin_labels::MVector{L, T} - bin_width::F -end - -function RegularDiscretizer( - n_bins::Int, lower_bound::F, upper_bound::F) where {F} - if !isfinite(lower_bound) || !isfinite(upper_bound) - throw(ArgumentError("RegularDiscretizer requires finite lower and upper bounds.")) - end - bin_width = (upper_bound - lower_bound) / n_bins - return RegularDiscretizer( - n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), bin_width - ) -end - -function support_encoding(d::RegularDiscretizer, x::Real) - return d.lower_bound <= x <= d.upper_bound -end - -function minimum(d::RegularDiscretizer) - return d.lower_bound -end - -function maximum(d::RegularDiscretizer) - return d.upper_bound -end - -function encode(d::RegularDiscretizer, x::Real) - if x >= d.upper_bound - return d.n_bins - end - return d.bin_labels[convert(Int, div(x - d.lower_bound, d.bin_width) + 1)] -end - -function _decode_randomly( - rng::Random.AbstractRNG, d::RegularDiscretizer, bin::Int) - hi, lo = decode(d, bin) - return lo + (hi - lo) * rand(rng) -end - -function binwidth(d::RegularDiscretizer) - return d.bin_width -end - -function decode(d::RegularDiscretizer, bin::Int) - return (d.lower_bound + (bin - 1) * d.bin_width, - d.lower_bound + bin * d.bin_width) -end - -function encode(d::RegularDiscretizer, x::AbstractArray{Real}) - return [encode(d, u) for u in x] -end - -function decode(d::RegularDiscretizer, x::AbstractArray{Real}) - return [decode(d, u) for u in x] -end - -function nlabels(d::RegularDiscretizer) - return d.n_bins -end - -non_zero_labels_counts(d::RegularDiscretizer) = nlabels(d) - -""" -Maps a set of categories to a set of bins -""" -struct CategoryDiscretizer{F, T} - cat_to_bin::Dict{F, T} - bin_to_cat::Dict{T, F} - min_label::T - max_label::T -end - -function CategoryDiscretizer(cat_to_bin::Dict, bin_to_cat::Dict) - min_label = minimum(keys(bin_to_cat)) - max_label = 
maximum(keys(bin_to_cat)) - return CategoryDiscretizer(cat_to_bin, bin_to_cat, min_label, max_label) -end - -function support_encoding(d::CategoryDiscretizer, x) - return haskey(d.cat_to_bin, x) -end - -function encode(d::CategoryDiscretizer, x) - return d.cat_to_bin[x] -end - -function decode(d::CategoryDiscretizer, label) - return d.bin_to_cat[label] -end - -function nlabels(d::CategoryDiscretizer) - return length(d.bin_to_cat) -end - -function binwidth(d::CategoryDiscretizer{F, T}, x::T) where {F, T} - return length(d.bin_to_cat[x]) -end - -function non_zero_labels_counts(d::CategoryDiscretizer) - if 0 ∈ keys(d.bin_to_cat) - return length(d.bin_to_cat) - 1 - else - return length(d.bin_to_cat) - end -end - -function minimum(d::CategoryDiscretizer) - return d.min_label -end - -function maximum(d::CategoryDiscretizer) - return d.max_label -end - -""" -Uniformly discretizes a continuous distribution into a fixed number of bins of equal width, -with additional bins for missing or special values. 
-""" -struct HybridDiscretizer{F, T, L} <: Discretizer - lin::RegularDiscretizer{F, T, L} - cat::CategoryDiscretizer{F, T} -end - -# change so that atoms can be packed together if wanted -function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) - cat_to_bin = Dict(a => n_bins + i for (i, a) in enumerate(atoms)) - bin_to_cat = Dict(n_bins + i => a for (i, a) in enumerate(atoms)) - bin_width = (upper_bound - lower_bound) / n_bins - return HybridDiscretizer( - RegularDiscretizer{typeof(bin_width), Int, n_bins}( - n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), - (upper_bound - lower_bound) / n_bins), - CategoryDiscretizer(cat_to_bin, bin_to_cat) - ) -end - -function DiscretizerZeroToZero(n_bins, lower_bound, upper_bound) - cat_to_bin = Dict([0.0 => 0]) - bin_to_cat = Dict([0 => 0.0]) - bin_width = (upper_bound - lower_bound) / n_bins - return HybridDiscretizer( - RegularDiscretizer{typeof(bin_width), Int, n_bins}( - n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), - (upper_bound - lower_bound) / n_bins), - CategoryDiscretizer(cat_to_bin, bin_to_cat) - ) -end - -function support_encoding(d::HybridDiscretizer, x) - return support_encoding(d.lin, x) || support_encoding(d.cat, x) -end - -function minimum(d::HybridDiscretizer) - return min(minimum(d.lin), minimum(d.cat)) -end - -function maximum(d::HybridDiscretizer) - return max(maximum(d.lin), maximum(d.cat)) -end - -function nlabels(d::HybridDiscretizer) - return nlabels(d.lin) + nlabels(d.cat) -end - -function non_zero_labels_counts(d::HybridDiscretizer) - return non_zero_labels_counts(d.lin) + non_zero_labels_counts(d.cat) -end - -binwidth(d::HybridDiscretizer) = binwidth(d.lin) - -function binwidth(d::HybridDiscretizer, bin) - if haskey(d.cat.cat_to_bin, bin) - return binwidth(d.cat, bin) - else - return binwidth(d.lin) - end -end - -function encode(d::HybridDiscretizer, x::Real) - if haskey(d.cat.cat_to_bin, x) - return encode(d.cat, x) - else - return encode(d.lin, x) - end -end 
- -function decode(d::HybridDiscretizer, bin::Int) - if haskey(d.cat.bin_to_cat, bin) - return decode(d.cat, bin) - else - return decode(d.lin, bin) - end -end - -function _decode_randomly( - rng::Random.AbstractRNG, d::HybridDiscretizer, bin::Int) - if haskey(d.cat.bin_to_cat, bin) - return decode(d.cat, bin) - else - return _decode_randomly(rng, d.lin, bin) - end -end - -function auto_nbins(data) - binwidth = 2iqr(data) / cbrt(n) - lo, hi = extrema(data) - nbins_fd = ceil(Int, (hi - lo) / binwidth) - nbins_sturges = ceil(Int, log(2, n)) + 1 - nbins = max(nbins_fd, nbins_sturges) - return nbins -end - -function progress_in_bin(d::CategoryDiscretizer, x::Real, bin) - return one(x) -end - -function progress_in_bin(d::RegularDiscretizer, x::Real, bin) - lo, hi = decode(d, bin) - return (x - lo) / (hi - lo) -end - -function progress_in_bin(d::HybridDiscretizer, x::Real, bin) - if haskey(d.cat.bin_to_cat, bin) - return progress_in_bin(d.cat, x, bin) - else - return progress_in_bin(d.lin, x, bin) - end -end diff --git a/src/distributions/include.jl b/src/distributions/include.jl deleted file mode 100644 index cc5c557..0000000 --- a/src/distributions/include.jl +++ /dev/null @@ -1,5 +0,0 @@ -include("categorical_with_0.jl") -include("discretizer.jl") -include("zero_inflated.jl") -include("discrete_dist.jl") -include("markov_chain.jl") diff --git a/src/distributions/markov_chain.jl b/src/distributions/markov_chain.jl deleted file mode 100644 index 28a21b8..0000000 --- a/src/distributions/markov_chain.jl +++ /dev/null @@ -1,144 +0,0 @@ -# if S is Int, assume the states are ordered and sequential -# should store everything in transpose, will be faster but way more -# complicated to read -struct DiscreteMarkovChain{S, M <: AbstractMatrix} - states::Vector{S} - transitions::M -end - -struct SampleChain{S, M <: AbstractMatrix} - states::Vector{S} - indices::Vector{Int} - transitions::M -end - -Base.zero(::DiscreteMarkovChain) = DiscreteMarkovChain(Int[], zeros(Int, 0, 0)) 
-Base.zero(::SampleChain{S}) where {S} = SampleChain(S[], Int[], zeros(Int, 1, 1)) - -function state_index(mc::DiscreteMarkovChain{S}, state::S) where {S} - findfirst(isequal(state), mc.states) -end - -state_space(mc::DiscreteMarkovChain) = mc.states -transition_matrix(mc::DiscreteMarkovChain) = mc.transitions - -function stationary_dist(mc::DiscreteMarkovChain) - T = transition_matrix(mc) - F = eigen(T') - tol = 1e-8 - idx = findfirst(abs.(F.values .- 1) .< tol) - if idx === nothing - error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") - end - # Extract the corresponding eigenvector and normalize it to sum to 1. - pi = real(F.vectors[:, idx]) - return pi ./ sum(pi) -end - -function stationary_dist(mc::DiscreteMarkovChain{S, <:SparseMatrixCSC}) where {S} - T = transition_matrix(mc) - vals, vecs, _ = eigsolve(T') - tol = 1e-8 - idx = findfirst(abs.(vals .- 1) .< tol) - if idx === nothing - error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") - end - # Extract the corresponding eigenvector and normalize it to sum to 1. 
- pi = Real.(vecs[idx]) - result = pi ./ sum(pi) - return result -end - -function sample_indices(mc::DiscreteMarkovChain, t::Int) - indices = Vector{Int}(undef, t) - indices[1] = rand(Categorical(stationary_dist(mc))) - tr_transposed = transpose(mc.transitions) - for i in 2:t - indices[i] = rand(Categorical(tr_transposed[:, indices[i - 1]])) - end - return indices -end - -function sample(mc::DiscreteMarkovChain, t::Int) - indices = sample_indices(mc, t) - states = mc.states[indices] - counts = zeros(Int, length(mc.states), length(mc.states)) - for i in 1:(length(indices) - 1) - counts[indices[i], indices[i + 1]] += 1 - end - return SampleChain(states, indices, counts) -end - -function sample(mc::DiscreteMarkovChain{S, <:SparseMatrixCSC}, t::Int) where {S} - indices = sample_indices(mc, t) - states = mc.states[indices] - counts = zeros(Int, length(mc.states), length(mc.states)) - for i in 1:(length(indices) - 1) - counts[indices[i], indices[i + 1]] += 1 - end - return SampleChain(states, indices, sparse(counts)) -end - -## yes I know this is awful and does not return a proper chain, but... 
-function Base.:+(a::DiscreteMarkovChain, b::DiscreteMarkovChain) - return DiscreteMarkovChain( - a.states, - a.transitions .+ b.transitions) -end - -function Base.:-(a::DiscreteMarkovChain, b::DiscreteMarkovChain) - return DiscreteMarkovChain( - a.states, - a.transitions .- b.transitions) -end - -function Base.:*(a::DiscreteMarkovChain, c::Real) - return DiscreteMarkovChain( - a.states, - a.transitions .* c) -end - -Base.:*(c::Real, a::DiscreteMarkovChain) = a * c - -function Base.:/(a::DiscreteMarkovChain, c::Real) - return DiscreteMarkovChain( - a.states, - a.transitions ./ c) -end - -function loglikelihood(mc::DiscreteMarkovChain{S, M}, chain::Vector{Int}) where {S, M} - Tr = transition_matrix(mc) - probas = Vector{Float64}(undef, length(chain)) - probas[1] = stationary_dist(mc)[chain[1]] - for i in 1:(length(chain) - 1) - probas[i + 1] = Tr[chain[i], chain[i + 1]] - end - return sum(log, probas) -end - -function loglikelihood( - mc::DiscreteMarkovChain{S, M1}, chain::Vector{S}) where {S, M1} - return loglikelihood(mc, state_index.(Ref(mc), chain)) -end - -#without the first state, huge computational speedup -function loglikelihood( - mc::DiscreteMarkovChain{S, M1}, chain::SampleChain{S, M2}) where {S, M1, M2} - return sum(map(xlogy, chain.transitions, mc.transitions)) #+log(stationary_dist(mc)[chain.indices[1]]) -end - - - -# user responsability to have the same states... -function fit( - mc::DiscreteMarkovChain{S, M1}, chain::SampleChain{S, M2}) where {S, M1, M2} - return DiscreteMarkovChain( - mc.states, make_row_stochastic(chain.transitions)) -end - - - -function make_row_stochastic(A::M) where {M <: AbstractMatrix} - f(row) = sum(row) == 0 ? 
ones(length(row)) / length(row) : row ./ sum(row) - return mapslices(f, A, dims = 2) -end diff --git a/src/distributions/utils.jl b/src/distributions/utils.jl deleted file mode 100644 index d30daf7..0000000 --- a/src/distributions/utils.jl +++ /dev/null @@ -1,19 +0,0 @@ -const logtwo = log(2.0) - -sumlog(x::AbstractArray{<:Real}) = sum(log,x) - -function sumlog(x::AbstractArray{<:AbstractFloat}) - sig = one(T) - ex = zero(exponent(one(T))) - bound = floatmax(T) / 2 - for xj in x - sig *= significand(xj) - ex += exponent(xj) - if sig > bound - (a, b) = (significand(sig), exponent(sig)) - sig = a - ex += b - end - end - log(sig) + logtwo * ex -end diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl deleted file mode 100644 index 9cc4720..0000000 --- a/src/distributions/zero_inflated.jl +++ /dev/null @@ -1,93 +0,0 @@ -""" - struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution - -A zero-inflated distribution that combines a Bernoulli distribution with a continuous distribution. - -# Fields -- `edge_proba::B`: The Bernoulli distribution representing the probability of zero. -- `dist::D`: The continuous distribution. - -# Constructors -- `ZeroInflated(p::Real, dist::D)`: Creates a zero-inflated distribution with probability `p` of zero and continuous distribution `dist`. - -# Mathematical Explanation -The zero-inflated distribution modifies the original distribution by introducing a probability `p` of zero. The `pdf` and `cdf` are adjusted accordingly: -- `pdf(x) = p * δ(x) + (1 - p) * pdf_original(x)` -- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` -where `δ(x)` is the Dirac delta function. 
-""" -struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution - edge_proba::B - dist::D -end - -function ZeroInflated(p::Real, dist::D) where {D} - return ZeroInflated(Bernoulli(1 - p), dist) -end - -""" - Distributions.pdf(d::ZeroInflated, x::Real) - -Computes the probability density function (pdf) of the zero-inflated distribution `d` at `x`. -""" -function Distributions.pdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + - pdf(d.edge_proba, one(x)) * pdf(d.dist, x) -end - -""" - get_proba_zero(d::ZeroInflated) - -Returns the probability of zero for the zero-inflated distribution `d`. -""" -function get_proba_zero(d::ZeroInflated) - return pdf(d.edge_proba, 0) -end - -""" - rand(rng::Random.AbstractRNG, d::ZeroInflated) - -Generates a random sample from the zero-inflated distribution `d` using the random number generator `rng`. -""" -function rand(rng::Random.AbstractRNG, d::ZeroInflated) - return rand(rng, d.edge_proba) * rand(rng, d.dist) -end - -logpdf(d::ZeroInflated, x::Real) = log(pdf(d, x)) - -minimum(d::ZeroInflated) = min(minimum(d.dist), 0) - -maximum(d::ZeroInflated) = max(maximum(d.dist), 0) - -insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) - -""" - Distributions.cdf(d::ZeroInflated, x::Real) - -Computes the cumulative distribution function (cdf) of the zero-inflated distribution `d` at `x`. -""" -function Distributions.cdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, zero(x), Inf) + - cdf(d.dist, x) * pdf(d.edge_proba, one(x)) -end - -function Distributions.params(d::ZeroInflated) - (first(params(d.edge_proba)), params(d.dist)...) -end - -""" - Distributions.fit(::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) - -Fits a zero-inflated distribution to the given data. 
-""" -function Distributions.fit( - ::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) where {B, D} - indices_0 = findall(x -> x == 0, data) - p = length(indices_0) / length(data) - if p != 1 - return ZeroInflated( - p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) - else - return ZeroInflated(1.0, D()) - end -end diff --git a/src/observations.jl b/src/observations.jl deleted file mode 100644 index 120642c..0000000 --- a/src/observations.jl +++ /dev/null @@ -1,231 +0,0 @@ -""" - Observations{G, D} - -A struct to hold observations for a network. The type parameter `G` represents the network - structure and must support indexing and the `size` function. - -# Fields -- `graph::G`: The network structure (e.g. adjacency matrix). -- `dist_ref::D`: distribution of the observations (used for getting support, type of elements, etc.) -""" -struct Observations{G, D} - graph::G - dist_ref::D -end - -""" - number_nodes(graph::Observations) - -Get the number of nodes in the graph. - -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `num_nodes`: The number of nodes. -""" -function number_nodes(graph::Observations) - return size(graph.graph, 1) -end - -""" - get_obs(graph::Observations, x::Tuple) - -Get the observation for the given tuple of nodes. - -# Arguments -- `graph::Observations`: The graph observations. -- `x::Tuple`: The tuple of nodes. - -# Returns -- `obs`: The observation. -""" -function get_obs(graph::Observations, x::Tuple) - return get_obs(graph, x[1], x[2]) -end - -""" - get_obs(graph::Observations, i::Int, j::Int) - -Get the observation for the given pair of nodes. - -# Arguments -- `graph::Observations`: The graph observations. -- `i::Int`: The first node. -- `j::Int`: The second node. - -# Returns -- `obs`: The observation. -""" -function get_obs(graph::Observations, i::Int, j::Int) - return graph.graph[i, j] -end - -""" - density(graph::Observations) - -Get the density of the graph. 
- -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `density`: The density of the graph. -""" -function density(graph::Observations) - return sum(graph.graph) / - ((size(graph.graph, 1) * (size(graph.graph, 1) - 1))) -end - -""" - get_degree(graph::Observations) - -Get the degree of each node in the graph. - -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `degrees`: The degrees of the nodes. -""" -function get_degree(graph::Observations) - return sum(graph.graph, dims = 2) -end - -""" - get_adj(graph::Observations) - -Get the adjacency matrix of the graph. - -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `adj_matrix`: The adjacency matrix. -""" -function get_adj(graph::Observations) - return graph.graph -end - -function normalized_laplacian(graph::Observations) - return normalized_laplacian(graph.graph) -end - -function normalized_laplacian(g::AbstractGraph) - return normalized_laplacian(Graphs.adjacency_matrix(g)) -end - -normalized_laplacian(g::CategoricalArray) = normalized_laplacian(levelcode.(g)) - -""" - normalized_laplacian(graph::Observations) - -Get the normalized Laplacian of the graph. - -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `L`: The normalized Laplacian matrix. 
-""" -function normalized_laplacian(graph::AbstractMatrix) - degrees = sum(graph, dims = 1) - degrees .-= minimum(degrees) - n = size(graph, 1) - L = similar(graph, Float64) - for j in 1:n - for i in 1:n - if i == j - L[i, j] = 1 - elseif degrees[i] == 0 || degrees[j] == 0 - L[i, j] = 0 - elseif graph[i, j] != 0 - L[i, j] = -1 / sqrt(degrees[i] * degrees[j]) - end - end - end - return L -end - -function Metis.graph(graph::Observations{ - G, <:UnivariateDistribution}) where {G} - use_weights = true - if minimum(graph.dist_ref) < 0 - @warn "Negative values are not allowed for MetisStart, using binary graph" - use_weights = false - end - return Metis.graph(sparse(graph.graph), weights = use_weights) -end - -""" - discretise(graph::Observations; number_groups, number_levels) - -Discretise the graph observations. - -# Arguments -- `graph::Observations`: The graph observations. -- `number_groups`: Number of groups for discretisation. -- `number_levels`: Number of levels for discretisation. - -# Returns -- `discretised_graph`: The discretised graph observations. -- `discretiser`: The discretiser used. - -Assume that the diagonal is zero. -0 indicates no edge, while missing indicates no information about the edge. -By default maps 0 to 0. If you want another behaviour use the function where you -pass a `Discretizer` object. - -number_levels will be the number of levels in the discretized distribution (excluding 0). 
-""" -function discretise( - graph::Observations; number_groups = nothing, number_levels = nothing) - if isnothing(number_groups) && isnothing(number_levels) - throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) - end - if isnothing(number_levels) - number_levels = round(Int, - get_num_levels_from_groups(number_nodes(graph), number_groups)) - else - if !isnothing(number_groups) - @warn "disregarding `number_groups` as `number_levels` is provided" - end - end - return discretise( - graph, DiscretizerZeroToZero(number_levels, extrema(graph.graph)...)) -end - -""" - discretise(graph::Observations, discretiser::Discretizer) - -Discretise the graph observations using the given discretiser. - -# Arguments -- `graph::Observations`: The graph observations. -- `discretiser::Discretizer`: The discretiser to use. - -# Returns -- `discretised_graph`: The discretised graph observations. -- `discretiser`: The discretiser used. -""" -function discretise(graph::Observations, discretiser::Discretizer) - A_encoded = encode(discretiser, graph.graph) - return Observations(A_encoded, DiscretizedDistribution(discretiser)), - discretiser -end - -""" - get_num_levels_from_groups(n, number_groups) - -Get the number of levels for the discretized distribution given n and k. - -# Arguments -- `n`: The number of nodes. -- `number_groups`: The number of groups. - -# Returns -- `num_levels`: The number of levels. 
-""" -function get_num_levels_from_groups(n, number_groups) - return max(1, n^(0.5 * (1 - log(number_groups) / log(n)))) -end diff --git a/src/optimisation/config_rules/InitRule.jl b/src/optimisation/config_rules/InitRule.jl deleted file mode 100644 index c60e144..0000000 --- a/src/optimisation/config_rules/InitRule.jl +++ /dev/null @@ -1,99 +0,0 @@ -abstract type StartingAssignment end -struct OrderedStart <: StartingAssignment end -struct RandomStart <: StartingAssignment end -struct SpectralStart <: StartingAssignment end -struct MetisStart <: StartingAssignment end -struct BiasAdjustedSoS <: StartingAssignment end - -struct FromAssignment{A} <: StartingAssignment - assignment::A -end -struct HigherOrderSpectralStart <: StartingAssignment - k::Int -end - -struct InitRule{S <: StartingAssignment, I} - starting_assignment_rule::S - assignment_rule::I -end - -function make_assignment(g, h, init_rule::InitRule{S, Nothing}) where {S} - return Assignment(initialize_node_labels( - g, h, init_rule.starting_assignment_rule)...) -end - -""" - initialize_node_labels(g, h, starting_assignment_rule::StartingAssignment) - -initialize node labels based on the `starting_assignment_rule`, and return a `GroupSize` -object and a vector of node labels. - -# Implemented rules -- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. -- `RandomStart()`: Randomly assign nodes to groups. -- `SpectralStart()`: Assign nodes to groups based on spectral clustering. -- `MetisStart()`: Assign nodes to groups based on Metis partitioning. -- `FromAssignment(a)`: Assign nodes to groups based on the given assignment `a`. 
-""" -initialize_node_labels - -function initialize_node_labels(g, h, ::OrderedStart) - group_size = GroupSize(number_nodes(g), h) - node_labels = StatsBase.inverse_rle(1:length(group_size), group_size) - return group_size, node_labels -end - -function initialize_node_labels(g, h, ::RandomStart) - group_size, node_labels = initialize_node_labels(g, h, OrderedStart()) - Random.shuffle!(node_labels) - return group_size, node_labels -end - -function initialize_node_labels(g, h, ::SpectralStart) - group_size = GroupSize(number_nodes(g), h) - node_labels = zeros(Int, number_nodes(g)) - - laplacian = normalized_laplacian(g) - decomp, = partialschur(laplacian, nev = 2, which = :LR) - - # get 2nd eigenvector, sort its components - indices = sortperm(real.(decomp.Q[:, 2])) - # bin them into groups of correct size - start = 1 - for (i, group) in enumerate(group_size) - stop = start + group - 1 - node_labels[indices[start:stop]] .= i - start = stop + 1 - end - return group_size, node_labels -end - -function initialize_node_labels(g, h, ::MetisStart) - group_size = GroupSize(number_nodes(g), h) - node_labels = convert.( - Int, Metis.partition(Metis.graph(g), length(group_size))) - check_compatiblity!(node_labels, group_size) - return group_size, node_labels -end - -function initialize_node_labels(g, h, rule::FromAssignment{A}) where {A} - group_size = GroupSize(number_nodes(g), h) - check_compatiblity!(rule.assignment.node_labels, group_size) - return group_size, rule.assignment.node_labels -end - -function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) - throw(ArgumentError("Not implemented yet, need to finish with Clustering.jl")) - # this will need to have the main optim changed -> no assumption that all blocks are - # the same size - group_size = GroupSize(number_nodes(g), h) - laplacian = normalized_laplacian(g) - results = IterativeSolvers.lobpcg(laplacian, true, rule.k) - return group_size, node_labels -end - -function initialize_node_labels(g, h, 
::BiasAdjustedSoS) - # implement method from Bias-adjusted spectral clustering in multilayer stochastic block - # models - -end diff --git a/src/optimisation/config_rules/accept_rule.jl b/src/optimisation/config_rules/accept_rule.jl deleted file mode 100644 index 1029f0b..0000000 --- a/src/optimisation/config_rules/accept_rule.jl +++ /dev/null @@ -1,46 +0,0 @@ -abstract type AcceptRule end -struct Strict <: AcceptRule end - -""" - accept_reject_update!(a::Assignment, swap::Swap, g, accept_rule::AcceptRule) - - -Perform the swap and accept it if it improves the likelihood of the assignment. `a` will -be updated in place if the swap is accepted. - -# Implemented rules -- `Strict()`: Accept the proposal if it has a higher likelihood than the current assignment. -""" -accept_reject_update! - -function slow_swap!(a, swap) - swap_node_labels!(a, swap.index1, swap.index2) - _count_occurences!(a.additional_data, a.node_labels) - update_ll!(a) -end - -function accept_reject_update!(a::Assignment, swap::Swap, g, ::Strict) - # calculate the score of the current assignment - current_score = score(a, g) - # perform the swap - #a_star = deepcopy(a) - #swap_star = deepcopy(swap) - apply_swap!(a, swap) - #slow_swap!(a_star, swap_star) - # if any(a_star.additional_data.realized .!= a.additional_data.realized) - # println("The slow and fast swap functions do not agree on realized") - # if any(a_star.additional_data.counts .!= a.additional_data.counts) - # println("The slow and fast swap functions do not agree on counts") - # end - # println("a_star.additional_data.realized: ", a_star.additional_data.realized) - # println("a.additional_data.realized: ", a.additional_data.realized) - # println("a_star.additional_data.counts: ", a_star.additional_data.counts) - # println("a.additional_data.counts: ", a.additional_data.counts) - # error("The slow and fast swap functions do not agree after swapping labels ", swap.index1, " and ", swap.index2) - # end - - # if the new assignment is worse, 
revert the swap - if score(a, g) <= current_score - revert_swap!(a, swap) - end -end diff --git a/src/optimisation/config_rules/bandwidth_selection_rule.jl b/src/optimisation/config_rules/bandwidth_selection_rule.jl deleted file mode 100644 index efa4da2..0000000 --- a/src/optimisation/config_rules/bandwidth_selection_rule.jl +++ /dev/null @@ -1,101 +0,0 @@ -abstract type KSelectionRule end -struct OracleK <: KSelectionRule - K::Int -end -struct OracleM{F} <: KSelectionRule - M::F - α::F -end - -struct OracleH <: KSelectionRule - H::Int -end - -function OracleM(M) - return OracleM(M, 1.0) -end - -abstract type EstimatedM <: KSelectionRule end -struct EstimatedEigenvalues <: EstimatedM end -struct EstimatedDegrees <: EstimatedM end - -""" - select_number_node_per_block(g::Observations, rule::KSelectionRule) - -How to select the number of blocks `K` for the BlockModel model. - -# Implemented rules -- `OracleK(K::Int)`: Use the oracle number of blocks `K`. -- `OracleM(M::Int)`: Give the Holder constant `M` of the graphon, use the results from - [Olhede and Wolfe (2014)](https://www.pnas.org/doi/epdf/10.1073/pnas.1400374111) to - estimate the number of blocks `K`. -- `EstimatedEigenvalues()`: Use the estimated eigenvalues of the adjacency matrix to - estimate the Holder constant and then use `OracleM` to estimate the number of blocks `K`. -- `EstimatedDegrees()`: Use the estimated degrees of the adjacency matrix to estimate the - Holder constant and then use `OracleM` to estimate the number of blocks `K`. -- `OracleH(H::Int)`: Use the oracle number of nodes per block `H`. - -!!! info - - The number of blocks `K` should be at most `n/2` where `n` is the number of nodes in - the graph. - - The estimated Holder constant `M` comes from equation (11) in Olhede and Wolfe (2014). 
-""" -select_number_node_per_block - -function select_number_node_per_block(g::Observations, rule::OracleH) - if rule.H > number_nodes(g) ÷ 2 - throw(ArgumentError("The number of nodes per block $(rule.H) is too large for the \ - number of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)")) - end - if rule.H <= 1 - throw(ArgumentError("The number of nodes per block $(rule.H) is too small, it should \ - be at least 2")) - end - return rule.H -end - -function select_number_node_per_block(g::Observations, rule::OracleK) - nodes_per_block = number_nodes(g) ÷ rule.K - return select_number_node_per_block(g, OracleH(nodes_per_block)) -end - -function select_number_node_per_block(g::Observations, rule::OracleM) - rho = density(g) - n = number_nodes(g) - h = min(max(2, round(Int, (2 * rule.M * rho)^(-1 / 4) * sqrt(n))), n ÷ 2) - return select_number_node_per_block(g, OracleH(h)) -end - -function select_number_node_per_block(g::Observations, rule::EstimatedM) - n = number_nodes(g) - c = min(4, sqrt(n) / 8) - number_points_from_mid = round(Int, c * sqrt(n)) - mid_points = max(1, n ÷ 2 - number_points_from_mid):(n ÷ 2 + number_points_from_mid) - m = estimated_number_nodes_per_block(g, rule, mid_points, density(g)) - return select_number_node_per_block(g, OracleH(m)) -end - -function estimated_number_nodes_per_block( - g::Observations, ::EstimatedEigenvalues, points, rho) - @warn "Check this method again" - decomp, = partialschur(get_adj(g), nev = 1, which = :LR) - u, λ = real.(decomp.Q), decomp.eigenvalues[1] - return _approx_k_from_delta_f(u, λ, points, rho) -end - -function estimated_number_nodes_per_block( - g::Observations, ::EstimatedDegrees, points, rho) - d = get_degree(g) - mult = ((d' * get_adj(g) * d) / (sum(d .^ 2))^2)[1] - return _approx_k_from_delta_f(d, mult, points, rho) -end - -function _approx_k_from_delta_f(u, mult, midpoints, ρ, α = 1.0) - sort!(u, dims = 1) - uMid = u[midpoints] - β₀, β₁ = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid - 
# from Olhede and Wolfe (2014), equation (11) - h = (2^(α + 1) * α * mult^2 * (β₁ * length(uMid) / 2 + β₀)^2 * β₁^2 * - ρ^(-1))^(-1 / (2 * (α + 1))) - return round(Int, h) -end diff --git a/src/optimisation/config_rules/include.jl b/src/optimisation/config_rules/include.jl deleted file mode 100644 index 8c5b6bf..0000000 --- a/src/optimisation/config_rules/include.jl +++ /dev/null @@ -1,5 +0,0 @@ -include("swap_rule.jl") -include("accept_rule.jl") -include("InitRule.jl") -include("stop_rule.jl") -include("bandwidth_selection_rule.jl") diff --git a/src/optimisation/config_rules/stop_rule.jl b/src/optimisation/config_rules/stop_rule.jl deleted file mode 100644 index 89575dc..0000000 --- a/src/optimisation/config_rules/stop_rule.jl +++ /dev/null @@ -1,49 +0,0 @@ -abstract type StopRule end - -function initialise_stop_rule!(stop_rule::StopRule, a, g) -end - -# default score is the log likelihood -function score(a::Assignment, g::Observations) - return loglikelihood(a, g) / binomial(number_nodes(a), 2) -end - -mutable struct PreviousBestValue{T} <: StopRule - k::Int - previous_best_value::T - iterations_since_best::Int - function PreviousBestValue( - k::Int, x::T = -Inf) where {T <: Real} - @assert k > 0 - # queue stores the best values and at most k subsequent values - new{T}(k, x, 0) - end -end - -function initialise_stop_rule!(stop_rule::PreviousBestValue, a, g) - score_value = score(a, g) - stop_rule.previous_best_value = score_value -end - -""" - stopping_rule(assignment::Assignment,g, stop_rule::StopRule) - -Returns a Bool with true if we should stop the optimization based on the `stop_rule`. - -# Implemented rules -- `PreviousBestValue(k)`: Stop if the current iteration is `k` iterations away from the - iteration with the best value. 
-""" -stopping_rule - -function stopping_rule( - assignment::Assignment, g, stop_rule::PreviousBestValue) - score_value = score(assignment, g) - if score_value > stop_rule.previous_best_value - stop_rule.previous_best_value = score_value - stop_rule.iterations_since_best = 0 - else - stop_rule.iterations_since_best += 1 - end - return stop_rule.iterations_since_best >= stop_rule.k -end diff --git a/src/optimisation/config_rules/swap_rule.jl b/src/optimisation/config_rules/swap_rule.jl deleted file mode 100644 index e811d4f..0000000 --- a/src/optimisation/config_rules/swap_rule.jl +++ /dev/null @@ -1,27 +0,0 @@ -abstract type NodeSwapRule end - -struct RandomNodeSwap <: NodeSwapRule end -struct RandomGroupSwap <: NodeSwapRule end -""" - select_swap(node_assignment::Assignment, ::NodeSwapRule) - -Selects two nodes to swap based on the `NodeSwapRule`, the adjacency matrix `A` and the -current assignment `node_assignment`. - -# Implemented rules -- `RandomNodeSwap()`: Select two nodes at random. -- `RandomGroupSwap()`: Select two nodes from two different groups at random. -""" -select_swap - -function select_swap(assignment::Assignment, ::RandomNodeSwap) - return StatsBase.sample(1:number_nodes(assignment), 2; replace = false) -end - -function select_swap(assignment::Assignment, ::RandomGroupSwap) - groups = StatsBase.sample( - 1:number_groups(assignment), 2; replace = false) - index1 = rand(get_vertex_in_group(assignment, groups[1])) - index2 = rand(get_vertex_in_group(assignment, groups[2])) - return (index1, index2) -end diff --git a/src/optimisation/fit.jl b/src/optimisation/fit.jl deleted file mode 100644 index 8e5985a..0000000 --- a/src/optimisation/fit.jl +++ /dev/null @@ -1,99 +0,0 @@ -# Slow fallback methods for the Assignment type -# Speed up by implementing specialized methods for the BernoulliAssignment type and others - -""" - fit(a::Assignment, g::Observations) - -Compute the estimator from node clustering as specified in the assignment. 
- -# Arguments -- `a::Assignment`: The assignment of nodes to blocks. -- `g::Observations`: The graph observations. - -# Returns -- `dists`: The fitted distributions. -""" -function fit(a::Assignment, g::Observations) - dists = initialize_sbm(a.group_size, g.dist_ref) - fit!(dists, g, a) - return dists -end - -""" - fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where {G,D,K,F} - -Fit the SBM to the given graph observations and assignment. - -# Arguments -- `sbm::BlockModel{D,K,F}`: The block model to fit. -- `g::Observations{G,D}`: The graph observations. -- `a::Assignment`: The assignment of nodes to blocks. -""" -function fit!(sbm::BlockModel{D, K, F}, g::Observations{G, D}, - a::Assignment) where {G, D, K, F} - for group1 in 1:number_groups(a) - for group2 in group1:number_groups(a) - edge_indices = get_edge_indices(a, group1, group2) - sbm[group1, group2] = fit_group(g.dist_ref, g, edge_indices) - end - end -end - -function fit_group(d::ZeroInflatedCategorical, g, edges) - return Distributions.fit( - typeof(d), get_obs.(Ref(g), edges), ncategories(g.dist_ref)) -end - -function fit_group(distribution, g, edges) - return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) -end - -function fit_group(distribution::Binomial, g, edges) - return Distributions.fit( - typeof(distribution), ntrials(distribution), get_obs.(Ref(g), edges)) -end - -""" - loglikelihood(a::Assignment, g::Observations) - -Compute the log likelihood of a BlockModel fitted according to the assignment. - -# Arguments -- `a::Assignment`: The assignment of nodes to blocks. -- `g::Observations`: The graph observations. - -# Returns -- `log_likelihood`: The log likelihood of the fitted model. 
-""" -function loglikelihood(a::Assignment, g::Observations) - return _log_likelihood(a, fit(a, g), g) -end - -function _log_likelihood(a::Assignment, sbm::BlockModel, g) - log_likelihood = 0.0 - for i in 1:number_nodes(a) - label_a = a.node_labels[i] - for j in (i + 1):number_nodes(a) - label_b = a.node_labels[j] - log_likelihood += logdensityof( - sbm[label_a, label_b], get_obs(g, i, j)) - end - end - return log_likelihood -end - -""" - fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}) where {G,D,K,F} - -Fit the SBM to the given graph observations. - -# Arguments -- `sbm::BlockModel{D,K,F}`: The block model to fit. -- `g::Observations{G,D}`: The graph observations. -""" -function fit!( - sbm::BlockModel{D, K, F}, g::Observations{G, D}) where {G, D, K, F} - k = number_blocks(sbm) - a = estimate_graphon(g, select_number_node_per_block(g, OracleK(k))) - fit!(sbm, g, a) -end diff --git a/src/optimisation/include.jl b/src/optimisation/include.jl deleted file mode 100644 index 044e887..0000000 --- a/src/optimisation/include.jl +++ /dev/null @@ -1,3 +0,0 @@ -include("fit.jl") -include("swap.jl") -include("least_squares.jl") diff --git a/src/optimisation/least_squares.jl b/src/optimisation/least_squares.jl deleted file mode 100644 index 9aae0db..0000000 --- a/src/optimisation/least_squares.jl +++ /dev/null @@ -1,95 +0,0 @@ -include("config_rules/include.jl") - -""" - estimate_graphon(graph, h; iterations, initialise_rule, swap_rule, accept_rule, stop_rule, progress_bar) - -Estimate the graphon for the given graph. - -# Arguments -- `graph`: The input graph. -- `h`: Number of nodes per block. -- `iterations`: Maximum number of iterations. -- `initialise_rule::InitRule`: Rule for initializing the assignment. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -- `stop_rule::StopRule`: Rule for stopping the iterations. -- `progress_bar::Bool`: Whether to show a progress bar. 
- -# Returns -- `a`: The assignment of nodes to blocks. -""" -function estimate_graphon( - graph, h = select_number_node_per_block(graph, EstimatedDegrees()); - iterations::Int = 10_000, - initialise_rule::InitRule = InitRule(SpectralStart(), nothing), - swap_rule::NodeSwapRule = RandomNodeSwap(), - accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = PreviousBestValue(1000), - progress_bar::Bool = false -) - a = make_assignment(graph, h, initialise_rule) - @debug a - initialise_stop_rule!(stop_rule, a, graph) - greedy_improve!( - a, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) - return a -end - -""" - greedy_improve!(a::Assignment, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) - -Perform greedy improvement on the assignment. - -# Arguments -- `a::Assignment`: The assignment of nodes to blocks. -- `graph`: The input graph. -- `iterations`: Maximum number of iterations. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -- `stop_rule::StopRule`: Rule for stopping the iterations. -- `progress_bar::Bool`: Whether to show a progress bar. -""" -function greedy_improve!(a::Assignment, graph; iterations::Int = 10_000, - swap_rule::NodeSwapRule = RandomNodeSwap(), - accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = PreviousBestValue(1000), - progress_bar::Bool = false -) - # swap memory allocation - swap = make_swap(a, (1, 1)) - p = ProgressUnknown( - enabled = progress_bar, showspeed = true, desc = "Greedy search: ") - # perform local search until the stopping rule is met - for i in 1:iterations - local_search!( - a, graph, swap, swap_rule = swap_rule, accept_rule = accept_rule) - next!(p) - if stopping_rule(a, graph, stop_rule) - finish!(p) - break - end - end -end - -""" - local_search!(a::Assignment, graph, swap; swap_rule, accept_rule) - -Perform local search by trying a swap and accepting it if it improves the likelihood. 
- -# Arguments -- `a::Assignment`: The assignment of nodes to blocks. -- `graph`: The input graph. -- `swap`: The swap object. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -""" -function local_search!( - a::Assignment, graph, swap::Swap = make_swap(a, (1, 1)); - swap_rule::NodeSwapRule = RandomNodeSwap(), - accept_rule::AcceptRule = Strict() -) - # select two nodes to swap and build the swap object - make_swap!(swap, a, select_swap(a, swap_rule)) - # perform the swap and accept it if it improves the likelihood - accept_reject_update!(a, swap, graph, accept_rule) -end diff --git a/src/optimisation/swap.jl b/src/optimisation/swap.jl deleted file mode 100644 index 69b1ff8..0000000 --- a/src/optimisation/swap.jl +++ /dev/null @@ -1,26 +0,0 @@ -abstract type Swap end - -mutable struct DefaultSwap <: Swap - index1::Int - index2::Int -end - -function make_swap(::Assignment, id) - return DefaultSwap(id[1], id[2]) -end - -function make_swap!(swap::DefaultSwap, ::Assignment, id) - swap.index1, swap.index2 = id -end - -function apply_swap!(a::Assignment, s::DefaultSwap) - swap_node_labels!(a, s.index1, s.index2) -end - -function revert_swap!(assignment::Assignment, swap::DefaultSwap) - apply_swap!(assignment, swap) -end - -function swap_node_labels!(a::Assignment, i, j) - a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] -end diff --git a/src/sbm.jl b/src/sbm.jl deleted file mode 100644 index 1fad688..0000000 --- a/src/sbm.jl +++ /dev/null @@ -1,186 +0,0 @@ -# TODO: remove BlockModel being a subtype of AbstractMatrix -# this was fun but useless and actually harmful - -struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} - sizes::Vector{F} - probs::SymmetricTensor{T, K, 2} -end - -function BlockModel( - θ::AbstractMatrix{T}, sizes::Vector{F}) where {T, F <: Real} - return BlockModel(sizes, - SymmetricTensor([θ[i, j] for i in 1:size(θ, 1) for j in i:size(θ, 2)], - Val(length(sizes)), 
Val(2))) -end - -function edge_type(::BlockModel{T, K, F}) where {T, K, F} - return eltype(T) -end - -function _check_sizes(sizes) - @assert sum(sizes)≈1 "Sizes must sum to 1, got $(sum(sizes))" - return sizes -end - -function _check_sizes(sizes::Vector{Int}) - return sizes ./ sum(sizes) -end - -function initialize_sbm(sizes::Vector, dist, k = length(sizes)) - sizes = _check_sizes(sizes) - n_dims = binomial(k + 1, 2) - probs = Vector{typeof(dist)}(undef, n_dims) - fill!(probs, dist) - return BlockModel(sizes, SymmetricTensor(probs, Val(k), Val(2))) -end - -function initialize_sbm(sizes::GroupSize, dist, k = length(sizes)) - size_bins = sizes ./ sum(sizes) - n_dims = binomial(k + 1, 2) - probs = Vector{typeof(dist)}(undef, n_dims) - fill!(probs, dist) - return BlockModel(size_bins, SymmetricTensor(probs, Val(k), Val(2))) -end - -function initialize_sbm(k::Int, dist) - return initialize_sbm(ones(k) / k, dist) -end - -number_blocks(::BlockModel{T, K, F}) where {T, K, F} = K - -Base.size(s::BlockModel) = size(s.probs) -Base.ndims(::BlockModel) = 2 -Base.eltype(::BlockModel{T, K, F}) where {T, K, F} = T -Base.setindex!(s::BlockModel, v, i, j) = setindex!(s.probs, v, i, j) -Base.@propagate_inbounds function Base.getindex(s::BlockModel, i, j) - return getindex(s.probs, i, j) -end - -function sample( - rng::Random.AbstractRNG, sbm::BlockModel, node_labels::Vector{Int}, sorted = false) - n_nodes = length(node_labels) - if sorted - sort!(node_labels) - end - A = zeros(edge_type(sbm), n_nodes, n_nodes) - for j in 1:n_nodes - for i in (j + 1):n_nodes - A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) - end - end - return sparse(Symmetric(A, :L)), node_labels -end - -function draw_and_fill!( - rng::Random.AbstractRNG, A, sbm::BlockModel, sorted = false) - n_blocks = number_blocks(sbm) - n_nodes = size(A, 1) - node_labels = StatsBase.sample( - rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) - if sorted - sort!(node_labels) - end - @inbounds 
for j in 1:n_nodes - for i in (j + 1):n_nodes - A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) - end - end - A .= Symmetric(A, :L) -end - -function draw_and_fill!(A, sbm, sorted = false) - draw_and_fill!(Random.default_rng(), A, sbm, sorted) -end - -function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted = false) - sample(Random.default_rng(), sbm, node_labels, sorted) -end -function sample( - rng::Random.AbstractRNG, sbm::BlockModel, n_nodes::Int, sorted = false) - n_blocks = number_blocks(sbm) - node_labels = StatsBase.sample( - rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) - if sorted - sort!(node_labels) - end - return sample(rng, sbm, node_labels) -end - -function sample(sbm::BlockModel, n_nodes::Int, sorted = false) - sample(Random.default_rng(), sbm, n_nodes, sorted) -end - -function get_probability_matrix(sbm::BlockModel, node_labels::Vector{Int}) - return sbm.probs[node_labels, node_labels] -end - -function _get_params_as_vec(dist::Distribution) - return vcat(params(dist)...) -end - -function latent_to_block_index(latents_vec, sbm::BlockModel) - cum_sum_sizes = cumsum(sbm.sizes) - cum_sum_sizes[end] = 1.0 - return [findfirst(x -> x >= l, cum_sum_sizes) for l in latents_vec] -end - -""" - best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) - -Find the best permutation of the blocks of `fitted_sbm` to match the blocks of `true_sbm` by -comparing the mean absolute difference of the parameters of the two models. -If the difference between the two models is less than `tol`, the function stops early. - -!!! warning - This function is not efficient for large numbers of blocks, as it uses brute force to - find the best permutation. 
-""" -function best_alignment( - fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) - k = number_blocks(fitted_sbm) - if k != number_blocks(true_sbm) - throw(ArgumentError("The number of blocks must be the same for both models")) - end - best_perm = nothing - best_loss = Inf - fitted_params = _get_params_as_vec.(fitted_sbm) - true_params = _get_params_as_vec.(true_sbm) - for perm in permutations(1:k) - loss = sum(map(x -> sum(abs.(x)), fitted_params[perm] .- true_params)) - if loss < best_loss - best_loss = loss - best_perm = perm - end - if best_loss < tol - break - end - end - return best_perm -end - -function align_sbm!(sbm::BlockModel, perm) - sbm.probs .= sbm.probs[perm, perm] - sbm.sizes .= sbm.sizes[perm] -end - -""" - order_groups(a::Assignment, latents::AbstractVector) - -Order the groups of an assignment according to the true latents. This is an heuristic -approach, which is not guaranteed to find the true ordering of the groups. -""" -function order_groups(a::Assignment, latents::AbstractVector) - n = number_nodes(a) - k = number_groups(a) - sort_perm = sortperm(latents) - sorted_group_labels = a.node_labels[sort_perm] - dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] - counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) - for group in 1:k) - return sort( - 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) -end - -function align_sbm_true_latents!(sbm::BlockModel, a::Assignment, latents) - align_sbm!(sbm, order_groups(a, latents)) -end diff --git a/test/TestNetworkHistogram.jl b/test/TestNetworkHistogram.jl deleted file mode 100644 index 0018f70..0000000 --- a/test/TestNetworkHistogram.jl +++ /dev/null @@ -1,30 +0,0 @@ -module TestNetworkHistogram - -import NetworkHistogram as NH -using Test - -function to_default_assignment(a_specialised::NH.Assignment{T, B}) where {T, B} - return NH.Assignment(a_specialised.group_size, a_specialised.node_labels) -end - 
-to_default_assignment(a::NH.Assignment{T, Nothing}) where {T} = a - -function test_swap_revertible( - a::NH.Assignment, swap::NH.Swap, g::NH.Observations) - a_test = deepcopy(a) - NH.apply_swap!(a_test, swap) - @test NH.get_group_of_vertex(a, swap.index1) == - NH.get_group_of_vertex(a_test, swap.index2) - @test NH.get_group_of_vertex(a, swap.index2) == - NH.get_group_of_vertex(a_test, swap.index1) - # force recomputation of the log likelihood using default assignment - a_new = to_default_assignment(a_test) - @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) - - # revert the swap and check if the assignment is the same as before - NH.revert_swap!(a_test, swap) - @test a == a_test - @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) -end - -end diff --git a/test/assignments/bernoulli_assignment.jl b/test/assignments/bernoulli_assignment.jl deleted file mode 100644 index d683e1a..0000000 --- a/test/assignments/bernoulli_assignment.jl +++ /dev/null @@ -1,42 +0,0 @@ -import NetworkHistogram as NH - -@testset "test construction Bernoulli assignment" begin - using Distributions: Bernoulli - A = [0 1 1 1 0 0 1 0 - 1 0 1 1 0 0 0 0 - 1 1 0 0 0 0 0 0 - 1 1 0 0 0 0 0 1 - 0 0 0 0 0 1 1 1 - 0 0 0 0 1 0 1 1 - 1 0 0 0 1 1 0 0 - 0 0 0 1 1 1 0 0] - obs = NH.Observations(A, Bernoulli(0.5)) - node_labels = [1, 1, 1, 1, 2, 2, 2, 2] - group_size = NH.GroupSize(8, 4) - a = NH.BernoulliAssignment(obs, group_size, node_labels) - for i in 1:8 - @test NH.get_group_of_vertex(a, i) == node_labels[i] - end - @test all(a.additional_data.A .== A) - @test a.additional_data.realized == [5 2; 2 5] - @test a.additional_data.counts == [6 16; 16 6] - @test a.additional_data.estimated_theta == [5/6 1/8; 1/8 5/6] -end - -@testset "test Bernoulli swap" begin - using ..TestNetworkHistogram: test_swap_revertible - using Distributions: Bernoulli - A = [0 1 1 1 0 0 1 0 - 1 0 1 1 0 0 0 0 - 1 1 0 0 0 0 0 0 - 1 1 0 0 0 0 0 1 - 0 0 0 0 0 1 1 1 - 0 0 0 0 1 0 1 1 - 1 0 0 0 1 1 0 0 - 0 0 0 1 1 
1 0 0] - obs = NH.Observations(A, Bernoulli(0.5)) - a = NH.BernoulliAssignment( - obs, NH.GroupSize(8, 4), [1, 1, 1, 1, 2, 2, 2, 2]) - swap = NH.make_swap(a, (1, 2)) - test_swap_revertible(a, swap, obs) -end diff --git a/test/assignments/categorical_assignment.jl b/test/assignments/categorical_assignment.jl deleted file mode 100644 index bd4c4db..0000000 --- a/test/assignments/categorical_assignment.jl +++ /dev/null @@ -1,126 +0,0 @@ -import NetworkHistogram as NH - -using Random - -@testset "test Categorical swap" begin - Random.seed!(1234123) - using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment - using Distributions: Categorical - using LinearAlgebra: Symmetric - import Random - m = 2 - p = ones(m) ./ m - n = 12 - k = 4 - dist = Categorical(p) - sbm = NH.initialize_sbm(ones(k) ./ k, dist) - node_labels = repeat(1:k, inner = n ÷ k) - A, _ = NH.sample(sbm, node_labels) - g = NH.Observations(collect(A), dist) - a = NH.CategoricalAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) - swap = NH.make_swap(a, (1, k + 1)) - @test A[:, 1] != A[:, k + 1] - a_test = deepcopy(a) - NH.apply_swap!(a_test, swap) - @test NH.get_group_of_vertex(a, swap.index1) == - NH.get_group_of_vertex(a_test, swap.index2) - @test NH.get_group_of_vertex(a, swap.index2) == - NH.get_group_of_vertex(a_test, swap.index1) - # force recomputation of the log likelihood using default assignment - a_new = to_default_assignment(a_test) - @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) - @test a_test.additional_data.realized != a.additional_data.realized - @test a_test.additional_data.estimated_theta != - a.additional_data.estimated_theta - @test a_test.additional_data.log_likelihood != - a.additional_data.log_likelihood - # revert the swap and check if the assignment is the same as before - NH.revert_swap!(a_test, swap) - @test a == a_test - @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) -end - -@testset "fast update test" begin - using Distributions - 
realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; - [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] - realized = [realized[I][k] - for k in eachindex(realized[1, 1]), - I in CartesianIndices(realized)] - counts = [1 4 4 - 4 1 4 - 4 4 1] - A = [0 1 2 2 3 3 - 1 0 2 2 3 3 - 2 2 0 1 3 3 - 2 2 1 0 3 3 - 3 3 3 3 0 1 - 3 3 3 3 1 0] - groupsize = NH.GroupSize(6, 2) - node_labels = [1, 1, 2, 2, 3, 3] - g = NH.Observations(A, Categorical(3)) - a = NH.CategoricalAssignment(g, groupsize, node_labels) - for index in eachindex(realized) - @test all(realized[index] .== a.additional_data.realized[index]) - end - @test loglikelihood(a, g) ≈ 0 - @test a.additional_data.counts == counts - swap_id = (1, 3) - ras = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; - [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] - realized_after_swap = [ras[I][k] - for k in eachindex(ras[1, 1]), - I in CartesianIndices(ras)] - - swap = NH.make_swap(a, swap_id) - NH.apply_swap!(a, swap) - for j in 1:3 - for i in 1:3 - @test all(realized_after_swap[:, i, j] .== - a.additional_data.realized[:, i, j]) - @test all(a.additional_data.estimated_theta[:, i, j] .≈ - realized_after_swap[:, i, j] ./ counts[i, j]) - end - end - @test loglikelihood(a, g) == 4 * log(0.5) -end - -#todo: test ll against categorical likelihood on basic assignment -@testset "test swap is not overwritten" begin - A = [0 4 4 2 1 2 2 3 4 2 3 1 4 1 1 3 4 4 3 3 - 4 0 4 2 4 2 1 1 1 3 3 1 1 1 3 3 4 2 1 4 - 4 4 0 1 2 4 2 2 1 3 2 3 1 2 3 2 3 4 1 1 - 2 2 1 0 2 1 2 2 2 3 1 1 3 3 3 3 3 1 1 2 - 1 4 2 2 0 4 1 4 3 2 4 3 4 3 1 3 1 1 1 3 - 2 2 4 1 4 0 2 3 1 3 1 4 3 3 1 3 1 3 3 3 - 2 1 2 2 1 2 0 3 2 2 1 1 1 3 3 1 1 3 1 1 - 3 1 2 2 4 3 3 0 4 3 2 3 1 1 1 1 1 3 2 1 - 4 1 1 2 3 1 2 4 0 3 1 1 1 3 2 1 3 1 4 1 - 2 3 3 3 2 3 2 3 3 0 1 3 1 1 3 1 3 1 1 4 - 3 3 2 1 4 1 1 2 1 1 0 2 3 2 2 1 2 2 1 3 - 1 1 3 1 3 4 1 3 1 3 2 0 4 4 2 2 2 3 1 1 - 4 1 1 3 4 3 1 1 1 1 3 4 0 2 2 1 2 1 1 3 - 1 1 2 3 3 3 3 1 3 1 2 4 2 0 1 2 1 2 1 1 - 
1 3 3 3 1 1 3 1 2 3 2 2 2 1 0 2 1 2 1 1 - 3 3 2 3 3 3 1 1 1 1 1 2 1 2 2 0 1 1 1 3 - 4 4 3 3 1 1 1 1 3 3 2 2 2 1 1 1 0 1 1 1 - 4 2 4 1 1 3 3 3 1 1 2 3 1 2 2 1 1 0 1 1 - 3 1 1 1 1 3 1 2 4 1 1 1 1 1 1 1 1 1 0 1 - 3 4 1 2 3 3 1 1 1 4 3 1 3 1 1 3 1 1 1 0] - g = NH.Observations(A, Categorical(4)) - h = 6 - a = NH.make_assignment( - g, h, NH.InitRule(NH.OrderedStart(), Val{NH.CategoricalData}())) - a_ref = deepcopy(a) - swap_indices = [(18, 5), (15, 10), (5, 13)] - swap = NH.make_swap(a, swap_indices[1]) - for swap_index in swap_indices - NH.make_swap!(swap, a, swap_index) - NH.apply_swap!(a, swap) - @test swap.realized == a_ref.additional_data.realized - @test swap.estimated_theta == a_ref.additional_data.estimated_theta - NH.revert_swap!(a, swap) - end -end diff --git a/test/assignments/default_assignment.jl b/test/assignments/default_assignment.jl deleted file mode 100644 index fefbf64..0000000 --- a/test/assignments/default_assignment.jl +++ /dev/null @@ -1,17 +0,0 @@ -import NetworkHistogram as NH - -@testset "test default swap" begin - using ..TestNetworkHistogram: test_swap_revertible - import Random, LinearAlgebra - using Distributions: Bernoulli, Normal - Random.seed!(1234123) - n = 20 - k = 5 - #data = LinearAlgebra.Symmetric(Random.rand(Bool,n,n)) - data = Random.rand(Normal(), n, n) - g = NH.Observations(data, Normal(0, 1)) - labels = repeat(1:(n ÷ k), inner = k) - a = NH.Assignment(NH.GroupSize(n, k), labels) - swap = NH.DefaultSwap(1, 2) - test_swap_revertible(a, swap, g) -end diff --git a/test/assignments/sparse_assignment.jl b/test/assignments/sparse_assignment.jl deleted file mode 100644 index a7b56ac..0000000 --- a/test/assignments/sparse_assignment.jl +++ /dev/null @@ -1,120 +0,0 @@ -import NetworkHistogram as NH - -using Random - -@testset "test sparse give the same as categorical" begin - using Distributions, LinearAlgebra, SparseArrays - k = 2 - m = 5 - level_count = 4 - n = 20 - tau = [0.8, 0.1, 0.1, 0.1, 0.1] - sbm = NH.initialize_sbm(ones(k) ./ k, 
Categorical(tau ./ sum(tau))) - A, _ = NH.sample(sbm, n) - A_dense = collect(A) - A = sparse(A_dense .- 1) - for i in 1:n - A[i, i] = 0 - end - g = NH.Observations(A_dense, Categorical(m)) - sbm_fitted, a = nethist(g; h = n ÷ k, iterations = 10) - sparse_a = NH.SparseAssignment( - NH.Observations(A, Categorical(m)), a.group_size, a.node_labels) - @test a.additional_data.counts == sparse_a.additional_data.counts - for (l, m_index) in enumerate(2:m) - @test a.additional_data.realized[m_index, :, :] == - sparse_a.additional_data.realized[l, :, :] - @test a.additional_data.estimated_theta[m_index, :, :] == - sparse_a.additional_data.estimated_theta[l, :, :] - end - @test a.additional_data.log_likelihood ≈ - sparse_a.additional_data.log_likelihood -end - -@testset "test sparse swap" begin - Random.seed!(1234123) - using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment - using Distributions: DiscreteNonParametric - using LinearAlgebra: Symmetric - import Random - m = 4 - p = ones(m) ./ m - n = 12 - k = 4 - dist = NH.ZeroInflatedCategorical(p) - sbm = NH.initialize_sbm(ones(k) ./ k, dist) - node_labels = repeat(1:k, inner = n ÷ k) - A = sparse(first(NH.sample(sbm, node_labels))) - g = NH.Observations(A, dist) - a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) - swap = NH.make_swap(a, (1, k + 1)) - @test A[:, 1] != A[:, k + 1] - a_test = deepcopy(a) - NH.apply_swap!(a_test, swap) - @test NH.get_group_of_vertex(a, swap.index1) == - NH.get_group_of_vertex(a_test, swap.index2) - @test NH.get_group_of_vertex(a, swap.index2) == - NH.get_group_of_vertex(a_test, swap.index1) - # force recomputation of the log likelihood using default assignment - a_new = to_default_assignment(a_test) - @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) - @test a_test.additional_data.realized != a.additional_data.realized - @test a_test.additional_data.estimated_theta != - a.additional_data.estimated_theta - @test a_test.additional_data.log_likelihood 
!= - a.additional_data.log_likelihood - # revert the swap and check if the assignment is the same as before - NH.revert_swap!(a_test, swap) - @test a == a_test - @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) -end - -@testset "fast sparse update test" begin - using Distributions - realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; - [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] - realized = [realized[I][k] - for k in eachindex(realized[1, 1]), - I in CartesianIndices(realized)] - counts = [1 4 4 - 4 1 4 - 4 4 1] - A = sparse([0 1 2 2 3 3 - 1 0 2 2 3 3 - 2 2 0 1 3 3 - 2 2 1 0 3 3 - 3 3 3 3 0 1 - 3 3 3 3 1 0]) - groupsize = NH.GroupSize(6, 2) - node_labels = [1, 1, 2, 2, 3, 3] - g = NH.Observations(A, Categorical(3)) - k = 3 - m = 3 - n = size(A, 1) - a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) - for index in eachindex(realized) - @test all(realized[index] .== a.additional_data.realized[index]) - end - @test loglikelihood(a, g) ≈ 0 - @test a.additional_data.counts == counts - swap_id = (1, 3) - ras = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; - [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] - realized_after_swap = [ras[I][k] - for k in eachindex(ras[1, 1]), - I in CartesianIndices(ras)] - - swap = NH.make_swap(a, swap_id) - NH.apply_swap!(a, swap) - for j in 1:3 - for i in 1:3 - @test all(realized_after_swap[:, i, j] .== - a.additional_data.realized[:, i, j]) - @test all(a.additional_data.estimated_theta[:, i, j] .≈ - realized_after_swap[:, i, j] ./ counts[i, j]) - end - end - @test loglikelihood(a, g) ≈ 4 * log(0.5) -end diff --git a/test/assignments/sum_assignment.jl b/test/assignments/sum_assignment.jl deleted file mode 100644 index 1a80856..0000000 --- a/test/assignments/sum_assignment.jl +++ /dev/null @@ -1,9 +0,0 @@ -import NetworkHistogram as NH - -using Random - - -@testset "test sum assignment" begin - using Distributions, LinearAlgebra, SparseArrays - @test 1 == 2 -end diff 
--git a/test/discretised_dist/discretizer.jl b/test/discretised_dist/discretizer.jl deleted file mode 100644 index 17c90fa..0000000 --- a/test/discretised_dist/discretizer.jl +++ /dev/null @@ -1,20 +0,0 @@ -using NetworkHistogram - -@testset "discretizer" begin - using StaticArrays - reg_disc = NetworkHistogram.RegularDiscretizer( - 10, 0.0, 1.0, MVector{10}(1:10), 1 / 10) - cat_disc = NetworkHistogram.CategoryDiscretizer( - Dict([0.0 => 11]), Dict([11 => 0.0])) - hybrid_disc = NetworkHistogram.HybridDiscretizer( - reg_disc, cat_disc) - - @test NetworkHistogram.encode(reg_disc, 0.0) == 1 - @test NetworkHistogram.encode(cat_disc, 0.0) == 11 - @test NetworkHistogram.encode(hybrid_disc, 0.0) == 11 - @test NetworkHistogram.decode(hybrid_disc, 11) == 0.0 - @test all(NetworkHistogram.encode(reg_disc, 0.001:0.001:1.0) .== - NetworkHistogram.encode(hybrid_disc, 0.001:0.001:1.0)) - @test all(NetworkHistogram.decode(hybrid_disc, 1:10) .== - NetworkHistogram.decode(reg_disc, 1:10)) -end diff --git a/test/generated_tests/all.jl b/test/generated_tests/all.jl deleted file mode 100644 index 3a6cd57..0000000 --- a/test/generated_tests/all.jl +++ /dev/null @@ -1,2 +0,0 @@ -include("test_zero_inflated.jl") -include("test_distribution.jl") diff --git a/test/generated_tests/test_distribution.jl b/test/generated_tests/test_distribution.jl deleted file mode 100644 index 9389428..0000000 --- a/test/generated_tests/test_distribution.jl +++ /dev/null @@ -1,84 +0,0 @@ -using NetworkHistogram: ZeroInflated, DiscretizedDistribution, - ZeroInflatedCategorical, - ncategories, Discretizer, encode, decode, binwidth, - RegularDiscretizer, - CategoryDiscretizer, HybridDiscretizer, - DiscretizerZeroToZero, nlabels -using Distributions -using Test - -@testset "ZeroInflated" begin - dist = ZeroInflated(0.3, truncated(Normal(0, 1), -3, 3)) - @test pdf(dist, 0) ≈ 0.3 + 0.7 * pdf(truncated(Normal(0, 1), -3, 3), 0) - @test pdf(dist, 1) ≈ 0.7 * pdf(truncated(Normal(0, 1), -3, 3), 1) - @test cdf(dist, 0) ≈ 
0.3 + 0.7 * cdf(truncated(Normal(0, 1), -3, 3), 0) - @test cdf(dist, 1) ≈ 0.3 + 0.7 * cdf(truncated(Normal(0, 1), -3, 3), 1) -end - -@testset "DiscretizedDistribution" begin - dist = DiscretizedDistribution(truncated(Normal(0, 1), -3, 3), 10) - @test ncategories(dist) == 10 - @test pdf(dist, 0) >= 0 - @test cdf(dist, 0) >= 0 -end - -@testset "ZeroInflatedCategorical" begin - dist = ZeroInflatedCategorical(0.3, Categorical([0.2, 0.3, 0.5])) - @test pdf(dist, 0) ≈ 0.3 - @test pdf(dist, 1) ≈ 0.7 * 0.2 - @test cdf(dist, 0) ≈ 0.3 - @test cdf(dist, 1) ≈ 0.3 + 0.7 * 0.2 -end - -@testset "ZeroInflatedDiscretizedDistribution" begin - dist = ZeroInflated(0.3, truncated(Normal(0, 1), -3, 3)) - disc_dist = DiscretizedDistribution(dist, 10) - @test ncategories(disc_dist) == 10 - @test pdf(disc_dist, 0) >= 0 - @test cdf(disc_dist, 0) >= 0 -end - -@testset "DiscretizedZeroInflatedCategorical" begin - dist = ZeroInflatedCategorical(0.3, Categorical([0.2, 0.3, 0.5])) - disc_dist = DiscretizedDistribution(dist, 10) - @test ncategories(disc_dist) == 10 - @test pdf(disc_dist, 0) >= 0 - @test cdf(disc_dist, 0) >= 0 -end - -@testset "Discretizer" begin - using Distributions - disc = RegularDiscretizer(10, 0.0, 1.0) - @test encode(disc, 0.05) == 1 - @test decode(disc, 1) == (0.0, 0.1) - @test binwidth(disc) == 0.1 - @test nlabels(disc) == 10 -end - -@testset "CategoryDiscretizer" begin - cat_to_bin = Dict("a" => 1, "b" => 2, "c" => 3) - bin_to_cat = Dict(1 => "a", 2 => "b", 3 => "c") - disc = CategoryDiscretizer(cat_to_bin, bin_to_cat) - @test encode(disc, "a") == 1 - @test decode(disc, 1) == "a" - @test nlabels(disc) == 3 -end - -@testset "HybridDiscretizer" begin - atoms = [0.0, 1.0] - disc = HybridDiscretizer(10, -1.0, 1.0, atoms) - @test encode(disc, 0.0) == 11 - @test encode(disc, 0.5) == 8 - @test decode(disc, 11) == 0.0 - @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol = 1e-2)) - @test nlabels(disc) == 12 -end - -@testset "DiscretizerZeroToZero" begin - disc = 
DiscretizerZeroToZero(10, -1.0, 1.0) - @test encode(disc, 0.0) == 0 - @test encode(disc, 0.5) == 8 - @test decode(disc, 0) == 0.0 - @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol = 1e-2)) - @test nlabels(disc) == 11 -end diff --git a/test/generated_tests/test_zero_inflated.jl b/test/generated_tests/test_zero_inflated.jl deleted file mode 100644 index 380e80c..0000000 --- a/test/generated_tests/test_zero_inflated.jl +++ /dev/null @@ -1,97 +0,0 @@ -using Test -using Distributions -using Random -using NetworkHistogram: ZeroInflated, get_proba_zero - -@testset "ZeroInflated Distribution Tests" begin - @testset "continuous distribution" begin - # Test construction - dist = Normal(0, 1) - zero_inflated_dist = ZeroInflated(0.5, dist) - @test zero_inflated_dist.edge_proba == Bernoulli(0.5) - @test zero_inflated_dist.dist == dist - - # Test pdf - @test pdf(zero_inflated_dist, 0) ≈ 0.5 + 0.5 * pdf(dist, 0) - @test pdf(zero_inflated_dist, 1) ≈ 0.5 * pdf(dist, 1) - - # Test get_proba_zero - @test get_proba_zero(zero_inflated_dist) == 0.5 - - # Test rand - rng = MersenneTwister(1234) - sample = rand(rng, zero_inflated_dist) - @test sample == 0 || insupport(dist, sample) - - # Test logpdf - @test logpdf(zero_inflated_dist, 0) ≈ log(0.5 * (1 + pdf(dist, 0))) - @test logpdf(zero_inflated_dist, 1) ≈ log(0.5 * pdf(dist, 1)) - - # Test minimum and maximum - @test minimum(zero_inflated_dist) == minimum(dist) - @test maximum(zero_inflated_dist) == maximum(dist) - - # Test insupport - @test insupport(zero_inflated_dist, 0) - @test insupport(zero_inflated_dist, 1) == insupport(dist, 1) - - # Test cdf - @test cdf(zero_inflated_dist, 0) ≈ 0.5 + 0.5 * cdf(dist, 0) - @test cdf(zero_inflated_dist, 1) ≈ 0.5 + 0.5 * cdf(dist, 1) - - # Test params - @test params(zero_inflated_dist) == (0.5, params(dist)...) 
- - # Test fit - data = [0, 0, 1, 2, 3] - fitted_dist = fit(ZeroInflated{Bernoulli, Normal}, data, 2) - @test fitted_dist.edge_proba == Bernoulli(0.6) - @test fitted_dist.dist isa Normal - end - - @testset "discrete distribution" begin - # Test construction with discrete distribution - dist_disc = Poisson(3) - zero_inflated_dist_disc = ZeroInflated(0.5, dist_disc) - @test zero_inflated_dist_disc.edge_proba == Bernoulli(0.5) - @test zero_inflated_dist_disc.dist == dist_disc - - # Test pdf with discrete distribution - @test pdf(zero_inflated_dist_disc, 0) ≈ 0.5 + 0.5 * pdf(dist_disc, 0) - @test pdf(zero_inflated_dist_disc, 1) ≈ 0.5 * pdf(dist_disc, 1) - - # Test get_proba_zero with discrete distribution - @test get_proba_zero(zero_inflated_dist_disc) == 0.5 - - # Test rand with discrete distribution - rng = MersenneTwister(1234) - sample_disc = rand(rng, zero_inflated_dist_disc) - @test sample_disc == 0 || insupport(dist_disc, sample_disc) - - # Test logpdf with discrete distribution - @test logpdf(zero_inflated_dist_disc, 0) ≈ - log(0.5 * (1 + pdf(dist_disc, 0))) - @test logpdf(zero_inflated_dist_disc, 1) ≈ log(0.5 * pdf(dist_disc, 1)) - - # Test minimum and maximum with discrete distribution - @test minimum(zero_inflated_dist_disc) == minimum(dist_disc) - @test maximum(zero_inflated_dist_disc) == maximum(dist_disc) - - # Test insupport with discrete distribution - @test insupport(zero_inflated_dist_disc, 0) - @test insupport(zero_inflated_dist_disc, 1) == insupport(dist_disc, 1) - - # Test cdf with discrete distribution - @test cdf(zero_inflated_dist_disc, 0) ≈ 0.5 + 0.5 * cdf(dist_disc, 0) - @test cdf(zero_inflated_dist_disc, 1) ≈ 0.5 + 0.5 * cdf(dist_disc, 1) - - # Test params with discrete distribution - @test params(zero_inflated_dist_disc) == (0.5, params(dist_disc)...) 
- - # Test fit with discrete distribution - data_disc = [0, 0, 1, 2, 3] - fitted_dist_disc = fit(ZeroInflated{Bernoulli, Poisson}, data_disc, 2) - @test fitted_dist_disc.edge_proba == Bernoulli(0.6) - @test fitted_dist_disc.dist isa Poisson - end -end diff --git a/test/observations/discretisation.jl b/test/observations/discretisation.jl deleted file mode 100644 index 49eb959..0000000 --- a/test/observations/discretisation.jl +++ /dev/null @@ -1,15 +0,0 @@ -using NetworkHistogram - -@testset "discretisation" begin - using Distributions - A = rand(-1:1, 20, 20) - for i in 1:20 - A[i, i] = 0 - end - g = Observations(A, Uniform(-1, 1)) - discretised_g, discretizer = discretise(g; number_levels = 6) - @test size(discretised_g.graph) == size(g.graph) - @test discretised_g.dist_ref isa NetworkHistogram.DiscretizedDistribution - @test ncategories(discretised_g.dist_ref) == 6 - @test all(discretised_g.graph .∈ Ref(0:6)) -end diff --git a/test/optimisation/config_rules/init_rule.jl b/test/optimisation/config_rules/init_rule.jl deleted file mode 100644 index f304378..0000000 --- a/test/optimisation/config_rules/init_rule.jl +++ /dev/null @@ -1,46 +0,0 @@ -import NetworkHistogram as NH - -@testset "regression test" begin - Random.seed!(1234123) - using Distributions: Bernoulli - A = BitMatrix([0 0 1 0 1 0 1 1 0 1 - 0 0 1 1 1 1 1 1 0 0 - 1 1 0 1 0 0 0 0 1 0 - 0 1 1 0 1 0 1 0 0 0 - 1 1 0 1 0 0 1 0 0 1 - 0 1 0 0 0 0 0 1 0 0 - 1 1 0 1 1 0 0 1 0 1 - 1 1 0 0 0 1 1 0 0 1 - 0 0 1 0 0 0 0 0 0 1 - 1 0 0 0 1 0 1 1 1 0]) - h_true_nethist = 2.643731 # version 0.2.3 from nethist package - k_true = 3 - obs = NH.Observations(A, Bernoulli(0.5)) - @testset "degrees" begin - k = NH.select_number_node_per_block(obs, NH.EstimatedDegrees()) - @test k == k_true - end - @testset "eigenvalues" begin - k = NH.select_number_node_per_block(obs, NH.EstimatedEigenvalues()) - @test k == k_true - end -end - -@testset "test oracle K" begin - Random.seed!(1234123) - using Distributions: Bernoulli - A = [0 1 1 
1 0 0 1 0 - 1 0 1 1 0 0 0 0 - 1 1 0 0 0 0 0 0 - 1 1 0 0 0 0 0 1 - 0 0 0 0 0 1 1 1 - 0 0 0 0 1 0 1 1 - 1 0 0 0 1 1 0 0 - 0 0 0 1 1 1 0 0] - obs = NH.Observations(A, Bernoulli(0.5)) - oracle = NH.OracleH(4) - @test NH.select_number_node_per_block(obs, oracle) == 4 - err = ArgumentError("The number of nodes per block 5 is too large for the \ - number of nodes 8, it should be at most 4") - @test_throws err NH.select_number_node_per_block(obs, NH.OracleH(5)) -end diff --git a/test/runtests.jl b/test/runtests.jl index 8c1ee77..069b837 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,38 +1,28 @@ using Test -using Aqua -using SparseArrays -include("TestNetworkHistogram.jl") - +using LinearAlgebra, SparseArrays +using NetworkHistogram @testset "Tests" begin - @testset "Discretizer tests" begin - include("discretised_dist/discretizer.jl") - end - @testset "Assignment tests" begin - include("assignments/default_assignment.jl") - include("assignments/bernoulli_assignment.jl") - include("assignments/categorical_assignment.jl") - include("assignments/sparse_assignment.jl") - include("assignments/sum_assignment.jl") + @testset "test can run" begin + @test 1 == 1 end - @testset "Rule optimization tests" begin - include("optimisation/config_rules/init_rule.jl") - end - @testset "Observations tests" begin - include("observations/discretisation.jl") - end + @testset "Edge list tests" begin + A = Symmetric(sprand(20,20,0.5)) + edgelist = EdgeList(A) - @testset "API tests" begin - include("test_api.jl") - end + for j in 1:20 + for i in 1:20 + if A[i,j] != 0 + nv_j, val_j = neighbors(edgelist, j) + @test i in nv_j + @test A[i,j] == val_j[findfirst(x -> x == i, nv_j)] + end + end + end - @testset "Generated tests" begin - include("generated_tests/all.jl") + @test eltype(edgelist) == eltype(A) + @test nodes(edgelist) == size(A,1) end - # @testset "Aqua.jl for package quality" begin - # using NetworkHistogram - # Aqua.test_all(NetworkHistogram) - # end end diff --git 
a/test/test_api.jl b/test/test_api.jl deleted file mode 100644 index 48fdefb..0000000 --- a/test/test_api.jl +++ /dev/null @@ -1,19 +0,0 @@ -@testset "test api" begin - using Distributions - A = rand(-1:1, 40, 40) - for i in 1:40 - A[i, i] = 0 - end - - g = Observations(Symmetric(A), Uniform(-1, 1)) - sbm_fitted, a = nethist(g; h = 10, iterations = 10) - - @test eltype(sbm_fitted) == typeof(Uniform(-1, 1)) - @test size(sbm_fitted) == (4, 4) - - sbm_discretised, a, discretizer = nethist_discretised( - g; number_levels = 5, h = 10, iterations = 10) - @test sbm_discretised[1, 1] isa DiscretizedDistribution - @test ncategories(sbm_discretised[1, 1]) == 5 - @test size(sbm_discretised) == (4, 4) -end From 8079189390a83a28bc474ac077deb3fb77c45102 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 28 Apr 2025 16:49:29 +0200 Subject: [PATCH 135/266] new things --- src/EdgeList.jl | 30 +++ src/assignment.jl | 106 ++++++++ src/block_model.jl | 13 + src/distributions_type.jl | 47 ++++ src/include_old.jl | 67 +++++ src/old_messy/api.jl | 155 +++++++++++ src/old_messy/assignments/Assignments.jl | 104 ++++++++ .../assignments/BernoulliAssignment/struct.jl | 118 +++++++++ .../assignments/BernoulliAssignment/swap.jl | 99 +++++++ .../CategoricalAssignment/struct.jl | 128 +++++++++ .../assignments/CategoricalAssignment/swap.jl | 201 ++++++++++++++ .../assignments/SparseAssignment/struct.jl | 115 ++++++++ .../assignments/SparseAssignment/swap.jl | 134 ++++++++++ .../assignments/SumAssignment/struct.jl | 94 +++++++ .../assignments/SumAssignment/swap.jl | 119 +++++++++ src/old_messy/assignments/group_numbering.jl | 86 ++++++ src/old_messy/assignments/include.jl | 4 + src/old_messy/bootstrap.jl | 15 ++ .../distributions/categorical_with_0.jl | 141 ++++++++++ src/old_messy/distributions/discrete_dist.jl | 142 ++++++++++ src/old_messy/distributions/discretizer.jl | 248 ++++++++++++++++++ src/old_messy/distributions/include.jl | 5 + src/old_messy/distributions/markov_chain.jl | 144 
++++++++++ src/old_messy/distributions/utils.jl | 19 ++ src/old_messy/distributions/zero_inflated.jl | 93 +++++++ src/old_messy/observations.jl | 231 ++++++++++++++++ .../optimisation/config_rules/InitRule.jl | 99 +++++++ .../optimisation/config_rules/accept_rule.jl | 25 ++ .../config_rules/bandwidth_selection_rule.jl | 101 +++++++ .../optimisation/config_rules/include.jl | 5 + .../optimisation/config_rules/stop_rule.jl | 49 ++++ .../optimisation/config_rules/swap_rule.jl | 27 ++ src/old_messy/optimisation/fit.jl | 99 +++++++ src/old_messy/optimisation/include.jl | 3 + src/old_messy/optimisation/least_squares.jl | 95 +++++++ src/old_messy/optimisation/swap.jl | 26 ++ src/old_messy/sbm.jl | 186 +++++++++++++ src/optimization/config_rules/InitRule.jl | 44 ++++ src/optimization/config_rules/accept_rule.jl | 22 ++ .../config_rules/bandwidth_selection_rule.jl | 42 +++ src/optimization/config_rules/include.jl | 5 + src/optimization/config_rules/stop_rule.jl | 47 ++++ src/optimization/config_rules/swap_rule.jl | 27 ++ src/optimization/greedy.jl | 43 +++ src/optimization/swap_workspace.jl | 75 ++++++ src/utils/SymArray.jl | 28 ++ src/utils/include.jl | 1 + test/old_tests/TestNetworkHistogram.jl | 30 +++ .../assignments/bernoulli_assignment.jl | 42 +++ .../assignments/categorical_assignment.jl | 126 +++++++++ .../assignments/default_assignment.jl | 17 ++ .../assignments/sparse_assignment.jl | 120 +++++++++ test/old_tests/assignments/sum_assignment.jl | 9 + .../old_tests/discretised_dist/discretizer.jl | 20 ++ test/old_tests/generated_tests/all.jl | 2 + .../generated_tests/test_distribution.jl | 84 ++++++ .../generated_tests/test_zero_inflated.jl | 97 +++++++ test/old_tests/observations/discretisation.jl | 15 ++ .../optimisation/config_rules/init_rule.jl | 46 ++++ test/old_tests/runtests.jl | 38 +++ test/old_tests/test_api.jl | 19 ++ 61 files changed, 4372 insertions(+) create mode 100644 src/EdgeList.jl create mode 100644 src/assignment.jl create mode 100644 
src/block_model.jl create mode 100644 src/distributions_type.jl create mode 100644 src/include_old.jl create mode 100644 src/old_messy/api.jl create mode 100644 src/old_messy/assignments/Assignments.jl create mode 100644 src/old_messy/assignments/BernoulliAssignment/struct.jl create mode 100644 src/old_messy/assignments/BernoulliAssignment/swap.jl create mode 100644 src/old_messy/assignments/CategoricalAssignment/struct.jl create mode 100644 src/old_messy/assignments/CategoricalAssignment/swap.jl create mode 100644 src/old_messy/assignments/SparseAssignment/struct.jl create mode 100644 src/old_messy/assignments/SparseAssignment/swap.jl create mode 100644 src/old_messy/assignments/SumAssignment/struct.jl create mode 100644 src/old_messy/assignments/SumAssignment/swap.jl create mode 100644 src/old_messy/assignments/group_numbering.jl create mode 100644 src/old_messy/assignments/include.jl create mode 100644 src/old_messy/bootstrap.jl create mode 100644 src/old_messy/distributions/categorical_with_0.jl create mode 100644 src/old_messy/distributions/discrete_dist.jl create mode 100644 src/old_messy/distributions/discretizer.jl create mode 100644 src/old_messy/distributions/include.jl create mode 100644 src/old_messy/distributions/markov_chain.jl create mode 100644 src/old_messy/distributions/utils.jl create mode 100644 src/old_messy/distributions/zero_inflated.jl create mode 100644 src/old_messy/observations.jl create mode 100644 src/old_messy/optimisation/config_rules/InitRule.jl create mode 100644 src/old_messy/optimisation/config_rules/accept_rule.jl create mode 100644 src/old_messy/optimisation/config_rules/bandwidth_selection_rule.jl create mode 100644 src/old_messy/optimisation/config_rules/include.jl create mode 100644 src/old_messy/optimisation/config_rules/stop_rule.jl create mode 100644 src/old_messy/optimisation/config_rules/swap_rule.jl create mode 100644 src/old_messy/optimisation/fit.jl create mode 100644 src/old_messy/optimisation/include.jl create mode 
100644 src/old_messy/optimisation/least_squares.jl create mode 100644 src/old_messy/optimisation/swap.jl create mode 100644 src/old_messy/sbm.jl create mode 100644 src/optimization/config_rules/InitRule.jl create mode 100644 src/optimization/config_rules/accept_rule.jl create mode 100644 src/optimization/config_rules/bandwidth_selection_rule.jl create mode 100644 src/optimization/config_rules/include.jl create mode 100644 src/optimization/config_rules/stop_rule.jl create mode 100644 src/optimization/config_rules/swap_rule.jl create mode 100644 src/optimization/greedy.jl create mode 100644 src/optimization/swap_workspace.jl create mode 100644 src/utils/SymArray.jl create mode 100644 src/utils/include.jl create mode 100644 test/old_tests/TestNetworkHistogram.jl create mode 100644 test/old_tests/assignments/bernoulli_assignment.jl create mode 100644 test/old_tests/assignments/categorical_assignment.jl create mode 100644 test/old_tests/assignments/default_assignment.jl create mode 100644 test/old_tests/assignments/sparse_assignment.jl create mode 100644 test/old_tests/assignments/sum_assignment.jl create mode 100644 test/old_tests/discretised_dist/discretizer.jl create mode 100644 test/old_tests/generated_tests/all.jl create mode 100644 test/old_tests/generated_tests/test_distribution.jl create mode 100644 test/old_tests/generated_tests/test_zero_inflated.jl create mode 100644 test/old_tests/observations/discretisation.jl create mode 100644 test/old_tests/optimisation/config_rules/init_rule.jl create mode 100644 test/old_tests/runtests.jl create mode 100644 test/old_tests/test_api.jl diff --git a/src/EdgeList.jl b/src/EdgeList.jl new file mode 100644 index 0000000..2f57477 --- /dev/null +++ b/src/EdgeList.jl @@ -0,0 +1,30 @@ +struct EdgeList{E} + data::Vector{Vector{Tuple{Int,E}}} +end + +function neighbors(A::EdgeList{E}, i::Int) where {E} + return first.(A.data[i]), last.(A.data[i]) +end + +function Base.eltype(edgelist::EdgeList{E}) where {E} + return E +end + 
+function nodes(edgelist::EdgeList{E}) where {E} + return length(edgelist.data) +end + + +function EdgeList(A::AbstractMatrix{E}) where {E} + n = size(A, 1) + data = Vector{Vector{Tuple{Int,E}}}(undef, n) + for j in 1:n + data[j] = Vector{Tuple{Int,E}}(undef, 0) + for i in 1:n + if A[i, j] != 0 + push!(data[j], (i, A[i, j])) + end + end + end + return EdgeList(data) +end diff --git a/src/assignment.jl b/src/assignment.jl new file mode 100644 index 0000000..7c0dce5 --- /dev/null +++ b/src/assignment.jl @@ -0,0 +1,106 @@ +""" +Array-like storage for the number of nodes in each group. Try to split the number of nodes +into equal groups, but if it is not possible, the last group may have more nodes. +""" +struct GroupSize{T} <: AbstractVector{Int} + group_number::T + number_groups::Int + + function GroupSize(number_nodes, h::Real) + @assert 0 < h < 1 + standard_group = floor(Int, number_nodes * h) + GroupSize(number_nodes, standard_group) + end + + function GroupSize(number_nodes, standard_group::Integer) + @assert 1 < standard_group <= number_nodes + number_groups = number_nodes ÷ standard_group # number of standard groups! + if number_groups * standard_group == number_nodes + new{Int}(standard_group, number_groups) + else + remainder_group = standard_group + + mod(number_nodes, standard_group) + new{Tuple{Int, Int}}( + (standard_group, remainder_group), number_groups) + end + end +end + +Base.size(g::GroupSize) = (g.number_groups,) +Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) + @boundscheck checkbounds(g, i) + return g.group_number +end + +Base.@propagate_inbounds function Base.getindex( + g::GroupSize{Tuple{Int, Int}}, i::Int) + @boundscheck checkbounds(g, i) + return i < length(g) ? 
g.group_number[1] : g.group_number[2] +end + + + +mutable struct Assignment{E, D, A, F} + node_labels::AbstractVector{Int} + const edges::EdgeList{E} + const dists::EdgeList{D} + θ::SymArray{D} + log_likelihood::SymArray{F} +end + + +function loglikelihood(a::Assignment) + return sum(a.log_likelihood) +end + +function group(a::Assignment, node::Int) + return a.node_labels[node] +end + +function get_edges_in_groups(a::Assignment, g1::Int, g2::Int) + nodes_g1 = findall(x -> x == g1, a.node_labels) + edges = Vector(eltype(a.edges), 0) + if g1 == g2 + for u in nodes_g1 + for (v, e) in a.edges[u] + if v in nodes_g1 && u < v + push!(edges, e) + end + end + end + else + nodes_g2 = findall(x -> x == g2, a.node_labels) + for u in nodes_g1 + for (v, e) in a.edges[u] + if v in nodes_g2 + push!(edges, e) + end + end + end + end + return edges +end + + +function Assignment(node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} + dists = fit.(dist ,edge_list) + number_groups = length(unique(node_labels)) + θ = SymArray(number_groups, dist) + log_likelihood = SymArray(number_groups, 0.0) + for u in 1:length(dists) + g1 = node_labels[u] + for (v,d) in neighbors(dists, u) + g2 = node_labels[v] + if g1 == g2 && u < v + continue + end + θ[g1, g2] = add_to(θ[g1, g2], d) + end + end + for k in 1:number_groups + for l in k:number_groups + log_likelihood[k, l] = loglikelihood(θ[k,l], get_edges_in_groups(edge_list, k, l)) + end + end + return Assignment(node_labels, edge_list, dists, θ, log_likelihood) +end diff --git a/src/block_model.jl b/src/block_model.jl new file mode 100644 index 0000000..c824d92 --- /dev/null +++ b/src/block_model.jl @@ -0,0 +1,13 @@ +struct BlockModel{D,K,T} + _dists::SymArray{D} + sizes::SVector{K,T} +end + + +Base.@propagate_inbounds function Base.getindex(s::BlockModel, i, j) + return s._dists[minmax(i, j)] +end + +function Base.setindex!(s::BlockModel, v, i, j) + s._dists[minmax(i, j)] = v +end diff --git a/src/distributions_type.jl 
b/src/distributions_type.jl new file mode 100644 index 0000000..7402941 --- /dev/null +++ b/src/distributions_type.jl @@ -0,0 +1,47 @@ +struct Dist{D} + dist::D + counts::Int +end + +Dist(d) = Dist(d, 1) + +Base.broadcastable(x::Dist) = Ref(x) + +function add_to(avgdist::Dist{D}, dist::D) where {D} + return Dist(agg_params(avgdist.dist, dist, avgdist.counts / (avgdist.counts + 1), 1 / (avgdist.counts + 1)), avgdist.counts + 1) +end + + +function remove_from(avgdist::Dist{D}, dist::D) where {D} + if avgdist.counts == 1 + error("Cannot remove from a distribution with only one sample") + else + return Dist(agg_params(avgdist.dist, dist, avgdist.counts / (avgdist.counts - 1), - 1 / (avgdist.counts - 1)), avgdist.counts -1) + end +end + +for f in [:logpdf, :sample, :dist, :eltype] + @eval $f(d::Dist, args...) = $f(d.dist, args...) +end + +fit(d::Dist, x) = Dist(fit(d.dist, x), d.counts) +loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) + +# expose compression step that assumes there is a pdf(d, typeof(compressed(x))) properly defined +# by default do nothing +_fast_compressed_obs(d, x) = x + +unwrap(d::Dist) = d.dist + +# Bernoulli distribution + +struct Bernoulli{T<:Real} + p::T +end + + +agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) = Bernoulli(w1 * d1.p + w2 * d2.p) +fit(::Bernoulli, x) = Bernoulli(mean(x)) +sample(d::Bernoulli, n=1) = rand(n) .<= d.p +dist(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) +logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) diff --git a/src/include_old.jl b/src/include_old.jl new file mode 100644 index 0000000..beda772 --- /dev/null +++ b/src/include_old.jl @@ -0,0 +1,67 @@ +using LinearAlgebra, SparseArrays +using Distributions, DensityInterface +using Graphs, SimpleWeightedGraphs +using PermutationSymmetricTensors +using ProgressMeter: Progress, next!, finish!, ProgressUnknown +import StatsBase, Random +using DensityInterface: logdensityof +using LogExpFunctions: xlogx, xlogy +using ArnoldiMethod: LM, SR, LR, 
partialschur, partialeigen +using KrylovKit: eigsolve +import Metis +import IterativeSolvers +import Clustering +import StatsAPI: loglikelihood, fit +using CategoricalArrays, CategoricalDistributions +using Combinatorics: permutations +using StaticArrays +using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx, nrun, + zeros_tuple +import Bootstrap: bootstrap +import Base.maximum, Base.minimum +import Random: rand +import Base.convert +import Distributions: pdf, logpdf, ncategories, cdf, rand + + +include("old_messy/distributions/include.jl") +include("old_messy/assignments/Assignments.jl") +include("old_messy/sbm.jl") +include("old_messy/observations.jl") +include("old_messy/optimisation/include.jl") + +# more specialised and faster assignment types and methods +include("old_messy/assignments/include.jl") + +include("old_messy/api.jl") +include("old_messy/bootstrap.jl") + +export nethist, nethist_discretised +export loglikelihood, fit, cdf, pdf + +# export options for optimisation +export estimate_graphon +# starting assignment rules +export InitRule +export OrderedStart, RandomStart, SpectralStart, MetisStart, FromAssignment +# accept rules +export AcceptRule +export Strict +# stopping rules +export PreviousBestValue +# bandwidth selection rules +export OracleK, EstimatedEigenvalues, EstimatedDegrees, + select_number_node_per_block +# random local search rules +export RandomNodeSwap, RandomGroupSwap + +# export useful function for manipulating assignments +export Assignment, number_groups, number_nodes +export get_ordered_adjacency_matrix, get_vertex_in_group, get_group_of_vertex +export BernoulliData, CategoricalData +export Observations, discretise +export DiscretizedDistribution + +export Observations, estimate_graphon, nethist, nethist_discretised + +export bootstrap diff --git a/src/old_messy/api.jl b/src/old_messy/api.jl new file mode 100644 index 0000000..d09a6f3 --- /dev/null +++ b/src/old_messy/api.jl @@ -0,0 +1,155 @@ +""" + 
_default_init(dist::Distribution, start = MetisStart()) + +Initialize the distribution with a default rule. + +# Arguments +- `dist::Distribution`: The distribution to initialize. +- `start`: The starting method. + +# Returns +- `InitRule`: The initialization rule. +""" +function _default_init(dist::Distribution, start = MetisStart()) + if dist isa Bernoulli + return InitRule(start, Val{BernoulliData}()) + elseif dist isa Categorical + return InitRule(start, Val{CategoricalData}()) + elseif dist isa DiscretizedDistribution || dist isa ZeroInflatedCategorical + return InitRule(start, Val{SparseData}()) + else + return InitRule(start, nothing) + end +end + +function _default_init(::DiscreteMarkovChain, start = RandomStart()) + return InitRule(start, Val{SumData}()) +end + +""" + _nethist(g::Observations{G, D}, h; kwargs...) + +Estimate the graphon and fit the model to the given graph observations. + +# Arguments +- `g::Observations{G, D}`: The graph observations. +- `h`: Number of nodes per block. +- `kwargs...`: Additional keyword arguments. + +# Returns +- `fit_model`: The fitted model. +- `a`: The assignment of nodes to blocks. +""" +function _nethist(g::Observations{G, D}, h; kwargs...) where {G, D} + kwargs_dict = Dict(kwargs) + start_clustering = pop!(kwargs_dict, :start_clustering, MetisStart()) + initialise_rule = pop!( + kwargs_dict, :initialise_rule, _default_init( + g.dist_ref, start_clustering)) + a = estimate_graphon(g, h; + kwargs_dict..., initialise_rule = initialise_rule) + return fit(a, g), a +end + +""" + nethist(g::Observations{G, D}; h, iterations, stalled_iter, swap_rule, accept_rule, progress_bar, start_clustering) + +Fit a Stochastic Block Model (SBM) to the given graph observations. + +# Arguments +- `g::Observations{G, D}`: The graph observations. +- `h`: Number of nodes per block. +- `iterations`: Maximum number of iterations. +- `stalled_iter`: Number of stalled iterations before stopping. 
+- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. +- `progress_bar::Bool`: Whether to show a progress bar. +- `start_clustering`: Initial clustering method. + +# Returns +- `sbm`: The fitted SBM. +- `a`: The assignment of nodes to blocks. +""" +function nethist(g::Observations{G, D}; + h = select_number_node_per_block(g, EstimatedDegrees()), + iterations = 100_000, + stalled_iter = 1000, + swap_rule::NodeSwapRule = RandomGroupSwap(), + accept_rule::AcceptRule = Strict(), + progress_bar::Bool = false, + start_clustering = MetisStart() +) where {G, D} + return _nethist(g, h; + iterations = iterations, + swap_rule = swap_rule, + accept_rule = accept_rule, + stop_rule = PreviousBestValue(stalled_iter), + progress_bar = progress_bar, + start_clustering = start_clustering) +end + +""" + nethist_discretised(g::Observations{G, D}; number_levels, h, iterations, stalled_iter, swap_rule, accept_rule, progress_bar, start_clustering) + +Fit a discretised Stochastic Block Model (SBM) to the given graph observations. + +# Arguments +- `g::Observations{G, D}`: The graph observations. +- `number_levels`: Number of levels for discretisation. +- `h`: Number of nodes per block. +- `iterations`: Maximum number of iterations. +- `stalled_iter`: Number of stalled iterations before stopping. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. +- `progress_bar::Bool`: Whether to show a progress bar. +- `start_clustering`: Initial clustering method. + +# Returns +- `sbm_discretise`: The fitted discretised SBM. +- `a`: The assignment of nodes to blocks. +- `discretiser`: The discretiser used. 
+""" +function nethist_discretised(g::Observations{G, D}; + number_levels = nothing, + h = select_number_node_per_block(g, EstimatedDegrees()), + iterations = 100_000, + stalled_iter = 1000, + swap_rule::NodeSwapRule = RandomGroupSwap(), + accept_rule::AcceptRule = Strict(), + progress_bar::Bool = false, + start_clustering = MetisStart() +) where {G, D} + num_groups = isnothing(number_levels) ? number_nodes(g) ÷ h : nothing + obs_discrete, discretiser = discretise( + g, number_groups = num_groups, number_levels = number_levels) + sbm_discretise, a = _nethist(obs_discrete, h; + iterations = iterations, + swap_rule = swap_rule, + accept_rule = accept_rule, + stop_rule = PreviousBestValue(stalled_iter), + progress_bar = progress_bar, + start_clustering = start_clustering) + return sbm_discretise, a, discretiser +end + + + +function nethist_mc(g::Observations{G, <:DiscreteMarkovChain}; + h = number_nodes(g) ÷ 2, + iterations = 100_000, + stalled_iter = 1000, + swap_rule::NodeSwapRule = RandomGroupSwap(), + accept_rule::AcceptRule = Strict(), + progress_bar::Bool = true, + start_clustering = RandomStart() +) where {G} + initialise_rule = _default_init(g.dist_ref, start_clustering) + a = estimate_graphon(g, h; + iterations = iterations, + initialise_rule = initialise_rule, + swap_rule = swap_rule, + accept_rule = accept_rule, + stop_rule = PreviousBestValue(stalled_iter), + progress_bar = progress_bar) + return fit(a, g), a +end diff --git a/src/old_messy/assignments/Assignments.jl b/src/old_messy/assignments/Assignments.jl new file mode 100644 index 0000000..c312324 --- /dev/null +++ b/src/old_messy/assignments/Assignments.jl @@ -0,0 +1,104 @@ +include("group_numbering.jl") + +""" + struct Assignment{T, B} <: AbstractVector{Vector{Int}} + +A structure representing an assignment of nodes to groups. + +# Fields +- `group_size::GroupSize{T}`: The size of each group. +- `node_labels::Vector{Int}`: A vector of node labels. 
+- `additional_data::B`: Additional data associated with the assignment. + +# Constructor + Assignment(group_size::GroupSize{T}, node_labels, additional_data::B) where {T, B} + +Creates a new `Assignment` instance. + +# Arguments +- `group_size::GroupSize{T}`: The size of each group. +- `node_labels::Vector{Int}`: A vector of node labels. The length of this vector must be equal to the sum of `group_size`. +- `additional_data::B`: Additional data associated with the assignment. + +# Throws +- `ArgumentError`: If the length of `node_labels` is not equal to the sum of `group_size`. +""" +struct Assignment{T, B} <: AbstractVector{Vector{Int}} + group_size::GroupSize{T} + node_labels::Vector{Int} + additional_data::B + + function Assignment(group_size::GroupSize{T}, node_labels, + additional_data::B) where {T, B} + if length(node_labels) != sum(group_size) + throw(ArgumentError("The length of `node_labels` must be equal to the sum of \ + `group_size`")) + end + return new{T, B}(group_size, node_labels, additional_data) + end +end + +function Assignment(group_size::GroupSize, node_labels) + if length(node_labels) != sum(group_size) + throw(ArgumentError("The length of `node_labels` $(length(node_labels)) must be \ + equal to the sum of `group_size` $(sum(group_size))")) + end + c = StatsBase.countmap(node_labels) + if length(c) != length(group_size) + throw(ArgumentError("The number of unique elements in `node_labels` $(length(c)) \ + must be equal to the length of `group_size` $(length(group_size))")) + end + for (k, v) in c + if v != group_size[k] + throw(ArgumentError("The number of elements in `node_labels` $(v) for group \ + $(k) must be equal to the size of the group $(group_size[k])")) + end + end + return Assignment(group_size, node_labels, nothing) +end + +function number_groups(assignment::Assignment) + return length(assignment.group_size) +end + +function number_nodes(assignment::Assignment) + return length(assignment.node_labels) +end + +function 
get_vertex_in_group(assignment::Assignment, group) + return findall(assignment.node_labels .== group) +end + +function get_group_of_vertex(assignment::Assignment, vertex) + return assignment.node_labels[vertex] +end + +function get_edge_indices(a::Assignment, i, j) + if i == j + return get_edge_indices(a, i) + else + return [(x, y) for x in get_vertex_in_group(a, i) + for y in get_vertex_in_group(a, j)] + end +end + +function get_edge_indices(a::Assignment, i) + nodes_i = get_vertex_in_group(a, i) + return [(x, y) for x in nodes_i for y in nodes_i if x < y] +end + +Base.size(a::Assignment) = (number_groups(a),) +Base.@propagate_inbounds function Base.getindex(a::Assignment, i) + @boundscheck checkbounds(a, i) + return get_vertex_in_group(a, i) +end + +function get_ordered_adjacency_matrix(a::Assignment, A, by = identity) + perm = sortperm(a.node_labels, by = by) + return A[perm, perm] +end + +function Base.deepcopy(a::Assignment) + return Assignment( + a.group_size, copy(a.node_labels), deepcopy(a.additional_data)) +end diff --git a/src/old_messy/assignments/BernoulliAssignment/struct.jl b/src/old_messy/assignments/BernoulliAssignment/struct.jl new file mode 100644 index 0000000..73fab50 --- /dev/null +++ b/src/old_messy/assignments/BernoulliAssignment/struct.jl @@ -0,0 +1,118 @@ +""" + mutable struct BernoulliData{F} + +A data structure to store information related to a Bernoulli assignment in a network. + +# Fields +- `counts::Matrix{Int}`: A matrix representing the maximum number of edges between groups. +- `realized::Matrix{Int}`: A matrix representing the number of edges between groups. +- `estimated_theta::Matrix{F}`: A matrix of estimated parameters (theta). +- `A::BitMatrix`: An adjacency matrix representing the network structure. 
+- `log_likelihood::F`: +""" +mutable struct BernoulliData{F} + counts::Matrix{Int} + realized::Matrix{Int} + estimated_theta::Matrix{F} + A::BitMatrix # possible improvement by using an adjacency list + log_likelihood::F +end + +const BernoulliAssignment{T, F} = Assignment{T, BernoulliData{F}} +const BernoulliInitRule{S, F} = InitRule{S, Val{BernoulliData}} + +function BernoulliAssignment( + g, group_size::GroupSize, node_labels::Vector{Int}) + bernoulli_data = make_bernoulli_data(g, node_labels, group_size) + return Assignment(group_size, node_labels, bernoulli_data) +end + +function make_assignment(g, h, init_rule::BernoulliInitRule) + group_size, + node_labels = initialize_node_labels( + g, h, init_rule.starting_assignment_rule) + return BernoulliAssignment(g, group_size, node_labels) +end + +# might be worth using graph accessors instead of the adjacency matrix ? +function make_bernoulli_data(g, node_labels, group_size) + number_groups = length(group_size) + n = length(node_labels) + counts = zeros(Int, number_groups, number_groups) + realized = zeros(Int, number_groups, number_groups) + A = convert_bitmatrix(g) + + # below needs to be abstracted: not sure how diagonal is handled if nonzero + # addtioally, we should be able to deal with missing values ! 
+ # This concerns the counts matrix above as well + @inbounds @simd for k in 1:number_groups + for l in k:number_groups + realized[k, l] = sum(A[node_labels .== k, node_labels .== l]) + realized[l, k] = realized[k, l] + counts[k, l] = group_size[k] * group_size[l] + counts[l, k] = counts[k, l] + end + end + + @inbounds @simd for k in 1:number_groups + counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2 + realized[k, k] = sum(A[node_labels .== k, node_labels .== k]) ÷ 2 + end + + estimated_theta = realized ./ counts + ll = compute_log_likelihood(estimated_theta, counts) + return BernoulliData(counts, realized, estimated_theta, A, ll) +end + +function convert_bitmatrix(g::Observations{<:AbstractGraph, D}) where {D} + A = collect(adjacency_matrix(g.graph)) + return convert(BitMatrix, collect(adjacency_matrix(g.graph))) +end + +function convert_bitmatrix(g::Observations{<:AbstractMatrix, D}) where {D} + return convert(BitMatrix, g.graph) +end + +function compute_log_likelihood(estimated_theta::AbstractMatrix{F}, + counts::AbstractMatrix{T}) where {F <: Real, T <: Real} + number_groups = size(estimated_theta, 1) + loglik = zero(eltype(estimated_theta)) + @inbounds for j in 1:number_groups + @simd for i in j:number_groups + θ = estimated_theta[i, j] + loglik += (xlogx(θ) + xlogx(1 - θ)) * counts[i, j] + end + end + return loglik +end + +function loglikelihood(assignment::BernoulliAssignment) + return assignment.additional_data.log_likelihood +end + +loglikelihood(a::BernoulliAssignment, g::Observations) = loglikelihood(a) + +function force_recompute_ll(a::BernoulliAssignment, g::Observations) + a_simple = Assignment(a.group_size, a.node_labels) + return loglikelihood(a_simple, g) +end + +include("swap.jl") + +function get_ordered_adjacency_matrix(a::BernoulliAssignment, by = identity) + return get_ordered_adjacency_matrix(a, a.additional_data.A, by) +end + +# TODO: move to sparse structure to encode difference between 0 weight and absence of edge +# from docs: +# A = 
sparse(I,J,V) +# rows = rowvals(A) +# vals = nonzeros(A) +# m, n = size(A) +# for j = 1:n +# for i in nzrange(A, j) +# row = rows[i] +# val = vals[i] +# # perform sparse wizardry... +# end +# end diff --git a/src/old_messy/assignments/BernoulliAssignment/swap.jl b/src/old_messy/assignments/BernoulliAssignment/swap.jl new file mode 100644 index 0000000..edfd9a3 --- /dev/null +++ b/src/old_messy/assignments/BernoulliAssignment/swap.jl @@ -0,0 +1,99 @@ +mutable struct BernoulliSwap{F} <: Swap + index1::Int + index2::Int + realized::Matrix{Int} + estimated_theta::Matrix{F} + log_likelihood::F +end + +function make_swap( + a::BernoulliAssignment{T, F}, id) where {T, F} + return BernoulliSwap(id[1], id[2], copy(a.additional_data.realized), + copy(a.additional_data.estimated_theta), + a.additional_data.log_likelihood) +end + +function make_swap!(swap::BernoulliSwap{F}, a::BernoulliAssignment{T, F}, + id) where {T, F} + swap.index1, swap.index2 = id + copy!(swap.realized, a.additional_data.realized) + copy!(swap.estimated_theta, a.additional_data.estimated_theta) + swap.log_likelihood = a.additional_data.log_likelihood +end + +function revert_swap!( + a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} + swap_node_labels!(a, swap.index1, swap.index2) + copy!(a.additional_data.realized, swap.realized) + copy!(a.additional_data.estimated_theta, swap.estimated_theta) + a.additional_data.log_likelihood = swap.log_likelihood +end + +function apply_swap!( + a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} + update_observed_and_labels!(a, swap) + update_ll!(a) +end + +function update_observed_and_labels!( + a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + + for i in axes(a.additional_data.A, 2) + if i == swap.index1 || i == swap.index2 || + a.additional_data.A[swap.index1, i] == + a.additional_data.A[swap.index2, i] + continue + end + 
group_inter = get_group_of_vertex(a, i) + if a.additional_data.A[swap.index1, i] + a.additional_data.realized[g1, group_inter] -= 1 + a.additional_data.realized[ + group_inter, g1] = a.additional_data.realized[ + g1, group_inter] + + a.additional_data.realized[g2, group_inter] += 1 + a.additional_data.realized[ + group_inter, g2] = a.additional_data.realized[ + g2, group_inter] + end + if a.additional_data.A[swap.index2, i] + a.additional_data.realized[g2, group_inter] -= 1 + a.additional_data.realized[ + group_inter, g2] = a.additional_data.realized[ + g2, group_inter] + + a.additional_data.realized[g1, group_inter] += 1 + a.additional_data.realized[ + group_inter, g1] = a.additional_data.realized[ + g1, group_inter] + end + end + + @. a.additional_data.estimated_theta = a.additional_data.realized / + a.additional_data.counts + + # swap of the labels should happen after the update of the realized and estimated_theta + # for the above loop to work correctly + swap_node_labels!(a, swap.index1, swap.index2) + return nothing +end + +function update_ll!(a::BernoulliAssignment) + a.additional_data.log_likelihood = compute_log_likelihood( + a.additional_data.estimated_theta, a.additional_data.counts) + return nothing +end + +function fit(a::BernoulliAssignment, g::Observations) + dists = initialize_sbm(a.group_size, Bernoulli(0.5)) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + dists[group1, + group2] = Bernoulli(a.additional_data.estimated_theta[ + group1, group2]) + end + end + return dists +end diff --git a/src/old_messy/assignments/CategoricalAssignment/struct.jl b/src/old_messy/assignments/CategoricalAssignment/struct.jl new file mode 100644 index 0000000..5ccd175 --- /dev/null +++ b/src/old_messy/assignments/CategoricalAssignment/struct.jl @@ -0,0 +1,128 @@ +mutable struct CategoricalData{F, C} + counts::Matrix{Int} + realized::Array{Int, 3} + estimated_theta::Array{F, 3} + A::Matrix{C} # possible use of CategoricalArrays.jl ? 
+ log_likelihood::F # need to remove this type + scratch::Matrix{Int} +end + +const CategoricalAssignment{T, F, C} = Assignment{ + T, CategoricalData{F, C}} +const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} + +function CategoricalAssignment( + g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where { + G, D} + categorical_data = make_categorical_data(g, node_labels, group_size) + return Assignment(group_size, node_labels, categorical_data) +end + +function make_assignment(g, h, init_rule::CategoricalInitRule) + group_size, + node_labels = initialize_node_labels( + g, h, init_rule.starting_assignment_rule) + a = CategoricalAssignment(g, group_size, node_labels) + return a +end + +function make_categorical_data(g, node_labels, group_size) + number_groups = length(group_size) + A, num_categories = categorical_matrix(g) + counts = zeros(Int, number_groups, number_groups) + realized = zeros(Int, num_categories, number_groups, number_groups) + estimated_theta = zeros( + Float64, num_categories, number_groups, number_groups) + + _count_cat_occurences!( + counts, realized, g, Assignment(group_size, node_labels)) + + _fast_div!(estimated_theta, realized, counts) + scratch = zeros(Int, num_categories, number_groups) + + ll = compute_log_likelihood(estimated_theta, realized) + return CategoricalData(counts, realized, estimated_theta, A, ll, scratch) +end + +function _count_cat_occurences!(counts, realized, g, a_dummy) + @inbounds for k in 1:number_groups(a_dummy) + for l in k:number_groups(a_dummy) + counts_dict = StatsBase.countmap(get_obs.( + Ref(g), get_edge_indices(a_dummy, k, l))) + total = 0 + for (m, v) in counts_dict + realized[m, k, l] = v + realized[m, l, k] = v + total += v + end + counts[k, l] = total + counts[l, k] = total + end + end +end + +function recount_occurences!(a) + _count_cat_occurences!( + a.additional_data.counts, a.additional_data.realized, a.additional_data.A, a) + return nothing +end + +function 
compute_log_likelihood( + estimated_theta::Array{T, 3}, realized::Array{F, 3}) where { + T, F} + loglik = zero(T) + number_groups = size(estimated_theta, 2) + number_decorations = size(estimated_theta, 1) + @inbounds for j in 1:number_groups + for i in j:number_groups + for m in 1:number_decorations + if realized[m, i, j] != 0 + loglik += realized[m, i, j] * log(estimated_theta[m, i, j]) + end + end + #loglik += sum(log.(estimated_theta[i, j]) .* realized[i, j]) + #loglik += sum(xlogy.(realized[i,j], estimated_theta[i, j]) ) + end + end + return loglik +end + +function categorical_matrix(A::CategoricalMatrix) + @info "Converting CategoricalMatrix to matrix" + categories = levels(A) + return levelcode.(recode( + A, [l => i for (i, l) in enumerate(categories)]..., missing => 0)) +end + +# to update, just for test now +function categorical_matrix(A::AbstractMatrix{Int}) + min_A = minimum(A) + if min_A > 1 + A_inter = A .- min_A .+ 1 + else + A_inter = copy(A) + end + for i in 1:size(A_inter, 1) + A_inter[i, i] = 0 + end + return A_inter +end + +function categorical_matrix(g::Observations) + return categorical_matrix(g.graph), ncategories(g.dist_ref) +end + +function loglikelihood(a::CategoricalAssignment, g::Observations) + return a.additional_data.log_likelihood +end + +function force_recompute_ll(a::CategoricalAssignment, g::Observations) + a_simple = Assignment(a.group_size, a.node_labels) + return loglikelihood(a_simple, g) +end + +include("swap.jl") + +function get_ordered_adjacency_matrix(a::CategoricalAssignment, by = identity) + return get_ordered_adjacency_matrix(a, a.additional_data.A, by) +end diff --git a/src/old_messy/assignments/CategoricalAssignment/swap.jl b/src/old_messy/assignments/CategoricalAssignment/swap.jl new file mode 100644 index 0000000..43b6cd9 --- /dev/null +++ b/src/old_messy/assignments/CategoricalAssignment/swap.jl @@ -0,0 +1,201 @@ +mutable struct CategoricalSwap{F} <: Swap + index1::Int + index2::Int + realized::Array{Int, 3} + 
estimated_theta::Array{F, 3} + log_likelihood::F +end + +function make_swap(a::CategoricalAssignment, id) + return CategoricalSwap(id[1], id[2], copy(a.additional_data.realized), + copy(a.additional_data.estimated_theta), + a.additional_data.log_likelihood) +end + +function copy_realized_and_theta!(a, b) + copy!(a.realized, b.realized) + copy!(a.estimated_theta, b.estimated_theta) + a.log_likelihood = b.log_likelihood + return nothing +end + +function make_swap!( + swap::CategoricalSwap{F}, a::CategoricalAssignment{T, F, C}, + id) where {T, F, C} + swap.index1, swap.index2 = id + copy_realized_and_theta!(swap, a.additional_data) +end + +function revert_swap!( + a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { + T, F, C} + swap_node_labels!(a, swap.index1, swap.index2) + copy_realized_and_theta!(a.additional_data, swap) + #copy!.(a.additional_data.realized, swap.realized) + #copy!.(a.additional_data.estimated_theta, swap.estimated_theta) + #a.additional_data.log_likelihood = swap.log_likelihood + #return nothing +end + +function apply_swap!( + a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { + T, F, C} + #update_observed_and_labels!(a, swap) + new_update_observed_and_labels!(a, swap) + update_ll!(a) +end + +function update_ll!(a::CategoricalAssignment) + a.additional_data.log_likelihood = compute_log_likelihood( + a.additional_data.estimated_theta, a.additional_data.realized) + return nothing +end + +function fit( + a::CategoricalAssignment{T, F, C}, g::Observations) where { + T, F, C} + dists = initialize_sbm( + a.group_size, g.dist_ref) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + dists[group1, + group2] = Categorical(a.additional_data.estimated_theta[:, + group1, group2]) + end + end + return dists +end + +function fit( + a::CategoricalAssignment{T, F, C}, g::Observations{ + G, <:DiscretizedDistribution}) where { + T, F, C, G} + dists = initialize_sbm( + a.group_size, g.dist_ref) + for group1 in 
1:number_groups(a) + for group2 in 1:number_groups(a) + set_params!( + dists[group1, + group2], a.additional_data.estimated_theta[:, + group1, group2]) + end + end + return dists +end + +function _move_connection!(realized, group_origin, group_dest, scratch) + @inbounds for group in axes(realized, 2) + for label in axes(realized, 1) + realized[label, group, group_origin] -= scratch[label, group] + realized[label, group, group_dest] += scratch[label, group] + realized[label, group_origin, group] = realized[ + label, group, group_origin] + realized[label, group_dest, group] = realized[ + label, group, group_dest] + end + end +end + +# need to rethink if want to use muli-threading +# check https://juliafolds.github.io/Transducers.jl/dev/tutorials/words/ +function new_update_observed_and_labels!( + a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { + T, F, C} + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + if g1 == g2 + return nothing + end + + a.additional_data.scratch .= 0 + for i in axes(a.additional_data.A, 1) + if i == swap.index1 || i == swap.index2 + continue + end + @inbounds obs = a.additional_data.A[i, swap.index1] + if obs != 0 + group_inter = get_group_of_vertex(a, i) + a.additional_data.scratch[obs, group_inter] += 1 + end + end + _move_connection!( + a.additional_data.realized, g1, g2, a.additional_data.scratch) + + a.additional_data.scratch .= 0 + for i in axes(a.additional_data.A, 1) + if i == swap.index1 || i == swap.index2 + continue + end + @inbounds obs = a.additional_data.A[i, swap.index2] + if obs != 0 + group_inter = get_group_of_vertex(a, i) + a.additional_data.scratch[obs, group_inter] += 1 + end + end + _move_connection!( + a.additional_data.realized, g2, g1, a.additional_data.scratch) + + _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, + a.additional_data.counts) + + # swap of the labels should happen after the update of the realized and estimated_theta + # 
for the above loop to work correctly + swap_node_labels!(a, swap.index1, swap.index2) + return nothing +end + +function update_observed_and_labels!( + a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { + T, F, C} + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + + adj_1 = @view a.additional_data.A[:, swap.index1] + adj_2 = @view a.additional_data.A[:, swap.index2] + + for i in axes(a.additional_data.A, 1) + if i == swap.index1 || i == swap.index2 + continue + end + obs_1 = adj_1[i] + obs_2 = adj_2[i] + group_inter = get_group_of_vertex(a, i) + if obs_1 != obs_2 + _fast_update!!( + a.additional_data.realized, g1, g2, obs_1, obs_2, group_inter) + end + end + + _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, + a.additional_data.counts) + + # swap of the labels should happen after the update of the realized and estimated_theta + # for the above loop to work correctly + swap_node_labels!(a, swap.index1, swap.index2) + return nothing +end + +function _fast_update!!(realized, g1, g2, obs_1, obs_2, g_inter) + realized[obs_1, g1, g_inter] -= 1 + realized[obs_1, g_inter, g1] = realized[obs_1, g1, g_inter] + + realized[obs_1, g2, g_inter] += 1 + realized[obs_1, g_inter, g2] = realized[obs_1, g2, g_inter] + + # send from group 2 to group 1 + realized[obs_2, g2, g_inter] -= 1 + realized[obs_2, g_inter, g2] = realized[obs_2, g2, g_inter] + + realized[obs_2, g1, g_inter] += 1 + realized[obs_2, g_inter, g1] = realized[obs_2, g1, g_inter] +end + +function _fast_div!(theta, realized, counts) + for j in axes(theta, 3) + for i in axes(theta, 2) + for m in axes(theta, 1) + theta[m, i, j] = realized[m, i, j] / counts[i, j] + end + end + end +end diff --git a/src/old_messy/assignments/SparseAssignment/struct.jl b/src/old_messy/assignments/SparseAssignment/struct.jl new file mode 100644 index 0000000..8251c25 --- /dev/null +++ b/src/old_messy/assignments/SparseAssignment/struct.jl @@ -0,0 +1,115 @@ +mutable 
struct SparseData{F, C} + counts::Matrix{Int} + realized::Array{Int, 3} + estimated_theta::Array{F, 3} + A::SparseMatrixCSC{C, Int} + scratch_count::Matrix{Int} + scratch_missing::Vector{Int} + log_likelihood::F +end + +const SparseAssignment{T, F, C} = Assignment{ + T, SparseData{F, C}} +const SparseInitRule{S, F} = InitRule{S, Val{SparseData}} + +function SparseAssignment( + g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where { + G, D} + A = issparse(g.graph) ? g.graph : sparse(g.graph) + num_levels = ncategories(g.dist_ref) + sparse_data = SparseData( + A, size(group_size, 1), num_levels, group_size, node_labels) + return Assignment(group_size, node_labels, sparse_data) +end + +function make_assignment(g, h, init_rule::SparseInitRule) + group_size, + node_labels = initialize_node_labels( + g, h, init_rule.starting_assignment_rule) + return SparseAssignment(g, group_size, node_labels) +end + +function SparseData(A::SparseMatrixCSC{T, Int}, k::Int, + level_count::Int, group_size, node_labels) where {T} + n = size(A, 1) + data = SparseData(zeros(Int, k, k), zeros(Int, level_count, k, k), + zeros(Float64, level_count, k, k), dropzeros(A), zeros( + Int, level_count, k), zeros( + Int, k), 0.0) + _count_possible_occurences!(data, group_size) + _count_occurences!(data, node_labels) + _fast_div!(data.estimated_theta, data.realized, data.counts) + data.log_likelihood = compute_log_likelihood_without_0( + data.estimated_theta, data.realized, data.counts) + return data +end + +function _count_possible_occurences!(data, group_size) + k = size(group_size, 1) + for j in 1:k + data.counts[j, j] = group_size[j] * (group_size[j] - 1) ÷ 2 + for i in (j + 1):k + data.counts[i, j] = group_size[i] * group_size[j] + data.counts[j, i] = group_size[i] * group_size[j] + end + end +end + +function _count_occurences!(data, node_labels) + m, n = size(data.A) + for k in 1:length(unique(node_labels)) + for l in k:length(unique(node_labels)) + node_group_k = findall(x 
-> x == k, node_labels) + node_group_l = findall(x -> x == l, node_labels) + if k != l + counts = StatsBase.countmap(data.A[i, j] for i in node_group_k + for j in node_group_l if i != j) + else + counts = StatsBase.countmap(data.A[i, j] for i in node_group_k + for j in node_group_l if i < j) + end + for m in 1:size(data.realized, 1) + data.realized[m, k, l] = get(counts, m, 0) + data.realized[m, l, k] = get(counts, m, 0) + end + total_witouth_missing = sum(values(counts)) - + get(counts, missing, 0) + data.counts[k, l] = total_witouth_missing + data.counts[l, k] = total_witouth_missing + end + end +end + +function compute_log_likelihood_without_0( + estimated_theta::Array{T, 3}, realized::Array{F, 3}, counts) where { + T, F} + loglik = zero(T) + number_groups = size(estimated_theta, 2) + number_decorations = size(estimated_theta, 1) + for j in 1:number_groups + for i in j:number_groups + total_decorations = counts[i, j] + loglik -= xlogx(total_decorations) + for m in 1:number_decorations + loglik += xlogx(realized[m, i, j]) + total_decorations -= realized[m, i, j] + end + loglik += xlogx(total_decorations) + end + end + return loglik +end + +function _n_decorations_with_0(a::SparseAssignment) + return size(a.additional_data.estimated_theta, 1) + 1 +end + +function _n_decorations_not_0(a::SparseAssignment) + return size(a.additional_data.estimated_theta, 1) +end + +function loglikelihood(assignment::SparseAssignment, g::Observations) + return assignment.additional_data.log_likelihood +end + +include("swap.jl") diff --git a/src/old_messy/assignments/SparseAssignment/swap.jl b/src/old_messy/assignments/SparseAssignment/swap.jl new file mode 100644 index 0000000..cdfda51 --- /dev/null +++ b/src/old_messy/assignments/SparseAssignment/swap.jl @@ -0,0 +1,134 @@ +mutable struct SparseSwap{F} <: Swap + index1::Int + index2::Int + realized::Array{Int, 3} + estimated_theta::Array{F, 3} + counts::Matrix{Int} + log_likelihood::F +end + +function make_swap(a::SparseAssignment, 
id) + return SparseSwap(id[1], id[2], copy(a.additional_data.realized), + copy(a.additional_data.estimated_theta), copy(a.additional_data.counts), + a.additional_data.log_likelihood) +end + +function copy_addtional!(a, b) + copy!(a.realized, b.realized) + copy!(a.estimated_theta, b.estimated_theta) + copy!(a.counts, b.counts) + a.log_likelihood = b.log_likelihood + return nothing +end + +function make_swap!( + swap::SparseSwap{F}, a::SparseAssignment{T, F}, + id) where {T, F} + swap.index1, swap.index2 = id + copy_addtional!(swap, a.additional_data) +end + +function revert_swap!( + a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} + swap_node_labels!(a, swap.index1, swap.index2) + copy_addtional!(a.additional_data, swap) + return nothing +end + +# this function fails in presence of missing values +function apply_swap!( + a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} + update_observed_and_labels!(a, swap) + update_ll!(a) +end + +function update_ll!(a::SparseAssignment) + a.additional_data.log_likelihood = compute_log_likelihood_without_0( + a.additional_data.estimated_theta, a.additional_data.realized, a.additional_data.counts) + return nothing +end + +function update_observed_and_labels!( + a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + + if g1 == g2 + return nothing + end + + rows = rowvals(a.additional_data.A) + vals = nonzeros(a.additional_data.A) + m, n = size(a.additional_data.A) + for j in [swap.index1, swap.index2] + a.additional_data.scratch_count .= 0 + a.additional_data.scratch_missing .= 0 + g_from = swap.index1 == j ? g1 : g2 + g_to = swap.index1 == j ? 
g2 : g1 + for i_index in nzrange(a.additional_data.A, j) + row = rows[i_index] + if row == swap.index1 || row == swap.index2 + continue + end + val = vals[i_index] + groupi = get_group_of_vertex(a, row) + if ismissing(val) + a.additional_data.scratch_missing[groupi] += 1 + else + a.additional_data.scratch_count[val, groupi] += 1 + end + end + _move_connection!( + a.additional_data.realized, g_from, g_to, a.additional_data.scratch_count) + + _update_counts!( + a.additional_data.counts, g_from, g_to, a.additional_data.scratch_missing) + end + + _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, + a.additional_data.counts) + + # swap of the labels should happen after the update of the realized and estimated_theta + # for the above loop to work correctly + swap_node_labels!(a, swap.index1, swap.index2) + return nothing +end + +function _update_counts!(counts, g_from, g_to, missing_update) + for i in axes(counts, 1) + counts[i, g_to] = counts[i, g_to] - missing_update[i] + counts[i, g_from] = counts[i, g_from] + missing_update[i] + counts[g_to, i] = counts[i, g_to] + counts[g_from, i] = counts[i, g_from] + end +end + +function fit(a::SparseAssignment, g::Observations) + dists = initialize_sbm( + a.group_size, ZeroInflatedCategorical(_n_decorations_not_0(a))) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + theta = a.additional_data.estimated_theta[:, group1, group2] + dists[group1, + group2] = ZeroInflatedCategorical(1 - sum(theta), theta) + end + end + return dists +end + +function fit(a::SparseAssignment, + g::Observations{G, <:DiscretizedDistribution}) where {G} + dists = initialize_sbm(a.group_size, + DiscretizedDistribution( + g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_not_0(a)))) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + theta = a.additional_data.estimated_theta[:, group1, group2] + p = clamp(1 - sum(theta), 0, 1) + dists[group1, + group2] = DiscretizedDistribution( + 
g.dist_ref.discretizer, ZeroInflatedCategorical(p, theta)) + end + end + return dists +end diff --git a/src/old_messy/assignments/SumAssignment/struct.jl b/src/old_messy/assignments/SumAssignment/struct.jl new file mode 100644 index 0000000..8451eba --- /dev/null +++ b/src/old_messy/assignments/SumAssignment/struct.jl @@ -0,0 +1,94 @@ + +# type F needs to be a vector field! + +mutable struct SumData{F, C} + λ::SparseMatrixCSC{F, Int} + θ::Dict{Tuple{Int, Int}, F} + A::SparseMatrixCSC{C, Int} + counts::Dict{Tuple{Int, Int}, Int} + log_likelihood_per_group::Dict{Tuple{Int, Int}, Float64} + log_likelihood::Float64 +end + +const SumAssignment{T, F, C} = Assignment{T, SumData{F, C}} +const SumInitRule{S} = InitRule{S, Val{SumData}} + + + +function make_assignment(g, h, init_rule::SumInitRule) + group_size, + node_labels = initialize_node_labels( + g, h, init_rule.starting_assignment_rule) + return SumAssignment(g, group_size, node_labels) +end + +function SumAssignment(g::Observations, group_size::GroupSize, node_labels) + A = issparse(g.graph) ? 
g.graph : sparse(g.graph) + λ = fit.(Ref(g.dist_ref), A) + return SumAssignment(A, λ, group_size, node_labels) +end + +function SumAssignment( + A::SparseMatrixCSC{C, Int}, + λ::SparseMatrixCSC{F, Int}, group_size::GroupSize, node_labels::Vector{Int}) where { + F, C} + k = size(group_size, 1) + θ = Dict{Tuple{Int, Int}, F}() + counts = Dict{Tuple{Int, Int}, Int}() + + rows = rowvals(λ) + vals = nonzeros(λ) + m, n = size(λ) + for u in 1:n + for i in nzrange(λ,u) + v = rows[i] + if u >= v + # break # check that this isn't a mistake trying to be fast + continue + end + key_groups = minmax(node_labels[u], node_labels[v]) + param = vals[i] + if haskey(θ, key_groups) + θ[key_groups] += param + else + θ[key_groups] = param + end + if haskey(counts, key_groups) + counts[key_groups] += 1 + else + counts[key_groups] = 1 + end + end + end + for i in 1:k + for j in i:k + θ[minmax(i, j)] /= counts[minmax(i, j)] + end + end + ll_sum = 0.0 + ll = Dict{Tuple{Int, Int}, Float64}() + for i in 1:k + for j in i:k + ll[(i, j)] = 0.0 + end + end + for u in 1:n + for i in nzrange(λ, u) + v = rows[i] + if u >= v + continue + end + key_groups = minmax(node_labels[u], node_labels[v]) + ll[minmax( + node_labels[u], node_labels[v])] += loglikelihood(θ[key_groups], A[u, v]) + end + end + ll_sum = sum(values(ll)) + return Assignment(group_size, node_labels, SumData(λ, θ, A, counts, ll, ll_sum)) +end + +function loglikelihood(assignment::SumAssignment, g::Observations) + return sum(values(assignment.additional_data.log_likelihood)) +end + +include("swap.jl") diff --git a/src/old_messy/assignments/SumAssignment/swap.jl b/src/old_messy/assignments/SumAssignment/swap.jl new file mode 100644 index 0000000..4bd46de --- /dev/null +++ b/src/old_messy/assignments/SumAssignment/swap.jl @@ -0,0 +1,119 @@ +mutable struct SumSwap{F} <: Swap + index1::Int + index2::Int + θ::Dict{Tuple{Int, Int}, F} + counts::Dict{Tuple{Int, Int}, Int} + log_likelihood_per_group::Dict{Tuple{Int, Int}, Float64} + 
log_likelihood::Float64 +end + +function make_swap(a::SumAssignment, id) + return SumSwap(id[1], id[2], deepcopy(a.additional_data.θ), + deepcopy(a.additional_data.counts), deepcopy(a.additional_data.log_likelihood_per_group), + a.additional_data.log_likelihood) +end + +function make_swap!(swap::SumSwap{F}, a::SumAssignment{T, F}, id) where {T, F} + swap.index1, swap.index2 = id + swap.θ = deepcopy(a.additional_data.θ) + swap.counts = deepcopy(a.additional_data.counts) + swap.log_likelihood_per_group = deepcopy(a.additional_data.log_likelihood_per_group) + swap.log_likelihood = a.additional_data.log_likelihood +end + +function revert_swap!( + a::SumAssignment{T, F}, swap::SumSwap{F}) where {T, F} + swap_node_labels!(a, swap.index1, swap.index2) + a.additional_data.θ = deepcopy(swap.θ) + a.additional_data.counts = deepcopy(swap.counts) + a.additional_data.log_likelihood_per_group = deepcopy(swap.log_likelihood_per_group) + a.additional_data.log_likelihood = swap.log_likelihood +end + +function apply_swap!( + a::SumAssignment{T, F}, swap::SumSwap{F}) where {T, F} + λ = a.additional_data.λ + rows = rowvals(λ) + vals = nonzeros(λ) + g1 = get_group_of_vertex(a, swap.index1) + g2 = get_group_of_vertex(a, swap.index2) + if g1 == g2 + return nothing + end + + for i in nzrange(λ, swap.index1) + v = rows[i] + key_old_groups = minmax(g1, a.node_labels[v]) + key_new_groups = minmax(g2, a.node_labels[v]) + c_og = a.additional_data.counts[key_old_groups] + c_ng = a.additional_data.counts[key_new_groups] + param = vals[i] + a.additional_data.θ[key_old_groups] = (a.additional_data.θ[key_old_groups]*c_og - + param)/(c_og - 1) + a.additional_data.θ[key_new_groups] = (a.additional_data.θ[key_new_groups]*c_ng + + param)/(c_ng + 1) + a.additional_data.counts[key_old_groups] -= 1 + a.additional_data.counts[key_new_groups] += 1 + end + + for i in nzrange(λ, swap.index2) + v = rows[i] + key_old_groups = minmax(g2, a.node_labels[v]) + key_new_groups = minmax(g1, a.node_labels[v]) + c_og = 
a.additional_data.counts[key_old_groups] + c_ng = a.additional_data.counts[key_new_groups] + param = vals[i] + a.additional_data.θ[key_old_groups] = (a.additional_data.θ[key_old_groups]*c_og - + param)/(c_og - 1) + a.additional_data.θ[key_new_groups] = (a.additional_data.θ[key_new_groups]*c_ng + + param)/(c_ng + 1) + a.additional_data.counts[key_old_groups] -= 1 + a.additional_data.counts[key_new_groups] += 1 + end + + swap_node_labels!(a, swap.index1, swap.index2) + fast_update_ll!(a, swap) +end + +function fast_update_ll!(a::SumAssignment, swap::SumSwap) + k = size(a.group_size, 1) + for i in 1:k + for j in i:k + index_group = (i, j) + if swap.θ[index_group] != a.additional_data.θ[index_group] + _update_ll_one_group!(a, index_group) + end + end + end + a.additional_data.log_likelihood = sum(values(a.additional_data.log_likelihood_per_group)) +end + +function _update_ll_one_group!(a::SumAssignment, group) + nodes_1 = findall(x -> x == group[1], a.node_labels) + nodes_2 = findall(x -> x == group[2], a.node_labels) + ll = 0.0 + rows = rowvals(a.additional_data.λ) + for u in nodes_1 + for v in intersect(rows[nzrange(a.additional_data.λ, u)], nodes_2) + ll += loglikelihood( + a.additional_data.θ[group], a.additional_data.A[u, v]) + end + end + a.additional_data.log_likelihood_per_group[group] = ll + return nothing +end + +function fit( + a::SumAssignment{T, F, C}, g::Observations{ + G, <:DiscreteMarkovChain}) where { + T, F, C, G} + dists = initialize_sbm( + a.group_size, g.dist_ref) + for group1 in 1:number_groups(a) + for group2 in 1:number_groups(a) + dists[ + group1, group2] = a.additional_data.θ[minmax(group1, group2)] + end + end + return dists +end diff --git a/src/old_messy/assignments/group_numbering.jl b/src/old_messy/assignments/group_numbering.jl new file mode 100644 index 0000000..879e909 --- /dev/null +++ b/src/old_messy/assignments/group_numbering.jl @@ -0,0 +1,86 @@ +""" +Array-like storage for the number of nodes in each group. 
Try to split the number of nodes +into equal groups, but if it is not possible, the last group may have more nodes. +""" +struct GroupSize{T} <: AbstractVector{Int} + group_number::T + number_groups::Int + + function GroupSize(number_nodes, h::Real) + @assert 0 < h < 1 + standard_group = floor(Int, number_nodes * h) + GroupSize(number_nodes, standard_group) + end + + function GroupSize(number_nodes, standard_group::Integer) + @assert 1 < standard_group <= number_nodes + number_groups = number_nodes ÷ standard_group # number of standard groups! + if number_groups * standard_group == number_nodes + new{Int}(standard_group, number_groups) + else + remainder_group = standard_group + + mod(number_nodes, standard_group) + new{Tuple{Int, Int}}( + (standard_group, remainder_group), number_groups) + end + end +end + +Base.size(g::GroupSize) = (g.number_groups,) +Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) + @boundscheck checkbounds(g, i) + return g.group_number +end + +Base.@propagate_inbounds function Base.getindex( + g::GroupSize{Tuple{Int, Int}}, i::Int) + @boundscheck checkbounds(g, i) + return i < length(g) ? 
g.group_number[1] : g.group_number[2] +end + +function check_compatiblity!(node_labels, g::GroupSize) + counts = StatsBase.countmap(node_labels) + + if length(counts) != g.number_groups + throw(ArgumentError("The vector of node labels is not compatible with the \ + group size: number of group in labels $(length(counts)) != expected number $(g.number_groups)")) + end + if size(node_labels, 1) != sum(g) + throw(ArgumentError("The vector of node labels is not compatible with the \ + group size: number of node labels $(size(node_labels, 1)) != expected number of nodes $(sum(g))")) + end + unbalanced = any(((k, v),) -> v != g[k], counts) + if unbalanced + @debug "The group size is unbalanced, trying to fix it : $(counts)" + g, node_labels = try_fixing_group_size!(node_labels, g) + if any(((k, v),) -> v != g[k], StatsBase.countmap(node_labels)) + throw(ArgumentError("Could not fix the group size")) + else + @debug "Fixed the group size by moving nodes between groups" + end + end +end + +function try_fixing_group_size!(node_labels, g::GroupSize) + counts = StatsBase.countmap(node_labels) + groups_too_small = filter(((k, v),) -> v < g[k], counts) + groups_too_large = filter(((k, v),) -> v > g[k], counts) + amount_too_small = sum(g[k] - v for (k, v) in groups_too_small) + amount_too_large = sum(v - g[k] for (k, v) in groups_too_large) + if amount_too_small == amount_too_large + nodes_to_move = [] + for (l, v) in groups_too_large + number_nodes_to_move = v - g[l] + nodes_to_move = vcat(nodes_to_move, + findall(x -> x == l, node_labels)[1:number_nodes_to_move]) + end + for (k, v) in groups_too_small + number_nodes_to_move = g[k] - v + for i in 1:number_nodes_to_move + index = popfirst!(nodes_to_move) + node_labels[index] = k + end + end + end + return g, node_labels +end diff --git a/src/old_messy/assignments/include.jl b/src/old_messy/assignments/include.jl new file mode 100644 index 0000000..5829223 --- /dev/null +++ b/src/old_messy/assignments/include.jl @@ -0,0 +1,4 @@ 
+include("BernoulliAssignment/struct.jl") +include("CategoricalAssignment/struct.jl") +include("SparseAssignment/struct.jl") +include("SumAssignment/struct.jl") diff --git a/src/old_messy/bootstrap.jl b/src/old_messy/bootstrap.jl new file mode 100644 index 0000000..77086dd --- /dev/null +++ b/src/old_messy/bootstrap.jl @@ -0,0 +1,15 @@ +function bootstrap( + statistic::Function, data::AbstractMatrix, model::BlockModel, + sampling::BootstrapSampling) + t0 = tx(statistic(data)) + m = nrun(sampling) + t1 = zeros_tuple(t0, m) + data1 = copy(data) + for i in 1:m + draw_and_fill!(data1, model) + for (j, t) in enumerate(tx(statistic(data1))) + t1[j][i] = t + end + end + return ParametricBootstrapSample(t0, t1, statistic, data, model, sampling) +end diff --git a/src/old_messy/distributions/categorical_with_0.jl b/src/old_messy/distributions/categorical_with_0.jl new file mode 100644 index 0000000..045d45b --- /dev/null +++ b/src/old_messy/distributions/categorical_with_0.jl @@ -0,0 +1,141 @@ +""" + struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution + +A zero-inflated categorical distribution that combines a Bernoulli distribution with a categorical distribution. + +# Fields +- `edge_proba::B`: The Bernoulli distribution representing the probability of zero. +- `dist::D`: The categorical distribution. + +# Constructors +- `ZeroInflatedCategorical(p::Real, dist::D)`: Creates a zero-inflated categorical distribution with probability `p` of zero and categorical distribution `dist`. + +# Mathematical Explanation +The zero-inflated categorical distribution modifies the original categorical distribution by introducing a probability `p` of zero. The `pmf` and `cdf` are adjusted accordingly: +- `pdf(x) = p * δ(x) + (1 - p) * pmf_original(x)` +- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` +where `δ(x)` is the Dirac delta function. 
+""" +struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution + edge_proba::B + dist::D +end + +_dirac_delta(x) = x == 0 ? one(x) : zero(x) +_dirac_delta(x, lb, ub) = lb <= x <= ub ? one(x) : zero(x) + +function ZeroInflatedCategorical(p::Real, dist::D) where {D} + if p < 0 + p = zero(p) + elseif p > 1 + p = one(p) + end + return ZeroInflatedCategorical(Bernoulli(1 - p), dist) +end + +function ZeroInflatedCategorical(p::Real, probs::AbstractVector) + if sum(probs) == 0 + probs_ = ones(length(probs)) / length(probs) + else + probs_ = probs / sum(probs) + end + if p < 0 + p = zero(p) + elseif p > 1 + p = one(p) + end + return ZeroInflatedCategorical(p, Categorical(probs_)) +end + +function ZeroInflatedCategorical(vec_probs::AbstractVector) + ZeroInflatedCategorical(vec_probs[1], vec_probs[2:end]) +end + +function ZeroInflatedCategorical(k::Int) + ZeroInflatedCategorical(ones(k + 1) ./ (k + 1)) +end + +""" + Distributions.pdf(d::ZeroInflatedCategorical, x::Real) + +Computes the probability mass function (pmf) of the zero-inflated categorical distribution `d` at `x`. + +# Mathematical Explanation +The `pmf` of the zero-inflated categorical distribution is given by: +- `pmf(x) = p * δ(x) + (1 - p) * pmf_original(x)` +where `p` is the probability of zero, `δ(x)` is the Dirac delta function, and `pmf_original(x)` is the pmf of the original categorical distribution. +""" +function Distributions.pdf(d::ZeroInflatedCategorical, x::Real) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + + pdf(d.edge_proba, one(x)) * pdf(d.dist, x) +end + +""" + rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) + +Generates a random sample from the zero-inflated categorical distribution `d` using the random number generator `rng`. 
+""" +function rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) + return rand(rng, d.edge_proba) * rand(rng, d.dist) +end + +logpdf(d::ZeroInflatedCategorical, x::Real) = log(pdf(d, x)) + +minimum(d::ZeroInflatedCategorical) = min(minimum(d.dist), 0) + +maximum(d::ZeroInflatedCategorical) = max(maximum(d.dist), 0) + +insupport(d::ZeroInflatedCategorical, x::Real) = x == 0 || insupport(d.dist, x) + +""" + Distributions.cdf(d::ZeroInflatedCategorical, x::Real) + +Computes the cumulative distribution function (cdf) of the zero-inflated categorical distribution `d` at `x`. + +# Mathematical Explanation +The `cdf` of the zero-inflated categorical distribution is given by: +- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` +where `p` is the probability of zero, `δ(x)` is the Dirac delta function, and `cdf_original(x)` is the cdf of the original categorical distribution. +""" +function Distributions.cdf(d::ZeroInflatedCategorical, x::Real) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, 0, Inf) + + pdf(d.edge_proba, one(x)) * cdf(d.dist, x) +end + +function Distributions.params(d::ZeroInflatedCategorical) + (first(params(d.edge_proba)), params(d.dist)...) +end + +ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) + +""" + Distributions.fit(::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) + +Fits a zero-inflated categorical distribution to the given data. +""" +function Distributions.fit( + ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where { + B, D <: Categorical} + indices_0 = findall(x -> x == 0, data) + p = length(indices_0) / length(data) + if p != 1 + dist = fit_mle(Categorical, n_cat, data[setdiff(1:end, indices_0)]) + return ZeroInflatedCategorical(p, dist) + else + return ZeroInflatedCategorical(1.0, zeros(n_cat)) + end +end + +function get_params_cat_like(dist::ZeroInflatedCategorical) + p = first(params(dist.edge_proba)) + probs = vcat(params(dist.dist)...) 
+ return vcat(1 - p, probs .* p) +end + +function Base.convert(::Type{<:ZeroInflatedCategorical}, d::D) where {D} + return ZeroInflatedCategorical(1.0, d) +end + +function Base.convert(T::Type{<:Categorical}, d::ZeroInflatedCategorical) + return T(get_params_cat_like(d)) +end diff --git a/src/old_messy/distributions/discrete_dist.jl b/src/old_messy/distributions/discrete_dist.jl new file mode 100644 index 0000000..55af36b --- /dev/null +++ b/src/old_messy/distributions/discrete_dist.jl @@ -0,0 +1,142 @@ +""" + struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution + +A discretized distribution that combines a discretizer with a zero-inflated categorical distribution. + +# Fields +- `discretizer::D`: The discretizer used to discretize the continuous distribution. +- `probs::L`: The zero-inflated categorical distribution representing the discretized probabilities. + +# Constructors +- `DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d))`: Creates a discretized distribution with `n_bins` bins and support bound `support_bound`. + +# Mathematical Explanation +The discretized distribution modifies the original continuous distribution by dividing it into `n_bins` bins. The `pdf` and `cdf` are adjusted accordingly: +- `pdf(x) = pdf_discretized(bin) / bin_width` +- `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` +""" +mutable struct DiscretizedDistribution{D, L} <: + ContinuousUnivariateDistribution where {D, L} + discretizer::D + probs::L +end + +function DiscretizedDistribution( + d::D, n_bins::Int, support_bound = extrema(d)) where {D} + disc = DiscretizerZeroToZero(n_bins, support_bound...) 
+ ps = zeros(non_zero_labels_counts(disc)) + for i in 1:non_zero_labels_counts(disc) + lb, ub = decode(disc, i) + ps[i] = cdf(d, ub) - cdf(d, lb) + end + probs = ZeroInflatedCategorical(0.0, ps) + return DiscretizedDistribution(disc, probs) +end + +function DiscretizedDistribution( + d::ZeroInflated, n_bins::Int, support_bound = extrema(d)) + disc = DiscretizerZeroToZero(n_bins, support_bound...) + ps = zeros(non_zero_labels_counts(disc)) + for i in 1:non_zero_labels_counts(disc) + lb, ub = decode(disc, i) + ps[i] = cdf(d, ub) - cdf(d, lb) + end + probs = ZeroInflatedCategorical(get_proba_zero(d), ps) + return DiscretizedDistribution(disc, probs) +end + +function DiscretizedDistribution(discretizer::Discretizer) + return DiscretizedDistribution( + discretizer, ZeroInflatedCategorical(non_zero_labels_counts(discretizer))) +end + +""" + rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) + +Generates a random sample from the discretized distribution `d` using the random number generator `rng`. +""" +function rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) + bin = rand(rng, d.probs) + return _decode_randomly(rng, d.discretizer, bin) +end + +minimum(d::DiscretizedDistribution) = minimum(d.discretizer) + +maximum(d::DiscretizedDistribution) = maximum(d.discretizer) + +function insupport(d::DiscretizedDistribution, x::Real) + support_encoding(d.discretizer, x) +end + +function Base.convert(::Type{DiscretizedDistribution}, d::D) where {D} + return DiscretizedDistribution(d, 10) +end + +ncategories(d::DiscretizedDistribution) = ncategories(d.probs) + +function Distributions.fit( + ::Type{<:DiscretizedDistribution{D, L}}, data) where {D, L} + return fit(L, data) +end + +function set_params!(d::DiscretizedDistribution{D, L}, params) where {D, L} + d.probs = L(params...) +end + +""" + Distributions.pdf(d::DiscretizedDistribution, x::Real) + +Computes the probability density function (pdf) of the discretized distribution `d` at `x`. 
+ +# Mathematical Explanation +The `pdf` of the discretized distribution is computed as: +- `pdf(x) = pdf_discretized(bin) / bin_width` +""" +function pdf(d::DiscretizedDistribution, x::Real) + if x == 0 + return pdf(d.probs, zero(x)) + end + if !support_encoding(d.discretizer, x) + return zero(x) + end + bin = encode(d.discretizer, x) + return pdf(d.probs, bin) / binwidth(d.discretizer) +end + +""" + Distributions.logpdf(d::DiscretizedDistribution, x::Real) + +Computes the log of the probability density function (logpdf) of the discretized distribution `d` at `x`. +""" +function logpdf(d::DiscretizedDistribution, x::Real) + if !support_encoding(d.discretizer, x) + return -Inf + end + x == 0 && return log(pdf(d.probs, x)) + bin = encode(d.discretizer, x) + return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) +end + +""" + Distributions.cdf(d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} + +Computes the cumulative distribution function (cdf) of the discretized distribution `d` at `x`. 
+ +# Mathematical Explanation +The `cdf` of the discretized distribution is computed as: +- `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` +""" +function Distributions.cdf( + d::DiscretizedDistribution{D, P}, x::Real) where { + D, P <: ZeroInflatedCategorical} + x < minimum(d) && return zero(x) + x > maximum(d) && return one(x) + bin = encode(d.discretizer, x) + result = (x == 0) * cdf(d.probs, x) + if bin != 0 + result += cdf(d.probs, bin - 1) + + (cdf(d.probs, bin) - cdf(d.probs, bin - 1)) * + progress_in_bin(d.discretizer, x, bin) + end + return result +end diff --git a/src/old_messy/distributions/discretizer.jl b/src/old_messy/distributions/discretizer.jl new file mode 100644 index 0000000..e039930 --- /dev/null +++ b/src/old_messy/distributions/discretizer.jl @@ -0,0 +1,248 @@ +# Inspired by Discretizer.jl but with the fast decoding function and built-in +# convention for discretizing continuous distributions. +abstract type Discretizer end + +function encode(d::Discretizer, x::AbstractArray{<:Real}) + return [encode(d, u) for u in x] +end + +function decode(d::Discretizer, x::AbstractArray{<:Real}) + return [decode(d, u) for u in x] +end + +""" +Uniformly discretizes a continuous distribution into a fixed number of bins of equal width. 
+""" +struct RegularDiscretizer{F, T, L} <: Discretizer + n_bins::Int + lower_bound::F + upper_bound::F + bin_labels::MVector{L, T} + bin_width::F +end + +function RegularDiscretizer( + n_bins::Int, lower_bound::F, upper_bound::F) where {F} + if !isfinite(lower_bound) || !isfinite(upper_bound) + throw(ArgumentError("RegularDiscretizer requires finite lower and upper bounds.")) + end + bin_width = (upper_bound - lower_bound) / n_bins + return RegularDiscretizer( + n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), bin_width + ) +end + +function support_encoding(d::RegularDiscretizer, x::Real) + return d.lower_bound <= x <= d.upper_bound +end + +function minimum(d::RegularDiscretizer) + return d.lower_bound +end + +function maximum(d::RegularDiscretizer) + return d.upper_bound +end + +function encode(d::RegularDiscretizer, x::Real) + if x >= d.upper_bound + return d.n_bins + end + return d.bin_labels[convert(Int, div(x - d.lower_bound, d.bin_width) + 1)] +end + +function _decode_randomly( + rng::Random.AbstractRNG, d::RegularDiscretizer, bin::Int) + hi, lo = decode(d, bin) + return lo + (hi - lo) * rand(rng) +end + +function binwidth(d::RegularDiscretizer) + return d.bin_width +end + +function decode(d::RegularDiscretizer, bin::Int) + return (d.lower_bound + (bin - 1) * d.bin_width, + d.lower_bound + bin * d.bin_width) +end + +function encode(d::RegularDiscretizer, x::AbstractArray{Real}) + return [encode(d, u) for u in x] +end + +function decode(d::RegularDiscretizer, x::AbstractArray{Real}) + return [decode(d, u) for u in x] +end + +function nlabels(d::RegularDiscretizer) + return d.n_bins +end + +non_zero_labels_counts(d::RegularDiscretizer) = nlabels(d) + +""" +Maps a set of categories to a set of bins +""" +struct CategoryDiscretizer{F, T} + cat_to_bin::Dict{F, T} + bin_to_cat::Dict{T, F} + min_label::T + max_label::T +end + +function CategoryDiscretizer(cat_to_bin::Dict, bin_to_cat::Dict) + min_label = minimum(keys(bin_to_cat)) + max_label = 
maximum(keys(bin_to_cat)) + return CategoryDiscretizer(cat_to_bin, bin_to_cat, min_label, max_label) +end + +function support_encoding(d::CategoryDiscretizer, x) + return haskey(d.cat_to_bin, x) +end + +function encode(d::CategoryDiscretizer, x) + return d.cat_to_bin[x] +end + +function decode(d::CategoryDiscretizer, label) + return d.bin_to_cat[label] +end + +function nlabels(d::CategoryDiscretizer) + return length(d.bin_to_cat) +end + +function binwidth(d::CategoryDiscretizer{F, T}, x::T) where {F, T} + return length(d.bin_to_cat[x]) +end + +function non_zero_labels_counts(d::CategoryDiscretizer) + if 0 ∈ keys(d.bin_to_cat) + return length(d.bin_to_cat) - 1 + else + return length(d.bin_to_cat) + end +end + +function minimum(d::CategoryDiscretizer) + return d.min_label +end + +function maximum(d::CategoryDiscretizer) + return d.max_label +end + +""" +Uniformly discretizes a continuous distribution into a fixed number of bins of equal width, +with additional bins for missing or special values. 
+""" +struct HybridDiscretizer{F, T, L} <: Discretizer + lin::RegularDiscretizer{F, T, L} + cat::CategoryDiscretizer{F, T} +end + +# change so that atoms can be packed together if wanted +function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) + cat_to_bin = Dict(a => n_bins + i for (i, a) in enumerate(atoms)) + bin_to_cat = Dict(n_bins + i => a for (i, a) in enumerate(atoms)) + bin_width = (upper_bound - lower_bound) / n_bins + return HybridDiscretizer( + RegularDiscretizer{typeof(bin_width), Int, n_bins}( + n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), + (upper_bound - lower_bound) / n_bins), + CategoryDiscretizer(cat_to_bin, bin_to_cat) + ) +end + +function DiscretizerZeroToZero(n_bins, lower_bound, upper_bound) + cat_to_bin = Dict([0.0 => 0]) + bin_to_cat = Dict([0 => 0.0]) + bin_width = (upper_bound - lower_bound) / n_bins + return HybridDiscretizer( + RegularDiscretizer{typeof(bin_width), Int, n_bins}( + n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), + (upper_bound - lower_bound) / n_bins), + CategoryDiscretizer(cat_to_bin, bin_to_cat) + ) +end + +function support_encoding(d::HybridDiscretizer, x) + return support_encoding(d.lin, x) || support_encoding(d.cat, x) +end + +function minimum(d::HybridDiscretizer) + return min(minimum(d.lin), minimum(d.cat)) +end + +function maximum(d::HybridDiscretizer) + return max(maximum(d.lin), maximum(d.cat)) +end + +function nlabels(d::HybridDiscretizer) + return nlabels(d.lin) + nlabels(d.cat) +end + +function non_zero_labels_counts(d::HybridDiscretizer) + return non_zero_labels_counts(d.lin) + non_zero_labels_counts(d.cat) +end + +binwidth(d::HybridDiscretizer) = binwidth(d.lin) + +function binwidth(d::HybridDiscretizer, bin) + if haskey(d.cat.cat_to_bin, bin) + return binwidth(d.cat, bin) + else + return binwidth(d.lin) + end +end + +function encode(d::HybridDiscretizer, x::Real) + if haskey(d.cat.cat_to_bin, x) + return encode(d.cat, x) + else + return encode(d.lin, x) + end +end 
+ +function decode(d::HybridDiscretizer, bin::Int) + if haskey(d.cat.bin_to_cat, bin) + return decode(d.cat, bin) + else + return decode(d.lin, bin) + end +end + +function _decode_randomly( + rng::Random.AbstractRNG, d::HybridDiscretizer, bin::Int) + if haskey(d.cat.bin_to_cat, bin) + return decode(d.cat, bin) + else + return _decode_randomly(rng, d.lin, bin) + end +end + +function auto_nbins(data) + binwidth = 2iqr(data) / cbrt(n) + lo, hi = extrema(data) + nbins_fd = ceil(Int, (hi - lo) / binwidth) + nbins_sturges = ceil(Int, log(2, n)) + 1 + nbins = max(nbins_fd, nbins_sturges) + return nbins +end + +function progress_in_bin(d::CategoryDiscretizer, x::Real, bin) + return one(x) +end + +function progress_in_bin(d::RegularDiscretizer, x::Real, bin) + lo, hi = decode(d, bin) + return (x - lo) / (hi - lo) +end + +function progress_in_bin(d::HybridDiscretizer, x::Real, bin) + if haskey(d.cat.bin_to_cat, bin) + return progress_in_bin(d.cat, x, bin) + else + return progress_in_bin(d.lin, x, bin) + end +end diff --git a/src/old_messy/distributions/include.jl b/src/old_messy/distributions/include.jl new file mode 100644 index 0000000..cc5c557 --- /dev/null +++ b/src/old_messy/distributions/include.jl @@ -0,0 +1,5 @@ +include("categorical_with_0.jl") +include("discretizer.jl") +include("zero_inflated.jl") +include("discrete_dist.jl") +include("markov_chain.jl") diff --git a/src/old_messy/distributions/markov_chain.jl b/src/old_messy/distributions/markov_chain.jl new file mode 100644 index 0000000..28a21b8 --- /dev/null +++ b/src/old_messy/distributions/markov_chain.jl @@ -0,0 +1,144 @@ +# if S is Int, assume the states are ordered and sequential +# should store everything in transpose, will be faster but way more +# complicated to read +struct DiscreteMarkovChain{S, M <: AbstractMatrix} + states::Vector{S} + transitions::M +end + +struct SampleChain{S, M <: AbstractMatrix} + states::Vector{S} + indices::Vector{Int} + transitions::M +end + +Base.zero(::DiscreteMarkovChain) 
= DiscreteMarkovChain(Int[], zeros(Int, 0, 0)) +Base.zero(::SampleChain{S}) where {S} = SampleChain(S[], Int[], zeros(Int, 1, 1)) + +function state_index(mc::DiscreteMarkovChain{S}, state::S) where {S} + findfirst(isequal(state), mc.states) +end + +state_space(mc::DiscreteMarkovChain) = mc.states +transition_matrix(mc::DiscreteMarkovChain) = mc.transitions + +function stationary_dist(mc::DiscreteMarkovChain) + T = transition_matrix(mc) + F = eigen(T') + tol = 1e-8 + idx = findfirst(abs.(F.values .- 1) .< tol) + if idx === nothing + error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") + end + # Extract the corresponding eigenvector and normalize it to sum to 1. + pi = real(F.vectors[:, idx]) + return pi ./ sum(pi) +end + +function stationary_dist(mc::DiscreteMarkovChain{S, <:SparseMatrixCSC}) where {S} + T = transition_matrix(mc) + vals, vecs, _ = eigsolve(T') + tol = 1e-8 + idx = findfirst(abs.(vals .- 1) .< tol) + if idx === nothing + error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") + end + # Extract the corresponding eigenvector and normalize it to sum to 1. 
+ pi = Real.(vecs[idx]) + result = pi ./ sum(pi) + return result +end + +function sample_indices(mc::DiscreteMarkovChain, t::Int) + indices = Vector{Int}(undef, t) + indices[1] = rand(Categorical(stationary_dist(mc))) + tr_transposed = transpose(mc.transitions) + for i in 2:t + indices[i] = rand(Categorical(tr_transposed[:, indices[i - 1]])) + end + return indices +end + +function sample(mc::DiscreteMarkovChain, t::Int) + indices = sample_indices(mc, t) + states = mc.states[indices] + counts = zeros(Int, length(mc.states), length(mc.states)) + for i in 1:(length(indices) - 1) + counts[indices[i], indices[i + 1]] += 1 + end + return SampleChain(states, indices, counts) +end + +function sample(mc::DiscreteMarkovChain{S, <:SparseMatrixCSC}, t::Int) where {S} + indices = sample_indices(mc, t) + states = mc.states[indices] + counts = zeros(Int, length(mc.states), length(mc.states)) + for i in 1:(length(indices) - 1) + counts[indices[i], indices[i + 1]] += 1 + end + return SampleChain(states, indices, sparse(counts)) +end + +## yes I know this is awful and does not return a proper chain, but... 
+function Base.:+(a::DiscreteMarkovChain, b::DiscreteMarkovChain) + return DiscreteMarkovChain( + a.states, + a.transitions .+ b.transitions) +end + +function Base.:-(a::DiscreteMarkovChain, b::DiscreteMarkovChain) + return DiscreteMarkovChain( + a.states, + a.transitions .- b.transitions) +end + +function Base.:*(a::DiscreteMarkovChain, c::Real) + return DiscreteMarkovChain( + a.states, + a.transitions .* c) +end + +Base.:*(c::Real, a::DiscreteMarkovChain) = a * c + +function Base.:/(a::DiscreteMarkovChain, c::Real) + return DiscreteMarkovChain( + a.states, + a.transitions ./ c) +end + +function loglikelihood(mc::DiscreteMarkovChain{S, M}, chain::Vector{Int}) where {S, M} + Tr = transition_matrix(mc) + probas = Vector{Float64}(undef, length(chain)) + probas[1] = stationary_dist(mc)[chain[1]] + for i in 1:(length(chain) - 1) + probas[i + 1] = Tr[chain[i], chain[i + 1]] + end + return sum(log, probas) +end + +function loglikelihood( + mc::DiscreteMarkovChain{S, M1}, chain::Vector{S}) where {S, M1} + return loglikelihood(mc, state_index.(Ref(mc), chain)) +end + +#without the first state, huge computational speedup +function loglikelihood( + mc::DiscreteMarkovChain{S, M1}, chain::SampleChain{S, M2}) where {S, M1, M2} + return sum(map(xlogy, chain.transitions, mc.transitions)) #+log(stationary_dist(mc)[chain.indices[1]]) +end + + + +# user responsability to have the same states... +function fit( + mc::DiscreteMarkovChain{S, M1}, chain::SampleChain{S, M2}) where {S, M1, M2} + return DiscreteMarkovChain( + mc.states, make_row_stochastic(chain.transitions)) +end + + + +function make_row_stochastic(A::M) where {M <: AbstractMatrix} + f(row) = sum(row) == 0 ? 
ones(length(row)) / length(row) : row ./ sum(row) + return mapslices(f, A, dims = 2) +end diff --git a/src/old_messy/distributions/utils.jl b/src/old_messy/distributions/utils.jl new file mode 100644 index 0000000..d30daf7 --- /dev/null +++ b/src/old_messy/distributions/utils.jl @@ -0,0 +1,19 @@ +const logtwo = log(2.0) + +sumlog(x::AbstractArray{<:Real}) = sum(log,x) + +function sumlog(x::AbstractArray{<:AbstractFloat}) + sig = one(T) + ex = zero(exponent(one(T))) + bound = floatmax(T) / 2 + for xj in x + sig *= significand(xj) + ex += exponent(xj) + if sig > bound + (a, b) = (significand(sig), exponent(sig)) + sig = a + ex += b + end + end + log(sig) + logtwo * ex +end diff --git a/src/old_messy/distributions/zero_inflated.jl b/src/old_messy/distributions/zero_inflated.jl new file mode 100644 index 0000000..9cc4720 --- /dev/null +++ b/src/old_messy/distributions/zero_inflated.jl @@ -0,0 +1,93 @@ +""" + struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution + +A zero-inflated distribution that combines a Bernoulli distribution with a continuous distribution. + +# Fields +- `edge_proba::B`: The Bernoulli distribution representing the probability of zero. +- `dist::D`: The continuous distribution. + +# Constructors +- `ZeroInflated(p::Real, dist::D)`: Creates a zero-inflated distribution with probability `p` of zero and continuous distribution `dist`. + +# Mathematical Explanation +The zero-inflated distribution modifies the original distribution by introducing a probability `p` of zero. The `pdf` and `cdf` are adjusted accordingly: +- `pdf(x) = p * δ(x) + (1 - p) * pdf_original(x)` +- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` +where `δ(x)` is the Dirac delta function. 
+""" +struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution + edge_proba::B + dist::D +end + +function ZeroInflated(p::Real, dist::D) where {D} + return ZeroInflated(Bernoulli(1 - p), dist) +end + +""" + Distributions.pdf(d::ZeroInflated, x::Real) + +Computes the probability density function (pdf) of the zero-inflated distribution `d` at `x`. +""" +function Distributions.pdf(d::ZeroInflated, x::Real) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + + pdf(d.edge_proba, one(x)) * pdf(d.dist, x) +end + +""" + get_proba_zero(d::ZeroInflated) + +Returns the probability of zero for the zero-inflated distribution `d`. +""" +function get_proba_zero(d::ZeroInflated) + return pdf(d.edge_proba, 0) +end + +""" + rand(rng::Random.AbstractRNG, d::ZeroInflated) + +Generates a random sample from the zero-inflated distribution `d` using the random number generator `rng`. +""" +function rand(rng::Random.AbstractRNG, d::ZeroInflated) + return rand(rng, d.edge_proba) * rand(rng, d.dist) +end + +logpdf(d::ZeroInflated, x::Real) = log(pdf(d, x)) + +minimum(d::ZeroInflated) = min(minimum(d.dist), 0) + +maximum(d::ZeroInflated) = max(maximum(d.dist), 0) + +insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) + +""" + Distributions.cdf(d::ZeroInflated, x::Real) + +Computes the cumulative distribution function (cdf) of the zero-inflated distribution `d` at `x`. +""" +function Distributions.cdf(d::ZeroInflated, x::Real) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, zero(x), Inf) + + cdf(d.dist, x) * pdf(d.edge_proba, one(x)) +end + +function Distributions.params(d::ZeroInflated) + (first(params(d.edge_proba)), params(d.dist)...) +end + +""" + Distributions.fit(::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) + +Fits a zero-inflated distribution to the given data. 
+""" +function Distributions.fit( + ::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) where {B, D} + indices_0 = findall(x -> x == 0, data) + p = length(indices_0) / length(data) + if p != 1 + return ZeroInflated( + p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) + else + return ZeroInflated(1.0, D()) + end +end diff --git a/src/old_messy/observations.jl b/src/old_messy/observations.jl new file mode 100644 index 0000000..120642c --- /dev/null +++ b/src/old_messy/observations.jl @@ -0,0 +1,231 @@ +""" + Observations{G, D} + +A struct to hold observations for a network. The type parameter `G` represents the network + structure and must support indexing and the `size` function. + +# Fields +- `graph::G`: The network structure (e.g. adjacency matrix). +- `dist_ref::D`: distribution of the observations (used for getting support, type of elements, etc.) +""" +struct Observations{G, D} + graph::G + dist_ref::D +end + +""" + number_nodes(graph::Observations) + +Get the number of nodes in the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `num_nodes`: The number of nodes. +""" +function number_nodes(graph::Observations) + return size(graph.graph, 1) +end + +""" + get_obs(graph::Observations, x::Tuple) + +Get the observation for the given tuple of nodes. + +# Arguments +- `graph::Observations`: The graph observations. +- `x::Tuple`: The tuple of nodes. + +# Returns +- `obs`: The observation. +""" +function get_obs(graph::Observations, x::Tuple) + return get_obs(graph, x[1], x[2]) +end + +""" + get_obs(graph::Observations, i::Int, j::Int) + +Get the observation for the given pair of nodes. + +# Arguments +- `graph::Observations`: The graph observations. +- `i::Int`: The first node. +- `j::Int`: The second node. + +# Returns +- `obs`: The observation. +""" +function get_obs(graph::Observations, i::Int, j::Int) + return graph.graph[i, j] +end + +""" + density(graph::Observations) + +Get the density of the graph. 
+ +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `density`: The density of the graph. +""" +function density(graph::Observations) + return sum(graph.graph) / + ((size(graph.graph, 1) * (size(graph.graph, 1) - 1))) +end + +""" + get_degree(graph::Observations) + +Get the degree of each node in the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `degrees`: The degrees of the nodes. +""" +function get_degree(graph::Observations) + return sum(graph.graph, dims = 2) +end + +""" + get_adj(graph::Observations) + +Get the adjacency matrix of the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `adj_matrix`: The adjacency matrix. +""" +function get_adj(graph::Observations) + return graph.graph +end + +function normalized_laplacian(graph::Observations) + return normalized_laplacian(graph.graph) +end + +function normalized_laplacian(g::AbstractGraph) + return normalized_laplacian(Graphs.adjacency_matrix(g)) +end + +normalized_laplacian(g::CategoricalArray) = normalized_laplacian(levelcode.(g)) + +""" + normalized_laplacian(graph::Observations) + +Get the normalized Laplacian of the graph. + +# Arguments +- `graph::Observations`: The graph observations. + +# Returns +- `L`: The normalized Laplacian matrix. 
+""" +function normalized_laplacian(graph::AbstractMatrix) + degrees = sum(graph, dims = 1) + degrees .-= minimum(degrees) + n = size(graph, 1) + L = similar(graph, Float64) + for j in 1:n + for i in 1:n + if i == j + L[i, j] = 1 + elseif degrees[i] == 0 || degrees[j] == 0 + L[i, j] = 0 + elseif graph[i, j] != 0 + L[i, j] = -1 / sqrt(degrees[i] * degrees[j]) + end + end + end + return L +end + +function Metis.graph(graph::Observations{ + G, <:UnivariateDistribution}) where {G} + use_weights = true + if minimum(graph.dist_ref) < 0 + @warn "Negative values are not allowed for MetisStart, using binary graph" + use_weights = false + end + return Metis.graph(sparse(graph.graph), weights = use_weights) +end + +""" + discretise(graph::Observations; number_groups, number_levels) + +Discretise the graph observations. + +# Arguments +- `graph::Observations`: The graph observations. +- `number_groups`: Number of groups for discretisation. +- `number_levels`: Number of levels for discretisation. + +# Returns +- `discretised_graph`: The discretised graph observations. +- `discretiser`: The discretiser used. + +Assume that the diagonal is zero. +0 indicates no edge, while missing indicates no information about the edge. +By default maps 0 to 0. If you want another behaviour use the function where you +pass a `Discretizer` object. + +number_levels will be the number of levels in the discretized distribution (excluding 0). 
+""" +function discretise( + graph::Observations; number_groups = nothing, number_levels = nothing) + if isnothing(number_groups) && isnothing(number_levels) + throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) + end + if isnothing(number_levels) + number_levels = round(Int, + get_num_levels_from_groups(number_nodes(graph), number_groups)) + else + if !isnothing(number_groups) + @warn "disregarding `number_groups` as `number_levels` is provided" + end + end + return discretise( + graph, DiscretizerZeroToZero(number_levels, extrema(graph.graph)...)) +end + +""" + discretise(graph::Observations, discretiser::Discretizer) + +Discretise the graph observations using the given discretiser. + +# Arguments +- `graph::Observations`: The graph observations. +- `discretiser::Discretizer`: The discretiser to use. + +# Returns +- `discretised_graph`: The discretised graph observations. +- `discretiser`: The discretiser used. +""" +function discretise(graph::Observations, discretiser::Discretizer) + A_encoded = encode(discretiser, graph.graph) + return Observations(A_encoded, DiscretizedDistribution(discretiser)), + discretiser +end + +""" + get_num_levels_from_groups(n, number_groups) + +Get the number of levels for the discretized distribution given n and k. + +# Arguments +- `n`: The number of nodes. +- `number_groups`: The number of groups. + +# Returns +- `num_levels`: The number of levels. 
+""" +function get_num_levels_from_groups(n, number_groups) + return max(1, n^(0.5 * (1 - log(number_groups) / log(n)))) +end diff --git a/src/old_messy/optimisation/config_rules/InitRule.jl b/src/old_messy/optimisation/config_rules/InitRule.jl new file mode 100644 index 0000000..c60e144 --- /dev/null +++ b/src/old_messy/optimisation/config_rules/InitRule.jl @@ -0,0 +1,99 @@ +abstract type StartingAssignment end +struct OrderedStart <: StartingAssignment end +struct RandomStart <: StartingAssignment end +struct SpectralStart <: StartingAssignment end +struct MetisStart <: StartingAssignment end +struct BiasAdjustedSoS <: StartingAssignment end + +struct FromAssignment{A} <: StartingAssignment + assignment::A +end +struct HigherOrderSpectralStart <: StartingAssignment + k::Int +end + +struct InitRule{S <: StartingAssignment, I} + starting_assignment_rule::S + assignment_rule::I +end + +function make_assignment(g, h, init_rule::InitRule{S, Nothing}) where {S} + return Assignment(initialize_node_labels( + g, h, init_rule.starting_assignment_rule)...) +end + +""" + initialize_node_labels(g, h, starting_assignment_rule::StartingAssignment) + +initialize node labels based on the `starting_assignment_rule`, and return a `GroupSize` +object and a vector of node labels. + +# Implemented rules +- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. +- `RandomStart()`: Randomly assign nodes to groups. +- `SpectralStart()`: Assign nodes to groups based on spectral clustering. +- `MetisStart()`: Assign nodes to groups based on Metis partitioning. +- `FromAssignment(a)`: Assign nodes to groups based on the given assignment `a`. 
+""" +initialize_node_labels + +function initialize_node_labels(g, h, ::OrderedStart) + group_size = GroupSize(number_nodes(g), h) + node_labels = StatsBase.inverse_rle(1:length(group_size), group_size) + return group_size, node_labels +end + +function initialize_node_labels(g, h, ::RandomStart) + group_size, node_labels = initialize_node_labels(g, h, OrderedStart()) + Random.shuffle!(node_labels) + return group_size, node_labels +end + +function initialize_node_labels(g, h, ::SpectralStart) + group_size = GroupSize(number_nodes(g), h) + node_labels = zeros(Int, number_nodes(g)) + + laplacian = normalized_laplacian(g) + decomp, = partialschur(laplacian, nev = 2, which = :LR) + + # get 2nd eigenvector, sort its components + indices = sortperm(real.(decomp.Q[:, 2])) + # bin them into groups of correct size + start = 1 + for (i, group) in enumerate(group_size) + stop = start + group - 1 + node_labels[indices[start:stop]] .= i + start = stop + 1 + end + return group_size, node_labels +end + +function initialize_node_labels(g, h, ::MetisStart) + group_size = GroupSize(number_nodes(g), h) + node_labels = convert.( + Int, Metis.partition(Metis.graph(g), length(group_size))) + check_compatiblity!(node_labels, group_size) + return group_size, node_labels +end + +function initialize_node_labels(g, h, rule::FromAssignment{A}) where {A} + group_size = GroupSize(number_nodes(g), h) + check_compatiblity!(rule.assignment.node_labels, group_size) + return group_size, rule.assignment.node_labels +end + +function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) + throw(ArgumentError("Not implemented yet, need to finish with Clustering.jl")) + # this will need to have the main optim changed -> no assumption that all blocks are + # the same size + group_size = GroupSize(number_nodes(g), h) + laplacian = normalized_laplacian(g) + results = IterativeSolvers.lobpcg(laplacian, true, rule.k) + return group_size, node_labels +end + +function initialize_node_labels(g, h, 
::BiasAdjustedSoS) + # implement method from Bias-adjusted spectral clustering in multilayer stochastic block + # models + +end diff --git a/src/old_messy/optimisation/config_rules/accept_rule.jl b/src/old_messy/optimisation/config_rules/accept_rule.jl new file mode 100644 index 0000000..46e7bd6 --- /dev/null +++ b/src/old_messy/optimisation/config_rules/accept_rule.jl @@ -0,0 +1,25 @@ +abstract type AcceptRule end +struct Strict <: AcceptRule end + +""" + accept_reject_update!(a::Assignment, swap::Swap, g, accept_rule::AcceptRule) + + +Perform the swap and accept it if it improves the likelihood of the assignment. `a` will +be updated in place if the swap is accepted. + +# Implemented rules +- `Strict()`: Accept the proposal if it has a higher likelihood than the current assignment. +""" +accept_reject_update! + +function accept_reject_update!(a::Assignment, swap::Swap, g, ::Strict) + # calculate the score of the current assignment + current_score = score(a, g) + # perform the swap + apply_swap!(a, swap) + # if the new assignment is worse, revert the swap + if score(a, g) <= current_score + revert_swap!(a, swap) + end +end diff --git a/src/old_messy/optimisation/config_rules/bandwidth_selection_rule.jl b/src/old_messy/optimisation/config_rules/bandwidth_selection_rule.jl new file mode 100644 index 0000000..efa4da2 --- /dev/null +++ b/src/old_messy/optimisation/config_rules/bandwidth_selection_rule.jl @@ -0,0 +1,101 @@ +abstract type KSelectionRule end +struct OracleK <: KSelectionRule + K::Int +end +struct OracleM{F} <: KSelectionRule + M::F + α::F +end + +struct OracleH <: KSelectionRule + H::Int +end + +function OracleM(M) + return OracleM(M, 1.0) +end + +abstract type EstimatedM <: KSelectionRule end +struct EstimatedEigenvalues <: EstimatedM end +struct EstimatedDegrees <: EstimatedM end + +""" + select_number_node_per_block(g::Observations, rule::KSelectionRule) + +How to select the number of blocks `K` for the BlockModel model. 
+ +# Implemented rules +- `OracleK(K::Int)`: Use the oracle number of blocks `K`. +- `OracleM(M::Int)`: Give the Holder constant `M` of the graphon, use the results from + [Olhede and Wolfe (2014)](https://www.pnas.org/doi/epdf/10.1073/pnas.1400374111) to + estimate the number of blocks `K`. +- `EstimatedEigenvalues()`: Use the estimated eigenvalues of the adjacency matrix to + estimate the Holder constant and then use `OracleM` to estimate the number of blocks `K`. +- `EstimatedDegrees()`: Use the estimated degrees of the adjacency matrix to estimate the + Holder constant and then use `OracleM` to estimate the number of blocks `K`. +- `OracleH(H::Int)`: Use the oracle number of nodes per block `H`. + +!!! info + - The number of blocks `K` should be at most `n/2` where `n` is the number of nodes in + the graph. + - The estimated Holder constant `M` comes from equation (11) in Olhede and Wolfe (2014). +""" +select_number_node_per_block + +function select_number_node_per_block(g::Observations, rule::OracleH) + if rule.H > number_nodes(g) ÷ 2 + throw(ArgumentError("The number of nodes per block $(rule.H) is too large for the \ + number of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)")) + end + if rule.H <= 1 + throw(ArgumentError("The number of nodes per block $(rule.H) is too small, it should \ + be at least 2")) + end + return rule.H +end + +function select_number_node_per_block(g::Observations, rule::OracleK) + nodes_per_block = number_nodes(g) ÷ rule.K + return select_number_node_per_block(g, OracleH(nodes_per_block)) +end + +function select_number_node_per_block(g::Observations, rule::OracleM) + rho = density(g) + n = number_nodes(g) + h = min(max(2, round(Int, (2 * rule.M * rho)^(-1 / 4) * sqrt(n))), n ÷ 2) + return select_number_node_per_block(g, OracleH(h)) +end + +function select_number_node_per_block(g::Observations, rule::EstimatedM) + n = number_nodes(g) + c = min(4, sqrt(n) / 8) + number_points_from_mid = round(Int, c * sqrt(n)) + 
mid_points = max(1, n ÷ 2 - number_points_from_mid):(n ÷ 2 + number_points_from_mid) + m = estimated_number_nodes_per_block(g, rule, mid_points, density(g)) + return select_number_node_per_block(g, OracleH(m)) +end + +function estimated_number_nodes_per_block( + g::Observations, ::EstimatedEigenvalues, points, rho) + @warn "Check this method again" + decomp, = partialschur(get_adj(g), nev = 1, which = :LR) + u, λ = real.(decomp.Q), decomp.eigenvalues[1] + return _approx_k_from_delta_f(u, λ, points, rho) +end + +function estimated_number_nodes_per_block( + g::Observations, ::EstimatedDegrees, points, rho) + d = get_degree(g) + mult = ((d' * get_adj(g) * d) / (sum(d .^ 2))^2)[1] + return _approx_k_from_delta_f(d, mult, points, rho) +end + +function _approx_k_from_delta_f(u, mult, midpoints, ρ, α = 1.0) + sort!(u, dims = 1) + uMid = u[midpoints] + β₀, β₁ = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid + # from Olhede and Wolfe (2014), equation (11) + h = (2^(α + 1) * α * mult^2 * (β₁ * length(uMid) / 2 + β₀)^2 * β₁^2 * + ρ^(-1))^(-1 / (2 * (α + 1))) + return round(Int, h) +end diff --git a/src/old_messy/optimisation/config_rules/include.jl b/src/old_messy/optimisation/config_rules/include.jl new file mode 100644 index 0000000..8c5b6bf --- /dev/null +++ b/src/old_messy/optimisation/config_rules/include.jl @@ -0,0 +1,5 @@ +include("swap_rule.jl") +include("accept_rule.jl") +include("InitRule.jl") +include("stop_rule.jl") +include("bandwidth_selection_rule.jl") diff --git a/src/old_messy/optimisation/config_rules/stop_rule.jl b/src/old_messy/optimisation/config_rules/stop_rule.jl new file mode 100644 index 0000000..89575dc --- /dev/null +++ b/src/old_messy/optimisation/config_rules/stop_rule.jl @@ -0,0 +1,49 @@ +abstract type StopRule end + +function initialise_stop_rule!(stop_rule::StopRule, a, g) +end + +# default score is the log likelihood +function score(a::Assignment, g::Observations) + return loglikelihood(a, g) / binomial(number_nodes(a), 2) +end + +mutable 
struct PreviousBestValue{T} <: StopRule + k::Int + previous_best_value::T + iterations_since_best::Int + function PreviousBestValue( + k::Int, x::T = -Inf) where {T <: Real} + @assert k > 0 + # queue stores the best values and at most k subsequent values + new{T}(k, x, 0) + end +end + +function initialise_stop_rule!(stop_rule::PreviousBestValue, a, g) + score_value = score(a, g) + stop_rule.previous_best_value = score_value +end + +""" + stopping_rule(assignment::Assignment,g, stop_rule::StopRule) + +Returns a Bool with true if we should stop the optimization based on the `stop_rule`. + +# Implemented rules +- `PreviousBestValue(k)`: Stop if the current iteration is `k` iterations away from the + iteration with the best value. +""" +stopping_rule + +function stopping_rule( + assignment::Assignment, g, stop_rule::PreviousBestValue) + score_value = score(assignment, g) + if score_value > stop_rule.previous_best_value + stop_rule.previous_best_value = score_value + stop_rule.iterations_since_best = 0 + else + stop_rule.iterations_since_best += 1 + end + return stop_rule.iterations_since_best >= stop_rule.k +end diff --git a/src/old_messy/optimisation/config_rules/swap_rule.jl b/src/old_messy/optimisation/config_rules/swap_rule.jl new file mode 100644 index 0000000..e811d4f --- /dev/null +++ b/src/old_messy/optimisation/config_rules/swap_rule.jl @@ -0,0 +1,27 @@ +abstract type NodeSwapRule end + +struct RandomNodeSwap <: NodeSwapRule end +struct RandomGroupSwap <: NodeSwapRule end +""" + select_swap(node_assignment::Assignment, ::NodeSwapRule) + +Selects two nodes to swap based on the `NodeSwapRule`, the adjacency matrix `A` and the +current assignment `node_assignment`. + +# Implemented rules +- `RandomNodeSwap()`: Select two nodes at random. +- `RandomGroupSwap()`: Select two nodes from two different groups at random. 
+""" +select_swap + +function select_swap(assignment::Assignment, ::RandomNodeSwap) + return StatsBase.sample(1:number_nodes(assignment), 2; replace = false) +end + +function select_swap(assignment::Assignment, ::RandomGroupSwap) + groups = StatsBase.sample( + 1:number_groups(assignment), 2; replace = false) + index1 = rand(get_vertex_in_group(assignment, groups[1])) + index2 = rand(get_vertex_in_group(assignment, groups[2])) + return (index1, index2) +end diff --git a/src/old_messy/optimisation/fit.jl b/src/old_messy/optimisation/fit.jl new file mode 100644 index 0000000..8e5985a --- /dev/null +++ b/src/old_messy/optimisation/fit.jl @@ -0,0 +1,99 @@ +# Slow fallback methods for the Assignment type +# Speed up by implementing specialized methods for the BernoulliAssignment type and others + +""" + fit(a::Assignment, g::Observations) + +Compute the estimator from node clustering as specified in the assignment. + +# Arguments +- `a::Assignment`: The assignment of nodes to blocks. +- `g::Observations`: The graph observations. + +# Returns +- `dists`: The fitted distributions. +""" +function fit(a::Assignment, g::Observations) + dists = initialize_sbm(a.group_size, g.dist_ref) + fit!(dists, g, a) + return dists +end + +""" + fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where {G,D,K,F} + +Fit the SBM to the given graph observations and assignment. + +# Arguments +- `sbm::BlockModel{D,K,F}`: The block model to fit. +- `g::Observations{G,D}`: The graph observations. +- `a::Assignment`: The assignment of nodes to blocks. 
+""" +function fit!(sbm::BlockModel{D, K, F}, g::Observations{G, D}, + a::Assignment) where {G, D, K, F} + for group1 in 1:number_groups(a) + for group2 in group1:number_groups(a) + edge_indices = get_edge_indices(a, group1, group2) + sbm[group1, group2] = fit_group(g.dist_ref, g, edge_indices) + end + end +end + +function fit_group(d::ZeroInflatedCategorical, g, edges) + return Distributions.fit( + typeof(d), get_obs.(Ref(g), edges), ncategories(g.dist_ref)) +end + +function fit_group(distribution, g, edges) + return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) +end + +function fit_group(distribution::Binomial, g, edges) + return Distributions.fit( + typeof(distribution), ntrials(distribution), get_obs.(Ref(g), edges)) +end + +""" + loglikelihood(a::Assignment, g::Observations) + +Compute the log likelihood of a BlockModel fitted according to the assignment. + +# Arguments +- `a::Assignment`: The assignment of nodes to blocks. +- `g::Observations`: The graph observations. + +# Returns +- `log_likelihood`: The log likelihood of the fitted model. +""" +function loglikelihood(a::Assignment, g::Observations) + return _log_likelihood(a, fit(a, g), g) +end + +function _log_likelihood(a::Assignment, sbm::BlockModel, g) + log_likelihood = 0.0 + for i in 1:number_nodes(a) + label_a = a.node_labels[i] + for j in (i + 1):number_nodes(a) + label_b = a.node_labels[j] + log_likelihood += logdensityof( + sbm[label_a, label_b], get_obs(g, i, j)) + end + end + return log_likelihood +end + +""" + fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}) where {G,D,K,F} + +Fit the SBM to the given graph observations. + +# Arguments +- `sbm::BlockModel{D,K,F}`: The block model to fit. +- `g::Observations{G,D}`: The graph observations. 
+""" +function fit!( + sbm::BlockModel{D, K, F}, g::Observations{G, D}) where {G, D, K, F} + k = number_blocks(sbm) + a = estimate_graphon(g, select_number_node_per_block(g, OracleK(k))) + fit!(sbm, g, a) +end diff --git a/src/old_messy/optimisation/include.jl b/src/old_messy/optimisation/include.jl new file mode 100644 index 0000000..044e887 --- /dev/null +++ b/src/old_messy/optimisation/include.jl @@ -0,0 +1,3 @@ +include("fit.jl") +include("swap.jl") +include("least_squares.jl") diff --git a/src/old_messy/optimisation/least_squares.jl b/src/old_messy/optimisation/least_squares.jl new file mode 100644 index 0000000..9aae0db --- /dev/null +++ b/src/old_messy/optimisation/least_squares.jl @@ -0,0 +1,95 @@ +include("config_rules/include.jl") + +""" + estimate_graphon(graph, h; iterations, initialise_rule, swap_rule, accept_rule, stop_rule, progress_bar) + +Estimate the graphon for the given graph. + +# Arguments +- `graph`: The input graph. +- `h`: Number of nodes per block. +- `iterations`: Maximum number of iterations. +- `initialise_rule::InitRule`: Rule for initializing the assignment. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. +- `stop_rule::StopRule`: Rule for stopping the iterations. +- `progress_bar::Bool`: Whether to show a progress bar. + +# Returns +- `a`: The assignment of nodes to blocks. 
+""" +function estimate_graphon( + graph, h = select_number_node_per_block(graph, EstimatedDegrees()); + iterations::Int = 10_000, + initialise_rule::InitRule = InitRule(SpectralStart(), nothing), + swap_rule::NodeSwapRule = RandomNodeSwap(), + accept_rule::AcceptRule = Strict(), + stop_rule::StopRule = PreviousBestValue(1000), + progress_bar::Bool = false +) + a = make_assignment(graph, h, initialise_rule) + @debug a + initialise_stop_rule!(stop_rule, a, graph) + greedy_improve!( + a, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) + return a +end + +""" + greedy_improve!(a::Assignment, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) + +Perform greedy improvement on the assignment. + +# Arguments +- `a::Assignment`: The assignment of nodes to blocks. +- `graph`: The input graph. +- `iterations`: Maximum number of iterations. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. +- `stop_rule::StopRule`: Rule for stopping the iterations. +- `progress_bar::Bool`: Whether to show a progress bar. +""" +function greedy_improve!(a::Assignment, graph; iterations::Int = 10_000, + swap_rule::NodeSwapRule = RandomNodeSwap(), + accept_rule::AcceptRule = Strict(), + stop_rule::StopRule = PreviousBestValue(1000), + progress_bar::Bool = false +) + # swap memory allocation + swap = make_swap(a, (1, 1)) + p = ProgressUnknown( + enabled = progress_bar, showspeed = true, desc = "Greedy search: ") + # perform local search until the stopping rule is met + for i in 1:iterations + local_search!( + a, graph, swap, swap_rule = swap_rule, accept_rule = accept_rule) + next!(p) + if stopping_rule(a, graph, stop_rule) + finish!(p) + break + end + end +end + +""" + local_search!(a::Assignment, graph, swap; swap_rule, accept_rule) + +Perform local search by trying a swap and accepting it if it improves the likelihood. + +# Arguments +- `a::Assignment`: The assignment of nodes to blocks. 
+- `graph`: The input graph. +- `swap`: The swap object. +- `swap_rule::NodeSwapRule`: Rule for swapping nodes. +- `accept_rule::AcceptRule`: Rule for accepting swaps. +""" +function local_search!( + a::Assignment, graph, swap::Swap = make_swap(a, (1, 1)); + swap_rule::NodeSwapRule = RandomNodeSwap(), + accept_rule::AcceptRule = Strict() +) + # select two nodes to swap and build the swap object + make_swap!(swap, a, select_swap(a, swap_rule)) + # perform the swap and accept it if it improves the likelihood + accept_reject_update!(a, swap, graph, accept_rule) +end diff --git a/src/old_messy/optimisation/swap.jl b/src/old_messy/optimisation/swap.jl new file mode 100644 index 0000000..69b1ff8 --- /dev/null +++ b/src/old_messy/optimisation/swap.jl @@ -0,0 +1,26 @@ +abstract type Swap end + +mutable struct DefaultSwap <: Swap + index1::Int + index2::Int +end + +function make_swap(::Assignment, id) + return DefaultSwap(id[1], id[2]) +end + +function make_swap!(swap::DefaultSwap, ::Assignment, id) + swap.index1, swap.index2 = id +end + +function apply_swap!(a::Assignment, s::DefaultSwap) + swap_node_labels!(a, s.index1, s.index2) +end + +function revert_swap!(assignment::Assignment, swap::DefaultSwap) + apply_swap!(assignment, swap) +end + +function swap_node_labels!(a::Assignment, i, j) + a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] +end diff --git a/src/old_messy/sbm.jl b/src/old_messy/sbm.jl new file mode 100644 index 0000000..1fad688 --- /dev/null +++ b/src/old_messy/sbm.jl @@ -0,0 +1,186 @@ +# TODO: remove BlockModel being a subtype of AbstractMatrix +# this was fun but useless and actually harmful + +struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} + sizes::Vector{F} + probs::SymmetricTensor{T, K, 2} +end + +function BlockModel( + θ::AbstractMatrix{T}, sizes::Vector{F}) where {T, F <: Real} + return BlockModel(sizes, + SymmetricTensor([θ[i, j] for i in 1:size(θ, 1) for j in i:size(θ, 2)], + Val(length(sizes)), Val(2))) +end + 
+function edge_type(::BlockModel{T, K, F}) where {T, K, F} + return eltype(T) +end + +function _check_sizes(sizes) + @assert sum(sizes)≈1 "Sizes must sum to 1, got $(sum(sizes))" + return sizes +end + +function _check_sizes(sizes::Vector{Int}) + return sizes ./ sum(sizes) +end + +function initialize_sbm(sizes::Vector, dist, k = length(sizes)) + sizes = _check_sizes(sizes) + n_dims = binomial(k + 1, 2) + probs = Vector{typeof(dist)}(undef, n_dims) + fill!(probs, dist) + return BlockModel(sizes, SymmetricTensor(probs, Val(k), Val(2))) +end + +function initialize_sbm(sizes::GroupSize, dist, k = length(sizes)) + size_bins = sizes ./ sum(sizes) + n_dims = binomial(k + 1, 2) + probs = Vector{typeof(dist)}(undef, n_dims) + fill!(probs, dist) + return BlockModel(size_bins, SymmetricTensor(probs, Val(k), Val(2))) +end + +function initialize_sbm(k::Int, dist) + return initialize_sbm(ones(k) / k, dist) +end + +number_blocks(::BlockModel{T, K, F}) where {T, K, F} = K + +Base.size(s::BlockModel) = size(s.probs) +Base.ndims(::BlockModel) = 2 +Base.eltype(::BlockModel{T, K, F}) where {T, K, F} = T +Base.setindex!(s::BlockModel, v, i, j) = setindex!(s.probs, v, i, j) +Base.@propagate_inbounds function Base.getindex(s::BlockModel, i, j) + return getindex(s.probs, i, j) +end + +function sample( + rng::Random.AbstractRNG, sbm::BlockModel, node_labels::Vector{Int}, sorted = false) + n_nodes = length(node_labels) + if sorted + sort!(node_labels) + end + A = zeros(edge_type(sbm), n_nodes, n_nodes) + for j in 1:n_nodes + for i in (j + 1):n_nodes + A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) + end + end + return sparse(Symmetric(A, :L)), node_labels +end + +function draw_and_fill!( + rng::Random.AbstractRNG, A, sbm::BlockModel, sorted = false) + n_blocks = number_blocks(sbm) + n_nodes = size(A, 1) + node_labels = StatsBase.sample( + rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) + if sorted + sort!(node_labels) + end + @inbounds for j in 
1:n_nodes + for i in (j + 1):n_nodes + A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) + end + end + A .= Symmetric(A, :L) +end + +function draw_and_fill!(A, sbm, sorted = false) + draw_and_fill!(Random.default_rng(), A, sbm, sorted) +end + +function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted = false) + sample(Random.default_rng(), sbm, node_labels, sorted) +end +function sample( + rng::Random.AbstractRNG, sbm::BlockModel, n_nodes::Int, sorted = false) + n_blocks = number_blocks(sbm) + node_labels = StatsBase.sample( + rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) + if sorted + sort!(node_labels) + end + return sample(rng, sbm, node_labels) +end + +function sample(sbm::BlockModel, n_nodes::Int, sorted = false) + sample(Random.default_rng(), sbm, n_nodes, sorted) +end + +function get_probability_matrix(sbm::BlockModel, node_labels::Vector{Int}) + return sbm.probs[node_labels, node_labels] +end + +function _get_params_as_vec(dist::Distribution) + return vcat(params(dist)...) +end + +function latent_to_block_index(latents_vec, sbm::BlockModel) + cum_sum_sizes = cumsum(sbm.sizes) + cum_sum_sizes[end] = 1.0 + return [findfirst(x -> x >= l, cum_sum_sizes) for l in latents_vec] +end + +""" + best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) + +Find the best permutation of the blocks of `fitted_sbm` to match the blocks of `true_sbm` by +comparing the mean absolute difference of the parameters of the two models. +If the difference between the two models is less than `tol`, the function stops early. + +!!! warning + This function is not efficient for large numbers of blocks, as it uses brute force to + find the best permutation. 
+""" +function best_alignment( + fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) + k = number_blocks(fitted_sbm) + if k != number_blocks(true_sbm) + throw(ArgumentError("The number of blocks must be the same for both models")) + end + best_perm = nothing + best_loss = Inf + fitted_params = _get_params_as_vec.(fitted_sbm) + true_params = _get_params_as_vec.(true_sbm) + for perm in permutations(1:k) + loss = sum(map(x -> sum(abs.(x)), fitted_params[perm] .- true_params)) + if loss < best_loss + best_loss = loss + best_perm = perm + end + if best_loss < tol + break + end + end + return best_perm +end + +function align_sbm!(sbm::BlockModel, perm) + sbm.probs .= sbm.probs[perm, perm] + sbm.sizes .= sbm.sizes[perm] +end + +""" + order_groups(a::Assignment, latents::AbstractVector) + +Order the groups of an assignment according to the true latents. This is an heuristic +approach, which is not guaranteed to find the true ordering of the groups. +""" +function order_groups(a::Assignment, latents::AbstractVector) + n = number_nodes(a) + k = number_groups(a) + sort_perm = sortperm(latents) + sorted_group_labels = a.node_labels[sort_perm] + dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] + counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) + for group in 1:k) + return sort( + 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) +end + +function align_sbm_true_latents!(sbm::BlockModel, a::Assignment, latents) + align_sbm!(sbm, order_groups(a, latents)) +end diff --git a/src/optimization/config_rules/InitRule.jl b/src/optimization/config_rules/InitRule.jl new file mode 100644 index 0000000..23ac2fa --- /dev/null +++ b/src/optimization/config_rules/InitRule.jl @@ -0,0 +1,44 @@ +abstract type StartingAssignment end +struct OrderedStart <: StartingAssignment end +struct RandomStart <: StartingAssignment end + + +struct FromAssignment{A} <: StartingAssignment + assignment::A +end + +struct InitRule{S <: 
StartingAssignment, I} + starting_assignment_rule::S + assignment_rule::I +end + + +# check that this is necessary! +function make_assignment(g, h, init_rule::InitRule{S, Nothing}) where {S} + return Assignment(initialize_node_labels( + g, h, init_rule.starting_assignment_rule)...) +end + +""" + initialize_node_labels(g, h, starting_assignment_rule::StartingAssignment) + +initialize node labels based on the `starting_assignment_rule`, and return a `GroupSize` +object and a vector of node labels. + +# Implemented rules +- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. +- `RandomStart()`: Randomly assign nodes to groups. +""" +initialize_node_labels + +function initialize_node_labels(g, h, ::OrderedStart) + group_size = GroupSize(number_nodes(g), h) + node_labels = StatsBase.inverse_rle(1:length(group_size), group_size) + return group_size, node_labels +end + +function initialize_node_labels(g, h, ::RandomStart) + group_size, node_labels = initialize_node_labels(g, h, OrderedStart()) + Random.shuffle!(node_labels) + return group_size, node_labels +end diff --git a/src/optimization/config_rules/accept_rule.jl b/src/optimization/config_rules/accept_rule.jl new file mode 100644 index 0000000..94ce206 --- /dev/null +++ b/src/optimization/config_rules/accept_rule.jl @@ -0,0 +1,22 @@ +abstract type AcceptRule end +struct Strict <: AcceptRule end + +""" + accept_reject_update!(a::Assignment, swap::Swap, g, accept_rule::AcceptRule) + + +Perform the swap and accept it if it improves the likelihood of the assignment. `a` will +be updated in place if the swap is accepted. + +# Implemented rules +- `Strict()`: Accept the proposal if it has a higher likelihood than the current assignment. +""" +accept_reject_update! 
+
+function accept_reject_update!(a::Assignment, swap::Swap, ::Strict)
+    current_score = loglikelihood(a)  # likelihood before the tentative swap
+    apply_swap!(a, swap)
+    if loglikelihood(a) <= current_score  # no strict improvement: undo the swap
+        revert_swap!(a, swap)
+    end
+end
diff --git a/src/optimization/config_rules/bandwidth_selection_rule.jl b/src/optimization/config_rules/bandwidth_selection_rule.jl
new file mode 100644
index 0000000..0606c11
--- /dev/null
+++ b/src/optimization/config_rules/bandwidth_selection_rule.jl
@@ -0,0 +1,47 @@
+abstract type KSelectionRule end
+struct OracleK <: KSelectionRule
+    K::Int
+end
+
+struct OracleH <: KSelectionRule
+    H::Int
+end
+
+"""
+    select_number_node_per_block(g, rule::KSelectionRule)
+
+Select the number of nodes per block `h` used when fitting a block model to `g`.
+
+# Implemented rules
+- `OracleK(K::Int)`: Use the oracle number of blocks `K`, i.e. `h = number_nodes(g) ÷ K`.
+- `OracleH(H::Int)`: Use the oracle number of nodes per block `H`.
+
+# Throws
+- `ArgumentError` if `K` is not positive, or if `h` is smaller than 2 or larger than
+  `number_nodes(g) ÷ 2`.
+
+!!! info
+    - The number of blocks `K` should be at most `n/2` where `n` is the number of nodes in
+      the graph.
+"""
+select_number_node_per_block
+
+function select_number_node_per_block(g, rule::OracleH)
+    if rule.H > number_nodes(g) ÷ 2
+        throw(ArgumentError("The number of nodes per block $(rule.H) is too large for the \
+            number of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)"))
+    end
+    if rule.H <= 1
+        throw(ArgumentError("The number of nodes per block $(rule.H) is too small, it should \
+            be at least 2"))
+    end
+    return rule.H
+end
+
+function select_number_node_per_block(g, rule::OracleK)
+    if rule.K <= 0
+        throw(ArgumentError("The number of blocks $(rule.K) must be positive"))
+    end
+    nodes_per_block = number_nodes(g) ÷ rule.K
+    return select_number_node_per_block(g, OracleH(nodes_per_block))
+end
diff --git a/src/optimization/config_rules/include.jl b/src/optimization/config_rules/include.jl
new file mode 100644
index 0000000..8c5b6bf
--- /dev/null
+++ b/src/optimization/config_rules/include.jl
@@ -0,0 +1,5 @@
+include("swap_rule.jl")
+include("accept_rule.jl")
+include("InitRule.jl")
+include("stop_rule.jl")
+include("bandwidth_selection_rule.jl")
diff --git a/src/optimization/config_rules/stop_rule.jl b/src/optimization/config_rules/stop_rule.jl
new file mode 100644
index 0000000..a4e12c5
--- /dev/null
+++ b/src/optimization/config_rules/stop_rule.jl
@@ -0,0 +1,56 @@
+abstract type StopRule end
+
+"""
+    initialise_stop_rule!(stop_rule::StopRule, a, g = nothing)
+
+Prepare `stop_rule` for a search starting from the assignment `a`. The generic fallback
+does nothing; `g` is ignored and only accepted so that three-argument calls keep working.
+"""
+function initialise_stop_rule!(stop_rule::StopRule, a, g = nothing)
+end
+
+# default score: the log likelihood divided by the number of node pairs
+function score(a::Assignment)
+    return loglikelihood(a) / binomial(number_nodes(a), 2)
+end
+
+mutable struct PreviousBestValue{T} <: StopRule
+    k::Int
+    previous_best_value::T
+    iterations_since_best::Int
+    function PreviousBestValue(k::Int, x::T = -Inf) where {T <: Real}
+        @assert k > 0
+        # fields: patience `k`, best score seen so far, iterations since that best score
+        new{T}(k, x, 0)
+    end
+end
+
+function initialise_stop_rule!(stop_rule::PreviousBestValue, a)
+    # start from the score of `a` and drop any state left over from a previous search
+    score_value = score(a)
+    stop_rule.previous_best_value = score_value
+    stop_rule.iterations_since_best = 0
+    return score_value
+end
+
+"""
+    stopping_rule(assignment::Assignment, stop_rule::StopRule)
+
+Returns a Bool with true if we should stop the optimization based on the `stop_rule`.
+
+# Implemented rules
+- `PreviousBestValue(k)`: Stop if the current iteration is `k` iterations away from the
+  iteration with the best value.
+"""
+stopping_rule
+
+function stopping_rule(assignment::Assignment, stop_rule::PreviousBestValue)
+    score_value = score(assignment)
+    if score_value > stop_rule.previous_best_value
+        stop_rule.previous_best_value = score_value
+        stop_rule.iterations_since_best = 0
+    else
+        stop_rule.iterations_since_best += 1
+    end
+    return stop_rule.iterations_since_best >= stop_rule.k
+end
diff --git a/src/optimization/config_rules/swap_rule.jl b/src/optimization/config_rules/swap_rule.jl
new file mode 100644
index 0000000..63cb5b0
--- /dev/null
+++ b/src/optimization/config_rules/swap_rule.jl
@@ -0,0 +1,27 @@
+abstract type NodeSwapRule end
+
+struct RandomNodeSwap <: NodeSwapRule end
+struct RandomGroupSwap <: NodeSwapRule end
+"""
+    select_indices_swap(node_assignment::Assignment, ::NodeSwapRule)
+
+Selects the indices of two nodes to swap based on the `NodeSwapRule` and the current
+assignment `node_assignment`.
+
+# Implemented rules
+- `RandomNodeSwap()`: Select two nodes at random.
+- `RandomGroupSwap()`: Select two nodes from two different groups at random.
+"""
+select_indices_swap
+
+function select_indices_swap(assignment::Assignment, ::RandomNodeSwap)
+    return StatsBase.sample(1:number_nodes(assignment), 2; replace = false)
+end
+
+function select_indices_swap(assignment::Assignment, ::RandomGroupSwap)
+    groups = StatsBase.sample(
+        1:number_groups(assignment), 2; replace = false)
+    index1 = rand(get_vertex_in_group(assignment, groups[1]))
+    index2 = rand(get_vertex_in_group(assignment, groups[2]))
+    return (index1, index2)
+end
diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl
new file mode 100644
index 0000000..8983b54
--- /dev/null
+++ b/src/optimization/greedy.jl
@@ -0,0 +1,76 @@
+include("swap_workspace.jl")
+include("config_rules/include.jl")
+
+"""
+    GreedyParams(max_iter, swap_rule, accept_rule, stop_rule, progress_bar)
+    GreedyParams()
+
+Settings of the greedy search: maximum number of iterations, how candidate swaps are
+selected and accepted, when to stop early, and whether to display a progress bar.
+The zero-argument constructor uses `10_000` iterations, `RandomNodeSwap()`, `Strict()`,
+`PreviousBestValue(1000)` and no progress bar.
+"""
+mutable struct GreedyParams
+    max_iter::Int
+    swap_rule::NodeSwapRule
+    accept_rule::AcceptRule
+    stop_rule::StopRule
+    progress_bar::Bool
+end
+
+function GreedyParams()
+    return GreedyParams(
+        10_000, RandomNodeSwap(), Strict(), PreviousBestValue(1000), false)
+end
+
+"""
+    greedy_optimize(g, initial_labels, params::GreedyParams)
+
+Build an `Assignment` from `g` and `initial_labels`, improve it with `greedy_improve!`
+and return it.
+"""
+function greedy_optimize(g, initial_labels, params::GreedyParams)
+    a = Assignment(g, initial_labels)
+    greedy_improve!(a; params = params)
+    return a
+end
+
+"""
+    greedy_improve!(a::Assignment; params = GreedyParams())
+
+Improve `a` in place by repeatedly trying a swap of two nodes, for at most
+`params.max_iter` iterations or until `params.stop_rule` asks to stop. The stop rule is
+re-initialised from `a` before the search starts.
+"""
+function greedy_improve!(a::Assignment; params = GreedyParams())
+    # allocate memory for swap
+    swap = make_swap(a, (1, 1))
+
+    # reset the stop rule so that state left over from a previous search is not reused
+    initialise_stop_rule!(params.stop_rule, a)
+
+    # display progress bar
+    p = ProgressUnknown(
+        enabled = params.progress_bar, showspeed = true, desc = "Greedy search: ")
+
+    for _ in 1:params.max_iter
+        local_search!(a, swap, params)
+        next!(p)
+        stopping_rule(a, params.stop_rule) && break
+    end
+    # close the progress bar whether we stopped early or ran out of iterations
+    finish!(p)
+    return nothing
+end
+
+"""
+    local_search!(a::Assignment, swap, params::GreedyParams)
+
+Perform one step of the search: select two nodes, try to swap them, and keep the swap only
+if the accept rule allows it.
+"""
+function local_search!(a::Assignment, swap, params::GreedyParams)
+    # select two nodes to swap and update data in the swap object
+    make_swap!(swap, a, select_indices_swap(a, params.swap_rule))
+    # apply swap, test if local improvement and update assignment if needed
+    accept_reject_update!(a, swap, params.accept_rule)
+end
diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl
new file mode 100644
index 0000000..ba99540
---
/dev/null +++ b/src/optimization/swap_workspace.jl @@ -0,0 +1,75 @@ +mutable struct WorkspaceSwap{D,K,F} + θ::SymArray{D} + log_likelihood_per_group::SymArray{F} +end + +mutable struct Swap{D, F} + u::Int + v::Int + workspace::WorkspaceSwap{D,F} +end + + +function make_swap(a::Assignment, id) + return Swap(id[1], id[2], WorkspaceSwap(deepcopy(a.θ), deepcopy(a.log_likelihood))) +end + +function make_swap!(swap::Swap, a::Assignment, id) + swap.index1, swap.index2 = id + swap.workspace.θ = deepcopy(a.θ) + swap.workspace.log_likelihood_per_group = deepcopy(a.log_likelihood) +end + +function revert_swap!(assignment::Assignment, swap::Swap) + apply_swap!(assignment, swap) + assignment.θ = deepcopy(swap.workspace.θ) + assignment.log_likelihood = deepcopy(swap.workspace.log_likelihood_per_group) +end + +function swap_node_labels!(a::Assignment, i, j) + a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] +end + +function apply_swap!(a::Assignment, s::Swap) + g1 = get_group_of_vertex(a, s.index1) + g2 = get_group_of_vertex(a, s.index2) + groups_concerned = Set([minmax(g1, g2)]) + for (u, g_old, g_new) in [(s.index1, g1, g2), (s.index2, g2, g1)] + # iterate over neighbors of u and get the decoration of the edge + for (v,d) in a.dists[u] + g_v = get_group_of_vertex(a, v) + a.θ[g_old, g_v] = remove_from(a.θ[g_old, g_v], d) + a.θ[g_new, g_v] = add_to(a.θ[g_new, g_v], d) + push!(groups_concerned, minmax(g_new, g_v)) + push!(groups_concerned, minmax(g_old, g_v)) + end + end + fast_ll_update!(a, groups_concerned) + + swap_node_labels!(a, s.index1, s.index2) +end + + +## below can be specialised for Bernoulli probably + +function fast_ll_update!(a, groups_concerned) + for g in groups_concerned + a.log_likelihood[g[1], g[2]] = _fast_ll_one_group(a, g[1], g[2]) + end +end + + +function _fast_ll_one_group(a::Assignment, g1, g2) + nodes_g1 = findall(x -> x == g1, a.node_labels) + nodes_g2 = findall(x -> x == g2, a.node_labels) + ll = 0.0 + d = a.θ[g1, g2] + for u in 
nodes_g1 + for (v,e) in a.edges[u] # assume implicitly that g1 != g2 + if v in nodes_g2 + ll += loglikelihood(d, e) + end + end + end + return ll +end diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl new file mode 100644 index 0000000..7d71e4a --- /dev/null +++ b/src/utils/SymArray.jl @@ -0,0 +1,28 @@ +module FastSymArray + + export SymArray + + mutable struct SymArray{F} + d::Dict{Tuple{Int, Int}, F} + k::Int + end + + function SymArray(k, d::F) where {F} + @assert k > 0 + return SymArray{F}(Dict{Tuple{Int, Int}, F}(minmax(i, j) => d for i in 1:k + for j in i:k), k) + end + + Base.@propagate_inbounds function Base.getindex(a::SymArray, i, j) + return a.d[minmax(i, j)] + end + + function Base.setindex!(a::SymArray, v, i, j) + a.d[minmax(i, j)] = v + end + + + function Base.sum(a::SymArray) + return sum(values(a.d)) + end +end diff --git a/src/utils/include.jl b/src/utils/include.jl new file mode 100644 index 0000000..2e9c8ad --- /dev/null +++ b/src/utils/include.jl @@ -0,0 +1 @@ +include("SymArray.jl") diff --git a/test/old_tests/TestNetworkHistogram.jl b/test/old_tests/TestNetworkHistogram.jl new file mode 100644 index 0000000..0018f70 --- /dev/null +++ b/test/old_tests/TestNetworkHistogram.jl @@ -0,0 +1,30 @@ +module TestNetworkHistogram + +import NetworkHistogram as NH +using Test + +function to_default_assignment(a_specialised::NH.Assignment{T, B}) where {T, B} + return NH.Assignment(a_specialised.group_size, a_specialised.node_labels) +end + +to_default_assignment(a::NH.Assignment{T, Nothing}) where {T} = a + +function test_swap_revertible( + a::NH.Assignment, swap::NH.Swap, g::NH.Observations) + a_test = deepcopy(a) + NH.apply_swap!(a_test, swap) + @test NH.get_group_of_vertex(a, swap.index1) == + NH.get_group_of_vertex(a_test, swap.index2) + @test NH.get_group_of_vertex(a, swap.index2) == + NH.get_group_of_vertex(a_test, swap.index1) + # force recomputation of the log likelihood using default assignment + a_new = to_default_assignment(a_test) + 
@test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) + + # revert the swap and check if the assignment is the same as before + NH.revert_swap!(a_test, swap) + @test a == a_test + @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) +end + +end diff --git a/test/old_tests/assignments/bernoulli_assignment.jl b/test/old_tests/assignments/bernoulli_assignment.jl new file mode 100644 index 0000000..d683e1a --- /dev/null +++ b/test/old_tests/assignments/bernoulli_assignment.jl @@ -0,0 +1,42 @@ +import NetworkHistogram as NH + +@testset "test construction Bernoulli assignment" begin + using Distributions: Bernoulli + A = [0 1 1 1 0 0 1 0 + 1 0 1 1 0 0 0 0 + 1 1 0 0 0 0 0 0 + 1 1 0 0 0 0 0 1 + 0 0 0 0 0 1 1 1 + 0 0 0 0 1 0 1 1 + 1 0 0 0 1 1 0 0 + 0 0 0 1 1 1 0 0] + obs = NH.Observations(A, Bernoulli(0.5)) + node_labels = [1, 1, 1, 1, 2, 2, 2, 2] + group_size = NH.GroupSize(8, 4) + a = NH.BernoulliAssignment(obs, group_size, node_labels) + for i in 1:8 + @test NH.get_group_of_vertex(a, i) == node_labels[i] + end + @test all(a.additional_data.A .== A) + @test a.additional_data.realized == [5 2; 2 5] + @test a.additional_data.counts == [6 16; 16 6] + @test a.additional_data.estimated_theta == [5/6 1/8; 1/8 5/6] +end + +@testset "test Bernoulli swap" begin + using ..TestNetworkHistogram: test_swap_revertible + using Distributions: Bernoulli + A = [0 1 1 1 0 0 1 0 + 1 0 1 1 0 0 0 0 + 1 1 0 0 0 0 0 0 + 1 1 0 0 0 0 0 1 + 0 0 0 0 0 1 1 1 + 0 0 0 0 1 0 1 1 + 1 0 0 0 1 1 0 0 + 0 0 0 1 1 1 0 0] + obs = NH.Observations(A, Bernoulli(0.5)) + a = NH.BernoulliAssignment( + obs, NH.GroupSize(8, 4), [1, 1, 1, 1, 2, 2, 2, 2]) + swap = NH.make_swap(a, (1, 2)) + test_swap_revertible(a, swap, obs) +end diff --git a/test/old_tests/assignments/categorical_assignment.jl b/test/old_tests/assignments/categorical_assignment.jl new file mode 100644 index 0000000..bd4c4db --- /dev/null +++ b/test/old_tests/assignments/categorical_assignment.jl @@ -0,0 +1,126 @@ +import NetworkHistogram 
as NH + +using Random + +@testset "test Categorical swap" begin + Random.seed!(1234123) + using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment + using Distributions: Categorical + using LinearAlgebra: Symmetric + import Random + m = 2 + p = ones(m) ./ m + n = 12 + k = 4 + dist = Categorical(p) + sbm = NH.initialize_sbm(ones(k) ./ k, dist) + node_labels = repeat(1:k, inner = n ÷ k) + A, _ = NH.sample(sbm, node_labels) + g = NH.Observations(collect(A), dist) + a = NH.CategoricalAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) + swap = NH.make_swap(a, (1, k + 1)) + @test A[:, 1] != A[:, k + 1] + a_test = deepcopy(a) + NH.apply_swap!(a_test, swap) + @test NH.get_group_of_vertex(a, swap.index1) == + NH.get_group_of_vertex(a_test, swap.index2) + @test NH.get_group_of_vertex(a, swap.index2) == + NH.get_group_of_vertex(a_test, swap.index1) + # force recomputation of the log likelihood using default assignment + a_new = to_default_assignment(a_test) + @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) + @test a_test.additional_data.realized != a.additional_data.realized + @test a_test.additional_data.estimated_theta != + a.additional_data.estimated_theta + @test a_test.additional_data.log_likelihood != + a.additional_data.log_likelihood + # revert the swap and check if the assignment is the same as before + NH.revert_swap!(a_test, swap) + @test a == a_test + @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) +end + +@testset "fast update test" begin + using Distributions + realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; + [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + realized = [realized[I][k] + for k in eachindex(realized[1, 1]), + I in CartesianIndices(realized)] + counts = [1 4 4 + 4 1 4 + 4 4 1] + A = [0 1 2 2 3 3 + 1 0 2 2 3 3 + 2 2 0 1 3 3 + 2 2 1 0 3 3 + 3 3 3 3 0 1 + 3 3 3 3 1 0] + groupsize = NH.GroupSize(6, 2) + node_labels = [1, 1, 2, 2, 3, 3] + g = NH.Observations(A, Categorical(3)) + a 
= NH.CategoricalAssignment(g, groupsize, node_labels) + for index in eachindex(realized) + @test all(realized[index] .== a.additional_data.realized[index]) + end + @test loglikelihood(a, g) ≈ 0 + @test a.additional_data.counts == counts + swap_id = (1, 3) + ras = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; + [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + realized_after_swap = [ras[I][k] + for k in eachindex(ras[1, 1]), + I in CartesianIndices(ras)] + + swap = NH.make_swap(a, swap_id) + NH.apply_swap!(a, swap) + for j in 1:3 + for i in 1:3 + @test all(realized_after_swap[:, i, j] .== + a.additional_data.realized[:, i, j]) + @test all(a.additional_data.estimated_theta[:, i, j] .≈ + realized_after_swap[:, i, j] ./ counts[i, j]) + end + end + @test loglikelihood(a, g) == 4 * log(0.5) +end + +#todo: test ll against categorical likelihood on basic assignment +@testset "test swap is not overwritten" begin + A = [0 4 4 2 1 2 2 3 4 2 3 1 4 1 1 3 4 4 3 3 + 4 0 4 2 4 2 1 1 1 3 3 1 1 1 3 3 4 2 1 4 + 4 4 0 1 2 4 2 2 1 3 2 3 1 2 3 2 3 4 1 1 + 2 2 1 0 2 1 2 2 2 3 1 1 3 3 3 3 3 1 1 2 + 1 4 2 2 0 4 1 4 3 2 4 3 4 3 1 3 1 1 1 3 + 2 2 4 1 4 0 2 3 1 3 1 4 3 3 1 3 1 3 3 3 + 2 1 2 2 1 2 0 3 2 2 1 1 1 3 3 1 1 3 1 1 + 3 1 2 2 4 3 3 0 4 3 2 3 1 1 1 1 1 3 2 1 + 4 1 1 2 3 1 2 4 0 3 1 1 1 3 2 1 3 1 4 1 + 2 3 3 3 2 3 2 3 3 0 1 3 1 1 3 1 3 1 1 4 + 3 3 2 1 4 1 1 2 1 1 0 2 3 2 2 1 2 2 1 3 + 1 1 3 1 3 4 1 3 1 3 2 0 4 4 2 2 2 3 1 1 + 4 1 1 3 4 3 1 1 1 1 3 4 0 2 2 1 2 1 1 3 + 1 1 2 3 3 3 3 1 3 1 2 4 2 0 1 2 1 2 1 1 + 1 3 3 3 1 1 3 1 2 3 2 2 2 1 0 2 1 2 1 1 + 3 3 2 3 3 3 1 1 1 1 1 2 1 2 2 0 1 1 1 3 + 4 4 3 3 1 1 1 1 3 3 2 2 2 1 1 1 0 1 1 1 + 4 2 4 1 1 3 3 3 1 1 2 3 1 2 2 1 1 0 1 1 + 3 1 1 1 1 3 1 2 4 1 1 1 1 1 1 1 1 1 0 1 + 3 4 1 2 3 3 1 1 1 4 3 1 3 1 1 3 1 1 1 0] + g = NH.Observations(A, Categorical(4)) + h = 6 + a = NH.make_assignment( + g, h, NH.InitRule(NH.OrderedStart(), Val{NH.CategoricalData}())) + a_ref = deepcopy(a) + swap_indices = [(18, 5), (15, 10), (5, 13)] + swap 
= NH.make_swap(a, swap_indices[1]) + for swap_index in swap_indices + NH.make_swap!(swap, a, swap_index) + NH.apply_swap!(a, swap) + @test swap.realized == a_ref.additional_data.realized + @test swap.estimated_theta == a_ref.additional_data.estimated_theta + NH.revert_swap!(a, swap) + end +end diff --git a/test/old_tests/assignments/default_assignment.jl b/test/old_tests/assignments/default_assignment.jl new file mode 100644 index 0000000..fefbf64 --- /dev/null +++ b/test/old_tests/assignments/default_assignment.jl @@ -0,0 +1,17 @@ +import NetworkHistogram as NH + +@testset "test default swap" begin + using ..TestNetworkHistogram: test_swap_revertible + import Random, LinearAlgebra + using Distributions: Bernoulli, Normal + Random.seed!(1234123) + n = 20 + k = 5 + #data = LinearAlgebra.Symmetric(Random.rand(Bool,n,n)) + data = Random.rand(Normal(), n, n) + g = NH.Observations(data, Normal(0, 1)) + labels = repeat(1:(n ÷ k), inner = k) + a = NH.Assignment(NH.GroupSize(n, k), labels) + swap = NH.DefaultSwap(1, 2) + test_swap_revertible(a, swap, g) +end diff --git a/test/old_tests/assignments/sparse_assignment.jl b/test/old_tests/assignments/sparse_assignment.jl new file mode 100644 index 0000000..a7b56ac --- /dev/null +++ b/test/old_tests/assignments/sparse_assignment.jl @@ -0,0 +1,120 @@ +import NetworkHistogram as NH + +using Random + +@testset "test sparse give the same as categorical" begin + using Distributions, LinearAlgebra, SparseArrays + k = 2 + m = 5 + level_count = 4 + n = 20 + tau = [0.8, 0.1, 0.1, 0.1, 0.1] + sbm = NH.initialize_sbm(ones(k) ./ k, Categorical(tau ./ sum(tau))) + A, _ = NH.sample(sbm, n) + A_dense = collect(A) + A = sparse(A_dense .- 1) + for i in 1:n + A[i, i] = 0 + end + g = NH.Observations(A_dense, Categorical(m)) + sbm_fitted, a = nethist(g; h = n ÷ k, iterations = 10) + sparse_a = NH.SparseAssignment( + NH.Observations(A, Categorical(m)), a.group_size, a.node_labels) + @test a.additional_data.counts == sparse_a.additional_data.counts 
+ for (l, m_index) in enumerate(2:m) + @test a.additional_data.realized[m_index, :, :] == + sparse_a.additional_data.realized[l, :, :] + @test a.additional_data.estimated_theta[m_index, :, :] == + sparse_a.additional_data.estimated_theta[l, :, :] + end + @test a.additional_data.log_likelihood ≈ + sparse_a.additional_data.log_likelihood +end + +@testset "test sparse swap" begin + Random.seed!(1234123) + using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment + using Distributions: DiscreteNonParametric + using LinearAlgebra: Symmetric + import Random + m = 4 + p = ones(m) ./ m + n = 12 + k = 4 + dist = NH.ZeroInflatedCategorical(p) + sbm = NH.initialize_sbm(ones(k) ./ k, dist) + node_labels = repeat(1:k, inner = n ÷ k) + A = sparse(first(NH.sample(sbm, node_labels))) + g = NH.Observations(A, dist) + a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) + swap = NH.make_swap(a, (1, k + 1)) + @test A[:, 1] != A[:, k + 1] + a_test = deepcopy(a) + NH.apply_swap!(a_test, swap) + @test NH.get_group_of_vertex(a, swap.index1) == + NH.get_group_of_vertex(a_test, swap.index2) + @test NH.get_group_of_vertex(a, swap.index2) == + NH.get_group_of_vertex(a_test, swap.index1) + # force recomputation of the log likelihood using default assignment + a_new = to_default_assignment(a_test) + @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) + @test a_test.additional_data.realized != a.additional_data.realized + @test a_test.additional_data.estimated_theta != + a.additional_data.estimated_theta + @test a_test.additional_data.log_likelihood != + a.additional_data.log_likelihood + # revert the swap and check if the assignment is the same as before + NH.revert_swap!(a_test, swap) + @test a == a_test + @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) +end + +@testset "fast sparse update test" begin + using Distributions + realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; + [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 
0]]] + realized = [realized[I][k] + for k in eachindex(realized[1, 1]), + I in CartesianIndices(realized)] + counts = [1 4 4 + 4 1 4 + 4 4 1] + A = sparse([0 1 2 2 3 3 + 1 0 2 2 3 3 + 2 2 0 1 3 3 + 2 2 1 0 3 3 + 3 3 3 3 0 1 + 3 3 3 3 1 0]) + groupsize = NH.GroupSize(6, 2) + node_labels = [1, 1, 2, 2, 3, 3] + g = NH.Observations(A, Categorical(3)) + k = 3 + m = 3 + n = size(A, 1) + a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) + for index in eachindex(realized) + @test all(realized[index] .== a.additional_data.realized[index]) + end + @test loglikelihood(a, g) ≈ 0 + @test a.additional_data.counts == counts + swap_id = (1, 3) + ras = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; + [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; + [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] + realized_after_swap = [ras[I][k] + for k in eachindex(ras[1, 1]), + I in CartesianIndices(ras)] + + swap = NH.make_swap(a, swap_id) + NH.apply_swap!(a, swap) + for j in 1:3 + for i in 1:3 + @test all(realized_after_swap[:, i, j] .== + a.additional_data.realized[:, i, j]) + @test all(a.additional_data.estimated_theta[:, i, j] .≈ + realized_after_swap[:, i, j] ./ counts[i, j]) + end + end + @test loglikelihood(a, g) ≈ 4 * log(0.5) +end diff --git a/test/old_tests/assignments/sum_assignment.jl b/test/old_tests/assignments/sum_assignment.jl new file mode 100644 index 0000000..1a80856 --- /dev/null +++ b/test/old_tests/assignments/sum_assignment.jl @@ -0,0 +1,9 @@ +import NetworkHistogram as NH + +using Random + + +@testset "test sum assignment" begin + using Distributions, LinearAlgebra, SparseArrays + @test 1 == 2 +end diff --git a/test/old_tests/discretised_dist/discretizer.jl b/test/old_tests/discretised_dist/discretizer.jl new file mode 100644 index 0000000..17c90fa --- /dev/null +++ b/test/old_tests/discretised_dist/discretizer.jl @@ -0,0 +1,20 @@ +using NetworkHistogram + +@testset "discretizer" begin + using StaticArrays + reg_disc = NetworkHistogram.RegularDiscretizer( + 10, 0.0, 1.0, 
MVector{10}(1:10), 1 / 10) + cat_disc = NetworkHistogram.CategoryDiscretizer( + Dict([0.0 => 11]), Dict([11 => 0.0])) + hybrid_disc = NetworkHistogram.HybridDiscretizer( + reg_disc, cat_disc) + + @test NetworkHistogram.encode(reg_disc, 0.0) == 1 + @test NetworkHistogram.encode(cat_disc, 0.0) == 11 + @test NetworkHistogram.encode(hybrid_disc, 0.0) == 11 + @test NetworkHistogram.decode(hybrid_disc, 11) == 0.0 + @test all(NetworkHistogram.encode(reg_disc, 0.001:0.001:1.0) .== + NetworkHistogram.encode(hybrid_disc, 0.001:0.001:1.0)) + @test all(NetworkHistogram.decode(hybrid_disc, 1:10) .== + NetworkHistogram.decode(reg_disc, 1:10)) +end diff --git a/test/old_tests/generated_tests/all.jl b/test/old_tests/generated_tests/all.jl new file mode 100644 index 0000000..3a6cd57 --- /dev/null +++ b/test/old_tests/generated_tests/all.jl @@ -0,0 +1,2 @@ +include("test_zero_inflated.jl") +include("test_distribution.jl") diff --git a/test/old_tests/generated_tests/test_distribution.jl b/test/old_tests/generated_tests/test_distribution.jl new file mode 100644 index 0000000..9389428 --- /dev/null +++ b/test/old_tests/generated_tests/test_distribution.jl @@ -0,0 +1,84 @@ +using NetworkHistogram: ZeroInflated, DiscretizedDistribution, + ZeroInflatedCategorical, + ncategories, Discretizer, encode, decode, binwidth, + RegularDiscretizer, + CategoryDiscretizer, HybridDiscretizer, + DiscretizerZeroToZero, nlabels +using Distributions +using Test + +@testset "ZeroInflated" begin + dist = ZeroInflated(0.3, truncated(Normal(0, 1), -3, 3)) + @test pdf(dist, 0) ≈ 0.3 + 0.7 * pdf(truncated(Normal(0, 1), -3, 3), 0) + @test pdf(dist, 1) ≈ 0.7 * pdf(truncated(Normal(0, 1), -3, 3), 1) + @test cdf(dist, 0) ≈ 0.3 + 0.7 * cdf(truncated(Normal(0, 1), -3, 3), 0) + @test cdf(dist, 1) ≈ 0.3 + 0.7 * cdf(truncated(Normal(0, 1), -3, 3), 1) +end + +@testset "DiscretizedDistribution" begin + dist = DiscretizedDistribution(truncated(Normal(0, 1), -3, 3), 10) + @test ncategories(dist) == 10 + @test pdf(dist, 0) 
>= 0 + @test cdf(dist, 0) >= 0 +end + +@testset "ZeroInflatedCategorical" begin + dist = ZeroInflatedCategorical(0.3, Categorical([0.2, 0.3, 0.5])) + @test pdf(dist, 0) ≈ 0.3 + @test pdf(dist, 1) ≈ 0.7 * 0.2 + @test cdf(dist, 0) ≈ 0.3 + @test cdf(dist, 1) ≈ 0.3 + 0.7 * 0.2 +end + +@testset "ZeroInflatedDiscretizedDistribution" begin + dist = ZeroInflated(0.3, truncated(Normal(0, 1), -3, 3)) + disc_dist = DiscretizedDistribution(dist, 10) + @test ncategories(disc_dist) == 10 + @test pdf(disc_dist, 0) >= 0 + @test cdf(disc_dist, 0) >= 0 +end + +@testset "DiscretizedZeroInflatedCategorical" begin + dist = ZeroInflatedCategorical(0.3, Categorical([0.2, 0.3, 0.5])) + disc_dist = DiscretizedDistribution(dist, 10) + @test ncategories(disc_dist) == 10 + @test pdf(disc_dist, 0) >= 0 + @test cdf(disc_dist, 0) >= 0 +end + +@testset "Discretizer" begin + using Distributions + disc = RegularDiscretizer(10, 0.0, 1.0) + @test encode(disc, 0.05) == 1 + @test decode(disc, 1) == (0.0, 0.1) + @test binwidth(disc) == 0.1 + @test nlabels(disc) == 10 +end + +@testset "CategoryDiscretizer" begin + cat_to_bin = Dict("a" => 1, "b" => 2, "c" => 3) + bin_to_cat = Dict(1 => "a", 2 => "b", 3 => "c") + disc = CategoryDiscretizer(cat_to_bin, bin_to_cat) + @test encode(disc, "a") == 1 + @test decode(disc, 1) == "a" + @test nlabels(disc) == 3 +end + +@testset "HybridDiscretizer" begin + atoms = [0.0, 1.0] + disc = HybridDiscretizer(10, -1.0, 1.0, atoms) + @test encode(disc, 0.0) == 11 + @test encode(disc, 0.5) == 8 + @test decode(disc, 11) == 0.0 + @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol = 1e-2)) + @test nlabels(disc) == 12 +end + +@testset "DiscretizerZeroToZero" begin + disc = DiscretizerZeroToZero(10, -1.0, 1.0) + @test encode(disc, 0.0) == 0 + @test encode(disc, 0.5) == 8 + @test decode(disc, 0) == 0.0 + @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol = 1e-2)) + @test nlabels(disc) == 11 +end diff --git a/test/old_tests/generated_tests/test_zero_inflated.jl 
b/test/old_tests/generated_tests/test_zero_inflated.jl new file mode 100644 index 0000000..380e80c --- /dev/null +++ b/test/old_tests/generated_tests/test_zero_inflated.jl @@ -0,0 +1,97 @@ +using Test +using Distributions +using Random +using NetworkHistogram: ZeroInflated, get_proba_zero + +@testset "ZeroInflated Distribution Tests" begin + @testset "continuous distribution" begin + # Test construction + dist = Normal(0, 1) + zero_inflated_dist = ZeroInflated(0.5, dist) + @test zero_inflated_dist.edge_proba == Bernoulli(0.5) + @test zero_inflated_dist.dist == dist + + # Test pdf + @test pdf(zero_inflated_dist, 0) ≈ 0.5 + 0.5 * pdf(dist, 0) + @test pdf(zero_inflated_dist, 1) ≈ 0.5 * pdf(dist, 1) + + # Test get_proba_zero + @test get_proba_zero(zero_inflated_dist) == 0.5 + + # Test rand + rng = MersenneTwister(1234) + sample = rand(rng, zero_inflated_dist) + @test sample == 0 || insupport(dist, sample) + + # Test logpdf + @test logpdf(zero_inflated_dist, 0) ≈ log(0.5 * (1 + pdf(dist, 0))) + @test logpdf(zero_inflated_dist, 1) ≈ log(0.5 * pdf(dist, 1)) + + # Test minimum and maximum + @test minimum(zero_inflated_dist) == minimum(dist) + @test maximum(zero_inflated_dist) == maximum(dist) + + # Test insupport + @test insupport(zero_inflated_dist, 0) + @test insupport(zero_inflated_dist, 1) == insupport(dist, 1) + + # Test cdf + @test cdf(zero_inflated_dist, 0) ≈ 0.5 + 0.5 * cdf(dist, 0) + @test cdf(zero_inflated_dist, 1) ≈ 0.5 + 0.5 * cdf(dist, 1) + + # Test params + @test params(zero_inflated_dist) == (0.5, params(dist)...) 
+ + # Test fit + data = [0, 0, 1, 2, 3] + fitted_dist = fit(ZeroInflated{Bernoulli, Normal}, data, 2) + @test fitted_dist.edge_proba == Bernoulli(0.6) + @test fitted_dist.dist isa Normal + end + + @testset "discrete distribution" begin + # Test construction with discrete distribution + dist_disc = Poisson(3) + zero_inflated_dist_disc = ZeroInflated(0.5, dist_disc) + @test zero_inflated_dist_disc.edge_proba == Bernoulli(0.5) + @test zero_inflated_dist_disc.dist == dist_disc + + # Test pdf with discrete distribution + @test pdf(zero_inflated_dist_disc, 0) ≈ 0.5 + 0.5 * pdf(dist_disc, 0) + @test pdf(zero_inflated_dist_disc, 1) ≈ 0.5 * pdf(dist_disc, 1) + + # Test get_proba_zero with discrete distribution + @test get_proba_zero(zero_inflated_dist_disc) == 0.5 + + # Test rand with discrete distribution + rng = MersenneTwister(1234) + sample_disc = rand(rng, zero_inflated_dist_disc) + @test sample_disc == 0 || insupport(dist_disc, sample_disc) + + # Test logpdf with discrete distribution + @test logpdf(zero_inflated_dist_disc, 0) ≈ + log(0.5 * (1 + pdf(dist_disc, 0))) + @test logpdf(zero_inflated_dist_disc, 1) ≈ log(0.5 * pdf(dist_disc, 1)) + + # Test minimum and maximum with discrete distribution + @test minimum(zero_inflated_dist_disc) == minimum(dist_disc) + @test maximum(zero_inflated_dist_disc) == maximum(dist_disc) + + # Test insupport with discrete distribution + @test insupport(zero_inflated_dist_disc, 0) + @test insupport(zero_inflated_dist_disc, 1) == insupport(dist_disc, 1) + + # Test cdf with discrete distribution + @test cdf(zero_inflated_dist_disc, 0) ≈ 0.5 + 0.5 * cdf(dist_disc, 0) + @test cdf(zero_inflated_dist_disc, 1) ≈ 0.5 + 0.5 * cdf(dist_disc, 1) + + # Test params with discrete distribution + @test params(zero_inflated_dist_disc) == (0.5, params(dist_disc)...) 
+ + # Test fit with discrete distribution + data_disc = [0, 0, 1, 2, 3] + fitted_dist_disc = fit(ZeroInflated{Bernoulli, Poisson}, data_disc, 2) + @test fitted_dist_disc.edge_proba == Bernoulli(0.6) + @test fitted_dist_disc.dist isa Poisson + end +end diff --git a/test/old_tests/observations/discretisation.jl b/test/old_tests/observations/discretisation.jl new file mode 100644 index 0000000..49eb959 --- /dev/null +++ b/test/old_tests/observations/discretisation.jl @@ -0,0 +1,15 @@ +using NetworkHistogram + +@testset "discretisation" begin + using Distributions + A = rand(-1:1, 20, 20) + for i in 1:20 + A[i, i] = 0 + end + g = Observations(A, Uniform(-1, 1)) + discretised_g, discretizer = discretise(g; number_levels = 6) + @test size(discretised_g.graph) == size(g.graph) + @test discretised_g.dist_ref isa NetworkHistogram.DiscretizedDistribution + @test ncategories(discretised_g.dist_ref) == 6 + @test all(discretised_g.graph .∈ Ref(0:6)) +end diff --git a/test/old_tests/optimisation/config_rules/init_rule.jl b/test/old_tests/optimisation/config_rules/init_rule.jl new file mode 100644 index 0000000..f304378 --- /dev/null +++ b/test/old_tests/optimisation/config_rules/init_rule.jl @@ -0,0 +1,46 @@ +import NetworkHistogram as NH + +@testset "regression test" begin + Random.seed!(1234123) + using Distributions: Bernoulli + A = BitMatrix([0 0 1 0 1 0 1 1 0 1 + 0 0 1 1 1 1 1 1 0 0 + 1 1 0 1 0 0 0 0 1 0 + 0 1 1 0 1 0 1 0 0 0 + 1 1 0 1 0 0 1 0 0 1 + 0 1 0 0 0 0 0 1 0 0 + 1 1 0 1 1 0 0 1 0 1 + 1 1 0 0 0 1 1 0 0 1 + 0 0 1 0 0 0 0 0 0 1 + 1 0 0 0 1 0 1 1 1 0]) + h_true_nethist = 2.643731 # version 0.2.3 from nethist package + k_true = 3 + obs = NH.Observations(A, Bernoulli(0.5)) + @testset "degrees" begin + k = NH.select_number_node_per_block(obs, NH.EstimatedDegrees()) + @test k == k_true + end + @testset "eigenvalues" begin + k = NH.select_number_node_per_block(obs, NH.EstimatedEigenvalues()) + @test k == k_true + end +end + +@testset "test oracle K" begin + 
Random.seed!(1234123) + using Distributions: Bernoulli + A = [0 1 1 1 0 0 1 0 + 1 0 1 1 0 0 0 0 + 1 1 0 0 0 0 0 0 + 1 1 0 0 0 0 0 1 + 0 0 0 0 0 1 1 1 + 0 0 0 0 1 0 1 1 + 1 0 0 0 1 1 0 0 + 0 0 0 1 1 1 0 0] + obs = NH.Observations(A, Bernoulli(0.5)) + oracle = NH.OracleH(4) + @test NH.select_number_node_per_block(obs, oracle) == 4 + err = ArgumentError("The number of nodes per block 5 is too large for the \ + number of nodes 8, it should be at most 4") + @test_throws err NH.select_number_node_per_block(obs, NH.OracleH(5)) +end diff --git a/test/old_tests/runtests.jl b/test/old_tests/runtests.jl new file mode 100644 index 0000000..8c1ee77 --- /dev/null +++ b/test/old_tests/runtests.jl @@ -0,0 +1,38 @@ +using Test +using Aqua +using SparseArrays +include("TestNetworkHistogram.jl") + +@testset "Tests" begin + @testset "Discretizer tests" begin + include("discretised_dist/discretizer.jl") + end + @testset "Assignment tests" begin + include("assignments/default_assignment.jl") + include("assignments/bernoulli_assignment.jl") + include("assignments/categorical_assignment.jl") + include("assignments/sparse_assignment.jl") + include("assignments/sum_assignment.jl") + end + + @testset "Rule optimization tests" begin + include("optimisation/config_rules/init_rule.jl") + end + + @testset "Observations tests" begin + include("observations/discretisation.jl") + end + + @testset "API tests" begin + include("test_api.jl") + end + + @testset "Generated tests" begin + include("generated_tests/all.jl") + end + + # @testset "Aqua.jl for package quality" begin + # using NetworkHistogram + # Aqua.test_all(NetworkHistogram) + # end +end diff --git a/test/old_tests/test_api.jl b/test/old_tests/test_api.jl new file mode 100644 index 0000000..48fdefb --- /dev/null +++ b/test/old_tests/test_api.jl @@ -0,0 +1,19 @@ +@testset "test api" begin + using Distributions + A = rand(-1:1, 40, 40) + for i in 1:40 + A[i, i] = 0 + end + + g = Observations(Symmetric(A), Uniform(-1, 1)) + sbm_fitted, a = 
nethist(g; h = 10, iterations = 10) + + @test eltype(sbm_fitted) == typeof(Uniform(-1, 1)) + @test size(sbm_fitted) == (4, 4) + + sbm_discretised, a, discretizer = nethist_discretised( + g; number_levels = 5, h = 10, iterations = 10) + @test sbm_discretised[1, 1] isa DiscretizedDistribution + @test ncategories(sbm_discretised[1, 1]) == 5 + @test size(sbm_discretised) == (4, 4) +end From c7995a65d9bab90cae9db6ef43496820b4f5e67a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 28 Apr 2025 17:28:57 +0200 Subject: [PATCH 136/266] start testing --- ext/DistributionsExt.jl | 8 ++++++ src/EdgeList.jl | 5 ++++ src/api.jl | 2 ++ src/block_model.jl | 44 ++++++++++++++++++++++++++++----- src/optimization/greedy.jl | 2 +- test/runtests.jl | 24 ++---------------- test/test_data_format.jl | 22 +++++++++++++++++ test/test_distributions_type.jl | 12 +++++++++ 8 files changed, 90 insertions(+), 29 deletions(-) create mode 100644 ext/DistributionsExt.jl create mode 100644 test/test_data_format.jl create mode 100644 test/test_distributions_type.jl diff --git a/ext/DistributionsExt.jl b/ext/DistributionsExt.jl new file mode 100644 index 0000000..13fba05 --- /dev/null +++ b/ext/DistributionsExt.jl @@ -0,0 +1,8 @@ +module DistributionsExt + + using NetworkHistogram + using StatsBase + import NetworkHistogram: fast_ll_update! 
+ import Distributions: logpdf + +end diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 2f57477..21dcb2a 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -28,3 +28,8 @@ function EdgeList(A::AbstractMatrix{E}) where {E} end return EdgeList(data) end + + +function Base.convert(::Type{EdgeList{E}}, A::AbstractMatrix{E}) where {E} + return EdgeList(A) +end diff --git a/src/api.jl b/src/api.jl index e77b753..99cd71e 100644 --- a/src/api.jl +++ b/src/api.jl @@ -3,6 +3,7 @@ function nethist(data_input, dist_user, initial_node_labels, params::GreedyParam dist = Dist(dist_user) g = preprocess_data(data_input, dist) + out = greedy_optimize(g, initial_node_labels, params) return postprocess(out) @@ -16,5 +17,6 @@ end function postprocess(out) + return true return BlockModel(optimal_a) end diff --git a/src/block_model.jl b/src/block_model.jl index c824d92..9cb2e79 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -1,13 +1,45 @@ -struct BlockModel{D,K,T} +struct BlockModel{D, K, T} _dists::SymArray{D} - sizes::SVector{K,T} + sizes::SVector{K, T} + cum_sizes::Vector{T} +end + +function BlockModel(k::Int, d::D) where {D} + sizes = @SVector fill(1/k, k) + cumulative_sizes = cumsum(sizes) + _dists = SymArray(k, d) + return BlockModel{D, k, Float64}(_dists, sizes, cumulative_sizes) +end + + +function sample(bm::BlockModel, latents::Vector{T}) where {T} + #fuck need the element type of the distribution... 
+end + + +# this is probably awfull + +function Base.getindex(s::BlockModel, i::Int, j::Int) + return s._dists[i, j] +end + +function Base.setindex!(s::BlockModel, v, i::Int, j::Int) + s._dists[i, j] = v +end + +function Base.size(s::BlockModel) + return (s._dists.k, s._dists.k) end -Base.@propagate_inbounds function Base.getindex(s::BlockModel, i, j) - return s._dists[minmax(i, j)] +function Base.getindex(s::BlockModel, i::Real, j::Real) + k = findfirst(x -> x ≥ i, s.cum_sizes) + l = findfirst(x -> x ≥ j, s.cum_sizes) + return s._dists[k, l] end -function Base.setindex!(s::BlockModel, v, i, j) - s._dists[minmax(i, j)] = v +function Base.setindex!(s::BlockModel, v, i::Real, j::Real) + k = findfirst(x -> x ≥ i, s.cum_sizes) + l = findfirst(x -> x ≥ j, s.cum_sizes) + s._dists[k, l] = v end diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index 8983b54..8d4ec9b 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -12,7 +12,7 @@ end GreedyParams() = GreedyParams(10_000, RandomNodeSwap(), Strict(), PreviousBestValue(1000), false) function greedy_optimize(g, initial_labels, params::GreedyParams) - a = Assignment(g, initial_labels) + a = Assignment(initial_labels, g...) 
greedy_improve!(a; params = params) return a end diff --git a/test/runtests.jl b/test/runtests.jl index 069b837..5dd564b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,27 +2,7 @@ using Test using LinearAlgebra, SparseArrays using NetworkHistogram @testset "Tests" begin - @testset "test can run" begin - @test 1 == 1 - end - - - @testset "Edge list tests" begin - A = Symmetric(sprand(20,20,0.5)) - edgelist = EdgeList(A) - - for j in 1:20 - for i in 1:20 - if A[i,j] != 0 - nv_j, val_j = neighbors(edgelist, j) - @test i in nv_j - @test A[i,j] == val_j[findfirst(x -> x == i, nv_j)] - end - end - end - - @test eltype(edgelist) == eltype(A) - @test nodes(edgelist) == size(A,1) - end + include("test_data_format.jl") + include("test_distributions_type.jl") end diff --git a/test/test_data_format.jl b/test/test_data_format.jl new file mode 100644 index 0000000..bdc68e4 --- /dev/null +++ b/test/test_data_format.jl @@ -0,0 +1,22 @@ +@testset "Edge list tests" begin + using Random + Random.seed!(1234) + A = Symmetric(sprand(20,20,0.5)) + edgelist = EdgeList(A) + + for j in 1:20 + nv_j, val_j = neighbors(edgelist, j) + for i in 1:20 + if A[i,j] == 0 + @test i ∉ nv_j + end + if A[i,j] != 0 + @test i in nv_j + @test A[i,j] == val_j[findfirst(x -> x == i, nv_j)] + end + end + end + + @test eltype(edgelist) == eltype(A) + @test nodes(edgelist) == size(A,1) +end diff --git a/test/test_distributions_type.jl b/test/test_distributions_type.jl new file mode 100644 index 0000000..b8c53e9 --- /dev/null +++ b/test/test_distributions_type.jl @@ -0,0 +1,12 @@ +@testset "Distribution tests" begin + import NetworkHistogram as NH + d1 = NH.Bernoulli(0.5) + d2 = NH.Bernoulli(0.7) + my_d = NH.Dist(d1) + d_avg = NH.add_to(my_d, d2) + @test d_avg.counts == 2 + @test d_avg.dist.p == 0.6 + d_removed = NH.remove_from(d_avg, d2) + @test d_removed.counts == 1 + @test d_removed.dist == d1 +end From fede79b81c196a817db6d9ae0ab9b192ab76100d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 30 
Apr 2025 12:19:43 +0200 Subject: [PATCH 137/266] issues with zero vs non-edge --- src/EdgeList.jl | 18 +++++++++++++- src/assignment.jl | 39 +++++++++++++++--------------- src/distributions_type.jl | 8 +++++- src/optimization/swap_workspace.jl | 6 ++--- src/utils/SymArray.jl | 10 ++++++-- test/test_data_format.jl | 2 +- 6 files changed, 56 insertions(+), 27 deletions(-) diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 21dcb2a..41c1723 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -6,7 +6,11 @@ function neighbors(A::EdgeList{E}, i::Int) where {E} return first.(A.data[i]), last.(A.data[i]) end -function Base.eltype(edgelist::EdgeList{E}) where {E} +function iterate_neighbors(A::EdgeList{E}, i::Int) where {E} + return zip(first.(A.data[i]), last.(A.data[i])) +end + +function edge_type(edgelist::EdgeList{E}) where {E} return E end @@ -33,3 +37,15 @@ end function Base.convert(::Type{EdgeList{E}}, A::AbstractMatrix{E}) where {E} return EdgeList(A) end + + +function fit(d::Dist, A::EdgeList{E}) where {E} + new_data = Vector{Vector{Tuple{Int, typeof(d)}}}(undef, length(A.data)) + for j in 1:length(A.data) + new_data[j] = Vector{Tuple{Int, typeof(d)}}(undef, length(A.data[j])) + for (k,(i, e)) in enumerate(A.data[j]) + new_data[j][k] = (i, fit(d, e)) + end + end + return EdgeList(new_data) +end diff --git a/src/assignment.jl b/src/assignment.jl index 7c0dce5..603197e 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -38,9 +38,7 @@ Base.@propagate_inbounds function Base.getindex( return i < length(g) ? 
g.group_number[1] : g.group_number[2] end - - -mutable struct Assignment{E, D, A, F} +mutable struct Assignment{E, D, F} node_labels::AbstractVector{Int} const edges::EdgeList{E} const dists::EdgeList{D} @@ -48,7 +46,6 @@ mutable struct Assignment{E, D, A, F} log_likelihood::SymArray{F} end - function loglikelihood(a::Assignment) return sum(a.log_likelihood) end @@ -58,48 +55,52 @@ function group(a::Assignment, node::Int) end function get_edges_in_groups(a::Assignment, g1::Int, g2::Int) - nodes_g1 = findall(x -> x == g1, a.node_labels) - edges = Vector(eltype(a.edges), 0) + return get_edges_in_groups(a.node_labels, a.edges, g1, g2) +end + +function get_edges_in_groups(node_labels, edges_all, g1, g2) + nodes_g1 = findall(x -> x == g1, node_labels) + edges = Vector{edge_type(edges_all)}(undef, 0) if g1 == g2 - for u in nodes_g1 - for (v, e) in a.edges[u] + for u in nodes_g1 + for (v, e) in iterate_neighbors(edges_all, u) if v in nodes_g1 && u < v push!(edges, e) end end end else - nodes_g2 = findall(x -> x == g2, a.node_labels) + nodes_g2 = findall(x -> x == g2, node_labels) for u in nodes_g1 - for (v, e) in a.edges[u] + for (v, e) in iterate_neighbors(edges_all, u) if v in nodes_g2 push!(edges, e) end end end end - return edges + return edges end - function Assignment(node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} - dists = fit.(dist ,edge_list) + dists = fit(dist, edge_list) number_groups = length(unique(node_labels)) θ = SymArray(number_groups, dist) log_likelihood = SymArray(number_groups, 0.0) - for u in 1:length(dists) + for u in 1:nodes(dists) g1 = node_labels[u] - for (v,d) in neighbors(dists, u) + for (v, d) in iterate_neighbors(dists, u) g2 = node_labels[v] - if g1 == g2 && u < v - continue + if u < v + θ[g1, g2] = add_to(θ[g1, g2], d) end - θ[g1, g2] = add_to(θ[g1, g2], d) end end for k in 1:number_groups for l in k:number_groups - log_likelihood[k, l] = loglikelihood(θ[k,l], get_edges_in_groups(edge_list, k, l)) + log_likelihood[k, + l] 
= loglikelihood( + θ[k, l], get_edges_in_groups(node_labels, edge_list, k, l)) end end return Assignment(node_labels, edge_list, dists, θ, log_likelihood) diff --git a/src/distributions_type.jl b/src/distributions_type.jl index 7402941..1f45485 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -20,7 +20,12 @@ function remove_from(avgdist::Dist{D}, dist::D) where {D} end end -for f in [:logpdf, :sample, :dist, :eltype] + +# probably need to update to account for counts in second dist? will that mess the other code? +add_to(d::Dist, dist::Dist) = add_to(d, dist.dist) +remove_from(d::Dist, dist::Dist) = remove_from(d, dist.dist) + +for f in [:logpdf, :sample, :dist, :eltype, :params] @eval $f(d::Dist, args...) = $f(d.dist, args...) end @@ -45,3 +50,4 @@ fit(::Bernoulli, x) = Bernoulli(mean(x)) sample(d::Bernoulli, n=1) = rand(n) .<= d.p dist(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) +params(d::Bernoulli) = (d.p,) diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index ba99540..8dba343 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -1,12 +1,12 @@ -mutable struct WorkspaceSwap{D,K,F} +mutable struct WorkspaceSwap{D,F} θ::SymArray{D} log_likelihood_per_group::SymArray{F} end -mutable struct Swap{D, F} +mutable struct Swap{W} u::Int v::Int - workspace::WorkspaceSwap{D,F} + workspace::W end diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 7d71e4a..09155e1 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -2,7 +2,7 @@ module FastSymArray export SymArray - mutable struct SymArray{F} + mutable struct SymArray{F} <: AbstractArray{F, 2} d::Dict{Tuple{Int, Int}, F} k::Int end @@ -13,11 +13,17 @@ module FastSymArray for j in i:k), k) end + function Base.size(a::SymArray) + return (a.k, a.k) + end + Base.@propagate_inbounds function Base.getindex(a::SymArray, i, j) + @boundscheck 
checkbounds(a, i, j) return a.d[minmax(i, j)] end - function Base.setindex!(a::SymArray, v, i, j) + Base.@propagate_inbounds function Base.setindex!(a::SymArray, v, i, j) + @boundscheck checkbounds(a, i, j) a.d[minmax(i, j)] = v end diff --git a/test/test_data_format.jl b/test/test_data_format.jl index bdc68e4..6bcdbf6 100644 --- a/test/test_data_format.jl +++ b/test/test_data_format.jl @@ -17,6 +17,6 @@ end end - @test eltype(edgelist) == eltype(A) + @test NetworkHistogram.edge_type(edgelist) == eltype(A) @test nodes(edgelist) == size(A,1) end From 3e88e699eb56eed74371483d139f8c2bccce6ad0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 30 Apr 2025 14:23:51 +0200 Subject: [PATCH 138/266] only optimize on missing, need to do it for bernoulli --- src/EdgeList.jl | 4 ++-- src/optimization/swap_workspace.jl | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 41c1723..9b397f9 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -19,13 +19,13 @@ function nodes(edgelist::EdgeList{E}) where {E} end -function EdgeList(A::AbstractMatrix{E}) where {E} +function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} n = size(A, 1) data = Vector{Vector{Tuple{Int,E}}}(undef, n) for j in 1:n data[j] = Vector{Tuple{Int,E}}(undef, 0) for i in 1:n - if A[i, j] != 0 + if !ismissing(A[i,j]) push!(data[j], (i, A[i, j])) end end diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 8dba343..ea2c0aa 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -15,7 +15,7 @@ function make_swap(a::Assignment, id) end function make_swap!(swap::Swap, a::Assignment, id) - swap.index1, swap.index2 = id + swap.u, swap.v = id swap.workspace.θ = deepcopy(a.θ) swap.workspace.log_likelihood_per_group = deepcopy(a.log_likelihood) end @@ -31,22 +31,23 @@ function swap_node_labels!(a::Assignment, i, j) end function apply_swap!(a::Assignment, s::Swap) - 
g1 = get_group_of_vertex(a, s.index1) - g2 = get_group_of_vertex(a, s.index2) + g1 = group(a, s.u) + g2 = group(a, s.v) groups_concerned = Set([minmax(g1, g2)]) - for (u, g_old, g_new) in [(s.index1, g1, g2), (s.index2, g2, g1)] + for (u, g_old, g_new) in [(s.u, g1, g2), (s.v, g2, g1)] # iterate over neighbors of u and get the decoration of the edge - for (v,d) in a.dists[u] - g_v = get_group_of_vertex(a, v) + for (v,d) in iterate_neighbors(a.dists, u) + g_v = group(a, v) a.θ[g_old, g_v] = remove_from(a.θ[g_old, g_v], d) a.θ[g_new, g_v] = add_to(a.θ[g_new, g_v], d) push!(groups_concerned, minmax(g_new, g_v)) push!(groups_concerned, minmax(g_old, g_v)) end end + println("Groups concerned: ", groups_concerned) fast_ll_update!(a, groups_concerned) - swap_node_labels!(a, s.index1, s.index2) + swap_node_labels!(a, s.u, s.v) end @@ -65,7 +66,7 @@ function _fast_ll_one_group(a::Assignment, g1, g2) ll = 0.0 d = a.θ[g1, g2] for u in nodes_g1 - for (v,e) in a.edges[u] # assume implicitly that g1 != g2 + for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 if v in nodes_g2 ll += loglikelihood(d, e) end From f4e7fa2a5a4375de769393e4f5b8813019f1db94 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 30 Apr 2025 17:00:05 +0200 Subject: [PATCH 139/266] remove old code --- src/old_messy/api.jl | 155 ----------- src/old_messy/assignments/Assignments.jl | 104 -------- .../assignments/BernoulliAssignment/struct.jl | 118 --------- .../assignments/BernoulliAssignment/swap.jl | 99 ------- .../CategoricalAssignment/struct.jl | 128 --------- .../assignments/CategoricalAssignment/swap.jl | 201 -------------- .../assignments/SparseAssignment/struct.jl | 115 -------- .../assignments/SparseAssignment/swap.jl | 134 ---------- .../assignments/SumAssignment/struct.jl | 94 ------- .../assignments/SumAssignment/swap.jl | 119 --------- src/old_messy/assignments/group_numbering.jl | 86 ------ src/old_messy/assignments/include.jl | 4 - src/old_messy/bootstrap.jl | 15 -- 
.../distributions/categorical_with_0.jl | 141 ---------- src/old_messy/distributions/discrete_dist.jl | 142 ---------- src/old_messy/distributions/discretizer.jl | 248 ------------------ src/old_messy/distributions/include.jl | 5 - src/old_messy/distributions/markov_chain.jl | 144 ---------- src/old_messy/distributions/utils.jl | 19 -- src/old_messy/distributions/zero_inflated.jl | 93 ------- src/old_messy/observations.jl | 231 ---------------- .../optimisation/config_rules/InitRule.jl | 99 ------- .../optimisation/config_rules/accept_rule.jl | 25 -- .../config_rules/bandwidth_selection_rule.jl | 101 ------- .../optimisation/config_rules/include.jl | 5 - .../optimisation/config_rules/stop_rule.jl | 49 ---- .../optimisation/config_rules/swap_rule.jl | 27 -- src/old_messy/optimisation/fit.jl | 99 ------- src/old_messy/optimisation/include.jl | 3 - src/old_messy/optimisation/least_squares.jl | 95 ------- src/old_messy/optimisation/swap.jl | 26 -- src/old_messy/sbm.jl | 186 ------------- test/old_tests/TestNetworkHistogram.jl | 30 --- .../assignments/bernoulli_assignment.jl | 42 --- .../assignments/categorical_assignment.jl | 126 --------- .../assignments/default_assignment.jl | 17 -- .../assignments/sparse_assignment.jl | 120 --------- test/old_tests/assignments/sum_assignment.jl | 9 - .../old_tests/discretised_dist/discretizer.jl | 20 -- test/old_tests/generated_tests/all.jl | 2 - .../generated_tests/test_distribution.jl | 84 ------ .../generated_tests/test_zero_inflated.jl | 97 ------- test/old_tests/observations/discretisation.jl | 15 -- .../optimisation/config_rules/init_rule.jl | 46 ---- test/old_tests/runtests.jl | 38 --- test/old_tests/test_api.jl | 19 -- 46 files changed, 3775 deletions(-) delete mode 100644 src/old_messy/api.jl delete mode 100644 src/old_messy/assignments/Assignments.jl delete mode 100644 src/old_messy/assignments/BernoulliAssignment/struct.jl delete mode 100644 src/old_messy/assignments/BernoulliAssignment/swap.jl delete mode 100644 
src/old_messy/assignments/CategoricalAssignment/struct.jl delete mode 100644 src/old_messy/assignments/CategoricalAssignment/swap.jl delete mode 100644 src/old_messy/assignments/SparseAssignment/struct.jl delete mode 100644 src/old_messy/assignments/SparseAssignment/swap.jl delete mode 100644 src/old_messy/assignments/SumAssignment/struct.jl delete mode 100644 src/old_messy/assignments/SumAssignment/swap.jl delete mode 100644 src/old_messy/assignments/group_numbering.jl delete mode 100644 src/old_messy/assignments/include.jl delete mode 100644 src/old_messy/bootstrap.jl delete mode 100644 src/old_messy/distributions/categorical_with_0.jl delete mode 100644 src/old_messy/distributions/discrete_dist.jl delete mode 100644 src/old_messy/distributions/discretizer.jl delete mode 100644 src/old_messy/distributions/include.jl delete mode 100644 src/old_messy/distributions/markov_chain.jl delete mode 100644 src/old_messy/distributions/utils.jl delete mode 100644 src/old_messy/distributions/zero_inflated.jl delete mode 100644 src/old_messy/observations.jl delete mode 100644 src/old_messy/optimisation/config_rules/InitRule.jl delete mode 100644 src/old_messy/optimisation/config_rules/accept_rule.jl delete mode 100644 src/old_messy/optimisation/config_rules/bandwidth_selection_rule.jl delete mode 100644 src/old_messy/optimisation/config_rules/include.jl delete mode 100644 src/old_messy/optimisation/config_rules/stop_rule.jl delete mode 100644 src/old_messy/optimisation/config_rules/swap_rule.jl delete mode 100644 src/old_messy/optimisation/fit.jl delete mode 100644 src/old_messy/optimisation/include.jl delete mode 100644 src/old_messy/optimisation/least_squares.jl delete mode 100644 src/old_messy/optimisation/swap.jl delete mode 100644 src/old_messy/sbm.jl delete mode 100644 test/old_tests/TestNetworkHistogram.jl delete mode 100644 test/old_tests/assignments/bernoulli_assignment.jl delete mode 100644 test/old_tests/assignments/categorical_assignment.jl delete mode 100644 
test/old_tests/assignments/default_assignment.jl delete mode 100644 test/old_tests/assignments/sparse_assignment.jl delete mode 100644 test/old_tests/assignments/sum_assignment.jl delete mode 100644 test/old_tests/discretised_dist/discretizer.jl delete mode 100644 test/old_tests/generated_tests/all.jl delete mode 100644 test/old_tests/generated_tests/test_distribution.jl delete mode 100644 test/old_tests/generated_tests/test_zero_inflated.jl delete mode 100644 test/old_tests/observations/discretisation.jl delete mode 100644 test/old_tests/optimisation/config_rules/init_rule.jl delete mode 100644 test/old_tests/runtests.jl delete mode 100644 test/old_tests/test_api.jl diff --git a/src/old_messy/api.jl b/src/old_messy/api.jl deleted file mode 100644 index d09a6f3..0000000 --- a/src/old_messy/api.jl +++ /dev/null @@ -1,155 +0,0 @@ -""" - _default_init(dist::Distribution, start = MetisStart()) - -Initialize the distribution with a default rule. - -# Arguments -- `dist::Distribution`: The distribution to initialize. -- `start`: The starting method. - -# Returns -- `InitRule`: The initialization rule. -""" -function _default_init(dist::Distribution, start = MetisStart()) - if dist isa Bernoulli - return InitRule(start, Val{BernoulliData}()) - elseif dist isa Categorical - return InitRule(start, Val{CategoricalData}()) - elseif dist isa DiscretizedDistribution || dist isa ZeroInflatedCategorical - return InitRule(start, Val{SparseData}()) - else - return InitRule(start, nothing) - end -end - -function _default_init(::DiscreteMarkovChain, start = RandomStart()) - return InitRule(start, Val{SumData}()) -end - -""" - _nethist(g::Observations{G, D}, h; kwargs...) - -Estimate the graphon and fit the model to the given graph observations. - -# Arguments -- `g::Observations{G, D}`: The graph observations. -- `h`: Number of nodes per block. -- `kwargs...`: Additional keyword arguments. - -# Returns -- `fit_model`: The fitted model. -- `a`: The assignment of nodes to blocks. 
-""" -function _nethist(g::Observations{G, D}, h; kwargs...) where {G, D} - kwargs_dict = Dict(kwargs) - start_clustering = pop!(kwargs_dict, :start_clustering, MetisStart()) - initialise_rule = pop!( - kwargs_dict, :initialise_rule, _default_init( - g.dist_ref, start_clustering)) - a = estimate_graphon(g, h; - kwargs_dict..., initialise_rule = initialise_rule) - return fit(a, g), a -end - -""" - nethist(g::Observations{G, D}; h, iterations, stalled_iter, swap_rule, accept_rule, progress_bar, start_clustering) - -Fit a Stochastic Block Model (SBM) to the given graph observations. - -# Arguments -- `g::Observations{G, D}`: The graph observations. -- `h`: Number of nodes per block. -- `iterations`: Maximum number of iterations. -- `stalled_iter`: Number of stalled iterations before stopping. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -- `progress_bar::Bool`: Whether to show a progress bar. -- `start_clustering`: Initial clustering method. - -# Returns -- `sbm`: The fitted SBM. -- `a`: The assignment of nodes to blocks. -""" -function nethist(g::Observations{G, D}; - h = select_number_node_per_block(g, EstimatedDegrees()), - iterations = 100_000, - stalled_iter = 1000, - swap_rule::NodeSwapRule = RandomGroupSwap(), - accept_rule::AcceptRule = Strict(), - progress_bar::Bool = false, - start_clustering = MetisStart() -) where {G, D} - return _nethist(g, h; - iterations = iterations, - swap_rule = swap_rule, - accept_rule = accept_rule, - stop_rule = PreviousBestValue(stalled_iter), - progress_bar = progress_bar, - start_clustering = start_clustering) -end - -""" - nethist_discretised(g::Observations{G, D}; number_levels, h, iterations, stalled_iter, swap_rule, accept_rule, progress_bar, start_clustering) - -Fit a discretised Stochastic Block Model (SBM) to the given graph observations. - -# Arguments -- `g::Observations{G, D}`: The graph observations. 
-- `number_levels`: Number of levels for discretisation. -- `h`: Number of nodes per block. -- `iterations`: Maximum number of iterations. -- `stalled_iter`: Number of stalled iterations before stopping. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -- `progress_bar::Bool`: Whether to show a progress bar. -- `start_clustering`: Initial clustering method. - -# Returns -- `sbm_discretise`: The fitted discretised SBM. -- `a`: The assignment of nodes to blocks. -- `discretiser`: The discretiser used. -""" -function nethist_discretised(g::Observations{G, D}; - number_levels = nothing, - h = select_number_node_per_block(g, EstimatedDegrees()), - iterations = 100_000, - stalled_iter = 1000, - swap_rule::NodeSwapRule = RandomGroupSwap(), - accept_rule::AcceptRule = Strict(), - progress_bar::Bool = false, - start_clustering = MetisStart() -) where {G, D} - num_groups = isnothing(number_levels) ? number_nodes(g) ÷ h : nothing - obs_discrete, discretiser = discretise( - g, number_groups = num_groups, number_levels = number_levels) - sbm_discretise, a = _nethist(obs_discrete, h; - iterations = iterations, - swap_rule = swap_rule, - accept_rule = accept_rule, - stop_rule = PreviousBestValue(stalled_iter), - progress_bar = progress_bar, - start_clustering = start_clustering) - return sbm_discretise, a, discretiser -end - - - -function nethist_mc(g::Observations{G, <:DiscreteMarkovChain}; - h = number_nodes(g) ÷ 2, - iterations = 100_000, - stalled_iter = 1000, - swap_rule::NodeSwapRule = RandomGroupSwap(), - accept_rule::AcceptRule = Strict(), - progress_bar::Bool = true, - start_clustering = RandomStart() -) where {G} - initialise_rule = _default_init(g.dist_ref, start_clustering) - a = estimate_graphon(g, h; - iterations = iterations, - initialise_rule = initialise_rule, - swap_rule = swap_rule, - accept_rule = accept_rule, - stop_rule = PreviousBestValue(stalled_iter), - progress_bar = progress_bar) - return 
fit(a, g), a -end diff --git a/src/old_messy/assignments/Assignments.jl b/src/old_messy/assignments/Assignments.jl deleted file mode 100644 index c312324..0000000 --- a/src/old_messy/assignments/Assignments.jl +++ /dev/null @@ -1,104 +0,0 @@ -include("group_numbering.jl") - -""" - struct Assignment{T, B} <: AbstractVector{Vector{Int}} - -A structure representing an assignment of nodes to groups. - -# Fields -- `group_size::GroupSize{T}`: The size of each group. -- `node_labels::Vector{Int}`: A vector of node labels. -- `additional_data::B`: Additional data associated with the assignment. - -# Constructor - Assignment(group_size::GroupSize{T}, node_labels, additional_data::B) where {T, B} - -Creates a new `Assignment` instance. - -# Arguments -- `group_size::GroupSize{T}`: The size of each group. -- `node_labels::Vector{Int}`: A vector of node labels. The length of this vector must be equal to the sum of `group_size`. -- `additional_data::B`: Additional data associated with the assignment. - -# Throws -- `ArgumentError`: If the length of `node_labels` is not equal to the sum of `group_size`. 
-""" -struct Assignment{T, B} <: AbstractVector{Vector{Int}} - group_size::GroupSize{T} - node_labels::Vector{Int} - additional_data::B - - function Assignment(group_size::GroupSize{T}, node_labels, - additional_data::B) where {T, B} - if length(node_labels) != sum(group_size) - throw(ArgumentError("The length of `node_labels` must be equal to the sum of \ - `group_size`")) - end - return new{T, B}(group_size, node_labels, additional_data) - end -end - -function Assignment(group_size::GroupSize, node_labels) - if length(node_labels) != sum(group_size) - throw(ArgumentError("The length of `node_labels` $(length(node_labels)) must be \ - equal to the sum of `group_size` $(sum(group_size))")) - end - c = StatsBase.countmap(node_labels) - if length(c) != length(group_size) - throw(ArgumentError("The number of unique elements in `node_labels` $(length(c)) \ - must be equal to the length of `group_size` $(length(group_size))")) - end - for (k, v) in c - if v != group_size[k] - throw(ArgumentError("The number of elements in `node_labels` $(v) for group \ - $(k) must be equal to the size of the group $(group_size[k])")) - end - end - return Assignment(group_size, node_labels, nothing) -end - -function number_groups(assignment::Assignment) - return length(assignment.group_size) -end - -function number_nodes(assignment::Assignment) - return length(assignment.node_labels) -end - -function get_vertex_in_group(assignment::Assignment, group) - return findall(assignment.node_labels .== group) -end - -function get_group_of_vertex(assignment::Assignment, vertex) - return assignment.node_labels[vertex] -end - -function get_edge_indices(a::Assignment, i, j) - if i == j - return get_edge_indices(a, i) - else - return [(x, y) for x in get_vertex_in_group(a, i) - for y in get_vertex_in_group(a, j)] - end -end - -function get_edge_indices(a::Assignment, i) - nodes_i = get_vertex_in_group(a, i) - return [(x, y) for x in nodes_i for y in nodes_i if x < y] -end - -Base.size(a::Assignment) = 
(number_groups(a),) -Base.@propagate_inbounds function Base.getindex(a::Assignment, i) - @boundscheck checkbounds(a, i) - return get_vertex_in_group(a, i) -end - -function get_ordered_adjacency_matrix(a::Assignment, A, by = identity) - perm = sortperm(a.node_labels, by = by) - return A[perm, perm] -end - -function Base.deepcopy(a::Assignment) - return Assignment( - a.group_size, copy(a.node_labels), deepcopy(a.additional_data)) -end diff --git a/src/old_messy/assignments/BernoulliAssignment/struct.jl b/src/old_messy/assignments/BernoulliAssignment/struct.jl deleted file mode 100644 index 73fab50..0000000 --- a/src/old_messy/assignments/BernoulliAssignment/struct.jl +++ /dev/null @@ -1,118 +0,0 @@ -""" - mutable struct BernoulliData{F} - -A data structure to store information related to a Bernoulli assignment in a network. - -# Fields -- `counts::Matrix{Int}`: A matrix representing the maximum number of edges between groups. -- `realized::Matrix{Int}`: A matrix representing the number of edges between groups. -- `estimated_theta::Matrix{F}`: A matrix of estimated parameters (theta). -- `A::BitMatrix`: An adjacency matrix representing the network structure. 
-- `log_likelihood::F`: -""" -mutable struct BernoulliData{F} - counts::Matrix{Int} - realized::Matrix{Int} - estimated_theta::Matrix{F} - A::BitMatrix # possible improvement by using an adjacency list - log_likelihood::F -end - -const BernoulliAssignment{T, F} = Assignment{T, BernoulliData{F}} -const BernoulliInitRule{S, F} = InitRule{S, Val{BernoulliData}} - -function BernoulliAssignment( - g, group_size::GroupSize, node_labels::Vector{Int}) - bernoulli_data = make_bernoulli_data(g, node_labels, group_size) - return Assignment(group_size, node_labels, bernoulli_data) -end - -function make_assignment(g, h, init_rule::BernoulliInitRule) - group_size, - node_labels = initialize_node_labels( - g, h, init_rule.starting_assignment_rule) - return BernoulliAssignment(g, group_size, node_labels) -end - -# might be worth using graph accessors instead of the adjacency matrix ? -function make_bernoulli_data(g, node_labels, group_size) - number_groups = length(group_size) - n = length(node_labels) - counts = zeros(Int, number_groups, number_groups) - realized = zeros(Int, number_groups, number_groups) - A = convert_bitmatrix(g) - - # below needs to be abstracted: not sure how diagonal is handled if nonzero - # addtioally, we should be able to deal with missing values ! 
- # This concerns the counts matrix above as well - @inbounds @simd for k in 1:number_groups - for l in k:number_groups - realized[k, l] = sum(A[node_labels .== k, node_labels .== l]) - realized[l, k] = realized[k, l] - counts[k, l] = group_size[k] * group_size[l] - counts[l, k] = counts[k, l] - end - end - - @inbounds @simd for k in 1:number_groups - counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2 - realized[k, k] = sum(A[node_labels .== k, node_labels .== k]) ÷ 2 - end - - estimated_theta = realized ./ counts - ll = compute_log_likelihood(estimated_theta, counts) - return BernoulliData(counts, realized, estimated_theta, A, ll) -end - -function convert_bitmatrix(g::Observations{<:AbstractGraph, D}) where {D} - A = collect(adjacency_matrix(g.graph)) - return convert(BitMatrix, collect(adjacency_matrix(g.graph))) -end - -function convert_bitmatrix(g::Observations{<:AbstractMatrix, D}) where {D} - return convert(BitMatrix, g.graph) -end - -function compute_log_likelihood(estimated_theta::AbstractMatrix{F}, - counts::AbstractMatrix{T}) where {F <: Real, T <: Real} - number_groups = size(estimated_theta, 1) - loglik = zero(eltype(estimated_theta)) - @inbounds for j in 1:number_groups - @simd for i in j:number_groups - θ = estimated_theta[i, j] - loglik += (xlogx(θ) + xlogx(1 - θ)) * counts[i, j] - end - end - return loglik -end - -function loglikelihood(assignment::BernoulliAssignment) - return assignment.additional_data.log_likelihood -end - -loglikelihood(a::BernoulliAssignment, g::Observations) = loglikelihood(a) - -function force_recompute_ll(a::BernoulliAssignment, g::Observations) - a_simple = Assignment(a.group_size, a.node_labels) - return loglikelihood(a_simple, g) -end - -include("swap.jl") - -function get_ordered_adjacency_matrix(a::BernoulliAssignment, by = identity) - return get_ordered_adjacency_matrix(a, a.additional_data.A, by) -end - -# TODO: move to sparse structure to encode difference between 0 weight and absence of edge -# from docs: -# A = 
sparse(I,J,V) -# rows = rowvals(A) -# vals = nonzeros(A) -# m, n = size(A) -# for j = 1:n -# for i in nzrange(A, j) -# row = rows[i] -# val = vals[i] -# # perform sparse wizardry... -# end -# end diff --git a/src/old_messy/assignments/BernoulliAssignment/swap.jl b/src/old_messy/assignments/BernoulliAssignment/swap.jl deleted file mode 100644 index edfd9a3..0000000 --- a/src/old_messy/assignments/BernoulliAssignment/swap.jl +++ /dev/null @@ -1,99 +0,0 @@ -mutable struct BernoulliSwap{F} <: Swap - index1::Int - index2::Int - realized::Matrix{Int} - estimated_theta::Matrix{F} - log_likelihood::F -end - -function make_swap( - a::BernoulliAssignment{T, F}, id) where {T, F} - return BernoulliSwap(id[1], id[2], copy(a.additional_data.realized), - copy(a.additional_data.estimated_theta), - a.additional_data.log_likelihood) -end - -function make_swap!(swap::BernoulliSwap{F}, a::BernoulliAssignment{T, F}, - id) where {T, F} - swap.index1, swap.index2 = id - copy!(swap.realized, a.additional_data.realized) - copy!(swap.estimated_theta, a.additional_data.estimated_theta) - swap.log_likelihood = a.additional_data.log_likelihood -end - -function revert_swap!( - a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - swap_node_labels!(a, swap.index1, swap.index2) - copy!(a.additional_data.realized, swap.realized) - copy!(a.additional_data.estimated_theta, swap.estimated_theta) - a.additional_data.log_likelihood = swap.log_likelihood -end - -function apply_swap!( - a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - update_observed_and_labels!(a, swap) - update_ll!(a) -end - -function update_observed_and_labels!( - a::BernoulliAssignment{T, F}, swap::BernoulliSwap{F}) where {T, F} - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - - for i in axes(a.additional_data.A, 2) - if i == swap.index1 || i == swap.index2 || - a.additional_data.A[swap.index1, i] == - a.additional_data.A[swap.index2, i] - continue - end - 
group_inter = get_group_of_vertex(a, i) - if a.additional_data.A[swap.index1, i] - a.additional_data.realized[g1, group_inter] -= 1 - a.additional_data.realized[ - group_inter, g1] = a.additional_data.realized[ - g1, group_inter] - - a.additional_data.realized[g2, group_inter] += 1 - a.additional_data.realized[ - group_inter, g2] = a.additional_data.realized[ - g2, group_inter] - end - if a.additional_data.A[swap.index2, i] - a.additional_data.realized[g2, group_inter] -= 1 - a.additional_data.realized[ - group_inter, g2] = a.additional_data.realized[ - g2, group_inter] - - a.additional_data.realized[g1, group_inter] += 1 - a.additional_data.realized[ - group_inter, g1] = a.additional_data.realized[ - g1, group_inter] - end - end - - @. a.additional_data.estimated_theta = a.additional_data.realized / - a.additional_data.counts - - # swap of the labels should happen after the update of the realized and estimated_theta - # for the above loop to work correctly - swap_node_labels!(a, swap.index1, swap.index2) - return nothing -end - -function update_ll!(a::BernoulliAssignment) - a.additional_data.log_likelihood = compute_log_likelihood( - a.additional_data.estimated_theta, a.additional_data.counts) - return nothing -end - -function fit(a::BernoulliAssignment, g::Observations) - dists = initialize_sbm(a.group_size, Bernoulli(0.5)) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - dists[group1, - group2] = Bernoulli(a.additional_data.estimated_theta[ - group1, group2]) - end - end - return dists -end diff --git a/src/old_messy/assignments/CategoricalAssignment/struct.jl b/src/old_messy/assignments/CategoricalAssignment/struct.jl deleted file mode 100644 index 5ccd175..0000000 --- a/src/old_messy/assignments/CategoricalAssignment/struct.jl +++ /dev/null @@ -1,128 +0,0 @@ -mutable struct CategoricalData{F, C} - counts::Matrix{Int} - realized::Array{Int, 3} - estimated_theta::Array{F, 3} - A::Matrix{C} # possible use of CategoricalArrays.jl ? 
- log_likelihood::F # need to remove this type - scratch::Matrix{Int} -end - -const CategoricalAssignment{T, F, C} = Assignment{ - T, CategoricalData{F, C}} -const CategoricalInitRule{S, F} = InitRule{S, Val{CategoricalData}} - -function CategoricalAssignment( - g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where { - G, D} - categorical_data = make_categorical_data(g, node_labels, group_size) - return Assignment(group_size, node_labels, categorical_data) -end - -function make_assignment(g, h, init_rule::CategoricalInitRule) - group_size, - node_labels = initialize_node_labels( - g, h, init_rule.starting_assignment_rule) - a = CategoricalAssignment(g, group_size, node_labels) - return a -end - -function make_categorical_data(g, node_labels, group_size) - number_groups = length(group_size) - A, num_categories = categorical_matrix(g) - counts = zeros(Int, number_groups, number_groups) - realized = zeros(Int, num_categories, number_groups, number_groups) - estimated_theta = zeros( - Float64, num_categories, number_groups, number_groups) - - _count_cat_occurences!( - counts, realized, g, Assignment(group_size, node_labels)) - - _fast_div!(estimated_theta, realized, counts) - scratch = zeros(Int, num_categories, number_groups) - - ll = compute_log_likelihood(estimated_theta, realized) - return CategoricalData(counts, realized, estimated_theta, A, ll, scratch) -end - -function _count_cat_occurences!(counts, realized, g, a_dummy) - @inbounds for k in 1:number_groups(a_dummy) - for l in k:number_groups(a_dummy) - counts_dict = StatsBase.countmap(get_obs.( - Ref(g), get_edge_indices(a_dummy, k, l))) - total = 0 - for (m, v) in counts_dict - realized[m, k, l] = v - realized[m, l, k] = v - total += v - end - counts[k, l] = total - counts[l, k] = total - end - end -end - -function recount_occurences!(a) - _count_cat_occurences!( - a.additional_data.counts, a.additional_data.realized, a.additional_data.A, a) - return nothing -end - -function 
compute_log_likelihood( - estimated_theta::Array{T, 3}, realized::Array{F, 3}) where { - T, F} - loglik = zero(T) - number_groups = size(estimated_theta, 2) - number_decorations = size(estimated_theta, 1) - @inbounds for j in 1:number_groups - for i in j:number_groups - for m in 1:number_decorations - if realized[m, i, j] != 0 - loglik += realized[m, i, j] * log(estimated_theta[m, i, j]) - end - end - #loglik += sum(log.(estimated_theta[i, j]) .* realized[i, j]) - #loglik += sum(xlogy.(realized[i,j], estimated_theta[i, j]) ) - end - end - return loglik -end - -function categorical_matrix(A::CategoricalMatrix) - @info "Converting CategoricalMatrix to matrix" - categories = levels(A) - return levelcode.(recode( - A, [l => i for (i, l) in enumerate(categories)]..., missing => 0)) -end - -# to update, just for test now -function categorical_matrix(A::AbstractMatrix{Int}) - min_A = minimum(A) - if min_A > 1 - A_inter = A .- min_A .+ 1 - else - A_inter = copy(A) - end - for i in 1:size(A_inter, 1) - A_inter[i, i] = 0 - end - return A_inter -end - -function categorical_matrix(g::Observations) - return categorical_matrix(g.graph), ncategories(g.dist_ref) -end - -function loglikelihood(a::CategoricalAssignment, g::Observations) - return a.additional_data.log_likelihood -end - -function force_recompute_ll(a::CategoricalAssignment, g::Observations) - a_simple = Assignment(a.group_size, a.node_labels) - return loglikelihood(a_simple, g) -end - -include("swap.jl") - -function get_ordered_adjacency_matrix(a::CategoricalAssignment, by = identity) - return get_ordered_adjacency_matrix(a, a.additional_data.A, by) -end diff --git a/src/old_messy/assignments/CategoricalAssignment/swap.jl b/src/old_messy/assignments/CategoricalAssignment/swap.jl deleted file mode 100644 index 43b6cd9..0000000 --- a/src/old_messy/assignments/CategoricalAssignment/swap.jl +++ /dev/null @@ -1,201 +0,0 @@ -mutable struct CategoricalSwap{F} <: Swap - index1::Int - index2::Int - realized::Array{Int, 3} - 
estimated_theta::Array{F, 3} - log_likelihood::F -end - -function make_swap(a::CategoricalAssignment, id) - return CategoricalSwap(id[1], id[2], copy(a.additional_data.realized), - copy(a.additional_data.estimated_theta), - a.additional_data.log_likelihood) -end - -function copy_realized_and_theta!(a, b) - copy!(a.realized, b.realized) - copy!(a.estimated_theta, b.estimated_theta) - a.log_likelihood = b.log_likelihood - return nothing -end - -function make_swap!( - swap::CategoricalSwap{F}, a::CategoricalAssignment{T, F, C}, - id) where {T, F, C} - swap.index1, swap.index2 = id - copy_realized_and_theta!(swap, a.additional_data) -end - -function revert_swap!( - a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { - T, F, C} - swap_node_labels!(a, swap.index1, swap.index2) - copy_realized_and_theta!(a.additional_data, swap) - #copy!.(a.additional_data.realized, swap.realized) - #copy!.(a.additional_data.estimated_theta, swap.estimated_theta) - #a.additional_data.log_likelihood = swap.log_likelihood - #return nothing -end - -function apply_swap!( - a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { - T, F, C} - #update_observed_and_labels!(a, swap) - new_update_observed_and_labels!(a, swap) - update_ll!(a) -end - -function update_ll!(a::CategoricalAssignment) - a.additional_data.log_likelihood = compute_log_likelihood( - a.additional_data.estimated_theta, a.additional_data.realized) - return nothing -end - -function fit( - a::CategoricalAssignment{T, F, C}, g::Observations) where { - T, F, C} - dists = initialize_sbm( - a.group_size, g.dist_ref) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - dists[group1, - group2] = Categorical(a.additional_data.estimated_theta[:, - group1, group2]) - end - end - return dists -end - -function fit( - a::CategoricalAssignment{T, F, C}, g::Observations{ - G, <:DiscretizedDistribution}) where { - T, F, C, G} - dists = initialize_sbm( - a.group_size, g.dist_ref) - for group1 in 
1:number_groups(a) - for group2 in 1:number_groups(a) - set_params!( - dists[group1, - group2], a.additional_data.estimated_theta[:, - group1, group2]) - end - end - return dists -end - -function _move_connection!(realized, group_origin, group_dest, scratch) - @inbounds for group in axes(realized, 2) - for label in axes(realized, 1) - realized[label, group, group_origin] -= scratch[label, group] - realized[label, group, group_dest] += scratch[label, group] - realized[label, group_origin, group] = realized[ - label, group, group_origin] - realized[label, group_dest, group] = realized[ - label, group, group_dest] - end - end -end - -# need to rethink if want to use muli-threading -# check https://juliafolds.github.io/Transducers.jl/dev/tutorials/words/ -function new_update_observed_and_labels!( - a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { - T, F, C} - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - if g1 == g2 - return nothing - end - - a.additional_data.scratch .= 0 - for i in axes(a.additional_data.A, 1) - if i == swap.index1 || i == swap.index2 - continue - end - @inbounds obs = a.additional_data.A[i, swap.index1] - if obs != 0 - group_inter = get_group_of_vertex(a, i) - a.additional_data.scratch[obs, group_inter] += 1 - end - end - _move_connection!( - a.additional_data.realized, g1, g2, a.additional_data.scratch) - - a.additional_data.scratch .= 0 - for i in axes(a.additional_data.A, 1) - if i == swap.index1 || i == swap.index2 - continue - end - @inbounds obs = a.additional_data.A[i, swap.index2] - if obs != 0 - group_inter = get_group_of_vertex(a, i) - a.additional_data.scratch[obs, group_inter] += 1 - end - end - _move_connection!( - a.additional_data.realized, g2, g1, a.additional_data.scratch) - - _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, - a.additional_data.counts) - - # swap of the labels should happen after the update of the realized and estimated_theta - # 
for the above loop to work correctly - swap_node_labels!(a, swap.index1, swap.index2) - return nothing -end - -function update_observed_and_labels!( - a::CategoricalAssignment{T, F, C}, swap::CategoricalSwap{F}) where { - T, F, C} - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - - adj_1 = @view a.additional_data.A[:, swap.index1] - adj_2 = @view a.additional_data.A[:, swap.index2] - - for i in axes(a.additional_data.A, 1) - if i == swap.index1 || i == swap.index2 - continue - end - obs_1 = adj_1[i] - obs_2 = adj_2[i] - group_inter = get_group_of_vertex(a, i) - if obs_1 != obs_2 - _fast_update!!( - a.additional_data.realized, g1, g2, obs_1, obs_2, group_inter) - end - end - - _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, - a.additional_data.counts) - - # swap of the labels should happen after the update of the realized and estimated_theta - # for the above loop to work correctly - swap_node_labels!(a, swap.index1, swap.index2) - return nothing -end - -function _fast_update!!(realized, g1, g2, obs_1, obs_2, g_inter) - realized[obs_1, g1, g_inter] -= 1 - realized[obs_1, g_inter, g1] = realized[obs_1, g1, g_inter] - - realized[obs_1, g2, g_inter] += 1 - realized[obs_1, g_inter, g2] = realized[obs_1, g2, g_inter] - - # send from group 2 to group 1 - realized[obs_2, g2, g_inter] -= 1 - realized[obs_2, g_inter, g2] = realized[obs_2, g2, g_inter] - - realized[obs_2, g1, g_inter] += 1 - realized[obs_2, g_inter, g1] = realized[obs_2, g1, g_inter] -end - -function _fast_div!(theta, realized, counts) - for j in axes(theta, 3) - for i in axes(theta, 2) - for m in axes(theta, 1) - theta[m, i, j] = realized[m, i, j] / counts[i, j] - end - end - end -end diff --git a/src/old_messy/assignments/SparseAssignment/struct.jl b/src/old_messy/assignments/SparseAssignment/struct.jl deleted file mode 100644 index 8251c25..0000000 --- a/src/old_messy/assignments/SparseAssignment/struct.jl +++ /dev/null @@ -1,115 +0,0 @@ 
-mutable struct SparseData{F, C} - counts::Matrix{Int} - realized::Array{Int, 3} - estimated_theta::Array{F, 3} - A::SparseMatrixCSC{C, Int} - scratch_count::Matrix{Int} - scratch_missing::Vector{Int} - log_likelihood::F -end - -const SparseAssignment{T, F, C} = Assignment{ - T, SparseData{F, C}} -const SparseInitRule{S, F} = InitRule{S, Val{SparseData}} - -function SparseAssignment( - g::Observations{G, D}, group_size::GroupSize, node_labels::Vector{Int}) where { - G, D} - A = issparse(g.graph) ? g.graph : sparse(g.graph) - num_levels = ncategories(g.dist_ref) - sparse_data = SparseData( - A, size(group_size, 1), num_levels, group_size, node_labels) - return Assignment(group_size, node_labels, sparse_data) -end - -function make_assignment(g, h, init_rule::SparseInitRule) - group_size, - node_labels = initialize_node_labels( - g, h, init_rule.starting_assignment_rule) - return SparseAssignment(g, group_size, node_labels) -end - -function SparseData(A::SparseMatrixCSC{T, Int}, k::Int, - level_count::Int, group_size, node_labels) where {T} - n = size(A, 1) - data = SparseData(zeros(Int, k, k), zeros(Int, level_count, k, k), - zeros(Float64, level_count, k, k), dropzeros(A), zeros( - Int, level_count, k), zeros( - Int, k), 0.0) - _count_possible_occurences!(data, group_size) - _count_occurences!(data, node_labels) - _fast_div!(data.estimated_theta, data.realized, data.counts) - data.log_likelihood = compute_log_likelihood_without_0( - data.estimated_theta, data.realized, data.counts) - return data -end - -function _count_possible_occurences!(data, group_size) - k = size(group_size, 1) - for j in 1:k - data.counts[j, j] = group_size[j] * (group_size[j] - 1) ÷ 2 - for i in (j + 1):k - data.counts[i, j] = group_size[i] * group_size[j] - data.counts[j, i] = group_size[i] * group_size[j] - end - end -end - -function _count_occurences!(data, node_labels) - m, n = size(data.A) - for k in 1:length(unique(node_labels)) - for l in k:length(unique(node_labels)) - node_group_k = 
findall(x -> x == k, node_labels) - node_group_l = findall(x -> x == l, node_labels) - if k != l - counts = StatsBase.countmap(data.A[i, j] for i in node_group_k - for j in node_group_l if i != j) - else - counts = StatsBase.countmap(data.A[i, j] for i in node_group_k - for j in node_group_l if i < j) - end - for m in 1:size(data.realized, 1) - data.realized[m, k, l] = get(counts, m, 0) - data.realized[m, l, k] = get(counts, m, 0) - end - total_witouth_missing = sum(values(counts)) - - get(counts, missing, 0) - data.counts[k, l] = total_witouth_missing - data.counts[l, k] = total_witouth_missing - end - end -end - -function compute_log_likelihood_without_0( - estimated_theta::Array{T, 3}, realized::Array{F, 3}, counts) where { - T, F} - loglik = zero(T) - number_groups = size(estimated_theta, 2) - number_decorations = size(estimated_theta, 1) - for j in 1:number_groups - for i in j:number_groups - total_decorations = counts[i, j] - loglik -= xlogx(total_decorations) - for m in 1:number_decorations - loglik += xlogx(realized[m, i, j]) - total_decorations -= realized[m, i, j] - end - loglik += xlogx(total_decorations) - end - end - return loglik -end - -function _n_decorations_with_0(a::SparseAssignment) - return size(a.additional_data.estimated_theta, 1) + 1 -end - -function _n_decorations_not_0(a::SparseAssignment) - return size(a.additional_data.estimated_theta, 1) -end - -function loglikelihood(assignment::SparseAssignment, g::Observations) - return assignment.additional_data.log_likelihood -end - -include("swap.jl") diff --git a/src/old_messy/assignments/SparseAssignment/swap.jl b/src/old_messy/assignments/SparseAssignment/swap.jl deleted file mode 100644 index cdfda51..0000000 --- a/src/old_messy/assignments/SparseAssignment/swap.jl +++ /dev/null @@ -1,134 +0,0 @@ -mutable struct SparseSwap{F} <: Swap - index1::Int - index2::Int - realized::Array{Int, 3} - estimated_theta::Array{F, 3} - counts::Matrix{Int} - log_likelihood::F -end - -function 
make_swap(a::SparseAssignment, id) - return SparseSwap(id[1], id[2], copy(a.additional_data.realized), - copy(a.additional_data.estimated_theta), copy(a.additional_data.counts), - a.additional_data.log_likelihood) -end - -function copy_addtional!(a, b) - copy!(a.realized, b.realized) - copy!(a.estimated_theta, b.estimated_theta) - copy!(a.counts, b.counts) - a.log_likelihood = b.log_likelihood - return nothing -end - -function make_swap!( - swap::SparseSwap{F}, a::SparseAssignment{T, F}, - id) where {T, F} - swap.index1, swap.index2 = id - copy_addtional!(swap, a.additional_data) -end - -function revert_swap!( - a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} - swap_node_labels!(a, swap.index1, swap.index2) - copy_addtional!(a.additional_data, swap) - return nothing -end - -# this function fails in presence of missing values -function apply_swap!( - a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} - update_observed_and_labels!(a, swap) - update_ll!(a) -end - -function update_ll!(a::SparseAssignment) - a.additional_data.log_likelihood = compute_log_likelihood_without_0( - a.additional_data.estimated_theta, a.additional_data.realized, a.additional_data.counts) - return nothing -end - -function update_observed_and_labels!( - a::SparseAssignment{T, F}, swap::SparseSwap{F}) where {T, F} - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - - if g1 == g2 - return nothing - end - - rows = rowvals(a.additional_data.A) - vals = nonzeros(a.additional_data.A) - m, n = size(a.additional_data.A) - for j in [swap.index1, swap.index2] - a.additional_data.scratch_count .= 0 - a.additional_data.scratch_missing .= 0 - g_from = swap.index1 == j ? g1 : g2 - g_to = swap.index1 == j ? 
g2 : g1 - for i_index in nzrange(a.additional_data.A, j) - row = rows[i_index] - if row == swap.index1 || row == swap.index2 - continue - end - val = vals[i_index] - groupi = get_group_of_vertex(a, row) - if ismissing(val) - a.additional_data.scratch_missing[groupi] += 1 - else - a.additional_data.scratch_count[val, groupi] += 1 - end - end - _move_connection!( - a.additional_data.realized, g_from, g_to, a.additional_data.scratch_count) - - _update_counts!( - a.additional_data.counts, g_from, g_to, a.additional_data.scratch_missing) - end - - _fast_div!(a.additional_data.estimated_theta, a.additional_data.realized, - a.additional_data.counts) - - # swap of the labels should happen after the update of the realized and estimated_theta - # for the above loop to work correctly - swap_node_labels!(a, swap.index1, swap.index2) - return nothing -end - -function _update_counts!(counts, g_from, g_to, missing_update) - for i in axes(counts, 1) - counts[i, g_to] = counts[i, g_to] - missing_update[i] - counts[i, g_from] = counts[i, g_from] + missing_update[i] - counts[g_to, i] = counts[i, g_to] - counts[g_from, i] = counts[i, g_from] - end -end - -function fit(a::SparseAssignment, g::Observations) - dists = initialize_sbm( - a.group_size, ZeroInflatedCategorical(_n_decorations_not_0(a))) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - theta = a.additional_data.estimated_theta[:, group1, group2] - dists[group1, - group2] = ZeroInflatedCategorical(1 - sum(theta), theta) - end - end - return dists -end - -function fit(a::SparseAssignment, - g::Observations{G, <:DiscretizedDistribution}) where {G} - dists = initialize_sbm(a.group_size, - DiscretizedDistribution( - g.dist_ref.discretizer, ZeroInflatedCategorical(_n_decorations_not_0(a)))) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - theta = a.additional_data.estimated_theta[:, group1, group2] - p = clamp(1 - sum(theta), 0, 1) - dists[group1, - group2] = DiscretizedDistribution( - 
g.dist_ref.discretizer, ZeroInflatedCategorical(p, theta)) - end - end - return dists -end diff --git a/src/old_messy/assignments/SumAssignment/struct.jl b/src/old_messy/assignments/SumAssignment/struct.jl deleted file mode 100644 index 8451eba..0000000 --- a/src/old_messy/assignments/SumAssignment/struct.jl +++ /dev/null @@ -1,94 +0,0 @@ - -# type F needs to be a vector field! - -mutable struct SumData{F, C} - λ::SparseMatrixCSC{F, Int} - θ::Dict{Tuple{Int, Int}, F} - A::SparseMatrixCSC{C, Int} - counts::Dict{Tuple{Int, Int}, Int} - log_likelihood_per_group::Dict{Tuple{Int, Int}, Float64} - log_likelihood::Float64 -end - -const SumAssignment{T, F, C} = Assignment{T, SumData{F, C}} -const SumInitRule{S} = InitRule{S, Val{SumData}} - - - -function make_assignment(g, h, init_rule::SumInitRule) - group_size, - node_labels = initialize_node_labels( - g, h, init_rule.starting_assignment_rule) - return SumAssignment(g, group_size, node_labels) -end - -function SumAssignment(g::Observations, group_size::GroupSize, node_labels) - A = issparse(g.graph) ? 
g.graph : sparse(g.graph) - λ = fit.(Ref(g.dist_ref), A) - return SumAssignment(A, λ, group_size, node_labels) -end - -function SumAssignment( - A::SparseMatrixCSC{C, Int}, - λ::SparseMatrixCSC{F, Int}, group_size::GroupSize, node_labels::Vector{Int}) where { - F, C} - k = size(group_size, 1) - θ = Dict{Tuple{Int, Int}, F}() - counts = Dict{Tuple{Int, Int}, Int}() - - rows = rowvals(λ) - vals = nonzeros(λ) - m, n = size(λ) - for u in 1:n - for i in nzrange(λ,u) - v = rows[i] - if u >= v - # break # check that this isn't a mistake trying to be fast - continue - end - key_groups = minmax(node_labels[u], node_labels[v]) - param = vals[i] - if haskey(θ, key_groups) - θ[key_groups] += param - else - θ[key_groups] = param - end - if haskey(counts, key_groups) - counts[key_groups] += 1 - else - counts[key_groups] = 1 - end - end - end - for i in 1:k - for j in i:k - θ[minmax(i, j)] /= counts[minmax(i, j)] - end - end - ll_sum = 0.0 - ll = Dict{Tuple{Int, Int}, Float64}() - for i in 1:k - for j in i:k - ll[(i, j)] = 0.0 - end - end - for u in 1:n - for i in nzrange(λ, u) - v = rows[i] - if u >= v - continue - end - key_groups = minmax(node_labels[u], node_labels[v]) - ll[minmax( - node_labels[u], node_labels[v])] += loglikelihood(θ[key_groups], A[u, v]) - end - end - ll_sum = sum(values(ll)) - return Assignment(group_size, node_labels, SumData(λ, θ, A, counts, ll, ll_sum)) -end - -function loglikelihood(assignment::SumAssignment, g::Observations) - return sum(values(assignment.additional_data.log_likelihood)) -end - -include("swap.jl") diff --git a/src/old_messy/assignments/SumAssignment/swap.jl b/src/old_messy/assignments/SumAssignment/swap.jl deleted file mode 100644 index 4bd46de..0000000 --- a/src/old_messy/assignments/SumAssignment/swap.jl +++ /dev/null @@ -1,119 +0,0 @@ -mutable struct SumSwap{F} <: Swap - index1::Int - index2::Int - θ::Dict{Tuple{Int, Int}, F} - counts::Dict{Tuple{Int, Int}, Int} - log_likelihood_per_group::Dict{Tuple{Int, Int}, Float64} - 
log_likelihood::Float64 -end - -function make_swap(a::SumAssignment, id) - return SumSwap(id[1], id[2], deepcopy(a.additional_data.θ), - deepcopy(a.additional_data.counts), deepcopy(a.additional_data.log_likelihood_per_group), - a.additional_data.log_likelihood) -end - -function make_swap!(swap::SumSwap{F}, a::SumAssignment{T, F}, id) where {T, F} - swap.index1, swap.index2 = id - swap.θ = deepcopy(a.additional_data.θ) - swap.counts = deepcopy(a.additional_data.counts) - swap.log_likelihood_per_group = deepcopy(a.additional_data.log_likelihood_per_group) - swap.log_likelihood = a.additional_data.log_likelihood -end - -function revert_swap!( - a::SumAssignment{T, F}, swap::SumSwap{F}) where {T, F} - swap_node_labels!(a, swap.index1, swap.index2) - a.additional_data.θ = deepcopy(swap.θ) - a.additional_data.counts = deepcopy(swap.counts) - a.additional_data.log_likelihood_per_group = deepcopy(swap.log_likelihood_per_group) - a.additional_data.log_likelihood = swap.log_likelihood -end - -function apply_swap!( - a::SumAssignment{T, F}, swap::SumSwap{F}) where {T, F} - λ = a.additional_data.λ - rows = rowvals(λ) - vals = nonzeros(λ) - g1 = get_group_of_vertex(a, swap.index1) - g2 = get_group_of_vertex(a, swap.index2) - if g1 == g2 - return nothing - end - - for i in nzrange(λ, swap.index1) - v = rows[i] - key_old_groups = minmax(g1, a.node_labels[v]) - key_new_groups = minmax(g2, a.node_labels[v]) - c_og = a.additional_data.counts[key_old_groups] - c_ng = a.additional_data.counts[key_new_groups] - param = vals[i] - a.additional_data.θ[key_old_groups] = (a.additional_data.θ[key_old_groups]*c_og - - param)/(c_og - 1) - a.additional_data.θ[key_new_groups] = (a.additional_data.θ[key_new_groups]*c_ng + - param)/(c_ng + 1) - a.additional_data.counts[key_old_groups] -= 1 - a.additional_data.counts[key_new_groups] += 1 - end - - for i in nzrange(λ, swap.index2) - v = rows[i] - key_old_groups = minmax(g2, a.node_labels[v]) - key_new_groups = minmax(g1, a.node_labels[v]) - c_og = 
a.additional_data.counts[key_old_groups] - c_ng = a.additional_data.counts[key_new_groups] - param = vals[i] - a.additional_data.θ[key_old_groups] = (a.additional_data.θ[key_old_groups]*c_og - - param)/(c_og - 1) - a.additional_data.θ[key_new_groups] = (a.additional_data.θ[key_new_groups]*c_ng + - param)/(c_ng + 1) - a.additional_data.counts[key_old_groups] -= 1 - a.additional_data.counts[key_new_groups] += 1 - end - - swap_node_labels!(a, swap.index1, swap.index2) - fast_update_ll!(a, swap) -end - -function fast_update_ll!(a::SumAssignment, swap::SumSwap) - k = size(a.group_size, 1) - for i in 1:k - for j in i:k - index_group = (i, j) - if swap.θ[index_group] != a.additional_data.θ[index_group] - _update_ll_one_group!(a, index_group) - end - end - end - a.additional_data.log_likelihood = sum(values(a.additional_data.log_likelihood_per_group)) -end - -function _update_ll_one_group!(a::SumAssignment, group) - nodes_1 = findall(x -> x == group[1], a.node_labels) - nodes_2 = findall(x -> x == group[2], a.node_labels) - ll = 0.0 - rows = rowvals(a.additional_data.λ) - for u in nodes_1 - for v in intersect(rows[nzrange(a.additional_data.λ, u)], nodes_2) - ll += loglikelihood( - a.additional_data.θ[group], a.additional_data.A[u, v]) - end - end - a.additional_data.log_likelihood_per_group[group] = ll - return nothing -end - -function fit( - a::SumAssignment{T, F, C}, g::Observations{ - G, <:DiscreteMarkovChain}) where { - T, F, C, G} - dists = initialize_sbm( - a.group_size, g.dist_ref) - for group1 in 1:number_groups(a) - for group2 in 1:number_groups(a) - dists[ - group1, group2] = a.additional_data.θ[minmax(group1, group2)] - end - end - return dists -end diff --git a/src/old_messy/assignments/group_numbering.jl b/src/old_messy/assignments/group_numbering.jl deleted file mode 100644 index 879e909..0000000 --- a/src/old_messy/assignments/group_numbering.jl +++ /dev/null @@ -1,86 +0,0 @@ -""" -Array-like storage for the number of nodes in each group. 
Try to split the number of nodes -into equal groups, but if it is not possible, the last group may have more nodes. -""" -struct GroupSize{T} <: AbstractVector{Int} - group_number::T - number_groups::Int - - function GroupSize(number_nodes, h::Real) - @assert 0 < h < 1 - standard_group = floor(Int, number_nodes * h) - GroupSize(number_nodes, standard_group) - end - - function GroupSize(number_nodes, standard_group::Integer) - @assert 1 < standard_group <= number_nodes - number_groups = number_nodes ÷ standard_group # number of standard groups! - if number_groups * standard_group == number_nodes - new{Int}(standard_group, number_groups) - else - remainder_group = standard_group + - mod(number_nodes, standard_group) - new{Tuple{Int, Int}}( - (standard_group, remainder_group), number_groups) - end - end -end - -Base.size(g::GroupSize) = (g.number_groups,) -Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) - @boundscheck checkbounds(g, i) - return g.group_number -end - -Base.@propagate_inbounds function Base.getindex( - g::GroupSize{Tuple{Int, Int}}, i::Int) - @boundscheck checkbounds(g, i) - return i < length(g) ? 
g.group_number[1] : g.group_number[2] -end - -function check_compatiblity!(node_labels, g::GroupSize) - counts = StatsBase.countmap(node_labels) - - if length(counts) != g.number_groups - throw(ArgumentError("The vector of node labels is not compatible with the \ - group size: number of group in labels $(length(counts)) != expected number $(g.number_groups)")) - end - if size(node_labels, 1) != sum(g) - throw(ArgumentError("The vector of node labels is not compatible with the \ - group size: number of node labels $(size(node_labels, 1)) != expected number of nodes $(sum(g))")) - end - unbalanced = any(((k, v),) -> v != g[k], counts) - if unbalanced - @debug "The group size is unbalanced, trying to fix it : $(counts)" - g, node_labels = try_fixing_group_size!(node_labels, g) - if any(((k, v),) -> v != g[k], StatsBase.countmap(node_labels)) - throw(ArgumentError("Could not fix the group size")) - else - @debug "Fixed the group size by moving nodes between groups" - end - end -end - -function try_fixing_group_size!(node_labels, g::GroupSize) - counts = StatsBase.countmap(node_labels) - groups_too_small = filter(((k, v),) -> v < g[k], counts) - groups_too_large = filter(((k, v),) -> v > g[k], counts) - amount_too_small = sum(g[k] - v for (k, v) in groups_too_small) - amount_too_large = sum(v - g[k] for (k, v) in groups_too_large) - if amount_too_small == amount_too_large - nodes_to_move = [] - for (l, v) in groups_too_large - number_nodes_to_move = v - g[l] - nodes_to_move = vcat(nodes_to_move, - findall(x -> x == l, node_labels)[1:number_nodes_to_move]) - end - for (k, v) in groups_too_small - number_nodes_to_move = g[k] - v - for i in 1:number_nodes_to_move - index = popfirst!(nodes_to_move) - node_labels[index] = k - end - end - end - return g, node_labels -end diff --git a/src/old_messy/assignments/include.jl b/src/old_messy/assignments/include.jl deleted file mode 100644 index 5829223..0000000 --- a/src/old_messy/assignments/include.jl +++ /dev/null @@ -1,4 +0,0 
@@ -include("BernoulliAssignment/struct.jl") -include("CategoricalAssignment/struct.jl") -include("SparseAssignment/struct.jl") -include("SumAssignment/struct.jl") diff --git a/src/old_messy/bootstrap.jl b/src/old_messy/bootstrap.jl deleted file mode 100644 index 77086dd..0000000 --- a/src/old_messy/bootstrap.jl +++ /dev/null @@ -1,15 +0,0 @@ -function bootstrap( - statistic::Function, data::AbstractMatrix, model::BlockModel, - sampling::BootstrapSampling) - t0 = tx(statistic(data)) - m = nrun(sampling) - t1 = zeros_tuple(t0, m) - data1 = copy(data) - for i in 1:m - draw_and_fill!(data1, model) - for (j, t) in enumerate(tx(statistic(data1))) - t1[j][i] = t - end - end - return ParametricBootstrapSample(t0, t1, statistic, data, model, sampling) -end diff --git a/src/old_messy/distributions/categorical_with_0.jl b/src/old_messy/distributions/categorical_with_0.jl deleted file mode 100644 index 045d45b..0000000 --- a/src/old_messy/distributions/categorical_with_0.jl +++ /dev/null @@ -1,141 +0,0 @@ -""" - struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution - -A zero-inflated categorical distribution that combines a Bernoulli distribution with a categorical distribution. - -# Fields -- `edge_proba::B`: The Bernoulli distribution representing the probability of zero. -- `dist::D`: The categorical distribution. - -# Constructors -- `ZeroInflatedCategorical(p::Real, dist::D)`: Creates a zero-inflated categorical distribution with probability `p` of zero and categorical distribution `dist`. - -# Mathematical Explanation -The zero-inflated categorical distribution modifies the original categorical distribution by introducing a probability `p` of zero. The `pmf` and `cdf` are adjusted accordingly: -- `pdf(x) = p * δ(x) + (1 - p) * pmf_original(x)` -- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` -where `δ(x)` is the Dirac delta function. 
-""" -struct ZeroInflatedCategorical{B, D} <: DiscreteUnivariateDistribution - edge_proba::B - dist::D -end - -_dirac_delta(x) = x == 0 ? one(x) : zero(x) -_dirac_delta(x, lb, ub) = lb <= x <= ub ? one(x) : zero(x) - -function ZeroInflatedCategorical(p::Real, dist::D) where {D} - if p < 0 - p = zero(p) - elseif p > 1 - p = one(p) - end - return ZeroInflatedCategorical(Bernoulli(1 - p), dist) -end - -function ZeroInflatedCategorical(p::Real, probs::AbstractVector) - if sum(probs) == 0 - probs_ = ones(length(probs)) / length(probs) - else - probs_ = probs / sum(probs) - end - if p < 0 - p = zero(p) - elseif p > 1 - p = one(p) - end - return ZeroInflatedCategorical(p, Categorical(probs_)) -end - -function ZeroInflatedCategorical(vec_probs::AbstractVector) - ZeroInflatedCategorical(vec_probs[1], vec_probs[2:end]) -end - -function ZeroInflatedCategorical(k::Int) - ZeroInflatedCategorical(ones(k + 1) ./ (k + 1)) -end - -""" - Distributions.pdf(d::ZeroInflatedCategorical, x::Real) - -Computes the probability mass function (pmf) of the zero-inflated categorical distribution `d` at `x`. - -# Mathematical Explanation -The `pmf` of the zero-inflated categorical distribution is given by: -- `pmf(x) = p * δ(x) + (1 - p) * pmf_original(x)` -where `p` is the probability of zero, `δ(x)` is the Dirac delta function, and `pmf_original(x)` is the pmf of the original categorical distribution. -""" -function Distributions.pdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + - pdf(d.edge_proba, one(x)) * pdf(d.dist, x) -end - -""" - rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) - -Generates a random sample from the zero-inflated categorical distribution `d` using the random number generator `rng`. 
-""" -function rand(rng::Random.AbstractRNG, d::ZeroInflatedCategorical) - return rand(rng, d.edge_proba) * rand(rng, d.dist) -end - -logpdf(d::ZeroInflatedCategorical, x::Real) = log(pdf(d, x)) - -minimum(d::ZeroInflatedCategorical) = min(minimum(d.dist), 0) - -maximum(d::ZeroInflatedCategorical) = max(maximum(d.dist), 0) - -insupport(d::ZeroInflatedCategorical, x::Real) = x == 0 || insupport(d.dist, x) - -""" - Distributions.cdf(d::ZeroInflatedCategorical, x::Real) - -Computes the cumulative distribution function (cdf) of the zero-inflated categorical distribution `d` at `x`. - -# Mathematical Explanation -The `cdf` of the zero-inflated categorical distribution is given by: -- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` -where `p` is the probability of zero, `δ(x)` is the Dirac delta function, and `cdf_original(x)` is the cdf of the original categorical distribution. -""" -function Distributions.cdf(d::ZeroInflatedCategorical, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, 0, Inf) + - pdf(d.edge_proba, one(x)) * cdf(d.dist, x) -end - -function Distributions.params(d::ZeroInflatedCategorical) - (first(params(d.edge_proba)), params(d.dist)...) -end - -ncategories(d::ZeroInflatedCategorical) = ncategories(d.dist) - -""" - Distributions.fit(::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) - -Fits a zero-inflated categorical distribution to the given data. -""" -function Distributions.fit( - ::Type{ZeroInflatedCategorical{B, D}}, data::AbstractArray, n_cat) where { - B, D <: Categorical} - indices_0 = findall(x -> x == 0, data) - p = length(indices_0) / length(data) - if p != 1 - dist = fit_mle(Categorical, n_cat, data[setdiff(1:end, indices_0)]) - return ZeroInflatedCategorical(p, dist) - else - return ZeroInflatedCategorical(1.0, zeros(n_cat)) - end -end - -function get_params_cat_like(dist::ZeroInflatedCategorical) - p = first(params(dist.edge_proba)) - probs = vcat(params(dist.dist)...) 
- return vcat(1 - p, probs .* p) -end - -function Base.convert(::Type{<:ZeroInflatedCategorical}, d::D) where {D} - return ZeroInflatedCategorical(1.0, d) -end - -function Base.convert(T::Type{<:Categorical}, d::ZeroInflatedCategorical) - return T(get_params_cat_like(d)) -end diff --git a/src/old_messy/distributions/discrete_dist.jl b/src/old_messy/distributions/discrete_dist.jl deleted file mode 100644 index 55af36b..0000000 --- a/src/old_messy/distributions/discrete_dist.jl +++ /dev/null @@ -1,142 +0,0 @@ -""" - struct DiscretizedDistribution{D, L} <: ContinuousUnivariateDistribution - -A discretized distribution that combines a discretizer with a zero-inflated categorical distribution. - -# Fields -- `discretizer::D`: The discretizer used to discretize the continuous distribution. -- `probs::L`: The zero-inflated categorical distribution representing the discretized probabilities. - -# Constructors -- `DiscretizedDistribution(d::D, n_bins::Int, support_bound = extrema(d))`: Creates a discretized distribution with `n_bins` bins and support bound `support_bound`. - -# Mathematical Explanation -The discretized distribution modifies the original continuous distribution by dividing it into `n_bins` bins. The `pdf` and `cdf` are adjusted accordingly: -- `pdf(x) = pdf_discretized(bin) / bin_width` -- `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` -""" -mutable struct DiscretizedDistribution{D, L} <: - ContinuousUnivariateDistribution where {D, L} - discretizer::D - probs::L -end - -function DiscretizedDistribution( - d::D, n_bins::Int, support_bound = extrema(d)) where {D} - disc = DiscretizerZeroToZero(n_bins, support_bound...) 
- ps = zeros(non_zero_labels_counts(disc)) - for i in 1:non_zero_labels_counts(disc) - lb, ub = decode(disc, i) - ps[i] = cdf(d, ub) - cdf(d, lb) - end - probs = ZeroInflatedCategorical(0.0, ps) - return DiscretizedDistribution(disc, probs) -end - -function DiscretizedDistribution( - d::ZeroInflated, n_bins::Int, support_bound = extrema(d)) - disc = DiscretizerZeroToZero(n_bins, support_bound...) - ps = zeros(non_zero_labels_counts(disc)) - for i in 1:non_zero_labels_counts(disc) - lb, ub = decode(disc, i) - ps[i] = cdf(d, ub) - cdf(d, lb) - end - probs = ZeroInflatedCategorical(get_proba_zero(d), ps) - return DiscretizedDistribution(disc, probs) -end - -function DiscretizedDistribution(discretizer::Discretizer) - return DiscretizedDistribution( - discretizer, ZeroInflatedCategorical(non_zero_labels_counts(discretizer))) -end - -""" - rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) - -Generates a random sample from the discretized distribution `d` using the random number generator `rng`. -""" -function rand(rng::Random.AbstractRNG, d::DiscretizedDistribution) - bin = rand(rng, d.probs) - return _decode_randomly(rng, d.discretizer, bin) -end - -minimum(d::DiscretizedDistribution) = minimum(d.discretizer) - -maximum(d::DiscretizedDistribution) = maximum(d.discretizer) - -function insupport(d::DiscretizedDistribution, x::Real) - support_encoding(d.discretizer, x) -end - -function Base.convert(::Type{DiscretizedDistribution}, d::D) where {D} - return DiscretizedDistribution(d, 10) -end - -ncategories(d::DiscretizedDistribution) = ncategories(d.probs) - -function Distributions.fit( - ::Type{<:DiscretizedDistribution{D, L}}, data) where {D, L} - return fit(L, data) -end - -function set_params!(d::DiscretizedDistribution{D, L}, params) where {D, L} - d.probs = L(params...) -end - -""" - Distributions.pdf(d::DiscretizedDistribution, x::Real) - -Computes the probability density function (pdf) of the discretized distribution `d` at `x`. 
- -# Mathematical Explanation -The `pdf` of the discretized distribution is computed as: -- `pdf(x) = pdf_discretized(bin) / bin_width` -""" -function pdf(d::DiscretizedDistribution, x::Real) - if x == 0 - return pdf(d.probs, zero(x)) - end - if !support_encoding(d.discretizer, x) - return zero(x) - end - bin = encode(d.discretizer, x) - return pdf(d.probs, bin) / binwidth(d.discretizer) -end - -""" - Distributions.logpdf(d::DiscretizedDistribution, x::Real) - -Computes the log of the probability density function (logpdf) of the discretized distribution `d` at `x`. -""" -function logpdf(d::DiscretizedDistribution, x::Real) - if !support_encoding(d.discretizer, x) - return -Inf - end - x == 0 && return log(pdf(d.probs, x)) - bin = encode(d.discretizer, x) - return log(pdf(d.probs, bin)) - log(binwidth(d.discretizer)) -end - -""" - Distributions.cdf(d::DiscretizedDistribution{D, P}, x::Real) where {D, P <: ZeroInflatedCategorical} - -Computes the cumulative distribution function (cdf) of the discretized distribution `d` at `x`. 
- -# Mathematical Explanation -The `cdf` of the discretized distribution is computed as: -- `cdf(x) = cdf_discretized(bin) + (cdf_discretized(bin + 1) - cdf_discretized(bin)) * progress_in_bin(x)` -""" -function Distributions.cdf( - d::DiscretizedDistribution{D, P}, x::Real) where { - D, P <: ZeroInflatedCategorical} - x < minimum(d) && return zero(x) - x > maximum(d) && return one(x) - bin = encode(d.discretizer, x) - result = (x == 0) * cdf(d.probs, x) - if bin != 0 - result += cdf(d.probs, bin - 1) + - (cdf(d.probs, bin) - cdf(d.probs, bin - 1)) * - progress_in_bin(d.discretizer, x, bin) - end - return result -end diff --git a/src/old_messy/distributions/discretizer.jl b/src/old_messy/distributions/discretizer.jl deleted file mode 100644 index e039930..0000000 --- a/src/old_messy/distributions/discretizer.jl +++ /dev/null @@ -1,248 +0,0 @@ -# Inspired by Discretizer.jl but with the fast decoding function and built-in -# convention for discretizing continuous distributions. -abstract type Discretizer end - -function encode(d::Discretizer, x::AbstractArray{<:Real}) - return [encode(d, u) for u in x] -end - -function decode(d::Discretizer, x::AbstractArray{<:Real}) - return [decode(d, u) for u in x] -end - -""" -Uniformly discretizes a continuous distribution into a fixed number of bins of equal width. 
-""" -struct RegularDiscretizer{F, T, L} <: Discretizer - n_bins::Int - lower_bound::F - upper_bound::F - bin_labels::MVector{L, T} - bin_width::F -end - -function RegularDiscretizer( - n_bins::Int, lower_bound::F, upper_bound::F) where {F} - if !isfinite(lower_bound) || !isfinite(upper_bound) - throw(ArgumentError("RegularDiscretizer requires finite lower and upper bounds.")) - end - bin_width = (upper_bound - lower_bound) / n_bins - return RegularDiscretizer( - n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), bin_width - ) -end - -function support_encoding(d::RegularDiscretizer, x::Real) - return d.lower_bound <= x <= d.upper_bound -end - -function minimum(d::RegularDiscretizer) - return d.lower_bound -end - -function maximum(d::RegularDiscretizer) - return d.upper_bound -end - -function encode(d::RegularDiscretizer, x::Real) - if x >= d.upper_bound - return d.n_bins - end - return d.bin_labels[convert(Int, div(x - d.lower_bound, d.bin_width) + 1)] -end - -function _decode_randomly( - rng::Random.AbstractRNG, d::RegularDiscretizer, bin::Int) - hi, lo = decode(d, bin) - return lo + (hi - lo) * rand(rng) -end - -function binwidth(d::RegularDiscretizer) - return d.bin_width -end - -function decode(d::RegularDiscretizer, bin::Int) - return (d.lower_bound + (bin - 1) * d.bin_width, - d.lower_bound + bin * d.bin_width) -end - -function encode(d::RegularDiscretizer, x::AbstractArray{Real}) - return [encode(d, u) for u in x] -end - -function decode(d::RegularDiscretizer, x::AbstractArray{Real}) - return [decode(d, u) for u in x] -end - -function nlabels(d::RegularDiscretizer) - return d.n_bins -end - -non_zero_labels_counts(d::RegularDiscretizer) = nlabels(d) - -""" -Maps a set of categories to a set of bins -""" -struct CategoryDiscretizer{F, T} - cat_to_bin::Dict{F, T} - bin_to_cat::Dict{T, F} - min_label::T - max_label::T -end - -function CategoryDiscretizer(cat_to_bin::Dict, bin_to_cat::Dict) - min_label = minimum(keys(bin_to_cat)) - max_label = 
maximum(keys(bin_to_cat)) - return CategoryDiscretizer(cat_to_bin, bin_to_cat, min_label, max_label) -end - -function support_encoding(d::CategoryDiscretizer, x) - return haskey(d.cat_to_bin, x) -end - -function encode(d::CategoryDiscretizer, x) - return d.cat_to_bin[x] -end - -function decode(d::CategoryDiscretizer, label) - return d.bin_to_cat[label] -end - -function nlabels(d::CategoryDiscretizer) - return length(d.bin_to_cat) -end - -function binwidth(d::CategoryDiscretizer{F, T}, x::T) where {F, T} - return length(d.bin_to_cat[x]) -end - -function non_zero_labels_counts(d::CategoryDiscretizer) - if 0 ∈ keys(d.bin_to_cat) - return length(d.bin_to_cat) - 1 - else - return length(d.bin_to_cat) - end -end - -function minimum(d::CategoryDiscretizer) - return d.min_label -end - -function maximum(d::CategoryDiscretizer) - return d.max_label -end - -""" -Uniformly discretizes a continuous distribution into a fixed number of bins of equal width, -with additional bins for missing or special values. 
-""" -struct HybridDiscretizer{F, T, L} <: Discretizer - lin::RegularDiscretizer{F, T, L} - cat::CategoryDiscretizer{F, T} -end - -# change so that atoms can be packed together if wanted -function HybridDiscretizer(n_bins, lower_bound, upper_bound, atoms) - cat_to_bin = Dict(a => n_bins + i for (i, a) in enumerate(atoms)) - bin_to_cat = Dict(n_bins + i => a for (i, a) in enumerate(atoms)) - bin_width = (upper_bound - lower_bound) / n_bins - return HybridDiscretizer( - RegularDiscretizer{typeof(bin_width), Int, n_bins}( - n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), - (upper_bound - lower_bound) / n_bins), - CategoryDiscretizer(cat_to_bin, bin_to_cat) - ) -end - -function DiscretizerZeroToZero(n_bins, lower_bound, upper_bound) - cat_to_bin = Dict([0.0 => 0]) - bin_to_cat = Dict([0 => 0.0]) - bin_width = (upper_bound - lower_bound) / n_bins - return HybridDiscretizer( - RegularDiscretizer{typeof(bin_width), Int, n_bins}( - n_bins, lower_bound, upper_bound, MVector{n_bins}(1:n_bins), - (upper_bound - lower_bound) / n_bins), - CategoryDiscretizer(cat_to_bin, bin_to_cat) - ) -end - -function support_encoding(d::HybridDiscretizer, x) - return support_encoding(d.lin, x) || support_encoding(d.cat, x) -end - -function minimum(d::HybridDiscretizer) - return min(minimum(d.lin), minimum(d.cat)) -end - -function maximum(d::HybridDiscretizer) - return max(maximum(d.lin), maximum(d.cat)) -end - -function nlabels(d::HybridDiscretizer) - return nlabels(d.lin) + nlabels(d.cat) -end - -function non_zero_labels_counts(d::HybridDiscretizer) - return non_zero_labels_counts(d.lin) + non_zero_labels_counts(d.cat) -end - -binwidth(d::HybridDiscretizer) = binwidth(d.lin) - -function binwidth(d::HybridDiscretizer, bin) - if haskey(d.cat.cat_to_bin, bin) - return binwidth(d.cat, bin) - else - return binwidth(d.lin) - end -end - -function encode(d::HybridDiscretizer, x::Real) - if haskey(d.cat.cat_to_bin, x) - return encode(d.cat, x) - else - return encode(d.lin, x) - end -end 
- -function decode(d::HybridDiscretizer, bin::Int) - if haskey(d.cat.bin_to_cat, bin) - return decode(d.cat, bin) - else - return decode(d.lin, bin) - end -end - -function _decode_randomly( - rng::Random.AbstractRNG, d::HybridDiscretizer, bin::Int) - if haskey(d.cat.bin_to_cat, bin) - return decode(d.cat, bin) - else - return _decode_randomly(rng, d.lin, bin) - end -end - -function auto_nbins(data) - binwidth = 2iqr(data) / cbrt(n) - lo, hi = extrema(data) - nbins_fd = ceil(Int, (hi - lo) / binwidth) - nbins_sturges = ceil(Int, log(2, n)) + 1 - nbins = max(nbins_fd, nbins_sturges) - return nbins -end - -function progress_in_bin(d::CategoryDiscretizer, x::Real, bin) - return one(x) -end - -function progress_in_bin(d::RegularDiscretizer, x::Real, bin) - lo, hi = decode(d, bin) - return (x - lo) / (hi - lo) -end - -function progress_in_bin(d::HybridDiscretizer, x::Real, bin) - if haskey(d.cat.bin_to_cat, bin) - return progress_in_bin(d.cat, x, bin) - else - return progress_in_bin(d.lin, x, bin) - end -end diff --git a/src/old_messy/distributions/include.jl b/src/old_messy/distributions/include.jl deleted file mode 100644 index cc5c557..0000000 --- a/src/old_messy/distributions/include.jl +++ /dev/null @@ -1,5 +0,0 @@ -include("categorical_with_0.jl") -include("discretizer.jl") -include("zero_inflated.jl") -include("discrete_dist.jl") -include("markov_chain.jl") diff --git a/src/old_messy/distributions/markov_chain.jl b/src/old_messy/distributions/markov_chain.jl deleted file mode 100644 index 28a21b8..0000000 --- a/src/old_messy/distributions/markov_chain.jl +++ /dev/null @@ -1,144 +0,0 @@ -# if S is Int, assume the states are ordered and sequential -# should store everything in transpose, will be faster but way more -# complicated to read -struct DiscreteMarkovChain{S, M <: AbstractMatrix} - states::Vector{S} - transitions::M -end - -struct SampleChain{S, M <: AbstractMatrix} - states::Vector{S} - indices::Vector{Int} - transitions::M -end - 
-Base.zero(::DiscreteMarkovChain) = DiscreteMarkovChain(Int[], zeros(Int, 0, 0)) -Base.zero(::SampleChain{S}) where {S} = SampleChain(S[], Int[], zeros(Int, 1, 1)) - -function state_index(mc::DiscreteMarkovChain{S}, state::S) where {S} - findfirst(isequal(state), mc.states) -end - -state_space(mc::DiscreteMarkovChain) = mc.states -transition_matrix(mc::DiscreteMarkovChain) = mc.transitions - -function stationary_dist(mc::DiscreteMarkovChain) - T = transition_matrix(mc) - F = eigen(T') - tol = 1e-8 - idx = findfirst(abs.(F.values .- 1) .< tol) - if idx === nothing - error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") - end - # Extract the corresponding eigenvector and normalize it to sum to 1. - pi = real(F.vectors[:, idx]) - return pi ./ sum(pi) -end - -function stationary_dist(mc::DiscreteMarkovChain{S, <:SparseMatrixCSC}) where {S} - T = transition_matrix(mc) - vals, vecs, _ = eigsolve(T') - tol = 1e-8 - idx = findfirst(abs.(vals .- 1) .< tol) - if idx === nothing - error("No eigenvalue equal (within tolerance) to 1 found. The chain may not be ergodic.") - end - # Extract the corresponding eigenvector and normalize it to sum to 1. 
- pi = Real.(vecs[idx]) - result = pi ./ sum(pi) - return result -end - -function sample_indices(mc::DiscreteMarkovChain, t::Int) - indices = Vector{Int}(undef, t) - indices[1] = rand(Categorical(stationary_dist(mc))) - tr_transposed = transpose(mc.transitions) - for i in 2:t - indices[i] = rand(Categorical(tr_transposed[:, indices[i - 1]])) - end - return indices -end - -function sample(mc::DiscreteMarkovChain, t::Int) - indices = sample_indices(mc, t) - states = mc.states[indices] - counts = zeros(Int, length(mc.states), length(mc.states)) - for i in 1:(length(indices) - 1) - counts[indices[i], indices[i + 1]] += 1 - end - return SampleChain(states, indices, counts) -end - -function sample(mc::DiscreteMarkovChain{S, <:SparseMatrixCSC}, t::Int) where {S} - indices = sample_indices(mc, t) - states = mc.states[indices] - counts = zeros(Int, length(mc.states), length(mc.states)) - for i in 1:(length(indices) - 1) - counts[indices[i], indices[i + 1]] += 1 - end - return SampleChain(states, indices, sparse(counts)) -end - -## yes I know this is awful and does not return a proper chain, but... 
-function Base.:+(a::DiscreteMarkovChain, b::DiscreteMarkovChain) - return DiscreteMarkovChain( - a.states, - a.transitions .+ b.transitions) -end - -function Base.:-(a::DiscreteMarkovChain, b::DiscreteMarkovChain) - return DiscreteMarkovChain( - a.states, - a.transitions .- b.transitions) -end - -function Base.:*(a::DiscreteMarkovChain, c::Real) - return DiscreteMarkovChain( - a.states, - a.transitions .* c) -end - -Base.:*(c::Real, a::DiscreteMarkovChain) = a * c - -function Base.:/(a::DiscreteMarkovChain, c::Real) - return DiscreteMarkovChain( - a.states, - a.transitions ./ c) -end - -function loglikelihood(mc::DiscreteMarkovChain{S, M}, chain::Vector{Int}) where {S, M} - Tr = transition_matrix(mc) - probas = Vector{Float64}(undef, length(chain)) - probas[1] = stationary_dist(mc)[chain[1]] - for i in 1:(length(chain) - 1) - probas[i + 1] = Tr[chain[i], chain[i + 1]] - end - return sum(log, probas) -end - -function loglikelihood( - mc::DiscreteMarkovChain{S, M1}, chain::Vector{S}) where {S, M1} - return loglikelihood(mc, state_index.(Ref(mc), chain)) -end - -#without the first state, huge computational speedup -function loglikelihood( - mc::DiscreteMarkovChain{S, M1}, chain::SampleChain{S, M2}) where {S, M1, M2} - return sum(map(xlogy, chain.transitions, mc.transitions)) #+log(stationary_dist(mc)[chain.indices[1]]) -end - - - -# user responsability to have the same states... -function fit( - mc::DiscreteMarkovChain{S, M1}, chain::SampleChain{S, M2}) where {S, M1, M2} - return DiscreteMarkovChain( - mc.states, make_row_stochastic(chain.transitions)) -end - - - -function make_row_stochastic(A::M) where {M <: AbstractMatrix} - f(row) = sum(row) == 0 ? 
ones(length(row)) / length(row) : row ./ sum(row) - return mapslices(f, A, dims = 2) -end diff --git a/src/old_messy/distributions/utils.jl b/src/old_messy/distributions/utils.jl deleted file mode 100644 index d30daf7..0000000 --- a/src/old_messy/distributions/utils.jl +++ /dev/null @@ -1,19 +0,0 @@ -const logtwo = log(2.0) - -sumlog(x::AbstractArray{<:Real}) = sum(log,x) - -function sumlog(x::AbstractArray{<:AbstractFloat}) - sig = one(T) - ex = zero(exponent(one(T))) - bound = floatmax(T) / 2 - for xj in x - sig *= significand(xj) - ex += exponent(xj) - if sig > bound - (a, b) = (significand(sig), exponent(sig)) - sig = a - ex += b - end - end - log(sig) + logtwo * ex -end diff --git a/src/old_messy/distributions/zero_inflated.jl b/src/old_messy/distributions/zero_inflated.jl deleted file mode 100644 index 9cc4720..0000000 --- a/src/old_messy/distributions/zero_inflated.jl +++ /dev/null @@ -1,93 +0,0 @@ -""" - struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution - -A zero-inflated distribution that combines a Bernoulli distribution with a continuous distribution. - -# Fields -- `edge_proba::B`: The Bernoulli distribution representing the probability of zero. -- `dist::D`: The continuous distribution. - -# Constructors -- `ZeroInflated(p::Real, dist::D)`: Creates a zero-inflated distribution with probability `p` of zero and continuous distribution `dist`. - -# Mathematical Explanation -The zero-inflated distribution modifies the original distribution by introducing a probability `p` of zero. The `pdf` and `cdf` are adjusted accordingly: -- `pdf(x) = p * δ(x) + (1 - p) * pdf_original(x)` -- `cdf(x) = p * δ(x) + (1 - p) * cdf_original(x)` -where `δ(x)` is the Dirac delta function. 
-""" -struct ZeroInflated{B, D} <: ContinuousUnivariateDistribution - edge_proba::B - dist::D -end - -function ZeroInflated(p::Real, dist::D) where {D} - return ZeroInflated(Bernoulli(1 - p), dist) -end - -""" - Distributions.pdf(d::ZeroInflated, x::Real) - -Computes the probability density function (pdf) of the zero-inflated distribution `d` at `x`. -""" -function Distributions.pdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + - pdf(d.edge_proba, one(x)) * pdf(d.dist, x) -end - -""" - get_proba_zero(d::ZeroInflated) - -Returns the probability of zero for the zero-inflated distribution `d`. -""" -function get_proba_zero(d::ZeroInflated) - return pdf(d.edge_proba, 0) -end - -""" - rand(rng::Random.AbstractRNG, d::ZeroInflated) - -Generates a random sample from the zero-inflated distribution `d` using the random number generator `rng`. -""" -function rand(rng::Random.AbstractRNG, d::ZeroInflated) - return rand(rng, d.edge_proba) * rand(rng, d.dist) -end - -logpdf(d::ZeroInflated, x::Real) = log(pdf(d, x)) - -minimum(d::ZeroInflated) = min(minimum(d.dist), 0) - -maximum(d::ZeroInflated) = max(maximum(d.dist), 0) - -insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) - -""" - Distributions.cdf(d::ZeroInflated, x::Real) - -Computes the cumulative distribution function (cdf) of the zero-inflated distribution `d` at `x`. -""" -function Distributions.cdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, zero(x), Inf) + - cdf(d.dist, x) * pdf(d.edge_proba, one(x)) -end - -function Distributions.params(d::ZeroInflated) - (first(params(d.edge_proba)), params(d.dist)...) -end - -""" - Distributions.fit(::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) - -Fits a zero-inflated distribution to the given data. 
-""" -function Distributions.fit( - ::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) where {B, D} - indices_0 = findall(x -> x == 0, data) - p = length(indices_0) / length(data) - if p != 1 - return ZeroInflated( - p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) - else - return ZeroInflated(1.0, D()) - end -end diff --git a/src/old_messy/observations.jl b/src/old_messy/observations.jl deleted file mode 100644 index 120642c..0000000 --- a/src/old_messy/observations.jl +++ /dev/null @@ -1,231 +0,0 @@ -""" - Observations{G, D} - -A struct to hold observations for a network. The type parameter `G` represents the network - structure and must support indexing and the `size` function. - -# Fields -- `graph::G`: The network structure (e.g. adjacency matrix). -- `dist_ref::D`: distribution of the observations (used for getting support, type of elements, etc.) -""" -struct Observations{G, D} - graph::G - dist_ref::D -end - -""" - number_nodes(graph::Observations) - -Get the number of nodes in the graph. - -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `num_nodes`: The number of nodes. -""" -function number_nodes(graph::Observations) - return size(graph.graph, 1) -end - -""" - get_obs(graph::Observations, x::Tuple) - -Get the observation for the given tuple of nodes. - -# Arguments -- `graph::Observations`: The graph observations. -- `x::Tuple`: The tuple of nodes. - -# Returns -- `obs`: The observation. -""" -function get_obs(graph::Observations, x::Tuple) - return get_obs(graph, x[1], x[2]) -end - -""" - get_obs(graph::Observations, i::Int, j::Int) - -Get the observation for the given pair of nodes. - -# Arguments -- `graph::Observations`: The graph observations. -- `i::Int`: The first node. -- `j::Int`: The second node. - -# Returns -- `obs`: The observation. -""" -function get_obs(graph::Observations, i::Int, j::Int) - return graph.graph[i, j] -end - -""" - density(graph::Observations) - -Get the density of the graph. 
- -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `density`: The density of the graph. -""" -function density(graph::Observations) - return sum(graph.graph) / - ((size(graph.graph, 1) * (size(graph.graph, 1) - 1))) -end - -""" - get_degree(graph::Observations) - -Get the degree of each node in the graph. - -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `degrees`: The degrees of the nodes. -""" -function get_degree(graph::Observations) - return sum(graph.graph, dims = 2) -end - -""" - get_adj(graph::Observations) - -Get the adjacency matrix of the graph. - -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `adj_matrix`: The adjacency matrix. -""" -function get_adj(graph::Observations) - return graph.graph -end - -function normalized_laplacian(graph::Observations) - return normalized_laplacian(graph.graph) -end - -function normalized_laplacian(g::AbstractGraph) - return normalized_laplacian(Graphs.adjacency_matrix(g)) -end - -normalized_laplacian(g::CategoricalArray) = normalized_laplacian(levelcode.(g)) - -""" - normalized_laplacian(graph::Observations) - -Get the normalized Laplacian of the graph. - -# Arguments -- `graph::Observations`: The graph observations. - -# Returns -- `L`: The normalized Laplacian matrix. 
-""" -function normalized_laplacian(graph::AbstractMatrix) - degrees = sum(graph, dims = 1) - degrees .-= minimum(degrees) - n = size(graph, 1) - L = similar(graph, Float64) - for j in 1:n - for i in 1:n - if i == j - L[i, j] = 1 - elseif degrees[i] == 0 || degrees[j] == 0 - L[i, j] = 0 - elseif graph[i, j] != 0 - L[i, j] = -1 / sqrt(degrees[i] * degrees[j]) - end - end - end - return L -end - -function Metis.graph(graph::Observations{ - G, <:UnivariateDistribution}) where {G} - use_weights = true - if minimum(graph.dist_ref) < 0 - @warn "Negative values are not allowed for MetisStart, using binary graph" - use_weights = false - end - return Metis.graph(sparse(graph.graph), weights = use_weights) -end - -""" - discretise(graph::Observations; number_groups, number_levels) - -Discretise the graph observations. - -# Arguments -- `graph::Observations`: The graph observations. -- `number_groups`: Number of groups for discretisation. -- `number_levels`: Number of levels for discretisation. - -# Returns -- `discretised_graph`: The discretised graph observations. -- `discretiser`: The discretiser used. - -Assume that the diagonal is zero. -0 indicates no edge, while missing indicates no information about the edge. -By default maps 0 to 0. If you want another behaviour use the function where you -pass a `Discretizer` object. - -number_levels will be the number of levels in the discretized distribution (excluding 0). 
-""" -function discretise( - graph::Observations; number_groups = nothing, number_levels = nothing) - if isnothing(number_groups) && isnothing(number_levels) - throw(ArgumentError("Either `number_groups` or `number_levels` must be provided")) - end - if isnothing(number_levels) - number_levels = round(Int, - get_num_levels_from_groups(number_nodes(graph), number_groups)) - else - if !isnothing(number_groups) - @warn "disregarding `number_groups` as `number_levels` is provided" - end - end - return discretise( - graph, DiscretizerZeroToZero(number_levels, extrema(graph.graph)...)) -end - -""" - discretise(graph::Observations, discretiser::Discretizer) - -Discretise the graph observations using the given discretiser. - -# Arguments -- `graph::Observations`: The graph observations. -- `discretiser::Discretizer`: The discretiser to use. - -# Returns -- `discretised_graph`: The discretised graph observations. -- `discretiser`: The discretiser used. -""" -function discretise(graph::Observations, discretiser::Discretizer) - A_encoded = encode(discretiser, graph.graph) - return Observations(A_encoded, DiscretizedDistribution(discretiser)), - discretiser -end - -""" - get_num_levels_from_groups(n, number_groups) - -Get the number of levels for the discretized distribution given n and k. - -# Arguments -- `n`: The number of nodes. -- `number_groups`: The number of groups. - -# Returns -- `num_levels`: The number of levels. 
-""" -function get_num_levels_from_groups(n, number_groups) - return max(1, n^(0.5 * (1 - log(number_groups) / log(n)))) -end diff --git a/src/old_messy/optimisation/config_rules/InitRule.jl b/src/old_messy/optimisation/config_rules/InitRule.jl deleted file mode 100644 index c60e144..0000000 --- a/src/old_messy/optimisation/config_rules/InitRule.jl +++ /dev/null @@ -1,99 +0,0 @@ -abstract type StartingAssignment end -struct OrderedStart <: StartingAssignment end -struct RandomStart <: StartingAssignment end -struct SpectralStart <: StartingAssignment end -struct MetisStart <: StartingAssignment end -struct BiasAdjustedSoS <: StartingAssignment end - -struct FromAssignment{A} <: StartingAssignment - assignment::A -end -struct HigherOrderSpectralStart <: StartingAssignment - k::Int -end - -struct InitRule{S <: StartingAssignment, I} - starting_assignment_rule::S - assignment_rule::I -end - -function make_assignment(g, h, init_rule::InitRule{S, Nothing}) where {S} - return Assignment(initialize_node_labels( - g, h, init_rule.starting_assignment_rule)...) -end - -""" - initialize_node_labels(g, h, starting_assignment_rule::StartingAssignment) - -initialize node labels based on the `starting_assignment_rule`, and return a `GroupSize` -object and a vector of node labels. - -# Implemented rules -- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. -- `RandomStart()`: Randomly assign nodes to groups. -- `SpectralStart()`: Assign nodes to groups based on spectral clustering. -- `MetisStart()`: Assign nodes to groups based on Metis partitioning. -- `FromAssignment(a)`: Assign nodes to groups based on the given assignment `a`. 
-""" -initialize_node_labels - -function initialize_node_labels(g, h, ::OrderedStart) - group_size = GroupSize(number_nodes(g), h) - node_labels = StatsBase.inverse_rle(1:length(group_size), group_size) - return group_size, node_labels -end - -function initialize_node_labels(g, h, ::RandomStart) - group_size, node_labels = initialize_node_labels(g, h, OrderedStart()) - Random.shuffle!(node_labels) - return group_size, node_labels -end - -function initialize_node_labels(g, h, ::SpectralStart) - group_size = GroupSize(number_nodes(g), h) - node_labels = zeros(Int, number_nodes(g)) - - laplacian = normalized_laplacian(g) - decomp, = partialschur(laplacian, nev = 2, which = :LR) - - # get 2nd eigenvector, sort its components - indices = sortperm(real.(decomp.Q[:, 2])) - # bin them into groups of correct size - start = 1 - for (i, group) in enumerate(group_size) - stop = start + group - 1 - node_labels[indices[start:stop]] .= i - start = stop + 1 - end - return group_size, node_labels -end - -function initialize_node_labels(g, h, ::MetisStart) - group_size = GroupSize(number_nodes(g), h) - node_labels = convert.( - Int, Metis.partition(Metis.graph(g), length(group_size))) - check_compatiblity!(node_labels, group_size) - return group_size, node_labels -end - -function initialize_node_labels(g, h, rule::FromAssignment{A}) where {A} - group_size = GroupSize(number_nodes(g), h) - check_compatiblity!(rule.assignment.node_labels, group_size) - return group_size, rule.assignment.node_labels -end - -function initialize_node_labels(g, h, rule::HigherOrderSpectralStart) - throw(ArgumentError("Not implemented yet, need to finish with Clustering.jl")) - # this will need to have the main optim changed -> no assumption that all blocks are - # the same size - group_size = GroupSize(number_nodes(g), h) - laplacian = normalized_laplacian(g) - results = IterativeSolvers.lobpcg(laplacian, true, rule.k) - return group_size, node_labels -end - -function initialize_node_labels(g, h, 
::BiasAdjustedSoS) - # implement method from Bias-adjusted spectral clustering in multilayer stochastic block - # models - -end diff --git a/src/old_messy/optimisation/config_rules/accept_rule.jl b/src/old_messy/optimisation/config_rules/accept_rule.jl deleted file mode 100644 index 46e7bd6..0000000 --- a/src/old_messy/optimisation/config_rules/accept_rule.jl +++ /dev/null @@ -1,25 +0,0 @@ -abstract type AcceptRule end -struct Strict <: AcceptRule end - -""" - accept_reject_update!(a::Assignment, swap::Swap, g, accept_rule::AcceptRule) - - -Perform the swap and accept it if it improves the likelihood of the assignment. `a` will -be updated in place if the swap is accepted. - -# Implemented rules -- `Strict()`: Accept the proposal if it has a higher likelihood than the current assignment. -""" -accept_reject_update! - -function accept_reject_update!(a::Assignment, swap::Swap, g, ::Strict) - # calculate the score of the current assignment - current_score = score(a, g) - # perform the swap - apply_swap!(a, swap) - # if the new assignment is worse, revert the swap - if score(a, g) <= current_score - revert_swap!(a, swap) - end -end diff --git a/src/old_messy/optimisation/config_rules/bandwidth_selection_rule.jl b/src/old_messy/optimisation/config_rules/bandwidth_selection_rule.jl deleted file mode 100644 index efa4da2..0000000 --- a/src/old_messy/optimisation/config_rules/bandwidth_selection_rule.jl +++ /dev/null @@ -1,101 +0,0 @@ -abstract type KSelectionRule end -struct OracleK <: KSelectionRule - K::Int -end -struct OracleM{F} <: KSelectionRule - M::F - α::F -end - -struct OracleH <: KSelectionRule - H::Int -end - -function OracleM(M) - return OracleM(M, 1.0) -end - -abstract type EstimatedM <: KSelectionRule end -struct EstimatedEigenvalues <: EstimatedM end -struct EstimatedDegrees <: EstimatedM end - -""" - select_number_node_per_block(g::Observations, rule::KSelectionRule) - -How to select the number of blocks `K` for the BlockModel model. 
- -# Implemented rules -- `OracleK(K::Int)`: Use the oracle number of blocks `K`. -- `OracleM(M::Int)`: Give the Holder constant `M` of the graphon, use the results from - [Olhede and Wolfe (2014)](https://www.pnas.org/doi/epdf/10.1073/pnas.1400374111) to - estimate the number of blocks `K`. -- `EstimatedEigenvalues()`: Use the estimated eigenvalues of the adjacency matrix to - estimate the Holder constant and then use `OracleM` to estimate the number of blocks `K`. -- `EstimatedDegrees()`: Use the estimated degrees of the adjacency matrix to estimate the - Holder constant and then use `OracleM` to estimate the number of blocks `K`. -- `OracleH(H::Int)`: Use the oracle number of nodes per block `H`. - -!!! info - - The number of blocks `K` should be at most `n/2` where `n` is the number of nodes in - the graph. - - The estimated Holder constant `M` comes from equation (11) in Olhede and Wolfe (2014). -""" -select_number_node_per_block - -function select_number_node_per_block(g::Observations, rule::OracleH) - if rule.H > number_nodes(g) ÷ 2 - throw(ArgumentError("The number of nodes per block $(rule.H) is too large for the \ - number of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)")) - end - if rule.H <= 1 - throw(ArgumentError("The number of nodes per block $(rule.H) is too small, it should \ - be at least 2")) - end - return rule.H -end - -function select_number_node_per_block(g::Observations, rule::OracleK) - nodes_per_block = number_nodes(g) ÷ rule.K - return select_number_node_per_block(g, OracleH(nodes_per_block)) -end - -function select_number_node_per_block(g::Observations, rule::OracleM) - rho = density(g) - n = number_nodes(g) - h = min(max(2, round(Int, (2 * rule.M * rho)^(-1 / 4) * sqrt(n))), n ÷ 2) - return select_number_node_per_block(g, OracleH(h)) -end - -function select_number_node_per_block(g::Observations, rule::EstimatedM) - n = number_nodes(g) - c = min(4, sqrt(n) / 8) - number_points_from_mid = round(Int, c * sqrt(n)) - 
mid_points = max(1, n ÷ 2 - number_points_from_mid):(n ÷ 2 + number_points_from_mid) - m = estimated_number_nodes_per_block(g, rule, mid_points, density(g)) - return select_number_node_per_block(g, OracleH(m)) -end - -function estimated_number_nodes_per_block( - g::Observations, ::EstimatedEigenvalues, points, rho) - @warn "Check this method again" - decomp, = partialschur(get_adj(g), nev = 1, which = :LR) - u, λ = real.(decomp.Q), decomp.eigenvalues[1] - return _approx_k_from_delta_f(u, λ, points, rho) -end - -function estimated_number_nodes_per_block( - g::Observations, ::EstimatedDegrees, points, rho) - d = get_degree(g) - mult = ((d' * get_adj(g) * d) / (sum(d .^ 2))^2)[1] - return _approx_k_from_delta_f(d, mult, points, rho) -end - -function _approx_k_from_delta_f(u, mult, midpoints, ρ, α = 1.0) - sort!(u, dims = 1) - uMid = u[midpoints] - β₀, β₁ = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid - # from Olhede and Wolfe (2014), equation (11) - h = (2^(α + 1) * α * mult^2 * (β₁ * length(uMid) / 2 + β₀)^2 * β₁^2 * - ρ^(-1))^(-1 / (2 * (α + 1))) - return round(Int, h) -end diff --git a/src/old_messy/optimisation/config_rules/include.jl b/src/old_messy/optimisation/config_rules/include.jl deleted file mode 100644 index 8c5b6bf..0000000 --- a/src/old_messy/optimisation/config_rules/include.jl +++ /dev/null @@ -1,5 +0,0 @@ -include("swap_rule.jl") -include("accept_rule.jl") -include("InitRule.jl") -include("stop_rule.jl") -include("bandwidth_selection_rule.jl") diff --git a/src/old_messy/optimisation/config_rules/stop_rule.jl b/src/old_messy/optimisation/config_rules/stop_rule.jl deleted file mode 100644 index 89575dc..0000000 --- a/src/old_messy/optimisation/config_rules/stop_rule.jl +++ /dev/null @@ -1,49 +0,0 @@ -abstract type StopRule end - -function initialise_stop_rule!(stop_rule::StopRule, a, g) -end - -# default score is the log likelihood -function score(a::Assignment, g::Observations) - return loglikelihood(a, g) / binomial(number_nodes(a), 2) -end - 
-mutable struct PreviousBestValue{T} <: StopRule - k::Int - previous_best_value::T - iterations_since_best::Int - function PreviousBestValue( - k::Int, x::T = -Inf) where {T <: Real} - @assert k > 0 - # queue stores the best values and at most k subsequent values - new{T}(k, x, 0) - end -end - -function initialise_stop_rule!(stop_rule::PreviousBestValue, a, g) - score_value = score(a, g) - stop_rule.previous_best_value = score_value -end - -""" - stopping_rule(assignment::Assignment,g, stop_rule::StopRule) - -Returns a Bool with true if we should stop the optimization based on the `stop_rule`. - -# Implemented rules -- `PreviousBestValue(k)`: Stop if the current iteration is `k` iterations away from the - iteration with the best value. -""" -stopping_rule - -function stopping_rule( - assignment::Assignment, g, stop_rule::PreviousBestValue) - score_value = score(assignment, g) - if score_value > stop_rule.previous_best_value - stop_rule.previous_best_value = score_value - stop_rule.iterations_since_best = 0 - else - stop_rule.iterations_since_best += 1 - end - return stop_rule.iterations_since_best >= stop_rule.k -end diff --git a/src/old_messy/optimisation/config_rules/swap_rule.jl b/src/old_messy/optimisation/config_rules/swap_rule.jl deleted file mode 100644 index e811d4f..0000000 --- a/src/old_messy/optimisation/config_rules/swap_rule.jl +++ /dev/null @@ -1,27 +0,0 @@ -abstract type NodeSwapRule end - -struct RandomNodeSwap <: NodeSwapRule end -struct RandomGroupSwap <: NodeSwapRule end -""" - select_swap(node_assignment::Assignment, ::NodeSwapRule) - -Selects two nodes to swap based on the `NodeSwapRule`, the adjacency matrix `A` and the -current assignment `node_assignment`. - -# Implemented rules -- `RandomNodeSwap()`: Select two nodes at random. -- `RandomGroupSwap()`: Select two nodes from two different groups at random. 
-""" -select_swap - -function select_swap(assignment::Assignment, ::RandomNodeSwap) - return StatsBase.sample(1:number_nodes(assignment), 2; replace = false) -end - -function select_swap(assignment::Assignment, ::RandomGroupSwap) - groups = StatsBase.sample( - 1:number_groups(assignment), 2; replace = false) - index1 = rand(get_vertex_in_group(assignment, groups[1])) - index2 = rand(get_vertex_in_group(assignment, groups[2])) - return (index1, index2) -end diff --git a/src/old_messy/optimisation/fit.jl b/src/old_messy/optimisation/fit.jl deleted file mode 100644 index 8e5985a..0000000 --- a/src/old_messy/optimisation/fit.jl +++ /dev/null @@ -1,99 +0,0 @@ -# Slow fallback methods for the Assignment type -# Speed up by implementing specialized methods for the BernoulliAssignment type and others - -""" - fit(a::Assignment, g::Observations) - -Compute the estimator from node clustering as specified in the assignment. - -# Arguments -- `a::Assignment`: The assignment of nodes to blocks. -- `g::Observations`: The graph observations. - -# Returns -- `dists`: The fitted distributions. -""" -function fit(a::Assignment, g::Observations) - dists = initialize_sbm(a.group_size, g.dist_ref) - fit!(dists, g, a) - return dists -end - -""" - fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}, a::Assignment) where {G,D,K,F} - -Fit the SBM to the given graph observations and assignment. - -# Arguments -- `sbm::BlockModel{D,K,F}`: The block model to fit. -- `g::Observations{G,D}`: The graph observations. -- `a::Assignment`: The assignment of nodes to blocks. 
-""" -function fit!(sbm::BlockModel{D, K, F}, g::Observations{G, D}, - a::Assignment) where {G, D, K, F} - for group1 in 1:number_groups(a) - for group2 in group1:number_groups(a) - edge_indices = get_edge_indices(a, group1, group2) - sbm[group1, group2] = fit_group(g.dist_ref, g, edge_indices) - end - end -end - -function fit_group(d::ZeroInflatedCategorical, g, edges) - return Distributions.fit( - typeof(d), get_obs.(Ref(g), edges), ncategories(g.dist_ref)) -end - -function fit_group(distribution, g, edges) - return Distributions.fit(typeof(distribution), get_obs.(Ref(g), edges)) -end - -function fit_group(distribution::Binomial, g, edges) - return Distributions.fit( - typeof(distribution), ntrials(distribution), get_obs.(Ref(g), edges)) -end - -""" - loglikelihood(a::Assignment, g::Observations) - -Compute the log likelihood of a BlockModel fitted according to the assignment. - -# Arguments -- `a::Assignment`: The assignment of nodes to blocks. -- `g::Observations`: The graph observations. - -# Returns -- `log_likelihood`: The log likelihood of the fitted model. -""" -function loglikelihood(a::Assignment, g::Observations) - return _log_likelihood(a, fit(a, g), g) -end - -function _log_likelihood(a::Assignment, sbm::BlockModel, g) - log_likelihood = 0.0 - for i in 1:number_nodes(a) - label_a = a.node_labels[i] - for j in (i + 1):number_nodes(a) - label_b = a.node_labels[j] - log_likelihood += logdensityof( - sbm[label_a, label_b], get_obs(g, i, j)) - end - end - return log_likelihood -end - -""" - fit!(sbm::BlockModel{D,K,F}, g::Observations{G,D}) where {G,D,K,F} - -Fit the SBM to the given graph observations. - -# Arguments -- `sbm::BlockModel{D,K,F}`: The block model to fit. -- `g::Observations{G,D}`: The graph observations. 
-""" -function fit!( - sbm::BlockModel{D, K, F}, g::Observations{G, D}) where {G, D, K, F} - k = number_blocks(sbm) - a = estimate_graphon(g, select_number_node_per_block(g, OracleK(k))) - fit!(sbm, g, a) -end diff --git a/src/old_messy/optimisation/include.jl b/src/old_messy/optimisation/include.jl deleted file mode 100644 index 044e887..0000000 --- a/src/old_messy/optimisation/include.jl +++ /dev/null @@ -1,3 +0,0 @@ -include("fit.jl") -include("swap.jl") -include("least_squares.jl") diff --git a/src/old_messy/optimisation/least_squares.jl b/src/old_messy/optimisation/least_squares.jl deleted file mode 100644 index 9aae0db..0000000 --- a/src/old_messy/optimisation/least_squares.jl +++ /dev/null @@ -1,95 +0,0 @@ -include("config_rules/include.jl") - -""" - estimate_graphon(graph, h; iterations, initialise_rule, swap_rule, accept_rule, stop_rule, progress_bar) - -Estimate the graphon for the given graph. - -# Arguments -- `graph`: The input graph. -- `h`: Number of nodes per block. -- `iterations`: Maximum number of iterations. -- `initialise_rule::InitRule`: Rule for initializing the assignment. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -- `stop_rule::StopRule`: Rule for stopping the iterations. -- `progress_bar::Bool`: Whether to show a progress bar. - -# Returns -- `a`: The assignment of nodes to blocks. 
-""" -function estimate_graphon( - graph, h = select_number_node_per_block(graph, EstimatedDegrees()); - iterations::Int = 10_000, - initialise_rule::InitRule = InitRule(SpectralStart(), nothing), - swap_rule::NodeSwapRule = RandomNodeSwap(), - accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = PreviousBestValue(1000), - progress_bar::Bool = false -) - a = make_assignment(graph, h, initialise_rule) - @debug a - initialise_stop_rule!(stop_rule, a, graph) - greedy_improve!( - a, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) - return a -end - -""" - greedy_improve!(a::Assignment, graph; iterations, swap_rule, accept_rule, stop_rule, progress_bar) - -Perform greedy improvement on the assignment. - -# Arguments -- `a::Assignment`: The assignment of nodes to blocks. -- `graph`: The input graph. -- `iterations`: Maximum number of iterations. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -- `stop_rule::StopRule`: Rule for stopping the iterations. -- `progress_bar::Bool`: Whether to show a progress bar. -""" -function greedy_improve!(a::Assignment, graph; iterations::Int = 10_000, - swap_rule::NodeSwapRule = RandomNodeSwap(), - accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = PreviousBestValue(1000), - progress_bar::Bool = false -) - # swap memory allocation - swap = make_swap(a, (1, 1)) - p = ProgressUnknown( - enabled = progress_bar, showspeed = true, desc = "Greedy search: ") - # perform local search until the stopping rule is met - for i in 1:iterations - local_search!( - a, graph, swap, swap_rule = swap_rule, accept_rule = accept_rule) - next!(p) - if stopping_rule(a, graph, stop_rule) - finish!(p) - break - end - end -end - -""" - local_search!(a::Assignment, graph, swap; swap_rule, accept_rule) - -Perform local search by trying a swap and accepting it if it improves the likelihood. - -# Arguments -- `a::Assignment`: The assignment of nodes to blocks. 
-- `graph`: The input graph. -- `swap`: The swap object. -- `swap_rule::NodeSwapRule`: Rule for swapping nodes. -- `accept_rule::AcceptRule`: Rule for accepting swaps. -""" -function local_search!( - a::Assignment, graph, swap::Swap = make_swap(a, (1, 1)); - swap_rule::NodeSwapRule = RandomNodeSwap(), - accept_rule::AcceptRule = Strict() -) - # select two nodes to swap and build the swap object - make_swap!(swap, a, select_swap(a, swap_rule)) - # perform the swap and accept it if it improves the likelihood - accept_reject_update!(a, swap, graph, accept_rule) -end diff --git a/src/old_messy/optimisation/swap.jl b/src/old_messy/optimisation/swap.jl deleted file mode 100644 index 69b1ff8..0000000 --- a/src/old_messy/optimisation/swap.jl +++ /dev/null @@ -1,26 +0,0 @@ -abstract type Swap end - -mutable struct DefaultSwap <: Swap - index1::Int - index2::Int -end - -function make_swap(::Assignment, id) - return DefaultSwap(id[1], id[2]) -end - -function make_swap!(swap::DefaultSwap, ::Assignment, id) - swap.index1, swap.index2 = id -end - -function apply_swap!(a::Assignment, s::DefaultSwap) - swap_node_labels!(a, s.index1, s.index2) -end - -function revert_swap!(assignment::Assignment, swap::DefaultSwap) - apply_swap!(assignment, swap) -end - -function swap_node_labels!(a::Assignment, i, j) - a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] -end diff --git a/src/old_messy/sbm.jl b/src/old_messy/sbm.jl deleted file mode 100644 index 1fad688..0000000 --- a/src/old_messy/sbm.jl +++ /dev/null @@ -1,186 +0,0 @@ -# TODO: remove BlockModel being a subtype of AbstractMatrix -# this was fun but useless and actually harmful - -struct BlockModel{T, K, F <: Real} <: AbstractMatrix{T} - sizes::Vector{F} - probs::SymmetricTensor{T, K, 2} -end - -function BlockModel( - θ::AbstractMatrix{T}, sizes::Vector{F}) where {T, F <: Real} - return BlockModel(sizes, - SymmetricTensor([θ[i, j] for i in 1:size(θ, 1) for j in i:size(θ, 2)], - Val(length(sizes)), Val(2))) -end 
- -function edge_type(::BlockModel{T, K, F}) where {T, K, F} - return eltype(T) -end - -function _check_sizes(sizes) - @assert sum(sizes)≈1 "Sizes must sum to 1, got $(sum(sizes))" - return sizes -end - -function _check_sizes(sizes::Vector{Int}) - return sizes ./ sum(sizes) -end - -function initialize_sbm(sizes::Vector, dist, k = length(sizes)) - sizes = _check_sizes(sizes) - n_dims = binomial(k + 1, 2) - probs = Vector{typeof(dist)}(undef, n_dims) - fill!(probs, dist) - return BlockModel(sizes, SymmetricTensor(probs, Val(k), Val(2))) -end - -function initialize_sbm(sizes::GroupSize, dist, k = length(sizes)) - size_bins = sizes ./ sum(sizes) - n_dims = binomial(k + 1, 2) - probs = Vector{typeof(dist)}(undef, n_dims) - fill!(probs, dist) - return BlockModel(size_bins, SymmetricTensor(probs, Val(k), Val(2))) -end - -function initialize_sbm(k::Int, dist) - return initialize_sbm(ones(k) / k, dist) -end - -number_blocks(::BlockModel{T, K, F}) where {T, K, F} = K - -Base.size(s::BlockModel) = size(s.probs) -Base.ndims(::BlockModel) = 2 -Base.eltype(::BlockModel{T, K, F}) where {T, K, F} = T -Base.setindex!(s::BlockModel, v, i, j) = setindex!(s.probs, v, i, j) -Base.@propagate_inbounds function Base.getindex(s::BlockModel, i, j) - return getindex(s.probs, i, j) -end - -function sample( - rng::Random.AbstractRNG, sbm::BlockModel, node_labels::Vector{Int}, sorted = false) - n_nodes = length(node_labels) - if sorted - sort!(node_labels) - end - A = zeros(edge_type(sbm), n_nodes, n_nodes) - for j in 1:n_nodes - for i in (j + 1):n_nodes - A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) - end - end - return sparse(Symmetric(A, :L)), node_labels -end - -function draw_and_fill!( - rng::Random.AbstractRNG, A, sbm::BlockModel, sorted = false) - n_blocks = number_blocks(sbm) - n_nodes = size(A, 1) - node_labels = StatsBase.sample( - rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) - if sorted - sort!(node_labels) - end - @inbounds for j in 
1:n_nodes - for i in (j + 1):n_nodes - A[i, j] = Random.rand(rng, sbm[node_labels[i], node_labels[j]]) - end - end - A .= Symmetric(A, :L) -end - -function draw_and_fill!(A, sbm, sorted = false) - draw_and_fill!(Random.default_rng(), A, sbm, sorted) -end - -function sample(sbm::BlockModel, node_labels::Vector{Int}, sorted = false) - sample(Random.default_rng(), sbm, node_labels, sorted) -end -function sample( - rng::Random.AbstractRNG, sbm::BlockModel, n_nodes::Int, sorted = false) - n_blocks = number_blocks(sbm) - node_labels = StatsBase.sample( - rng, 1:n_blocks, StatsBase.weights(sbm.sizes), n_nodes, replace = true) - if sorted - sort!(node_labels) - end - return sample(rng, sbm, node_labels) -end - -function sample(sbm::BlockModel, n_nodes::Int, sorted = false) - sample(Random.default_rng(), sbm, n_nodes, sorted) -end - -function get_probability_matrix(sbm::BlockModel, node_labels::Vector{Int}) - return sbm.probs[node_labels, node_labels] -end - -function _get_params_as_vec(dist::Distribution) - return vcat(params(dist)...) -end - -function latent_to_block_index(latents_vec, sbm::BlockModel) - cum_sum_sizes = cumsum(sbm.sizes) - cum_sum_sizes[end] = 1.0 - return [findfirst(x -> x >= l, cum_sum_sizes) for l in latents_vec] -end - -""" - best_alignment(fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) - -Find the best permutation of the blocks of `fitted_sbm` to match the blocks of `true_sbm` by -comparing the mean absolute difference of the parameters of the two models. -If the difference between the two models is less than `tol`, the function stops early. - -!!! warning - This function is not efficient for large numbers of blocks, as it uses brute force to - find the best permutation. 
-""" -function best_alignment( - fitted_sbm::BlockModel, true_sbm::BlockModel, tol = 0.01) - k = number_blocks(fitted_sbm) - if k != number_blocks(true_sbm) - throw(ArgumentError("The number of blocks must be the same for both models")) - end - best_perm = nothing - best_loss = Inf - fitted_params = _get_params_as_vec.(fitted_sbm) - true_params = _get_params_as_vec.(true_sbm) - for perm in permutations(1:k) - loss = sum(map(x -> sum(abs.(x)), fitted_params[perm] .- true_params)) - if loss < best_loss - best_loss = loss - best_perm = perm - end - if best_loss < tol - break - end - end - return best_perm -end - -function align_sbm!(sbm::BlockModel, perm) - sbm.probs .= sbm.probs[perm, perm] - sbm.sizes .= sbm.sizes[perm] -end - -""" - order_groups(a::Assignment, latents::AbstractVector) - -Order the groups of an assignment according to the true latents. This is an heuristic -approach, which is not guaranteed to find the true ordering of the groups. -""" -function order_groups(a::Assignment, latents::AbstractVector) - n = number_nodes(a) - k = number_groups(a) - sort_perm = sortperm(latents) - sorted_group_labels = a.node_labels[sort_perm] - dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] - counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) - for group in 1:k) - return sort( - 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) -end - -function align_sbm_true_latents!(sbm::BlockModel, a::Assignment, latents) - align_sbm!(sbm, order_groups(a, latents)) -end diff --git a/test/old_tests/TestNetworkHistogram.jl b/test/old_tests/TestNetworkHistogram.jl deleted file mode 100644 index 0018f70..0000000 --- a/test/old_tests/TestNetworkHistogram.jl +++ /dev/null @@ -1,30 +0,0 @@ -module TestNetworkHistogram - -import NetworkHistogram as NH -using Test - -function to_default_assignment(a_specialised::NH.Assignment{T, B}) where {T, B} - return NH.Assignment(a_specialised.group_size, a_specialised.node_labels) -end - 
-to_default_assignment(a::NH.Assignment{T, Nothing}) where {T} = a - -function test_swap_revertible( - a::NH.Assignment, swap::NH.Swap, g::NH.Observations) - a_test = deepcopy(a) - NH.apply_swap!(a_test, swap) - @test NH.get_group_of_vertex(a, swap.index1) == - NH.get_group_of_vertex(a_test, swap.index2) - @test NH.get_group_of_vertex(a, swap.index2) == - NH.get_group_of_vertex(a_test, swap.index1) - # force recomputation of the log likelihood using default assignment - a_new = to_default_assignment(a_test) - @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) - - # revert the swap and check if the assignment is the same as before - NH.revert_swap!(a_test, swap) - @test a == a_test - @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) -end - -end diff --git a/test/old_tests/assignments/bernoulli_assignment.jl b/test/old_tests/assignments/bernoulli_assignment.jl deleted file mode 100644 index d683e1a..0000000 --- a/test/old_tests/assignments/bernoulli_assignment.jl +++ /dev/null @@ -1,42 +0,0 @@ -import NetworkHistogram as NH - -@testset "test construction Bernoulli assignment" begin - using Distributions: Bernoulli - A = [0 1 1 1 0 0 1 0 - 1 0 1 1 0 0 0 0 - 1 1 0 0 0 0 0 0 - 1 1 0 0 0 0 0 1 - 0 0 0 0 0 1 1 1 - 0 0 0 0 1 0 1 1 - 1 0 0 0 1 1 0 0 - 0 0 0 1 1 1 0 0] - obs = NH.Observations(A, Bernoulli(0.5)) - node_labels = [1, 1, 1, 1, 2, 2, 2, 2] - group_size = NH.GroupSize(8, 4) - a = NH.BernoulliAssignment(obs, group_size, node_labels) - for i in 1:8 - @test NH.get_group_of_vertex(a, i) == node_labels[i] - end - @test all(a.additional_data.A .== A) - @test a.additional_data.realized == [5 2; 2 5] - @test a.additional_data.counts == [6 16; 16 6] - @test a.additional_data.estimated_theta == [5/6 1/8; 1/8 5/6] -end - -@testset "test Bernoulli swap" begin - using ..TestNetworkHistogram: test_swap_revertible - using Distributions: Bernoulli - A = [0 1 1 1 0 0 1 0 - 1 0 1 1 0 0 0 0 - 1 1 0 0 0 0 0 0 - 1 1 0 0 0 0 0 1 - 0 0 0 0 0 1 1 1 - 0 0 0 0 1 0 1 1 
- 1 0 0 0 1 1 0 0 - 0 0 0 1 1 1 0 0] - obs = NH.Observations(A, Bernoulli(0.5)) - a = NH.BernoulliAssignment( - obs, NH.GroupSize(8, 4), [1, 1, 1, 1, 2, 2, 2, 2]) - swap = NH.make_swap(a, (1, 2)) - test_swap_revertible(a, swap, obs) -end diff --git a/test/old_tests/assignments/categorical_assignment.jl b/test/old_tests/assignments/categorical_assignment.jl deleted file mode 100644 index bd4c4db..0000000 --- a/test/old_tests/assignments/categorical_assignment.jl +++ /dev/null @@ -1,126 +0,0 @@ -import NetworkHistogram as NH - -using Random - -@testset "test Categorical swap" begin - Random.seed!(1234123) - using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment - using Distributions: Categorical - using LinearAlgebra: Symmetric - import Random - m = 2 - p = ones(m) ./ m - n = 12 - k = 4 - dist = Categorical(p) - sbm = NH.initialize_sbm(ones(k) ./ k, dist) - node_labels = repeat(1:k, inner = n ÷ k) - A, _ = NH.sample(sbm, node_labels) - g = NH.Observations(collect(A), dist) - a = NH.CategoricalAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) - swap = NH.make_swap(a, (1, k + 1)) - @test A[:, 1] != A[:, k + 1] - a_test = deepcopy(a) - NH.apply_swap!(a_test, swap) - @test NH.get_group_of_vertex(a, swap.index1) == - NH.get_group_of_vertex(a_test, swap.index2) - @test NH.get_group_of_vertex(a, swap.index2) == - NH.get_group_of_vertex(a_test, swap.index1) - # force recomputation of the log likelihood using default assignment - a_new = to_default_assignment(a_test) - @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) - @test a_test.additional_data.realized != a.additional_data.realized - @test a_test.additional_data.estimated_theta != - a.additional_data.estimated_theta - @test a_test.additional_data.log_likelihood != - a.additional_data.log_likelihood - # revert the swap and check if the assignment is the same as before - NH.revert_swap!(a_test, swap) - @test a == a_test - @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) -end - 
-@testset "fast update test" begin - using Distributions - realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; - [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] - realized = [realized[I][k] - for k in eachindex(realized[1, 1]), - I in CartesianIndices(realized)] - counts = [1 4 4 - 4 1 4 - 4 4 1] - A = [0 1 2 2 3 3 - 1 0 2 2 3 3 - 2 2 0 1 3 3 - 2 2 1 0 3 3 - 3 3 3 3 0 1 - 3 3 3 3 1 0] - groupsize = NH.GroupSize(6, 2) - node_labels = [1, 1, 2, 2, 3, 3] - g = NH.Observations(A, Categorical(3)) - a = NH.CategoricalAssignment(g, groupsize, node_labels) - for index in eachindex(realized) - @test all(realized[index] .== a.additional_data.realized[index]) - end - @test loglikelihood(a, g) ≈ 0 - @test a.additional_data.counts == counts - swap_id = (1, 3) - ras = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; - [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] - realized_after_swap = [ras[I][k] - for k in eachindex(ras[1, 1]), - I in CartesianIndices(ras)] - - swap = NH.make_swap(a, swap_id) - NH.apply_swap!(a, swap) - for j in 1:3 - for i in 1:3 - @test all(realized_after_swap[:, i, j] .== - a.additional_data.realized[:, i, j]) - @test all(a.additional_data.estimated_theta[:, i, j] .≈ - realized_after_swap[:, i, j] ./ counts[i, j]) - end - end - @test loglikelihood(a, g) == 4 * log(0.5) -end - -#todo: test ll against categorical likelihood on basic assignment -@testset "test swap is not overwritten" begin - A = [0 4 4 2 1 2 2 3 4 2 3 1 4 1 1 3 4 4 3 3 - 4 0 4 2 4 2 1 1 1 3 3 1 1 1 3 3 4 2 1 4 - 4 4 0 1 2 4 2 2 1 3 2 3 1 2 3 2 3 4 1 1 - 2 2 1 0 2 1 2 2 2 3 1 1 3 3 3 3 3 1 1 2 - 1 4 2 2 0 4 1 4 3 2 4 3 4 3 1 3 1 1 1 3 - 2 2 4 1 4 0 2 3 1 3 1 4 3 3 1 3 1 3 3 3 - 2 1 2 2 1 2 0 3 2 2 1 1 1 3 3 1 1 3 1 1 - 3 1 2 2 4 3 3 0 4 3 2 3 1 1 1 1 1 3 2 1 - 4 1 1 2 3 1 2 4 0 3 1 1 1 3 2 1 3 1 4 1 - 2 3 3 3 2 3 2 3 3 0 1 3 1 1 3 1 3 1 1 4 - 3 3 2 1 4 1 1 2 1 1 0 2 3 2 2 1 2 2 1 3 - 1 1 3 1 3 4 1 3 1 3 2 0 4 4 2 2 2 3 1 1 - 4 1 1 3 4 3 1 1 1 1 3 4 0 
2 2 1 2 1 1 3 - 1 1 2 3 3 3 3 1 3 1 2 4 2 0 1 2 1 2 1 1 - 1 3 3 3 1 1 3 1 2 3 2 2 2 1 0 2 1 2 1 1 - 3 3 2 3 3 3 1 1 1 1 1 2 1 2 2 0 1 1 1 3 - 4 4 3 3 1 1 1 1 3 3 2 2 2 1 1 1 0 1 1 1 - 4 2 4 1 1 3 3 3 1 1 2 3 1 2 2 1 1 0 1 1 - 3 1 1 1 1 3 1 2 4 1 1 1 1 1 1 1 1 1 0 1 - 3 4 1 2 3 3 1 1 1 4 3 1 3 1 1 3 1 1 1 0] - g = NH.Observations(A, Categorical(4)) - h = 6 - a = NH.make_assignment( - g, h, NH.InitRule(NH.OrderedStart(), Val{NH.CategoricalData}())) - a_ref = deepcopy(a) - swap_indices = [(18, 5), (15, 10), (5, 13)] - swap = NH.make_swap(a, swap_indices[1]) - for swap_index in swap_indices - NH.make_swap!(swap, a, swap_index) - NH.apply_swap!(a, swap) - @test swap.realized == a_ref.additional_data.realized - @test swap.estimated_theta == a_ref.additional_data.estimated_theta - NH.revert_swap!(a, swap) - end -end diff --git a/test/old_tests/assignments/default_assignment.jl b/test/old_tests/assignments/default_assignment.jl deleted file mode 100644 index fefbf64..0000000 --- a/test/old_tests/assignments/default_assignment.jl +++ /dev/null @@ -1,17 +0,0 @@ -import NetworkHistogram as NH - -@testset "test default swap" begin - using ..TestNetworkHistogram: test_swap_revertible - import Random, LinearAlgebra - using Distributions: Bernoulli, Normal - Random.seed!(1234123) - n = 20 - k = 5 - #data = LinearAlgebra.Symmetric(Random.rand(Bool,n,n)) - data = Random.rand(Normal(), n, n) - g = NH.Observations(data, Normal(0, 1)) - labels = repeat(1:(n ÷ k), inner = k) - a = NH.Assignment(NH.GroupSize(n, k), labels) - swap = NH.DefaultSwap(1, 2) - test_swap_revertible(a, swap, g) -end diff --git a/test/old_tests/assignments/sparse_assignment.jl b/test/old_tests/assignments/sparse_assignment.jl deleted file mode 100644 index a7b56ac..0000000 --- a/test/old_tests/assignments/sparse_assignment.jl +++ /dev/null @@ -1,120 +0,0 @@ -import NetworkHistogram as NH - -using Random - -@testset "test sparse give the same as categorical" begin - using Distributions, LinearAlgebra, 
SparseArrays - k = 2 - m = 5 - level_count = 4 - n = 20 - tau = [0.8, 0.1, 0.1, 0.1, 0.1] - sbm = NH.initialize_sbm(ones(k) ./ k, Categorical(tau ./ sum(tau))) - A, _ = NH.sample(sbm, n) - A_dense = collect(A) - A = sparse(A_dense .- 1) - for i in 1:n - A[i, i] = 0 - end - g = NH.Observations(A_dense, Categorical(m)) - sbm_fitted, a = nethist(g; h = n ÷ k, iterations = 10) - sparse_a = NH.SparseAssignment( - NH.Observations(A, Categorical(m)), a.group_size, a.node_labels) - @test a.additional_data.counts == sparse_a.additional_data.counts - for (l, m_index) in enumerate(2:m) - @test a.additional_data.realized[m_index, :, :] == - sparse_a.additional_data.realized[l, :, :] - @test a.additional_data.estimated_theta[m_index, :, :] == - sparse_a.additional_data.estimated_theta[l, :, :] - end - @test a.additional_data.log_likelihood ≈ - sparse_a.additional_data.log_likelihood -end - -@testset "test sparse swap" begin - Random.seed!(1234123) - using ..TestNetworkHistogram: test_swap_revertible, to_default_assignment - using Distributions: DiscreteNonParametric - using LinearAlgebra: Symmetric - import Random - m = 4 - p = ones(m) ./ m - n = 12 - k = 4 - dist = NH.ZeroInflatedCategorical(p) - sbm = NH.initialize_sbm(ones(k) ./ k, dist) - node_labels = repeat(1:k, inner = n ÷ k) - A = sparse(first(NH.sample(sbm, node_labels))) - g = NH.Observations(A, dist) - a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) - swap = NH.make_swap(a, (1, k + 1)) - @test A[:, 1] != A[:, k + 1] - a_test = deepcopy(a) - NH.apply_swap!(a_test, swap) - @test NH.get_group_of_vertex(a, swap.index1) == - NH.get_group_of_vertex(a_test, swap.index2) - @test NH.get_group_of_vertex(a, swap.index2) == - NH.get_group_of_vertex(a_test, swap.index1) - # force recomputation of the log likelihood using default assignment - a_new = to_default_assignment(a_test) - @test NH.loglikelihood(a_new, g) ≈ NH.loglikelihood(a_test, g) - @test a_test.additional_data.realized != a.additional_data.realized - 
@test a_test.additional_data.estimated_theta != - a.additional_data.estimated_theta - @test a_test.additional_data.log_likelihood != - a.additional_data.log_likelihood - # revert the swap and check if the assignment is the same as before - NH.revert_swap!(a_test, swap) - @test a == a_test - @test NH.loglikelihood(a, g) ≈ NH.loglikelihood(a_test, g) -end - -@testset "fast sparse update test" begin - using Distributions - realized = [[[1, 0, 0]] [[0, 4, 0]] [[0, 0, 4]]; - [[0, 4, 0]] [[1, 0, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] - realized = [realized[I][k] - for k in eachindex(realized[1, 1]), - I in CartesianIndices(realized)] - counts = [1 4 4 - 4 1 4 - 4 4 1] - A = sparse([0 1 2 2 3 3 - 1 0 2 2 3 3 - 2 2 0 1 3 3 - 2 2 1 0 3 3 - 3 3 3 3 0 1 - 3 3 3 3 1 0]) - groupsize = NH.GroupSize(6, 2) - node_labels = [1, 1, 2, 2, 3, 3] - g = NH.Observations(A, Categorical(3)) - k = 3 - m = 3 - n = size(A, 1) - a = NH.SparseAssignment(g, NH.GroupSize(n, n ÷ k), node_labels) - for index in eachindex(realized) - @test all(realized[index] .== a.additional_data.realized[index]) - end - @test loglikelihood(a, g) ≈ 0 - @test a.additional_data.counts == counts - swap_id = (1, 3) - ras = [[[0, 1, 0]] [[2, 2, 0]] [[0, 0, 4]]; - [[2, 2, 0]] [[0, 1, 0]] [[0, 0, 4]]; - [[0, 0, 4]] [[0, 0, 4]] [[1, 0, 0]]] - realized_after_swap = [ras[I][k] - for k in eachindex(ras[1, 1]), - I in CartesianIndices(ras)] - - swap = NH.make_swap(a, swap_id) - NH.apply_swap!(a, swap) - for j in 1:3 - for i in 1:3 - @test all(realized_after_swap[:, i, j] .== - a.additional_data.realized[:, i, j]) - @test all(a.additional_data.estimated_theta[:, i, j] .≈ - realized_after_swap[:, i, j] ./ counts[i, j]) - end - end - @test loglikelihood(a, g) ≈ 4 * log(0.5) -end diff --git a/test/old_tests/assignments/sum_assignment.jl b/test/old_tests/assignments/sum_assignment.jl deleted file mode 100644 index 1a80856..0000000 --- a/test/old_tests/assignments/sum_assignment.jl +++ /dev/null @@ -1,9 +0,0 @@ 
-import NetworkHistogram as NH - -using Random - - -@testset "test sum assignment" begin - using Distributions, LinearAlgebra, SparseArrays - @test 1 == 2 -end diff --git a/test/old_tests/discretised_dist/discretizer.jl b/test/old_tests/discretised_dist/discretizer.jl deleted file mode 100644 index 17c90fa..0000000 --- a/test/old_tests/discretised_dist/discretizer.jl +++ /dev/null @@ -1,20 +0,0 @@ -using NetworkHistogram - -@testset "discretizer" begin - using StaticArrays - reg_disc = NetworkHistogram.RegularDiscretizer( - 10, 0.0, 1.0, MVector{10}(1:10), 1 / 10) - cat_disc = NetworkHistogram.CategoryDiscretizer( - Dict([0.0 => 11]), Dict([11 => 0.0])) - hybrid_disc = NetworkHistogram.HybridDiscretizer( - reg_disc, cat_disc) - - @test NetworkHistogram.encode(reg_disc, 0.0) == 1 - @test NetworkHistogram.encode(cat_disc, 0.0) == 11 - @test NetworkHistogram.encode(hybrid_disc, 0.0) == 11 - @test NetworkHistogram.decode(hybrid_disc, 11) == 0.0 - @test all(NetworkHistogram.encode(reg_disc, 0.001:0.001:1.0) .== - NetworkHistogram.encode(hybrid_disc, 0.001:0.001:1.0)) - @test all(NetworkHistogram.decode(hybrid_disc, 1:10) .== - NetworkHistogram.decode(reg_disc, 1:10)) -end diff --git a/test/old_tests/generated_tests/all.jl b/test/old_tests/generated_tests/all.jl deleted file mode 100644 index 3a6cd57..0000000 --- a/test/old_tests/generated_tests/all.jl +++ /dev/null @@ -1,2 +0,0 @@ -include("test_zero_inflated.jl") -include("test_distribution.jl") diff --git a/test/old_tests/generated_tests/test_distribution.jl b/test/old_tests/generated_tests/test_distribution.jl deleted file mode 100644 index 9389428..0000000 --- a/test/old_tests/generated_tests/test_distribution.jl +++ /dev/null @@ -1,84 +0,0 @@ -using NetworkHistogram: ZeroInflated, DiscretizedDistribution, - ZeroInflatedCategorical, - ncategories, Discretizer, encode, decode, binwidth, - RegularDiscretizer, - CategoryDiscretizer, HybridDiscretizer, - DiscretizerZeroToZero, nlabels -using Distributions -using Test - 
-@testset "ZeroInflated" begin - dist = ZeroInflated(0.3, truncated(Normal(0, 1), -3, 3)) - @test pdf(dist, 0) ≈ 0.3 + 0.7 * pdf(truncated(Normal(0, 1), -3, 3), 0) - @test pdf(dist, 1) ≈ 0.7 * pdf(truncated(Normal(0, 1), -3, 3), 1) - @test cdf(dist, 0) ≈ 0.3 + 0.7 * cdf(truncated(Normal(0, 1), -3, 3), 0) - @test cdf(dist, 1) ≈ 0.3 + 0.7 * cdf(truncated(Normal(0, 1), -3, 3), 1) -end - -@testset "DiscretizedDistribution" begin - dist = DiscretizedDistribution(truncated(Normal(0, 1), -3, 3), 10) - @test ncategories(dist) == 10 - @test pdf(dist, 0) >= 0 - @test cdf(dist, 0) >= 0 -end - -@testset "ZeroInflatedCategorical" begin - dist = ZeroInflatedCategorical(0.3, Categorical([0.2, 0.3, 0.5])) - @test pdf(dist, 0) ≈ 0.3 - @test pdf(dist, 1) ≈ 0.7 * 0.2 - @test cdf(dist, 0) ≈ 0.3 - @test cdf(dist, 1) ≈ 0.3 + 0.7 * 0.2 -end - -@testset "ZeroInflatedDiscretizedDistribution" begin - dist = ZeroInflated(0.3, truncated(Normal(0, 1), -3, 3)) - disc_dist = DiscretizedDistribution(dist, 10) - @test ncategories(disc_dist) == 10 - @test pdf(disc_dist, 0) >= 0 - @test cdf(disc_dist, 0) >= 0 -end - -@testset "DiscretizedZeroInflatedCategorical" begin - dist = ZeroInflatedCategorical(0.3, Categorical([0.2, 0.3, 0.5])) - disc_dist = DiscretizedDistribution(dist, 10) - @test ncategories(disc_dist) == 10 - @test pdf(disc_dist, 0) >= 0 - @test cdf(disc_dist, 0) >= 0 -end - -@testset "Discretizer" begin - using Distributions - disc = RegularDiscretizer(10, 0.0, 1.0) - @test encode(disc, 0.05) == 1 - @test decode(disc, 1) == (0.0, 0.1) - @test binwidth(disc) == 0.1 - @test nlabels(disc) == 10 -end - -@testset "CategoryDiscretizer" begin - cat_to_bin = Dict("a" => 1, "b" => 2, "c" => 3) - bin_to_cat = Dict(1 => "a", 2 => "b", 3 => "c") - disc = CategoryDiscretizer(cat_to_bin, bin_to_cat) - @test encode(disc, "a") == 1 - @test decode(disc, 1) == "a" - @test nlabels(disc) == 3 -end - -@testset "HybridDiscretizer" begin - atoms = [0.0, 1.0] - disc = HybridDiscretizer(10, -1.0, 1.0, atoms) - 
@test encode(disc, 0.0) == 11 - @test encode(disc, 0.5) == 8 - @test decode(disc, 11) == 0.0 - @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol = 1e-2)) - @test nlabels(disc) == 12 -end - -@testset "DiscretizerZeroToZero" begin - disc = DiscretizerZeroToZero(10, -1.0, 1.0) - @test encode(disc, 0.0) == 0 - @test encode(disc, 0.5) == 8 - @test decode(disc, 0) == 0.0 - @test all(isapprox.(decode(disc, 8), (0.4, 0.6); atol = 1e-2)) - @test nlabels(disc) == 11 -end diff --git a/test/old_tests/generated_tests/test_zero_inflated.jl b/test/old_tests/generated_tests/test_zero_inflated.jl deleted file mode 100644 index 380e80c..0000000 --- a/test/old_tests/generated_tests/test_zero_inflated.jl +++ /dev/null @@ -1,97 +0,0 @@ -using Test -using Distributions -using Random -using NetworkHistogram: ZeroInflated, get_proba_zero - -@testset "ZeroInflated Distribution Tests" begin - @testset "continuous distribution" begin - # Test construction - dist = Normal(0, 1) - zero_inflated_dist = ZeroInflated(0.5, dist) - @test zero_inflated_dist.edge_proba == Bernoulli(0.5) - @test zero_inflated_dist.dist == dist - - # Test pdf - @test pdf(zero_inflated_dist, 0) ≈ 0.5 + 0.5 * pdf(dist, 0) - @test pdf(zero_inflated_dist, 1) ≈ 0.5 * pdf(dist, 1) - - # Test get_proba_zero - @test get_proba_zero(zero_inflated_dist) == 0.5 - - # Test rand - rng = MersenneTwister(1234) - sample = rand(rng, zero_inflated_dist) - @test sample == 0 || insupport(dist, sample) - - # Test logpdf - @test logpdf(zero_inflated_dist, 0) ≈ log(0.5 * (1 + pdf(dist, 0))) - @test logpdf(zero_inflated_dist, 1) ≈ log(0.5 * pdf(dist, 1)) - - # Test minimum and maximum - @test minimum(zero_inflated_dist) == minimum(dist) - @test maximum(zero_inflated_dist) == maximum(dist) - - # Test insupport - @test insupport(zero_inflated_dist, 0) - @test insupport(zero_inflated_dist, 1) == insupport(dist, 1) - - # Test cdf - @test cdf(zero_inflated_dist, 0) ≈ 0.5 + 0.5 * cdf(dist, 0) - @test cdf(zero_inflated_dist, 1) ≈ 0.5 + 0.5 * 
cdf(dist, 1) - - # Test params - @test params(zero_inflated_dist) == (0.5, params(dist)...) - - # Test fit - data = [0, 0, 1, 2, 3] - fitted_dist = fit(ZeroInflated{Bernoulli, Normal}, data, 2) - @test fitted_dist.edge_proba == Bernoulli(0.6) - @test fitted_dist.dist isa Normal - end - - @testset "discrete distribution" begin - # Test construction with discrete distribution - dist_disc = Poisson(3) - zero_inflated_dist_disc = ZeroInflated(0.5, dist_disc) - @test zero_inflated_dist_disc.edge_proba == Bernoulli(0.5) - @test zero_inflated_dist_disc.dist == dist_disc - - # Test pdf with discrete distribution - @test pdf(zero_inflated_dist_disc, 0) ≈ 0.5 + 0.5 * pdf(dist_disc, 0) - @test pdf(zero_inflated_dist_disc, 1) ≈ 0.5 * pdf(dist_disc, 1) - - # Test get_proba_zero with discrete distribution - @test get_proba_zero(zero_inflated_dist_disc) == 0.5 - - # Test rand with discrete distribution - rng = MersenneTwister(1234) - sample_disc = rand(rng, zero_inflated_dist_disc) - @test sample_disc == 0 || insupport(dist_disc, sample_disc) - - # Test logpdf with discrete distribution - @test logpdf(zero_inflated_dist_disc, 0) ≈ - log(0.5 * (1 + pdf(dist_disc, 0))) - @test logpdf(zero_inflated_dist_disc, 1) ≈ log(0.5 * pdf(dist_disc, 1)) - - # Test minimum and maximum with discrete distribution - @test minimum(zero_inflated_dist_disc) == minimum(dist_disc) - @test maximum(zero_inflated_dist_disc) == maximum(dist_disc) - - # Test insupport with discrete distribution - @test insupport(zero_inflated_dist_disc, 0) - @test insupport(zero_inflated_dist_disc, 1) == insupport(dist_disc, 1) - - # Test cdf with discrete distribution - @test cdf(zero_inflated_dist_disc, 0) ≈ 0.5 + 0.5 * cdf(dist_disc, 0) - @test cdf(zero_inflated_dist_disc, 1) ≈ 0.5 + 0.5 * cdf(dist_disc, 1) - - # Test params with discrete distribution - @test params(zero_inflated_dist_disc) == (0.5, params(dist_disc)...) 
- - # Test fit with discrete distribution - data_disc = [0, 0, 1, 2, 3] - fitted_dist_disc = fit(ZeroInflated{Bernoulli, Poisson}, data_disc, 2) - @test fitted_dist_disc.edge_proba == Bernoulli(0.6) - @test fitted_dist_disc.dist isa Poisson - end -end diff --git a/test/old_tests/observations/discretisation.jl b/test/old_tests/observations/discretisation.jl deleted file mode 100644 index 49eb959..0000000 --- a/test/old_tests/observations/discretisation.jl +++ /dev/null @@ -1,15 +0,0 @@ -using NetworkHistogram - -@testset "discretisation" begin - using Distributions - A = rand(-1:1, 20, 20) - for i in 1:20 - A[i, i] = 0 - end - g = Observations(A, Uniform(-1, 1)) - discretised_g, discretizer = discretise(g; number_levels = 6) - @test size(discretised_g.graph) == size(g.graph) - @test discretised_g.dist_ref isa NetworkHistogram.DiscretizedDistribution - @test ncategories(discretised_g.dist_ref) == 6 - @test all(discretised_g.graph .∈ Ref(0:6)) -end diff --git a/test/old_tests/optimisation/config_rules/init_rule.jl b/test/old_tests/optimisation/config_rules/init_rule.jl deleted file mode 100644 index f304378..0000000 --- a/test/old_tests/optimisation/config_rules/init_rule.jl +++ /dev/null @@ -1,46 +0,0 @@ -import NetworkHistogram as NH - -@testset "regression test" begin - Random.seed!(1234123) - using Distributions: Bernoulli - A = BitMatrix([0 0 1 0 1 0 1 1 0 1 - 0 0 1 1 1 1 1 1 0 0 - 1 1 0 1 0 0 0 0 1 0 - 0 1 1 0 1 0 1 0 0 0 - 1 1 0 1 0 0 1 0 0 1 - 0 1 0 0 0 0 0 1 0 0 - 1 1 0 1 1 0 0 1 0 1 - 1 1 0 0 0 1 1 0 0 1 - 0 0 1 0 0 0 0 0 0 1 - 1 0 0 0 1 0 1 1 1 0]) - h_true_nethist = 2.643731 # version 0.2.3 from nethist package - k_true = 3 - obs = NH.Observations(A, Bernoulli(0.5)) - @testset "degrees" begin - k = NH.select_number_node_per_block(obs, NH.EstimatedDegrees()) - @test k == k_true - end - @testset "eigenvalues" begin - k = NH.select_number_node_per_block(obs, NH.EstimatedEigenvalues()) - @test k == k_true - end -end - -@testset "test oracle K" begin - 
Random.seed!(1234123) - using Distributions: Bernoulli - A = [0 1 1 1 0 0 1 0 - 1 0 1 1 0 0 0 0 - 1 1 0 0 0 0 0 0 - 1 1 0 0 0 0 0 1 - 0 0 0 0 0 1 1 1 - 0 0 0 0 1 0 1 1 - 1 0 0 0 1 1 0 0 - 0 0 0 1 1 1 0 0] - obs = NH.Observations(A, Bernoulli(0.5)) - oracle = NH.OracleH(4) - @test NH.select_number_node_per_block(obs, oracle) == 4 - err = ArgumentError("The number of nodes per block 5 is too large for the \ - number of nodes 8, it should be at most 4") - @test_throws err NH.select_number_node_per_block(obs, NH.OracleH(5)) -end diff --git a/test/old_tests/runtests.jl b/test/old_tests/runtests.jl deleted file mode 100644 index 8c1ee77..0000000 --- a/test/old_tests/runtests.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Test -using Aqua -using SparseArrays -include("TestNetworkHistogram.jl") - -@testset "Tests" begin - @testset "Discretizer tests" begin - include("discretised_dist/discretizer.jl") - end - @testset "Assignment tests" begin - include("assignments/default_assignment.jl") - include("assignments/bernoulli_assignment.jl") - include("assignments/categorical_assignment.jl") - include("assignments/sparse_assignment.jl") - include("assignments/sum_assignment.jl") - end - - @testset "Rule optimization tests" begin - include("optimisation/config_rules/init_rule.jl") - end - - @testset "Observations tests" begin - include("observations/discretisation.jl") - end - - @testset "API tests" begin - include("test_api.jl") - end - - @testset "Generated tests" begin - include("generated_tests/all.jl") - end - - # @testset "Aqua.jl for package quality" begin - # using NetworkHistogram - # Aqua.test_all(NetworkHistogram) - # end -end diff --git a/test/old_tests/test_api.jl b/test/old_tests/test_api.jl deleted file mode 100644 index 48fdefb..0000000 --- a/test/old_tests/test_api.jl +++ /dev/null @@ -1,19 +0,0 @@ -@testset "test api" begin - using Distributions - A = rand(-1:1, 40, 40) - for i in 1:40 - A[i, i] = 0 - end - - g = Observations(Symmetric(A), Uniform(-1, 1)) - 
sbm_fitted, a = nethist(g; h = 10, iterations = 10) - - @test eltype(sbm_fitted) == typeof(Uniform(-1, 1)) - @test size(sbm_fitted) == (4, 4) - - sbm_discretised, a, discretizer = nethist_discretised( - g; number_levels = 5, h = 10, iterations = 10) - @test sbm_discretised[1, 1] isa DiscretizedDistribution - @test ncategories(sbm_discretised[1, 1]) == 5 - @test size(sbm_discretised) == (4, 4) -end From aa46922a9447fcea01d8e5b836e7f51f488061e1 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 30 Apr 2025 17:06:47 +0200 Subject: [PATCH 140/266] correctly compute fast group ll update --- src/optimization/swap_workspace.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index ea2c0aa..40d4362 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -68,6 +68,7 @@ function _fast_ll_one_group(a::Assignment, g1, g2) for u in nodes_g1 for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 if v in nodes_g2 + if (g1 == g2 && u < v) || g1 != g2 ll += loglikelihood(d, e) end end From 594d50a81167ec239eb6e7f86240d4e26a8d8b88 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 30 Apr 2025 17:06:57 +0200 Subject: [PATCH 141/266] correctly compute fast group ll update --- src/optimization/swap_workspace.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 40d4362..4ed9404 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -69,7 +69,8 @@ function _fast_ll_one_group(a::Assignment, g1, g2) for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 if v in nodes_g2 if (g1 == g2 && u < v) || g1 != g2 - ll += loglikelihood(d, e) + ll += loglikelihood(d, e) + end end end end From f1f18b003c853998fb618f4b2f1ec7fee6079194 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 7 May 2025 12:22:43 +0200 Subject: 
[PATCH 142/266] decouple neighbour list and edge values --- src/EdgeList.jl | 32 +++++++------- src/NetworkHistogram.jl | 1 - src/include_old.jl | 67 ------------------------------ src/optimization/swap_workspace.jl | 1 - 4 files changed, 16 insertions(+), 85 deletions(-) delete mode 100644 src/include_old.jl diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 9b397f9..7658c80 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -1,18 +1,15 @@ struct EdgeList{E} - data::Vector{Vector{Tuple{Int,E}}} + data::Vector{Vector{E}} + name_list::Vector{Vector{Int}} end function neighbors(A::EdgeList{E}, i::Int) where {E} - return first.(A.data[i]), last.(A.data[i]) + return A.name_list[i], A.data[i] end -function iterate_neighbors(A::EdgeList{E}, i::Int) where {E} - return zip(first.(A.data[i]), last.(A.data[i])) -end -function edge_type(edgelist::EdgeList{E}) where {E} - return E -end +iterate_neighbors(A::EdgeList, i::Int) = zip(neighbors(A, i)...) +edge_type(A::EdgeList{E}) where {E} = E function nodes(edgelist::EdgeList{E}) where {E} return length(edgelist.data) @@ -21,16 +18,19 @@ end function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} n = size(A, 1) - data = Vector{Vector{Tuple{Int,E}}}(undef, n) + data = Vector{Vector{E}}(undef, n) + name_list = Vector{Vector{Int}}(undef, n) for j in 1:n data[j] = Vector{Tuple{Int,E}}(undef, 0) + name_list[j] = Vector{Int}(undef, 0) for i in 1:n if !ismissing(A[i,j]) - push!(data[j], (i, A[i, j])) + push!(name_list[j], i) + push!(data[j], A[i, j]) end end end - return EdgeList(data) + return EdgeList(data, name_list) end @@ -40,12 +40,12 @@ end function fit(d::Dist, A::EdgeList{E}) where {E} - new_data = Vector{Vector{Tuple{Int, typeof(d)}}}(undef, length(A.data)) + new_data = Vector{Vector{typeof(d)}}(undef, length(A.data)) for j in 1:length(A.data) - new_data[j] = Vector{Tuple{Int, typeof(d)}}(undef, length(A.data[j])) - for (k,(i, e)) in enumerate(A.data[j]) - new_data[j][k] = (i, fit(d, e)) + new_data[j] = 
Vector{typeof(d)}(undef, length(A.data[j])) + for (k,e) in enumerate(A.data[j]) + new_data[j][k] = fit(d, e) end end - return EdgeList(new_data) + return EdgeList(new_data, A.name_list) end diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index fc172f3..45c074d 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -13,6 +13,5 @@ include("optimization/greedy.jl") export EdgeList, neighbors, nodes -#include("include_old.jl") end diff --git a/src/include_old.jl b/src/include_old.jl deleted file mode 100644 index beda772..0000000 --- a/src/include_old.jl +++ /dev/null @@ -1,67 +0,0 @@ -using LinearAlgebra, SparseArrays -using Distributions, DensityInterface -using Graphs, SimpleWeightedGraphs -using PermutationSymmetricTensors -using ProgressMeter: Progress, next!, finish!, ProgressUnknown -import StatsBase, Random -using DensityInterface: logdensityof -using LogExpFunctions: xlogx, xlogy -using ArnoldiMethod: LM, SR, LR, partialschur, partialeigen -using KrylovKit: eigsolve -import Metis -import IterativeSolvers -import Clustering -import StatsAPI: loglikelihood, fit -using CategoricalArrays, CategoricalDistributions -using Combinatorics: permutations -using StaticArrays -using Bootstrap: BootstrapSampling, ParametricBootstrapSample, tx, nrun, - zeros_tuple -import Bootstrap: bootstrap -import Base.maximum, Base.minimum -import Random: rand -import Base.convert -import Distributions: pdf, logpdf, ncategories, cdf, rand - - -include("old_messy/distributions/include.jl") -include("old_messy/assignments/Assignments.jl") -include("old_messy/sbm.jl") -include("old_messy/observations.jl") -include("old_messy/optimisation/include.jl") - -# more specialised and faster assignment types and methods -include("old_messy/assignments/include.jl") - -include("old_messy/api.jl") -include("old_messy/bootstrap.jl") - -export nethist, nethist_discretised -export loglikelihood, fit, cdf, pdf - -# export options for optimisation -export estimate_graphon -# 
starting assignment rules -export InitRule -export OrderedStart, RandomStart, SpectralStart, MetisStart, FromAssignment -# accept rules -export AcceptRule -export Strict -# stopping rules -export PreviousBestValue -# bandwidth selection rules -export OracleK, EstimatedEigenvalues, EstimatedDegrees, - select_number_node_per_block -# random local search rules -export RandomNodeSwap, RandomGroupSwap - -# export useful function for manipulating assignments -export Assignment, number_groups, number_nodes -export get_ordered_adjacency_matrix, get_vertex_in_group, get_group_of_vertex -export BernoulliData, CategoricalData -export Observations, discretise -export DiscretizedDistribution - -export Observations, estimate_graphon, nethist, nethist_discretised - -export bootstrap diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 4ed9404..db473dd 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -44,7 +44,6 @@ function apply_swap!(a::Assignment, s::Swap) push!(groups_concerned, minmax(g_old, g_v)) end end - println("Groups concerned: ", groups_concerned) fast_ll_update!(a, groups_concerned) swap_node_labels!(a, s.u, s.v) From 5ca238ccbb107e2b28dd41e67bb95045b9075316 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 8 May 2025 11:06:31 +0200 Subject: [PATCH 143/266] try fixing ll --- src/EdgeList.jl | 13 ++++------- src/NetworkHistogram.jl | 7 ++++-- src/api.jl | 6 ++--- src/assignment.jl | 3 +++ src/block_model.jl | 9 +++++++- src/distributions_type.jl | 3 ++- src/optimization/config_rules/accept_rule.jl | 3 +++ src/optimization/config_rules/swap_rule.jl | 4 ++-- src/optimization/greedy.jl | 2 +- src/optimization/swap_workspace.jl | 23 ++++---------------- test/Project.toml | 1 + test/runtests.jl | 1 + test/test_data_format.jl | 5 ----- 13 files changed, 37 insertions(+), 43 deletions(-) diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 7658c80..8f2b8c5 100644 --- a/src/EdgeList.jl +++ 
b/src/EdgeList.jl @@ -10,11 +10,8 @@ end iterate_neighbors(A::EdgeList, i::Int) = zip(neighbors(A, i)...) edge_type(A::EdgeList{E}) where {E} = E - -function nodes(edgelist::EdgeList{E}) where {E} - return length(edgelist.data) -end - +nodes(edgelist::EdgeList) = length(edgelist.data) +number_nodes(edgelist::EdgeList) = nodes(edgelist) function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} n = size(A, 1) @@ -24,7 +21,7 @@ function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} data[j] = Vector{Tuple{Int,E}}(undef, 0) name_list[j] = Vector{Int}(undef, 0) for i in 1:n - if !ismissing(A[i,j]) + if !ismissing(A[i,j]) # gonna be an issue with MC! have to define 0 chain and fast operations on them push!(name_list[j], i) push!(data[j], A[i, j]) end @@ -34,9 +31,7 @@ function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} end -function Base.convert(::Type{EdgeList{E}}, A::AbstractMatrix{E}) where {E} - return EdgeList(A) -end +convert(::Type{EdgeList}, A::AbstractMatrix) = EdgeList(A) function fit(d::Dist, A::EdgeList{E}) where {E} diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 45c074d..3756509 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -1,6 +1,9 @@ module NetworkHistogram using StatsBase using StaticArrays +using ProgressMeter +import StatsAPI: loglikelihood +import Base: convert include("utils/include.jl") using .FastSymArray @@ -10,8 +13,8 @@ include("block_model.jl") include("EdgeList.jl") include("assignment.jl") include("optimization/greedy.jl") +include("api.jl") - -export EdgeList, neighbors, nodes +export EdgeList, neighbors, nodes, loglikelihood end diff --git a/src/api.jl b/src/api.jl index 99cd71e..cedcace 100644 --- a/src/api.jl +++ b/src/api.jl @@ -11,12 +11,12 @@ end function preprocess_data(data, dist) - A = _fast_compressed_g.(dist, data) + A = EdgeList(_fast_compressed_obs.(dist, data)) return A, dist end function postprocess(out) - return true - return 
BlockModel(optimal_a) + return out + return BlockModel(out) end diff --git a/src/assignment.jl b/src/assignment.jl index 603197e..40c76c8 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -46,6 +46,9 @@ mutable struct Assignment{E, D, F} log_likelihood::SymArray{F} end +number_nodes(a::Assignment) = length(a.node_labels) +number_groups(a::Assignment) = size(a.θ, 1) + function loglikelihood(a::Assignment) return sum(a.log_likelihood) end diff --git a/src/block_model.jl b/src/block_model.jl index 9cb2e79..c3a875e 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -13,7 +13,14 @@ end function sample(bm::BlockModel, latents::Vector{T}) where {T} - #fuck need the element type of the distribution... + A = Array{eltype(bm[1,1]), 2}(undef, length(latents), length(latents)) .* zero(eltype(bm[1,1])) + for j in 1:length(latents) + for i in j+1:length(latents) + A[i, j] = sample(bm[latents[i], latents[j]]) + A[j, i] = A[i, j] + end + end + return A end diff --git a/src/distributions_type.jl b/src/distributions_type.jl index 1f45485..a5f61ae 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -47,7 +47,8 @@ end agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) = Bernoulli(w1 * d1.p + w2 * d2.p) fit(::Bernoulli, x) = Bernoulli(mean(x)) -sample(d::Bernoulli, n=1) = rand(n) .<= d.p dist(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) params(d::Bernoulli) = (d.p,) +eltype(d::Bernoulli) = Bool +sample(d::Bernoulli) = Bool(rand() .<= d.p) diff --git a/src/optimization/config_rules/accept_rule.jl b/src/optimization/config_rules/accept_rule.jl index 94ce206..caa3d70 100644 --- a/src/optimization/config_rules/accept_rule.jl +++ b/src/optimization/config_rules/accept_rule.jl @@ -17,6 +17,9 @@ function accept_reject_update!(a::Assignment, swap::Swap, ::Strict) current_score = loglikelihood(a) apply_swap!(a, swap) if loglikelihood(a) <= current_score + @debug "Rejecting swap: $(swap.u) <-> 
$(swap.v), score: $(loglikelihood(a)), current score: $current_score" revert_swap!(a, swap) + else + @debug "Accepting swap: $(swap.u) <-> $(swap.v), score: $(loglikelihood(a))" end end diff --git a/src/optimization/config_rules/swap_rule.jl b/src/optimization/config_rules/swap_rule.jl index 63cb5b0..5f5a0f3 100644 --- a/src/optimization/config_rules/swap_rule.jl +++ b/src/optimization/config_rules/swap_rule.jl @@ -21,7 +21,7 @@ end function select_indices_swap(assignment::Assignment, ::RandomGroupSwap) groups = StatsBase.sample( 1:number_groups(assignment), 2; replace = false) - index1 = rand(get_vertex_in_group(assignment, groups[1])) - index2 = rand(get_vertex_in_group(assignment, groups[2])) + index1 = rand(findall(x -> x == groups[1], assignment.node_labels)) + index2 = rand(findall(x -> x == groups[2], assignment.node_labels)) return (index1, index2) end diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index 8d4ec9b..3919b23 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -9,7 +9,7 @@ mutable struct GreedyParams progress_bar::Bool end -GreedyParams() = GreedyParams(10_000, RandomNodeSwap(), Strict(), PreviousBestValue(1000), false) +GreedyParams() = GreedyParams(10_000, RandomGroupSwap(), Strict(), PreviousBestValue(1000), true) function greedy_optimize(g, initial_labels, params::GreedyParams) a = Assignment(initial_labels, g...) 
diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index db473dd..cace5aa 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -44,6 +44,7 @@ function apply_swap!(a::Assignment, s::Swap) push!(groups_concerned, minmax(g_old, g_v)) end end + @show a.θ fast_ll_update!(a, groups_concerned) swap_node_labels!(a, s.u, s.v) @@ -54,24 +55,8 @@ end function fast_ll_update!(a, groups_concerned) for g in groups_concerned - a.log_likelihood[g[1], g[2]] = _fast_ll_one_group(a, g[1], g[2]) + # Use get_edges_in_groups to get the correct set of edges + edges = get_edges_in_groups(a, g[1], g[2]) + a.log_likelihood[g[1], g[2]] = loglikelihood(a.θ[g[1], g[2]], edges) end end - - -function _fast_ll_one_group(a::Assignment, g1, g2) - nodes_g1 = findall(x -> x == g1, a.node_labels) - nodes_g2 = findall(x -> x == g2, a.node_labels) - ll = 0.0 - d = a.θ[g1, g2] - for u in nodes_g1 - for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 - if v in nodes_g2 - if (g1 == g2 && u < v) || g1 != g2 - ll += loglikelihood(d, e) - end - end - end - end - return ll -end diff --git a/test/Project.toml b/test/Project.toml index a7b675e..3b6cd81 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -6,4 +6,5 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 5dd564b..f90eb13 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,4 +5,5 @@ using NetworkHistogram include("test_data_format.jl") include("test_distributions_type.jl") + include("test_swap_workspace.jl") end diff --git a/test/test_data_format.jl b/test/test_data_format.jl index 6bcdbf6..7cf4fc5 100644 --- a/test/test_data_format.jl +++ 
b/test/test_data_format.jl @@ -7,13 +7,8 @@ for j in 1:20 nv_j, val_j = neighbors(edgelist, j) for i in 1:20 - if A[i,j] == 0 - @test i ∉ nv_j - end - if A[i,j] != 0 @test i in nv_j @test A[i,j] == val_j[findfirst(x -> x == i, nv_j)] - end end end From 844fb0c30e528a5a4f3cb616816347856f36eafa Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 8 May 2025 12:30:07 +0200 Subject: [PATCH 144/266] fix ll with agent --- src/NetworkHistogram.jl | 2 +- src/assignment.jl | 24 ++++++----------- src/optimization/swap_workspace.jl | 41 ++++++++++++++++++------------ src/utils/SymArray.jl | 7 ++++- test/runtests.jl | 1 + 5 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 3756509..11f9ae9 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -3,7 +3,7 @@ using StatsBase using StaticArrays using ProgressMeter import StatsAPI: loglikelihood -import Base: convert +import Base: convert, eltype include("utils/include.jl") using .FastSymArray diff --git a/src/assignment.jl b/src/assignment.jl index 40c76c8..1ed08a3 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -62,23 +62,15 @@ function get_edges_in_groups(a::Assignment, g1::Int, g2::Int) end function get_edges_in_groups(node_labels, edges_all, g1, g2) + + edges = Vector{edge_type(edges_all)}() nodes_g1 = findall(x -> x == g1, node_labels) - edges = Vector{edge_type(edges_all)}(undef, 0) - if g1 == g2 - for u in nodes_g1 - for (v, e) in iterate_neighbors(edges_all, u) - if v in nodes_g1 && u < v - push!(edges, e) - end - end - end - else - nodes_g2 = findall(x -> x == g2, node_labels) - for u in nodes_g1 - for (v, e) in iterate_neighbors(edges_all, u) - if v in nodes_g2 - push!(edges, e) - end + nodes_g2 = findall(x -> x == g2, node_labels) + + for u in nodes_g1 + for (v, e) in iterate_neighbors(edges_all, u) + if v in nodes_g2 && ((g1 == g2 && u < v) || g1 != g2) + push!(edges, e) end end end diff --git a/src/optimization/swap_workspace.jl 
b/src/optimization/swap_workspace.jl index cace5aa..7c403af 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -21,7 +21,9 @@ function make_swap!(swap::Swap, a::Assignment, id) end function revert_swap!(assignment::Assignment, swap::Swap) - apply_swap!(assignment, swap) + # swap labels back to original + swap_node_labels!(assignment, swap.u, swap.v) + # restore saved θ and log likelihoods assignment.θ = deepcopy(swap.workspace.θ) assignment.log_likelihood = deepcopy(swap.workspace.log_likelihood_per_group) end @@ -30,24 +32,31 @@ function swap_node_labels!(a::Assignment, i, j) a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] end + function apply_swap!(a::Assignment, s::Swap) - g1 = group(a, s.u) - g2 = group(a, s.v) - groups_concerned = Set([minmax(g1, g2)]) - for (u, g_old, g_new) in [(s.u, g1, g2), (s.v, g2, g1)] - # iterate over neighbors of u and get the decoration of the edge - for (v,d) in iterate_neighbors(a.dists, u) - g_v = group(a, v) - a.θ[g_old, g_v] = remove_from(a.θ[g_old, g_v], d) - a.θ[g_new, g_v] = add_to(a.θ[g_new, g_v], d) - push!(groups_concerned, minmax(g_new, g_v)) - push!(groups_concerned, minmax(g_old, g_v)) + # swap node labels + swap_node_labels!(a, s.u, s.v) + # fully rebuild θ and log_likelihood based on new labels + k = size(a.θ, 1) + # initial distribution template and zero-likelihood + base_dist = a.θ[1, 1] + a.θ = SymArray(k, base_dist) + a.log_likelihood = SymArray(k, zero(eltype(a.log_likelihood))) + # accumulate edge contributions + for u in 1:length(a.node_labels) + g_u = group(a, u) + for (v, d) in iterate_neighbors(a.dists, u) + if u < v + g_v = group(a, v) + a.θ[g_u, g_v] = add_to(a.θ[g_u, g_v], d) + end end end - @show a.θ - fast_ll_update!(a, groups_concerned) - - swap_node_labels!(a, s.u, s.v) + # recompute log likelihoods for all group pairs + for g1 in 1:k, g2 in g1:k + edges = get_edges_in_groups(a, g1, g2) + a.log_likelihood[g1, g2] = loglikelihood(a.θ[g1, 
g2], edges) + end end diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 09155e1..a8c7079 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -1,6 +1,7 @@ module FastSymArray - export SymArray + import Base: eltype + export SymArray, eltype mutable struct SymArray{F} <: AbstractArray{F, 2} d::Dict{Tuple{Int, Int}, F} @@ -31,4 +32,8 @@ module FastSymArray function Base.sum(a::SymArray) return sum(values(a.d)) end + + function eltype(a::SymArray{F}) where {F} + return F + end end diff --git a/test/runtests.jl b/test/runtests.jl index f90eb13..d9d6d64 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,4 +6,5 @@ using NetworkHistogram include("test_data_format.jl") include("test_distributions_type.jl") include("test_swap_workspace.jl") + include("test_get_edges_in_groups.jl") end From 4ba2caa8cdee6a8852915aacc7b19e0ec0ed64e7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 12 May 2025 11:18:24 +0200 Subject: [PATCH 145/266] fix swap with slow rebuild --- Project.toml | 2 + src/NetworkHistogram.jl | 4 +- src/assignment.jl | 2 +- src/block_model.jl | 6 +++ src/distributions_type.jl | 9 ++-- src/optimization/config_rules/accept_rule.jl | 3 -- src/optimization/greedy.jl | 4 +- src/optimization/swap_workspace.jl | 45 +++++++++++--------- 8 files changed, 43 insertions(+), 32 deletions(-) diff --git a/Project.toml b/Project.toml index 2d93d76..29a587a 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.5.2" [deps] ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" @@ -28,6 +29,7 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] +CairoMakie = "0.13.4" KrylovKit = "0.9.5" 
julia = "1.11" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 11f9ae9..aae12f3 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -3,7 +3,7 @@ using StatsBase using StaticArrays using ProgressMeter import StatsAPI: loglikelihood -import Base: convert, eltype +import Base: convert, eltype, zero include("utils/include.jl") using .FastSymArray @@ -15,6 +15,6 @@ include("assignment.jl") include("optimization/greedy.jl") include("api.jl") -export EdgeList, neighbors, nodes, loglikelihood +export EdgeList, neighbors, nodes, loglikelihood, zero end diff --git a/src/assignment.jl b/src/assignment.jl index 1ed08a3..b5889d6 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -80,7 +80,7 @@ end function Assignment(node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} dists = fit(dist, edge_list) number_groups = length(unique(node_labels)) - θ = SymArray(number_groups, dist) + θ = SymArray(number_groups, zero(dist)) log_likelihood = SymArray(number_groups, 0.0) for u in 1:nodes(dists) g1 = node_labels[u] diff --git a/src/block_model.jl b/src/block_model.jl index c3a875e..fedc7aa 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -15,7 +15,13 @@ end function sample(bm::BlockModel, latents::Vector{T}) where {T} A = Array{eltype(bm[1,1]), 2}(undef, length(latents), length(latents)) .* zero(eltype(bm[1,1])) for j in 1:length(latents) + for i in 1:j-1 + A[i, j] = A[j, i] + end for i in j+1:length(latents) + # println("i: ", i, " j: ", j) + # println("latents[i]: ", latents[i], " latents[j]: ", latents[j]) + # println("bm[latents[i], latents[j]]: ", bm[latents[i], latents[j]]) A[i, j] = sample(bm[latents[i], latents[j]]) A[j, i] = A[i, j] end diff --git a/src/distributions_type.jl b/src/distributions_type.jl index a5f61ae..7a3136a 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -4,6 +4,7 @@ struct Dist{D} end Dist(d) = Dist(d, 1) +zero(d::Dist) = Dist(zero(d.dist),0) 
Base.broadcastable(x::Dist) = Ref(x) @@ -13,8 +14,8 @@ end function remove_from(avgdist::Dist{D}, dist::D) where {D} - if avgdist.counts == 1 - error("Cannot remove from a distribution with only one sample") + if avgdist.counts <= 1 + error("Cannot remove from a distribution with strictly less than 2 counts") else return Dist(agg_params(avgdist.dist, dist, avgdist.counts / (avgdist.counts - 1), - 1 / (avgdist.counts - 1)), avgdist.counts -1) end @@ -44,11 +45,13 @@ struct Bernoulli{T<:Real} p::T end +#zero(::Type{Bernoulli{T}}) where {T} = Bernoulli(zero(T)) +zero(d::Bernoulli) = Bernoulli(zero(d.p)) agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) = Bernoulli(w1 * d1.p + w2 * d2.p) fit(::Bernoulli, x) = Bernoulli(mean(x)) dist(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) params(d::Bernoulli) = (d.p,) eltype(d::Bernoulli) = Bool -sample(d::Bernoulli) = Bool(rand() .<= d.p) +sample(d::Bernoulli) = Bool(rand() <= d.p) diff --git a/src/optimization/config_rules/accept_rule.jl b/src/optimization/config_rules/accept_rule.jl index caa3d70..94ce206 100644 --- a/src/optimization/config_rules/accept_rule.jl +++ b/src/optimization/config_rules/accept_rule.jl @@ -17,9 +17,6 @@ function accept_reject_update!(a::Assignment, swap::Swap, ::Strict) current_score = loglikelihood(a) apply_swap!(a, swap) if loglikelihood(a) <= current_score - @debug "Rejecting swap: $(swap.u) <-> $(swap.v), score: $(loglikelihood(a)), current score: $current_score" revert_swap!(a, swap) - else - @debug "Accepting swap: $(swap.u) <-> $(swap.v), score: $(loglikelihood(a))" end end diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index 3919b23..339607f 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -9,7 +9,7 @@ mutable struct GreedyParams progress_bar::Bool end -GreedyParams() = GreedyParams(10_000, RandomGroupSwap(), Strict(), PreviousBestValue(1000), true) +GreedyParams() = 
GreedyParams(100_000, RandomGroupSwap(), Strict(), PreviousBestValue(10_000), true) function greedy_optimize(g, initial_labels, params::GreedyParams) a = Assignment(initial_labels, g...) @@ -27,7 +27,7 @@ function greedy_improve!(a::Assignment; params = GreedyParams()) for i in 1:params.max_iter local_search!(a, swap, params) - next!(p) + next!(p; showvalues = [("ll: ",sum(a.log_likelihood))]) if stopping_rule(a, params.stop_rule) finish!(p) break diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 7c403af..de90678 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -36,27 +36,30 @@ end function apply_swap!(a::Assignment, s::Swap) # swap node labels swap_node_labels!(a, s.u, s.v) - # fully rebuild θ and log_likelihood based on new labels - k = size(a.θ, 1) - # initial distribution template and zero-likelihood - base_dist = a.θ[1, 1] - a.θ = SymArray(k, base_dist) - a.log_likelihood = SymArray(k, zero(eltype(a.log_likelihood))) - # accumulate edge contributions - for u in 1:length(a.node_labels) - g_u = group(a, u) - for (v, d) in iterate_neighbors(a.dists, u) - if u < v - g_v = group(a, v) - a.θ[g_u, g_v] = add_to(a.θ[g_u, g_v], d) - end - end - end - # recompute log likelihoods for all group pairs - for g1 in 1:k, g2 in g1:k - edges = get_edges_in_groups(a, g1, g2) - a.log_likelihood[g1, g2] = loglikelihood(a.θ[g1, g2], edges) - end + new_assignment = Assignment(a.node_labels, a.edges, a.θ[1,1]) + a.θ = new_assignment.θ + a.log_likelihood = new_assignment.log_likelihood + # # fully rebuild θ and log_likelihood based on new labels + # k = size(a.θ, 1) + # # initial distribution template and zero-likelihood + # base_dist = a.θ[1, 1] + # a.θ = SymArray(k, base_dist) + # a.log_likelihood = SymArray(k, zero(eltype(a.log_likelihood))) + # # accumulate edge contributions + # for u in 1:length(a.node_labels) + # g_u = group(a, u) + # for (v, d) in iterate_neighbors(a.dists, u) + # if u < v + # 
g_v = group(a, v) + # a.θ[g_u, g_v] = add_to(a.θ[g_u, g_v], d) + # end + # end + # end + # # recompute log likelihoods for all group pairs + # for g1 in 1:k, g2 in g1:k + # edges = get_edges_in_groups(a, g1, g2) + # a.log_likelihood[g1, g2] = loglikelihood(a.θ[g1, g2], edges) + # end end From c4e3c91f78b55111380f1c725c26fe5da4c0f3cb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 12 May 2025 11:36:05 +0200 Subject: [PATCH 146/266] make shift broadcast --- src/EdgeList.jl | 44 +++++++++++++++++++++++++++--- src/api.jl | 4 +-- src/distributions_type.jl | 5 ++-- src/optimization/swap_workspace.jl | 1 - 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 8f2b8c5..ed24424 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -13,16 +13,38 @@ edge_type(A::EdgeList{E}) where {E} = E nodes(edgelist::EdgeList) = length(edgelist.data) number_nodes(edgelist::EdgeList) = nodes(edgelist) -function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} +EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} = _from_adj_to_edge_list(A) + +# function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} +# n = size(A, 1) +# data = Vector{Vector{E}}(undef, n) +# name_list = Vector{Vector{Int}}(undef, n) +# for j in 1:n +# data[j] = Vector{E}(undef, 0) +# name_list[j] = Vector{Int}(undef, 0) +# for i in 1:n +# if !ismissing(A[i,j]) # gonna be an issue with MC! 
have to define 0 chain and fast operations on them +# push!(name_list[j], i) +# push!(data[j], A[i, j]) +# end +# end +# end +# return EdgeList(data, name_list) +# end + + + +function _from_adj_to_edge_list(A::AbstractMatrix, function_to_apply = identity) n = size(A, 1) - data = Vector{Vector{E}}(undef, n) + test = function_to_apply(A[1,1]) + data = Vector{Vector{typeof(test)}}(undef, n) name_list = Vector{Vector{Int}}(undef, n) for j in 1:n - data[j] = Vector{Tuple{Int,E}}(undef, 0) + data[j] = Vector{typeof(test)}(undef, 0) name_list[j] = Vector{Int}(undef, 0) for i in 1:n if !ismissing(A[i,j]) # gonna be an issue with MC! have to define 0 chain and fast operations on them - push!(name_list[j], i) + push!(name_list[j], function_to_apply(i)) push!(data[j], A[i, j]) end end @@ -30,6 +52,20 @@ function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} return EdgeList(data, name_list) end +_fast_compressed_obs(d::Dist, A::AbstractMatrix) = _from_adj_to_edge_list(A, x -> _fast_compressed_obs(d, x)) +_fast_compressed_obs(d::Dist, A::EdgeList{E}) where {E} = _make_shift_broadcast(A.data, x -> _fast_compressed_obs(d, x)) + +function _make_shift_broadcast(A::EdgeList, f) + # may work ? 
-> data = f.(A.data) + n = length(A.data) + test = f(A.data[1][1]) + data = Vector{Vector{typeof(test)}}(undef, n) + for j in 1:n + data[j] = f.(A.data[j]) + end + return EdgeList(data, A.name_list) +end + convert(::Type{EdgeList}, A::AbstractMatrix) = EdgeList(A) diff --git a/src/api.jl b/src/api.jl index cedcace..f1cb2d7 100644 --- a/src/api.jl +++ b/src/api.jl @@ -10,8 +10,8 @@ function nethist(data_input, dist_user, initial_node_labels, params::GreedyParam end -function preprocess_data(data, dist) - A = EdgeList(_fast_compressed_obs.(dist, data)) +function preprocess_data(data, dist::Dist) + A = EdgeList(_fast_compressed_obs(dist, data)) return A, dist end diff --git a/src/distributions_type.jl b/src/distributions_type.jl index 7a3136a..c6ad7d9 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -32,12 +32,13 @@ end fit(d::Dist, x) = Dist(fit(d.dist, x), d.counts) loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) +unwrap(d::Dist) = d.dist + # expose compression step that assumes there is a pdf(d, typeof(compressed(x))) properly defined # by default do nothing _fast_compressed_obs(d, x) = x -unwrap(d::Dist) = d.dist # Bernoulli distribution @@ -45,7 +46,6 @@ struct Bernoulli{T<:Real} p::T end -#zero(::Type{Bernoulli{T}}) where {T} = Bernoulli(zero(T)) zero(d::Bernoulli) = Bernoulli(zero(d.p)) agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) = Bernoulli(w1 * d1.p + w2 * d2.p) @@ -55,3 +55,4 @@ logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) params(d::Bernoulli) = (d.p,) eltype(d::Bernoulli) = Bool sample(d::Bernoulli) = Bool(rand() <= d.p) +_fast_compressed_obs(d::Bernoulli, x) = x diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index de90678..9300514 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -39,7 +39,6 @@ function apply_swap!(a::Assignment, s::Swap) new_assignment = Assignment(a.node_labels, a.edges, a.θ[1,1]) a.θ = new_assignment.θ 
a.log_likelihood = new_assignment.log_likelihood - # # fully rebuild θ and log_likelihood based on new labels # k = size(a.θ, 1) # # initial distribution template and zero-likelihood # base_dist = a.θ[1, 1] From 1bc74ffb38572da10076a3d78a04ee5529d5e732 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 12 May 2025 11:36:14 +0200 Subject: [PATCH 147/266] add tests --- test/test_get_edges_in_groups.jl | 33 ++++++++++++++++ test/test_swap_workspace.jl | 68 ++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 test/test_get_edges_in_groups.jl create mode 100644 test/test_swap_workspace.jl diff --git a/test/test_get_edges_in_groups.jl b/test/test_get_edges_in_groups.jl new file mode 100644 index 0000000..4df3a22 --- /dev/null +++ b/test/test_get_edges_in_groups.jl @@ -0,0 +1,33 @@ +using Test +using NetworkHistogram + +@testset "get_edges_in_groups behavior" begin + # Simple 4-node undirected graph + # 1-2, 1-3, 2-4, 3-4 + A = [0 1 1 0; + 1 0 0 1; + 1 0 0 1; + 0 1 1 0] + edgelist = NetworkHistogram.EdgeList(A) + node_labels = [1, 1, 2, 2] # nodes 1,2 in group 1; 3,4 in group 2 + + # Test within-group edges (group 1) + edges_1_1 = NetworkHistogram.get_edges_in_groups(node_labels, edgelist, 1, 1) + @test length(edges_1_1) == 1 # Only edge (1,2) + @test edges_1_1[1] == 1 # A[1,2] == 1 + + # Test within-group edges (group 2) + edges_2_2 = NetworkHistogram.get_edges_in_groups(node_labels, edgelist, 2, 2) + @test length(edges_2_2) == 1 # Only edge (3,4) + @test edges_2_2[1] == 1 # A[3,4] == 1 + + # Test between-group edges (1,2) + edges_1_2 = NetworkHistogram.get_edges_in_groups(node_labels, edgelist, 1, 2) + # Edges: (1,3), (2,4) + @test length(edges_1_2) == 4 + @test sort(edges_1_2) == [0, 0, 1, 1] # Both edges exist + + # Test symmetry: get_edges_in_groups(2,1) == get_edges_in_groups(1,2) + edges_2_1 = NetworkHistogram.get_edges_in_groups(node_labels, edgelist, 2, 1) + @test sort(edges_2_1) == sort(edges_1_2) +end diff --git 
a/test/test_swap_workspace.jl b/test/test_swap_workspace.jl new file mode 100644 index 0000000..d438874 --- /dev/null +++ b/test/test_swap_workspace.jl @@ -0,0 +1,68 @@ +using Test +using NetworkHistogram +using StatsBase +using Random + + +function manual_loglikelihood(A, node_labels, θ) + n = size(A, 1) + k = size(θ, 1) + ll = 0.0 + for j in 1:n + for i in 1:n + if i!=j + g1 = node_labels[i] + g2 = node_labels[j] + ll += NetworkHistogram.logpdf(θ[g1,g2], A[i,j]) + end + end + end + return ll/2 +end + +function slow_swap(a::NetworkHistogram.Assignment, s::NetworkHistogram.Swap) + labels = deepcopy(a.node_labels) + labels[s.u], labels[s.v] = labels[s.v], labels[s.u] + return NetworkHistogram.Assignment(labels, a.edges, a.θ[1,1]) +end + + +@testset "Swap workspace likelihood update (Bernoulli)" begin + Random.seed!(42) + n = 6 + k = 2 + p1, p2 = 0.8, 0.3 + d = NetworkHistogram.Bernoulli(0.5) + # Create a block model with two groups + sbm = NetworkHistogram.BlockModel(k, d) + sbm[1,1] = NetworkHistogram.Bernoulli(p1) + sbm[2,2] = NetworkHistogram.Bernoulli(p2) + sbm[1,2] = NetworkHistogram.Bernoulli(0.1) + + labels = StatsBase.inverse_rle(1:k, fill(n÷k, k)) + A = NetworkHistogram.sample(sbm, labels) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) + + ll_original = NetworkHistogram.loglikelihood(assignment) + ll_manual = manual_loglikelihood(A, assignment.node_labels, assignment.θ) + @test isapprox(ll_original, ll_manual; atol=1e-10) + + # Swap two nodes from different groups + indices = (1, n) + swap = NetworkHistogram.make_swap(assignment, indices) + slow_swapped = slow_swap(assignment, swap) + NetworkHistogram.apply_swap!(assignment, swap) + ll_after_swap = NetworkHistogram.loglikelihood(assignment) + ll_slow_swap = NetworkHistogram.loglikelihood(slow_swapped) + ll_manual_after_swap = manual_loglikelihood(A, assignment.node_labels, assignment.θ) + @test isapprox(ll_after_swap, 
ll_manual_after_swap; atol=1e-10) + @test isapprox(ll_after_swap, ll_slow_swap; atol=1e-10) + + # Revert the swap + NetworkHistogram.revert_swap!(assignment, swap) + ll_after_revert = NetworkHistogram.loglikelihood(assignment) + ll_manual_after_revert = manual_loglikelihood(A, assignment.node_labels, assignment.θ) + @test isapprox(ll_after_revert, ll_manual_after_revert; atol=1e-10) + @test isapprox(ll_after_revert, ll_original; atol=1e-10) +end From 2052df257ef33ce254d667a96d3b43affa836d87 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 12 May 2025 11:46:15 +0200 Subject: [PATCH 148/266] add comments --- src/distributions_type.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distributions_type.jl b/src/distributions_type.jl index c6ad7d9..f1cf644 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -40,7 +40,7 @@ unwrap(d::Dist) = d.dist _fast_compressed_obs(d, x) = x -# Bernoulli distribution +# Bernoulli distribution (example) struct Bernoulli{T<:Real} p::T From 98ff8ce7fdc39a719c555831cff9deee8c860c0d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 12 May 2025 12:46:42 +0200 Subject: [PATCH 149/266] update to slower version --- src/EdgeList.jl | 3 +- src/distributions_type.jl | 17 ++++-- src/optimization/config_rules/stop_rule.jl | 8 +++ src/optimization/greedy.jl | 2 +- src/optimization/swap_workspace.jl | 63 ++++++++++++---------- test/test_data_format.jl | 4 ++ 6 files changed, 62 insertions(+), 35 deletions(-) diff --git a/src/EdgeList.jl b/src/EdgeList.jl index ed24424..a92d074 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -14,6 +14,7 @@ nodes(edgelist::EdgeList) = length(edgelist.data) number_nodes(edgelist::EdgeList) = nodes(edgelist) EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} = _from_adj_to_edge_list(A) +EdgeList(adj_list::EdgeList) = adj_list # function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} # n = size(A, 1) @@ -43,7 +44,7 @@ function 
_from_adj_to_edge_list(A::AbstractMatrix, function_to_apply = identity) data[j] = Vector{typeof(test)}(undef, 0) name_list[j] = Vector{Int}(undef, 0) for i in 1:n - if !ismissing(A[i,j]) # gonna be an issue with MC! have to define 0 chain and fast operations on them + if !ismissing(A[i,j]) && i != j # gonna be an issue with MC! have to define 0 chain and fast operations on them push!(name_list[j], function_to_apply(i)) push!(data[j], A[i, j]) end diff --git a/src/distributions_type.jl b/src/distributions_type.jl index f1cf644..129f641 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -1,8 +1,11 @@ struct Dist{D} dist::D counts::Int + Dist(d,counts::Int) = counts < 0 ? error("Counts cannot be negative") : new{typeof(d)}(d, counts) end + + Dist(d) = Dist(d, 1) zero(d::Dist) = Dist(zero(d.dist),0) @@ -14,15 +17,19 @@ end function remove_from(avgdist::Dist{D}, dist::D) where {D} - if avgdist.counts <= 1 - error("Cannot remove from a distribution with strictly less than 2 counts") - else - return Dist(agg_params(avgdist.dist, dist, avgdist.counts / (avgdist.counts - 1), - 1 / (avgdist.counts - 1)), avgdist.counts -1) + if avgdist.counts <= 0 + error("Cannot remove from a distribution with 0 counts") end + # if avgdist.counts == 1 && params(avgdist) == params(dist) + # return Dist(zero(avgdist.dist), 0) + # else + # error("Cannot remove from a distribution with 1 count unless the parameters are the same, got $(params(avgdist)) and $(params(dist))") + # end + return Dist(agg_params(avgdist.dist, dist, avgdist.counts / max(1,(avgdist.counts - 1)), - 1 / max(1,(avgdist.counts - 1))), avgdist.counts -1) end -# probably need to update to account for counts in second dist? will that mess the other code? +## probably this is fucked ... 
add_to(d::Dist, dist::Dist) = add_to(d, dist.dist) remove_from(d::Dist, dist::Dist) = remove_from(d, dist.dist) diff --git a/src/optimization/config_rules/stop_rule.jl b/src/optimization/config_rules/stop_rule.jl index a4e12c5..a3d639e 100644 --- a/src/optimization/config_rules/stop_rule.jl +++ b/src/optimization/config_rules/stop_rule.jl @@ -1,5 +1,10 @@ abstract type StopRule end + +function info_to_print(::StopRule) + return nothing +end + function initialise_stop_rule!(stop_rule::StopRule, a, g) end @@ -45,3 +50,6 @@ function stopping_rule(assignment::Assignment, stop_rule::PreviousBestValue) end return stop_rule.iterations_since_best >= stop_rule.k end + + +info_to_print(stop_rule::PreviousBestValue) = ("stalled iter: ", stop_rule.iterations_since_best) diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index 339607f..b1b5fef 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -27,7 +27,7 @@ function greedy_improve!(a::Assignment; params = GreedyParams()) for i in 1:params.max_iter local_search!(a, swap, params) - next!(p; showvalues = [("ll: ",sum(a.log_likelihood))]) + next!(p; showvalues = [("ll: ",sum(a.log_likelihood)), info_to_print(params.stop_rule)]) if stopping_rule(a, params.stop_rule) finish!(p) break diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 9300514..489b427 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -34,40 +34,47 @@ end function apply_swap!(a::Assignment, s::Swap) - # swap node labels + g1 = group(a, s.u) + g2 = group(a, s.v) + groups_concerned = Set([minmax(g1, g2)]) + for (u, g_old, g_new) in [(s.u, g1, g2), (s.v, g2, g1)] + for (v,d) in iterate_neighbors(a.dists, u) + if v == s.u || v == s.v + continue + end + g_v = group(a, v) + a.θ[g_new, g_v] = add_to(a.θ[g_new, g_v], d) + a.θ[g_old, g_v] = remove_from(a.θ[g_old, g_v], d) + push!(groups_concerned, minmax(g_new, g_v), minmax(g_old, g_v)) + end + end 
swap_node_labels!(a, s.u, s.v) - new_assignment = Assignment(a.node_labels, a.edges, a.θ[1,1]) - a.θ = new_assignment.θ - a.log_likelihood = new_assignment.log_likelihood - # k = size(a.θ, 1) - # # initial distribution template and zero-likelihood - # base_dist = a.θ[1, 1] - # a.θ = SymArray(k, base_dist) - # a.log_likelihood = SymArray(k, zero(eltype(a.log_likelihood))) - # # accumulate edge contributions - # for u in 1:length(a.node_labels) - # g_u = group(a, u) - # for (v, d) in iterate_neighbors(a.dists, u) - # if u < v - # g_v = group(a, v) - # a.θ[g_u, g_v] = add_to(a.θ[g_u, g_v], d) - # end - # end - # end - # # recompute log likelihoods for all group pairs - # for g1 in 1:k, g2 in g1:k - # edges = get_edges_in_groups(a, g1, g2) - # a.log_likelihood[g1, g2] = loglikelihood(a.θ[g1, g2], edges) - # end + fast_ll_update!(a, groups_concerned) end -## below can be specialised for Bernoulli probably +## below can be specialised for Bernoulli probably (probably above needs to be actually) function fast_ll_update!(a, groups_concerned) for g in groups_concerned - # Use get_edges_in_groups to get the correct set of edges - edges = get_edges_in_groups(a, g[1], g[2]) - a.log_likelihood[g[1], g[2]] = loglikelihood(a.θ[g[1], g[2]], edges) + a.log_likelihood[g[1], g[2]] = _fast_ll_one_group(a, g[1], g[2]) + end +end + + +function _fast_ll_one_group(a::Assignment, g1, g2) + nodes_g1 = findall(x -> x == g1, a.node_labels) + nodes_g2 = findall(x -> x == g2, a.node_labels) + ll = 0.0 + d = a.θ[g1, g2] + for u in nodes_g1 + for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 + if v in nodes_g2 + if (g1 == g2 && u < v) || g1 != g2 + ll += loglikelihood(d, e) + end + end + end end + return ll end diff --git a/test/test_data_format.jl b/test/test_data_format.jl index 7cf4fc5..dbc19e2 100644 --- a/test/test_data_format.jl +++ b/test/test_data_format.jl @@ -7,8 +7,12 @@ for j in 1:20 nv_j, val_j = neighbors(edgelist, j) for i in 1:20 + if i != j @test i in 
nv_j @test A[i,j] == val_j[findfirst(x -> x == i, nv_j)] + else + @test i ∉ nv_j + end end end From 83ca2ff27a73d8df969de2edd3a4e266620c1653 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 12 May 2025 17:18:51 +0200 Subject: [PATCH 150/266] export kinda fixed --- src/EdgeList.jl | 4 ++-- src/NetworkHistogram.jl | 5 +++-- src/optimization/swap_workspace.jl | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/EdgeList.jl b/src/EdgeList.jl index a92d074..a384e77 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -45,8 +45,8 @@ function _from_adj_to_edge_list(A::AbstractMatrix, function_to_apply = identity) name_list[j] = Vector{Int}(undef, 0) for i in 1:n if !ismissing(A[i,j]) && i != j # gonna be an issue with MC! have to define 0 chain and fast operations on them - push!(name_list[j], function_to_apply(i)) - push!(data[j], A[i, j]) + push!(name_list[j], i) + push!(data[j], function_to_apply(A[i, j])) end end end diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index aae12f3..935c1e1 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -2,9 +2,10 @@ module NetworkHistogram using StatsBase using StaticArrays using ProgressMeter -import StatsAPI: loglikelihood +import StatsAPI: loglikelihood, fit import Base: convert, eltype, zero + include("utils/include.jl") using .FastSymArray @@ -15,6 +16,6 @@ include("assignment.jl") include("optimization/greedy.jl") include("api.jl") -export EdgeList, neighbors, nodes, loglikelihood, zero +export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf end diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 489b427..17f7573 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -71,7 +71,7 @@ function _fast_ll_one_group(a::Assignment, g1, g2) for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 if v in nodes_g2 if (g1 == g2 && u < v) || g1 != g2 - ll += 
loglikelihood(d, e) + ll += logpdf(d, e) end end end From 01884b77e797a80f73df986df2a51db36fd69ca4 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 13 May 2025 13:00:55 +0200 Subject: [PATCH 151/266] slow recompute faster than trying to isolate behaviour for now, see comments in swap_workspace.jl --- src/api.jl | 2 +- src/assignment.jl | 8 ++- src/distributions_type.jl | 11 ++-- src/optimization/greedy.jl | 2 + src/optimization/swap_workspace.jl | 89 +++++++++++++++++------------- 5 files changed, 67 insertions(+), 45 deletions(-) diff --git a/src/api.jl b/src/api.jl index f1cb2d7..2d0bb7a 100644 --- a/src/api.jl +++ b/src/api.jl @@ -3,7 +3,7 @@ function nethist(data_input, dist_user, initial_node_labels, params::GreedyParam dist = Dist(dist_user) g = preprocess_data(data_input, dist) - + @debug "started optimizatiion" out = greedy_optimize(g, initial_node_labels, params) return postprocess(out) diff --git a/src/assignment.jl b/src/assignment.jl index b5889d6..c30d003 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -79,6 +79,12 @@ end function Assignment(node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} dists = fit(dist, edge_list) + θ, ll = _compute_theta_and_ll(node_labels, dists, edge_list, dist) + return Assignment(node_labels, edge_list, dists, θ, ll) +end + + +function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} number_groups = length(unique(node_labels)) θ = SymArray(number_groups, zero(dist)) log_likelihood = SymArray(number_groups, 0.0) @@ -98,5 +104,5 @@ function Assignment(node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E θ[k, l], get_edges_in_groups(node_labels, edge_list, k, l)) end end - return Assignment(node_labels, edge_list, dists, θ, log_likelihood) + return θ, log_likelihood end diff --git a/src/distributions_type.jl b/src/distributions_type.jl index 129f641..70339f4 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ 
-33,7 +33,13 @@ end add_to(d::Dist, dist::Dist) = add_to(d, dist.dist) remove_from(d::Dist, dist::Dist) = remove_from(d, dist.dist) -for f in [:logpdf, :sample, :dist, :eltype, :params] +# expose compression step that assumes there is a pdf(d, typeof(compressed(x))) properly defined +# by default do nothing +_fast_compressed_obs(d, x) = x + + +# what to delegate to the underlying distribution +for f in [:logpdf, :sample, :dist, :eltype, :params, :_fast_compressed_obs] @eval $f(d::Dist, args...) = $f(d.dist, args...) end @@ -42,9 +48,6 @@ loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) unwrap(d::Dist) = d.dist -# expose compression step that assumes there is a pdf(d, typeof(compressed(x))) properly defined -# by default do nothing -_fast_compressed_obs(d, x) = x # Bernoulli distribution (example) diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index b1b5fef..39bc5b7 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -12,7 +12,9 @@ end GreedyParams() = GreedyParams(100_000, RandomGroupSwap(), Strict(), PreviousBestValue(10_000), true) function greedy_optimize(g, initial_labels, params::GreedyParams) + @debug "making assignment" a = Assignment(initial_labels, g...) 
+ @debug "assignment made, starting greedy search" greedy_improve!(a; params = params) return a end diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 17f7573..0c28877 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -32,49 +32,60 @@ function swap_node_labels!(a::Assignment, i, j) a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] end - -function apply_swap!(a::Assignment, s::Swap) - g1 = group(a, s.u) - g2 = group(a, s.v) - groups_concerned = Set([minmax(g1, g2)]) - for (u, g_old, g_new) in [(s.u, g1, g2), (s.v, g2, g1)] - for (v,d) in iterate_neighbors(a.dists, u) - if v == s.u || v == s.v - continue - end - g_v = group(a, v) - a.θ[g_new, g_v] = add_to(a.θ[g_new, g_v], d) - a.θ[g_old, g_v] = remove_from(a.θ[g_old, g_v], d) - push!(groups_concerned, minmax(g_new, g_v), minmax(g_old, g_v)) - end - end +# for reference and testing +function _slow_swap!(a::Assignment, s::Swap) swap_node_labels!(a, s.u, s.v) - fast_ll_update!(a, groups_concerned) + a.θ, a.log_likelihood = _compute_theta_and_ll(a.node_labels, a.dists, a.edges, a.θ[1,1]) end +apply_swap!(a::Assignment, s::Swap) = _slow_swap!(a, s) -## below can be specialised for Bernoulli probably (probably above needs to be actually) -function fast_ll_update!(a, groups_concerned) - for g in groups_concerned - a.log_likelihood[g[1], g[2]] = _fast_ll_one_group(a, g[1], g[2]) - end -end +## below is not faster than the above, need to find a way to take advantage of the sparsity +# somewhere in the datastructure for it to make a difference +# function apply_swap!(a::Assignment, s::Swap) +# g1 = group(a, s.u) +# g2 = group(a, s.v) +# groups_concerned = Set([minmax(g1, g2)]) +# for (u, g_old, g_new) in [(s.u, g1, g2), (s.v, g2, g1)] +# for (v,d) in iterate_neighbors(a.dists, u) +# if v == s.u || v == s.v +# continue +# end +# g_v = group(a, v) +# a.θ[g_new, g_v] = add_to(a.θ[g_new, g_v], d) +# a.θ[g_old, g_v] = 
remove_from(a.θ[g_old, g_v], d) +# push!(groups_concerned, minmax(g_new, g_v), minmax(g_old, g_v)) +# end +# end +# swap_node_labels!(a, s.u, s.v) +# fast_ll_update!(a, groups_concerned) +# end -function _fast_ll_one_group(a::Assignment, g1, g2) - nodes_g1 = findall(x -> x == g1, a.node_labels) - nodes_g2 = findall(x -> x == g2, a.node_labels) - ll = 0.0 - d = a.θ[g1, g2] - for u in nodes_g1 - for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 - if v in nodes_g2 - if (g1 == g2 && u < v) || g1 != g2 - ll += logpdf(d, e) - end - end - end - end - return ll -end + +# ## below can be specialised for Bernoulli probably (probably above needs to be actually) + +# function fast_ll_update!(a, groups_concerned) +# for g in groups_concerned +# a.log_likelihood[g[1], g[2]] = _fast_ll_one_group(a, g[1], g[2]) +# end +# end + + +# function _fast_ll_one_group(a::Assignment, g1, g2) +# nodes_g1 = findall(x -> x == g1, a.node_labels) +# nodes_g2 = findall(x -> x == g2, a.node_labels) +# ll = 0.0 +# d = a.θ[g1, g2] +# for u in nodes_g1 +# for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 +# if v in nodes_g2 +# if (g1 == g2 && u < v) || g1 != g2 +# ll += logpdf(d, e) +# end +# end +# end +# end +# return ll +# end From 55681d2d94163ba21b8e70f4beb7372aa6f3e215 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 13 May 2025 13:03:28 +0200 Subject: [PATCH 152/266] add more debug info --- src/api.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/api.jl b/src/api.jl index 2d0bb7a..6460aad 100644 --- a/src/api.jl +++ b/src/api.jl @@ -6,6 +6,7 @@ function nethist(data_input, dist_user, initial_node_labels, params::GreedyParam @debug "started optimizatiion" out = greedy_optimize(g, initial_node_labels, params) + @debug "finished optimizatiion with loglikelihood $(loglikelihood(out))" return postprocess(out) end From f7b9955a0d34fc00498f4bd6df92dc143c2524cf Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 13 May 2025 13:45:11 +0200 
Subject: [PATCH 153/266] add more debug info --- src/api.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/api.jl b/src/api.jl index 6460aad..9e62ead 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,5 +1,6 @@ function nethist(data_input, dist_user, initial_node_labels, params::GreedyParams) + @debug "preprocessing data" dist = Dist(dist_user) g = preprocess_data(data_input, dist) From 4478a6e7f4c2832c6ac90d8e8376080bf702af3c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 13 May 2025 17:09:00 +0200 Subject: [PATCH 154/266] monkey patch for missing, needs to be fixed --- src/EdgeList.jl | 7 +++++-- src/distributions_type.jl | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/EdgeList.jl b/src/EdgeList.jl index a384e77..37b0e9b 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -37,13 +37,16 @@ EdgeList(adj_list::EdgeList) = adj_list function _from_adj_to_edge_list(A::AbstractMatrix, function_to_apply = identity) n = size(A, 1) - test = function_to_apply(A[1,1]) + input = findfirst(x -> !ismissing(x), A) + test = function_to_apply(A[input]) data = Vector{Vector{typeof(test)}}(undef, n) name_list = Vector{Vector{Int}}(undef, n) for j in 1:n data[j] = Vector{typeof(test)}(undef, 0) name_list[j] = Vector{Int}(undef, 0) for i in 1:n + if !ismissing(A[i,j]) + end if !ismissing(A[i,j]) && i != j # gonna be an issue with MC! 
have to define 0 chain and fast operations on them push!(name_list[j], i) push!(data[j], function_to_apply(A[i, j])) @@ -68,7 +71,7 @@ function _make_shift_broadcast(A::EdgeList, f) end -convert(::Type{EdgeList}, A::AbstractMatrix) = EdgeList(A) +#convert(::Type{EdgeList}, A::AbstractMatrix) = EdgeList(A) function fit(d::Dist, A::EdgeList{E}) where {E} diff --git a/src/distributions_type.jl b/src/distributions_type.jl index 70339f4..4e19be5 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -12,7 +12,8 @@ zero(d::Dist) = Dist(zero(d.dist),0) Base.broadcastable(x::Dist) = Ref(x) function add_to(avgdist::Dist{D}, dist::D) where {D} - return Dist(agg_params(avgdist.dist, dist, avgdist.counts / (avgdist.counts + 1), 1 / (avgdist.counts + 1)), avgdist.counts + 1) + inner_dist = agg_params(avgdist.dist, dist, avgdist.counts / (avgdist.counts + 1), 1 / (avgdist.counts + 1)) + return Dist(inner_dist, avgdist.counts + 1) end From 7edf2b642b6ffbe88d221f8c7b59a24c6e4520d0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 20 May 2025 09:03:12 +0200 Subject: [PATCH 155/266] hack for empty collection, introduces type instability... --- src/distributions_type.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/distributions_type.jl b/src/distributions_type.jl index 4e19be5..3686987 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -45,7 +45,9 @@ for f in [:logpdf, :sample, :dist, :eltype, :params, :_fast_compressed_obs] end fit(d::Dist, x) = Dist(fit(d.dist, x), d.counts) -loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) + +## TODO: remove type instability ? +loglikelihood(d::Dist, x) = isempty(x) ? 
0.0 : sum(logpdf(d, y) for y in x) unwrap(d::Dist) = d.dist From d582f68d6461a72838694875bf34ffa94921b155 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 20 May 2025 10:01:22 +0200 Subject: [PATCH 156/266] update block models and add conversion for symarray --- src/NetworkHistogram.jl | 2 +- src/api.jl | 2 +- src/assignment.jl | 4 ++++ src/block_model.jl | 13 +++++++++++-- src/distributions_type.jl | 3 ++- src/utils/SymArray.jl | 23 ++++++++++++++++++++++- 6 files changed, 41 insertions(+), 6 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 935c1e1..8845f4d 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -10,9 +10,9 @@ include("utils/include.jl") using .FastSymArray include("distributions_type.jl") -include("block_model.jl") include("EdgeList.jl") include("assignment.jl") +include("block_model.jl") include("optimization/greedy.jl") include("api.jl") diff --git a/src/api.jl b/src/api.jl index 9e62ead..53bec4f 100644 --- a/src/api.jl +++ b/src/api.jl @@ -20,5 +20,5 @@ end function postprocess(out) return out - return BlockModel(out) + return out.node_labels, BlockModel(out) end diff --git a/src/assignment.jl b/src/assignment.jl index c30d003..5f6ca49 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -49,6 +49,10 @@ end number_nodes(a::Assignment) = length(a.node_labels) number_groups(a::Assignment) = size(a.θ, 1) +function proportions(a::Assignment) + return counts(a.node_labels) / number_nodes(a) +end + function loglikelihood(a::Assignment) return sum(a.log_likelihood) end diff --git a/src/block_model.jl b/src/block_model.jl index fedc7aa..e188a3f 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -1,17 +1,26 @@ struct BlockModel{D, K, T} _dists::SymArray{D} sizes::SVector{K, T} - cum_sizes::Vector{T} + cum_sizes::SVector{K,T} end function BlockModel(k::Int, d::D) where {D} sizes = @SVector fill(1/k, k) - cumulative_sizes = cumsum(sizes) + cumulative_sizes = SVector{k}(cumsum(sizes)) _dists = 
SymArray(k, d) return BlockModel{D, k, Float64}(_dists, sizes, cumulative_sizes) end +function BlockModel(a::Assignment) + k = length(unique(a.node_labels)) + sizes = SVector{k}(proportions(a)) + cumulative_sizes = SVector{k}(cumsum(sizes)) + _dists = deepcopy(a.θ) + return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(unwrap.(_dists), sizes, cumulative_sizes) +end + + function sample(bm::BlockModel, latents::Vector{T}) where {T} A = Array{eltype(bm[1,1]), 2}(undef, length(latents), length(latents)) .* zero(eltype(bm[1,1])) for j in 1:length(latents) diff --git a/src/distributions_type.jl b/src/distributions_type.jl index 3686987..e938725 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -47,7 +47,8 @@ end fit(d::Dist, x) = Dist(fit(d.dist, x), d.counts) ## TODO: remove type instability ? -loglikelihood(d::Dist, x) = isempty(x) ? 0.0 : sum(logpdf(d, y) for y in x) +# loglikelihood(d::Dist, x) = isempty(x) ? 0.0 : sum(logpdf(d, y) for y in x) +loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) unwrap(d::Dist) = d.dist diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index a8c7079..6206c65 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -1,6 +1,6 @@ module FastSymArray - import Base: eltype + import Base: eltype, convert export SymArray, eltype mutable struct SymArray{F} <: AbstractArray{F, 2} @@ -36,4 +36,25 @@ module FastSymArray function eltype(a::SymArray{F}) where {F} return F end + + function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} + @assert size(a, 1) == size(a, 2) + k = size(a, 1) + d = Dict{Tuple{Int, Int}, F}(minmax(i, j) => a[i, j] for i in 1:k + for j in i:k) + return SymArray(k, d) + end + + + function convert(::Type{AbstractMatrix{F}}, a::SymArray{F}) where {F} + k = a.k + m = zeros(F, k, k) + for i in 1:k + for j in i:k + m[i, j] = a[i, j] + end + end + return m + end + end From c295bc0f2df1e2536779e711640352dbdfe0926c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: 
Tue, 20 May 2025 10:11:20 +0200 Subject: [PATCH 157/266] Fix blockmodel from assignment For now inefficient: does two conversions between matrix and custom types. Since this is not an operation that happens a lot, for now not on top of my todo list, but to be looked at. --- src/block_model.jl | 4 ++-- src/utils/SymArray.jl | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/block_model.jl b/src/block_model.jl index e188a3f..56462fb 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -16,8 +16,8 @@ function BlockModel(a::Assignment) k = length(unique(a.node_labels)) sizes = SVector{k}(proportions(a)) cumulative_sizes = SVector{k}(cumsum(sizes)) - _dists = deepcopy(a.θ) - return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(unwrap.(_dists), sizes, cumulative_sizes) + _dists = unwrap.(a.θ) + return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(_dists, sizes, cumulative_sizes) end diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 6206c65..2912c14 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -8,7 +8,7 @@ module FastSymArray k::Int end - function SymArray(k, d::F) where {F} + function SymArray(k::T, d::F) where {F, T<:Real} @assert k > 0 return SymArray{F}(Dict{Tuple{Int, Int}, F}(minmax(i, j) => d for i in 1:k for j in i:k), k) @@ -40,9 +40,15 @@ module FastSymArray function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} @assert size(a, 1) == size(a, 2) k = size(a, 1) - d = Dict{Tuple{Int, Int}, F}(minmax(i, j) => a[i, j] for i in 1:k - for j in i:k) - return SymArray(k, d) + res = SymArray(k, a[1,1]) + for j in axes(a,2) + for i in axes(a,1) + if i <= j + res[i,j] = a[i,j] + end + end + end + return res end From cbf95d0ba9e4740a5e4ffaff4448842941a9de78 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 20 May 2025 14:39:21 +0200 Subject: [PATCH 158/266] update naming convention --- src/distributions_type.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/distributions_type.jl b/src/distributions_type.jl index e938725..cd9a4cb 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -40,7 +40,7 @@ _fast_compressed_obs(d, x) = x # what to delegate to the underlying distribution -for f in [:logpdf, :sample, :dist, :eltype, :params, :_fast_compressed_obs] +for f in [:logpdf, :sample, :distance, :eltype, :params, :_fast_compressed_obs] @eval $f(d::Dist, args...) = $f(d.dist, args...) end @@ -64,7 +64,7 @@ end zero(d::Bernoulli) = Bernoulli(zero(d.p)) agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) = Bernoulli(w1 * d1.p + w2 * d2.p) fit(::Bernoulli, x) = Bernoulli(mean(x)) -dist(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) +distance(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) params(d::Bernoulli) = (d.p,) eltype(d::Bernoulli) = Bool From 1501d744d7e55a59b213c310f61077d219f31f72 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 21 May 2025 15:35:18 +0200 Subject: [PATCH 159/266] before speed benchmark --- Project.toml | 2 ++ src/distributions_type.jl | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 29a587a..063a47c 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.5.2" [deps] ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" @@ -29,6 +30,7 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] +BenchmarkTools = "1.6.0" CairoMakie = "0.13.4" KrylovKit = "0.9.5" julia = "1.11" diff --git a/src/distributions_type.jl b/src/distributions_type.jl index cd9a4cb..1ebd4c6 100644 --- a/src/distributions_type.jl +++ b/src/distributions_type.jl @@ -47,8 +47,8 @@ 
end fit(d::Dist, x) = Dist(fit(d.dist, x), d.counts) ## TODO: remove type instability ? -# loglikelihood(d::Dist, x) = isempty(x) ? 0.0 : sum(logpdf(d, y) for y in x) -loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) +loglikelihood(d::Dist, x) = isempty(x) ? 0.0 : sum(logpdf(d, y) for y in x) +# loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) unwrap(d::Dist) = d.dist From de5337b4e772aeab50209da14ff093d3c275b261 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 24 May 2025 14:36:03 +0200 Subject: [PATCH 160/266] same speed, but maybe parallelization options --- src/assignment.jl | 22 ++++++++++--- src/optimization/swap_workspace.jl | 50 ------------------------------ 2 files changed, 17 insertions(+), 55 deletions(-) diff --git a/src/assignment.jl b/src/assignment.jl index 5f6ca49..759f421 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -101,12 +101,24 @@ function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, edge_list: end end end - for k in 1:number_groups - for l in k:number_groups - log_likelihood[k, - l] = loglikelihood( - θ[k, l], get_edges_in_groups(node_labels, edge_list, k, l)) + + for u in 1:nodes(dists) + g1 = node_labels[u] + for (v, e) in iterate_neighbors(edge_list, u) + g2 = node_labels[v] + if u > v + log_likelihood[g1,g2] += logpdf(θ[g1,g2], e) + else + break + end end end + # for k in 1:number_groups + # for l in k:number_groups + # log_likelihood[k, + # l] = loglikelihood( + # θ[k, l], get_edges_in_groups(node_labels, edge_list, k, l)) + # end + # end return θ, log_likelihood end diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 0c28877..c61d0ee 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -39,53 +39,3 @@ function _slow_swap!(a::Assignment, s::Swap) end apply_swap!(a::Assignment, s::Swap) = _slow_swap!(a, s) - - -## below is not faster than the above, need to find a way to take advantage of the sparsity -# somewhere 
in the datastructure for it to make a difference - -# function apply_swap!(a::Assignment, s::Swap) -# g1 = group(a, s.u) -# g2 = group(a, s.v) -# groups_concerned = Set([minmax(g1, g2)]) -# for (u, g_old, g_new) in [(s.u, g1, g2), (s.v, g2, g1)] -# for (v,d) in iterate_neighbors(a.dists, u) -# if v == s.u || v == s.v -# continue -# end -# g_v = group(a, v) -# a.θ[g_new, g_v] = add_to(a.θ[g_new, g_v], d) -# a.θ[g_old, g_v] = remove_from(a.θ[g_old, g_v], d) -# push!(groups_concerned, minmax(g_new, g_v), minmax(g_old, g_v)) -# end -# end -# swap_node_labels!(a, s.u, s.v) -# fast_ll_update!(a, groups_concerned) -# end - - -# ## below can be specialised for Bernoulli probably (probably above needs to be actually) - -# function fast_ll_update!(a, groups_concerned) -# for g in groups_concerned -# a.log_likelihood[g[1], g[2]] = _fast_ll_one_group(a, g[1], g[2]) -# end -# end - - -# function _fast_ll_one_group(a::Assignment, g1, g2) -# nodes_g1 = findall(x -> x == g1, a.node_labels) -# nodes_g2 = findall(x -> x == g2, a.node_labels) -# ll = 0.0 -# d = a.θ[g1, g2] -# for u in nodes_g1 -# for (v,e) in iterate_neighbors(a.edges,u) # assume implicitly that g1 != g2 -# if v in nodes_g2 -# if (g1 == g2 && u < v) || g1 != g2 -# ll += logpdf(d, e) -# end -# end -# end -# end -# return ll -# end From a2d3539bae35746a3c9da6acf6bb2ac156956b08 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 24 May 2025 18:13:31 +0200 Subject: [PATCH 161/266] sum was doing all sum we just need triangular with diag --- src/assignment.jl | 2 +- src/optimization/config_rules/stop_rule.jl | 2 +- src/utils/SymArray.jl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/assignment.jl b/src/assignment.jl index 759f421..b7f29fb 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -54,7 +54,7 @@ function proportions(a::Assignment) end function loglikelihood(a::Assignment) - return sum(a.log_likelihood) + return FastSymArray.sum_tri_with_diag(a.log_likelihood) end function 
group(a::Assignment, node::Int) diff --git a/src/optimization/config_rules/stop_rule.jl b/src/optimization/config_rules/stop_rule.jl index a3d639e..1ead4f8 100644 --- a/src/optimization/config_rules/stop_rule.jl +++ b/src/optimization/config_rules/stop_rule.jl @@ -10,7 +10,7 @@ end # default score is the log likelihood function score(a::Assignment) - return loglikelihood(a) / binomial(number_nodes(a), 2) + return loglikelihood(a) #/ binomial(number_nodes(a), 2) end mutable struct PreviousBestValue{T} <: StopRule diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 2912c14..e594840 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -29,7 +29,7 @@ module FastSymArray end - function Base.sum(a::SymArray) + function sum_tri_with_diag(a::SymArray) return sum(values(a.d)) end From c98afe81fd59372dcca2a24c14d77bf18bd49214 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 24 May 2025 19:09:37 +0200 Subject: [PATCH 162/266] add early warning for extreme behaviour in local search --- src/optimization/greedy.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index 39bc5b7..ddf65f7 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -31,6 +31,9 @@ function greedy_improve!(a::Assignment; params = GreedyParams()) local_search!(a, swap, params) next!(p; showvalues = [("ll: ",sum(a.log_likelihood)), info_to_print(params.stop_rule)]) if stopping_rule(a, params.stop_rule) + if i < 10 + @warn "Greedy search stopped early after $(i) iterations" + end finish!(p) break end From 8180c7d207115b26fcd7fa2c9fe7bc915bf5114a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 26 May 2025 09:45:52 +0200 Subject: [PATCH 163/266] more general sampling procedure for sbm --- src/block_model.jl | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/block_model.jl b/src/block_model.jl index 56462fb..889d4c5 100644 --- a/src/block_model.jl +++ 
b/src/block_model.jl @@ -20,21 +20,33 @@ function BlockModel(a::Assignment) return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(_dists, sizes, cumulative_sizes) end +function map_ξ_to_block(bm::BlockModel, ξ::T) where {T<:Real} + return findfirst(x -> x >= ξ, bm.cum_sizes) +end + +function sample(bm::BlockModel, latents::Int, args...) + latents = map(x -> map_ξ_to_block(bm, x), rand(latents)) + return latents, sample(bm, latents, args...) +end + -function sample(bm::BlockModel, latents::Vector{T}) where {T} - A = Array{eltype(bm[1,1]), 2}(undef, length(latents), length(latents)) .* zero(eltype(bm[1,1])) +function sample(bm::BlockModel, latents::Vector{T}, args...) where {T} + A = Array{eltype(bm[1,1]), 2}(undef, length(latents), length(latents)) for j in 1:length(latents) for i in 1:j-1 A[i, j] = A[j, i] end for i in j+1:length(latents) - # println("i: ", i, " j: ", j) # println("latents[i]: ", latents[i], " latents[j]: ", latents[j]) # println("bm[latents[i], latents[j]]: ", bm[latents[i], latents[j]]) - A[i, j] = sample(bm[latents[i], latents[j]]) + A[i, j] = sample(bm[latents[i], latents[j]], args...) 
A[j, i] = A[i, j] end end + # fill the diagonal with zeros, avoid undefined references + for i in 1:length(latents) + A[i, i] = zero(A[1,2]) + end return A end @@ -65,3 +77,9 @@ function Base.setindex!(s::BlockModel, v, i::Real, j::Real) l = findfirst(x -> x ≥ j, s.cum_sizes) s._dists[k, l] = v end + + +# helpers for generating ordered latents +function ordered_latents(bm::BlockModel, n::Int) + return sort(map(x -> map_ξ_to_block(bm, x), rand(n))) +end From dfee0e30a1f067a6bbb07d2e21987e498aefa43f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 26 May 2025 12:28:40 +0200 Subject: [PATCH 164/266] restructure --- src/NetworkHistogram.jl | 2 +- src/{ => distributions}/distributions_type.jl | 0 src/distributions/include.jl | 1 + src/distributions/zero_mixture.jl | 50 +++++++++++++++++++ src/optimization/config_rules/InitRule.jl | 16 ++++++ 5 files changed, 68 insertions(+), 1 deletion(-) rename src/{ => distributions}/distributions_type.jl (100%) create mode 100644 src/distributions/include.jl create mode 100644 src/distributions/zero_mixture.jl diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 8845f4d..57169bb 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -9,7 +9,7 @@ import Base: convert, eltype, zero include("utils/include.jl") using .FastSymArray -include("distributions_type.jl") +include("distributions/include.jl") include("EdgeList.jl") include("assignment.jl") include("block_model.jl") diff --git a/src/distributions_type.jl b/src/distributions/distributions_type.jl similarity index 100% rename from src/distributions_type.jl rename to src/distributions/distributions_type.jl diff --git a/src/distributions/include.jl b/src/distributions/include.jl new file mode 100644 index 0000000..4512d42 --- /dev/null +++ b/src/distributions/include.jl @@ -0,0 +1 @@ +include("distributions_type.jl") diff --git a/src/distributions/zero_mixture.jl b/src/distributions/zero_mixture.jl new file mode 100644 index 0000000..0cfae5a --- 
/dev/null +++ b/src/distributions/zero_mixture.jl @@ -0,0 +1,50 @@ +struct ZeroInflated{B, D} + edge_proba::B + dist::D +end + +function ZeroInflated(p::Real, dist::D) where {D} + return ZeroInflated(Bernoulli(1 - p), dist) +end + +function Distributions.pdf(d::ZeroInflated, x::Real) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + + pdf(d.edge_proba, one(x)) * pdf(d.dist, x) +end + +function get_proba_zero(d::ZeroInflated) + return pdf(d.edge_proba, 0) +end + +function rand(rng::Random.AbstractRNG, d::ZeroInflated) + return rand(rng, d.edge_proba) * rand(rng, d.dist) +end + +logpdf(d::ZeroInflated, x::Real) = log(pdf(d, x)) + +minimum(d::ZeroInflated) = min(minimum(d.dist), 0) + +maximum(d::ZeroInflated) = max(maximum(d.dist), 0) + +insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) + +function Distributions.cdf(d::ZeroInflated, x::Real) + return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, zero(x), Inf) + + cdf(d.dist, x) * pdf(d.edge_proba, one(x)) +end + +function Distributions.params(d::ZeroInflated) + (first(params(d.edge_proba)), params(d.dist)...) 
+end + +function Distributions.fit( + ::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) where {B, D} + indices_0 = findall(x -> iszero(x), data) + p = length(indices_0) / length(data) + if p != 1 + return ZeroInflated( + p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) + else + return ZeroInflated(1.0, D()) + end +end diff --git a/src/optimization/config_rules/InitRule.jl b/src/optimization/config_rules/InitRule.jl index 23ac2fa..db607ce 100644 --- a/src/optimization/config_rules/InitRule.jl +++ b/src/optimization/config_rules/InitRule.jl @@ -7,6 +7,11 @@ struct FromAssignment{A} <: StartingAssignment assignment::A end + +struct FromNodeLabels{L} <: StartingAssignment + node_labels::L +end + struct InitRule{S <: StartingAssignment, I} starting_assignment_rule::S assignment_rule::I @@ -42,3 +47,14 @@ function initialize_node_labels(g, h, ::RandomStart) Random.shuffle!(node_labels) return group_size, node_labels end + + +function initialise_node_labels(g,h, init_rule::FromAssignment{A}) where {A <: Assignment} + return initialise_node_labels(g, h, FromNodeLabels(init_rule.assignment.node_labels)) +end + + +function initialise_node_labels(g, h, init_rule::FromNodeLabels{L}) where {L} + @assert number_nodes(g) == length(init_rule.node_labels) + return group_size, deepcopy(init_rule.node_labels) +end From 6c3689c871aa9a427136fac7705f4966a2eb7e6f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 26 May 2025 16:46:40 +0200 Subject: [PATCH 165/266] restart for zero inflated --- src/distributions/zero_inflated.jl | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/distributions/zero_inflated.jl diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl new file mode 100644 index 0000000..61b5dd5 --- /dev/null +++ b/src/distributions/zero_inflated.jl @@ -0,0 +1,9 @@ + +struct ZeroInflated{D, F} + dist::D + proba_zero::F +end + +function ZeroInflated(dist) + return ZeroInflated(dist, 0.0) +end From 
0fb683612bd65b1afeee3ad586023eab4841359d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 26 May 2025 17:49:14 +0200 Subject: [PATCH 166/266] add zero inflated dist --- src/api.jl | 15 +++++++-- src/assignment.jl | 2 +- src/distributions/include.jl | 1 + src/distributions/zero_inflated.jl | 45 +++++++++++++++++++++++++++ src/distributions/zero_mixture.jl | 50 ------------------------------ 5 files changed, 60 insertions(+), 53 deletions(-) delete mode 100644 src/distributions/zero_mixture.jl diff --git a/src/api.jl b/src/api.jl index 53bec4f..76ef96e 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,7 +1,11 @@ -function nethist(data_input, dist_user, initial_node_labels, params::GreedyParams) +function nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated::Bool = false) + return _nethist(data_input, dist_user, initial_node_labels, params, Val(zero_inflated)) +end + +function _nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated) @debug "preprocessing data" - dist = Dist(dist_user) + dist = get_ref_dist(dist_user, zero_inflated) g = preprocess_data(data_input, dist) @debug "started optimizatiion" @@ -12,6 +16,13 @@ function nethist(data_input, dist_user, initial_node_labels, params::GreedyParam end +function get_ref_dist(dist::D, ::Val{true}) where {D} + return Dist(ZeroInflated(dist)) +end +function get_ref_dist(dist::D, ::Val{false}) where {D} + return Dist(dist) +end + function preprocess_data(data, dist::Dist) A = EdgeList(_fast_compressed_obs(dist, data)) return A, dist diff --git a/src/assignment.jl b/src/assignment.jl index b7f29fb..9acdf67 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -107,7 +107,7 @@ function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, edge_list: for (v, e) in iterate_neighbors(edge_list, u) g2 = node_labels[v] if u > v - log_likelihood[g1,g2] += logpdf(θ[g1,g2], e) + log_likelihood[g1, g2] += logpdf(θ[g1, g2], e) else break end diff --git 
a/src/distributions/include.jl b/src/distributions/include.jl index 4512d42..6b0a06c 100644 --- a/src/distributions/include.jl +++ b/src/distributions/include.jl @@ -1 +1,2 @@ include("distributions_type.jl") +include("zero_inflated.jl") diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 61b5dd5..6702899 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -4,6 +4,51 @@ struct ZeroInflated{D, F} proba_zero::F end +abstract type SampleZI end + +struct SampleNZ{F} <: SampleZI + value::F +end + +struct SampleZ <: SampleZI +end + function ZeroInflated(dist) return ZeroInflated(dist, 0.0) end + +function logpdf(zi::ZeroInflated{D, F}, x::SampleZ) where {D, F} + return log(zi.proba_zero) +end + +function logpdf(zi::ZeroInflated{D, F}, x::SampleNZ{T}) where {D, F, T} + return log(1 - zi.proba_zero) + logpdf(zi.dist, x.value) +end + +function agg_params( + zi1::ZeroInflated{D, F}, zi2::ZeroInflated{D, F}, w1, w2) where {D, F} + new_dist = agg_params(zi1.dist, zi2.dist, w1, w2) + new_proba_zero = w1 * zi1.proba_zero + w2 * zi2.proba_zero + return ZeroInflated(new_dist, new_proba_zero) +end + +zero(zi::ZeroInflated) = ZeroInflated(zero(zi.dist), 0.0) + +eltype(zi::ZeroInflated{D, F}) where {D, F} = Union{SampleZ, SampleNZ{eltype(D)}} + + +function fit(zi::ZeroInflated{D, F}, x::SampleZ) where {D, F} + return ZeroInflated(zero(zi.dist), 1.0) +end + +function fit(zi::ZeroInflated{D, F}, x::SampleNZ{T}) where {D, F, T} + return ZeroInflated(fit(zi.dist, x.value), 0.0) +end + +function _fast_compressed_obs(zi::ZeroInflated, x) + if iszero(x) + return SampleZ() + else + return SampleNZ(_fast_compressed_obs(zi.dist, x)) + end +end diff --git a/src/distributions/zero_mixture.jl b/src/distributions/zero_mixture.jl deleted file mode 100644 index 0cfae5a..0000000 --- a/src/distributions/zero_mixture.jl +++ /dev/null @@ -1,50 +0,0 @@ -struct ZeroInflated{B, D} - edge_proba::B - dist::D -end - -function 
ZeroInflated(p::Real, dist::D) where {D} - return ZeroInflated(Bernoulli(1 - p), dist) -end - -function Distributions.pdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x) + - pdf(d.edge_proba, one(x)) * pdf(d.dist, x) -end - -function get_proba_zero(d::ZeroInflated) - return pdf(d.edge_proba, 0) -end - -function rand(rng::Random.AbstractRNG, d::ZeroInflated) - return rand(rng, d.edge_proba) * rand(rng, d.dist) -end - -logpdf(d::ZeroInflated, x::Real) = log(pdf(d, x)) - -minimum(d::ZeroInflated) = min(minimum(d.dist), 0) - -maximum(d::ZeroInflated) = max(maximum(d.dist), 0) - -insupport(d::ZeroInflated, x::Real) = x == 0 || insupport(d.dist, x) - -function Distributions.cdf(d::ZeroInflated, x::Real) - return pdf(d.edge_proba, zero(x)) * _dirac_delta(x, zero(x), Inf) + - cdf(d.dist, x) * pdf(d.edge_proba, one(x)) -end - -function Distributions.params(d::ZeroInflated) - (first(params(d.edge_proba)), params(d.dist)...) -end - -function Distributions.fit( - ::Type{ZeroInflated{B, D}}, data::AbstractArray, n_cat) where {B, D} - indices_0 = findall(x -> iszero(x), data) - p = length(indices_0) / length(data) - if p != 1 - return ZeroInflated( - p, fit(D, data[setdiff(collect(eachindex(data)), indices_0)])) - else - return ZeroInflated(1.0, D()) - end -end From 6ac0c1cbfd3eafd7e33d46cb033589190ff950b0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 26 May 2025 19:46:16 +0200 Subject: [PATCH 167/266] remove over type specialisation --- src/NetworkHistogram.jl | 2 +- src/distributions/zero_inflated.jl | 52 ++++++++++++++++++------------ src/optimization/greedy.jl | 2 +- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 57169bb..70afaa0 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -2,7 +2,7 @@ module NetworkHistogram using StatsBase using StaticArrays using ProgressMeter -import StatsAPI: loglikelihood, fit +import StatsAPI: loglikelihood, fit, 
params import Base: convert, eltype, zero diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 6702899..b112e79 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -4,25 +4,23 @@ struct ZeroInflated{D, F} proba_zero::F end -abstract type SampleZI end - -struct SampleNZ{F} <: SampleZI +struct SampleZI{F} value::F + iszero::Bool end -struct SampleZ <: SampleZI -end function ZeroInflated(dist) return ZeroInflated(dist, 0.0) end -function logpdf(zi::ZeroInflated{D, F}, x::SampleZ) where {D, F} - return log(zi.proba_zero) -end -function logpdf(zi::ZeroInflated{D, F}, x::SampleNZ{T}) where {D, F, T} - return log(1 - zi.proba_zero) + logpdf(zi.dist, x.value) +function logpdf(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} + if x.iszero + return log(zi.proba_zero) + else + return log(1 - zi.proba_zero) + logpdf(zi.dist, x.value) + end end function agg_params( @@ -34,21 +32,33 @@ end zero(zi::ZeroInflated) = ZeroInflated(zero(zi.dist), 0.0) -eltype(zi::ZeroInflated{D, F}) where {D, F} = Union{SampleZ, SampleNZ{eltype(D)}} +eltype(zi::ZeroInflated{D, F}) where {D, F} = SampleZI{eltype(D)} +params(zi::ZeroInflated{D, F}) where {D, F} = (params(zi.dist)..., zi.proba_zero) -function fit(zi::ZeroInflated{D, F}, x::SampleZ) where {D, F} - return ZeroInflated(zero(zi.dist), 1.0) +function fit(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} + if x.iszero + return ZeroInflated(zero(zi.dist), 1.0) + else + return ZeroInflated(fit(zi.dist, x.value), 0.0) + end end -function fit(zi::ZeroInflated{D, F}, x::SampleNZ{T}) where {D, F, T} - return ZeroInflated(fit(zi.dist, x.value), 0.0) +function _fast_compressed_obs(zi::ZeroInflated, x, filter = iszero) + return SampleZI(_fast_compressed_obs(zi.dist,x), filter(x)) end -function _fast_compressed_obs(zi::ZeroInflated, x) - if iszero(x) - return SampleZ() - else - return SampleNZ(_fast_compressed_obs(zi.dist, x)) - end + +function 
unwrap(d::Dist{ZeroInflated{B,D}}) where {B,D} + #yeah I know again... + return d.dist.dist +end + + +function get_proportion_observed(d::Dist{ZeroInflated{B, D}}) where {B, D} + return (1-d.dist.proba_zero) * d.counts +end + +function get_proportion_observed(d::Dist) + return d.counts end diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index ddf65f7..edbe89b 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -29,7 +29,7 @@ function greedy_improve!(a::Assignment; params = GreedyParams()) for i in 1:params.max_iter local_search!(a, swap, params) - next!(p; showvalues = [("ll: ",sum(a.log_likelihood)), info_to_print(params.stop_rule)]) + next!(p; showvalues = [("ll: ", loglikelihood(a)), info_to_print(params.stop_rule)]) if stopping_rule(a, params.stop_rule) if i < 10 @warn "Greedy search stopped early after $(i) iterations" From 9378236bb2cb62ff1bfa77f0ddb8c3ec2dfceda8 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 27 May 2025 15:33:12 +0200 Subject: [PATCH 168/266] add fast update for heavy ll --- Project.toml | 2 + src/NetworkHistogram.jl | 2 +- src/assignment.jl | 65 ++++++++++++++++++++++--- src/distributions/distributions_type.jl | 18 ++++++- src/optimization/swap_workspace.jl | 40 ++++++++++++++- 5 files changed, 115 insertions(+), 12 deletions(-) diff --git a/Project.toml b/Project.toml index 063a47c..a2c0a87 100644 --- a/Project.toml +++ b/Project.toml @@ -20,6 +20,7 @@ KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" +OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -33,6 +34,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" BenchmarkTools = "1.6.0" CairoMakie = 
"0.13.4" KrylovKit = "0.9.5" +OhMyThreads = "0.8.3" julia = "1.11" [extras] diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 70afaa0..00027f1 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -4,7 +4,7 @@ using StaticArrays using ProgressMeter import StatsAPI: loglikelihood, fit, params import Base: convert, eltype, zero - +using OhMyThreads: TaskLocalValue, @tasks, @local include("utils/include.jl") using .FastSymArray diff --git a/src/assignment.jl b/src/assignment.jl index 9acdf67..3577e76 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -101,7 +101,6 @@ function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, edge_list: end end end - for u in 1:nodes(dists) g1 = node_labels[u] for (v, e) in iterate_neighbors(edge_list, u) @@ -113,12 +112,62 @@ function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, edge_list: end end end - # for k in 1:number_groups - # for l in k:number_groups - # log_likelihood[k, - # l] = loglikelihood( - # θ[k, l], get_edges_in_groups(node_labels, edge_list, k, l)) - # end - # end return θ, log_likelihood end + + + + +function _compute_theta_and_ll_not_working( + node_labels, + dists::EdgeList{Dist{D}}, + edge_list::EdgeList{E}, + dist::Dist{D} +) where {E, D} + n = nodes(dists) + K = length(unique(node_labels)) + + # --- PASS 1: accumulate θ in parallel --- + partial_thetas = @tasks for u in 1:n + @set collect = true + @local theta_local = SymArray(K, zero(dist)) + gu = node_labels[u] + for (v, d) in iterate_neighbors(dists, u) + if u < v + gv = node_labels[v] + theta_local[gu, gv] = add_to(theta_local[gu, gv], d) + end + end + theta_local + end + # reduce into the final θ + θ = SymArray(K, zero(dist)) + for t in partial_thetas + for j in 1:K, i in j:K + θ[i, j] = add_to(θ[i, j], t[i, j]) + end + end + + # --- PASS 2: compute log‐likelihoods in parallel --- + partial_lls = @tasks for u in 1:n + @set collect = true + @local ll_local = SymArray(K, 0.0) + gu = 
node_labels[u] + for (v, e) in iterate_neighbors(edge_list, u) + if u < v + gv = node_labels[v] + ll_local[gu, gv] += logpdf(θ[gu, gv], e) + end + end + ll_local + end + # reduce into the final ll + ll = SymArray(K, 0.0) + for p in partial_lls + for j in 1:K, i in j:K + ll[i, j] += p[i, j] + end + end + + return θ, ll +end diff --git a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl index 1ebd4c6..4d75b38 100644 --- a/src/distributions/distributions_type.jl +++ b/src/distributions/distributions_type.jl @@ -31,8 +31,22 @@ end ## probably this is fucked ... -add_to(d::Dist, dist::Dist) = add_to(d, dist.dist) -remove_from(d::Dist, dist::Dist) = remove_from(d, dist.dist) +# add_to(d::Dist, dist::Dist) = add_to(d, dist.dist) + +function add_to(avgdist::Dist{D}, dist::Dist{D}) where {D} + Dist( + agg_params( + avgdist.dist, dist.dist, avgdist.counts / (avgdist.counts + dist.counts), + dist.counts / (avgdist.counts + dist.counts)), + avgdist.counts + dist.counts) +end +function remove_from(avgdist::Dist, dist::Dist) + Dist( + agg_params( + avgdist.dist, dist.dist, avgdist.counts / max(1, (avgdist.counts - dist.counts)), + - dist.counts / max(1, (avgdist.counts - dist.counts))), + avgdist.counts - dist.counts) +end # expose compression step that assumes there is a pdf(d, typeof(compressed(x))) properly defined # by default do nothing diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index c61d0ee..75b8a03 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -38,4 +38,42 @@ function _slow_swap!(a::Assignment, s::Swap) a.θ, a.log_likelihood = _compute_theta_and_ll(a.node_labels, a.dists, a.edges, a.θ[1,1]) end -apply_swap!(a::Assignment, s::Swap) = _slow_swap!(a, s) +# apply_swap!(a::Assignment, s::Swap) = _slow_swap!(a, s) + + +function apply_swap!(a::Assignment, s::Swap) + u,v = s.u, s.v + gu = a.node_labels[u] + gv = a.node_labels[v] + groups_concerned = 
Set{Tuple{Int,Int}}([minmax(gu, gv)]) + + for (node, d) in iterate_neighbors(a.dists, u) + if node == v + continue + end + g1 = a.node_labels[node] + a.θ[gv, g1] = add_to(a.θ[gv, g1], d) + a.θ[gu, g1] = remove_from(a.θ[gu, g1], d) + push!(groups_concerned, minmax(gu,g1)) + push!(groups_concerned, minmax(gv,g1)) + end + + for (index, (node, d)) in enumerate(iterate_neighbors(a.dists, v)) + if node == u + continue + end + g2 = a.node_labels[node] + a.θ[gu, g2] = add_to(a.θ[gu, g2], d) + a.θ[gv, g2] = remove_from(a.θ[gv, g2], d) + push!(groups_concerned, minmax(gv,g2)) + push!(groups_concerned, minmax(gu,g2)) + end + + swap_node_labels!(a, u, v) + for (g1, g2) in groups_concerned + a.log_likelihood[g1, g2] = 0.0 + for e in get_edges_in_groups(a.node_labels, a.edges, g1, g2) + a.log_likelihood[g1, g2] += logpdf(a.θ[g1, g2], e) + end + end +end From 85546e7307fab969bf93549f9058c6f21eeb2d7d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 6 Jun 2025 12:21:09 +0200 Subject: [PATCH 169/266] add skeleton for clustered start --- Project.toml | 1 + src/EdgeList.jl | 14 ++++++++++++++ src/NetworkHistogram.jl | 1 + src/optimization/config_rules/InitRule.jl | 18 ++++++++++++------ 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/Project.toml b/Project.toml index a2c0a87..8c719d2 100644 --- a/Project.toml +++ b/Project.toml @@ -33,6 +33,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] BenchmarkTools = "1.6.0" CairoMakie = "0.13.4" +Clustering = "0.15.7" KrylovKit = "0.9.5" OhMyThreads = "0.8.3" julia = "1.11" diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 37b0e9b..fe2d5dc 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -16,6 +16,20 @@ number_nodes(edgelist::EdgeList) = nodes(edgelist) EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} = _from_adj_to_edge_list(A) EdgeList(adj_list::EdgeList) = adj_list +function get_edge(A::EdgeList{E}, i::Int, j::Int) where {E} + if i == j + return zero(E) + end + if j ∉ A.name_list[i] && i ∉ 
A.name_list[j] + return zero(E) + end + for (k, e) in iterate_neighbors(A, i) + if k == j + return e + end + end +end + # function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} # n = size(A, 1) # data = Vector{Vector{E}}(undef, n) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 00027f1..018519f 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -5,6 +5,7 @@ using ProgressMeter import StatsAPI: loglikelihood, fit, params import Base: convert, eltype, zero using OhMyThreads: TaskLocalValue, @tasks, @local +using Clustering include("utils/include.jl") using .FastSymArray diff --git a/src/optimization/config_rules/InitRule.jl b/src/optimization/config_rules/InitRule.jl index db607ce..785b1d3 100644 --- a/src/optimization/config_rules/InitRule.jl +++ b/src/optimization/config_rules/InitRule.jl @@ -1,7 +1,7 @@ abstract type StartingAssignment end struct OrderedStart <: StartingAssignment end struct RandomStart <: StartingAssignment end - +struct ClusteredStart <: StartingAssignment end struct FromAssignment{A} <: StartingAssignment assignment::A @@ -27,25 +27,26 @@ end """ initialize_node_labels(g, h, starting_assignment_rule::StartingAssignment) -initialize node labels based on the `starting_assignment_rule`, and return a `GroupSize` -object and a vector of node labels. +initialize node labels based on the `starting_assignment_rule`, and return a vector of +node labels. # Implemented rules - `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. - `RandomStart()`: Randomly assign nodes to groups. +- `ClusteredStart()`: Assign nodes to groups based on a kmedoids algorithm. 
""" initialize_node_labels function initialize_node_labels(g, h, ::OrderedStart) group_size = GroupSize(number_nodes(g), h) node_labels = StatsBase.inverse_rle(1:length(group_size), group_size) - return group_size, node_labels + return node_labels end function initialize_node_labels(g, h, ::RandomStart) group_size, node_labels = initialize_node_labels(g, h, OrderedStart()) Random.shuffle!(node_labels) - return group_size, node_labels + return node_labels end @@ -56,5 +57,10 @@ end function initialise_node_labels(g, h, init_rule::FromNodeLabels{L}) where {L} @assert number_nodes(g) == length(init_rule.node_labels) - return group_size, deepcopy(init_rule.node_labels) + return deepcopy(init_rule.node_labels) +end + + +function number_nodes(g::AbstractMatrix) + return size(g, 1) end From d630df5897c93bb1c8ddbead999d63d5c2bde43b Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 8 Jul 2025 12:56:44 +0200 Subject: [PATCH 170/266] typo in zero inflated --- src/distributions/zero_inflated.jl | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index b112e79..1bfbd7b 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -23,11 +23,20 @@ function logpdf(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} end end +# function logpdf(zi::ZeroInflated{D, F}, x) where {D, F} +# if iszero(x) +# return log(zi.proba_zero) +# else +# return log(1 - zi.proba_zero) + logpdf(zi.dist, x.value) +# end +# end + function agg_params( zi1::ZeroInflated{D, F}, zi2::ZeroInflated{D, F}, w1, w2) where {D, F} - new_dist = agg_params(zi1.dist, zi2.dist, w1, w2) new_proba_zero = w1 * zi1.proba_zero + w2 * zi2.proba_zero - return ZeroInflated(new_dist, new_proba_zero) + return ZeroInflated( + agg_params(zi1.dist, zi2.dist, w1, w2), + new_proba_zero) end zero(zi::ZeroInflated) = ZeroInflated(zero(zi.dist), 0.0) @@ -62,3 +71,20 @@ end function 
get_proportion_observed(d::Dist) return d.counts end + + +# function fit(zd::ZeroInflated, x::SampleZI) +# if x.iszero +# return ZeroInflated(zero(zd.dist), 1.0) +# else +# return ZeroInflated(fit(zd.dist, x.value), 0.0) +# end +# end + +# function fit(zd::ZeroInflated{D, F}, x) where {D, F} +# if iszero(x) +# return ZeroInflated(zero(zd.dist), 1.0) +# else +# return ZeroInflated(fit(zd.dist, x), 0.0) +# end +# end From 99564064b41db3187cf470dcc259e7e884710358 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 10 Jul 2025 20:22:02 +0200 Subject: [PATCH 171/266] add utilities for sbm --- src/block_model.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/block_model.jl b/src/block_model.jl index 889d4c5..48dce3d 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -20,6 +20,16 @@ function BlockModel(a::Assignment) return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(_dists, sizes, cumulative_sizes) end + +function BlockModel(nodes_labels, θ) + k = length(unique(nodes_labels)) + sizes = SVector{k}(counts(nodes_labels) / length(nodes_labels)) + cumulative_sizes = SVector{k}(cumsum(sizes)) + _dists = unwrap.(θ) + return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(_dists, sizes, cumulative_sizes) +end + + function map_ξ_to_block(bm::BlockModel, ξ::T) where {T<:Real} return findfirst(x -> x >= ξ, bm.cum_sizes) end From b663709b69f82a9539f2db5f4df00bb996ebf8b1 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 24 Sep 2025 09:34:19 +0200 Subject: [PATCH 172/266] add more detailed readme --- README.md | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8d7bf2e..4b8d822 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,31 @@ [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://sds-epfl.github.io/NetworkHistogram.jl/stable/) [![DOI](https://zenodo.org/badge/572018079.svg)](https://zenodo.org/doi/10.5281/zenodo.10212851) - 
-Implementation of the network histogram for graphon estimation from the paper [Network histograms and universality of blockmodel approximation](https://doi.org/10.1073/pnas.1400374111) by Sofia C. Olhede and Patrick J. Wolfe. - +Implementation of the network histogram for graphon estimation from the paper +[Network histograms and universality of blockmodel approximation (2014)](https://doi.org/10.1073/pnas.1400374111) +by Sofia C. Olhede and Patrick J. Wolfe and its extension to decorated graphs +by Charles Dufour and Sofia C. Olhede +[Inference for decorated graphs and application to multiplex networks (2024)](https://arxiv.org/abs/2408.12339). + +The network histogram is a nonparametric estimator for the generating mechanism +of an exchangeable random graph (see graphons, decorated graphons and +probability graphons). We assume our observed graph is +$A \in \mathcal{K}^{n \times n}$, where $\mathcal{K}$ is a set of edge +decorations (e.g. $\{0,1\}$ for unweighted graphs, $\mathbb{N}$ for count +edges, $\mathbb{R}$ for real-valued edges, etc.). Using the Aldous-Hoover +theorem, we know that $A$ is generated from a graphon +$W: [0,1]^2 \to \mathcal{P}\left(\mathcal{K}\right)$, where +$\mathcal{P}\left(\mathcal{K}\right)$ is the set of probability measures on +$\mathcal{K}$ in the following way: + +1. Sample $U_1, \ldots, U_n \sim \text{iid } \text{Uniform}[0,1]$. +2. For each pair of nodes $i,j$, sample the edge $A_{ij} \sim W(U_i, U_j)$ + independently. + +The network histogram approximates the generating graphon +$W: [0,1]^2 \to \mathcal{P}\left(\mathcal{K}\right)$ by a piecewise constant +function, i.e. a stochastic block model with $k$ blocks. For details, see the +papers mentioned above. ## Installation @@ -24,7 +46,8 @@ Pkg.add("NetworkHistogram") ## Usage -We fit the estimator and then extract the estimated graphon matrix and node labels. +We fit the estimator and then extract the estimated graphon matrix and node +labels. 
```julia using NetworkHistogram, LinearAlgebra @@ -45,4 +68,5 @@ sbm_matrix = estimate.θ node_labels = estimate.node_labels ``` -You can control the optimization process by modifying the rules used in the optimization. Check out the docs for more information. +You can control the optimization process by modifying the rules used in the +optimization. Check out the docs for more information. From ee1e56c9b4b7c18905d7c49c5c26cae94ef19d14 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 24 Sep 2025 11:40:16 +0200 Subject: [PATCH 173/266] trying to unify once more --- src/optimization/swap_categorical.jl | 91 ++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 src/optimization/swap_categorical.jl diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl new file mode 100644 index 0000000..2783900 --- /dev/null +++ b/src/optimization/swap_categorical.jl @@ -0,0 +1,91 @@ +mutable struct WorkspaceDiscreteSwap{ + D, C <: AbstractMatrix, R <: AbstractMatrix, + R2 <: AbstractMatrix, S <: AbstractMatrix{D}, + L <: AbstractMatrix} + θ::S + log_likelihood_per_group::L + counts::C + realized::R + estimated::R2 +end + +struct Cat{M, V <: AbstractVector{<:Real}} + p::V + function Cat(p::AbstractVector{<:Real}) + new{Val{length(p)}, typeof(p)}(p / sum(p)) + end +end + +num_categories(::Type{Cat{Val{M}, V}}) where {M, V} = M +num_categories(::Cat{Val{M}, V}) where {M, V} = M +zero(c::Cat{Val{M}, V}) where {M, V} = Cat(ones(eltype(V), M)) +distance(c1::Cat{M, V}, c2::Cat{M, V}) where {M, V} = sum(abs.(c1.p .- c2.p)) +eltype(::Cat{M, V}) where {M, V} = Int +params(c::Cat) = (c.p,) +logpdf(c::Cat, x::Int) = log(c.p[x]) + +function set_params!(c::Cat{M, V}, p::V) where {M, V} + c.p .= p +end + +function Assignment( + node_labels, edge_list::EdgeList{E}, + dist::Dist{D}) where {E, D <: Cat} + dists = fit(dist, edge_list) + realized = Matrix{Vector{Int}}(undef, n_groups, n_groups) + counts = Matrix{Int}(undef, n_groups, 
n_groups) + estimated = Matrix{Vector{Float64}}(undef, n_groups, n_groups) + fill!(realized, zeros(Int, num_categories(unwrap(dist)))) + fill!(counts, 0) + for u in 1:n_nodes + g1 = node_labels[u] + for (v, e) in iterate_neighbors(edge_list, u) + g2 = node_labels[v] + if u < v + counts[minmax(g1, g2)...] += 1 + realized[minmax(g1, g2)...][e] += 1 + else + break + end + end + end + for g2 in 1:n_groups + for g1 in g2:n_groups + estimated[g1, g2] = (counts[g1, g2] == 0) ? + zeros(Float64, num_categories(unwrap(dist))) : + (realized[g1, g2]) ./ counts[g1, g2] + end + end + + θ = SymArray(n_groups, zero(dist)) + log_likelihood_per_group = SymArray(n_groups, 0.0) + for g2 in 1:n_groups + for g1 in g2:n_groups + set_params!(θ[g1, g2], estimated[g1, g2]) + for m in 1:num_categories(unwrap(dist)) + if realized[g1, g2][m] > 0 + log_likelihood_per_group[g1, g2] += realized[g1, g2][m] * + logpdf(θ[g1, g2], m) + end + end + end + end + w = WorkspaceDiscreteSwap{Dist{D}, + Matrix{Int}, Matrix{Float64}, Matrix{Float64}, + SymArray{D}, SymArray{Float64}}( + deepcopy(θ), deepcopy(log_likelihood_per_group), + counts, deepcopy(realized), deepcopy(estimated)) + return Assignment( + node_labels, edge_list, dists, θ, log_likelihood_per_group, w) +end + +function make_workspace(a::Assignment{E, Dist{D}, + F, W}) where {E, F, D <: Cat, W} + return deepcopy(a.additional_workspace) +end + +function make_swap!(ws::WorkspaceDiscreteSwap, + a) where {E, F, D <: Categorical} + ws.θ = deepcopy(a.θ) + ws.log_likelihood_per_group = deepcopy(a.log_likelihood) +end From 1f98baa57a07b1dc7037bdae5ced66e321525beb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 24 Sep 2025 11:40:29 +0200 Subject: [PATCH 174/266] formatting mostly --- src/EdgeList.jl | 29 +++++----- src/api.jl | 14 ++--- src/assignment.jl | 72 +++---------------------- src/distributions/distributions_type.jl | 40 +++++++------- src/optimization/greedy.jl | 15 ++++-- src/optimization/swap_workspace.jl | 26 +++++---- 6 files changed, 
78 insertions(+), 118 deletions(-) diff --git a/src/EdgeList.jl b/src/EdgeList.jl index fe2d5dc..95e79a9 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -7,19 +7,21 @@ function neighbors(A::EdgeList{E}, i::Int) where {E} return A.name_list[i], A.data[i] end - iterate_neighbors(A::EdgeList, i::Int) = zip(neighbors(A, i)...) edge_type(A::EdgeList{E}) where {E} = E nodes(edgelist::EdgeList) = length(edgelist.data) number_nodes(edgelist::EdgeList) = nodes(edgelist) -EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} = _from_adj_to_edge_list(A) +function EdgeList(A::AbstractMatrix{<:Union{Missing, E}}) where {E} + _from_adj_to_edge_list(A) +end EdgeList(adj_list::EdgeList) = adj_list function get_edge(A::EdgeList{E}, i::Int, j::Int) where {E} if i == j return zero(E) end + # TODO: probably can remove this if j ∉ A.name_list[i] && i ∉ A.name_list[j] return zero(E) end @@ -47,9 +49,8 @@ end # return EdgeList(data, name_list) # end - - -function _from_adj_to_edge_list(A::AbstractMatrix, function_to_apply = identity) +function _from_adj_to_edge_list( + A::AbstractMatrix, function_to_apply = identity) n = size(A, 1) input = findfirst(x -> !ismissing(x), A) test = function_to_apply(A[input]) @@ -59,9 +60,9 @@ function _from_adj_to_edge_list(A::AbstractMatrix, function_to_apply = identity) data[j] = Vector{typeof(test)}(undef, 0) name_list[j] = Vector{Int}(undef, 0) for i in 1:n - if !ismissing(A[i,j]) + if !ismissing(A[i, j]) end - if !ismissing(A[i,j]) && i != j # gonna be an issue with MC! have to define 0 chain and fast operations on them + if !ismissing(A[i, j]) && i != j # gonna be an issue with MC! 
have to define 0 chain and fast operations on them push!(name_list[j], i) push!(data[j], function_to_apply(A[i, j])) end @@ -70,12 +71,16 @@ function _from_adj_to_edge_list(A::AbstractMatrix, function_to_apply = identity) return EdgeList(data, name_list) end -_fast_compressed_obs(d::Dist, A::AbstractMatrix) = _from_adj_to_edge_list(A, x -> _fast_compressed_obs(d, x)) -_fast_compressed_obs(d::Dist, A::EdgeList{E}) where {E} = _make_shift_broadcast(A.data, x -> _fast_compressed_obs(d, x)) +function _fast_compressed_obs(d::Dist, A::AbstractMatrix) + _from_adj_to_edge_list(A, x -> _fast_compressed_obs(d, x)) +end +function _fast_compressed_obs(d::Dist, A::EdgeList{E}) where {E} + _make_shift_broadcast(A.data, x -> _fast_compressed_obs(d, x)) +end function _make_shift_broadcast(A::EdgeList, f) # may work ? -> data = f.(A.data) - n = length(A.data) + n = length(A.data) test = f(A.data[1][1]) data = Vector{Vector{typeof(test)}}(undef, n) for j in 1:n @@ -84,15 +89,13 @@ function _make_shift_broadcast(A::EdgeList, f) return EdgeList(data, A.name_list) end - #convert(::Type{EdgeList}, A::AbstractMatrix) = EdgeList(A) - function fit(d::Dist, A::EdgeList{E}) where {E} new_data = Vector{Vector{typeof(d)}}(undef, length(A.data)) for j in 1:length(A.data) new_data[j] = Vector{typeof(d)}(undef, length(A.data[j])) - for (k,e) in enumerate(A.data[j]) + for (k, e) in enumerate(A.data[j]) new_data[j][k] = fit(d, e) end end diff --git a/src/api.jl b/src/api.jl index 76ef96e..ccc51a9 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,9 +1,11 @@ -function nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated::Bool = false) - return _nethist(data_input, dist_user, initial_node_labels, params, Val(zero_inflated)) +function nethist(data_input, dist_user, initial_node_labels, + params::GreedyParams, zero_inflated::Bool = false) + return _nethist( + data_input, dist_user, initial_node_labels, params, Val(zero_inflated)) end - -function _nethist(data_input, 
dist_user, initial_node_labels, params::GreedyParams, zero_inflated) +function _nethist(data_input, dist_user, initial_node_labels, + params::GreedyParams, zero_inflated) @debug "preprocessing data" dist = get_ref_dist(dist_user, zero_inflated) g = preprocess_data(data_input, dist) @@ -15,7 +17,6 @@ function _nethist(data_input, dist_user, initial_node_labels, params::GreedyPara return postprocess(out) end - function get_ref_dist(dist::D, ::Val{true}) where {D} return Dist(ZeroInflated(dist)) end @@ -25,10 +26,9 @@ end function preprocess_data(data, dist::Dist) A = EdgeList(_fast_compressed_obs(dist, data)) - return A, dist + return A, dist end - function postprocess(out) return out return out.node_labels, BlockModel(out) diff --git a/src/assignment.jl b/src/assignment.jl index 3577e76..fe6ee65 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -38,12 +38,13 @@ Base.@propagate_inbounds function Base.getindex( return i < length(g) ? g.group_number[1] : g.group_number[2] end -mutable struct Assignment{E, D, F} +mutable struct Assignment{E, D, F, W} node_labels::AbstractVector{Int} const edges::EdgeList{E} const dists::EdgeList{D} θ::SymArray{D} log_likelihood::SymArray{F} + additional_workspace::W end number_nodes(a::Assignment) = length(a.node_labels) @@ -66,14 +67,13 @@ function get_edges_in_groups(a::Assignment, g1::Int, g2::Int) end function get_edges_in_groups(node_labels, edges_all, g1, g2) - edges = Vector{edge_type(edges_all)}() nodes_g1 = findall(x -> x == g1, node_labels) nodes_g2 = findall(x -> x == g2, node_labels) for u in nodes_g1 for (v, e) in iterate_neighbors(edges_all, u) - if v in nodes_g2 && ((g1 == g2 && u < v) || g1 != g2) + if v in nodes_g2 && ((g1 == g2 && u < v) || g1 != g2) push!(edges, e) end end @@ -81,14 +81,15 @@ function get_edges_in_groups(node_labels, edges_all, g1, g2) return edges end -function Assignment(node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} +function Assignment( + node_labels, 
edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} dists = fit(dist, edge_list) θ, ll = _compute_theta_and_ll(node_labels, dists, edge_list, dist) - return Assignment(node_labels, edge_list, dists, θ, ll) + return Assignment(node_labels, edge_list, dists, θ, ll, nothing) end - -function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} +function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, + edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} number_groups = length(unique(node_labels)) θ = SymArray(number_groups, zero(dist)) log_likelihood = SymArray(number_groups, 0.0) @@ -114,60 +115,3 @@ function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, edge_list: end return θ, log_likelihood end - - - - -function _compute_theta_and_ll_not_working( - node_labels, - dists::EdgeList{Dist{D}}, - edge_list::EdgeList{E}, - dist::Dist{D} -) where {E, D} - n = nodes(dists) - K = length(unique(node_labels)) - - # --- PASS 1: accumulate θ in parallel --- - partial_thetas = @tasks for u in 1:n - @set collect = true - @local theta_local = SymArray(K, zero(dist)) - gu = node_labels[u] - for (v, d) in iterate_neighbors(dists, u) - if u < v - gv = node_labels[v] - theta_local[gu, gv] = add_to(theta_local[gu, gv], d) - end - end - theta_local - end - # reduce into the final θ - θ = SymArray(K, zero(dist)) - for t in partial_thetas - for j in 1:K, i in j:K - θ[i, j] = add_to(θ[i, j], t[i, j]) - end - end - - # --- PASS 2: compute log‐likelihoods in parallel --- - partial_lls = @tasks for u in 1:n - @set collect = true - @local ll_local = SymArray(K, 0.0) - gu = node_labels[u] - for (v, e) in iterate_neighbors(edge_list, u) - if u < v - gv = node_labels[v] - ll_local[gu, gv] += logpdf(θ[gu, gv], e) - end - end - ll_local - end - # reduce into the final ll - ll = SymArray(K, 0.0) - for p in partial_lls - for j in 1:K, i in j:K - ll[i, j] += p[i, j] - end - end - - return θ, ll -end diff --git 
a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl index 4d75b38..d710653 100644 --- a/src/distributions/distributions_type.jl +++ b/src/distributions/distributions_type.jl @@ -1,22 +1,24 @@ struct Dist{D} dist::D counts::Int - Dist(d,counts::Int) = counts < 0 ? error("Counts cannot be negative") : new{typeof(d)}(d, counts) + function Dist(d, counts::Int) + counts < 0 ? error("Counts cannot be negative") : + new{typeof(d)}(d, counts) + end end - - Dist(d) = Dist(d, 1) -zero(d::Dist) = Dist(zero(d.dist),0) +zero(d::Dist) = Dist(zero(d.dist), 0) Base.broadcastable(x::Dist) = Ref(x) function add_to(avgdist::Dist{D}, dist::D) where {D} - inner_dist = agg_params(avgdist.dist, dist, avgdist.counts / (avgdist.counts + 1), 1 / (avgdist.counts + 1)) + inner_dist = agg_params( + avgdist.dist, dist, avgdist.counts / (avgdist.counts + 1), + 1 / (avgdist.counts + 1)) return Dist(inner_dist, avgdist.counts + 1) end - function remove_from(avgdist::Dist{D}, dist::D) where {D} if avgdist.counts <= 0 error("Cannot remove from a distribution with 0 counts") @@ -26,25 +28,30 @@ function remove_from(avgdist::Dist{D}, dist::D) where {D} # else # error("Cannot remove from a distribution with 1 count unless the parameters are the same, got $(params(avgdist)) and $(params(dist))") # end - return Dist(agg_params(avgdist.dist, dist, avgdist.counts / max(1,(avgdist.counts - 1)), - 1 / max(1,(avgdist.counts - 1))), avgdist.counts -1) + return Dist( + agg_params( + avgdist.dist, dist, avgdist.counts / max(1, (avgdist.counts - 1)), + -1 / max(1, (avgdist.counts - 1))), + avgdist.counts - 1) end - ## probably this is fucked ... 
# add_to(d::Dist, dist::Dist) = add_to(d, dist.dist) function add_to(avgdist::Dist{D}, dist::Dist{D}) where {D} Dist( agg_params( - avgdist.dist, dist.dist, avgdist.counts / (avgdist.counts + dist.counts), + avgdist.dist, dist.dist, avgdist.counts / + (avgdist.counts + dist.counts), dist.counts / (avgdist.counts + dist.counts)), avgdist.counts + dist.counts) end function remove_from(avgdist::Dist, dist::Dist) Dist( agg_params( - avgdist.dist, dist.dist, avgdist.counts / max(1, (avgdist.counts - dist.counts)), - - dist.counts / max(1, (avgdist.counts - dist.counts))), + avgdist.dist, dist.dist, + avgdist.counts / max(1, (avgdist.counts - dist.counts)), + -dist.counts / max(1, (avgdist.counts - dist.counts))), avgdist.counts - dist.counts) end @@ -52,7 +59,6 @@ end # by default do nothing _fast_compressed_obs(d, x) = x - # what to delegate to the underlying distribution for f in [:logpdf, :sample, :distance, :eltype, :params, :_fast_compressed_obs] @eval $f(d::Dist, args...) = $f(d.dist, args...) @@ -65,18 +71,16 @@ loglikelihood(d::Dist, x) = isempty(x) ? 
0.0 : sum(logpdf(d, y) for y in x) # loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) unwrap(d::Dist) = d.dist - - - # Bernoulli distribution (example) -struct Bernoulli{T<:Real} +struct Bernoulli{T <: Real} p::T end - zero(d::Bernoulli) = Bernoulli(zero(d.p)) -agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) = Bernoulli(w1 * d1.p + w2 * d2.p) +function agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) + Bernoulli(w1 * d1.p + w2 * d2.p) +end fit(::Bernoulli, x) = Bernoulli(mean(x)) distance(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index edbe89b..9ae5453 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -9,7 +9,10 @@ mutable struct GreedyParams progress_bar::Bool end -GreedyParams() = GreedyParams(100_000, RandomGroupSwap(), Strict(), PreviousBestValue(10_000), true) +function GreedyParams() + GreedyParams( + 100_000, RandomGroupSwap(), Strict(), PreviousBestValue(10_000), true) +end function greedy_optimize(g, initial_labels, params::GreedyParams) @debug "making assignment" @@ -19,17 +22,19 @@ function greedy_optimize(g, initial_labels, params::GreedyParams) return a end - function greedy_improve!(a::Assignment; params = GreedyParams()) # allocate memory for swap swap = make_swap(a, (1, 1)) # display progress bar - p = ProgressUnknown(enabled = params.progress_bar, showspeed = true, desc = "Greedy search: ") + p = ProgressUnknown(enabled = params.progress_bar, + showspeed = true, desc = "Greedy search: ") - for i in 1:params.max_iter + for i in 1:(params.max_iter) local_search!(a, swap, params) - next!(p; showvalues = [("ll: ", loglikelihood(a)), info_to_print(params.stop_rule)]) + next!(p; + showvalues = [ + ("ll: ", loglikelihood(a)), info_to_print(params.stop_rule)]) if stopping_rule(a, params.stop_rule) if i < 10 @warn "Greedy search stopped early after $(i) iterations" diff --git 
a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 75b8a03..4a00a27 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -1,21 +1,25 @@ -mutable struct WorkspaceSwap{D,F} +mutable struct WorkspaceSwap{D, F} θ::SymArray{D} log_likelihood_per_group::SymArray{F} end +function make_workspace(a::Assignment) + return WorkspaceSwap(deepcopy(a.θ), deepcopy(a.log_likelihood)) +end + mutable struct Swap{W} u::Int v::Int workspace::W end - function make_swap(a::Assignment, id) - return Swap(id[1], id[2], WorkspaceSwap(deepcopy(a.θ), deepcopy(a.log_likelihood))) + return Swap(id[1], id[2], make_workspace(a)) end function make_swap!(swap::Swap, a::Assignment, id) swap.u, swap.v = id + swap.workspace.θ = deepcopy(a.θ) swap.workspace.log_likelihood_per_group = deepcopy(a.log_likelihood) end @@ -35,17 +39,17 @@ end # for reference and testing function _slow_swap!(a::Assignment, s::Swap) swap_node_labels!(a, s.u, s.v) - a.θ, a.log_likelihood = _compute_theta_and_ll(a.node_labels, a.dists, a.edges, a.θ[1,1]) + a.θ, a.log_likelihood = _compute_theta_and_ll( + a.node_labels, a.dists, a.edges, a.θ[1, 1]) end # apply_swap!(a::Assignment, s::Swap) = _slow_swap!(a, s) - function apply_swap!(a::Assignment, s::Swap) - u,v = s.u, s.v + u, v = s.u, s.v gu = a.node_labels[u] gv = a.node_labels[v] - groups_concerned = Set{Tuple{Int,Int}}([minmax(gu, gv)]) + groups_concerned = Set{Tuple{Int, Int}}([minmax(gu, gv)]) for (node, d) in iterate_neighbors(a.dists, u) if node == v @@ -54,8 +58,8 @@ function apply_swap!(a::Assignment, s::Swap) g1 = a.node_labels[node] a.θ[gv, g1] = add_to(a.θ[gv, g1], d) a.θ[gu, g1] = remove_from(a.θ[gu, g1], d) - push!(groups_concerned, minmax(gu,g1)) - push!(groups_concerned, minmax(gv,g1)) + push!(groups_concerned, minmax(gu, g1)) + push!(groups_concerned, minmax(gv, g1)) end for (index, (node, d)) in enumerate(iterate_neighbors(a.dists, v)) @@ -65,8 +69,8 @@ function apply_swap!(a::Assignment, 
s::Swap) g2 = a.node_labels[node] a.θ[gu, g2] = add_to(a.θ[gu, g2], d) a.θ[gv, g2] = remove_from(a.θ[gv, g2], d) - push!(groups_concerned, minmax(gv,g2)) - push!(groups_concerned, minmax(gu,g2)) + push!(groups_concerned, minmax(gv, g2)) + push!(groups_concerned, minmax(gu, g2)) end swap_node_labels!(a, u, v) From 6282e30b66491ca90b6537c87ec34bfc2cffa4dc Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 24 Sep 2025 13:15:50 +0200 Subject: [PATCH 175/266] add fast categorical --- src/distributions/distributions_type.jl | 4 + src/optimization/greedy.jl | 1 + src/optimization/swap_categorical.jl | 152 ++++++++++++++++++++---- src/optimization/swap_workspace.jl | 17 ++- src/utils/SymArray.jl | 103 ++++++++-------- test/runtests.jl | 2 +- test/test_cat_case.jl | 60 ++++++++++ 7 files changed, 264 insertions(+), 75 deletions(-) create mode 100644 test/test_cat_case.jl diff --git a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl index d710653..93968ac 100644 --- a/src/distributions/distributions_type.jl +++ b/src/distributions/distributions_type.jl @@ -7,6 +7,10 @@ struct Dist{D} end end +function Base.show(io::IO, d::Dist) + print(io, "$(d.dist)") +end + Dist(d) = Dist(d, 1) zero(d::Dist) = Dist(zero(d.dist), 0) diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index 9ae5453..171b7a2 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -1,4 +1,5 @@ include("swap_workspace.jl") +include("swap_categorical.jl") include("config_rules/include.jl") mutable struct GreedyParams diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl index 2783900..b53acca 100644 --- a/src/optimization/swap_categorical.jl +++ b/src/optimization/swap_categorical.jl @@ -1,7 +1,7 @@ mutable struct WorkspaceDiscreteSwap{ - D, C <: AbstractMatrix, R <: AbstractMatrix, - R2 <: AbstractMatrix, S <: AbstractMatrix{D}, - L <: AbstractMatrix} + D, C <: SymArray, R <: SymArray, + R2 <: SymArray, S <: 
SymArray{D}, + L <: SymArray} θ::S log_likelihood_per_group::L counts::C @@ -16,14 +16,38 @@ struct Cat{M, V <: AbstractVector{<:Real}} end end +function Base.show(io::IO, c::Cat) + print(io, "Cat($(c.p))") +end + num_categories(::Type{Cat{Val{M}, V}}) where {M, V} = M num_categories(::Cat{Val{M}, V}) where {M, V} = M zero(c::Cat{Val{M}, V}) where {M, V} = Cat(ones(eltype(V), M)) distance(c1::Cat{M, V}, c2::Cat{M, V}) where {M, V} = sum(abs.(c1.p .- c2.p)) eltype(::Cat{M, V}) where {M, V} = Int -params(c::Cat) = (c.p,) +params(c::Cat{M, V}) where {M, V} = (c.p,) logpdf(c::Cat, x::Int) = log(c.p[x]) +function fit(::Cat{Val{M}, V}, x::AbstractVector{Int}) where {M, V} + p_est = zeros(eltype(V), M) + for xi in x + p_est[xi] += 1 + end + return Cat{Val{M}, V}(p_est ./ length(x)) +end +function sample(c::Cat{Val{M}, V}) where {M, V} + return findfirst(x -> x >= rand(), cumsum(c.p)) +end + +function fit(::Cat{Val{M}, V}, x::Int) where {M, V} + p_est = zeros(eltype(V), M) + p_est[x] = 1.0 + return Cat(p_est) +end + +function set_params!(c::Dist{Cat{M, V}}, p::V) where {M, V} + set_params!(c.dist, p) +end function set_params!(c::Cat{M, V}, p::V) where {M, V} c.p .= p end @@ -31,17 +55,26 @@ end function Assignment( node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D <: Cat} + n_groups = length(unique(node_labels)) + n_nodes = length(node_labels) dists = fit(dist, edge_list) - realized = Matrix{Vector{Int}}(undef, n_groups, n_groups) - counts = Matrix{Int}(undef, n_groups, n_groups) - estimated = Matrix{Vector{Float64}}(undef, n_groups, n_groups) - fill!(realized, zeros(Int, num_categories(unwrap(dist)))) - fill!(counts, 0) + realized = SymArray(n_groups, zeros(Float64, num_categories(unwrap(dist)))) + estimated = SymArray( + n_groups, zeros(Float64, num_categories(unwrap(dist)))) + counts = SymArray(n_groups, 0) + # realized = Matrix{Vector{Int}}(undef, n_groups, n_groups) + # counts = Matrix{Int}(undef, n_groups, n_groups) + # estimated = 
Matrix{Vector{Float64}}(undef, n_groups, n_groups) + # for index in eachindex(realized) + # realized[index] = copy(zeros(Int, num_categories(unwrap(dist)))) + # estimated[index] = copy(zeros(Float64, num_categories(unwrap(dist)))) + # end + # fill!(counts, 0) for u in 1:n_nodes g1 = node_labels[u] for (v, e) in iterate_neighbors(edge_list, u) g2 = node_labels[v] - if u < v + if v < u counts[minmax(g1, g2)...] += 1 realized[minmax(g1, g2)...][e] += 1 else @@ -49,12 +82,12 @@ function Assignment( end end end - for g2 in 1:n_groups - for g1 in g2:n_groups - estimated[g1, g2] = (counts[g1, g2] == 0) ? - zeros(Float64, num_categories(unwrap(dist))) : - (realized[g1, g2]) ./ counts[g1, g2] - end + + for g2 in 1:n_groups, g1 in g2:n_groups + counts[g1, g2] = counts[minmax(g1, g2)...] + realized[g1, g2] = realized[minmax(g1, g2)...] + _fast_normalization!( + estimated[g1, g2], realized[g1, g2], counts[g1, g2]) end θ = SymArray(n_groups, zero(dist)) @@ -70,10 +103,7 @@ function Assignment( end end end - w = WorkspaceDiscreteSwap{Dist{D}, - Matrix{Int}, Matrix{Float64}, Matrix{Float64}, - SymArray{D}, SymArray{Float64}}( - deepcopy(θ), deepcopy(log_likelihood_per_group), + w = WorkspaceDiscreteSwap(deepcopy(θ), deepcopy(log_likelihood_per_group), counts, deepcopy(realized), deepcopy(estimated)) return Assignment( node_labels, edge_list, dists, θ, log_likelihood_per_group, w) @@ -84,8 +114,86 @@ function make_workspace(a::Assignment{E, Dist{D}, return deepcopy(a.additional_workspace) end -function make_swap!(ws::WorkspaceDiscreteSwap, - a) where {E, F, D <: Categorical} +function make_swap_workspace!(ws::WorkspaceDiscreteSwap, a::Assignment) ws.θ = deepcopy(a.θ) ws.log_likelihood_per_group = deepcopy(a.log_likelihood) + ws.realized = deepcopy(a.additional_workspace.realized) + ws.estimated = deepcopy(a.additional_workspace.estimated) +end + +function revert_swap_workspace!(a::Assignment, ws::WorkspaceDiscreteSwap) + a.θ = deepcopy(ws.θ) + a.log_likelihood = 
deepcopy(ws.log_likelihood_per_group) + as = a.additional_workspace + as.θ = deepcopy(ws.θ) + as.log_likelihood_per_group = deepcopy(ws.log_likelihood_per_group) + as.realized = deepcopy(ws.realized) + as.estimated = deepcopy(ws.estimated) +end + +function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) + u, v = s.u, s.v + n_groups = number_groups(as) + gu = as.node_labels[u] + gv = as.node_labels[v] + for (node, e) in iterate_neighbors(as.edges, u) + if node == v + continue + end + g_inter = as.node_labels[node] + as.additional_workspace.counts[minmax(gu, g_inter)...] -= 1 + as.additional_workspace.realized[minmax(gu, g_inter)...][e] -= 1 + as.additional_workspace.counts[minmax(gv, g_inter)...] += 1 + as.additional_workspace.realized[minmax(gv, g_inter)...][e] += 1 + end + for (node, e) in iterate_neighbors(as.edges, v) + if node == u + continue + end + g_inter = as.node_labels[node] + as.additional_workspace.counts[minmax(gv, g_inter)...] -= 1 + as.additional_workspace.realized[minmax(gv, g_inter)...][e] -= 1 + as.additional_workspace.counts[minmax(gu, g_inter)...] 
+= 1 + as.additional_workspace.realized[minmax(gu, g_inter)...][e] += 1 + end + _fast_normalization!.(as.additional_workspace.estimated, + as.additional_workspace.realized, as.additional_workspace.counts) + swap_node_labels!(as, u, v) + + for g2 in 1:n_groups + for g1 in g2:n_groups + set_params!(as.additional_workspace.θ[g1, g2], + as.additional_workspace.estimated[g1, g2]) + as.additional_workspace.log_likelihood_per_group[g1, g2] = _fast_ll( + as.additional_workspace.estimated[g1, g2], as.additional_workspace.realized[ + g1, g2], + as.additional_workspace.counts[g1, g2]) + end + end + + as.θ = deepcopy(as.additional_workspace.θ) + as.log_likelihood = deepcopy(as.additional_workspace.log_likelihood_per_group) +end + +function _fast_normalization!(p::AbstractVector, r::AbstractVector, c::Real) + if c > 0 + @inbounds for m in eachindex(p) + p[m] = r[m] / c + end + else + fill!(p, 0.0) + end +end + +function _fast_ll( + p::AbstractVector, r::AbstractVector, c::Real) + ll = zero(eltype(p)) + if c > 0 + @inbounds for m in eachindex(p) + if r[m] > 0 + ll += r[m] * log(p[m]) + end + end + end + return ll end diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 4a00a27..149a362 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -13,23 +13,30 @@ mutable struct Swap{W} workspace::W end +function make_swap_workspace!(ws, a::Assignment) + ws.θ = deepcopy(a.θ) + ws.log_likelihood_per_group = deepcopy(a.log_likelihood) +end + +function revert_swap_workspace!(a::Assignment, ws) + a.θ = deepcopy(ws.θ) + a.log_likelihood = deepcopy(ws.log_likelihood_per_group) +end + function make_swap(a::Assignment, id) return Swap(id[1], id[2], make_workspace(a)) end function make_swap!(swap::Swap, a::Assignment, id) swap.u, swap.v = id - - swap.workspace.θ = deepcopy(a.θ) - swap.workspace.log_likelihood_per_group = deepcopy(a.log_likelihood) + make_swap_workspace!(swap.workspace, a) end function 
revert_swap!(assignment::Assignment, swap::Swap) # swap labels back to original swap_node_labels!(assignment, swap.u, swap.v) # restore saved θ and log likelihoods - assignment.θ = deepcopy(swap.workspace.θ) - assignment.log_likelihood = deepcopy(swap.workspace.log_likelihood_per_group) + revert_swap_workspace!(assignment, swap.workspace) end function swap_node_labels!(a::Assignment, i, j) diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index e594840..64b2f8d 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -1,66 +1,75 @@ module FastSymArray - import Base: eltype, convert - export SymArray, eltype +import Base: eltype, convert +export SymArray, eltype - mutable struct SymArray{F} <: AbstractArray{F, 2} - d::Dict{Tuple{Int, Int}, F} - k::Int - end +mutable struct SymArray{F} <: AbstractArray{F, 2} + d::Dict{Tuple{Int, Int}, F} + k::Int +end - function SymArray(k::T, d::F) where {F, T<:Real} - @assert k > 0 - return SymArray{F}(Dict{Tuple{Int, Int}, F}(minmax(i, j) => d for i in 1:k - for j in i:k), k) - end +function SymArray(k::T, d::F) where {F, T <: Real} + @assert k > 0 + return SymArray{F}( + Dict{Tuple{Int, Int}, F}(minmax(i, j) => deepcopy(d) for i in 1:k + for j in i:k), + k) +end - function Base.size(a::SymArray) - return (a.k, a.k) - end +function SymArray(k::T, d::AbstractArray) where {T <: Real} + @assert k > 0 + return SymArray{typeof(d)}( + Dict{Tuple{Int, Int}, typeof(d)}(minmax(i, j) => deepcopy(d) + for i in 1:k + for j in i:k), + k) +end - Base.@propagate_inbounds function Base.getindex(a::SymArray, i, j) - @boundscheck checkbounds(a, i, j) - return a.d[minmax(i, j)] - end +function Base.size(a::SymArray) + return (a.k, a.k) +end - Base.@propagate_inbounds function Base.setindex!(a::SymArray, v, i, j) - @boundscheck checkbounds(a, i, j) - a.d[minmax(i, j)] = v - end +Base.@propagate_inbounds function Base.getindex(a::SymArray, i, j) + @boundscheck checkbounds(a, i, j) + return a.d[minmax(i, j)] +end 
+Base.@propagate_inbounds function Base.setindex!(a::SymArray, v, i, j) + @boundscheck checkbounds(a, i, j) + a.d[minmax(i, j)] = v +end - function sum_tri_with_diag(a::SymArray) - return sum(values(a.d)) - end +function sum_tri_with_diag(a::SymArray) + return sum(values(a.d)) +end - function eltype(a::SymArray{F}) where {F} - return F - end +function eltype(a::SymArray{F}) where {F} + return F +end - function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} - @assert size(a, 1) == size(a, 2) - k = size(a, 1) - res = SymArray(k, a[1,1]) - for j in axes(a,2) - for i in axes(a,1) - if i <= j - res[i,j] = a[i,j] - end +function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} + @assert size(a, 1) == size(a, 2) + k = size(a, 1) + res = SymArray(k, a[1, 1]) + for j in axes(a, 2) + for i in axes(a, 1) + if i <= j + res[i, j] = a[i, j] end end - return res end + return res +end - - function convert(::Type{AbstractMatrix{F}}, a::SymArray{F}) where {F} - k = a.k - m = zeros(F, k, k) - for i in 1:k - for j in i:k - m[i, j] = a[i, j] - end +function convert(::Type{AbstractMatrix{F}}, a::SymArray{F}) where {F} + k = a.k + m = zeros(F, k, k) + for i in 1:k + for j in i:k + m[i, j] = a[i, j] end - return m end + return m +end end diff --git a/test/runtests.jl b/test/runtests.jl index d9d6d64..2e9702f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,9 +2,9 @@ using Test using LinearAlgebra, SparseArrays using NetworkHistogram @testset "Tests" begin - include("test_data_format.jl") include("test_distributions_type.jl") include("test_swap_workspace.jl") + include("test_cat_case.jl") include("test_get_edges_in_groups.jl") end diff --git a/test/test_cat_case.jl b/test/test_cat_case.jl new file mode 100644 index 0000000..c8607c5 --- /dev/null +++ b/test/test_cat_case.jl @@ -0,0 +1,60 @@ +using Test +using NetworkHistogram +using StatsBase +using Random +using Distributions + +@testset "Swap workspace likelihood update (Categorical)" begin + 
Random.seed!(42) + n = 10 + k = 2 + m = 3 + d_mine = NetworkHistogram.Cat(fill(1 / m, m)) + # Create a block model with two groups + sbm = NetworkHistogram.BlockModel(k, d_mine) + sbm[1, 1] = NetworkHistogram.Cat([0.7, 0.2, 0.1]) + sbm[2, 2] = NetworkHistogram.Cat([0.1, 0.3, 0.6]) + sbm[1, 2] = NetworkHistogram.Cat([0.3, 0.4, 0.3]) + + labels = StatsBase.inverse_rle(1:k, fill(n ÷ k, k)) + A = NetworkHistogram.sample(sbm, labels) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d_mine)) + + for ind in eachindex(assignment.additional_workspace.counts) + @test assignment.additional_workspace.counts[ind] == + sum(assignment.additional_workspace.realized[ind]) + end + + ll_original = NetworkHistogram.loglikelihood(assignment) + + # Swap two nodes from different groups + indices = (1, n) + swap = NetworkHistogram.make_swap(assignment, indices) + true_swapped = deepcopy(labels) + true_swapped[1] = labels[n] + true_swapped[n] = labels[1] + NetworkHistogram.apply_swap!(assignment, swap) + + nodes_label_swapped = deepcopy(assignment.node_labels) + new_a = NetworkHistogram.Assignment( + nodes_label_swapped, edgelist, NetworkHistogram.Dist(d_mine)) + ll_new_a = NetworkHistogram.loglikelihood(new_a) + ll_after_swap = NetworkHistogram.loglikelihood(assignment) + ws_new = new_a.additional_workspace + ws_old = assignment.additional_workspace + @test ws_new.counts == ws_old.counts + @test ws_new.realized == ws_old.realized + @test ws_new.estimated == ws_old.estimated + + @test new_a.node_labels == assignment.node_labels + @test new_a.node_labels == true_swapped + @test new_a.log_likelihood == assignment.log_likelihood + @test isapprox(ll_after_swap, ll_new_a; atol = 1e-10) + + # Revert the swap + NetworkHistogram.revert_swap!(assignment, swap) + ll_after_revert = NetworkHistogram.loglikelihood(assignment) + @test isapprox(ll_after_revert, ll_original; atol = 1e-10) +end From 
348bc230ac1330e1a9600a42e6b3b642cf817fb3 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 24 Sep 2025 13:49:26 +0200 Subject: [PATCH 176/266] remove unused dependencies --- Project.toml | 32 +++++--------------------------- src/NetworkHistogram.jl | 2 -- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/Project.toml b/Project.toml index 8c719d2..7b12f4f 100644 --- a/Project.toml +++ b/Project.toml @@ -4,39 +4,17 @@ authors = ["Charles Dufour", "Jake Grainger"] version = "0.5.2" [deps] -ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" -CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" -CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" -CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" -Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" -Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" -DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" -Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" -Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" -IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" -KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" -OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" -PermutationSymmetricTensors = "22e17884-8c1a-4ea8-8b39-5974e24a9d31" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SimpleWeightedGraphs = "47aef6b3-ad0c-573a-a1e2-d07658019622" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -[compat] -BenchmarkTools = "1.6.0" -CairoMakie = "0.13.4" -Clustering = "0.15.7" -KrylovKit = "0.9.5" 
-OhMyThreads = "0.8.3" -julia = "1.11" +[weakdeps] +DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" + +[extensions] +Discretize = "DiscretizeDistributions" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 018519f..4fa3afb 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -4,8 +4,6 @@ using StaticArrays using ProgressMeter import StatsAPI: loglikelihood, fit, params import Base: convert, eltype, zero -using OhMyThreads: TaskLocalValue, @tasks, @local -using Clustering include("utils/include.jl") using .FastSymArray From 0339b88bb158b8c5d8c35b625f4824d49aae09da Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 24 Sep 2025 13:52:11 +0200 Subject: [PATCH 177/266] format and make ready for extension on discretizeDist --- .JuliaFormatter.toml | 4 +++- Project.toml | 2 +- ext/DiscretizeExt.jl | 7 ++++++ ext/DistributionsExt.jl | 8 +++---- src/block_model.jl | 23 +++++++----------- src/distributions/zero_inflated.jl | 10 ++------ src/optimization/config_rules/InitRule.jl | 11 +++------ .../config_rules/bandwidth_selection_rule.jl | 2 -- src/optimization/config_rules/stop_rule.jl | 6 ++--- src/optimization/swap_categorical.jl | 3 ++- src/optimization/swap_workspace.jl | 3 ++- test/test_data_format.jl | 6 ++--- test/test_swap_workspace.jl | 24 +++++++++---------- 13 files changed, 49 insertions(+), 60 deletions(-) create mode 100644 ext/DiscretizeExt.jl diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml index 90ebcb4..d42d738 100644 --- a/.JuliaFormatter.toml +++ b/.JuliaFormatter.toml @@ -1,2 +1,4 @@ +indent = 4 +margin = 92 +normalize_line_endings = "unix" style = "sciml" -margin = 79 diff --git a/Project.toml b/Project.toml index 7b12f4f..a9d66df 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" [extensions] -Discretize = 
"DiscretizeDistributions" +DiscretizeExt = "DiscretizeDistributions" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/ext/DiscretizeExt.jl b/ext/DiscretizeExt.jl new file mode 100644 index 0000000..46d3d8d --- /dev/null +++ b/ext/DiscretizeExt.jl @@ -0,0 +1,7 @@ +module DiscretizeExt + +using NetworkHistogram +using StatsBase +using DiscreteDistributions + +end diff --git a/ext/DistributionsExt.jl b/ext/DistributionsExt.jl index 13fba05..32cde0c 100644 --- a/ext/DistributionsExt.jl +++ b/ext/DistributionsExt.jl @@ -1,8 +1,8 @@ module DistributionsExt - using NetworkHistogram - using StatsBase - import NetworkHistogram: fast_ll_update! - import Distributions: logpdf +using NetworkHistogram +using StatsBase +import NetworkHistogram: fast_ll_update! +import Distributions: logpdf end diff --git a/src/block_model.jl b/src/block_model.jl index 48dce3d..65db31b 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -1,7 +1,7 @@ struct BlockModel{D, K, T} _dists::SymArray{D} sizes::SVector{K, T} - cum_sizes::SVector{K,T} + cum_sizes::SVector{K, T} end function BlockModel(k::Int, d::D) where {D} @@ -11,7 +11,6 @@ function BlockModel(k::Int, d::D) where {D} return BlockModel{D, k, Float64}(_dists, sizes, cumulative_sizes) end - function BlockModel(a::Assignment) k = length(unique(a.node_labels)) sizes = SVector{k}(proportions(a)) @@ -20,7 +19,6 @@ function BlockModel(a::Assignment) return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(_dists, sizes, cumulative_sizes) end - function BlockModel(nodes_labels, θ) k = length(unique(nodes_labels)) sizes = SVector{k}(counts(nodes_labels) / length(nodes_labels)) @@ -29,8 +27,7 @@ function BlockModel(nodes_labels, θ) return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(_dists, sizes, cumulative_sizes) end - -function map_ξ_to_block(bm::BlockModel, ξ::T) where {T<:Real} +function map_ξ_to_block(bm::BlockModel, ξ::T) where {T <: Real} return findfirst(x -> x >= ξ, bm.cum_sizes) end @@ -39,28 
+36,26 @@ function sample(bm::BlockModel, latents::Int, args...) return latents, sample(bm, latents, args...) end - function sample(bm::BlockModel, latents::Vector{T}, args...) where {T} - A = Array{eltype(bm[1,1]), 2}(undef, length(latents), length(latents)) + A = Array{eltype(bm[1, 1]), 2}(undef, length(latents), length(latents)) for j in 1:length(latents) - for i in 1:j-1 + for i in 1:(j - 1) A[i, j] = A[j, i] end - for i in j+1:length(latents) + for i in (j + 1):length(latents) # println("latents[i]: ", latents[i], " latents[j]: ", latents[j]) # println("bm[latents[i], latents[j]]: ", bm[latents[i], latents[j]]) - A[i, j] = sample(bm[latents[i], latents[j]], args...) - A[j, i] = A[i, j] + A[i, j] = sample(bm[latents[i], latents[j]], args...) + A[j, i] = A[i, j] end end # fill the diagonal with zeros, avoid undefined references for i in 1:length(latents) - A[i, i] = zero(A[1,2]) + A[i, i] = zero(A[1, 2]) end return A end - # this is probably awfull function Base.getindex(s::BlockModel, i::Int, j::Int) @@ -75,7 +70,6 @@ function Base.size(s::BlockModel) return (s._dists.k, s._dists.k) end - function Base.getindex(s::BlockModel, i::Real, j::Real) k = findfirst(x -> x ≥ i, s.cum_sizes) l = findfirst(x -> x ≥ j, s.cum_sizes) @@ -88,7 +82,6 @@ function Base.setindex!(s::BlockModel, v, i::Real, j::Real) s._dists[k, l] = v end - # helpers for generating ordered latents function ordered_latents(bm::BlockModel, n::Int) return sort(map(x -> map_ξ_to_block(bm, x), rand(n))) diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 1bfbd7b..56801c5 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -9,12 +9,10 @@ struct SampleZI{F} iszero::Bool end - function ZeroInflated(dist) return ZeroInflated(dist, 0.0) end - function logpdf(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} if x.iszero return log(zi.proba_zero) @@ -44,7 +42,6 @@ zero(zi::ZeroInflated) = ZeroInflated(zero(zi.dist), 0.0) 
eltype(zi::ZeroInflated{D, F}) where {D, F} = SampleZI{eltype(D)} params(zi::ZeroInflated{D, F}) where {D, F} = (params(zi.dist)..., zi.proba_zero) - function fit(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} if x.iszero return ZeroInflated(zero(zi.dist), 1.0) @@ -54,16 +51,14 @@ function fit(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} end function _fast_compressed_obs(zi::ZeroInflated, x, filter = iszero) - return SampleZI(_fast_compressed_obs(zi.dist,x), filter(x)) + return SampleZI(_fast_compressed_obs(zi.dist, x), filter(x)) end - -function unwrap(d::Dist{ZeroInflated{B,D}}) where {B,D} +function unwrap(d::Dist{ZeroInflated{B, D}}) where {B, D} #yeah I know again... return d.dist.dist end - function get_proportion_observed(d::Dist{ZeroInflated{B, D}}) where {B, D} return (1-d.dist.proba_zero) * d.counts end @@ -72,7 +67,6 @@ function get_proportion_observed(d::Dist) return d.counts end - # function fit(zd::ZeroInflated, x::SampleZI) # if x.iszero # return ZeroInflated(zero(zd.dist), 1.0) diff --git a/src/optimization/config_rules/InitRule.jl b/src/optimization/config_rules/InitRule.jl index 785b1d3..99c1615 100644 --- a/src/optimization/config_rules/InitRule.jl +++ b/src/optimization/config_rules/InitRule.jl @@ -7,7 +7,6 @@ struct FromAssignment{A} <: StartingAssignment assignment::A end - struct FromNodeLabels{L} <: StartingAssignment node_labels::L end @@ -17,7 +16,6 @@ struct InitRule{S <: StartingAssignment, I} assignment_rule::I end - # check that this is necessary! 
function make_assignment(g, h, init_rule::InitRule{S, Nothing}) where {S} return Assignment(initialize_node_labels( @@ -40,27 +38,24 @@ initialize_node_labels function initialize_node_labels(g, h, ::OrderedStart) group_size = GroupSize(number_nodes(g), h) node_labels = StatsBase.inverse_rle(1:length(group_size), group_size) - return node_labels + return node_labels end function initialize_node_labels(g, h, ::RandomStart) group_size, node_labels = initialize_node_labels(g, h, OrderedStart()) Random.shuffle!(node_labels) - return node_labels + return node_labels end - -function initialise_node_labels(g,h, init_rule::FromAssignment{A}) where {A <: Assignment} +function initialise_node_labels(g, h, init_rule::FromAssignment{A}) where {A <: Assignment} return initialise_node_labels(g, h, FromNodeLabels(init_rule.assignment.node_labels)) end - function initialise_node_labels(g, h, init_rule::FromNodeLabels{L}) where {L} @assert number_nodes(g) == length(init_rule.node_labels) return deepcopy(init_rule.node_labels) end - function number_nodes(g::AbstractMatrix) return size(g, 1) end diff --git a/src/optimization/config_rules/bandwidth_selection_rule.jl b/src/optimization/config_rules/bandwidth_selection_rule.jl index 0606c11..d922a96 100644 --- a/src/optimization/config_rules/bandwidth_selection_rule.jl +++ b/src/optimization/config_rules/bandwidth_selection_rule.jl @@ -7,8 +7,6 @@ struct OracleH <: KSelectionRule H::Int end - - """ select_number_node_per_block(g::Observations, rule::KSelectionRule) diff --git a/src/optimization/config_rules/stop_rule.jl b/src/optimization/config_rules/stop_rule.jl index 1ead4f8..8f8ec0b 100644 --- a/src/optimization/config_rules/stop_rule.jl +++ b/src/optimization/config_rules/stop_rule.jl @@ -1,6 +1,5 @@ abstract type StopRule end - function info_to_print(::StopRule) return nothing end @@ -51,5 +50,6 @@ function stopping_rule(assignment::Assignment, stop_rule::PreviousBestValue) return stop_rule.iterations_since_best >= stop_rule.k end 
- -info_to_print(stop_rule::PreviousBestValue) = ("stalled iter: ", stop_rule.iterations_since_best) +function info_to_print(stop_rule::PreviousBestValue) + ("stalled iter: ", stop_rule.iterations_since_best) +end diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl index b53acca..051e997 100644 --- a/src/optimization/swap_categorical.jl +++ b/src/optimization/swap_categorical.jl @@ -84,6 +84,7 @@ function Assignment( end for g2 in 1:n_groups, g1 in g2:n_groups + counts[g1, g2] = counts[minmax(g1, g2)...] realized[g1, g2] = realized[minmax(g1, g2)...] _fast_normalization!( @@ -166,7 +167,7 @@ function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) as.additional_workspace.estimated[g1, g2]) as.additional_workspace.log_likelihood_per_group[g1, g2] = _fast_ll( as.additional_workspace.estimated[g1, g2], as.additional_workspace.realized[ - g1, g2], + g1, g2], as.additional_workspace.counts[g1, g2]) end end diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 149a362..653fd00 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -46,7 +46,8 @@ end # for reference and testing function _slow_swap!(a::Assignment, s::Swap) swap_node_labels!(a, s.u, s.v) - a.θ, a.log_likelihood = _compute_theta_and_ll( + a.θ, + a.log_likelihood = _compute_theta_and_ll( a.node_labels, a.dists, a.edges, a.θ[1, 1]) end diff --git a/test/test_data_format.jl b/test/test_data_format.jl index dbc19e2..fd18a4c 100644 --- a/test/test_data_format.jl +++ b/test/test_data_format.jl @@ -1,7 +1,7 @@ @testset "Edge list tests" begin using Random Random.seed!(1234) - A = Symmetric(sprand(20,20,0.5)) + A = Symmetric(sprand(20, 20, 0.5)) edgelist = EdgeList(A) for j in 1:20 @@ -9,7 +9,7 @@ for i in 1:20 if i != j @test i in nv_j - @test A[i,j] == val_j[findfirst(x -> x == i, nv_j)] + @test A[i, j] == val_j[findfirst(x -> x == i, nv_j)] else @test i ∉ nv_j end @@ -17,5 +17,5 @@ end 
@test NetworkHistogram.edge_type(edgelist) == eltype(A) - @test nodes(edgelist) == size(A,1) + @test nodes(edgelist) == size(A, 1) end diff --git a/test/test_swap_workspace.jl b/test/test_swap_workspace.jl index d438874..6401e49 100644 --- a/test/test_swap_workspace.jl +++ b/test/test_swap_workspace.jl @@ -3,7 +3,6 @@ using NetworkHistogram using StatsBase using Random - function manual_loglikelihood(A, node_labels, θ) n = size(A, 1) k = size(θ, 1) @@ -13,7 +12,7 @@ function manual_loglikelihood(A, node_labels, θ) if i!=j g1 = node_labels[i] g2 = node_labels[j] - ll += NetworkHistogram.logpdf(θ[g1,g2], A[i,j]) + ll += NetworkHistogram.logpdf(θ[g1, g2], A[i, j]) end end end @@ -23,10 +22,9 @@ end function slow_swap(a::NetworkHistogram.Assignment, s::NetworkHistogram.Swap) labels = deepcopy(a.node_labels) labels[s.u], labels[s.v] = labels[s.v], labels[s.u] - return NetworkHistogram.Assignment(labels, a.edges, a.θ[1,1]) + return NetworkHistogram.Assignment(labels, a.edges, a.θ[1, 1]) end - @testset "Swap workspace likelihood update (Bernoulli)" begin Random.seed!(42) n = 6 @@ -35,9 +33,9 @@ end d = NetworkHistogram.Bernoulli(0.5) # Create a block model with two groups sbm = NetworkHistogram.BlockModel(k, d) - sbm[1,1] = NetworkHistogram.Bernoulli(p1) - sbm[2,2] = NetworkHistogram.Bernoulli(p2) - sbm[1,2] = NetworkHistogram.Bernoulli(0.1) + sbm[1, 1] = NetworkHistogram.Bernoulli(p1) + sbm[2, 2] = NetworkHistogram.Bernoulli(p2) + sbm[1, 2] = NetworkHistogram.Bernoulli(0.1) labels = StatsBase.inverse_rle(1:k, fill(n÷k, k)) A = NetworkHistogram.sample(sbm, labels) @@ -46,23 +44,23 @@ end ll_original = NetworkHistogram.loglikelihood(assignment) ll_manual = manual_loglikelihood(A, assignment.node_labels, assignment.θ) - @test isapprox(ll_original, ll_manual; atol=1e-10) + @test isapprox(ll_original, ll_manual; atol = 1e-10) # Swap two nodes from different groups indices = (1, n) swap = NetworkHistogram.make_swap(assignment, indices) slow_swapped = slow_swap(assignment, 
swap) NetworkHistogram.apply_swap!(assignment, swap) - ll_after_swap = NetworkHistogram.loglikelihood(assignment) + ll_after_swap = NetworkHistogram.loglikelihood(assignment) ll_slow_swap = NetworkHistogram.loglikelihood(slow_swapped) ll_manual_after_swap = manual_loglikelihood(A, assignment.node_labels, assignment.θ) - @test isapprox(ll_after_swap, ll_manual_after_swap; atol=1e-10) - @test isapprox(ll_after_swap, ll_slow_swap; atol=1e-10) + @test isapprox(ll_after_swap, ll_manual_after_swap; atol = 1e-10) + @test isapprox(ll_after_swap, ll_slow_swap; atol = 1e-10) # Revert the swap NetworkHistogram.revert_swap!(assignment, swap) ll_after_revert = NetworkHistogram.loglikelihood(assignment) ll_manual_after_revert = manual_loglikelihood(A, assignment.node_labels, assignment.θ) - @test isapprox(ll_after_revert, ll_manual_after_revert; atol=1e-10) - @test isapprox(ll_after_revert, ll_original; atol=1e-10) + @test isapprox(ll_after_revert, ll_manual_after_revert; atol = 1e-10) + @test isapprox(ll_after_revert, ll_original; atol = 1e-10) end From 2d51b5155e0de69591f4ab32061c2d067e6c2261 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 26 Sep 2025 16:44:46 +0200 Subject: [PATCH 178/266] extensions now work, need to code them --- Project.toml | 4 ++++ docs/Project.toml | 3 +++ ext/BootstrapExt/BootstrapExt.jl | 12 ++++++++++++ ext/DiscretizeExt.jl | 9 +++++++-- ext/DistExt.jl | 12 ++++++++++++ ext/DistributionsExt.jl | 8 -------- src/NetworkHistogram.jl | 6 ++++++ src/optimization/swap_categorical.jl | 15 +++------------ src/utils/SymArray.jl | 6 +++++- test/Project.toml | 2 ++ 10 files changed, 54 insertions(+), 23 deletions(-) create mode 100644 ext/BootstrapExt/BootstrapExt.jl create mode 100644 ext/DistExt.jl delete mode 100644 ext/DistributionsExt.jl diff --git a/Project.toml b/Project.toml index a9d66df..25be955 100644 --- a/Project.toml +++ b/Project.toml @@ -12,8 +12,12 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [weakdeps] DiscretizeDistributions = 
"1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" +Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" [extensions] +DistExt = "Distributions" +BootstrapExt = "Bootstrap" DiscretizeExt = "DiscretizeDistributions" [extras] diff --git a/docs/Project.toml b/docs/Project.toml index e24d6b4..c6d1322 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,4 +1,7 @@ [deps] +DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253" LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" diff --git a/ext/BootstrapExt/BootstrapExt.jl b/ext/BootstrapExt/BootstrapExt.jl new file mode 100644 index 0000000..6d3846e --- /dev/null +++ b/ext/BootstrapExt/BootstrapExt.jl @@ -0,0 +1,12 @@ +module BootstrapExt + +using NetworkHistogram +import NetworkHistogram: test_extension_boot + +using Bootstrap + +function test_extension_boot() + return "Bootstrap extension works!" +end + +end diff --git a/ext/DiscretizeExt.jl b/ext/DiscretizeExt.jl index 46d3d8d..68fe1a1 100644 --- a/ext/DiscretizeExt.jl +++ b/ext/DiscretizeExt.jl @@ -1,7 +1,12 @@ module DiscretizeExt using NetworkHistogram -using StatsBase -using DiscreteDistributions +import NetworkHistogram: test_extension_disc + +using DiscretizeDistributions + +function test_extension_disc() + return "Discretize extension works!" +end end diff --git a/ext/DistExt.jl b/ext/DistExt.jl new file mode 100644 index 0000000..4eb0d51 --- /dev/null +++ b/ext/DistExt.jl @@ -0,0 +1,12 @@ +module DistExt + +using NetworkHistogram +import NetworkHistogram: test_extension_dist + +using Distributions + +function test_extension_dist() + return "Distribution extension works!" 
+end + +end diff --git a/ext/DistributionsExt.jl b/ext/DistributionsExt.jl deleted file mode 100644 index 32cde0c..0000000 --- a/ext/DistributionsExt.jl +++ /dev/null @@ -1,8 +0,0 @@ -module DistributionsExt - -using NetworkHistogram -using StatsBase -import NetworkHistogram: fast_ll_update! -import Distributions: logpdf - -end diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 4fa3afb..9d3fdde 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -17,4 +17,10 @@ include("api.jl") export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf +function test_extension_dist end +function test_extension_boot end +function test_extension_disc end + +export test_extension_dist, test_extension_boot, test_extension_disc + end diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl index 051e997..c0c24f2 100644 --- a/src/optimization/swap_categorical.jl +++ b/src/optimization/swap_categorical.jl @@ -59,17 +59,9 @@ function Assignment( n_nodes = length(node_labels) dists = fit(dist, edge_list) realized = SymArray(n_groups, zeros(Float64, num_categories(unwrap(dist)))) - estimated = SymArray( - n_groups, zeros(Float64, num_categories(unwrap(dist)))) + estimated = SymArray(n_groups, zeros(Float64, num_categories(unwrap(dist)))) counts = SymArray(n_groups, 0) - # realized = Matrix{Vector{Int}}(undef, n_groups, n_groups) - # counts = Matrix{Int}(undef, n_groups, n_groups) - # estimated = Matrix{Vector{Float64}}(undef, n_groups, n_groups) - # for index in eachindex(realized) - # realized[index] = copy(zeros(Int, num_categories(unwrap(dist)))) - # estimated[index] = copy(zeros(Float64, num_categories(unwrap(dist)))) - # end - # fill!(counts, 0) + for u in 1:n_nodes g1 = node_labels[u] for (v, e) in iterate_neighbors(edge_list, u) @@ -84,7 +76,6 @@ function Assignment( end for g2 in 1:n_groups, g1 in g2:n_groups - counts[g1, g2] = counts[minmax(g1, g2)...] 
realized[g1, g2] = realized[minmax(g1, g2)...] _fast_normalization!( @@ -167,7 +158,7 @@ function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) as.additional_workspace.estimated[g1, g2]) as.additional_workspace.log_likelihood_per_group[g1, g2] = _fast_ll( as.additional_workspace.estimated[g1, g2], as.additional_workspace.realized[ - g1, g2], + g1, g2], as.additional_workspace.counts[g1, g2]) end end diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 64b2f8d..0cdc56f 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -43,10 +43,14 @@ function sum_tri_with_diag(a::SymArray) return sum(values(a.d)) end -function eltype(a::SymArray{F}) where {F} +function eltype(::SymArray{F}) where {F} return F end +# function zeros(::Type{SymArray{F}}, k::Int) where {F} +# return SymArray(k, zero(F)) +# end + function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} @assert size(a, 1) == size(a, 2) k = size(a, 1) diff --git a/test/Project.toml b/test/Project.toml index 3b6cd81..6be9544 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,7 @@ [deps] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" +DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" From 985e2896d818925501da0ba94ab0e5bb9b929113 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 2 Oct 2025 12:32:59 +0200 Subject: [PATCH 179/266] Generalize to Distributions.jl Switch Categorical to be StaticVector params, may need to check SizedVector instead for inplace modification. Need to benchmark. TODO: - abstract interface for any distributions first - specialise to internal fast cat if univariate - make internal cat have cemetary state in first pos always or optimize ZeroInflated - extension with discretize ? 
--- Project.toml | 12 +++- ext/{BootstrapExt => }/BootstrapExt.jl | 0 ext/DiscretizeExt.jl | 12 +++- ext/DistExt.jl | 12 ---- src/NetworkHistogram.jl | 1 + src/distributions/cat.jl | 46 +++++++++++++ src/distributions/include.jl | 1 + src/optimization/swap_categorical.jl | 93 ++++---------------------- test/test_cat_case.jl | 10 +-- 9 files changed, 86 insertions(+), 101 deletions(-) rename ext/{BootstrapExt => }/BootstrapExt.jl (100%) delete mode 100644 ext/DistExt.jl create mode 100644 src/distributions/cat.jl diff --git a/Project.toml b/Project.toml index 25be955..b8dc5e6 100644 --- a/Project.toml +++ b/Project.toml @@ -4,22 +4,28 @@ authors = ["Charles Dufour", "Jake Grainger"] version = "0.5.2" [deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" [weakdeps] -DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" -Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" [extensions] -DistExt = "Distributions" BootstrapExt = "Bootstrap" DiscretizeExt = "DiscretizeDistributions" +[compat] +BenchmarkTools = "1.6.0" +Distributions = "0.25.120" +StructArrays = "0.7.1" + [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/ext/BootstrapExt/BootstrapExt.jl b/ext/BootstrapExt.jl similarity index 100% rename from ext/BootstrapExt/BootstrapExt.jl rename to ext/BootstrapExt.jl diff --git a/ext/DiscretizeExt.jl b/ext/DiscretizeExt.jl index 68fe1a1..af85253 100644 --- a/ext/DiscretizeExt.jl +++ b/ext/DiscretizeExt.jl @@ -1,12 +1,20 @@ module DiscretizeExt using 
NetworkHistogram -import NetworkHistogram: test_extension_disc - +import NetworkHistogram: test_extension_disc, get_ref_dist, Dist, ZeroInflated +import Distributions: ContinuousUnivariateDistribution using DiscretizeDistributions +# in_interval.(x, support(discretized_dist)) function test_extension_disc() return "Discretize extension works!" end +function get_ref_dist(dist::D, ::Val{true}) where {D <: ContinuousUnivariateDistribution} + return Dist(ZeroInflated(dist)) +end +function get_ref_dist(dist::D, ::Val{false}) where {D <: ContinuousUnivariateDistribution} + return Dist(dist) +end + end diff --git a/ext/DistExt.jl b/ext/DistExt.jl deleted file mode 100644 index 4eb0d51..0000000 --- a/ext/DistExt.jl +++ /dev/null @@ -1,12 +0,0 @@ -module DistExt - -using NetworkHistogram -import NetworkHistogram: test_extension_dist - -using Distributions - -function test_extension_dist() - return "Distribution extension works!" -end - -end diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 9d3fdde..19bcbc0 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -4,6 +4,7 @@ using StaticArrays using ProgressMeter import StatsAPI: loglikelihood, fit, params import Base: convert, eltype, zero +using Distributions include("utils/include.jl") using .FastSymArray diff --git a/src/distributions/cat.jl b/src/distributions/cat.jl new file mode 100644 index 0000000..72917f3 --- /dev/null +++ b/src/distributions/cat.jl @@ -0,0 +1,46 @@ + +const Cat{M, T} = Categorical{T, SVector{M, T}}#, Vector{T}}} + +function Cat(p::SVector{M, T}) where {M, T} + return Categorical(p) +end + +# function Base.show(io, c::Cat) +# print(io, "Cat($(c.p))") +# end + +num_categories(::Type{Cat{M, T}}) where {M, T} = M +num_categories(::Cat{M, T}) where {M, T} = M +zero(c::Cat{M, T}) where {M, T} = Cat(ones(typeof(c.p)) ./ M) +sample(c::Cat) = rand(c) +function fit(c::Cat{M, T}, xs::AbstractVector{Int}) where {M, T} + total = length(xs) + if total == 0 + return zero(c) + end 
+ return Cat(SVector{M}(counts(xs, M) ./ total)) +end + +function fit(c::Cat{M, T}, x::Int) where {M, T} + ps = zeros(T, M) + ps[x] = one(T) + return Cat(SVector{M}(ps)) +end + +function _xlogy(x, y) + if x == 0 + return zero(y) + end + return x * log(y) +end + +function logpdf_cat(p::AbstractVector, obs::Int) + return log(p[obs]) +end + +function logpdf_cat(p::AbstractVector, count_observed::AbstractVector) + #TODO make non allocating + return sum(_xlogy.(count_observed, p)) +end + +distance(c1::Cat{M, V}, c2::Cat{M, V}) where {M, V} = sum(abs.(c1.p .- c2.p)) diff --git a/src/distributions/include.jl b/src/distributions/include.jl index 6b0a06c..2ba3979 100644 --- a/src/distributions/include.jl +++ b/src/distributions/include.jl @@ -1,2 +1,3 @@ include("distributions_type.jl") include("zero_inflated.jl") +include("cat.jl") diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl index c0c24f2..4454086 100644 --- a/src/optimization/swap_categorical.jl +++ b/src/optimization/swap_categorical.jl @@ -1,60 +1,14 @@ -mutable struct WorkspaceDiscreteSwap{ - D, C <: SymArray, R <: SymArray, - R2 <: SymArray, S <: SymArray{D}, - L <: SymArray} - θ::S +mutable struct WorkspaceDiscreteSwap{C <: SymArray, R <: SymArray, + R2 <: SymArray, L <: SymArray} log_likelihood_per_group::L counts::C realized::R estimated::R2 end -struct Cat{M, V <: AbstractVector{<:Real}} - p::V - function Cat(p::AbstractVector{<:Real}) - new{Val{length(p)}, typeof(p)}(p / sum(p)) - end -end - -function Base.show(io::IO, c::Cat) - print(io, "Cat($(c.p))") -end - -num_categories(::Type{Cat{Val{M}, V}}) where {M, V} = M -num_categories(::Cat{Val{M}, V}) where {M, V} = M -zero(c::Cat{Val{M}, V}) where {M, V} = Cat(ones(eltype(V), M)) -distance(c1::Cat{M, V}, c2::Cat{M, V}) where {M, V} = sum(abs.(c1.p .- c2.p)) -eltype(::Cat{M, V}) where {M, V} = Int -params(c::Cat{M, V}) where {M, V} = (c.p,) -logpdf(c::Cat, x::Int) = log(c.p[x]) -function fit(::Cat{Val{M}, V}, 
x::AbstractVector{Int}) where {M, V} - p_est = zeros(eltype(V), M) - for xi in x - p_est[xi] += 1 - end - return Cat{Val{M}, V}(p_est ./ length(x)) -end - -function sample(c::Cat{Val{M}, V}) where {M, V} - return findfirst(x -> x >= rand(), cumsum(c.p)) -end - -function fit(::Cat{Val{M}, V}, x::Int) where {M, V} - p_est = zeros(eltype(V), M) - p_est[x] = 1.0 - return Cat(p_est) -end - -function set_params!(c::Dist{Cat{M, V}}, p::V) where {M, V} - set_params!(c.dist, p) -end -function set_params!(c::Cat{M, V}, p::V) where {M, V} - c.p .= p -end - function Assignment( node_labels, edge_list::EdgeList{E}, - dist::Dist{D}) where {E, D <: Cat} + dist::Dist{Cat{M, T}}) where {E, M, T} n_groups = length(unique(node_labels)) n_nodes = length(node_labels) dists = fit(dist, edge_list) @@ -86,16 +40,12 @@ function Assignment( log_likelihood_per_group = SymArray(n_groups, 0.0) for g2 in 1:n_groups for g1 in g2:n_groups - set_params!(θ[g1, g2], estimated[g1, g2]) - for m in 1:num_categories(unwrap(dist)) - if realized[g1, g2][m] > 0 - log_likelihood_per_group[g1, g2] += realized[g1, g2][m] * - logpdf(θ[g1, g2], m) - end - end + θ[g1, g2] = Dist(Cat(SVector{M}(estimated[g1, g2]))) + log_likelihood_per_group[g1, g2] = logpdf_cat( + estimated[g1, g2], realized[g1, g2]) end end - w = WorkspaceDiscreteSwap(deepcopy(θ), deepcopy(log_likelihood_per_group), + w = WorkspaceDiscreteSwap(deepcopy(log_likelihood_per_group), counts, deepcopy(realized), deepcopy(estimated)) return Assignment( node_labels, edge_list, dists, θ, log_likelihood_per_group, w) @@ -107,17 +57,14 @@ function make_workspace(a::Assignment{E, Dist{D}, end function make_swap_workspace!(ws::WorkspaceDiscreteSwap, a::Assignment) - ws.θ = deepcopy(a.θ) ws.log_likelihood_per_group = deepcopy(a.log_likelihood) ws.realized = deepcopy(a.additional_workspace.realized) ws.estimated = deepcopy(a.additional_workspace.estimated) end function revert_swap_workspace!(a::Assignment, ws::WorkspaceDiscreteSwap) - a.θ = deepcopy(ws.θ) 
a.log_likelihood = deepcopy(ws.log_likelihood_per_group) as = a.additional_workspace - as.θ = deepcopy(ws.θ) as.log_likelihood_per_group = deepcopy(ws.log_likelihood_per_group) as.realized = deepcopy(ws.realized) as.estimated = deepcopy(ws.estimated) @@ -151,19 +98,18 @@ function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) _fast_normalization!.(as.additional_workspace.estimated, as.additional_workspace.realized, as.additional_workspace.counts) swap_node_labels!(as, u, v) - + m = size(as.additional_workspace.estimated[1, 1], 1) for g2 in 1:n_groups for g1 in g2:n_groups - set_params!(as.additional_workspace.θ[g1, g2], - as.additional_workspace.estimated[g1, g2]) - as.additional_workspace.log_likelihood_per_group[g1, g2] = _fast_ll( + as.θ[g1, g2] = Dist(Cat(SVector{m}(as.additional_workspace.estimated[g1, g2]))) + # set_params!(as.additional_workspace.θ[g1, g2], + # as.additional_workspace.estimated[g1, g2]) + as.additional_workspace.log_likelihood_per_group[g1, g2] = logpdf_cat( as.additional_workspace.estimated[g1, g2], as.additional_workspace.realized[ - g1, g2], - as.additional_workspace.counts[g1, g2]) + g1, g2]) end end - as.θ = deepcopy(as.additional_workspace.θ) as.log_likelihood = deepcopy(as.additional_workspace.log_likelihood_per_group) end @@ -176,16 +122,3 @@ function _fast_normalization!(p::AbstractVector, r::AbstractVector, c::Real) fill!(p, 0.0) end end - -function _fast_ll( - p::AbstractVector, r::AbstractVector, c::Real) - ll = zero(eltype(p)) - if c > 0 - @inbounds for m in eachindex(p) - if r[m] > 0 - ll += r[m] * log(p[m]) - end - end - end - return ll -end diff --git a/test/test_cat_case.jl b/test/test_cat_case.jl index c8607c5..51e6bfd 100644 --- a/test/test_cat_case.jl +++ b/test/test_cat_case.jl @@ -3,18 +3,20 @@ using NetworkHistogram using StatsBase using Random using Distributions +using StaticArrays @testset "Swap workspace likelihood update (Categorical)" begin Random.seed!(42) n = 10 k = 2 m = 3 - d_mine = 
NetworkHistogram.Cat(fill(1 / m, m)) + ps = SVector{m}(fill(1 / m, m)) + d_mine = NetworkHistogram.Cat(ps) # Create a block model with two groups sbm = NetworkHistogram.BlockModel(k, d_mine) - sbm[1, 1] = NetworkHistogram.Cat([0.7, 0.2, 0.1]) - sbm[2, 2] = NetworkHistogram.Cat([0.1, 0.3, 0.6]) - sbm[1, 2] = NetworkHistogram.Cat([0.3, 0.4, 0.3]) + sbm[1, 1] = NetworkHistogram.Cat(SVector{3}([0.7, 0.2, 0.1])) + sbm[2, 2] = NetworkHistogram.Cat(SVector{3}([0.1, 0.3, 0.6])) + sbm[1, 2] = NetworkHistogram.Cat(SVector{3}([0.3, 0.4, 0.3])) labels = StatsBase.inverse_rle(1:k, fill(n ÷ k, k)) A = NetworkHistogram.sample(sbm, labels) From 0523a6027378860e6bda960f37793075111655bc Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 8 Oct 2025 16:01:15 +0200 Subject: [PATCH 180/266] add zero inflated for categorical --- src/api.jl | 12 ++++++------ src/distributions/cat.jl | 21 ++++++++++++++++----- src/distributions/distributions_type.jl | 3 +-- src/distributions/zero_inflated.jl | 9 ++++++++- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/src/api.jl b/src/api.jl index ccc51a9..4c93eb1 100644 --- a/src/api.jl +++ b/src/api.jl @@ -6,14 +6,14 @@ end function _nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated) - @debug "preprocessing data" + @info "preprocessing data" dist = get_ref_dist(dist_user, zero_inflated) - g = preprocess_data(data_input, dist) + g = preprocess_data(data_input, dist, zero_inflated) - @debug "started optimizatiion" + @info "started optimization" out = greedy_optimize(g, initial_node_labels, params) - @debug "finished optimizatiion with loglikelihood $(loglikelihood(out))" + @info "finished optimization with loglikelihood $(loglikelihood(out))" return postprocess(out) end @@ -24,8 +24,8 @@ function get_ref_dist(dist::D, ::Val{false}) where {D} return Dist(dist) end -function preprocess_data(data, dist::Dist) - A = EdgeList(_fast_compressed_obs(dist, data)) +function preprocess_data(data, dist::Dist, 
zero_inflated) + A = EdgeList(_fast_compressed_obs(dist, data, zero_inflated)) return A, dist end diff --git a/src/distributions/cat.jl b/src/distributions/cat.jl index 72917f3..2596f1c 100644 --- a/src/distributions/cat.jl +++ b/src/distributions/cat.jl @@ -1,13 +1,13 @@ -const Cat{M, T} = Categorical{T, SVector{M, T}}#, Vector{T}}} +const Cat{M, T} = Categorical{T, SVector{M, T}} function Cat(p::SVector{M, T}) where {M, T} return Categorical(p) end -# function Base.show(io, c::Cat) -# print(io, "Cat($(c.p))") -# end +function Base.show(io::IO, c::Cat) + Base.print(io, c.p) +end num_categories(::Type{Cat{M, T}}) where {M, T} = M num_categories(::Cat{M, T}) where {M, T} = M @@ -39,8 +39,19 @@ function logpdf_cat(p::AbstractVector, obs::Int) end function logpdf_cat(p::AbstractVector, count_observed::AbstractVector) - #TODO make non allocating + #TODO make non allocating with mapreduce ? return sum(_xlogy.(count_observed, p)) end distance(c1::Cat{M, V}, c2::Cat{M, V}) where {M, V} = sum(abs.(c1.p .- c2.p)) + +function get_ref_dist(dist::Categorical, ::Val{true}) + return Dist(Cat(SVector{ncategories(dist) + 1}(0.0, dist.p...))) +end + +function get_ref_dist(dist::Categorical, ::Val{false}) + return Dist(Cat(SVector{ncategories(dist)}(dist.p))) +end + +_fast_compressed_obs(d::Categorical, x, ::Val{true}) = x .+ one(eltype(x)) +_fast_compressed_obs(d::Categorical, x, ::Val{false}) = x diff --git a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl index 93968ac..ba43ede 100644 --- a/src/distributions/distributions_type.jl +++ b/src/distributions/distributions_type.jl @@ -61,7 +61,7 @@ end # expose compression step that assumes there is a pdf(d, typeof(compressed(x))) properly defined # by default do nothing -_fast_compressed_obs(d, x) = x +_fast_compressed_obs(d, x, zero_inflated) = x # what to delegate to the underlying distribution for f in [:logpdf, :sample, :distance, :eltype, :params, :_fast_compressed_obs] @@ -91,4 +91,3 @@ 
logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) params(d::Bernoulli) = (d.p,) eltype(d::Bernoulli) = Bool sample(d::Bernoulli) = Bool(rand() <= d.p) -_fast_compressed_obs(d::Bernoulli, x) = x diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 56801c5..59ac99e 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -1,4 +1,11 @@ +## TODO: define proper distribution that is zero inflated? + +# struct ZIDist{S, F} <: UniVariateDistribution{S} +# dist::UnivariateDistribution{S} +# proba_zero::F +# end + struct ZeroInflated{D, F} dist::D proba_zero::F @@ -60,7 +67,7 @@ function unwrap(d::Dist{ZeroInflated{B, D}}) where {B, D} end function get_proportion_observed(d::Dist{ZeroInflated{B, D}}) where {B, D} - return (1-d.dist.proba_zero) * d.counts + return (1 - d.dist.proba_zero) * d.counts end function get_proportion_observed(d::Dist) From e76664c3091930041a3291c890f7d6abfebc000c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 8 Oct 2025 18:35:08 +0200 Subject: [PATCH 181/266] make assignment concrete and speed up optim --- src/assignment.jl | 4 ++-- src/distributions/cat.jl | 2 +- src/optimization/config_rules/accept_rule.jl | 1 + src/optimization/config_rules/swap_rule.jl | 4 ++-- src/optimization/greedy.jl | 4 ++-- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/assignment.jl b/src/assignment.jl index fe6ee65..2c3fbe3 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -38,8 +38,8 @@ Base.@propagate_inbounds function Base.getindex( return i < length(g) ? 
g.group_number[1] : g.group_number[2] end -mutable struct Assignment{E, D, F, W} - node_labels::AbstractVector{Int} +mutable struct Assignment{E, D, F, W, V <: AbstractVector{Int}} + node_labels::V const edges::EdgeList{E} const dists::EdgeList{D} θ::SymArray{D} diff --git a/src/distributions/cat.jl b/src/distributions/cat.jl index 2596f1c..4900ab6 100644 --- a/src/distributions/cat.jl +++ b/src/distributions/cat.jl @@ -6,7 +6,7 @@ function Cat(p::SVector{M, T}) where {M, T} end function Base.show(io::IO, c::Cat) - Base.print(io, c.p) + Base.print(io, "Cat($(c.p))") end num_categories(::Type{Cat{M, T}}) where {M, T} = M diff --git a/src/optimization/config_rules/accept_rule.jl b/src/optimization/config_rules/accept_rule.jl index 94ce206..f531dc5 100644 --- a/src/optimization/config_rules/accept_rule.jl +++ b/src/optimization/config_rules/accept_rule.jl @@ -19,4 +19,5 @@ function accept_reject_update!(a::Assignment, swap::Swap, ::Strict) if loglikelihood(a) <= current_score revert_swap!(a, swap) end + return nothing end diff --git a/src/optimization/config_rules/swap_rule.jl b/src/optimization/config_rules/swap_rule.jl index 5f5a0f3..19c3dca 100644 --- a/src/optimization/config_rules/swap_rule.jl +++ b/src/optimization/config_rules/swap_rule.jl @@ -15,7 +15,7 @@ current assignment `node_assignment`. 
select_swap function select_indices_swap(assignment::Assignment, ::RandomNodeSwap) - return StatsBase.sample(1:number_nodes(assignment), 2; replace = false) + return Tuple(StatsBase.sample(1:number_nodes(assignment), 2; replace = false)) end function select_indices_swap(assignment::Assignment, ::RandomGroupSwap) @@ -23,5 +23,5 @@ function select_indices_swap(assignment::Assignment, ::RandomGroupSwap) 1:number_groups(assignment), 2; replace = false) index1 = rand(findall(x -> x == groups[1], assignment.node_labels)) index2 = rand(findall(x -> x == groups[2], assignment.node_labels)) - return (index1, index2) + return index1, index2 end diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index 171b7a2..c0b5f1a 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -12,7 +12,7 @@ end function GreedyParams() GreedyParams( - 100_000, RandomGroupSwap(), Strict(), PreviousBestValue(10_000), true) + 100_000, RandomNodeSwap(), Strict(), PreviousBestValue(10_000), true) end function greedy_optimize(g, initial_labels, params::GreedyParams) @@ -25,7 +25,7 @@ end function greedy_improve!(a::Assignment; params = GreedyParams()) # allocate memory for swap - swap = make_swap(a, (1, 1)) + swap = make_swap(a, (1, 2)) # display progress bar p = ProgressUnknown(enabled = params.progress_bar, From b61cce81245a07b6a844bb549213af6aaa8c098a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 8 Oct 2025 21:05:38 +0200 Subject: [PATCH 182/266] check backward compat --- Project.toml | 3 ++ ext/LightMCExt.jl | 68 ++++++++++++++++++++++++++++++ src/EdgeList.jl | 8 ++-- src/NetworkHistogram.jl | 2 + src/api.jl | 1 + src/block_model.jl | 16 +++++-- src/distributions/cat.jl | 4 +- src/distributions/zero_inflated.jl | 12 +++++- 8 files changed, 103 insertions(+), 11 deletions(-) create mode 100644 ext/LightMCExt.jl diff --git a/Project.toml b/Project.toml index b8dc5e6..63decab 100644 --- a/Project.toml +++ b/Project.toml @@ -16,14 +16,17 @@ StructArrays = 
"09ab397b-f2b6-538f-b94a-2f83cf4a842a" [weakdeps] Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" +LightMC = "b58f5c6e-c887-41d6-b553-02118416cd5d" [extensions] BootstrapExt = "Bootstrap" DiscretizeExt = "DiscretizeDistributions" +LightMCExt = "LightMC" [compat] BenchmarkTools = "1.6.0" Distributions = "0.25.120" +LightMC = "1.0.0" StructArrays = "0.7.1" [extras] diff --git a/ext/LightMCExt.jl b/ext/LightMCExt.jl new file mode 100644 index 0000000..36b6a5f --- /dev/null +++ b/ext/LightMCExt.jl @@ -0,0 +1,68 @@ +module LightMCExt + +using NetworkHistogram +using LightMC + +import NetworkHistogram: agg_params, logpdf, sample, params, distance, _fast_compressed_obs, + from_adjs_to_decorated +using LightMC: DiscreteMarkovChain, SampleChain, transition_matrix, ConvertBinaryMC + +logpdf(d::DiscreteMarkovChain, x) = LightMC.logpdf(d, x) +sample(x::DiscreteMarkovChain, args...) = LightMC.sample(x, args...) +params(d) = LightMC.params(d) + +function agg_params(d1::DiscreteMarkovChain, d2::DiscreteMarkovChain, w1, w2) + s1 = Int(sign(w1)) + s2 = Int(sign(w2)) + return DiscreteMarkovChain(s1 .* d1.transitions .+ s2 .* d2.transitions, + s1 .* d1.normalization .+ s2 .* d2.normalization) +end + +function distance(d1::DiscreteMarkovChain, d2::DiscreteMarkovChain) + mean(x -> x^2, transition_matrix(d1) - transition_matrix(d2)) +end +function distance(d1::SampleChain, d2::SampleChain) + mean(x -> x^2, transition_matrix(d1) - transition_matrix(d2)) +end +params(d::DiscreteMarkovChain) = (d.transitions, d.normalization) + +function _fast_compressed_obs(d::DiscreteMarkovChain, x::SampleChain, zeroinflated) + return x +end + +function from_adjs_to_decorated(adjs::AbstractArray{T, 3}, converter::ConvertBinaryMC, + threshold = 0.0) where {T <: Union{Missing, Real}} + sample_chain = MC.periodic_chain(adjs[1, 4, :], converter) + graph = Matrix{Union{typeof(sample_chain), Missing}}( + undef, size(adjs, 1), size(adjs, 2)) + 
counts_t = sum(adjs, dims = 3) + for j in axes(adjs, 2) + for i in axes(adjs, 1) + if i == j || counts_t[i, j] <= threshold * size(adjs, 3) + graph[i, j] = missing + else + graph[i, j] = LightMC.periodic_chain(adjs[i, j, :], converter) + end + end + end + return graph +end + +function from_adjs_to_decorated(adjs::AbstractArray{T, 2}, converter::ConvertBinaryMC, + threshold = 0.0) where {T <: Union{Missing, AbstractArray}} + sample_chain = LightMC.periodic_chain(adjs[1, 4], converter) + graph = Matrix{Union{typeof(sample_chain), Missing}}( + undef, size(adjs, 1), size(adjs, 2)) + for j in axes(adjs, 2) + for i in axes(adjs, 1) + if i == j || sum(adjs[i, j]) <= threshold * length(adjs[i, j]) + graph[i, j] = missing + else + graph[i, j] = LightMC.periodic_chain(adjs[i, j], converter) + end + end + end + return graph +end + +end diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 95e79a9..f1b6836 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -71,11 +71,11 @@ function _from_adj_to_edge_list( return EdgeList(data, name_list) end -function _fast_compressed_obs(d::Dist, A::AbstractMatrix) - _from_adj_to_edge_list(A, x -> _fast_compressed_obs(d, x)) +function _fast_compressed_obs(d::Dist, A::AbstractMatrix, zeroinflated) + _from_adj_to_edge_list(A, x -> _fast_compressed_obs(d, x, zeroinflated)) end -function _fast_compressed_obs(d::Dist, A::EdgeList{E}) where {E} - _make_shift_broadcast(A.data, x -> _fast_compressed_obs(d, x)) +function _fast_compressed_obs(d::Dist, A::EdgeList{E}, zeroinflated) where {E} + _make_shift_broadcast(A.data, x -> _fast_compressed_obs(d, x, zeroinflated)) end function _make_shift_broadcast(A::EdgeList, f) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 19bcbc0..bb92477 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -21,7 +21,9 @@ export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf function test_extension_dist end function test_extension_boot end function 
test_extension_disc end +function from_adjs_to_decorated end export test_extension_dist, test_extension_boot, test_extension_disc +export from_adjs_to_decorated end diff --git a/src/api.jl b/src/api.jl index 4c93eb1..5a7e733 100644 --- a/src/api.jl +++ b/src/api.jl @@ -8,6 +8,7 @@ function _nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated) @info "preprocessing data" dist = get_ref_dist(dist_user, zero_inflated) + @show dist g = preprocess_data(data_input, dist, zero_inflated) @info "started optimization" diff --git a/src/block_model.jl b/src/block_model.jl index 65db31b..fe4397a 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -5,7 +5,7 @@ struct BlockModel{D, K, T} end function BlockModel(k::Int, d::D) where {D} - sizes = @SVector fill(1/k, k) + sizes = @SVector fill(1 / k, k) cumulative_sizes = SVector{k}(cumsum(sizes)) _dists = SymArray(k, d) return BlockModel{D, k, Float64}(_dists, sizes, cumulative_sizes) @@ -16,7 +16,8 @@ function BlockModel(a::Assignment) sizes = SVector{k}(proportions(a)) cumulative_sizes = SVector{k}(cumsum(sizes)) _dists = unwrap.(a.θ) - return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(_dists, sizes, cumulative_sizes) + return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}( + _dists, sizes, cumulative_sizes) end function BlockModel(nodes_labels, θ) @@ -24,7 +25,16 @@ function BlockModel(nodes_labels, θ) sizes = SVector{k}(counts(nodes_labels) / length(nodes_labels)) cumulative_sizes = SVector{k}(cumsum(sizes)) _dists = unwrap.(θ) - return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}(_dists, sizes, cumulative_sizes) + return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}( + _dists, sizes, cumulative_sizes) +end + +function BlockModel(θ::AbstractMatrix{D}) where {D} + k = size(θ, 1) + sizes = @SVector fill(1 / k, k) + cumulative_sizes = SVector{k}(cumsum(sizes)) + _dists = convert(SymArray{D}, θ) + return BlockModel{D, k, Float64}(_dists, sizes, 
cumulative_sizes) end function map_ξ_to_block(bm::BlockModel, ξ::T) where {T <: Real} diff --git a/src/distributions/cat.jl b/src/distributions/cat.jl index 4900ab6..acb3a98 100644 --- a/src/distributions/cat.jl +++ b/src/distributions/cat.jl @@ -53,5 +53,5 @@ function get_ref_dist(dist::Categorical, ::Val{false}) return Dist(Cat(SVector{ncategories(dist)}(dist.p))) end -_fast_compressed_obs(d::Categorical, x, ::Val{true}) = x .+ one(eltype(x)) -_fast_compressed_obs(d::Categorical, x, ::Val{false}) = x +_fast_compressed_obs(d::Categorical, x::Int, ::Val{true}) = x + one(x) +_fast_compressed_obs(d::Categorical, x::Int, ::Val{false}) = x diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl index 59ac99e..bbdaf90 100644 --- a/src/distributions/zero_inflated.jl +++ b/src/distributions/zero_inflated.jl @@ -57,8 +57,8 @@ function fit(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} end end -function _fast_compressed_obs(zi::ZeroInflated, x, filter = iszero) - return SampleZI(_fast_compressed_obs(zi.dist, x), filter(x)) +function _fast_compressed_obs(zi::ZeroInflated, x, zero_inflated; filter = iszero) + return SampleZI(_fast_compressed_obs(zi.dist, x, zero_inflated), filter(x)) end function unwrap(d::Dist{ZeroInflated{B, D}}) where {B, D} @@ -74,6 +74,14 @@ function get_proportion_observed(d::Dist) return d.counts end +function sample(zi::ZeroInflated{D, F}, args...) 
where {D, F} + if rand() < zi.proba_zero + return SampleZI(zero(eltype(zi.dist)), true) + else + return SampleZI(sample(zi.dist, args...), false) + end +end + # function fit(zd::ZeroInflated, x::SampleZI) # if x.iszero # return ZeroInflated(zero(zd.dist), 1.0) From 6f18035c62bb2628783659064e2882aa85f46109 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 8 Oct 2025 21:56:07 +0200 Subject: [PATCH 183/266] remove static arrays for sbm --- src/api.jl | 1 - src/block_model.jl | 32 +++++++++++++++----------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/api.jl b/src/api.jl index 5a7e733..4c93eb1 100644 --- a/src/api.jl +++ b/src/api.jl @@ -8,7 +8,6 @@ function _nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated) @info "preprocessing data" dist = get_ref_dist(dist_user, zero_inflated) - @show dist g = preprocess_data(data_input, dist, zero_inflated) @info "started optimization" diff --git a/src/block_model.jl b/src/block_model.jl index fe4397a..e7fd28c 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -1,40 +1,38 @@ -struct BlockModel{D, K, T} +struct BlockModel{D, V} _dists::SymArray{D} - sizes::SVector{K, T} - cum_sizes::SVector{K, T} + sizes::V + cum_sizes::V end function BlockModel(k::Int, d::D) where {D} - sizes = @SVector fill(1 / k, k) - cumulative_sizes = SVector{k}(cumsum(sizes)) + sizes = fill(1 / k, k) + cumulative_sizes = cumsum(sizes) _dists = SymArray(k, d) - return BlockModel{D, k, Float64}(_dists, sizes, cumulative_sizes) + return BlockModel(_dists, sizes, cumulative_sizes) end function BlockModel(a::Assignment) k = length(unique(a.node_labels)) - sizes = SVector{k}(proportions(a)) - cumulative_sizes = SVector{k}(cumsum(sizes)) + sizes = proportions(a) + cumulative_sizes = cumsum(sizes) _dists = unwrap.(a.θ) - return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}( - _dists, sizes, cumulative_sizes) + return BlockModel(_dists, sizes, cumulative_sizes) end function 
BlockModel(nodes_labels, θ) k = length(unique(nodes_labels)) - sizes = SVector{k}(counts(nodes_labels) / length(nodes_labels)) - cumulative_sizes = SVector{k}(cumsum(sizes)) + sizes = counts(nodes_labels) / length(nodes_labels) + cumulative_sizes = cumsum(sizes) _dists = unwrap.(θ) - return BlockModel{eltype(_dists), k, eltype(cumulative_sizes)}( - _dists, sizes, cumulative_sizes) + return BlockModel(_dists, sizes, cumulative_sizes) end function BlockModel(θ::AbstractMatrix{D}) where {D} k = size(θ, 1) - sizes = @SVector fill(1 / k, k) - cumulative_sizes = SVector{k}(cumsum(sizes)) + sizes = fill(1 / k, k) + cumulative_sizes = cumsum(sizes) _dists = convert(SymArray{D}, θ) - return BlockModel{D, k, Float64}(_dists, sizes, cumulative_sizes) + return BlockModel(_dists, sizes, cumulative_sizes) end function map_ξ_to_block(bm::BlockModel, ξ::T) where {T <: Real} From 1f86bd7d98306ddc3298e5292098c018b7a78f9e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 9 Oct 2025 21:44:27 +0200 Subject: [PATCH 184/266] add Makie extension and sanity check on multiplex fit --- Project.toml | 7 +-- ext/BootstrapExt.jl | 5 -- ext/DiscretizeExt.jl | 7 +-- ext/MakieExt.jl | 46 ++++++++++++++++ src/NetworkHistogram.jl | 7 +-- src/api.jl | 6 ++- src/assignment.jl | 70 ++++++++++++------------- src/block_model.jl | 31 +++++++++++ src/distributions/cat.jl | 11 +++- src/distributions/distributions_type.jl | 3 ++ src/utils/SymArray.jl | 4 ++ 11 files changed, 137 insertions(+), 60 deletions(-) create mode 100644 ext/MakieExt.jl diff --git a/Project.toml b/Project.toml index 63decab..5b2b87b 100644 --- a/Project.toml +++ b/Project.toml @@ -17,17 +17,14 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" LightMC = "b58f5c6e-c887-41d6-b553-02118416cd5d" +Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" [extensions] BootstrapExt = "Bootstrap" DiscretizeExt = 
"DiscretizeDistributions" LightMCExt = "LightMC" +MakieExt = "Makie" -[compat] -BenchmarkTools = "1.6.0" -Distributions = "0.25.120" -LightMC = "1.0.0" -StructArrays = "0.7.1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/ext/BootstrapExt.jl b/ext/BootstrapExt.jl index 6d3846e..5584274 100644 --- a/ext/BootstrapExt.jl +++ b/ext/BootstrapExt.jl @@ -1,12 +1,7 @@ module BootstrapExt using NetworkHistogram -import NetworkHistogram: test_extension_boot using Bootstrap -function test_extension_boot() - return "Bootstrap extension works!" -end - end diff --git a/ext/DiscretizeExt.jl b/ext/DiscretizeExt.jl index af85253..4cd5ddf 100644 --- a/ext/DiscretizeExt.jl +++ b/ext/DiscretizeExt.jl @@ -1,15 +1,10 @@ module DiscretizeExt using NetworkHistogram -import NetworkHistogram: test_extension_disc, get_ref_dist, Dist, ZeroInflated +import NetworkHistogram: get_ref_dist, Dist, ZeroInflated import Distributions: ContinuousUnivariateDistribution using DiscretizeDistributions -# in_interval.(x, support(discretized_dist)) -function test_extension_disc() - return "Discretize extension works!" -end - function get_ref_dist(dist::D, ::Val{true}) where {D <: ContinuousUnivariateDistribution} return Dist(ZeroInflated(dist)) end diff --git a/ext/MakieExt.jl b/ext/MakieExt.jl new file mode 100644 index 0000000..e89f3d8 --- /dev/null +++ b/ext/MakieExt.jl @@ -0,0 +1,46 @@ +module MakieExt + +using NetworkHistogram +using Makie + +import NetworkHistogram: get_probability_matrix, Assignment, heatmap_params, params, + number_nodes +import Distributions: params + +_splatter_args(ps) = vcat(vec.(ps)...) 
+_extract_params(d) = _splatter_args(params(d)) + +function Makie.convert_arguments(::Type{<:AbstractPlot}, a::Assignment) + params_matrix = map(_extract_params, get_probability_matrix(a)) + ps = (getindex.(params_matrix, i) for i in 1:length(params_matrix[1, 2])) + return ps +end + +function heatmap_params(a; colormap = :balance, ordering = false) + params_matrix = map(_extract_params, get_probability_matrix(a)) + if ordering + perm = sortperm(a.node_labels) + else + perm = 1:number_nodes(a) + end + params_matrix = params_matrix[perm, perm] + num_params = length(params_matrix[1, 2]) + # Compute rows and columns such that rows * columns >= num_params and as square as possible + rows = floor(Int, sqrt(num_params)) + cols = ceil(Int, num_params / rows) + if rows * cols < num_params + rows += 1 + end + fig = Figure(size = (800, 800)) + # create a grid of subplots with rows x cols cells + axes = [Axis(fig[i, j]) for i in 1:rows, j in 1:cols] + + for i in 1:num_params + heatmap!(axes[i], getindex.(params_matrix, i)[perm, perm], colormap = colormap) + axes[i].title = "Parameter $i" + end + return fig +end + +export heatmap_params +end diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index bb92477..6c145da 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -18,12 +18,9 @@ include("api.jl") export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf -function test_extension_dist end -function test_extension_boot end -function test_extension_disc end function from_adjs_to_decorated end -export test_extension_dist, test_extension_boot, test_extension_disc +function heatmap_params end -export from_adjs_to_decorated +export from_adjs_to_decorated, heatmap_params end diff --git a/src/api.jl b/src/api.jl index 4c93eb1..1e91a9b 100644 --- a/src/api.jl +++ b/src/api.jl @@ -6,11 +6,11 @@ end function _nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated) - @info "preprocessing data" + @debug 
"preprocessing data" dist = get_ref_dist(dist_user, zero_inflated) g = preprocess_data(data_input, dist, zero_inflated) - @info "started optimization" + @debug "started optimization" out = greedy_optimize(g, initial_node_labels, params) @info "finished optimization with loglikelihood $(loglikelihood(out))" @@ -31,5 +31,7 @@ end function postprocess(out) return out + return Assignment(out.node_labels, out.edges, out.dists, SymArray(unwrap.(out.θ)), + out.log_likelihood, out.additional_workspace) return out.node_labels, BlockModel(out) end diff --git a/src/assignment.jl b/src/assignment.jl index 2c3fbe3..c523713 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -1,42 +1,42 @@ -""" -Array-like storage for the number of nodes in each group. Try to split the number of nodes -into equal groups, but if it is not possible, the last group may have more nodes. -""" -struct GroupSize{T} <: AbstractVector{Int} - group_number::T - number_groups::Int +# """ +# Array-like storage for the number of nodes in each group. Try to split the number of nodes +# into equal groups, but if it is not possible, the last group may have more nodes. +# """ +# struct GroupSize{T} <: AbstractVector{Int} +# group_number::T +# number_groups::Int - function GroupSize(number_nodes, h::Real) - @assert 0 < h < 1 - standard_group = floor(Int, number_nodes * h) - GroupSize(number_nodes, standard_group) - end +# function GroupSize(number_nodes, h::Real) +# @assert 0 < h < 1 +# standard_group = floor(Int, number_nodes * h) +# GroupSize(number_nodes, standard_group) +# end - function GroupSize(number_nodes, standard_group::Integer) - @assert 1 < standard_group <= number_nodes - number_groups = number_nodes ÷ standard_group # number of standard groups! 
- if number_groups * standard_group == number_nodes - new{Int}(standard_group, number_groups) - else - remainder_group = standard_group + - mod(number_nodes, standard_group) - new{Tuple{Int, Int}}( - (standard_group, remainder_group), number_groups) - end - end -end +# function GroupSize(number_nodes, standard_group::Integer) +# @assert 1 < standard_group <= number_nodes +# number_groups = number_nodes ÷ standard_group # number of standard groups! +# if number_groups * standard_group == number_nodes +# new{Int}(standard_group, number_groups) +# else +# remainder_group = standard_group + +# mod(number_nodes, standard_group) +# new{Tuple{Int, Int}}( +# (standard_group, remainder_group), number_groups) +# end +# end +# end -Base.size(g::GroupSize) = (g.number_groups,) -Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) - @boundscheck checkbounds(g, i) - return g.group_number -end +# Base.size(g::GroupSize) = (g.number_groups,) +# Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) +# @boundscheck checkbounds(g, i) +# return g.group_number +# end -Base.@propagate_inbounds function Base.getindex( - g::GroupSize{Tuple{Int, Int}}, i::Int) - @boundscheck checkbounds(g, i) - return i < length(g) ? g.group_number[1] : g.group_number[2] -end +# Base.@propagate_inbounds function Base.getindex( +# g::GroupSize{Tuple{Int, Int}}, i::Int) +# @boundscheck checkbounds(g, i) +# return i < length(g) ? 
g.group_number[1] : g.group_number[2] +# end mutable struct Assignment{E, D, F, W, V <: AbstractVector{Int}} node_labels::V diff --git a/src/block_model.jl b/src/block_model.jl index e7fd28c..9fdfb55 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -94,3 +94,34 @@ end function ordered_latents(bm::BlockModel, n::Int) return sort(map(x -> map_ξ_to_block(bm, x), rand(n))) end + +function get_probability_matrix( + bm::BlockModel{D}, latents::AbstractVector, default_dist = nothing) where {D} + # hack for dirac at 0 dist (no self-loop) + if isnothing(default_dist) + try + default_dist = zero(bm[1, 1]) + catch e + if !is(e, MethodError) + rethrow(e) + end + error("Please provide a default distribution for the diagonal as it could not be inferred") + end + end + n = length(latents) + A = Array{D, 2}(undef, n, n) + for j in 1:n + for i in 1:n + if i == j + A[i, i] = default_dist + else + A[i, j] = bm[latents[i], latents[j]] + end + end + end + return A +end + +function get_probability_matrix(a::Assignment, default_dist = nothing) + return get_probability_matrix(BlockModel(a.θ), a.node_labels, default_dist) +end diff --git a/src/distributions/cat.jl b/src/distributions/cat.jl index acb3a98..8cdb9ad 100644 --- a/src/distributions/cat.jl +++ b/src/distributions/cat.jl @@ -1,4 +1,3 @@ - const Cat{M, T} = Categorical{T, SVector{M, T}} function Cat(p::SVector{M, T}) where {M, T} @@ -21,7 +20,7 @@ function fit(c::Cat{M, T}, xs::AbstractVector{Int}) where {M, T} return Cat(SVector{M}(counts(xs, M) ./ total)) end -function fit(c::Cat{M, T}, x::Int) where {M, T} +function fit(::Cat{M, T}, x::Int) where {M, T} ps = zeros(T, M) ps[x] = one(T) return Cat(SVector{M}(ps)) @@ -55,3 +54,11 @@ end _fast_compressed_obs(d::Categorical, x::Int, ::Val{true}) = x + one(x) _fast_compressed_obs(d::Categorical, x::Int, ::Val{false}) = x + +function tv_distance(c1::Cat, c2::Cat) + return sum(abs.(c1.p .- c2.p)) / 2 +end + +function l2_distance(c1::Cat, c2::Cat) + return sqrt(sum((c1.p .- 
c2.p) .^ 2)) +end diff --git a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl index ba43ede..581b61a 100644 --- a/src/distributions/distributions_type.jl +++ b/src/distributions/distributions_type.jl @@ -75,6 +75,9 @@ loglikelihood(d::Dist, x) = isempty(x) ? 0.0 : sum(logpdf(d, y) for y in x) # loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) unwrap(d::Dist) = d.dist +Base.promote_rule(::Type{Dist{D}}, ::Type{D}) where {D} = D +Base.convert(::Type{D}, d::Dist{D}) where {D} = d.dist + # Bernoulli distribution (example) struct Bernoulli{T <: Real} diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 0cdc56f..f91b341 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -25,6 +25,10 @@ function SymArray(k::T, d::AbstractArray) where {T <: Real} k) end +function SymArray(d::AbstractMatrix{F}) where {F} + return convert(SymArray{F}, d) +end + function Base.size(a::SymArray) return (a.k, a.k) end From fe044abad879a80b32d1f9affa34894448e22ad3 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 10 Oct 2025 12:12:31 +0200 Subject: [PATCH 185/266] start with tutorials --- docs/Project.toml | 4 + docs/literate/tutorials/multiplex_network.jl | 6 + docs/literate/tutorials/simple_graph.jl | 151 ++++++++++++++ docs/literate/tutorials/temporal_networks.jl | 6 + docs/literate/tutorials/weighted_network.jl | 6 + docs/make.jl | 35 +++- docs/src/api.md | 4 +- docs/src/custom_types.md | 8 - docs/src/examples.md | 0 docs/src/index.md | 12 +- docs/src/internal.md | 8 - docs/src/internals/assignments.md | 20 -- docs/src/internals/distributions.md | 14 -- docs/src/rules.md | 34 --- docs/src/tutorials/multiplex_network.md | 12 ++ docs/src/tutorials/simple_graph.md | 208 +++++++++++++++++++ docs/src/tutorials/temporal_networks.md | 12 ++ docs/src/tutorials/weighted_network.md | 12 ++ ext/MakieExt.jl | 52 ++++- src/block_model.jl | 37 +++- 20 files changed, 534 insertions(+), 107 deletions(-) create mode 100644 
docs/literate/tutorials/multiplex_network.jl create mode 100644 docs/literate/tutorials/simple_graph.jl create mode 100644 docs/literate/tutorials/temporal_networks.jl create mode 100644 docs/literate/tutorials/weighted_network.jl delete mode 100644 docs/src/custom_types.md delete mode 100644 docs/src/examples.md delete mode 100644 docs/src/internal.md delete mode 100644 docs/src/internals/assignments.md delete mode 100644 docs/src/internals/distributions.md delete mode 100644 docs/src/rules.md create mode 100644 docs/src/tutorials/multiplex_network.md create mode 100644 docs/src/tutorials/simple_graph.md create mode 100644 docs/src/tutorials/temporal_networks.md create mode 100644 docs/src/tutorials/weighted_network.md diff --git a/docs/Project.toml b/docs/Project.toml index c6d1322..d1d0c6f 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,7 +1,11 @@ [deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253" +Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl new file mode 100644 index 0000000..cd85fb2 --- /dev/null +++ b/docs/literate/tutorials/multiplex_network.jl @@ -0,0 +1,6 @@ +#= +# Decorated Graphon Tutorial for Multiplex Networks +=# + + +# # How to use NetworkHistogram.jl for Multiplex Networks diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl new file mode 100644 index 0000000..5a850dc --- /dev/null +++ b/docs/literate/tutorials/simple_graph.jl @@ -0,0 +1,151 
@@ +#= +# A Simple Graphon Tutorial with NetworkHistogram.jl +=# + +# This tutorial introduces the concept of a graphon, demonstrates how to sample a graph from one, and then shows how to estimate the graphon from the sampled graph using the Network Histogram method provided by `NetworkHistogram.jl`. + +# ## What is a Graphon? + +# A graphon (or graph function) is a symmetric, measurable function $$W: [0, 1]^2 \to [0, 1]$$. + +# It serves as a generative model for random graphs. Think of it as a continuous and more general version of a stochastic block model. + +# In simple terms, each node `i` in a graph is assigned a latent (unobserved) position $u_i \in [0, 1]$. The probability of an edge existing between two nodes `i` and `j` is then given by the graphon function evaluated at their latent positions: $P(A_{ij} = 1 \mid u_i, u_j) = W(u_i, u_j)$. + +# Let's define a simple graphon in Julia. For this example, we'll use the smooth product graphon $W(u, v) = uv$, which, unlike a stochastic block model, is not piecewise constant. + +import CairoMakie as Mke +using LinearAlgebra +using Random +import StatsBase: inverse_rle +using Statistics +using NetworkHistogram +using Distributions + +h = 300; # hide + +# Define a simple smooth (product) graphon +W(u, v) = u * v + +# We can visualize this graphon as a heatmap. +let + grid = 0:0.01:1 + fig = Mke.Figure(size = (h + 20, h)) + ax = Mke.Axis(fig[1, 1], title = "True Graphon W(u,v)", + xlabel = "u", ylabel = "v", aspect = Mke.DataAspect()) + hm = Mke.heatmap!(ax, grid, grid, W, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 2], hm) + fig +end + +#md +# ## Sampling a Graph from a Graphon + +# To generate a random graph from a graphon, we follow these steps: +# 1. **Assign latent positions:** For a graph with `n` nodes, we sample `n` independent and identically distributed random variables $u_1, u_2, \dots, u_n$ from a Uniform(0, 1) distribution. These are the latent positions of our nodes. +# 2. 
**Generate edges:** For each pair of nodes `(i, j)` with `i < j`, we generate a random number from a Bernoulli distribution with probability $W(u_i, u_j)$. This determines whether an edge exists between them. The resulting adjacency matrix `A` will be symmetric. + +# Let's write a function to do this. + +function sample_graph(W_func, n::Int; seed = 123) + Random.seed!(seed) + u = rand(n) # Latent positions + A = zeros(Int, n, n) + for i in 1:n + for j in (i + 1):n + if rand() < W_func(u[i], u[j]) + A[i, j] = A[j, i] = 1 + end + end + end + return A, u +end + +# Now, let's sample a graph with 400 nodes from our graphon `W`. +n = 400 +A, u_true = sample_graph(W, n) + +# We can visualize the adjacency matrix of the sampled graph. +# To make the block structure visible, we sort the nodes by their latent positions. +perm = sortperm(u_true) +A = A[perm, perm] +let + fig = Mke.Figure(size = (h, h)) + ax = Mke.Axis( + fig[1, 1], title = "Sampled Adjacency Matrix (Sorted)", aspect = Mke.DataAspect()) + Mke.heatmap!(ax, A, colormap = :binary) + fig +end + +#md +# ## The Network Histogram Method + +# The Network Histogram method is a non-parametric approach to estimate a graphon from a single observed network. The core idea is to approximate the (unknown) graphon `W` with a piecewise constant function. + +# This is achieved by: +# 1. **Partitioning the nodes:** The nodes of the graph are partitioned into `k` groups. +# 2. **Estimating block probabilities:** The probability of an edge between any two groups is estimated by the density of edges between them. +# 3. **Constructing the histogram:** These estimated probabilities form a `k x k` matrix, which is a step-function approximation of the true graphon. + +# The main challenge is to find the optimal partition of nodes. `NetworkHistogram.jl` provides tools to find a good partition by optimizing an objective function, such as the log-likelihood of the observed graph under the model. 
+ +# ## Fitting a Network Histogram with NetworkHistogram.jl + +# Now, let's use `NetworkHistogram.jl` to fit a network histogram to the graph `A` we sampled earlier. We will try to approximate the underlying graphon `W` with a piecewise-constant block model. + +# First, we need to represent our graph in a format that the package understands. +# We can use an `EdgeList` to store the edges of the graph. +edge_list = EdgeList(A) + +# We also need to define the model for the edges. Since our graph is unweighted, +# we can use a `Bernoulli` distribution. The `Dist` wrapper is used to +# handle aggregation of distributions. +import NetworkHistogram: Dist, Assignment, nethist +dist = NetworkHistogram.Bernoulli(0.5) # The initial probability doesn't matter much. + +# We start with a random initial assignment of nodes to `k = floor(Int, sqrt(n))` groups (`k = 20` for `n = 400`). +k = floor(Int, sqrt(n)) +oracle_labels = inverse_rle(1:k, fill(n ÷ k, k)) + +initial_assignment = shuffle(oracle_labels) + +# Now, we create an `Assignment` object, which holds all the information +# about the model and the current state of the node groupings. +oracle_estimator = Assignment(oracle_labels, edge_list, Dist(dist)); +heatmap_params(oracle_estimator, ordering = false, colorrange = (0, 1)) + +println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) +# `NetworkHistogram.jl` provides optimization algorithms to improve the initial assignment. +# Let's use the `nethist` function with `GreedyParams`, which iteratively moves nodes between +# groups to maximize the log-likelihood. + +params_opti = NetworkHistogram.GreedyParams( + 100_000, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue(2_000), false) + +a = nethist(A, dist, initial_assignment, params_opti, false); + +# The `Assignment` object `a` now contains the optimized node groupings and +# the fitted network histogram parameters. + +# We can visualize the fitted histogram. 
+heatmap_params(a, ordering = false, colorrange = (0, 1)) + +# And we can look at the estimated block model. +sbm_fitted = NetworkHistogram.BlockModel(a); +# We first align the groups to the true latent positions. +NetworkHistogram.align_sbm_true_latents!(sbm_fitted, a, oracle_estimator.node_labels); + +# and display the true function, the oracle estimator, and the fitted model +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], 0:0.01:1, 0:0.01:1, W, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], NetworkHistogram.BlockModel(oracle_estimator), + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], sbm_fitted, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end diff --git a/docs/literate/tutorials/temporal_networks.jl b/docs/literate/tutorials/temporal_networks.jl new file mode 100644 index 0000000..16e0166 --- /dev/null +++ b/docs/literate/tutorials/temporal_networks.jl @@ -0,0 +1,6 @@ +#= +# Decorated Graphon Tutorial for Temporal Networks +=# + + +# # How to use NetworkHistogram.jl for Temporal Networks diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl new file mode 100644 index 0000000..d2061f7 --- /dev/null +++ b/docs/literate/tutorials/weighted_network.jl @@ -0,0 +1,6 @@ +#= +# Decorated Graphon Tutorial for Weighted Networks +=# + + +# # How to use NetworkHistogram.jl for Weighted Networks diff --git a/docs/make.jl b/docs/make.jl index 3c9567d..6bef294 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,6 +1,32 @@ using NetworkHistogram using Documenter +## Literate preprocessing, maybe move to a separate script later for faster builds + +# to run with LiveServer and avoid infinite loops, 
use +# servedocs(literate_dir=joinpath("docs","literate","tutorials"),skip_dir = joinpath("docs","src","tutorials")) +# adapting `tutorials` to whatever subdir you are working on + +using Literate + +LITERATE_INPUT = joinpath(@__DIR__, "literate") +LITERATE_OUTPUT = joinpath(@__DIR__, "src") + +for dir_path in filter(isdir, readdir(joinpath(@__DIR__, "literate"), join = true)) + dirname = basename(dir_path) + + for (root, _, files) in walkdir(dir_path), file in files + # ignore non julia files + splitext(file)[2] == ".jl" || continue + # full path to a literate script + ipath = joinpath(root, file) + # generated output path + opath = splitdir(replace(ipath, LITERATE_INPUT => LITERATE_OUTPUT))[1] + # generate the markdown file calling Literate + Literate.markdown(ipath, opath) + end +end + DocMeta.setdocmeta!( NetworkHistogram, :DocTestSetup, :(using NetworkHistogram); recursive = true) @@ -18,11 +44,10 @@ makedocs(; pages = [ "Home" => "index.md", "API Reference" => "api.md", - "Optimization hyperparameters" => "rules.md", - "Examples" => "examples.md", - "Internal" => [ - "internals/assignments.md", "internals/distributions.md"] - ], + "Tutorials" => ["First steps" => "tutorials/simple_graph.md", + "Multiplex networks" => "tutorials/multiplex_network.md", + "Weighted networks" => "tutorials/weighted_network.md", + "Temporal networks" => "tutorials/temporal_networks.md"]], checkdocs = :none) deploydocs(; diff --git a/docs/src/api.md b/docs/src/api.md index e0cc4b3..8ecacaf 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -3,8 +3,8 @@ Pages = ["api.md"] Depth = 2 ``` -```@autodocs + diff --git a/docs/src/custom_types.md b/docs/src/custom_types.md deleted file mode 100644 index 68ed03d..0000000 --- a/docs/src/custom_types.md +++ /dev/null @@ -1,8 +0,0 @@ -```@contents -Pages = ["custom_types.md"] -Depth = 2 -``` - - -# How to specialize the `Assignment` type for faster performance - diff --git a/docs/src/examples.md b/docs/src/examples.md deleted file mode 
100644 index e69de29..0000000 diff --git a/docs/src/index.md b/docs/src/index.md index 9629542..0bd81de 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,7 +1,8 @@ # NetworkHistogram.jl -Implementation of the network histogram for graphon estimation from the paper [Network histograms and universality of blockmodel approximation](https://doi.org/10.1073/pnas.1400374111) by Sofia C. Olhede and Patrick J. Wolfe. - +Implementation of the network histogram for graphon estimation from the paper +[Network histograms and universality of blockmodel approximation](https://doi.org/10.1073/pnas.1400374111) +by Sofia C. Olhede and Patrick J. Wolfe. ## Installation @@ -11,7 +12,8 @@ Pkg.add("NetworkHistogram") ## Usage -We fit the estimator and then extract the estimated graphon matrix and node labels. +We fit the estimator and then extract the estimated graphon matrix and node +labels. ```julia using NetworkHistogram, LinearAlgebra @@ -29,5 +31,5 @@ sbm_matrix = estimate.θ node_labels = estimate.node_labels ``` -You can control the optimization process by modifying the rules used in the optimization. -Check out [Optimization hyper-parameters](@ref) for more information. \ No newline at end of file +You can control the optimization process by modifying the rules used in the +optimization. 
diff --git a/docs/src/internal.md b/docs/src/internal.md deleted file mode 100644 index df748f0..0000000 --- a/docs/src/internal.md +++ /dev/null @@ -1,8 +0,0 @@ -```@contents -Pages = ["internal.md"] -Depth = 2 -``` - - -# Assignments and group sizes - diff --git a/docs/src/internals/assignments.md b/docs/src/internals/assignments.md deleted file mode 100644 index 29c717a..0000000 --- a/docs/src/internals/assignments.md +++ /dev/null @@ -1,20 +0,0 @@ -```@contents -Pages = ["assignments.md"] -Depth = 1 -``` - - -# Assignments and group sizes - - -```@autodocs -Modules = [NetworkHistogram] -Pages = ["Assignments.jl", "group_numbering.jl"] -Private = true -``` - -## How to specialize the `Assignment` type for faster performance - -```@docs -NetworkHistogram.BernoulliData -``` \ No newline at end of file diff --git a/docs/src/internals/distributions.md b/docs/src/internals/distributions.md deleted file mode 100644 index 73c2cd4..0000000 --- a/docs/src/internals/distributions.md +++ /dev/null @@ -1,14 +0,0 @@ -```@contents -Pages = ["distributions.md"] -Depth = 0 -``` - - -# Distributions - - -```@autodocs -Modules = [NetworkHistogram] -Pages = ["categorical_with_0.jl", "discrete_dist.jl","discretizer.jl", "zero_inflated.jl"] -Private = true -``` \ No newline at end of file diff --git a/docs/src/rules.md b/docs/src/rules.md deleted file mode 100644 index 788374e..0000000 --- a/docs/src/rules.md +++ /dev/null @@ -1,34 +0,0 @@ -# Optimization hyper-parameters - -Here we discuss the different parameters that can be used to control the optimization process. The optimization greedily tries to find a good partition of the network. You can control the optimization process by setting the following parameters: - -## Starting node labels - -```@docs; canonical=false -NetworkHistogram.initialize_node_labels -``` - -!!! note - The groups will be of size `floor(h * n)` where `n` is the number of nodes if `h` is a - float. If `h` is an integer, the groups will be of size `h`. 
The last group may be - bigger if `n` is not exactly divisible by the group size. - - -## Swapping rule - -```@docs; canonical=false -NetworkHistogram.select_swap -``` - - -## Acceptance rule - -```@docs; canonical=false -NetworkHistogram.accept_reject_update! -``` - -## Stopping rule - -```@docs; canonical=false -NetworkHistogram.stopping_rule -``` \ No newline at end of file diff --git a/docs/src/tutorials/multiplex_network.md b/docs/src/tutorials/multiplex_network.md new file mode 100644 index 0000000..92da870 --- /dev/null +++ b/docs/src/tutorials/multiplex_network.md @@ -0,0 +1,12 @@ +```@meta +EditURL = "../../literate/tutorials/multiplex_network.jl" +``` + +# Decorated Graphon Tutorial for Multiplex Networks + +# How to use NetworkHistogram.jl for Multiplex Networks + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/docs/src/tutorials/simple_graph.md b/docs/src/tutorials/simple_graph.md new file mode 100644 index 0000000..bfe8bb9 --- /dev/null +++ b/docs/src/tutorials/simple_graph.md @@ -0,0 +1,208 @@ +```@meta +EditURL = "../../literate/tutorials/simple_graph.jl" +``` + +# A Simple Graphon Tutorial with NetworkHistogram.jl + +This tutorial introduces the concept of a graphon, demonstrates how to sample a graph from one, and then shows how to estimate the graphon from the sampled graph using the Network Histogram method provided by `NetworkHistogram.jl`. + +## What is a Graphon? + +A graphon (or graph function) is a symmetric, measurable function $$W: [0, 1]^2 \to [0, 1]$$. + +It serves as a generative model for random graphs. Think of it as a continuous and more general version of a stochastic block model. + +In simple terms, each node `i` in a graph is assigned a latent (unobserved) position $u_i \in [0, 1]$. The probability of an edge existing between two nodes `i` and `j` is then given by the graphon function evaluated at their latent positions: + +Let's define a simple graphon in Julia. 
For this example, we'll use the smooth product graphon $W(u, v) = uv$, which, unlike a stochastic block model, is not piecewise constant. + +````@example simple_graph +import CairoMakie as Mke +using LinearAlgebra +using Random +import StatsBase: inverse_rle +using Statistics +using NetworkHistogram +using Distributions + +h = 300; # hide +nothing #hide +```` + +Define a simple smooth (product) graphon + +````@example simple_graph +W(u, v) = u * v +```` + +We can visualize this graphon as a heatmap. + +````@example simple_graph +let + grid = 0:0.01:1 + fig = Mke.Figure(size = (h + 20, h)) + ax = Mke.Axis(fig[1, 1], title = "True Graphon W(u,v)", + xlabel = "u", ylabel = "v", aspect = Mke.DataAspect()) + hm = Mke.heatmap!(ax, grid, grid, W, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 2], hm) + fig +end + +#md +```` + +## Sampling a Graph from a Graphon + +To generate a random graph from a graphon, we follow these steps: +1. **Assign latent positions:** For a graph with `n` nodes, we sample `n` independent and identically distributed random variables $u_1, u_2, \dots, u_n$ from a Uniform(0, 1) distribution. These are the latent positions of our nodes. +2. **Generate edges:** For each pair of nodes `(i, j)` with `i < j`, we generate a random number from a Bernoulli distribution with probability $W(u_i, u_j)$. This determines whether an edge exists between them. The resulting adjacency matrix `A` will be symmetric. + +Let's write a function to do this. + +````@example simple_graph +function sample_graph(W_func, n::Int; seed = 123) + Random.seed!(seed) + u = rand(n) # Latent positions + A = zeros(Int, n, n) + for i in 1:n + for j in (i + 1):n + if rand() < W_func(u[i], u[j]) + A[i, j] = A[j, i] = 1 + end + end + end + return A, u +end +```` + +Now, let's sample a graph with 400 nodes from our graphon `W`. + +````@example simple_graph +n = 400 +A, u_true = sample_graph(W, n) +```` + +We can visualize the adjacency matrix of the sampled graph. 
+To make the block structure visible, we sort the nodes by their latent positions. + +````@example simple_graph +perm = sortperm(u_true) +A = A[perm, perm] +let + fig = Mke.Figure(size = (h, h)) + ax = Mke.Axis( + fig[1, 1], title = "Sampled Adjacency Matrix (Sorted)", aspect = Mke.DataAspect()) + Mke.heatmap!(ax, A, colormap = :binary) + fig +end + +#md +```` + +## The Network Histogram Method + +The Network Histogram method is a non-parametric approach to estimate a graphon from a single observed network. The core idea is to approximate the (unknown) graphon `W` with a piecewise constant function. + +This is achieved by: +1. **Partitioning the nodes:** The nodes of the graph are partitioned into `k` groups. +2. **Estimating block probabilities:** The probability of an edge between any two groups is estimated by the density of edges between them. +3. **Constructing the histogram:** These estimated probabilities form a `k x k` matrix, which is a step-function approximation of the true graphon. + +The main challenge is to find the optimal partition of nodes. `NetworkHistogram.jl` provides tools to find a good partition by optimizing an objective function, such as the log-likelihood of the observed graph under the model. + +## Fitting a Network Histogram with NetworkHistogram.jl + +Now, let's use `NetworkHistogram.jl` to fit a network histogram to the graph `A` we sampled earlier. We will try to approximate the underlying graphon `W` with a piecewise-constant block model. + +First, we need to represent our graph in a format that the package understands. +We can use an `EdgeList` to store the edges of the graph. + +````@example simple_graph +edge_list = EdgeList(A) +```` + +We also need to define the model for the edges. Since our graph is unweighted, +we can use a `Bernoulli` distribution. The `Dist` wrapper is used to +handle aggregation of distributions. 
+ +````@example simple_graph +import NetworkHistogram: Dist, Assignment, nethist +dist = NetworkHistogram.Bernoulli(0.5) # The initial probability doesn't matter much. +```` + +We start with a random initial assignment of nodes to `k = floor(Int, sqrt(n))` groups (`k = 20` for `n = 400`). + +````@example simple_graph +k = floor(Int, sqrt(n)) +oracle_labels = inverse_rle(1:k, fill(n ÷ k, k)) + +initial_assignment = shuffle(oracle_labels) +```` + +Now, we create an `Assignment` object, which holds all the information +about the model and the current state of the node groupings. + +````@example simple_graph +oracle_estimator = Assignment(oracle_labels, edge_list, Dist(dist)); +heatmap_params(oracle_estimator, ordering = false, colorrange = (0, 1)) + +println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) +```` + +`NetworkHistogram.jl` provides optimization algorithms to improve the initial assignment. +Let's use the `nethist` function with `GreedyParams`, which iteratively moves nodes between +groups to maximize the log-likelihood. + +````@example simple_graph +params_opti = NetworkHistogram.GreedyParams( + 100_000, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue(2_000), false) + +a = nethist(A, dist, initial_assignment, params_opti, false); +nothing #hide +```` + +The `Assignment` object `a` now contains the optimized node groupings and +the fitted network histogram parameters. + +We can visualize the fitted histogram. + +````@example simple_graph +heatmap_params(a, ordering = false, colorrange = (0, 1)) +```` + +And we can look at the estimated block model. + +````@example simple_graph +sbm_fitted = NetworkHistogram.BlockModel(a); +nothing #hide +```` + +We first align the groups to the true latent positions. 
+ +````@example simple_graph +NetworkHistogram.align_sbm_true_latents!(sbm_fitted, a, oracle_estimator.node_labels); +nothing #hide +```` + +and display the true function, the oracle estimator, and the fitted model + +````@example simple_graph +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], 0:0.01:1, 0:0.01:1, W, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], NetworkHistogram.BlockModel(oracle_estimator), + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], sbm_fitted, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end +```` + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/docs/src/tutorials/temporal_networks.md b/docs/src/tutorials/temporal_networks.md new file mode 100644 index 0000000..7174672 --- /dev/null +++ b/docs/src/tutorials/temporal_networks.md @@ -0,0 +1,12 @@ +```@meta +EditURL = "../../literate/tutorials/temporal_networks.jl" +``` + +# Decorated Graphon Tutorial for Temporal Networks + +# How to use NetworkHistogram.jl for Temporal Networks + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/docs/src/tutorials/weighted_network.md b/docs/src/tutorials/weighted_network.md new file mode 100644 index 0000000..3b22b71 --- /dev/null +++ b/docs/src/tutorials/weighted_network.md @@ -0,0 +1,12 @@ +```@meta +EditURL = "../../literate/tutorials/weighted_network.jl" +``` + +# Decorated Graphon Tutorial for Weighted Networks + +# How to use NetworkHistogram.jl for Weighted Networks + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/ext/MakieExt.jl 
b/ext/MakieExt.jl index e89f3d8..c5a1b58 100644 --- a/ext/MakieExt.jl +++ b/ext/MakieExt.jl @@ -4,10 +4,13 @@ using NetworkHistogram using Makie import NetworkHistogram: get_probability_matrix, Assignment, heatmap_params, params, - number_nodes + number_nodes, number_groups, Dist, BlockModel import Distributions: params -_splatter_args(ps) = vcat(vec.(ps)...) +vec_mine(x) = vec(x) +vec_mine(x::Real) = x + +_splatter_args(ps) = vcat(vec_mine.(ps)...) _extract_params(d) = _splatter_args(params(d)) function Makie.convert_arguments(::Type{<:AbstractPlot}, a::Assignment) @@ -16,14 +19,28 @@ function Makie.convert_arguments(::Type{<:AbstractPlot}, a::Assignment) return ps end -function heatmap_params(a; colormap = :balance, ordering = false) - params_matrix = map(_extract_params, get_probability_matrix(a)) +function Makie.convert_arguments(::Type{<:Heatmap}, + sbm::BlockModel{D}) where {D <: Union{ + Dist{T}, T} where {T <: NetworkHistogram.Bernoulli}} + return (0:0.01:1, 0:0.01:1, (x, y) -> first(params(sbm[x, y]))) +end + +function heatmap_params(a; colormap = :binary, ordering = false, + colorrange = nothing, group_match = 1:number_groups(a)) + node_labels_new = map(x -> group_match[x], a.node_labels) + + params_matrix = map( + _extract_params, get_probability_matrix(a, nothing, node_labels_new)) + if ordering perm = sortperm(a.node_labels) else perm = 1:number_nodes(a) end params_matrix = params_matrix[perm, perm] + if isnothing(colorrange) + colorrange = extrema(_splatter_args(params_matrix)) + end num_params = length(params_matrix[1, 2]) # Compute rows and columns such that rows * columns >= num_params and as square as possible rows = floor(Int, sqrt(num_params)) @@ -31,16 +48,37 @@ function heatmap_params(a; colormap = :balance, ordering = false) if rows * cols < num_params rows += 1 end - fig = Figure(size = (800, 800)) + default_size = 300 + fig = Figure() # create a grid of subplots with rows x cols cells - axes = [Axis(fig[i, j]) for i in 1:rows, j in 1:cols] 
+ axes = [Axis(fig[i, j], width = default_size, height = default_size) + for i in 1:rows, j in 1:cols] for i in 1:num_params - heatmap!(axes[i], getindex.(params_matrix, i)[perm, perm], colormap = colormap) + heatmap!(axes[i], getindex.(params_matrix, i)[perm, perm], + colormap = colormap, colorrange = colorrange) axes[i].title = "Parameter $i" end + Colorbar(fig[1:rows, cols + 1], limits = colorrange, colormap = colormap, + label = "Parameter value", width = ceil(Int, sqrt(default_size))) + resize_to_layout!(fig) return fig end +function order_groups(a::Assignment, latents::AbstractVector) + n = number_nodes(a) + k = number_groups(a) + sort_perm = sortperm(latents) + sorted_group_labels = a.node_labels[sort_perm] + dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] + counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) + for group in 1:k) + return sort(1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) +end + +function align_sbm_true_latents!(sbm::NetworkHistogram.BlockModel, a::Assignment, latents) + NetworkHistogram.align_sbm!(sbm, order_groups(a, latents)) +end + export heatmap_params end diff --git a/src/block_model.jl b/src/block_model.jl index 9fdfb55..cf6e988 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -1,5 +1,5 @@ -struct BlockModel{D, V} - _dists::SymArray{D} +struct BlockModel{D, V, M <: AbstractMatrix{D}} + _dists::M sizes::V cum_sizes::V end @@ -122,6 +122,35 @@ function get_probability_matrix( return A end -function get_probability_matrix(a::Assignment, default_dist = nothing) - return get_probability_matrix(BlockModel(a.θ), a.node_labels, default_dist) +function get_probability_matrix( + a::Assignment, default_dist = nothing, node_labels = a.node_labels) + return get_probability_matrix(BlockModel(a.θ), node_labels, default_dist) +end + +function align_sbm!(sbm::BlockModel, perm) + sbm._dists .= sbm._dists[perm, perm] + sbm.sizes .= sbm.sizes[perm] + sbm.cum_sizes .= 
cumsum(sbm.sizes) +end + +""" + order_groups(a::Assignment, latents::AbstractVector) + +Order the groups of an assignment according to the true latents. This is an heuristic +approach, which is not guaranteed to find the true ordering of the groups. +""" +function order_groups(a::Assignment, latents::AbstractVector) + n = number_nodes(a) + k = number_groups(a) + sort_perm = sortperm(latents) + sorted_group_labels = a.node_labels[sort_perm] + dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] + counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) + for group in 1:k) + return sort( + 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) +end + +function align_sbm_true_latents!(sbm::BlockModel, a::Assignment, latents) + align_sbm!(sbm, order_groups(a, latents)) end From ace7386dbbde8dee9ce817edf1f79bc1dce7c160 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 10 Oct 2025 13:41:32 +0200 Subject: [PATCH 186/266] silence some output in tutorial --- docs/literate/tutorials/simple_graph.jl | 6 +++--- docs/src/tutorials/simple_graph.md | 9 ++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index 5a850dc..0543fcd 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -63,7 +63,7 @@ end # Now, let's sample a graph with 400 nodes from our graphon `W`. n = 400 -A, u_true = sample_graph(W, n) +A, u_true = sample_graph(W, n); # We can visualize the adjacency matrix of the sampled graph. # To make the block structure visible, we sort the nodes by their latent positions. @@ -95,7 +95,7 @@ end # First, we need to represent our graph in a format that the package understands. # We can use an `EdgeList` to store the edges of the graph. -edge_list = EdgeList(A) +edge_list = EdgeList(A); # We also need to define the model for the edges. 
Since our graph is unweighted, # we can use a `Bernoulli` distribution. The `Dist` wrapper is used to @@ -107,7 +107,7 @@ dist = NetworkHistogram.Bernoulli(0.5) # The initial probability doesn't matter k = floor(Int, sqrt(n)) oracle_labels = inverse_rle(1:k, fill(n ÷ k, k)) -initial_assignment = shuffle(oracle_labels) +initial_assignment = shuffle(oracle_labels); # Now, we create an `Assignment` object, which holds all the information # about the model and the current state of the node groupings. diff --git a/docs/src/tutorials/simple_graph.md b/docs/src/tutorials/simple_graph.md index bfe8bb9..1551ad7 100644 --- a/docs/src/tutorials/simple_graph.md +++ b/docs/src/tutorials/simple_graph.md @@ -79,7 +79,8 @@ Now, let's sample a graph with 400 nodes from our graphon `W`. ````@example simple_graph n = 400 -A, u_true = sample_graph(W, n) +A, u_true = sample_graph(W, n); +nothing #hide ```` We can visualize the adjacency matrix of the sampled graph. @@ -118,7 +119,8 @@ First, we need to represent our graph in a format that the package understands. We can use an `EdgeList` to store the edges of the graph. ````@example simple_graph -edge_list = EdgeList(A) +edge_list = EdgeList(A); +nothing #hide ```` We also need to define the model for the edges. Since our graph is unweighted, @@ -136,7 +138,8 @@ We start with a random initial assignment of nodes to `k=5` groups. 
k = floor(Int, sqrt(n)) oracle_labels = inverse_rle(1:k, fill(n ÷ k, k)) -initial_assignment = shuffle(oracle_labels) +initial_assignment = shuffle(oracle_labels); +nothing #hide ```` Now, we create an `Assignment` object, which holds all the information From a5e3a2735d8de11bfa0dc358e1591aadee468ec0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 15 Oct 2025 14:23:01 +0200 Subject: [PATCH 187/266] add docs and clean codebase --- Project.toml | 5 +- README.md | 106 +++++++++++++- ext/MakieExt.jl | 81 ++++++++++- src/EdgeList.jl | 131 +++++++++++++++-- src/api.jl | 83 ++++++++++- src/assignment.jl | 171 ++++++++++++++++------ src/block_model.jl | 186 +++++++++++++++++++++++- src/distributions/cat.jl | 3 +- src/distributions/distributions_type.jl | 177 ++++++++++++++++++++-- src/optimization/greedy.jl | 77 ++++++++++ src/utils/SymArray.jl | 67 ++++++++- 11 files changed, 995 insertions(+), 92 deletions(-) diff --git a/Project.toml b/Project.toml index 5b2b87b..2d1219b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,11 +1,12 @@ name = "NetworkHistogram" uuid = "7806f430-7229-459c-b2e6-df35e8e4eb5d" -authors = ["Charles Dufour", "Jake Grainger"] version = "0.5.2" +authors = ["Charles Dufour", "Jake Grainger"] [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" @@ -25,6 +26,8 @@ DiscretizeExt = "DiscretizeDistributions" LightMCExt = "LightMC" MakieExt = "Makie" +[compat] +Graphons = "0.1.0" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/README.md b/README.md index 4b8d822..90614ae 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,8 @@ Pkg.add("NetworkHistogram") ## Usage +### Basic Usage + We fit the estimator and then extract the estimated graphon matrix and node labels. 
@@ -58,7 +60,7 @@ A[diagind(A)] .= 0 # approximate the graphon with a network histogram hist = graphhist(A) -# get the graphist structure +# get the graphhist structure estimate = hist.graphhist # get the estimated graphon matrix @@ -68,5 +70,105 @@ sbm_matrix = estimate.θ node_labels = estimate.node_labels ``` +### Advanced Usage with Custom Parameters + You can control the optimization process by modifying the rules used in the -optimization. Check out the docs for more information. +optimization: + +```julia +using NetworkHistogram + +# Binary network +A = Symmetric(rand(0:1, 100, 100)) +A[diagind(A)] .= 0 + +# Initial partition into k groups +k = 3 +initial_labels = rand(1:k, 100) + +# Configure optimization parameters +params = GreedyParams( + 50_000, # Maximum iterations + RandomNodeSwap(), # How to select nodes to swap + Strict(), # Only accept improvements + PreviousBestValue(5000), # Stop after 5000 iterations without improvement + true # Show progress bar +) + +# Fit the network histogram +result = nethist(A, Bernoulli(0.5), initial_labels, params) + +# Extract results +ll = loglikelihood(result) +block_params = result.θ +node_groups = result.node_labels +``` + +### Working with Different Edge Types + +The package supports various edge types through custom distributions: + +```julia +using NetworkHistogram +using Distributions # For standard distributions + +# Example 1: Weighted networks with continuous edges +W = Symmetric(rand(100, 100)) +W[diagind(W)] .= 0 +# You can use any distribution that implements the required interface + +# Example 2: Count data (e.g., number of interactions) +C = Symmetric(rand(Poisson(2), 100, 100)) +C[diagind(C)] .= 0 +# Use appropriate count distribution + +# Example 3: Sparse networks with missing edges +A_sparse = Symmetric(rand([0, 1, missing], 100, 100)) +A_sparse[diagind(A_sparse)] .= 0 +# Missing values are treated as absent edges +``` + +### Visualizing Results (with Makie.jl) + +```julia +using NetworkHistogram 
+using CairoMakie # or GLMakie + +# Fit model +A = Symmetric(rand(0:1, 100, 100)) +A[diagind(A)] .= 0 +result = nethist(A, Bernoulli(0.5), rand(1:3, 100), GreedyParams()) + +# Create heatmap of estimated parameters +fig = heatmap_params(result, ordering=true, colormap=:viridis) +save("network_histogram.png", fig) +``` + +### Sampling from a Block Model + +```julia +using NetworkHistogram + +# Define a 3-block model +k = 3 +bm = BlockModel(k, Bernoulli(0.5)) + +# Set custom edge probabilities between blocks +bm[1, 1] = Bernoulli(0.8) # High within-group connectivity +bm[2, 2] = Bernoulli(0.7) +bm[3, 3] = Bernoulli(0.6) +bm[1, 2] = Bernoulli(0.1) # Low between-group connectivity +bm[1, 3] = Bernoulli(0.05) +bm[2, 3] = Bernoulli(0.05) + +# Sample a network +n_nodes = 150 +latents, A = sample(bm, n_nodes) + +# latents contains the true block assignments +# A is the sampled adjacency matrix +``` + +Check out the +[documentation](https://sds-epfl.github.io/NetworkHistogram.jl/dev/) for more +examples and detailed API information. diff --git a/ext/MakieExt.jl b/ext/MakieExt.jl index c5a1b58..ce0a3fe 100644 --- a/ext/MakieExt.jl +++ b/ext/MakieExt.jl @@ -1,30 +1,66 @@ +""" +MakieExt - Visualization extension for NetworkHistogram + +Provides plotting capabilities for Assignment and BlockModel objects using Makie.jl. +""" module MakieExt using NetworkHistogram using Makie +using StatsBase: countmap -import NetworkHistogram: get_probability_matrix, Assignment, heatmap_params, params, +import NetworkHistogram: get_probability_matrix, Assignment, heatmap_params, number_nodes, number_groups, Dist, BlockModel -import Distributions: params +import Distributions +import StatsAPI +# Helper functions to extract distribution parameters vec_mine(x) = vec(x) vec_mine(x::Real) = x _splatter_args(ps) = vcat(vec_mine.(ps)...) 
-_extract_params(d) = _splatter_args(params(d)) +_extract_params(d) = _splatter_args(StatsAPI.params(d)) + +""" + Makie.convert_arguments(::Type{<:AbstractPlot}, a::Assignment) +Convert an Assignment to plottable data by extracting distribution parameters. +""" function Makie.convert_arguments(::Type{<:AbstractPlot}, a::Assignment) params_matrix = map(_extract_params, get_probability_matrix(a)) ps = (getindex.(params_matrix, i) for i in 1:length(params_matrix[1, 2])) return ps end +""" + Makie.convert_arguments(::Type{<:Heatmap}, sbm::BlockModel) + +Convert a BlockModel to heatmap arguments for Bernoulli distributions. +""" function Makie.convert_arguments(::Type{<:Heatmap}, sbm::BlockModel{D}) where {D <: Union{ Dist{T}, T} where {T <: NetworkHistogram.Bernoulli}} - return (0:0.01:1, 0:0.01:1, (x, y) -> first(params(sbm[x, y]))) + return (0:0.01:1, 0:0.01:1, (x, y) -> first(StatsAPI.params(sbm[x, y]))) end +""" + heatmap_params(a; colormap=:binary, ordering=false, colorrange=nothing, group_match=1:number_groups(a)) + +Create a heatmap visualization of distribution parameters in an Assignment. + +# Arguments +- `a::Assignment`: The assignment to visualize +- `colormap`: Color scheme for the heatmap (default: :binary) +- `ordering::Bool`: Whether to sort nodes by their group labels (default: false) +- `colorrange`: Range for color mapping (default: auto-computed from data) +- `group_match`: Mapping for group indices (default: identity) + +# Returns +- `Figure`: Makie figure with heatmap(s) showing distribution parameters + +# Note +For multi-parameter distributions, creates a grid of heatmaps, one per parameter. 
+""" function heatmap_params(a; colormap = :binary, ordering = false, colorrange = nothing, group_match = 1:number_groups(a)) node_labels_new = map(x -> group_match[x], a.node_labels) @@ -38,19 +74,23 @@ function heatmap_params(a; colormap = :binary, ordering = false, perm = 1:number_nodes(a) end params_matrix = params_matrix[perm, perm] + if isnothing(colorrange) colorrange = extrema(_splatter_args(params_matrix)) end + num_params = length(params_matrix[1, 2]) - # Compute rows and columns such that rows * columns >= num_params and as square as possible + # Compute grid dimensions: make as square as possible rows = floor(Int, sqrt(num_params)) cols = ceil(Int, num_params / rows) if rows * cols < num_params rows += 1 end + default_size = 300 fig = Figure() - # create a grid of subplots with rows x cols cells + + # Create grid of subplots axes = [Axis(fig[i, j], width = default_size, height = default_size) for i in 1:rows, j in 1:cols] @@ -59,12 +99,28 @@ function heatmap_params(a; colormap = :binary, ordering = false, colormap = colormap, colorrange = colorrange) axes[i].title = "Parameter $i" end + Colorbar(fig[1:rows, cols + 1], limits = colorrange, colormap = colormap, label = "Parameter value", width = ceil(Int, sqrt(default_size))) resize_to_layout!(fig) return fig end +""" + order_groups(a::Assignment, latents::AbstractVector) + +Order groups based on true latent variables (heuristic alignment). + +This is a heuristic approach to match estimated groups to ground truth orderings +by analyzing the overlap between sorted latents and group assignments. 
+ +# Arguments +- `a::Assignment`: The assignment with estimated groups +- `latents::AbstractVector`: True latent variables (e.g., block memberships) + +# Returns +- Permutation vector for reordering groups +""" function order_groups(a::Assignment, latents::AbstractVector) n = number_nodes(a) k = number_groups(a) @@ -76,6 +132,19 @@ function order_groups(a::Assignment, latents::AbstractVector) return sort(1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) end +""" + align_sbm_true_latents!(sbm::NetworkHistogram.BlockModel, a::Assignment, latents) + +Align a BlockModel's groups to match true latent variables. + +# Arguments +- `sbm::BlockModel`: The block model to align (modified in-place) +- `a::Assignment`: The assignment +- `latents`: True latent variables + +# Note +This modifies `sbm` in-place to reorder its blocks. +""" function align_sbm_true_latents!(sbm::NetworkHistogram.BlockModel, a::Assignment, latents) NetworkHistogram.align_sbm!(sbm, order_groups(a, latents)) end diff --git a/src/EdgeList.jl b/src/EdgeList.jl index f1b6836..7c20abe 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -1,27 +1,123 @@ +""" + EdgeList{E} + +A memory-efficient adjacency list representation for sparse networks. 
+ +# Fields +- `data::Vector{Vector{E}}`: For each node, stores the edge values to its neighbors +- `name_list::Vector{Vector{Int}}`: For each node, stores the node indices of its neighbors + +# Type Parameters +- `E`: The type of edge values (e.g., Int, Float64, or custom distribution types) + +# Examples +```julia +# From an adjacency matrix +A = [0 1 0; 1 0 1; 0 1 0] +edges = EdgeList(A) + +# Access neighbors of node 1 +neighbor_indices, edge_values = neighbors(edges, 1) + +# Iterate through neighbors +for (neighbor, edge) in iterate_neighbors(edges, 1) + println("Edge to node ", neighbor, " with value ", edge) +end +``` + +See also: [`neighbors`](@ref), [`iterate_neighbors`](@ref), [`get_edge`](@ref) +""" struct EdgeList{E} data::Vector{Vector{E}} name_list::Vector{Vector{Int}} end -function neighbors(A::EdgeList{E}, i::Int) where {E} +""" + neighbors(A::EdgeList, i::Int) + +Get the neighbor indices and edge values for node `i`. + +Returns a tuple `(neighbor_indices, edge_values)` where each vector has the same length. + +# Example +```julia +edges = EdgeList(A) +neighbor_nodes, edge_vals = neighbors(edges, 1) +``` +""" +@inline function neighbors(A::EdgeList{E}, i::Int) where {E} + @boundscheck checkbounds(A.data, i) + @boundscheck checkbounds(A.name_list, i) return A.name_list[i], A.data[i] end -iterate_neighbors(A::EdgeList, i::Int) = zip(neighbors(A, i)...) -edge_type(A::EdgeList{E}) where {E} = E -nodes(edgelist::EdgeList) = length(edgelist.data) -number_nodes(edgelist::EdgeList) = nodes(edgelist) +""" + iterate_neighbors(A::EdgeList, i::Int) + +Returns an iterator over (neighbor_index, edge_value) pairs for node `i`. + +# Example +```julia +for (j, edge) in iterate_neighbors(edges, i) + # Process edge from i to j +end +``` +""" +@inline iterate_neighbors(A::EdgeList, i::Int) = zip(neighbors(A, i)...) +""" + edge_type(A::EdgeList{E}) + +Get the element type `E` of edges stored in the EdgeList. 
+""" +@inline edge_type(A::EdgeList{E}) where {E} = E + +""" + nodes(edgelist::EdgeList) + number_nodes(edgelist::EdgeList) + +Return the number of nodes in the network. +""" +@inline nodes(edgelist::EdgeList) = length(edgelist.data) +@inline number_nodes(edgelist::EdgeList) = nodes(edgelist) + +""" + EdgeList(A::AbstractMatrix{<:Union{Missing, E}}) where {E} + +Construct an EdgeList from an adjacency matrix. Missing values are treated as absent edges, +and diagonal entries are excluded (no self-loops). + +# Arguments +- `A::AbstractMatrix`: Adjacency matrix where `missing` indicates absent edges + +# Example +```julia +A = [0 1 missing; 1 0 2; missing 2 0] +edges = EdgeList(A) +``` +""" function EdgeList(A::AbstractMatrix{<:Union{Missing, E}}) where {E} _from_adj_to_edge_list(A) end EdgeList(adj_list::EdgeList) = adj_list +""" + get_edge(A::EdgeList{E}, i::Int, j::Int) where {E} + +Get the edge value between nodes `i` and `j`. Returns `zero(E)` if no edge exists or if `i == j`. + +# Arguments +- `A::EdgeList{E}`: The edge list +- `i::Int`: Source node index +- `j::Int`: Target node index + +# Returns +- Edge value of type `E`, or `zero(E)` if no edge exists +""" function get_edge(A::EdgeList{E}, i::Int, j::Int) where {E} if i == j return zero(E) end - # TODO: probably can remove this if j ∉ A.name_list[i] && i ∉ A.name_list[j] return zero(E) end @@ -30,6 +126,7 @@ function get_edge(A::EdgeList{E}, i::Int, j::Int) where {E} return e end end + return zero(E) # If edge not found in the iteration end # function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} @@ -49,6 +146,7 @@ end # return EdgeList(data, name_list) # end +# Internal function to convert adjacency matrix to EdgeList format function _from_adj_to_edge_list( A::AbstractMatrix, function_to_apply = identity) n = size(A, 1) @@ -60,9 +158,8 @@ function _from_adj_to_edge_list( data[j] = Vector{typeof(test)}(undef, 0) name_list[j] = Vector{Int}(undef, 0) for i in 1:n - if !ismissing(A[i, j]) - end - if 
!ismissing(A[i, j]) && i != j # gonna be an issue with MC! have to define 0 chain and fast operations on them + # Exclude diagonal and missing edges + if !ismissing(A[i, j]) && i != j push!(name_list[j], i) push!(data[j], function_to_apply(A[i, j])) end @@ -71,6 +168,7 @@ function _from_adj_to_edge_list( return EdgeList(data, name_list) end +# Internal functions for preprocessing edge data function _fast_compressed_obs(d::Dist, A::AbstractMatrix, zeroinflated) _from_adj_to_edge_list(A, x -> _fast_compressed_obs(d, x, zeroinflated)) end @@ -78,8 +176,8 @@ function _fast_compressed_obs(d::Dist, A::EdgeList{E}, zeroinflated) where {E} _make_shift_broadcast(A.data, x -> _fast_compressed_obs(d, x, zeroinflated)) end +# Internal function to apply a transformation to EdgeList data function _make_shift_broadcast(A::EdgeList, f) - # may work ? -> data = f.(A.data) n = length(A.data) test = f(A.data[1][1]) data = Vector{Vector{typeof(test)}}(undef, n) @@ -89,8 +187,19 @@ function _make_shift_broadcast(A::EdgeList, f) return EdgeList(data, A.name_list) end -#convert(::Type{EdgeList}, A::AbstractMatrix) = EdgeList(A) +""" + fit(d::Dist, A::EdgeList{E}) where {E} + +Fit the distribution `d` to each edge in the EdgeList `A`, returning a new EdgeList +where each edge is replaced by its fitted distribution. + +# Arguments +- `d::Dist`: The distribution type to fit +- `A::EdgeList{E}`: EdgeList containing edge observations +# Returns +- `EdgeList{typeof(d)}`: New EdgeList with fitted distributions +""" function fit(d::Dist, A::EdgeList{E}) where {E} new_data = Vector{Vector{typeof(d)}}(undef, length(A.data)) for j in 1:length(A.data) diff --git a/src/api.jl b/src/api.jl index 1e91a9b..7806e30 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,9 +1,87 @@ +""" + nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated::Bool = false) + +Estimate a network histogram (stochastic block model) from network data. 
+ +This is the main entry point for fitting a network histogram to your data. It performs +preprocessing, optimization, and returns an Assignment representing the estimated model. + +# Arguments +- `data_input`: Network data (adjacency matrix or EdgeList) +- `dist_user`: Reference distribution for edge values (e.g., Bernoulli, Categorical) +- `initial_node_labels`: Initial group assignment for nodes (vector of integers 1:k) +- `params::GreedyParams`: Optimization parameters +- `zero_inflated::Bool`: Whether to use zero-inflated version of distribution (default: false) + +# Returns +- `Assignment`: The fitted network histogram with optimized node groups and parameters + +# Throws +- `ArgumentError`: If input validation fails + +# Examples +```julia +using NetworkHistogram, LinearAlgebra + +# Binary network +A = Symmetric(rand(0:1, 100, 100)) +A[diagind(A)] .= 0 + +# Initial partition into 3 groups +initial_labels = rand(1:3, 100) + +# Fit network histogram +params = GreedyParams() +result = nethist(A, Bernoulli(0.5), initial_labels, params) + +# Extract results +block_matrix = result.θ +node_groups = result.node_labels +ll = loglikelihood(result) +``` + +See also: [`GreedyParams`](@ref), [`Assignment`](@ref), [`BlockModel`](@ref) +""" function nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated::Bool = false) + # Input validation + if data_input isa AbstractMatrix + n_rows, n_cols = size(data_input) + if n_rows != n_cols + throw(ArgumentError("Adjacency matrix must be square, got size ($n_rows, $n_cols)")) + end + n = n_rows + elseif data_input isa EdgeList + n = number_nodes(data_input) + else + throw(ArgumentError("data_input must be an AbstractMatrix or EdgeList")) + end + + if length(initial_node_labels) != n + throw(ArgumentError("initial_node_labels length ($(length(initial_node_labels))) must match number of nodes ($n)")) + end + + k = length(unique(initial_node_labels)) + if k < 1 + throw(ArgumentError("Must have at least 
one group, got $k groups")) + end + if k > n + throw(ArgumentError("Number of groups ($k) cannot exceed number of nodes ($n)")) + end + + if !all(x -> x isa Integer && 1 <= x <= k, initial_node_labels) + throw(ArgumentError("initial_node_labels must contain integers in range 1:$k")) + end + + if params.max_iter < 1 + throw(ArgumentError("max_iter must be positive, got $(params.max_iter)")) + end + return _nethist( data_input, dist_user, initial_node_labels, params, Val(zero_inflated)) end +# Internal implementation with compile-time zero-inflation flag function _nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated) @debug "preprocessing data" @@ -17,6 +95,8 @@ function _nethist(data_input, dist_user, initial_node_labels, return postprocess(out) end +# Helper functions for preprocessing + function get_ref_dist(dist::D, ::Val{true}) where {D} return Dist(ZeroInflated(dist)) end @@ -31,7 +111,4 @@ end function postprocess(out) return out - return Assignment(out.node_labels, out.edges, out.dists, SymArray(unwrap.(out.θ)), - out.log_likelihood, out.additional_workspace) - return out.node_labels, BlockModel(out) end diff --git a/src/assignment.jl b/src/assignment.jl index c523713..1fe2601 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -1,43 +1,41 @@ -# """ -# Array-like storage for the number of nodes in each group. Try to split the number of nodes -# into equal groups, but if it is not possible, the last group may have more nodes. -# """ -# struct GroupSize{T} <: AbstractVector{Int} -# group_number::T -# number_groups::Int - -# function GroupSize(number_nodes, h::Real) -# @assert 0 < h < 1 -# standard_group = floor(Int, number_nodes * h) -# GroupSize(number_nodes, standard_group) -# end - -# function GroupSize(number_nodes, standard_group::Integer) -# @assert 1 < standard_group <= number_nodes -# number_groups = number_nodes ÷ standard_group # number of standard groups! 
-# if number_groups * standard_group == number_nodes -# new{Int}(standard_group, number_groups) -# else -# remainder_group = standard_group + -# mod(number_nodes, standard_group) -# new{Tuple{Int, Int}}( -# (standard_group, remainder_group), number_groups) -# end -# end -# end - -# Base.size(g::GroupSize) = (g.number_groups,) -# Base.@propagate_inbounds function Base.getindex(g::GroupSize{Int}, i::Int) -# @boundscheck checkbounds(g, i) -# return g.group_number -# end - -# Base.@propagate_inbounds function Base.getindex( -# g::GroupSize{Tuple{Int, Int}}, i::Int) -# @boundscheck checkbounds(g, i) -# return i < length(g) ? g.group_number[1] : g.group_number[2] -# end +""" + Assignment{E, D, F, W, V <: AbstractVector{Int}} +Represents a network histogram: a partition of nodes into groups along with +edge distributions between groups. + +# Fields +- `node_labels::V`: Vector assigning each node to a group (1-indexed) +- `edges::EdgeList{E}`: The observed edge data +- `dists::EdgeList{D}`: Fitted distributions for each edge +- `θ::SymArray{D}`: Symmetric matrix of aggregated distributions between groups +- `log_likelihood::SymArray{F}`: Symmetric matrix of log-likelihoods for each group pair +- `additional_workspace::W`: Optional workspace for optimization algorithms + +# Type Parameters +- `E`: Type of edge observations +- `D`: Type of fitted distributions +- `F`: Type for log-likelihood values (typically Float64) +- `W`: Type for additional workspace data +- `V`: Vector type for node labels + +# Examples +```julia +# Create assignment from node labels and edge data +node_labels = [1, 1, 2, 2, 3] +edges = EdgeList(adjacency_matrix) +dist = Dist(Bernoulli(0.5)) +assignment = Assignment(node_labels, edges, dist) + +# Query assignment properties +k = number_groups(assignment) +n = number_nodes(assignment) +ll = loglikelihood(assignment) +group_i = group(assignment, node_i) +``` + +See also: [`BlockModel`](@ref), [`EdgeList`](@ref), [`Dist`](@ref) +""" mutable struct 
Assignment{E, D, F, W, V <: AbstractVector{Int}} node_labels::V const edges::EdgeList{E} @@ -47,21 +45,80 @@ mutable struct Assignment{E, D, F, W, V <: AbstractVector{Int}} additional_workspace::W end -number_nodes(a::Assignment) = length(a.node_labels) -number_groups(a::Assignment) = size(a.θ, 1) +""" + number_nodes(a::Assignment) + +Return the number of nodes in the network. +""" +@inline number_nodes(a::Assignment) = length(a.node_labels) + +""" + number_groups(a::Assignment) +Return the number of groups (blocks) in the partition. +""" +@inline number_groups(a::Assignment) = size(a.θ, 1) + +""" + proportions(a::Assignment) + +Calculate the proportion of nodes in each group. + +# Returns +- Vector of proportions summing to 1.0 +""" function proportions(a::Assignment) return counts(a.node_labels) / number_nodes(a) end -function loglikelihood(a::Assignment) +""" + loglikelihood(a::Assignment) + +Calculate the total log-likelihood of the assignment. + +The log-likelihood measures how well the stochastic block model (with the current +node partition) fits the observed network data. + +# Returns +- `Float64`: Total log-likelihood value +""" +@inline function loglikelihood(a::Assignment) return FastSymArray.sum_tri_with_diag(a.log_likelihood) end -function group(a::Assignment, node::Int) - return a.node_labels[node] +""" + group(a::Assignment, node::Int) + +Get the group label for a specific node. + +# Arguments +- `a::Assignment`: The assignment +- `node::Int`: Node index (1-indexed) + +# Returns +- `Int`: Group index that the node belongs to +""" +@inline function group(a::Assignment, node::Int) + @boundscheck checkbounds(a.node_labels, node) + @inbounds return a.node_labels[node] end +""" + get_edges_in_groups(a::Assignment, g1::Int, g2::Int) + +Extract all edges between two groups. 
+ +# Arguments +- `a::Assignment`: The assignment +- `g1::Int`: First group index +- `g2::Int`: Second group index + +# Returns +- `Vector{E}`: Vector of edge values between the two groups + +# Note +For within-group edges (g1 == g2), only returns edges where i < j to avoid duplicates. +""" function get_edges_in_groups(a::Assignment, g1::Int, g2::Int) return get_edges_in_groups(a.node_labels, a.edges, g1, g2) end @@ -81,6 +138,27 @@ function get_edges_in_groups(node_labels, edges_all, g1, g2) return edges end +""" + Assignment(node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} + +Construct an Assignment from node labels, edge data, and a reference distribution. + +This constructor fits the distribution to the data, computes the block-level parameters +θ, and calculates the log-likelihood. + +# Arguments +- `node_labels`: Vector of group assignments for each node +- `edge_list::EdgeList{E}`: Edge observations +- `dist::Dist{D}`: Reference distribution to fit to the data + +# Example +```julia +node_labels = [1, 1, 2, 2] +edges = EdgeList(A) +dist = Dist(Bernoulli(0.5)) +assignment = Assignment(node_labels, edges, dist) +``` +""" function Assignment( node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} dists = fit(dist, edge_list) @@ -88,11 +166,14 @@ function Assignment( return Assignment(node_labels, edge_list, dists, θ, ll, nothing) end +# Internal function to compute θ parameters and log-likelihood for each group pair function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} number_groups = length(unique(node_labels)) θ = SymArray(number_groups, zero(dist)) log_likelihood = SymArray(number_groups, 0.0) + + # Aggregate distributions for each group pair for u in 1:nodes(dists) g1 = node_labels[u] for (v, d) in iterate_neighbors(dists, u) @@ -102,6 +183,8 @@ function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, end end end + + # Compute log-likelihood for each 
group pair for u in 1:nodes(dists) g1 = node_labels[u] for (v, e) in iterate_neighbors(edge_list, u) diff --git a/src/block_model.jl b/src/block_model.jl index cf6e988..f4caca2 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -1,9 +1,65 @@ +""" + BlockModel{D, V, M <: AbstractMatrix{D}} + +A stochastic block model representation for network generation and analysis. + +A block model is a piecewise constant graphon approximation where nodes are divided +into K blocks, and edges between blocks follow specific distributions. + +# Fields +- `_dists::M`: Symmetric K×K matrix of edge distributions between blocks +- `sizes::V`: Proportions of nodes in each block (sums to 1.0) +- `cum_sizes::V`: Cumulative proportions for mapping latent variables to blocks + +# Type Parameters +- `D`: Distribution type for edges (e.g., Bernoulli, Categorical, etc.) +- `V`: Vector type for storing proportions +- `M`: Matrix type for storing distributions + +# Constructors + +```julia +# Uniform block sizes with k blocks +BlockModel(k::Int, d::D) + +# From an Assignment +BlockModel(a::Assignment) + +# From node labels and parameter matrix +BlockModel(node_labels, θ) + +# From a distribution matrix (infers uniform block sizes) +BlockModel(θ::AbstractMatrix) +``` + +# Examples +```julia +# Create a 3-block model with Bernoulli edges +bm = BlockModel(3, Bernoulli(0.5)) + +# Sample a network from the block model +latents, A = sample(bm, 100) # 100 nodes + +# Access block-to-block distribution +dist_12 = bm[1, 2] + +# Map a latent variable to a block +block = map_ξ_to_block(bm, 0.3) +``` + +See also: [`Assignment`](@ref), [`sample`](@ref), [`get_probability_matrix`](@ref) +""" struct BlockModel{D, V, M <: AbstractMatrix{D}} _dists::M sizes::V cum_sizes::V end +""" + BlockModel(k::Int, d::D) where {D} + +Create a block model with `k` uniform-sized blocks, each initialized with distribution `d`. 
+""" function BlockModel(k::Int, d::D) where {D} sizes = fill(1 / k, k) cumulative_sizes = cumsum(sizes) @@ -11,6 +67,12 @@ function BlockModel(k::Int, d::D) where {D} return BlockModel(_dists, sizes, cumulative_sizes) end +""" + BlockModel(a::Assignment) + +Create a BlockModel from an Assignment, extracting the block proportions and +fitted distributions. +""" function BlockModel(a::Assignment) k = length(unique(a.node_labels)) sizes = proportions(a) @@ -19,6 +81,11 @@ function BlockModel(a::Assignment) return BlockModel(_dists, sizes, cumulative_sizes) end +""" + BlockModel(nodes_labels, θ) + +Create a BlockModel from node labels and a distribution matrix θ. +""" function BlockModel(nodes_labels, θ) k = length(unique(nodes_labels)) sizes = counts(nodes_labels) / length(nodes_labels) @@ -27,6 +94,11 @@ function BlockModel(nodes_labels, θ) return BlockModel(_dists, sizes, cumulative_sizes) end +""" + BlockModel(θ::AbstractMatrix{D}) where {D} + +Create a BlockModel from a distribution matrix, assuming uniform block sizes. +""" function BlockModel(θ::AbstractMatrix{D}) where {D} k = size(θ, 1) sizes = fill(1 / k, k) @@ -35,15 +107,54 @@ function BlockModel(θ::AbstractMatrix{D}) where {D} return BlockModel(_dists, sizes, cumulative_sizes) end +""" + map_ξ_to_block(bm::BlockModel, ξ::Real) + +Map a latent variable ξ ∈ [0,1] to its corresponding block index. + +# Arguments +- `bm::BlockModel`: The block model +- `ξ::Real`: Latent variable in [0, 1] + +# Returns +- `Int`: Block index (1 to k) +""" function map_ξ_to_block(bm::BlockModel, ξ::T) where {T <: Real} return findfirst(x -> x >= ξ, bm.cum_sizes) end +""" + sample(bm::BlockModel, latents::Int, args...) + +Sample a network from the block model by first generating `latents` random latent +variables, then sampling edges according to the block distributions. 
+ +# Arguments +- `bm::BlockModel`: The block model to sample from +- `latents::Int`: Number of nodes to generate +- `args...`: Additional arguments passed to edge sampling + +# Returns +- Tuple of (latent_assignments, adjacency_matrix) +""" function sample(bm::BlockModel, latents::Int, args...) latents = map(x -> map_ξ_to_block(bm, x), rand(latents)) return latents, sample(bm, latents, args...) end +""" + sample(bm::BlockModel, latents::Vector, args...) + +Sample a network from the block model given specific latent block assignments. + +# Arguments +- `bm::BlockModel`: The block model to sample from +- `latents::Vector`: Block assignments for each node +- `args...`: Additional arguments passed to edge sampling + +# Returns +- Adjacency matrix with sampled edges +""" function sample(bm::BlockModel, latents::Vector{T}, args...) where {T} A = Array{eltype(bm[1, 1]), 2}(undef, length(latents), length(latents)) for j in 1:length(latents) @@ -51,20 +162,18 @@ function sample(bm::BlockModel, latents::Vector{T}, args...) where {T} A[i, j] = A[j, i] end for i in (j + 1):length(latents) - # println("latents[i]: ", latents[i], " latents[j]: ", latents[j]) - # println("bm[latents[i], latents[j]]: ", bm[latents[i], latents[j]]) A[i, j] = sample(bm[latents[i], latents[j]], args...) A[j, i] = A[i, j] end end - # fill the diagonal with zeros, avoid undefined references + # Fill diagonal with zeros (no self-loops) for i in 1:length(latents) A[i, i] = zero(A[1, 2]) end return A end -# this is probably awfull +# Base interface implementations for BlockModel function Base.getindex(s::BlockModel, i::Int, j::Int) return s._dists[i, j] @@ -78,6 +187,14 @@ function Base.size(s::BlockModel) return (s._dists.k, s._dists.k) end +""" + getindex(bm::BlockModel, i::Real, j::Real) + +Index into the block model using latent variables ξᵢ, ξⱼ ∈ [0,1]. + +Maps latent variables to their corresponding blocks and returns the +distribution between those blocks. 
+""" function Base.getindex(s::BlockModel, i::Real, j::Real) k = findfirst(x -> x ≥ i, s.cum_sizes) l = findfirst(x -> x ≥ j, s.cum_sizes) @@ -90,19 +207,49 @@ function Base.setindex!(s::BlockModel, v, i::Real, j::Real) s._dists[k, l] = v end -# helpers for generating ordered latents +""" + ordered_latents(bm::BlockModel, n::Int) + +Generate `n` ordered (sorted) latent block assignments from the block model. + +# Returns +- Sorted vector of block assignments +""" function ordered_latents(bm::BlockModel, n::Int) return sort(map(x -> map_ξ_to_block(bm, x), rand(n))) end +""" + get_probability_matrix(bm::BlockModel, latents::AbstractVector, default_dist=nothing) + +Generate a node-level probability matrix from a block model and latent assignments. + +Creates an n×n matrix where entry (i,j) contains the distribution for the edge +between nodes i and j, based on their block assignments. + +# Arguments +- `bm::BlockModel`: The block model +- `latents::AbstractVector`: Block assignment for each node +- `default_dist`: Distribution for diagonal entries (defaults to zero(bm[1,1]) if not provided) + +# Returns +- `Matrix`: n×n matrix of distributions + +# Example +```julia +bm = BlockModel(3, Bernoulli(0.5)) +latents = [1, 1, 2, 2, 3] +prob_matrix = get_probability_matrix(bm, latents) +``` +""" function get_probability_matrix( bm::BlockModel{D}, latents::AbstractVector, default_dist = nothing) where {D} - # hack for dirac at 0 dist (no self-loop) + # Set default distribution for diagonal (no self-loops) if isnothing(default_dist) try default_dist = zero(bm[1, 1]) catch e - if !is(e, MethodError) + if !isa(e, MethodError) rethrow(e) end error("Please provide a default distribution for the diagonal as it could not be inferred") @@ -122,11 +269,36 @@ function get_probability_matrix( return A end +""" + get_probability_matrix(a::Assignment, default_dist=nothing, node_labels=a.node_labels) + +Generate a node-level probability matrix from an Assignment. 
+ +# Arguments +- `a::Assignment`: The assignment +- `default_dist`: Distribution for diagonal entries (default: nothing) +- `node_labels`: Custom node labels to use (default: a.node_labels) + +# Returns +- `Matrix`: Probability matrix based on the assignment's block structure +""" function get_probability_matrix( a::Assignment, default_dist = nothing, node_labels = a.node_labels) return get_probability_matrix(BlockModel(a.θ), node_labels, default_dist) end +""" + align_sbm!(sbm::BlockModel, perm) + +Permute the blocks of a stochastic block model according to permutation `perm`. + +This modifies the block model in-place, reordering blocks and updating the +cumulative sizes accordingly. + +# Arguments +- `sbm::BlockModel`: The block model to modify (modified in-place) +- `perm`: Permutation vector for reordering blocks +""" function align_sbm!(sbm::BlockModel, perm) sbm._dists .= sbm._dists[perm, perm] sbm.sizes .= sbm.sizes[perm] diff --git a/src/distributions/cat.jl b/src/distributions/cat.jl index 8cdb9ad..c902f7e 100644 --- a/src/distributions/cat.jl +++ b/src/distributions/cat.jl @@ -37,8 +37,9 @@ function logpdf_cat(p::AbstractVector, obs::Int) return log(p[obs]) end +# Efficient log-likelihood computation for categorical observations +# Uses xlogy(x,y) = x*log(y) which handles edge cases properly function logpdf_cat(p::AbstractVector, count_observed::AbstractVector) - #TODO make non allocating with mapreduce ? return sum(_xlogy.(count_observed, p)) end diff --git a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl index 581b61a..349faa4 100644 --- a/src/distributions/distributions_type.jl +++ b/src/distributions/distributions_type.jl @@ -1,3 +1,45 @@ +""" + Dist{D} + +A wrapper for distributions that tracks aggregation statistics. + +This type wraps a distribution `D` and maintains a count of how many observations +have been aggregated into it. 
This is essential for the network histogram algorithm +which needs to efficiently update distributions as nodes move between groups. + +# Fields +- `dist::D`: The underlying distribution +- `counts::Int`: Number of observations aggregated into this distribution (must be ≥ 0) + +# Type Parameters +- `D`: The type of the underlying distribution (e.g., Bernoulli, Categorical, etc.) + +# Constructors +```julia +# With explicit count +Dist(distribution, counts::Int) + +# Single observation (count = 1) +Dist(distribution) +``` + +# Examples +```julia +# Wrap a Bernoulli distribution +d = Dist(Bernoulli(0.5)) + +# Create a zero distribution +d_zero = zero(d) + +# Add observations +d_updated = add_to(d, Bernoulli(0.7)) + +# Remove observations +d_reduced = remove_from(d_updated, Bernoulli(0.7)) +``` + +See also: [`add_to`](@ref), [`remove_from`](@ref), [`zero`](@ref) +""" struct Dist{D} dist::D counts::Int @@ -11,11 +53,45 @@ function Base.show(io::IO, d::Dist) print(io, "$(d.dist)") end +""" + Dist(d) + +Create a Dist with a single observation (count = 1). +""" Dist(d) = Dist(d, 1) + +""" + zero(d::Dist) + +Create a zero-initialized distribution with 0 counts. +""" zero(d::Dist) = Dist(zero(d.dist), 0) Base.broadcastable(x::Dist) = Ref(x) +""" + add_to(avgdist::Dist{D}, dist::D) where {D} + +Add a new observation to an aggregated distribution. + +Updates the distribution parameters using weighted averaging based on the count. +The new observation has weight 1/(counts+1) and the existing distribution has +weight counts/(counts+1). 
+ +# Arguments +- `avgdist::Dist{D}`: The current aggregated distribution +- `dist::D`: The new distribution to add + +# Returns +- `Dist{D}`: Updated distribution with incremented count + +# Example +```julia +d = Dist(Bernoulli(0.5), 2) # 2 observations with mean 0.5 +d_new = add_to(d, Bernoulli(0.8)) # Add observation with value 0.8 +# Result: Dist with 3 observations and mean (2*0.5 + 1*0.8)/3 ≈ 0.6 +``` +""" function add_to(avgdist::Dist{D}, dist::D) where {D} inner_dist = agg_params( avgdist.dist, dist, avgdist.counts / (avgdist.counts + 1), @@ -23,15 +99,28 @@ function add_to(avgdist::Dist{D}, dist::D) where {D} return Dist(inner_dist, avgdist.counts + 1) end +""" + remove_from(avgdist::Dist{D}, dist::D) where {D} + +Remove an observation from an aggregated distribution. + +Updates the distribution parameters by removing the contribution of `dist` from +the aggregate, using appropriate weight adjustments. + +# Arguments +- `avgdist::Dist{D}`: The current aggregated distribution +- `dist::D`: The distribution to remove + +# Returns +- `Dist{D}`: Updated distribution with decremented count + +# Note +Throws an error if attempting to remove from a distribution with 0 counts. +""" function remove_from(avgdist::Dist{D}, dist::D) where {D} if avgdist.counts <= 0 error("Cannot remove from a distribution with 0 counts") end - # if avgdist.counts == 1 && params(avgdist) == params(dist) - # return Dist(zero(avgdist.dist), 0) - # else - # error("Cannot remove from a distribution with 1 count unless the parameters are the same, got $(params(avgdist)) and $(params(dist))") - # end return Dist( agg_params( avgdist.dist, dist, avgdist.counts / max(1, (avgdist.counts - 1)), @@ -39,9 +128,18 @@ function remove_from(avgdist::Dist{D}, dist::D) where {D} avgdist.counts - 1) end -## probably this is fucked ... 
-# add_to(d::Dist, dist::Dist) = add_to(d, dist.dist) +""" + add_to(avgdist::Dist{D}, dist::Dist{D}) where {D} +Add two Dist objects together, properly accounting for their counts. + +# Arguments +- `avgdist::Dist{D}`: First distribution +- `dist::Dist{D}`: Second distribution to add + +# Returns +- `Dist{D}`: Combined distribution with summed counts +""" function add_to(avgdist::Dist{D}, dist::Dist{D}) where {D} Dist( agg_params( @@ -50,6 +148,12 @@ function add_to(avgdist::Dist{D}, dist::Dist{D}) where {D} dist.counts / (avgdist.counts + dist.counts)), avgdist.counts + dist.counts) end + +""" + remove_from(avgdist::Dist, dist::Dist) + +Remove one Dist from another, properly accounting for their counts. +""" function remove_from(avgdist::Dist, dist::Dist) Dist( agg_params( @@ -59,27 +163,74 @@ function remove_from(avgdist::Dist, dist::Dist) avgdist.counts - dist.counts) end -# expose compression step that assumes there is a pdf(d, typeof(compressed(x))) properly defined -# by default do nothing +""" + _fast_compressed_obs(d, x, zero_inflated) + +Compress observations for efficient storage and computation. + +By default, returns `x` unchanged. Distributions can override this to implement +custom compression strategies. +""" _fast_compressed_obs(d, x, zero_inflated) = x -# what to delegate to the underlying distribution +# Delegate common operations to the underlying distribution for f in [:logpdf, :sample, :distance, :eltype, :params, :_fast_compressed_obs] @eval $f(d::Dist, args...) = $f(d.dist, args...) end +""" + fit(d::Dist, x) + +Fit the underlying distribution to observation(s) `x`, preserving the count. +""" fit(d::Dist, x) = Dist(fit(d.dist, x), d.counts) -## TODO: remove type instability ? +""" + loglikelihood(d::Dist, x) + +Compute the log-likelihood of observation(s) `x` under distribution `d`. + +# Returns +- `Float64`: Sum of log-probabilities, or 0.0 if x is empty +""" loglikelihood(d::Dist, x) = isempty(x) ? 
0.0 : sum(logpdf(d, y) for y in x) -# loglikelihood(d::Dist, x) = sum(logpdf(d, y) for y in x) + +""" + unwrap(d::Dist) + +Extract the underlying distribution from a Dist wrapper. +""" unwrap(d::Dist) = d.dist Base.promote_rule(::Type{Dist{D}}, ::Type{D}) where {D} = D Base.convert(::Type{D}, d::Dist{D}) where {D} = d.dist -# Bernoulli distribution (example) +""" + Bernoulli{T <: Real} + +A simple Bernoulli distribution for binary (0/1) edges. + +# Fields +- `p::T`: Success probability (probability of edge = 1) + +# Example +```julia +b = Bernoulli(0.3) # 30% chance of edge +edge = sample(b) # Returns true or false +ll = logpdf(b, true) # Log probability of observing an edge +``` +# Interface Requirements +For a distribution to work with NetworkHistogram, it must implement: +- `zero(d)`: Return a zero-initialized distribution +- `agg_params(d1, d2, w1, w2)`: Aggregate two distributions with weights +- `fit(d, x)`: Fit distribution to observation(s) +- `distance(d1, d2)`: Distance metric between distributions +- `logpdf(d, x)`: Log probability density/mass function +- `params(d)`: Return tuple of parameters +- `eltype(d)`: Return element type +- `sample(d)`: Generate a random sample +""" struct Bernoulli{T <: Real} p::T end diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index c0b5f1a..7dcec74 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -2,6 +2,35 @@ include("swap_workspace.jl") include("swap_categorical.jl") include("config_rules/include.jl") +""" + GreedyParams + +Configuration parameters for the greedy optimization algorithm. 
+ +# Fields +- `max_iter::Int`: Maximum number of iterations (default: 100,000) +- `swap_rule::NodeSwapRule`: Rule for selecting which nodes to swap +- `accept_rule::AcceptRule`: Rule for accepting/rejecting proposed swaps +- `stop_rule::StopRule`: Rule for determining when to stop optimization +- `progress_bar::Bool`: Whether to display a progress bar (default: true) + +# Examples +```julia +# Use default parameters +params = GreedyParams() + +# Custom parameters with stricter stopping +params = GreedyParams( + 50_000, # max iterations + RandomNodeSwap(), # random node selection + Strict(), # only accept improvements + PreviousBestValue(5000), # stop after 5000 iterations without improvement + true # show progress bar +) +``` + +See also: [`NodeSwapRule`](@ref), [`AcceptRule`](@ref), [`StopRule`](@ref) +""" mutable struct GreedyParams max_iter::Int swap_rule::NodeSwapRule @@ -10,11 +39,43 @@ mutable struct GreedyParams progress_bar::Bool end +""" + GreedyParams() + +Create default greedy optimization parameters. + +Defaults: +- max_iter: 100,000 +- swap_rule: RandomNodeSwap() +- accept_rule: Strict() +- stop_rule: PreviousBestValue(10,000) +- progress_bar: true +""" function GreedyParams() GreedyParams( 100_000, RandomNodeSwap(), Strict(), PreviousBestValue(10_000), true) end +""" + greedy_optimize(g, initial_labels, params::GreedyParams) + +Run greedy optimization to find a good network histogram (block model partition). + +# Arguments +- `g`: Tuple of (EdgeList, Dist) containing the network data and distribution type +- `initial_labels`: Initial group assignment for nodes +- `params::GreedyParams`: Optimization parameters + +# Returns +- `Assignment`: Optimized assignment of nodes to groups + +# Algorithm +The algorithm iteratively: +1. Proposes moving a node to a different group (based on swap_rule) +2. Evaluates the change in log-likelihood +3. Accepts or rejects the move (based on accept_rule) +4. 
Continues until stopping criterion met (based on stop_rule) +""" function greedy_optimize(g, initial_labels, params::GreedyParams) @debug "making assignment" a = Assignment(initial_labels, g...) @@ -23,6 +84,21 @@ function greedy_optimize(g, initial_labels, params::GreedyParams) return a end +""" + greedy_improve!(a::Assignment; params = GreedyParams()) + +Improve an existing assignment through greedy local search. + +Modifies the assignment in-place by iteratively proposing and accepting beneficial +node reassignments. + +# Arguments +- `a::Assignment`: The assignment to improve (modified in-place) +- `params::GreedyParams`: Optimization parameters + +# Note +This function modifies `a` in-place and updates its log-likelihood. +""" function greedy_improve!(a::Assignment; params = GreedyParams()) # allocate memory for swap swap = make_swap(a, (1, 2)) @@ -46,6 +122,7 @@ function greedy_improve!(a::Assignment; params = GreedyParams()) end end +# Internal function for a single local search step function local_search!(a::Assignment, swap, params::GreedyParams) # select two nodes to swap and update data in the swap object make_swap!(swap, a, select_indices_swap(a, params.swap_rule)) diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index f91b341..df39ad2 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -1,13 +1,60 @@ +""" +FastSymArray - Efficient symmetric matrix storage + +This module provides `SymArray`, a memory-efficient storage for symmetric matrices +that only stores the upper triangle (including diagonal) of the matrix. +""" module FastSymArray import Base: eltype, convert export SymArray, eltype +""" + SymArray{F} <: AbstractArray{F, 2} + +A symmetric matrix that stores only the upper triangle to save memory. + +For a k×k symmetric matrix, only k(k+1)/2 elements are stored instead of k². 
+ +# Fields +- `d::Dict{Tuple{Int, Int}, F}`: Dictionary storing (i,j) → value for i ≤ j +- `k::Int`: Dimension of the square matrix + +# Examples +```julia +# Create a 3×3 symmetric matrix initialized with zeros +sym = SymArray(3, 0.0) + +# Access elements (symmetric) +sym[1, 2] = 5.0 +sym[2, 1] # Returns 5.0 + +# Convert from regular matrix +A = [1 2 3; 2 4 5; 3 5 6] +sym = SymArray(A) +``` + +See also: [`sum_tri_with_diag`](@ref) +""" mutable struct SymArray{F} <: AbstractArray{F, 2} d::Dict{Tuple{Int, Int}, F} k::Int end +""" + SymArray(k::Int, d::F) + +Create a k×k symmetric matrix initialized with copies of value `d`. + +# Arguments +- `k::Int`: Dimension of the matrix (must be positive) +- `d::F`: Initial value for all entries + +# Example +```julia +sym = SymArray(5, 0.0) # 5×5 matrix of zeros +``` +""" function SymArray(k::T, d::F) where {F, T <: Real} @assert k > 0 return SymArray{F}( @@ -25,6 +72,11 @@ function SymArray(k::T, d::AbstractArray) where {T <: Real} k) end +""" + SymArray(d::AbstractMatrix{F}) + +Create a SymArray from an existing matrix. The matrix should be symmetric. +""" function SymArray(d::AbstractMatrix{F}) where {F} return convert(SymArray{F}, d) end @@ -43,6 +95,17 @@ Base.@propagate_inbounds function Base.setindex!(a::SymArray, v, i, j) a.d[minmax(i, j)] = v end +""" + sum_tri_with_diag(a::SymArray) + +Efficiently sum all elements in the symmetric matrix (counting each off-diagonal once). + +# Returns +- Sum of all unique elements in the symmetric matrix + +# Note +This is more efficient than `sum(a)` because it only sums stored elements. 
+""" function sum_tri_with_diag(a::SymArray) return sum(values(a.d)) end @@ -51,10 +114,6 @@ function eltype(::SymArray{F}) where {F} return F end -# function zeros(::Type{SymArray{F}}, k::Int) where {F} -# return SymArray(k, zero(F)) -# end - function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} @assert size(a, 1) == size(a, 2) k = size(a, 1) From f5f09381a7e5b2c43a716088e68de68648ac1985 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 15 Oct 2025 15:41:10 +0200 Subject: [PATCH 188/266] create benchmark regression test --- PERFORMANCE.md | 226 ++++++++++++++++++++++ QUICKREF.md | 128 +++++++++++++ test/Project.toml | 2 + test/runtests.jl | 17 ++ test/test_performance_regression.jl | 279 ++++++++++++++++++++++++++++ 5 files changed, 652 insertions(+) create mode 100644 PERFORMANCE.md create mode 100644 QUICKREF.md create mode 100644 test/test_performance_regression.jl diff --git a/PERFORMANCE.md b/PERFORMANCE.md new file mode 100644 index 0000000..850945a --- /dev/null +++ b/PERFORMANCE.md @@ -0,0 +1,226 @@ +# Performance Optimization Guide for NetworkHistogram + +This repository now includes a comprehensive performance regression test suite to help improve the optimization speed of NetworkHistogram algorithms. + +## 🎯 Quick Start + +### 1. Establish a Baseline + +Before making any changes: + +```bash +julia dev/run_benchmarks.jl baseline +``` + +### 2. Make Your Changes + +Edit the optimization code (e.g., in `src/optimization/`) + +### 3. Test Performance + +```bash +julia dev/run_benchmarks.jl current +``` + +This will automatically compare against your baseline and show: +- Which operations got faster/slower +- By how much (speedup factor and percentage) +- Detailed timing statistics + +### 4. Verify Correctness + +```bash +julia --project=. 
-e 'using Pkg; Pkg.test()' +``` + +## 📊 What Gets Benchmarked + +### Core Operations +- **Single swap operations** (Bernoulli & Categorical networks) + - Small networks: n=50 nodes + - Medium networks: n=200 nodes + - Large networks: n=500 nodes + +### Full Workflows +- Complete optimization runs (1,000 iterations) +- End-to-end performance measurement + +### Components +- Assignment creation +- EdgeList creation +- Log-likelihood computation +- Edge extraction + +## 🔍 Key Hotspots for Optimization + +Based on the workflow in `test_decorated_paper.jl`, these are the critical bottlenecks: + +### 1. `apply_swap!` Function +**Location**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` + +**Why it matters**: Called millions of times during optimization (once per iteration) + +**Current bottlenecks**: +- Uses `deepcopy` for state management +- Iterates over all neighbors repeatedly +- Allocates temporary arrays + +**Optimization ideas**: +- Pre-allocate workspace buffers +- Use in-place operations +- Cache neighbor lists +- Reduce `deepcopy` usage + +### 2. `get_edges_in_groups` Function +**Location**: `src/assignment.jl` + +**Why it matters**: Called during log-likelihood recomputation + +**Current bottlenecks**: +- Uses `findall` (allocates) +- Creates new vector each time +- Linear search through nodes + +**Optimization ideas**: +- Pre-compute group membership indices +- Use pre-allocated output buffers +- Cache results for frequently accessed group pairs + +### 3. Log-likelihood Updates +**Location**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` + +**Why it matters**: Must be computed after each swap + +**Current approach**: Recomputes only affected group pairs (good!) 
+ +**Optimization ideas**: +- Batch `logpdf` computations +- Use vectorized operations +- Cache intermediate calculations + +## 📁 File Structure + +``` +dev/ + ├── run_benchmarks.jl # Easy-to-use benchmark runner + ├── benchmark_optimization.jl # Standalone benchmarking script + ├── BENCHMARKING.md # Detailed documentation + └── benchmark_results/ # Stored benchmark results + └── baseline.json # Reference baseline + +test/ + └── test_performance_regression.jl # Performance tests for CI +``` + +## 📈 Example Output + +``` +--- Single Swap Operations (Bernoulli) --- +Benchmarking Bernoulli swap (n=50, k=2)... + Median: 0.234 ms +Benchmarking Bernoulli swap (n=200, k=3)... + Median: 1.567 ms + +======================================== +Performance Comparison vs Baseline +Baseline: 2024-10-15 14:30:00 +======================================== +✓ FASTER bernoulli_swap_n50_k2: 1.23x (23.0%) + Current: 0.190 ms | Baseline: 0.234 ms + +≈ SIMILAR bernoulli_swap_n200_k3: 1.02x (2.0%) + Current: 1.537 ms | Baseline: 1.567 ms +``` + +## 🔧 Advanced Usage + +### Run Only Specific Benchmarks + +Edit `dev/benchmark_optimization.jl` to comment out benchmarks you don't need. + +### Compare Two Specific Benchmark Files + +```bash +julia dev/run_benchmarks.jl compare results/v1.json results/v2.json +``` + +### Profile Your Code + +```julia +using Profile + +include("dev/test_decorated_paper.jl") + +# Profile a specific function +@profile main(500:500:1000, 2) + +Profile.print() +# Or for a flamegraph: +using ProfileView +ProfileView.view() +``` + +### Check Allocations + +```julia +using BenchmarkTools + +# See allocations for a single operation +@btime apply_swap!($assignment, $swap) samples=1 evals=1 +``` + +## 🎓 Best Practices + +1. **Always establish a baseline first** - You need a reference point +2. **Make incremental changes** - Change one thing at a time +3. **Profile before optimizing** - Don't guess where the bottleneck is +4. 
**Test correctness** - Fast but wrong is useless +5. **Document your changes** - Explain why you made each optimization +6. **Consider maintainability** - Don't sacrifice readability for tiny gains + +## 📚 Resources + +- **Detailed benchmarking guide**: See `dev/BENCHMARKING.md` +- **Julia Performance Tips**: https://docs.julialang.org/en/v1/manual/performance-tips/ +- **BenchmarkTools.jl**: https://juliaci.github.io/BenchmarkTools.jl/stable/ +- **Profile module**: https://docs.julialang.org/en/v1/stdlib/Profile/ + +## 🤝 Contributing Performance Improvements + +When submitting a PR with performance improvements: + +1. Include before/after benchmark results +2. Explain what you optimized and why +3. Ensure all tests still pass +4. Document any trade-offs made +5. Consider adding new benchmarks for your changes + +## ❓ Troubleshooting + +### "BenchmarkTools not available" + +```bash +julia --project=test -e 'using Pkg; Pkg.add("BenchmarkTools")' +``` + +### High variance in results + +- Close other applications +- Run benchmarks multiple times +- Use `--threads=1` flag for consistency + +### Benchmark takes too long + +- Reduce the `samples` parameter +- Use smaller test networks +- Run individual benchmark categories instead of all at once + +## 📞 Getting Help + +- Open an issue with benchmark results +- Include your system specs (OS, Julia version, CPU) +- Describe what you're trying to optimize + +--- + +Happy optimizing! 🚀 diff --git a/QUICKREF.md b/QUICKREF.md new file mode 100644 index 0000000..2353ea1 --- /dev/null +++ b/QUICKREF.md @@ -0,0 +1,128 @@ +# Performance Optimization Quick Reference + +## 🚀 Quick Commands + +```bash +# Establish baseline +julia dev/run_benchmarks.jl baseline + +# Benchmark current code +julia dev/run_benchmarks.jl current + +# Profile to find bottlenecks +julia dev/profile_optimization.jl swap + +# Run tests +julia --project=. 
-e 'using Pkg; Pkg.test()' + +# Visualize results +julia dev/visualize_benchmarks.jl --all +``` + +## 📊 Understanding Output + +``` +✓ FASTER = >5% improvement +✗ SLOWER = >5% regression +≈ SIMILAR = Within ±5% +``` + +## 🎯 Priority Hotspots + +### 1. `apply_swap!` 🔴 CRITICAL +- **File**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` +- **Why**: Called ~1M times per run +- **Fix**: Reduce allocations, avoid `deepcopy` + +### 2. `get_edges_in_groups` 🟡 MODERATE +- **File**: `src/assignment.jl` +- **Why**: Called during LL updates +- **Fix**: Pre-allocate, cache group membership + +### 3. Edge iteration 🟢 LOW +- **File**: `src/EdgeList.jl` +- **Why**: Used everywhere +- **Fix**: Ensure type stability + +## 🛠️ Common Optimizations + +### Check Allocations +```julia +using BenchmarkTools +@btime my_function($args) samples=1 evals=1 +# Look for allocations in output +``` + +### Profile Code +```julia +using Profile +@profile my_function(args) +Profile.print(maxdepth=15) +``` + +### Type Stability +```julia +using Cthulhu +@descend my_function(args) +# Red = type unstable (BAD) +``` + +## 📁 Key Files + +``` +├── PERFORMANCE.md # Main guide +├── dev/ +│ ├── run_benchmarks.jl # 👈 USE THIS +│ ├── profile_optimization.jl # For profiling +│ ├── visualize_benchmarks.jl # View results +│ └── BENCHMARKING.md # Details +└── src/optimization/ # 🎯 Optimize here + ├── swap_workspace.jl + └── swap_categorical.jl +``` + +## 📈 Expected Gains + +- Reduce allocations: **20-50%** speedup +- Better data structures: **2-10x** speedup +- SIMD/vectorization: **2-4x** speedup +- Fix type instability: **2-5x** speedup + +## 🔄 Workflow + +1. **Baseline** → 2. **Profile** → 3. **Optimize** → 4. **Benchmark** → 5. **Test** → Repeat + +## 💡 Tips + +- Focus on **hot paths** (profile first!) 
+- Measure **before and after** every change +- Keep changes **small and focused** +- Always **test correctness** +- Document **what and why** + +## 🆘 Troubleshooting + +### "BenchmarkTools not found" +```bash +julia --project=test -e 'using Pkg; Pkg.add("BenchmarkTools")' +``` + +### Results vary +- Close other apps +- Use `--threads=1` +- Increase samples + +### Too slow +- Reduce samples +- Use smaller networks +- Run specific benchmarks + +## 📚 Learn More + +- `PERFORMANCE.md` - Full guide +- `dev/BENCHMARKING.md` - Detailed docs +- `julia dev/run_benchmarks.jl help` - CLI help + +--- + +**Remember**: Profile → Optimize → Benchmark → Test → Repeat 🔁 diff --git a/test/Project.toml b/test/Project.toml index 6be9544..1c5fcb0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,8 +1,10 @@ [deps] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89" diff --git a/test/runtests.jl b/test/runtests.jl index 2e9702f..efd4769 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,10 +1,27 @@ using Test using LinearAlgebra, SparseArrays using NetworkHistogram + +# Check if BenchmarkTools is available (it's not required for basic tests) +const RUN_BENCHMARKS = try + using BenchmarkTools + true +catch + @warn "BenchmarkTools not available, skipping performance regression tests" + false +end + @testset "Tests" begin include("test_data_format.jl") include("test_distributions_type.jl") include("test_swap_workspace.jl") include("test_cat_case.jl") include("test_get_edges_in_groups.jl") + + # Only run performance tests if BenchmarkTools is available + if 
RUN_BENCHMARKS + @testset "Performance Regression" begin + include("test_performance_regression.jl") + end + end end diff --git a/test/test_performance_regression.jl b/test/test_performance_regression.jl new file mode 100644 index 0000000..4b057a6 --- /dev/null +++ b/test/test_performance_regression.jl @@ -0,0 +1,279 @@ +using Test +using NetworkHistogram +using StatsBase +using Random +using Distributions +using StaticArrays +using BenchmarkTools + +""" +Performance regression test suite for NetworkHistogram optimization. + +This file contains benchmarks for the key optimization operations, designed to: +1. Track performance improvements/regressions over time +2. Identify bottlenecks in the optimization workflow +3. Ensure optimization changes maintain correctness + +Based on the workflow in test_decorated_paper.jl +""" + +# Helper function to create test networks +function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed=42) + Random.seed!(seed) + d = NetworkHistogram.Bernoulli(0.5) + sbm = NetworkHistogram.BlockModel(n_groups, d) + + # Create varied probabilities between groups + for g1 in 1:n_groups + for g2 in g1:n_groups + p = 0.1 + 0.7 * rand() + sbm[g1, g2] = NetworkHistogram.Bernoulli(p) + end + end + + labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + A = NetworkHistogram.sample(sbm, labels) + return A, labels, d +end + +function create_test_sbm_categorical(n_groups::Int, n_nodes::Int, n_categories::Int; seed=42) + Random.seed!(seed) + ps = SVector{n_categories}(fill(1 / n_categories, n_categories)) + d = NetworkHistogram.Cat(ps) + sbm = NetworkHistogram.BlockModel(n_groups, d) + + # Create varied probability distributions between groups + for g1 in 1:n_groups + for g2 in g1:n_groups + probs = rand(n_categories) + probs ./= sum(probs) + sbm[g1, g2] = NetworkHistogram.Cat(SVector{n_categories}(probs)) + end + end + + labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + A = 
NetworkHistogram.sample(sbm, labels) + return A, labels, d +end + +@testset "Performance Regression Tests" begin + + @testset "Bernoulli Networks" begin + @testset "Small network (n=50, k=2)" begin + A, labels, d = create_test_sbm_bernoulli(2, 50) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + + # Benchmark single swap operation + swap = NetworkHistogram.make_swap(assignment, (1, 50)) + ll_before = NetworkHistogram.loglikelihood(assignment) + + b_swap = @benchmark begin + NetworkHistogram.apply_swap!($assignment, $swap) + NetworkHistogram.revert_swap!($assignment, $swap) + end samples=100 evals=1 + + # Verify correctness + ll_after = NetworkHistogram.loglikelihood(assignment) + @test isapprox(ll_before, ll_after, atol=1e-10) + + @info "Bernoulli (n=50, k=2) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + end + + @testset "Medium network (n=200, k=3)" begin + A, labels, d = create_test_sbm_bernoulli(3, 200) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + + swap = NetworkHistogram.make_swap(assignment, (1, 200)) + ll_before = NetworkHistogram.loglikelihood(assignment) + + b_swap = @benchmark begin + NetworkHistogram.apply_swap!($assignment, $swap) + NetworkHistogram.revert_swap!($assignment, $swap) + end samples=100 evals=1 + + ll_after = NetworkHistogram.loglikelihood(assignment) + @test isapprox(ll_before, ll_after, atol=1e-10) + + @info "Bernoulli (n=200, k=3) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + end + + @testset "Large network (n=500, k=5)" begin + A, labels, d = create_test_sbm_bernoulli(5, 500) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + + swap = NetworkHistogram.make_swap(assignment, (1, 500)) + ll_before = 
NetworkHistogram.loglikelihood(assignment) + + b_swap = @benchmark begin + NetworkHistogram.apply_swap!($assignment, $swap) + NetworkHistogram.revert_swap!($assignment, $swap) + end samples=50 evals=1 + + ll_after = NetworkHistogram.loglikelihood(assignment) + @test isapprox(ll_before, ll_after, atol=1e-10) + + @info "Bernoulli (n=500, k=5) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + end + end + + @testset "Categorical Networks" begin + @testset "Small network (n=50, k=2, m=3)" begin + A, labels, d = create_test_sbm_categorical(2, 50, 3) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + + swap = NetworkHistogram.make_swap(assignment, (1, 50)) + ll_before = NetworkHistogram.loglikelihood(assignment) + + b_swap = @benchmark begin + NetworkHistogram.apply_swap!($assignment, $swap) + NetworkHistogram.revert_swap!($assignment, $swap) + end samples=100 evals=1 + + ll_after = NetworkHistogram.loglikelihood(assignment) + @test isapprox(ll_before, ll_after, atol=1e-10) + + @info "Categorical (n=50, k=2, m=3) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + end + + @testset "Medium network (n=200, k=3, m=4)" begin + A, labels, d = create_test_sbm_categorical(3, 200, 4) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + + swap = NetworkHistogram.make_swap(assignment, (1, 200)) + ll_before = NetworkHistogram.loglikelihood(assignment) + + b_swap = @benchmark begin + NetworkHistogram.apply_swap!($assignment, $swap) + NetworkHistogram.revert_swap!($assignment, $swap) + end samples=100 evals=1 + + ll_after = NetworkHistogram.loglikelihood(assignment) + @test isapprox(ll_before, ll_after, atol=1e-10) + + @info "Categorical (n=200, k=3, m=4) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + end + + @testset "Large network (n=500, 
k=5, m=5)" begin + A, labels, d = create_test_sbm_categorical(5, 500, 5) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + + swap = NetworkHistogram.make_swap(assignment, (1, 500)) + ll_before = NetworkHistogram.loglikelihood(assignment) + + b_swap = @benchmark begin + NetworkHistogram.apply_swap!($assignment, $swap) + NetworkHistogram.revert_swap!($assignment, $swap) + end samples=50 evals=1 + + ll_after = NetworkHistogram.loglikelihood(assignment) + @test isapprox(ll_before, ll_after, atol=1e-10) + + @info "Categorical (n=500, k=5, m=5) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + end + end + + @testset "Full Optimization Workflow" begin + @testset "Bernoulli - Short optimization (n=100, k=3)" begin + A, labels, d = create_test_sbm_bernoulli(3, 100) + + # Randomize initial labels + initial_labels = rand(1:3, 100) + + params = NetworkHistogram.GreedyParams( + 1_000, # Small number for testing + NetworkHistogram.RandomNodeSwap(), + NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue(500), + false # No progress bar for benchmarking + ) + + b_optimize = @benchmark begin + NetworkHistogram.nethist($A, $d, $initial_labels, $params) + end samples=10 evals=1 + + @info "Bernoulli full optimization (n=100, 1k iters)" median=median(b_optimize.times)/1e6 mean=mean(b_optimize.times)/1e6 + end + + @testset "Categorical - Short optimization (n=100, k=3, m=3)" begin + A, labels, d = create_test_sbm_categorical(3, 100, 3) + + # Randomize initial labels + initial_labels = rand(1:3, 100) + + params = NetworkHistogram.GreedyParams( + 1_000, + NetworkHistogram.RandomNodeSwap(), + NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue(500), + false + ) + + b_optimize = @benchmark begin + NetworkHistogram.nethist($A, $d, $initial_labels, $params) + end samples=10 evals=1 + + @info "Categorical full optimization (n=100, 1k iters)" 
median=median(b_optimize.times)/1e6 mean=mean(b_optimize.times)/1e6 + end + end + + @testset "Component Benchmarks" begin + @testset "Assignment creation (n=200, k=3)" begin + A, labels, d = create_test_sbm_bernoulli(3, 200) + edgelist = NetworkHistogram.EdgeList(A) + + b_assignment = @benchmark begin + NetworkHistogram.Assignment($labels, $edgelist, NetworkHistogram.Dist($d)) + end samples=100 + + @info "Assignment creation (n=200)" median=median(b_assignment.times)/1e6 mean=mean(b_assignment.times)/1e6 + end + + @testset "EdgeList creation (n=200)" begin + A, _, _ = create_test_sbm_bernoulli(3, 200) + + b_edgelist = @benchmark begin + NetworkHistogram.EdgeList($A) + end samples=100 + + @info "EdgeList creation (n=200)" median=median(b_edgelist.times)/1e6 mean=mean(b_edgelist.times)/1e6 + end + + @testset "Loglikelihood computation (n=200, k=3)" begin + A, labels, d = create_test_sbm_bernoulli(3, 200) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + + b_ll = @benchmark begin + NetworkHistogram.loglikelihood($assignment) + end samples=1000 + + @info "Loglikelihood computation (n=200)" median=median(b_ll.times)/1e3 mean=mean(b_ll.times)/1e3 + end + + @testset "Get edges in groups (n=200, k=3)" begin + A, labels, d = create_test_sbm_bernoulli(3, 200) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + + b_get_edges = @benchmark begin + NetworkHistogram.get_edges_in_groups($assignment, 1, 2) + end samples=1000 + + @info "Get edges in groups (n=200)" median=median(b_get_edges.times)/1e3 mean=mean(b_get_edges.times)/1e3 + end + end +end From c24e8423bc2366cbbf9533941dee83b24e0ba966 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 15 Oct 2025 15:41:30 +0200 Subject: [PATCH 189/266] create benchmark regression test 2 --- PERFORMANCE.md | 24 +++- QUICKREF.md | 12 +- test/runtests.jl | 2 +- 
test/test_performance_regression.jl | 169 +++++++++++++++------------- 4 files changed, 125 insertions(+), 82 deletions(-) diff --git a/PERFORMANCE.md b/PERFORMANCE.md index 850945a..d68accc 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -1,6 +1,7 @@ # Performance Optimization Guide for NetworkHistogram -This repository now includes a comprehensive performance regression test suite to help improve the optimization speed of NetworkHistogram algorithms. +This repository now includes a comprehensive performance regression test suite +to help improve the optimization speed of NetworkHistogram algorithms. ## 🎯 Quick Start @@ -23,6 +24,7 @@ julia dev/run_benchmarks.jl current ``` This will automatically compare against your baseline and show: + - Which operations got faster/slower - By how much (speedup factor and percentage) - Detailed timing statistics @@ -36,16 +38,19 @@ julia --project=. -e 'using Pkg; Pkg.test()' ## 📊 What Gets Benchmarked ### Core Operations + - **Single swap operations** (Bernoulli & Categorical networks) - Small networks: n=50 nodes - Medium networks: n=200 nodes - Large networks: n=500 nodes ### Full Workflows + - Complete optimization runs (1,000 iterations) - End-to-end performance measurement ### Components + - Assignment creation - EdgeList creation - Log-likelihood computation @@ -53,40 +58,49 @@ julia --project=. -e 'using Pkg; Pkg.test()' ## 🔍 Key Hotspots for Optimization -Based on the workflow in `test_decorated_paper.jl`, these are the critical bottlenecks: +Based on the workflow in `test_decorated_paper.jl`, these are the critical +bottlenecks: ### 1. 
`apply_swap!` Function + **Location**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` -**Why it matters**: Called millions of times during optimization (once per iteration) +**Why it matters**: Called millions of times during optimization (once per +iteration) **Current bottlenecks**: + - Uses `deepcopy` for state management - Iterates over all neighbors repeatedly - Allocates temporary arrays **Optimization ideas**: + - Pre-allocate workspace buffers - Use in-place operations - Cache neighbor lists - Reduce `deepcopy` usage ### 2. `get_edges_in_groups` Function + **Location**: `src/assignment.jl` **Why it matters**: Called during log-likelihood recomputation **Current bottlenecks**: + - Uses `findall` (allocates) - Creates new vector each time - Linear search through nodes **Optimization ideas**: + - Pre-compute group membership indices - Use pre-allocated output buffers - Cache results for frequently accessed group pairs ### 3. Log-likelihood Updates + **Location**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` **Why it matters**: Must be computed after each swap @@ -94,6 +108,7 @@ Based on the workflow in `test_decorated_paper.jl`, these are the critical bottl **Current approach**: Recomputes only affected group pairs (good!) 
**Optimization ideas**: + - Batch `logpdf` computations - Use vectorized operations - Cache intermediate calculations @@ -181,7 +196,8 @@ using BenchmarkTools ## 📚 Resources - **Detailed benchmarking guide**: See `dev/BENCHMARKING.md` -- **Julia Performance Tips**: https://docs.julialang.org/en/v1/manual/performance-tips/ +- **Julia Performance Tips**: + https://docs.julialang.org/en/v1/manual/performance-tips/ - **BenchmarkTools.jl**: https://juliaci.github.io/BenchmarkTools.jl/stable/ - **Profile module**: https://docs.julialang.org/en/v1/stdlib/Profile/ diff --git a/QUICKREF.md b/QUICKREF.md index 2353ea1..d0e73eb 100644 --- a/QUICKREF.md +++ b/QUICKREF.md @@ -30,16 +30,19 @@ julia dev/visualize_benchmarks.jl --all ## 🎯 Priority Hotspots ### 1. `apply_swap!` 🔴 CRITICAL + - **File**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` - **Why**: Called ~1M times per run - **Fix**: Reduce allocations, avoid `deepcopy` ### 2. `get_edges_in_groups` 🟡 MODERATE + - **File**: `src/assignment.jl` - **Why**: Called during LL updates - **Fix**: Pre-allocate, cache group membership ### 3. Edge iteration 🟢 LOW + - **File**: `src/EdgeList.jl` - **Why**: Used everywhere - **Fix**: Ensure type stability @@ -47,6 +50,7 @@ julia dev/visualize_benchmarks.jl --all ## 🛠️ Common Optimizations ### Check Allocations + ```julia using BenchmarkTools @btime my_function($args) samples=1 evals=1 @@ -54,6 +58,7 @@ using BenchmarkTools ``` ### Profile Code + ```julia using Profile @profile my_function(args) @@ -61,6 +66,7 @@ Profile.print(maxdepth=15) ``` ### Type Stability + ```julia using Cthulhu @descend my_function(args) @@ -90,7 +96,8 @@ using Cthulhu ## 🔄 Workflow -1. **Baseline** → 2. **Profile** → 3. **Optimize** → 4. **Benchmark** → 5. **Test** → Repeat +1. **Baseline** → 2. **Profile** → 3. **Optimize** → 4. **Benchmark** → 5. 
+ **Test** → Repeat ## 💡 Tips @@ -103,16 +110,19 @@ using Cthulhu ## 🆘 Troubleshooting ### "BenchmarkTools not found" + ```bash julia --project=test -e 'using Pkg; Pkg.add("BenchmarkTools")' ``` ### Results vary + - Close other apps - Use `--threads=1` - Increase samples ### Too slow + - Reduce samples - Use smaller networks - Run specific benchmarks diff --git a/test/runtests.jl b/test/runtests.jl index efd4769..9f8eb38 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,7 +17,7 @@ end include("test_swap_workspace.jl") include("test_cat_case.jl") include("test_get_edges_in_groups.jl") - + # Only run performance tests if BenchmarkTools is available if RUN_BENCHMARKS @testset "Performance Regression" begin diff --git a/test/test_performance_regression.jl b/test/test_performance_regression.jl index 4b057a6..1e7ca95 100644 --- a/test/test_performance_regression.jl +++ b/test/test_performance_regression.jl @@ -18,11 +18,11 @@ Based on the workflow in test_decorated_paper.jl """ # Helper function to create test networks -function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed=42) +function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed = 42) Random.seed!(seed) d = NetworkHistogram.Bernoulli(0.5) sbm = NetworkHistogram.BlockModel(n_groups, d) - + # Create varied probabilities between groups for g1 in 1:n_groups for g2 in g1:n_groups @@ -30,18 +30,19 @@ function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed=42) sbm[g1, g2] = NetworkHistogram.Bernoulli(p) end end - + labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) A = NetworkHistogram.sample(sbm, labels) return A, labels, d end -function create_test_sbm_categorical(n_groups::Int, n_nodes::Int, n_categories::Int; seed=42) +function create_test_sbm_categorical( + n_groups::Int, n_nodes::Int, n_categories::Int; seed = 42) Random.seed!(seed) ps = SVector{n_categories}(fill(1 / n_categories, n_categories)) d = NetworkHistogram.Cat(ps) sbm = 
NetworkHistogram.BlockModel(n_groups, d) - + # Create varied probability distributions between groups for g1 in 1:n_groups for g2 in g1:n_groups @@ -50,147 +51,155 @@ function create_test_sbm_categorical(n_groups::Int, n_nodes::Int, n_categories:: sbm[g1, g2] = NetworkHistogram.Cat(SVector{n_categories}(probs)) end end - + labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) A = NetworkHistogram.sample(sbm, labels) return A, labels, d end @testset "Performance Regression Tests" begin - @testset "Bernoulli Networks" begin @testset "Small network (n=50, k=2)" begin A, labels, d = create_test_sbm_bernoulli(2, 50) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, NetworkHistogram.Dist(d)) - + # Benchmark single swap operation swap = NetworkHistogram.make_swap(assignment, (1, 50)) ll_before = NetworkHistogram.loglikelihood(assignment) - + b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) end samples=100 evals=1 - + # Verify correctness ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol=1e-10) - - @info "Bernoulli (n=50, k=2) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + @test isapprox(ll_before, ll_after, atol = 1e-10) + + @info "Bernoulli (n=50, k=2) - Single swap" median=median(b_swap.times) / 1e6 mean=mean(b_swap.times) / + 1e6 end - + @testset "Medium network (n=200, k=3)" begin A, labels, d = create_test_sbm_bernoulli(3, 200) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, NetworkHistogram.Dist(d)) - + swap = NetworkHistogram.make_swap(assignment, (1, 200)) ll_before = NetworkHistogram.loglikelihood(assignment) - + b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) end samples=100 evals=1 - + ll_after = 
NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol=1e-10) - - @info "Bernoulli (n=200, k=3) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + @test isapprox(ll_before, ll_after, atol = 1e-10) + + @info "Bernoulli (n=200, k=3) - Single swap" median=median(b_swap.times) / 1e6 mean=mean(b_swap.times) / + 1e6 end - + @testset "Large network (n=500, k=5)" begin A, labels, d = create_test_sbm_bernoulli(5, 500) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, NetworkHistogram.Dist(d)) - + swap = NetworkHistogram.make_swap(assignment, (1, 500)) ll_before = NetworkHistogram.loglikelihood(assignment) - + b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) end samples=50 evals=1 - + ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol=1e-10) - - @info "Bernoulli (n=500, k=5) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + @test isapprox(ll_before, ll_after, atol = 1e-10) + + @info "Bernoulli (n=500, k=5) - Single swap" median=median(b_swap.times) / 1e6 mean=mean(b_swap.times) / + 1e6 end end - + @testset "Categorical Networks" begin @testset "Small network (n=50, k=2, m=3)" begin A, labels, d = create_test_sbm_categorical(2, 50, 3) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, NetworkHistogram.Dist(d)) - + swap = NetworkHistogram.make_swap(assignment, (1, 50)) ll_before = NetworkHistogram.loglikelihood(assignment) - + b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) end samples=100 evals=1 - + ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol=1e-10) - - @info "Categorical (n=50, k=2, m=3) - Single swap" median=median(b_swap.times)/1e6 
mean=mean(b_swap.times)/1e6 + @test isapprox(ll_before, ll_after, atol = 1e-10) + + @info "Categorical (n=50, k=2, m=3) - Single swap" median=median(b_swap.times) / + 1e6 mean=mean(b_swap.times) / + 1e6 end - + @testset "Medium network (n=200, k=3, m=4)" begin A, labels, d = create_test_sbm_categorical(3, 200, 4) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, NetworkHistogram.Dist(d)) - + swap = NetworkHistogram.make_swap(assignment, (1, 200)) ll_before = NetworkHistogram.loglikelihood(assignment) - + b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) end samples=100 evals=1 - + ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol=1e-10) - - @info "Categorical (n=200, k=3, m=4) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + @test isapprox(ll_before, ll_after, atol = 1e-10) + + @info "Categorical (n=200, k=3, m=4) - Single swap" median=median(b_swap.times) / + 1e6 mean=mean(b_swap.times) / + 1e6 end - + @testset "Large network (n=500, k=5, m=5)" begin A, labels, d = create_test_sbm_categorical(5, 500, 5) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, NetworkHistogram.Dist(d)) - + swap = NetworkHistogram.make_swap(assignment, (1, 500)) ll_before = NetworkHistogram.loglikelihood(assignment) - + b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) end samples=50 evals=1 - + ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol=1e-10) - - @info "Categorical (n=500, k=5, m=5) - Single swap" median=median(b_swap.times)/1e6 mean=mean(b_swap.times)/1e6 + @test isapprox(ll_before, ll_after, atol = 1e-10) + + @info "Categorical (n=500, k=5, m=5) - Single swap" median=median(b_swap.times) / + 1e6 
mean=mean(b_swap.times) / + 1e6 end end - + @testset "Full Optimization Workflow" begin @testset "Bernoulli - Short optimization (n=100, k=3)" begin A, labels, d = create_test_sbm_bernoulli(3, 100) - + # Randomize initial labels initial_labels = rand(1:3, 100) - + params = NetworkHistogram.GreedyParams( 1_000, # Small number for testing NetworkHistogram.RandomNodeSwap(), @@ -198,20 +207,22 @@ end NetworkHistogram.PreviousBestValue(500), false # No progress bar for benchmarking ) - + b_optimize = @benchmark begin NetworkHistogram.nethist($A, $d, $initial_labels, $params) end samples=10 evals=1 - - @info "Bernoulli full optimization (n=100, 1k iters)" median=median(b_optimize.times)/1e6 mean=mean(b_optimize.times)/1e6 + + @info "Bernoulli full optimization (n=100, 1k iters)" median=median(b_optimize.times) / + 1e6 mean=mean(b_optimize.times) / + 1e6 end - + @testset "Categorical - Short optimization (n=100, k=3, m=3)" begin A, labels, d = create_test_sbm_categorical(3, 100, 3) - + # Randomize initial labels initial_labels = rand(1:3, 100) - + params = NetworkHistogram.GreedyParams( 1_000, NetworkHistogram.RandomNodeSwap(), @@ -219,61 +230,67 @@ end NetworkHistogram.PreviousBestValue(500), false ) - + b_optimize = @benchmark begin NetworkHistogram.nethist($A, $d, $initial_labels, $params) end samples=10 evals=1 - - @info "Categorical full optimization (n=100, 1k iters)" median=median(b_optimize.times)/1e6 mean=mean(b_optimize.times)/1e6 + + @info "Categorical full optimization (n=100, 1k iters)" median=median(b_optimize.times) / + 1e6 mean=mean(b_optimize.times) / + 1e6 end end - + @testset "Component Benchmarks" begin @testset "Assignment creation (n=200, k=3)" begin A, labels, d = create_test_sbm_bernoulli(3, 200) edgelist = NetworkHistogram.EdgeList(A) - + b_assignment = @benchmark begin NetworkHistogram.Assignment($labels, $edgelist, NetworkHistogram.Dist($d)) end samples=100 - - @info "Assignment creation (n=200)" median=median(b_assignment.times)/1e6 
mean=mean(b_assignment.times)/1e6 + + @info "Assignment creation (n=200)" median=median(b_assignment.times) / 1e6 mean=mean(b_assignment.times) / + 1e6 end - + @testset "EdgeList creation (n=200)" begin A, _, _ = create_test_sbm_bernoulli(3, 200) - + b_edgelist = @benchmark begin NetworkHistogram.EdgeList($A) end samples=100 - - @info "EdgeList creation (n=200)" median=median(b_edgelist.times)/1e6 mean=mean(b_edgelist.times)/1e6 + + @info "EdgeList creation (n=200)" median=median(b_edgelist.times) / 1e6 mean=mean(b_edgelist.times) / + 1e6 end - + @testset "Loglikelihood computation (n=200, k=3)" begin A, labels, d = create_test_sbm_bernoulli(3, 200) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, NetworkHistogram.Dist(d)) - + b_ll = @benchmark begin NetworkHistogram.loglikelihood($assignment) end samples=1000 - - @info "Loglikelihood computation (n=200)" median=median(b_ll.times)/1e3 mean=mean(b_ll.times)/1e3 + + @info "Loglikelihood computation (n=200)" median=median(b_ll.times) / 1e3 mean=mean(b_ll.times) / + 1e3 end - + @testset "Get edges in groups (n=200, k=3)" begin A, labels, d = create_test_sbm_bernoulli(3, 200) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, NetworkHistogram.Dist(d)) - + b_get_edges = @benchmark begin NetworkHistogram.get_edges_in_groups($assignment, 1, 2) end samples=1000 - - @info "Get edges in groups (n=200)" median=median(b_get_edges.times)/1e3 mean=mean(b_get_edges.times)/1e3 + + @info "Get edges in groups (n=200)" median=median(b_get_edges.times) / 1e3 mean=mean(b_get_edges.times) / + 1e3 end end end From 675a4055c98da61a61e28e7e8fe090ff6d3583c9 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 15 Oct 2025 23:09:21 +0200 Subject: [PATCH 190/266] add speed benchmark --- Project.toml | 3 + benchmark/BENCHMARKING.md | 329 +++++++++++++++++ benchmark/BENCHMARK_SUMMARY.md | 330 +++++++++++++++++ benchmark/README.md | 136 +++++++ 
benchmark/benchmark_optimization.jl | 336 ++++++++++++++++++ benchmark/benchmark_results/baseline.json | 83 +++++ .../benchmark_2025-10-15T22-51-21.json | 83 +++++ .../benchmark_2025-10-15T22-52-50.json | 83 +++++ .../benchmark_2025-10-15T22-55-28.json | 83 +++++ .../benchmark_2025-10-15T22-56-00.json | 83 +++++ .../benchmark_2025-10-15T22-58-43.json | 83 +++++ benchmark/benchmark_results/optimized.json | 83 +++++ benchmark/profile_optimization.jl | 309 ++++++++++++++++ benchmark/run_benchmarks.jl | 210 +++++++++++ benchmark/visualize_benchmarks.jl | 228 ++++++++++++ src/assignment.jl | 11 +- src/optimization/swap_workspace.jl | 42 ++- test/test_performance_regression.jl | 58 +-- 18 files changed, 2536 insertions(+), 37 deletions(-) create mode 100644 benchmark/BENCHMARKING.md create mode 100644 benchmark/BENCHMARK_SUMMARY.md create mode 100644 benchmark/README.md create mode 100644 benchmark/benchmark_optimization.jl create mode 100644 benchmark/benchmark_results/baseline.json create mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json create mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-52-50.json create mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-55-28.json create mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-56-00.json create mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-58-43.json create mode 100644 benchmark/benchmark_results/optimized.json create mode 100644 benchmark/profile_optimization.jl create mode 100644 benchmark/run_benchmarks.jl create mode 100644 benchmark/visualize_benchmarks.jl diff --git a/Project.toml b/Project.toml index 2d1219b..927c3f8 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,8 @@ authors = ["Charles Dufour", "Jake Grainger"] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" 
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" @@ -28,6 +30,7 @@ MakieExt = "Makie" [compat] Graphons = "0.1.0" +Printf = "1.11.0" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/benchmark/BENCHMARKING.md b/benchmark/BENCHMARKING.md new file mode 100644 index 0000000..3e19a0c --- /dev/null +++ b/benchmark/BENCHMARKING.md @@ -0,0 +1,329 @@ +# NetworkHistogram Performance Benchmarking Suite + +This directory contains tools for measuring and tracking the performance of +NetworkHistogram's optimization algorithms. + +## Overview + +The benchmarking suite is designed to: + +1. **Track performance improvements/regressions** over time +2. **Identify bottlenecks** in the optimization workflow +3. **Ensure optimization changes maintain correctness** +4. **Compare performance** before and after code changes + +## Files + +- `benchmark_optimization.jl` - Standalone benchmarking script that runs all + benchmarks +- `test_performance_regression.jl` - Test suite that can be run with + `Pkg.test()` +- `benchmark_results/` - Directory for storing benchmark results (created + automatically) + +## Quick Start + +### Running Benchmarks + +```bash +# From the repository root +julia --project=. benchmark_optimization.jl + +# With custom output file +julia --project=. benchmark_optimization.jl results/my_benchmark.json + +# Compare with baseline +julia --project=. benchmark_optimization.jl results/current.json results/baseline.json +``` + +### Running as Tests + +```bash +# Run all tests including performance tests +julia --project=. -e 'using Pkg; Pkg.test()' + +# Run only performance tests +julia --project=test test/test_performance_regression.jl +``` + +## Workflow for Performance Optimization + +### 1. Establish Baseline + +Before making any changes, establish a baseline: + +```bash +julia --project=. 
benchmark_optimization.jl benchmark_results/baseline.json +``` + +### 2. Make Your Changes + +Edit the source code to improve performance (e.g., optimize `apply_swap!`, +reduce allocations, etc.). + +### 3. Run Benchmarks + +```bash +julia --project=. benchmark_optimization.jl benchmark_results/after_changes.json +``` + +### 4. Compare Results + +The script will automatically compare with `baseline.json` if it exists, or you +can manually compare: + +```bash +julia --project=. benchmark_optimization.jl \ + benchmark_results/after_changes.json \ + benchmark_results/baseline.json +``` + +### 5. Verify Correctness + +Run the full test suite to ensure your changes don't break anything: + +```bash +julia --project=. -e 'using Pkg; Pkg.test()' +``` + +## Benchmark Categories + +### Single Swap Operations + +Measures the performance of a single node swap operation (apply + revert): + +- **Bernoulli networks**: Binary edge weights (0/1) + + - Small: n=50, k=2 + - Medium: n=200, k=3 + - Large: n=500, k=5 + +- **Categorical networks**: Multi-valued edge weights + - Small: n=50, k=2, m=3 + - Medium: n=200, k=3, m=4 + - Large: n=500, k=5, m=5 + +**Why it matters**: Swap operations are the core of the greedy optimization +algorithm and are called millions of times. + +### Full Optimization Workflow + +Measures end-to-end performance of the optimization process: + +- Bernoulli: n=100, 1,000 iterations +- Categorical: n=100, 1,000 iterations + +**Why it matters**: Shows real-world performance for typical use cases. + +### Component Benchmarks + +Measures individual components: + +- **Assignment creation**: Time to create initial assignment +- **EdgeList creation**: Time to convert adjacency matrix to edge list +- **Loglikelihood computation**: Time to compute total log-likelihood +- **Get edges in groups**: Time to extract edges between two groups + +**Why it matters**: Identifies which components are bottlenecks. 
+ +## Interpreting Results + +### Benchmark Output + +``` +Benchmarking Bernoulli swap (n=50, k=2)... + Median: 0.234 ms +``` + +- **Median**: The middle value (most representative of typical performance) +- **Mean**: Average value (affected by outliers) +- **Min/Max**: Best and worst case performance +- **Std**: Standard deviation (consistency of performance) + +### Performance Comparison + +``` +✓ FASTER bernoulli_swap_n50_k2: 1.23x (23.0%) + Current: 0.190 ms | Baseline: 0.234 ms + +✗ SLOWER categorical_swap_n200_k3_m4: 0.87x (-13.0%) + Current: 1.450 ms | Baseline: 1.260 ms + +≈ SIMILAR bernoulli_optimize_n100_1k: 1.02x (2.0%) + Current: 123.4 ms | Baseline: 125.9 ms +``` + +- **✓ FASTER**: >5% improvement +- **✗ SLOWER**: >5% regression +- **≈ SIMILAR**: Within ±5% + +## Key Performance Hotspots + +Based on the codebase analysis, these are the most critical areas for +optimization: + +### 1. `apply_swap!` (swap_workspace.jl, swap_categorical.jl) + +**Impact**: Called once per iteration in greedy search + +**Current approach**: + +- Iterates over all neighbors of swapped nodes +- Updates θ parameters and log-likelihoods incrementally +- Uses `deepcopy` for categorical distributions + +**Optimization opportunities**: + +- Reduce allocations in the hot path +- Optimize neighbor iteration +- Pre-allocate workspace for intermediate computations + +### 2. `get_edges_in_groups` (assignment.jl) + +**Impact**: Called during likelihood recomputation + +**Current approach**: + +- Allocates new vector for each call +- Uses `findall` to identify nodes in groups +- Iterates over all edges + +**Optimization opportunities**: + +- Pre-compute and cache group membership +- Use pre-allocated buffers +- Use views instead of copying data + +### 3. 
Edge iteration (EdgeList.jl) + +**Impact**: Used throughout the codebase + +**Current approach**: + +- Iterator-based access to edges + +**Optimization opportunities**: + +- Ensure type stability +- Minimize bounds checking +- Cache frequently accessed data + +### 4. Log-likelihood computation + +**Impact**: Called after every swap + +**Current approach**: + +- Recomputes for affected groups only (good!) +- Calls `logpdf` for each edge + +**Optimization opportunities**: + +- Batch `logpdf` computations +- Use SIMD operations where possible +- Cache intermediate results + +## Example: Optimizing a Function + +Let's say you want to optimize `apply_swap!`: + +```julia +# 1. Add profiling annotations +using Profile + +@profile begin + for i in 1:1000 + apply_swap!(assignment, swap) + revert_swap!(assignment, swap) + end +end + +Profile.print() + +# 2. Identify hot spots from profiling output + +# 3. Make targeted changes (e.g., reduce allocations) + +# 4. Benchmark before and after (run this from the shell, not the REPL): +# julia --project=. benchmark_optimization.jl +``` + +## Tips for Performance Optimization + +1. **Start with profiling**: Use `@profile` to identify actual bottlenecks +2. **Benchmark incrementally**: Make one change at a time +3. **Check allocations**: Use `@btime` with `samples=1 evals=1` to see + allocations +4. **Maintain correctness**: Always run tests after changes +5. **Consider trade-offs**: Sometimes slight speedups aren't worth added + complexity + +## Advanced Usage + +### Custom Benchmarks + +Add your own benchmarks to `benchmark_optimization.jl`: + +```julia +function benchmark_my_function() + # Setup + data = create_test_data() + + # Benchmark + b = @benchmark my_function($data) samples=100 + + return Dict( + "median_ms" => median(b.times) / 1e6, + "mean_ms" => mean(b.times) / 1e6 + ) +end +``` + +### Continuous Integration + +To track performance over time in CI: + +```yaml +# .github/workflows/benchmark.yml +- name: Run benchmarks + run: julia --project=.
benchmark_optimization.jl results/current.json + +- name: Compare with main + run: | + git checkout main + julia --project=. benchmark_optimization.jl results/baseline.json + git checkout - + julia --project=. benchmark_optimization.jl results/current.json results/baseline.json +``` + +## Troubleshooting + +### Inconsistent Results + +If you see high variance in results: + +- Close other applications +- Run with `--threads=1` to avoid threading variability +- Increase the number of samples +- Let the system warm up with a few iterations first + +### Out of Memory + +For large benchmarks: + +- Reduce the number of samples +- Run benchmarks separately instead of all at once +- Use smaller test networks + +### Compilation Effects + +Julia's JIT compilation can affect first-run timing: + +- BenchmarkTools automatically handles warmup +- For manual timing, always run at least once before measuring + +## Resources + +- [BenchmarkTools.jl documentation](https://juliaci.github.io/BenchmarkTools.jl/stable/) +- [Julia Performance Tips](https://docs.julialang.org/en/v1/manual/performance-tips/) +- [Profile module documentation](https://docs.julialang.org/en/v1/stdlib/Profile/) diff --git a/benchmark/BENCHMARK_SUMMARY.md b/benchmark/BENCHMARK_SUMMARY.md new file mode 100644 index 0000000..a576392 --- /dev/null +++ b/benchmark/BENCHMARK_SUMMARY.md @@ -0,0 +1,330 @@ +# Performance Regression Test Suite - Summary + +## What Was Created + +A comprehensive performance benchmarking and profiling suite for +NetworkHistogram optimization, consisting of: + +### 1. **Test Files** + +- `test/test_performance_regression.jl` - Performance regression tests that run + with `Pkg.test()` + - Tests for Bernoulli and Categorical networks + - Multiple network sizes (50, 200, 500 nodes) + - Single swap operations and full optimization workflows + - Component-level benchmarks + +### 2. 
**Standalone Benchmarking** + +- `benchmark_optimization.jl` - Comprehensive standalone benchmark suite + - Saves results to JSON with timestamps + - Automatic comparison with baseline + - Detailed performance metrics (median, mean, std, min, max) + +### 3. **Easy-to-Use Runner** + +- `dev/run_benchmarks.jl` - User-friendly command-line interface + - Simple commands: `baseline`, `current`, `compare`, `clean` + - Handles dependencies automatically + - Interactive confirmations for destructive operations + +### 4. **Profiling Tools** + +- `dev/profile_optimization.jl` - Profiling helper + - Profile swap operations, full optimization, or components + - Integrated flamegraph support + - Configurable network sizes and iteration counts + +### 5. **Documentation** + +- `PERFORMANCE.md` - Main performance guide +- `benchmark/BENCHMARKING.md` - Detailed benchmarking documentation +- This summary document + +## How to Use + +### Quick Start (5 minutes) + +```bash +# 1. Create baseline +julia dev/run_benchmarks.jl baseline + +# 2. Make your optimizations in src/optimization/ + +# 3. Test performance +julia dev/run_benchmarks.jl current + +# 4. Verify correctness +julia --project=. -e 'using Pkg; Pkg.test()' +``` + +### Example Output + +``` +--- Single Swap Operations (Bernoulli) --- +Benchmarking Bernoulli swap (n=50, k=2)... + Median: 0.234 ms + +======================================== +Performance Comparison vs Baseline +======================================== +✓ FASTER bernoulli_swap_n50_k2: 1.23x (23.0%) + Current: 0.190 ms | Baseline: 0.234 ms +``` + +## Key Insights from Code Analysis + +Based on analysis of `test_decorated_paper.jl` and the source code: + +### Primary Bottlenecks + +1. **`apply_swap!`** (called millions of times) + + - Location: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` + - Issues: Uses `deepcopy`, allocates during iteration + - Impact: 🔴 CRITICAL - dominates runtime + +2. 
**`get_edges_in_groups`** (called during LL updates) + + - Location: `src/assignment.jl` + - Issues: Uses `findall`, allocates new vectors + - Impact: 🟡 MODERATE - called less frequently + +3. **Edge iteration** (used throughout) + - Location: `src/EdgeList.jl` + - Issues: Iterator overhead + - Impact: 🟢 LOW - but cumulative + +### Workflow from test_decorated_paper.jl + +The typical optimization workflow: + +1. Create SBM (Stochastic Block Model) +2. Sample network from SBM +3. Initialize node labels +4. Run greedy optimization with `nethist()` + - Iteratively swap nodes between groups + - Accept swaps that improve log-likelihood +5. Measure convergence via log-likelihood + +## Benchmarked Scenarios + +### Network Sizes + +- **Small**: n=50 nodes, k=2 groups (quick iteration) +- **Medium**: n=200 nodes, k=3 groups (realistic size) +- **Large**: n=500 nodes, k=5 groups (stress test) + +### Network Types + +- **Bernoulli**: Binary edges (0/1) - simpler, faster +- **Categorical**: Multi-valued edges (m categories) - more complex + +### Benchmark Types + +- **Single swap**: Apply + revert one node swap +- **Full optimization**: Complete optimization run (1k iterations) +- **Components**: Individual function benchmarks + +## Files and Their Purpose + +``` +NetworkHistogram/ +├── PERFORMANCE.md # Main guide (START HERE) +├── test/ +│ ├── test_performance_regression.jl # CI-friendly tests +│ └── Project.toml # Added BenchmarkTools dependency +├── dev/ +│ ├── run_benchmarks.jl # 👈 Easy CLI (USE THIS) +│ ├── benchmark_optimization.jl # Core benchmarking logic +│ ├── profile_optimization.jl # Profiling helper +│ ├── BENCHMARKING.md # Detailed docs +│ ├── benchmark_results/ # Stored results (auto-created) +│ │ └── baseline.json # Your reference baseline +│ └── test_decorated_paper.jl # Original workflow example +└── src/optimization/ # 🎯 Optimize these files + ├── greedy.jl + ├── swap_workspace.jl + ├── swap_categorical.jl + └── config_rules/ +``` + +## Common Workflows + 
+### A. Making Performance Improvements + +```bash +# Step 1: Baseline +julia dev/run_benchmarks.jl baseline + +# Step 2: Profile to find bottlenecks +julia dev/profile_optimization.jl swap + +# Step 3: Make changes to src/optimization/ + +# Step 4: Benchmark +julia dev/run_benchmarks.jl current + +# Step 5: Test correctness +julia --project=. -e 'using Pkg; Pkg.test()' + +# Step 6: Repeat steps 2-5 until satisfied +``` + +### B. Comparing Two Versions + +```bash +# Benchmark version A +git checkout feature-A +julia dev/run_benchmarks.jl results_A.json + +# Benchmark version B +git checkout feature-B +julia dev/run_benchmarks.jl results_B.json + +# Compare +julia dev/run_benchmarks.jl compare results_A.json results_B.json +``` + +### C. Debugging Performance Regression + +```bash +# Find when regression occurred +git bisect start +git bisect bad HEAD +git bisect good v1.0.0 + +# For each commit +julia dev/run_benchmarks.jl +# Mark good/bad based on results +git bisect good # or bad +``` + +## Optimization Strategies + +### 1. Profile First + +Don't guess - use `profile_optimization.jl` to see what's actually slow. + +### 2. Reduce Allocations + +The biggest wins usually come from eliminating allocations in hot paths. + +**Check allocations**: + +```julia +using BenchmarkTools +@btime apply_swap!($assignment, $swap) samples=1 evals=1 +# ^^^^^ This shows allocations +``` + +**Common fixes**: + +- Pre-allocate buffers +- Use `@inbounds` (after bounds checking once) +- Avoid `deepcopy` when possible +- Use views instead of copies + +### 3. Type Stability + +Julia is fast when types are known at compile time. + +**Check type stability**: + +```julia +using Cthulhu +@descend apply_swap!(assignment, swap) +# Look for red (runtime dispatch) +``` + +### 4. SIMD/Vectorization + +For bulk operations on arrays, help the compiler vectorize. + +### 5. Cache-Friendly Access + +Access memory in order when possible (column-major for Julia). 
+ +## Expected Performance Gains + +Based on typical optimization opportunities in similar codebases: + +- **Low-hanging fruit** (reduce allocations): 20-50% speedup +- **Algorithm improvements** (better data structures): 2-10x speedup +- **SIMD/vectorization**: 2-4x speedup (for vectorizable operations) +- **Type stability fixes**: 2-5x speedup (if unstable) + +The swap operation is called O(iterations × n) times, so even small +improvements compound significantly. + +## Integration with CI/CD + +Add to `.github/workflows/benchmark.yml`: + +```yaml +name: Benchmark +on: [pull_request] + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - uses: julia-actions/setup-julia@v1 + + - name: Benchmark PR + run: julia benchmark_optimization.jl pr_results.json + + - name: Benchmark main + run: | + git fetch origin main + git checkout origin/main + julia benchmark_optimization.jl main_results.json + + - name: Compare + run: + julia dev/run_benchmarks.jl compare pr_results.json main_results.json +``` + +## Troubleshooting + +### "BenchmarkTools not found" + +```bash +julia --project=test -e 'using Pkg; Pkg.add("BenchmarkTools")' +``` + +### Results vary too much + +- Close other applications +- Disable CPU frequency scaling +- Run with `--threads=1` +- Increase sample count + +### Benchmark takes too long + +- Reduce `samples` parameter +- Use smaller networks +- Run specific benchmarks only + +## Next Steps + +1. **Establish your baseline**: `julia dev/run_benchmarks.jl baseline` +2. **Read the detailed docs**: See `benchmark/BENCHMARKING.md` +3. **Profile the code**: `julia dev/profile_optimization.jl swap` +4. **Start optimizing**: Focus on `apply_swap!` first +5. **Measure improvements**: `julia dev/run_benchmarks.jl current` +6. **Share results**: Open PR with before/after benchmarks + +## Questions? 
+ +- Check `PERFORMANCE.md` for the main guide +- Check `benchmark/BENCHMARKING.md` for the detailed docs +- Run `julia dev/run_benchmarks.jl help` +- Run `julia dev/profile_optimization.jl help` + +--- + +Happy optimizing! The suite is designed to make performance work systematic and +data-driven. 🚀 diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..755e28d --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,136 @@ +# NetworkHistogram Benchmarks + +This directory contains benchmarking and profiling tools for +NetworkHistogram.jl performance analysis. + +## Files + +### `benchmark_optimization.jl` + +Main benchmarking script that runs comprehensive performance tests. + +**Usage:** + +```bash +julia --project=. benchmark/benchmark_optimization.jl [output_file] [baseline_file] +``` + +**Features:** + +- Single swap operations (Bernoulli and Categorical networks) +- Full optimization workflows +- Component benchmarks (Assignment, EdgeList, loglikelihood) +- Automatic comparison with baseline +- Results saved as timestamped JSON files + +**Example:** + +```bash +# Run benchmarks and save to default location +julia --project=. benchmark/benchmark_optimization.jl + +# Save to a specific file +julia --project=. benchmark/benchmark_optimization.jl my_results.json + +# Compare with a custom baseline +julia --project=. benchmark/benchmark_optimization.jl current.json baseline.json +``` + +### `visualize_benchmarks.jl` + +Compare and visualize benchmark results over time. + +**Usage:** + +```bash +julia --project=. benchmark/visualize_benchmarks.jl [files...] +``` + +**Example:** + +```bash +# Compare two benchmark runs +julia --project=. benchmark/visualize_benchmarks.jl \ + benchmark/benchmark_results/baseline.json \ + benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json + +# Use all files in benchmark_results/ +julia --project=.
benchmark/visualize_benchmarks.jl --all +``` + +### `profile_optimization.jl` + +Profile code to identify performance bottlenecks. + +**Usage:** + +```bash +julia --project=. benchmark/profile_optimization.jl [scenario] +``` + +**Scenarios:** + +- `swap` - Profile single swap operations +- `optimize` - Profile full optimization run +- `components` - Profile individual components + +**Example:** + +```bash +julia --project=. benchmark/profile_optimization.jl swap +``` + +## Benchmark Results + +Results are stored in `benchmark/benchmark_results/` as JSON files with +timestamps. + +### Setting a Baseline + +To set a benchmark run as the baseline for future comparisons: + +```bash +cp benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json \ + benchmark/benchmark_results/baseline.json +``` + +## Performance Metrics + +The benchmarks track: + +- **Median time**: Most representative performance measurement +- **Mean time**: Average across all samples +- **Min/Max time**: Best and worst case performance +- **Standard deviation**: Performance consistency + +### Current Performance (October 2025) + +**Bernoulli Swap Operations:** + +- n=50, k=2: ~0.038 ms +- n=200, k=3: ~0.52 ms +- n=500, k=5: ~2.4 ms (6.3x faster than pre-optimization baseline) + +**Categorical Swap Operations:** + +- n=50, k=2, m=3: ~0.009 ms +- n=200, k=3, m=4: ~0.04 ms +- n=500, k=5, m=5: ~0.10 ms + +**Full Optimization:** + +- Bernoulli (n=100, 1k iters): ~90 ms +- Categorical (n=100, 1k iters): ~20 ms + +## Optimization History + +Major optimizations implemented: + +1. Eliminated `deepcopy` in swap operations (replaced with in-place + `copy_symarray!`) +2. Optimized `get_edges_in_groups` (removed `findall`, added direct iteration) +3. Added `@inbounds` annotations to hot paths +4. Pre-sized Set allocations to avoid resizing + +**Result:** 6.3x speedup for large Bernoulli networks while maintaining full +correctness. 
diff --git a/benchmark/benchmark_optimization.jl b/benchmark/benchmark_optimization.jl new file mode 100644 index 0000000..c48a1da --- /dev/null +++ b/benchmark/benchmark_optimization.jl @@ -0,0 +1,336 @@ +""" +Standalone benchmarking script for NetworkHistogram optimization. + +This script runs performance benchmarks and saves results to track improvements +over time. Results are saved in JSON format with timestamps. + +Usage: + julia --project=. benchmark/benchmark_optimization.jl [output_file] [baseline_file] + +If output_file is not provided, results are saved to: + benchmark/benchmark_results/benchmark_YYYY-MM-DDTHH-MM-SS.json +""" + +using Random +using StatsBase +using StaticArrays +using BenchmarkTools +using JSON3 +using Dates +using NetworkHistogram + +# Create output directory if it doesn't exist +const BENCHMARK_DIR = joinpath(@__DIR__, "benchmark_results") +mkpath(BENCHMARK_DIR) + +# Helper functions to create test networks +function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed = 42) + Random.seed!(seed) + d = NetworkHistogram.Bernoulli(0.5) + sbm = NetworkHistogram.BlockModel(n_groups, d) + + for g1 in 1:n_groups + for g2 in g1:n_groups + p = 0.1 + 0.7 * rand() + sbm[g1, g2] = NetworkHistogram.Bernoulli(p) + end + end + + base_size = n_nodes ÷ n_groups + remainder = n_nodes % n_groups + sizes = fill(base_size, n_groups) + sizes[1:remainder] .+= 1 + labels = StatsBase.inverse_rle(1:n_groups, sizes) + A = NetworkHistogram.sample(sbm, labels) + return A, labels, d +end + +function create_test_sbm_categorical( + n_groups::Int, n_nodes::Int, n_categories::Int; seed = 42) + Random.seed!(seed) + ps = SVector{n_categories}(fill(1 / n_categories, n_categories)) + d = NetworkHistogram.Cat(ps) + sbm = NetworkHistogram.BlockModel(n_groups, d) + + for g1 in 1:n_groups + for g2 in g1:n_groups + probs = rand(n_categories) + probs ./= sum(probs) + sbm[g1, g2] = NetworkHistogram.Cat(SVector{n_categories}(probs)) + end + end + + labels =
StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + # Ensure we have exactly n_nodes by padding with last group if needed + while length(labels) < n_nodes + push!(labels, n_groups) + end + A = NetworkHistogram.sample(sbm, labels) + return A, labels, d +end + +function benchmark_single_swap( + network_type, n_nodes, n_groups, n_categories = nothing; samples = 100) + if network_type == :bernoulli + A, labels, d = create_test_sbm_bernoulli(n_groups, n_nodes) + else + A, labels, d = create_test_sbm_categorical(n_groups, n_nodes, n_categories) + end + + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) + swap = NetworkHistogram.make_swap(assignment, (1, n_nodes)) + + b = @benchmark begin + NetworkHistogram.apply_swap!($assignment, $swap) + NetworkHistogram.revert_swap!($assignment, $swap) + end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=samples evals=1 + + return Dict( + "median_ms" => median(b.times) / 1e6, + "mean_ms" => mean(b.times) / 1e6, + "min_ms" => minimum(b.times) / 1e6, + "max_ms" => maximum(b.times) / 1e6, + "std_ms" => std(b.times) / 1e6 + ) +end + +function benchmark_full_optimization( + network_type, n_nodes, n_groups, n_categories = nothing, + max_iter = 1000; samples = 10) + if network_type == :bernoulli + A, labels, d = create_test_sbm_bernoulli(n_groups, n_nodes) + else + A, labels, d = create_test_sbm_categorical(n_groups, n_nodes, n_categories) + end + + initial_labels = rand(1:n_groups, n_nodes) + + b = @benchmark begin + # Create fresh params for each benchmark iteration + params = NetworkHistogram.GreedyParams( + $max_iter, + NetworkHistogram.RandomNodeSwap(), + NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue($max_iter ÷ 2), + false + ) + NetworkHistogram.nethist($A, $d, $initial_labels, params) + end samples=samples evals=1 + + return Dict( + "median_ms" => median(b.times) / 1e6, + "mean_ms" => 
mean(b.times) / 1e6, + "min_ms" => minimum(b.times) / 1e6, + "max_ms" => maximum(b.times) / 1e6, + "std_ms" => std(b.times) / 1e6 + ) +end + +function benchmark_component(component_name, setup_fn, benchmark_fn; samples = 1000) + setup_data = setup_fn() + + b = @benchmark $benchmark_fn($setup_data...) samples=samples + + return Dict( + "median_us" => median(b.times) / 1e3, + "mean_us" => mean(b.times) / 1e3, + "min_us" => minimum(b.times) / 1e3, + "max_us" => maximum(b.times) / 1e3, + "std_us" => std(b.times) / 1e3 + ) +end + +function run_all_benchmarks() + println("="^70) + println("NetworkHistogram Performance Benchmarks") + println("Started at: $(Dates.format(now(), "yyyy-mm-dd HH:MM:SS"))") + println("="^70) + + results = Dict( + "timestamp" => Dates.format(now(), "yyyy-mm-dd HH:MM:SS"), + "julia_version" => string(VERSION), + "benchmarks" => Dict() + ) + + # Single swap benchmarks - Bernoulli + println("\n--- Single Swap Operations (Bernoulli) ---") + for (n, k, s) in [(50, 2, 100), (200, 3, 100), (500, 5, 50)] + println("Benchmarking Bernoulli swap (n=$n, k=$k)...") + results["benchmarks"]["bernoulli_swap_n$(n)_k$(k)"] = benchmark_single_swap( + :bernoulli, n, k; samples = s) + r = results["benchmarks"]["bernoulli_swap_n$(n)_k$(k)"] + println(" Median: $(round(r["median_ms"], digits=3)) ms") + end + + # Single swap benchmarks - Categorical + println("\n--- Single Swap Operations (Categorical) ---") + for (n, k, m, s) in [(50, 2, 3, 100), (200, 3, 4, 100), (500, 5, 5, 50)] + println("Benchmarking Categorical swap (n=$n, k=$k, m=$m)...") + results["benchmarks"]["categorical_swap_n$(n)_k$(k)_m$(m)"] = benchmark_single_swap( + :categorical, n, k, m; samples = s) + r = results["benchmarks"]["categorical_swap_n$(n)_k$(k)_m$(m)"] + println(" Median: $(round(r["median_ms"], digits=3)) ms") + end + + # Full optimization benchmarks + println("\n--- Full Optimization Workflow ---") + println("Benchmarking Bernoulli optimization (n=100, 1k iters)...") + 
results["benchmarks"]["bernoulli_optimize_n100_1k"] = benchmark_full_optimization( + :bernoulli, 100, 3, nothing, 1000; samples = 10) + r = results["benchmarks"]["bernoulli_optimize_n100_1k"] + println(" Median: $(round(r["median_ms"], digits=1)) ms") + + println("Benchmarking Categorical optimization (n=100, 1k iters)...") + results["benchmarks"]["categorical_optimize_n100_1k"] = benchmark_full_optimization( + :categorical, 100, 3, 3, 1000; samples = 10) + r = results["benchmarks"]["categorical_optimize_n100_1k"] + println(" Median: $(round(r["median_ms"], digits=1)) ms") + + # Component benchmarks + println("\n--- Component Benchmarks ---") + + println("Benchmarking Assignment creation (n=200)...") + results["benchmarks"]["assignment_creation_n200"] = benchmark_component( + "assignment_creation", + () -> begin + A, labels, d = create_test_sbm_bernoulli(3, 200) + edgelist = NetworkHistogram.EdgeList(A) + (labels, edgelist, NetworkHistogram.Dist(d)) + end, + (labels, edgelist, dist) -> NetworkHistogram.Assignment(labels, edgelist, dist); + samples = 100 + ) + r = results["benchmarks"]["assignment_creation_n200"] + println(" Median: $(round(r["median_us"], digits=1)) μs") + + println("Benchmarking EdgeList creation (n=200)...") + results["benchmarks"]["edgelist_creation_n200"] = benchmark_component( + "edgelist_creation", + () -> begin + A, _, _ = create_test_sbm_bernoulli(3, 200) + (A,) + end, + (A,) -> NetworkHistogram.EdgeList(A); + samples = 100 + ) + r = results["benchmarks"]["edgelist_creation_n200"] + println(" Median: $(round(r["median_us"], digits=1)) μs") + + println("Benchmarking Loglikelihood computation (n=200)...") + results["benchmarks"]["loglikelihood_n200"] = benchmark_component( + "loglikelihood", + () -> begin + A, labels, d = create_test_sbm_bernoulli(3, 200) + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment( + labels, edgelist, NetworkHistogram.Dist(d)) + (assignment,) + end, + (assignment,) -> 
NetworkHistogram.loglikelihood(assignment); + samples = 1000 + ) + r = results["benchmarks"]["loglikelihood_n200"] + println(" Median: $(round(r["median_us"], digits=2)) μs") + + return results +end + +function save_results(results, output_file = nothing) + if isnothing(output_file) + timestamp = Dates.format(now(), "yyyy-mm-ddTHH-MM-SS") + output_file = joinpath(BENCHMARK_DIR, "benchmark_$timestamp.json") + end + + open(output_file, "w") do io + JSON3.pretty(io, results) + end + + println("\n" * "="^70) + println("Results saved to: $output_file") + println("="^70) + + return output_file +end + +function compare_with_baseline(results, baseline_file) + if !isfile(baseline_file) + println("\nBaseline file not found: $baseline_file") + return + end + + baseline = JSON3.read(read(baseline_file, String)) + + println("\n" * "="^70) + println("Performance Comparison vs Baseline") + println("Baseline: $(baseline["timestamp"])") + println("="^70) + + for (key, value) in results["benchmarks"] + if haskey(baseline["benchmarks"], key) + baseline_val = baseline["benchmarks"][key] + + # Determine which unit to use (ms or us) + # Check both string and symbol keys for JSON3 compatibility + if haskey(value, "median_ms") + current_median = value["median_ms"] + # Try string key first, then symbol key + if haskey(baseline_val, "median_ms") + baseline_median = baseline_val["median_ms"] + elseif haskey(baseline_val, :median_ms) + baseline_median = baseline_val[:median_ms] + else + baseline_median = get( + baseline_val, "median_us", get(baseline_val, :median_us, 0)) / 1000 + end + unit = "ms" + else + current_median = value["median_us"] + # Try string key first, then symbol key + if haskey(baseline_val, "median_us") + baseline_median = baseline_val["median_us"] + elseif haskey(baseline_val, :median_us) + baseline_median = baseline_val[:median_us] + else + baseline_median = get( + baseline_val, "median_ms", get(baseline_val, :median_ms, 0)) * 1000 + end + unit = "μs" + end + + speedup = 
baseline_median / current_median + change_pct = (speedup - 1) * 100 + + status = if speedup > 1.05 + "✓ FASTER" + elseif speedup < 0.95 + "✗ SLOWER" + else + "≈ SIMILAR" + end + + println("$status $key: $(round(speedup, digits=2))x ($(round(change_pct, sigdigits=3))%)") + println(" Current: $(round(current_median, digits=2)) $unit | Baseline: $(round(baseline_median, digits=2)) $unit") + end + end +end + +# Main execution +function main() + output_file = length(ARGS) >= 1 ? ARGS[1] : nothing + baseline_file = length(ARGS) >= 2 ? ARGS[2] : joinpath(BENCHMARK_DIR, "baseline.json") + + results = run_all_benchmarks() + saved_file = save_results(results, output_file) + + if isfile(baseline_file) + compare_with_baseline(results, baseline_file) + else + println("\nNo baseline found. To set this as baseline, run:") + println(" cp $saved_file $baseline_file") + end +end + +if abspath(PROGRAM_FILE) == @__FILE__ + main() +end diff --git a/benchmark/benchmark_results/baseline.json b/benchmark/benchmark_results/baseline.json new file mode 100644 index 0000000..4a58148 --- /dev/null +++ b/benchmark/benchmark_results/baseline.json @@ -0,0 +1,83 @@ +{ + "julia_version": "1.12.0", + "timestamp": "2025-10-15 15:54:06", + "benchmarks": { + "bernoulli_swap_n500_k5": { + "max_ms": 17.386917, + "min_ms": 15.523083, + "mean_ms": 15.75884578, + "median_ms": 15.6418545, + "std_ms": 0.3105241491716007 + }, + "assignment_creation_n200": { + "max_us": 958.667, + "min_us": 856.458, + "median_us": 891.771, + "mean_us": 892.07211, + "std_us": 18.802076604038753 + }, + "categorical_swap_n50_k2_m3": { + "max_ms": 0.060375, + "min_ms": 0.008958, + "mean_ms": 0.00995414, + "median_ms": 0.009209, + "std_ms": 0.005302552623372667 + }, + "edgelist_creation_n200": { + "max_us": 3982.25, + "min_us": 191.292, + "median_us": 250.729, + "mean_us": 287.22586, + "std_us": 373.70431110094887 + }, + "loglikelihood_n200": { + "max_us": 0.04755310621242485, + "min_us": 0.014654308617234468, + "median_us": 
0.015947895791583167, + "mean_us": 0.01565783266533066, + "std_us": 0.0015483768072472187 + }, + "bernoulli_optimize_n100_1k": { + "max_ms": 260.846875, + "min_ms": 0.562458, + "mean_ms": 56.7193917, + "median_ms": 0.7415835, + "std_ms": 107.19352380450401 + }, + "categorical_swap_n200_k3_m4": { + "max_ms": 0.05275, + "min_ms": 0.023833, + "mean_ms": 0.02478879, + "median_ms": 0.024167, + "std_ms": 0.0034973870832379988 + }, + "bernoulli_swap_n50_k2": { + "max_ms": 0.078083, + "min_ms": 0.060125, + "mean_ms": 0.06149838, + "median_ms": 0.0605625, + "std_ms": 0.002599585086474028 + }, + "categorical_swap_n500_k5_m5": { + "max_ms": 0.125292, + "min_ms": 0.092625, + "mean_ms": 0.09619675999999999, + "median_ms": 0.093375, + "std_ms": 0.006499953084791443 + }, + "categorical_optimize_n100_1k": { + "max_ms": 21.680917, + "min_ms": 0.48575, + "mean_ms": 3.8231835, + "median_ms": 0.5199585, + "std_ms": 7.301765661997686 + }, + "bernoulli_swap_n200_k3": { + "max_ms": 1.895959, + "min_ms": 1.728084, + "mean_ms": 1.74944463, + "median_ms": 1.739271, + "std_ms": 0.02638663709158731 + } + } +} \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json new file mode 100644 index 0000000..ee2cc68 --- /dev/null +++ b/benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json @@ -0,0 +1,83 @@ +{ + "julia_version": "1.12.0", + "timestamp": "2025-10-15 22:51:00", + "benchmarks": { + "bernoulli_swap_n500_k5": { + "max_ms": 3.227041, + "min_ms": 2.389583, + "mean_ms": 2.57204906, + "median_ms": 2.5154585, + "std_ms": 0.18953309587998995 + }, + "assignment_creation_n200": { + "max_us": 978.292, + "min_us": 851.25, + "median_us": 890.166, + "mean_us": 893.5574399999999, + "std_us": 27.32222851667806 + }, + "categorical_swap_n50_k2_m3": { + "max_ms": 0.027917, + "min_ms": 0.009125, + "mean_ms": 0.010145870000000001, + "median_ms": 0.009542, + "std_ms": 0.002766908359851873 + 
}, + "edgelist_creation_n200": { + "max_us": 3529.958, + "min_us": 198.958, + "median_us": 251.083, + "mean_us": 285.34033, + "std_us": 328.14675494182416 + }, + "loglikelihood_n200": { + "max_us": 0.03218937875751503, + "min_us": 0.014654308617234468, + "median_us": 0.015947895791583167, + "mean_us": 0.015587750501002001, + "std_us": 0.001110091729142502 + }, + "bernoulli_optimize_n100_1k": { + "max_ms": 125.034125, + "min_ms": 0.47775, + "mean_ms": 20.107170699999998, + "median_ms": 0.5687915, + "std_ms": 42.12092585974448 + }, + "categorical_swap_n200_k3_m4": { + "max_ms": 0.055292, + "min_ms": 0.036333, + "mean_ms": 0.03703418, + "median_ms": 0.036833, + "std_ms": 0.0018664594223699422 + }, + "bernoulli_swap_n50_k2": { + "max_ms": 0.105291, + "min_ms": 0.038791, + "mean_ms": 0.040879559999999995, + "median_ms": 0.039333, + "std_ms": 0.007915583005028433 + }, + "categorical_swap_n500_k5_m5": { + "max_ms": 0.127208, + "min_ms": 0.0955, + "mean_ms": 0.09864, + "median_ms": 0.0968955, + "std_ms": 0.006224672275612228 + }, + "categorical_optimize_n100_1k": { + "max_ms": 21.485209, + "min_ms": 0.485375, + "mean_ms": 3.8293791, + "median_ms": 0.526625, + "std_ms": 7.277684487244635 + }, + "bernoulli_swap_n200_k3": { + "max_ms": 0.883125, + "min_ms": 0.510125, + "mean_ms": 0.54511338, + "median_ms": 0.528875, + "std_ms": 0.06696204429448495 + } + } +} \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-52-50.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-52-50.json new file mode 100644 index 0000000..b0e60bf --- /dev/null +++ b/benchmark/benchmark_results/benchmark_2025-10-15T22-52-50.json @@ -0,0 +1,83 @@ +{ + "julia_version": "1.12.0", + "timestamp": "2025-10-15 22:52:28", + "benchmarks": { + "bernoulli_swap_n500_k5": { + "max_ms": 3.112333, + "min_ms": 2.408458, + "mean_ms": 2.49714256, + "median_ms": 2.465771, + "std_ms": 0.11591144798617362 + }, + "assignment_creation_n200": { + "max_us": 1056.667, + "min_us": 
854.083, + "median_us": 891.854, + "mean_us": 895.33457, + "std_us": 27.603034723460127 + }, + "categorical_swap_n50_k2_m3": { + "max_ms": 0.033792, + "min_ms": 0.009041, + "mean_ms": 0.00967924, + "median_ms": 0.009417, + "std_ms": 0.002441718719560987 + }, + "edgelist_creation_n200": { + "max_us": 3570.541, + "min_us": 200.541, + "median_us": 253.0205, + "mean_us": 283.21247999999997, + "std_us": 332.34212178512064 + }, + "loglikelihood_n200": { + "max_us": 0.044714428857715434, + "min_us": 0.014612224448897794, + "median_us": 0.015947895791583167, + "mean_us": 0.01585511623246492, + "std_us": 0.0019358509774869575 + }, + "bernoulli_optimize_n100_1k": { + "max_ms": 124.224584, + "min_ms": 0.454917, + "mean_ms": 20.3278875, + "median_ms": 0.5847705, + "std_ms": 42.30671579701058 + }, + "categorical_swap_n200_k3_m4": { + "max_ms": 0.045042, + "min_ms": 0.025542, + "mean_ms": 0.026280889999999998, + "median_ms": 0.026, + "std_ms": 0.001935421087452954 + }, + "bernoulli_swap_n50_k2": { + "max_ms": 0.050166, + "min_ms": 0.037458, + "mean_ms": 0.03860959, + "median_ms": 0.038, + "std_ms": 0.0019394555890035667 + }, + "categorical_swap_n500_k5_m5": { + "max_ms": 0.126417, + "min_ms": 0.094791, + "mean_ms": 0.10100667999999999, + "median_ms": 0.0979795, + "std_ms": 0.006914122676472129 + }, + "categorical_optimize_n100_1k": { + "max_ms": 21.871542, + "min_ms": 0.491292, + "mean_ms": 3.8501959, + "median_ms": 0.514959, + "std_ms": 7.368805184415804 + }, + "bernoulli_swap_n200_k3": { + "max_ms": 0.610667, + "min_ms": 0.511, + "mean_ms": 0.52604377, + "median_ms": 0.5165625, + "std_ms": 0.01953388411306172 + } + } +} \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-55-28.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-55-28.json new file mode 100644 index 0000000..63a888f --- /dev/null +++ b/benchmark/benchmark_results/benchmark_2025-10-15T22-55-28.json @@ -0,0 +1,83 @@ +{ + "julia_version": "1.12.0", + "timestamp": 
"2025-10-15 22:55:07", + "benchmarks": { + "bernoulli_swap_n500_k5": { + "max_ms": 2.710667, + "min_ms": 2.482959, + "mean_ms": 2.5396208799999997, + "median_ms": 2.5191045, + "std_ms": 0.05846034240358578 + }, + "assignment_creation_n200": { + "max_us": 3125.959, + "min_us": 885.666, + "median_us": 931.9585, + "mean_us": 1039.505, + "std_us": 339.0273008825351 + }, + "categorical_swap_n50_k2_m3": { + "max_ms": 0.045167, + "min_ms": 0.009416, + "mean_ms": 0.01004002, + "median_ms": 0.009666, + "std_ms": 0.0035524098208046737 + }, + "edgelist_creation_n200": { + "max_us": 3497.792, + "min_us": 191.166, + "median_us": 250.8955, + "mean_us": 285.86834000000005, + "std_us": 325.67732384836796 + }, + "loglikelihood_n200": { + "max_us": 0.04972444889779559, + "min_us": 0.014654308617234468, + "median_us": 0.015947895791583167, + "mean_us": 0.015710969939879756, + "std_us": 0.001805494529688457 + }, + "bernoulli_optimize_n100_1k": { + "max_ms": 124.640792, + "min_ms": 0.441125, + "mean_ms": 19.8177917, + "median_ms": 0.495354, + "std_ms": 41.869538317803716 + }, + "categorical_swap_n200_k3_m4": { + "max_ms": 0.043916, + "min_ms": 0.025291, + "mean_ms": 0.02627703, + "median_ms": 0.025709, + "std_ms": 0.0024766015508182085 + }, + "bernoulli_swap_n50_k2": { + "max_ms": 0.087459, + "min_ms": 0.038917, + "mean_ms": 0.04051669, + "median_ms": 0.039292, + "std_ms": 0.00566492637037033 + }, + "categorical_swap_n500_k5_m5": { + "max_ms": 0.11875, + "min_ms": 0.098333, + "mean_ms": 0.10018246, + "median_ms": 0.099583, + "std_ms": 0.00284072972236188 + }, + "categorical_optimize_n100_1k": { + "max_ms": 22.528667, + "min_ms": 0.462459, + "mean_ms": 3.9042627000000003, + "median_ms": 0.501917, + "std_ms": 7.566682937768953 + }, + "bernoulli_swap_n200_k3": { + "max_ms": 0.62325, + "min_ms": 0.530166, + "mean_ms": 0.54022791, + "median_ms": 0.5352085, + "std_ms": 0.01611283337453269 + } + } +} \ No newline at end of file diff --git 
a/benchmark/benchmark_results/benchmark_2025-10-15T22-56-00.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-56-00.json new file mode 100644 index 0000000..ca15355 --- /dev/null +++ b/benchmark/benchmark_results/benchmark_2025-10-15T22-56-00.json @@ -0,0 +1,83 @@ +{ + "julia_version": "1.12.0", + "timestamp": "2025-10-15 22:55:38", + "benchmarks": { + "bernoulli_swap_n500_k5": { + "max_ms": 2.630917, + "min_ms": 2.406333, + "mean_ms": 2.48274666, + "median_ms": 2.4808125, + "std_ms": 0.043789408323134535 + }, + "assignment_creation_n200": { + "max_us": 4273.291, + "min_us": 879.833, + "median_us": 932.104, + "mean_us": 997.5184, + "std_us": 350.62550806514275 + }, + "categorical_swap_n50_k2_m3": { + "max_ms": 0.028625, + "min_ms": 0.00975, + "mean_ms": 0.01021751, + "median_ms": 0.01, + "std_ms": 0.001868408459522746 + }, + "edgelist_creation_n200": { + "max_us": 3616.333, + "min_us": 193.791, + "median_us": 258.8955, + "mean_us": 297.08418, + "std_us": 336.1381451871029 + }, + "loglikelihood_n200": { + "max_us": 0.044547094188376754, + "min_us": 0.014654308617234468, + "median_us": 0.015948897795591183, + "mean_us": 0.016109651302605204, + "std_us": 0.0018390735764153526 + }, + "bernoulli_optimize_n100_1k": { + "max_ms": 126.498292, + "min_ms": 0.422959, + "mean_ms": 20.0525292, + "median_ms": 0.4898955, + "std_ms": 42.46023333898911 + }, + "categorical_swap_n200_k3_m4": { + "max_ms": 0.052292, + "min_ms": 0.0245, + "mean_ms": 0.025403779999999997, + "median_ms": 0.025, + "std_ms": 0.002871652896008067 + }, + "bernoulli_swap_n50_k2": { + "max_ms": 0.048666, + "min_ms": 0.037333, + "mean_ms": 0.03801714, + "median_ms": 0.0375, + "std_ms": 0.0015056452358246997 + }, + "categorical_swap_n500_k5_m5": { + "max_ms": 0.124, + "min_ms": 0.095125, + "mean_ms": 0.09845666, + "median_ms": 0.09675, + "std_ms": 0.0052526714509701375 + }, + "categorical_optimize_n100_1k": { + "max_ms": 21.61125, + "min_ms": 0.464917, + "mean_ms": 3.7754376, + "median_ms": 0.477375, + 
"std_ms": 7.284791611432641 + }, + "bernoulli_swap_n200_k3": { + "max_ms": 0.628125, + "min_ms": 0.51325, + "mean_ms": 0.52672377, + "median_ms": 0.520625, + "std_ms": 0.01733130179578166 + } + } +} \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-58-43.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-58-43.json new file mode 100644 index 0000000..00a806e --- /dev/null +++ b/benchmark/benchmark_results/benchmark_2025-10-15T22-58-43.json @@ -0,0 +1,83 @@ +{ + "julia_version": "1.12.0", + "timestamp": "2025-10-15 22:58:20", + "benchmarks": { + "bernoulli_swap_n500_k5": { + "max_ms": 2.524833, + "min_ms": 2.393667, + "mean_ms": 2.4419925, + "median_ms": 2.4398125, + "std_ms": 0.028738214054297325 + }, + "assignment_creation_n200": { + "max_us": 1435.875, + "min_us": 905.084, + "median_us": 969.5415, + "mean_us": 996.6576, + "std_us": 92.09479767702254 + }, + "categorical_swap_n50_k2_m3": { + "max_ms": 0.045875, + "min_ms": 0.0095, + "mean_ms": 0.01018415, + "median_ms": 0.0097705, + "std_ms": 0.00361557269413376 + }, + "edgelist_creation_n200": { + "max_us": 4299.417, + "min_us": 208.25, + "median_us": 288.5, + "mean_us": 319.5867, + "std_us": 403.23297395167117 + }, + "loglikelihood_n200": { + "max_us": 0.06508817635270542, + "min_us": 0.015155310621242485, + "median_us": 0.016490981963927856, + "mean_us": 0.01614670541082165, + "std_us": 0.001998467439260285 + }, + "bernoulli_optimize_n100_1k": { + "max_ms": 129.661875, + "min_ms": 64.177334, + "mean_ms": 95.95533329999999, + "median_ms": 92.004916, + "std_ms": 30.315790347507026 + }, + "categorical_swap_n200_k3_m4": { + "max_ms": 0.140084, + "min_ms": 0.039208, + "mean_ms": 0.041837980000000004, + "median_ms": 0.039875, + "std_ms": 0.010841225036988497 + }, + "bernoulli_swap_n50_k2": { + "max_ms": 0.052, + "min_ms": 0.037542, + "mean_ms": 0.03809124, + "median_ms": 0.037708, + "std_ms": 0.0018499183226304992 + }, + "categorical_swap_n500_k5_m5": { + "max_ms": 
0.127083, + "min_ms": 0.09925, + "mean_ms": 0.10115922000000001, + "median_ms": 0.1005625, + "std_ms": 0.003808675051049879 + }, + "categorical_optimize_n100_1k": { + "max_ms": 24.93025, + "min_ms": 11.742417, + "mean_ms": 17.8008251, + "median_ms": 19.0025625, + "std_ms": 4.622818379951839 + }, + "bernoulli_swap_n200_k3": { + "max_ms": 0.604917, + "min_ms": 0.511375, + "mean_ms": 0.52376955, + "median_ms": 0.5186875, + "std_ms": 0.013513957816500744 + } + } +} \ No newline at end of file diff --git a/benchmark/benchmark_results/optimized.json b/benchmark/benchmark_results/optimized.json new file mode 100644 index 0000000..e2d21d0 --- /dev/null +++ b/benchmark/benchmark_results/optimized.json @@ -0,0 +1,83 @@ +{ + "julia_version": "1.12.0", + "timestamp": "2025-10-15 22:35:21", + "benchmarks": { + "bernoulli_swap_n500_k5": { + "max_ms": 2.49025, + "min_ms": 2.412167, + "mean_ms": 2.4460266, + "median_ms": 2.440625, + "std_ms": 0.018879105065697174 + }, + "assignment_creation_n200": { + "max_us": 1020.75, + "min_us": 850.584, + "median_us": 895.833, + "mean_us": 899.73671, + "std_us": 26.985610491180616 + }, + "categorical_swap_n50_k2_m3": { + "max_ms": 0.028041, + "min_ms": 0.009167, + "mean_ms": 0.00977582, + "median_ms": 0.009459, + "std_ms": 0.002031678931426922 + }, + "edgelist_creation_n200": { + "max_us": 3978.25, + "min_us": 201.041, + "median_us": 255.0205, + "mean_us": 293.98788, + "std_us": 372.7416273862515 + }, + "loglikelihood_n200": { + "max_us": 0.04283567134268537, + "min_us": 0.014612224448897794, + "median_us": 0.015947895791583167, + "mean_us": 0.01562773947895791, + "std_us": 0.0013940979567484434 + }, + "bernoulli_optimize_n100_1k": { + "max_ms": 125.829375, + "min_ms": 0.489416, + "mean_ms": 28.099379199999998, + "median_ms": 0.5702295, + "std_ms": 51.02862436095698 + }, + "categorical_swap_n200_k3_m4": { + "max_ms": 0.044625, + "min_ms": 0.024375, + "mean_ms": 0.02514243, + "median_ms": 0.024875, + "std_ms": 0.0019936978244589937 + }, + 
"bernoulli_swap_n50_k2": { + "max_ms": 0.049542, + "min_ms": 0.037541, + "mean_ms": 0.03824543, + "median_ms": 0.03775, + "std_ms": 0.0019822861674329366 + }, + "categorical_swap_n500_k5_m5": { + "max_ms": 0.131541, + "min_ms": 0.093667, + "mean_ms": 0.09694180000000001, + "median_ms": 0.095166, + "std_ms": 0.0059769960716748284 + }, + "categorical_optimize_n100_1k": { + "max_ms": 22.4595, + "min_ms": 0.516042, + "mean_ms": 4.0880541, + "median_ms": 0.627167, + "std_ms": 7.622196654548731 + }, + "bernoulli_swap_n200_k3": { + "max_ms": 0.595833, + "min_ms": 0.516792, + "mean_ms": 0.52424699, + "median_ms": 0.5198955, + "std_ms": 0.01275029263319953 + } + } +} \ No newline at end of file diff --git a/benchmark/profile_optimization.jl b/benchmark/profile_optimization.jl new file mode 100644 index 0000000..804fc80 --- /dev/null +++ b/benchmark/profile_optimization.jl @@ -0,0 +1,309 @@ +""" +Profiling helper for NetworkHistogram optimization. + +This script helps identify performance bottlenecks using Julia's Profile module. + +Usage: + julia --project=. 
benchmark/profile_optimization.jl [scenario] + +Scenarios: + swap - Profile single swap operations + optimize - Profile full optimization run + components - Profile individual components + +The script generates profiling data and can display it as: +- Text output (default) +- FlameGraph (requires ProfileView.jl or PProf.jl) +""" + +using Profile +using Random +using StatsBase +using StaticArrays +using NetworkHistogram + +# Helper functions to create test networks +function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed = 42) + Random.seed!(seed) + d = NetworkHistogram.Bernoulli(0.5) + sbm = NetworkHistogram.BlockModel(n_groups, d) + + for g1 in 1:n_groups + for g2 in g1:n_groups + p = 0.1 + 0.7 * rand() + sbm[g1, g2] = NetworkHistogram.Bernoulli(p) + end + end + + labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + A = NetworkHistogram.sample(sbm, labels) + return A, labels, d +end + +function create_test_sbm_categorical( + n_groups::Int, n_nodes::Int, n_categories::Int; seed = 42) + Random.seed!(seed) + ps = SVector{n_categories}(fill(1 / n_categories, n_categories)) + d = NetworkHistogram.Cat(ps) + sbm = NetworkHistogram.BlockModel(n_groups, d) + + for g1 in 1:n_groups + for g2 in g1:n_groups + probs = rand(n_categories) + probs ./= sum(probs) + sbm[g1, g2] = NetworkHistogram.Cat(SVector{n_categories}(probs)) + end + end + + labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + A = NetworkHistogram.sample(sbm, labels) + return A, labels, d +end + +function profile_swap_operations(network_type = :bernoulli, n = 200, k = 3) + println("Setting up $(network_type) network (n=$n, k=$k)...") + + if network_type == :bernoulli + A, labels, d = create_test_sbm_bernoulli(k, n) + else + A, labels, d = create_test_sbm_categorical(k, n, 4) + end + + edgelist = NetworkHistogram.EdgeList(A) + assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) + swap = 
NetworkHistogram.make_swap(assignment, (1, n)) + + # Warm up + println("Warming up...") + for i in 1:100 + NetworkHistogram.apply_swap!(assignment, swap) + NetworkHistogram.revert_swap!(assignment, swap) + end + + # Profile + println("Profiling swap operations (5000 iterations)...") + Profile.clear() + @profile begin + for i in 1:5000 + NetworkHistogram.apply_swap!(assignment, swap) + NetworkHistogram.revert_swap!(assignment, swap) + end + end + + return true +end + +function profile_full_optimization( + network_type = :bernoulli, n = 200, k = 3, max_iter = 10_000) + println("Setting up $(network_type) network (n=$n, k=$k)...") + + if network_type == :bernoulli + A, labels, d = create_test_sbm_bernoulli(k, n) + else + A, labels, d = create_test_sbm_categorical(k, n, 4) + end + + initial_labels = rand(1:k, n) + params = NetworkHistogram.GreedyParams( + max_iter, + NetworkHistogram.RandomNodeSwap(), + NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue(max_iter ÷ 2), + false + ) + + # Warm up + println("Warming up...") + test_params = NetworkHistogram.GreedyParams( + 100, + NetworkHistogram.RandomNodeSwap(), + NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue(50), + false + ) + NetworkHistogram.nethist(A, d, copy(initial_labels), test_params) + + # Profile + println("Profiling full optimization ($max_iter iterations)...") + Profile.clear() + @profile NetworkHistogram.nethist(A, d, initial_labels, params) + + return true +end + +function profile_components(n = 200, k = 3) + println("Setting up network (n=$n, k=$k)...") + A, labels, d = create_test_sbm_bernoulli(k, n) + edgelist = NetworkHistogram.EdgeList(A) + + # Profile Assignment creation + println("\nProfiling Assignment creation...") + Profile.clear() + @profile begin + for i in 1:1000 + NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) + end + end + println("Results for Assignment creation:") + Profile.print(maxdepth = 15) + + # Profile EdgeList creation + 
println("\nProfiling EdgeList creation...") + Profile.clear() + @profile begin + for i in 1:1000 + NetworkHistogram.EdgeList(A) + end + end + println("Results for EdgeList creation:") + Profile.print(maxdepth = 15) + + # Profile log-likelihood computation + assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) + println("\nProfiling log-likelihood computation...") + Profile.clear() + @profile begin + for i in 1:10000 + NetworkHistogram.loglikelihood(assignment) + end + end + println("Results for log-likelihood computation:") + Profile.print(maxdepth = 15) + + # Profile get_edges_in_groups + println("\nProfiling get_edges_in_groups...") + Profile.clear() + @profile begin + for i in 1:10000 + NetworkHistogram.get_edges_in_groups(assignment, 1, 2) + end + end + println("Results for get_edges_in_groups:") + Profile.print(maxdepth = 15) + + return false # Don't print again at the end +end + +function print_results() + println("\n" * "="^70) + println("Profile Results") + println("="^70) + println("\nTop functions by exclusive time:") + Profile.print(maxdepth = 15) + + println("\n" * "="^70) + println("Generating flamegraph...") + println("="^70) + + # Try to use ProfileView if available + try + @eval using ProfileView + println("\nOpening ProfileView (flamegraph)...") + ProfileView.view() + println("Close the ProfileView window to continue...") + catch + # Try PProf + try + @eval using PProf + println("\nGenerating flamegraph with PProf...") + PProf.pprof() + catch + println("\nNo flamegraph viewer available.") + println("To visualize results, install ProfileView.jl or PProf.jl:") + println(" julia> using Pkg") + println(" julia> Pkg.add(\"ProfileView\") # or \"PProf\"") + end + end +end + +function print_help() + println(""" + NetworkHistogram Profiling Helper + ================================== + + Usage: julia --project=. 
dev/profile_optimization.jl [scenario] [options] + + Scenarios: + swap Profile swap operations (default) + swap-bernoulli Profile Bernoulli swap operations + swap-categorical Profile Categorical swap operations + optimize Profile full optimization run + components Profile individual components + help Show this message + + Options: + --n=N Number of nodes (default: 200) + --k=K Number of groups (default: 3) + --iter=N Number of iterations for optimization (default: 10000) + + Examples: + julia dev/profile_optimization.jl swap + julia dev/profile_optimization.jl swap-categorical --n=500 + julia dev/profile_optimization.jl optimize --iter=5000 + julia dev/profile_optimization.jl components + + The profiling results will show: + - Which functions consume the most time + - Call counts for each function + - Memory allocation patterns + - Call stack visualization (with flamegraph viewer) + + Tips: + - Focus on functions with high "exclusive" time + - Look for unexpected allocations + - Check for type instabilities + - Use flamegraph for visual exploration + """) +end + +function parse_args(args) + options = Dict( + :n => 200, + :k => 3, + :iter => 10_000 + ) + + for arg in args + if startswith(arg, "--n=") + options[:n] = parse(Int, split(arg, "=")[2]) + elseif startswith(arg, "--k=") + options[:k] = parse(Int, split(arg, "=")[2]) + elseif startswith(arg, "--iter=") + options[:iter] = parse(Int, split(arg, "=")[2]) + end + end + + return options +end + +function main() + if length(ARGS) == 0 || ARGS[1] in ["help", "-h", "--help"] + print_help() + return + end + + scenario = ARGS[1] + options = parse_args(ARGS[2:end]) + + should_print = if scenario == "swap" || scenario == "swap-bernoulli" + profile_swap_operations(:bernoulli, options[:n], options[:k]) + elseif scenario == "swap-categorical" + profile_swap_operations(:categorical, options[:n], options[:k]) + elseif scenario == "optimize" + profile_full_optimization(:bernoulli, options[:n], options[:k], options[:iter]) + 
elseif scenario == "components" + profile_components(options[:n], options[:k]) + else + println("Error: Unknown scenario '$scenario'") + print_help() + return + end + + if should_print + print_results() + end +end + +if abspath(PROGRAM_FILE) == @__FILE__ + main() +end diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl new file mode 100644 index 0000000..49035fc --- /dev/null +++ b/benchmark/run_benchmarks.jl @@ -0,0 +1,210 @@ +#!/usr/bin/env julia + +""" +Quick-start script for running NetworkHistogram benchmarks. + +Usage: + ./run_benchmarks.jl [command] [options] + +Commands: + baseline - Establish a baseline benchmark + current - Run current benchmarks (compares with baseline if available) + compare - Compare two benchmark files + clean - Remove all benchmark results + help - Show this help message + +Examples: + ./run_benchmarks.jl baseline + ./run_benchmarks.jl current + ./run_benchmarks.jl compare results1.json results2.json +""" + +using Pkg + +# Ensure we're in the right directory +cd(dirname(@__DIR__)) + +function print_help() + println(""" + NetworkHistogram Benchmark Runner + ================================== + + Usage: julia run_benchmarks.jl [command] [options] + + Commands: + baseline Create a baseline benchmark + current Run current benchmarks + compare FILE1 FILE2 Compare two benchmark files + clean Remove all benchmark results + help Show this message + + Examples: + julia run_benchmarks.jl baseline + julia run_benchmarks.jl current + julia run_benchmarks.jl compare results/v1.json results/v2.json + """) +end + +function ensure_dependencies() + println("Checking dependencies...") + + # Check if BenchmarkTools and JSON3 are available + try + @eval using BenchmarkTools + @eval using JSON3 + catch + println("Installing required dependencies...") + Pkg.activate("test") + Pkg.add(["BenchmarkTools", "JSON3"]) + Pkg.activate(".") + end + + println("Dependencies OK ✓") +end + +function run_baseline() + ensure_dependencies() + + 
baseline_file = joinpath("dev", "benchmark_results", "baseline.json") + + if isfile(baseline_file) + print("Baseline already exists. Overwrite? (y/N): ") + response = readline() + if lowercase(strip(response)) != "y" + println("Aborted.") + return + end + end + + println("\nRunning baseline benchmarks...") + println("This may take several minutes...\n") + + run(`julia --project=. benchmark_optimization.jl $baseline_file`) + + println("\n✓ Baseline established at: $baseline_file") + println("\nNext steps:") + println(" 1. Make your performance improvements") + println(" 2. Run: julia run_benchmarks.jl current") + println(" 3. Review the performance comparison") +end + +function run_current() + ensure_dependencies() + + baseline_file = joinpath("dev", "benchmark_results", "baseline.json") + + if !isfile(baseline_file) + println("⚠ Warning: No baseline found!") + println("Consider running: julia run_benchmarks.jl baseline") + println("\nContinuing anyway...\n") + end + + timestamp = Dates.format(Dates.now(), "yyyy-mm-ddTHH-MM-SS") + current_file = joinpath("dev", "benchmark_results", "current_$timestamp.json") + + println("Running current benchmarks...") + println("This may take several minutes...\n") + + if isfile(baseline_file) + run(`julia --project=. benchmark_optimization.jl $current_file $baseline_file`) + else + run(`julia --project=. benchmark_optimization.jl $current_file`) + end + + println("\n✓ Results saved to: $current_file") +end + +function compare_benchmarks(file1, file2) + ensure_dependencies() + + if !isfile(file1) + println("Error: File not found: $file1") + return + end + + if !isfile(file2) + println("Error: File not found: $file2") + return + end + + println("Comparing benchmarks...") + println(" Baseline: $file2") + println(" Current: $file1\n") + + # Re-run comparison + run(`julia --project=. 
benchmark_optimization.jl $file1 $file2`) +end + +function clean_results() + results_dir = joinpath("dev", "benchmark_results") + + if !isdir(results_dir) + println("No results directory found.") + return + end + + files = filter(f -> endswith(f, ".json") && f != "baseline.json", readdir(results_dir)) + + if isempty(files) + println("No benchmark results to clean.") + return + end + + println("Found $(length(files)) benchmark result file(s):") + for f in files + println(" - $f") + end + + print("\nDelete these files? (y/N): ") + response = readline() + + if lowercase(strip(response)) == "y" + for f in files + rm(joinpath(results_dir, f)) + end + println("✓ Cleaned $(length(files)) file(s)") + else + println("Aborted.") + end +end + +function run_tests() + ensure_dependencies() + + println("Running full test suite...") + Pkg.test() +end + +# Main execution +function main() + if length(ARGS) == 0 || ARGS[1] == "help" || ARGS[1] == "-h" || ARGS[1] == "--help" + print_help() + return + end + + command = ARGS[1] + + if command == "baseline" + run_baseline() + elseif command == "current" + run_current() + elseif command == "compare" + if length(ARGS) < 3 + println("Error: compare requires two file arguments") + println("Usage: julia run_benchmarks.jl compare FILE1 FILE2") + return + end + compare_benchmarks(ARGS[2], ARGS[3]) + elseif command == "clean" + clean_results() + elseif command == "test" + run_tests() + else + println("Error: Unknown command '$command'") + print_help() + end +end + +if abspath(PROGRAM_FILE) == @__FILE__ + main() +end diff --git a/benchmark/visualize_benchmarks.jl b/benchmark/visualize_benchmarks.jl new file mode 100644 index 0000000..adde75d --- /dev/null +++ b/benchmark/visualize_benchmarks.jl @@ -0,0 +1,228 @@ +""" +Visualize benchmark results over time. + +This script reads multiple benchmark result files and creates a simple +comparison table or plot showing performance trends. + +Usage: + julia --project=. 
benchmark/visualize_benchmarks.jl [files...] + julia --project=. benchmark/visualize_benchmarks.jl --all # Use all files in benchmark_results/ + +Example: + julia benchmark/visualize_benchmarks.jl \\ + benchmark/benchmark_results/baseline.json \\ + benchmark/benchmark_results/current_2024-10-15.json +""" + +using JSON3 +using Dates +using Printf +using Statistics + +function load_benchmark(filepath) + if !isfile(filepath) + @warn "File not found: $filepath" + return nothing + end + + data = JSON3.read(read(filepath, String)) + return data +end + +function extract_key_metrics(benchmark_data) + metrics = Dict{String, Float64}() + + for (name, values) in benchmark_data["benchmarks"] + name_str = string(name) # Convert Symbol to String + if haskey(values, "median_ms") + metrics[name_str] = values["median_ms"] + elseif haskey(values, "median_us") + metrics[name_str] = values["median_us"] / 1000.0 # Convert to ms + end + end + + return metrics +end + +function compare_multiple(files) + if isempty(files) + println("No files provided") + return + end + + # Load all benchmarks + benchmarks = [] + for file in files + data = load_benchmark(file) + if !isnothing(data) + push!(benchmarks, + ( + file = basename(file), + timestamp = data["timestamp"], + metrics = extract_key_metrics(data) + )) + end + end + + if isempty(benchmarks) + println("No valid benchmark files found") + return + end + + # Sort by timestamp + sort!(benchmarks, by = b -> b.timestamp) + + # Get all metric names + all_metrics = Set{String}() + for b in benchmarks + union!(all_metrics, keys(b.metrics)) + end + all_metrics = sort(collect(all_metrics)) + + # Print header + println("\n" * "="^100) + println("Benchmark Comparison Across Versions") + println("="^100) + + # Create table + header = ["Benchmark", [b.file for b in benchmarks]...] + col_widths = [40, fill(18, length(benchmarks))...] 
+ + # Print header + print(rpad("Benchmark", col_widths[1])) + for (i, b) in enumerate(benchmarks) + print(rpad(b.file[1:min(end, 16)], col_widths[i + 1])) + end + println() + print(rpad("", col_widths[1])) + for (i, b) in enumerate(benchmarks) + print(rpad(b.timestamp[1:min(end, 16)], col_widths[i + 1])) + end + println() + println("-"^sum(col_widths)) + + # Print each metric + for metric in all_metrics + # Skip if metric has no values + values = [haskey(b.metrics, metric) ? b.metrics[metric] : NaN for b in benchmarks] + if all(isnan, values) + continue + end + + # Shorten metric name for display + display_name = metric + if length(display_name) > col_widths[1] - 2 + display_name = metric[1:(col_widths[1] - 5)] * "..." + end + + print(rpad(display_name, col_widths[1])) + + baseline_val = values[1] + for (i, val) in enumerate(values) + if isnan(val) + print(rpad("N/A", col_widths[i + 1])) + else + speedup = if !isnan(baseline_val) && baseline_val > 0 && i > 1 + baseline_val / val + else + 1.0 + end + + # Format with speedup indicator + val_str = @sprintf("%.2f ms", val) + if i > 1 && !isnan(baseline_val) + if speedup > 1.05 + val_str *= " ✓" + elseif speedup < 0.95 + val_str *= " ✗" + end + end + print(rpad(val_str, col_widths[i + 1])) + end + end + println() + end + + println("="^100) + println("\nLegend: ✓ = >5% faster, ✗ = >5% slower") + println() + + # Calculate aggregate statistics + if length(benchmarks) >= 2 + println("Overall Summary:") + println("-" * "="^50) + + baseline = benchmarks[1] + for i in 2:length(benchmarks) + current = benchmarks[i] + + speedups = Float64[] + for metric in all_metrics + if haskey(baseline.metrics, metric) && haskey(current.metrics, metric) + base_val = baseline.metrics[metric] + curr_val = current.metrics[metric] + if base_val > 0 && curr_val > 0 + push!(speedups, base_val / curr_val) + end + end + end + + if !isempty(speedups) + median_speedup = median(speedups) + geomean_speedup = exp(mean(log.(speedups))) + faster_count = 
count(s -> s > 1.05, speedups) + slower_count = count(s -> s < 0.95, speedups) + similar_count = length(speedups) - faster_count - slower_count + + println("\n$(current.file) vs $(baseline.file):") + println(" Geometric mean speedup: $(round(geomean_speedup, digits=2))x") + println(" Median speedup: $(round(median_speedup, digits=2))x") + println(" Benchmarks: $faster_count faster, $slower_count slower, $similar_count similar") + end + end + end +end + +function main() + if length(ARGS) == 0 || ARGS[1] in ["-h", "--help", "help"] + println(""" + Visualize NetworkHistogram Benchmark Results + ============================================= + + Usage: julia dev/visualize_benchmarks.jl [options] [files...] + + Options: + --all Compare all files in benchmark_results/ + -h, --help Show this help + + Examples: + # Compare specific files + julia dev/visualize_benchmarks.jl \\ + benchmark_results/baseline.json \\ + benchmark_results/current.json + + # Compare all available benchmarks + julia dev/visualize_benchmarks.jl --all + """) + return + end + + files = if ARGS[1] == "--all" + results_dir = joinpath("dev", "benchmark_results") + if !isdir(results_dir) + println("Error: benchmark_results directory not found") + return + end + + all_files = filter(f -> endswith(f, ".json"), readdir(results_dir)) + sort!([joinpath(results_dir, f) for f in all_files]) + else + ARGS + end + + compare_multiple(files) +end + +if abspath(PROGRAM_FILE) == @__FILE__ + main() +end diff --git a/src/assignment.jl b/src/assignment.jl index 1fe2601..ac7df15 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -125,12 +125,15 @@ end function get_edges_in_groups(node_labels, edges_all, g1, g2) edges = Vector{edge_type(edges_all)}() - nodes_g1 = findall(x -> x == g1, node_labels) - nodes_g2 = findall(x -> x == g2, node_labels) + # Pre-size the vector to avoid repeated reallocations + sizehint!(edges, 32) - for u in nodes_g1 + @inbounds for u in eachindex(node_labels) + if node_labels[u] != g1 + 
continue + end for (v, e) in iterate_neighbors(edges_all, u) - if v in nodes_g2 && ((g1 == g2 && u < v) || g1 != g2) + if node_labels[v] == g2 && ((g1 == g2 && u < v) || g1 != g2) push!(edges, e) end end diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 653fd00..e533955 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -4,7 +4,11 @@ mutable struct WorkspaceSwap{D, F} end function make_workspace(a::Assignment) - return WorkspaceSwap(deepcopy(a.θ), deepcopy(a.log_likelihood)) + # Pre-allocate workspace with same structure + k = number_groups(a) + θ_copy = SymArray(k, zero(a.θ[1, 1])) + ll_copy = SymArray(k, 0.0) + return WorkspaceSwap(θ_copy, ll_copy) end mutable struct Swap{W} @@ -13,18 +17,31 @@ mutable struct Swap{W} workspace::W end +function copy_symarray!(dest::SymArray, src::SymArray) + # In-place copy without allocation + # SymArray stores data in a dictionary .d + # Just overwrite the values - don't empty first! 
+ @inbounds for key in keys(src.d) + dest.d[key] = src.d[key] + end +end + function make_swap_workspace!(ws, a::Assignment) - ws.θ = deepcopy(a.θ) - ws.log_likelihood_per_group = deepcopy(a.log_likelihood) + # Use in-place copy instead of deepcopy + copy_symarray!(ws.θ, a.θ) + copy_symarray!(ws.log_likelihood_per_group, a.log_likelihood) end function revert_swap_workspace!(a::Assignment, ws) - a.θ = deepcopy(ws.θ) - a.log_likelihood = deepcopy(ws.log_likelihood_per_group) + # Use in-place copy instead of deepcopy + copy_symarray!(a.θ, ws.θ) + copy_symarray!(a.log_likelihood, ws.log_likelihood_per_group) end function make_swap(a::Assignment, id) - return Swap(id[1], id[2], make_workspace(a)) + ws = make_workspace(a) + make_swap_workspace!(ws, a) # Actually copy the current state + return Swap(id[1], id[2], ws) end function make_swap!(swap::Swap, a::Assignment, id) @@ -57,9 +74,14 @@ function apply_swap!(a::Assignment, s::Swap) u, v = s.u, s.v gu = a.node_labels[u] gv = a.node_labels[v] - groups_concerned = Set{Tuple{Int, Int}}([minmax(gu, gv)]) - for (node, d) in iterate_neighbors(a.dists, u) + # Pre-allocate with reasonable capacity to avoid resizing + # Most swaps affect at most degree(u) + degree(v) + 1 group pairs + groups_concerned = Set{Tuple{Int, Int}}() + sizehint!(groups_concerned, 16) # Reasonable default + push!(groups_concerned, minmax(gu, gv)) + + @inbounds for (node, d) in iterate_neighbors(a.dists, u) if node == v continue end @@ -70,7 +92,7 @@ function apply_swap!(a::Assignment, s::Swap) push!(groups_concerned, minmax(gv, g1)) end - for (index, (node, d)) in enumerate(iterate_neighbors(a.dists, v)) + @inbounds for (index, (node, d)) in enumerate(iterate_neighbors(a.dists, v)) if node == u continue end @@ -82,7 +104,7 @@ function apply_swap!(a::Assignment, s::Swap) end swap_node_labels!(a, u, v) - for (g1, g2) in groups_concerned + @inbounds for (g1, g2) in groups_concerned a.log_likelihood[g1, g2] = 0.0 for e in get_edges_in_groups(a.node_labels, 
a.edges, g1, g2) a.log_likelihood[g1, g2] += logpdf(a.θ[g1, g2], e) diff --git a/test/test_performance_regression.jl b/test/test_performance_regression.jl index 1e7ca95..02c09ca 100644 --- a/test/test_performance_regression.jl +++ b/test/test_performance_regression.jl @@ -31,7 +31,13 @@ function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed = 42) end end - labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + # Ensure labels has exactly n_nodes elements + base_size = n_nodes ÷ n_groups + remainder = n_nodes % n_groups + sizes = fill(base_size, n_groups) + sizes[1:remainder] .+= 1 # Distribute remainder to first groups + labels = StatsBase.inverse_rle(1:n_groups, sizes) + @assert length(labels) == n_nodes A = NetworkHistogram.sample(sbm, labels) return A, labels, d end @@ -52,7 +58,13 @@ function create_test_sbm_categorical( end end - labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + # Ensure labels has exactly n_nodes elements + base_size = n_nodes ÷ n_groups + remainder = n_nodes % n_groups + sizes = fill(base_size, n_groups) + sizes[1:remainder] .+= 1 # Distribute remainder to first groups + labels = StatsBase.inverse_rle(1:n_groups, sizes) + @assert length(labels) == n_nodes A = NetworkHistogram.sample(sbm, labels) return A, labels, d end @@ -72,7 +84,7 @@ end b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) - end samples=100 evals=1 + end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=100 evals=1 # Verify correctness ll_after = NetworkHistogram.loglikelihood(assignment) @@ -94,7 +106,7 @@ end b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) - end samples=100 evals=1 + end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=100 evals=1 ll_after = 
NetworkHistogram.loglikelihood(assignment) @test isapprox(ll_before, ll_after, atol = 1e-10) @@ -115,7 +127,7 @@ end b_swap = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) - end samples=50 evals=1 + end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=50 evals=1 ll_after = NetworkHistogram.loglikelihood(assignment) @test isapprox(ll_before, ll_after, atol = 1e-10) @@ -200,16 +212,16 @@ end # Randomize initial labels initial_labels = rand(1:3, 100) - params = NetworkHistogram.GreedyParams( - 1_000, # Small number for testing - NetworkHistogram.RandomNodeSwap(), - NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(500), - false # No progress bar for benchmarking - ) - b_optimize = @benchmark begin - NetworkHistogram.nethist($A, $d, $initial_labels, $params) + # Create fresh params for each benchmark iteration + params = NetworkHistogram.GreedyParams( + 1_000, # Small number for testing + NetworkHistogram.RandomNodeSwap(), + NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue(500), + false # No progress bar for benchmarking + ) + NetworkHistogram.nethist($A, $d, $initial_labels, params) end samples=10 evals=1 @info "Bernoulli full optimization (n=100, 1k iters)" median=median(b_optimize.times) / @@ -223,16 +235,16 @@ end # Randomize initial labels initial_labels = rand(1:3, 100) - params = NetworkHistogram.GreedyParams( - 1_000, - NetworkHistogram.RandomNodeSwap(), - NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(500), - false - ) - b_optimize = @benchmark begin - NetworkHistogram.nethist($A, $d, $initial_labels, $params) + # Create fresh params for each benchmark iteration + params = NetworkHistogram.GreedyParams( + 1_000, + NetworkHistogram.RandomNodeSwap(), + NetworkHistogram.Strict(), + NetworkHistogram.PreviousBestValue(500), + false + ) + NetworkHistogram.nethist($A, $d, $initial_labels, params) end samples=10 evals=1 
@info "Categorical full optimization (n=100, 1k iters)" median=median(b_optimize.times) / From 81d629efe5daeb9a3c98034e2612f056ee837775 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 15 Oct 2025 23:32:15 +0200 Subject: [PATCH 191/266] updated categorical updates --- .../benchmark_2025-10-15T23-27-10.json | 83 +++++ src/EdgeList.jl | 17 - src/assignment.jl | 5 +- src/block_model.jl | 11 + src/distributions/distributions_type.jl | 13 +- src/optimization/swap_categorical.jl | 77 ++++- src/optimization/swap_workspace.jl | 27 +- src/utils/SymArray.jl | 25 +- test/runtests.jl | 16 - test/test_performance_regression.jl | 308 ------------------ 10 files changed, 208 insertions(+), 374 deletions(-) create mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T23-27-10.json delete mode 100644 test/test_performance_regression.jl diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T23-27-10.json b/benchmark/benchmark_results/benchmark_2025-10-15T23-27-10.json new file mode 100644 index 0000000..a1a5894 --- /dev/null +++ b/benchmark/benchmark_results/benchmark_2025-10-15T23-27-10.json @@ -0,0 +1,83 @@ +{ + "julia_version": "1.12.0", + "timestamp": "2025-10-15 23:26:47", + "benchmarks": { + "bernoulli_swap_n500_k5": { + "max_ms": 3.89675, + "min_ms": 2.490625, + "mean_ms": 2.67821422, + "median_ms": 2.5600625, + "std_ms": 0.30520369156501004 + }, + "assignment_creation_n200": { + "max_us": 1076.709, + "min_us": 918.375, + "median_us": 966.5625, + "mean_us": 967.24371, + "std_us": 27.151506778246986 + }, + "categorical_swap_n50_k2_m3": { + "max_ms": 0.031625, + "min_ms": 0.008, + "mean_ms": 0.00841167, + "median_ms": 0.008125, + "std_ms": 0.0023503062085265367 + }, + "edgelist_creation_n200": { + "max_us": 3455.5, + "min_us": 196.875, + "median_us": 253.354, + "mean_us": 289.45708, + "std_us": 320.40221672081117 + }, + "loglikelihood_n200": { + "max_us": 0.039495991983967936, + "min_us": 0.014612224448897794, + "median_us": 0.015947895791583167, + "mean_us": 
0.015709479959919836, + "std_us": 0.001707683939625983 + }, + "bernoulli_optimize_n100_1k": { + "max_ms": 131.9495, + "min_ms": 63.998125, + "mean_ms": 96.4309377, + "median_ms": 92.050771, + "std_ms": 31.092325846843277 + }, + "categorical_swap_n200_k3_m4": { + "max_ms": 0.070375, + "min_ms": 0.034333, + "mean_ms": 0.03502833, + "median_ms": 0.034666, + "std_ms": 0.0035768376333930193 + }, + "bernoulli_swap_n50_k2": { + "max_ms": 0.057292, + "min_ms": 0.042458, + "mean_ms": 0.04351547, + "median_ms": 0.043292, + "std_ms": 0.001701379543966091 + }, + "categorical_swap_n500_k5_m5": { + "max_ms": 0.112416, + "min_ms": 0.088375, + "mean_ms": 0.08935744, + "median_ms": 0.088833, + "std_ms": 0.003347376689350657 + }, + "categorical_optimize_n100_1k": { + "max_ms": 17.914792, + "min_ms": 8.9885, + "mean_ms": 13.7015751, + "median_ms": 14.930396, + "std_ms": 3.4717329142617093 + }, + "bernoulli_swap_n200_k3": { + "max_ms": 0.774791, + "min_ms": 0.534542, + "mean_ms": 0.5486900100000001, + "median_ms": 0.5379165, + "std_ms": 0.03468300186071855 + } + } +} \ No newline at end of file diff --git a/src/EdgeList.jl b/src/EdgeList.jl index 7c20abe..3a09a8b 100644 --- a/src/EdgeList.jl +++ b/src/EdgeList.jl @@ -129,23 +129,6 @@ function get_edge(A::EdgeList{E}, i::Int, j::Int) where {E} return zero(E) # If edge not found in the iteration end -# function EdgeList(A::AbstractMatrix{<:Union{Missing,E}}) where {E} -# n = size(A, 1) -# data = Vector{Vector{E}}(undef, n) -# name_list = Vector{Vector{Int}}(undef, n) -# for j in 1:n -# data[j] = Vector{E}(undef, 0) -# name_list[j] = Vector{Int}(undef, 0) -# for i in 1:n -# if !ismissing(A[i,j]) # gonna be an issue with MC! 
have to define 0 chain and fast operations on them -# push!(name_list[j], i) -# push!(data[j], A[i, j]) -# end -# end -# end -# return EdgeList(data, name_list) -# end - # Internal function to convert adjacency matrix to EdgeList format function _from_adj_to_edge_list( A::AbstractMatrix, function_to_apply = identity) diff --git a/src/assignment.jl b/src/assignment.jl index ac7df15..57115c3 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -45,6 +45,9 @@ mutable struct Assignment{E, D, F, W, V <: AbstractVector{Int}} additional_workspace::W end +# Default capacity for edge collection - typical node degree in sparse networks +const DEFAULT_EDGE_CAPACITY = 32 + """ number_nodes(a::Assignment) @@ -126,7 +129,7 @@ end function get_edges_in_groups(node_labels, edges_all, g1, g2) edges = Vector{edge_type(edges_all)}() # Pre-size the vector to avoid repeated reallocations - sizehint!(edges, 32) + sizehint!(edges, DEFAULT_EDGE_CAPACITY) @inbounds for u in eachindex(node_labels) if node_labels[u] != g1 diff --git a/src/block_model.jl b/src/block_model.jl index f4caca2..ab1cc7d 100644 --- a/src/block_model.jl +++ b/src/block_model.jl @@ -61,6 +61,7 @@ end Create a block model with `k` uniform-sized blocks, each initialized with distribution `d`. 
""" function BlockModel(k::Int, d::D) where {D} + k > 0 || throw(ArgumentError("Number of blocks k=$k must be positive")) sizes = fill(1 / k, k) cumulative_sizes = cumsum(sizes) _dists = SymArray(k, d) @@ -78,6 +79,11 @@ function BlockModel(a::Assignment) sizes = proportions(a) cumulative_sizes = cumsum(sizes) _dists = unwrap.(a.θ) + + # Validate that sizes sum to approximately 1.0 + size_sum = sum(sizes) + abs(size_sum - 1.0) < 1e-10 || @warn "Block sizes sum to $size_sum, expected 1.0" + return BlockModel(_dists, sizes, cumulative_sizes) end @@ -91,6 +97,11 @@ function BlockModel(nodes_labels, θ) sizes = counts(nodes_labels) / length(nodes_labels) cumulative_sizes = cumsum(sizes) _dists = unwrap.(θ) + + # Validate that sizes sum to approximately 1.0 + size_sum = sum(sizes) + abs(size_sum - 1.0) < 1e-10 || @warn "Block sizes sum to $size_sum, expected 1.0" + return BlockModel(_dists, sizes, cumulative_sizes) end diff --git a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl index 349faa4..2fc8fd4 100644 --- a/src/distributions/distributions_type.jl +++ b/src/distributions/distributions_type.jl @@ -44,7 +44,9 @@ struct Dist{D} dist::D counts::Int function Dist(d, counts::Int) - counts < 0 ? 
error("Counts cannot be negative") : + if counts < 0 + throw(ArgumentError("Counts ($counts) cannot be negative")) + end new{typeof(d)}(d, counts) end end @@ -233,6 +235,15 @@ For a distribution to work with NetworkHistogram, it must implement: """ struct Bernoulli{T <: Real} p::T + function Bernoulli(p::T) where {T <: Real} + if isnan(p) || isinf(p) + throw(ArgumentError("Bernoulli parameter p=$p must be finite")) + end + if !(0 <= p <= 1) + throw(ArgumentError("Bernoulli parameter p=$p must be in [0, 1]")) + end + new{T}(p) + end end zero(d::Bernoulli) = Bernoulli(zero(d.p)) diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl index 4454086..f816295 100644 --- a/src/optimization/swap_categorical.jl +++ b/src/optimization/swap_categorical.jl @@ -45,29 +45,84 @@ function Assignment( estimated[g1, g2], realized[g1, g2]) end end - w = WorkspaceDiscreteSwap(deepcopy(log_likelihood_per_group), - counts, deepcopy(realized), deepcopy(estimated)) - return Assignment( + + # Pre-allocate workspace with copies of current state + w = WorkspaceDiscreteSwap( + SymArray(n_groups, 0.0), + SymArray(n_groups, 0), + SymArray(n_groups, zeros(Float64, M)), + SymArray(n_groups, zeros(Float64, M)) + ) + + # Create assignment first + assignment = Assignment( node_labels, edge_list, dists, θ, log_likelihood_per_group, w) + + # Now copy the actual workspace data into w + for g2 in 1:n_groups, g1 in g2:n_groups + w.log_likelihood_per_group[g1, g2] = log_likelihood_per_group[g1, g2] + w.counts[g1, g2] = counts[g1, g2] + copyto!(w.realized[g1, g2], realized[g1, g2]) + copyto!(w.estimated[g1, g2], estimated[g1, g2]) + end + + return assignment end function make_workspace(a::Assignment{E, Dist{D}, F, W}) where {E, F, D <: Cat, W} - return deepcopy(a.additional_workspace) + # Pre-allocate workspace instead of deepcopy + k = number_groups(a) + m = num_categories(unwrap(a.θ[1, 1])) + + log_ll = SymArray(k, 0.0) + counts = SymArray(k, 0) + realized = SymArray(k, 
zeros(Float64, m)) + estimated = SymArray(k, zeros(Float64, m)) + + return WorkspaceDiscreteSwap(log_ll, counts, realized, estimated) +end + +function copy_categorical_workspace!( + dest::WorkspaceDiscreteSwap, src_assignment::Assignment) + # In-place copy without allocation + copy_symarray!(dest.log_likelihood_per_group, src_assignment.log_likelihood) + + src_ws = src_assignment.additional_workspace + # Copy counts (scalars) + copy_symarray!(dest.counts, src_ws.counts) + + # Copy vector-valued SymArrays element by element + @inbounds for key in keys(src_ws.realized.d) + copyto!(dest.realized.d[key], src_ws.realized.d[key]) + end + + @inbounds for key in keys(src_ws.estimated.d) + copyto!(dest.estimated.d[key], src_ws.estimated.d[key]) + end end function make_swap_workspace!(ws::WorkspaceDiscreteSwap, a::Assignment) - ws.log_likelihood_per_group = deepcopy(a.log_likelihood) - ws.realized = deepcopy(a.additional_workspace.realized) - ws.estimated = deepcopy(a.additional_workspace.estimated) + # Use in-place copy instead of deepcopy + copy_categorical_workspace!(ws, a) end function revert_swap_workspace!(a::Assignment, ws::WorkspaceDiscreteSwap) - a.log_likelihood = deepcopy(ws.log_likelihood_per_group) + # Use in-place copy instead of deepcopy + copy_symarray!(a.log_likelihood, ws.log_likelihood_per_group) + as = a.additional_workspace - as.log_likelihood_per_group = deepcopy(ws.log_likelihood_per_group) - as.realized = deepcopy(ws.realized) - as.estimated = deepcopy(ws.estimated) + copy_symarray!(as.log_likelihood_per_group, ws.log_likelihood_per_group) + copy_symarray!(as.counts, ws.counts) + + # Copy vector-valued SymArrays element by element + @inbounds for key in keys(ws.realized.d) + copyto!(as.realized.d[key], ws.realized.d[key]) + end + + @inbounds for key in keys(ws.estimated.d) + copyto!(as.estimated.d[key], ws.estimated.d[key]) + end end function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) diff --git a/src/optimization/swap_workspace.jl 
b/src/optimization/swap_workspace.jl index e533955..8774ef6 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -1,6 +1,10 @@ -mutable struct WorkspaceSwap{D, F} +# Reasonable default capacity for affected groups in a swap +const MAX_AFFECTED_GROUPS = 16 + +mutable struct WorkspaceSwap{D, F, G} θ::SymArray{D} log_likelihood_per_group::SymArray{F} + groups_buffer::G # Pre-allocated buffer for affected group pairs end function make_workspace(a::Assignment) @@ -8,7 +12,9 @@ function make_workspace(a::Assignment) k = number_groups(a) θ_copy = SymArray(k, zero(a.θ[1, 1])) ll_copy = SymArray(k, 0.0) - return WorkspaceSwap(θ_copy, ll_copy) + groups_buffer = Set{Tuple{Int, Int}}() + sizehint!(groups_buffer, MAX_AFFECTED_GROUPS) + return WorkspaceSwap(θ_copy, ll_copy, groups_buffer) end mutable struct Swap{W} @@ -60,25 +66,14 @@ function swap_node_labels!(a::Assignment, i, j) a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] end -# for reference and testing -function _slow_swap!(a::Assignment, s::Swap) - swap_node_labels!(a, s.u, s.v) - a.θ, - a.log_likelihood = _compute_theta_and_ll( - a.node_labels, a.dists, a.edges, a.θ[1, 1]) -end - -# apply_swap!(a::Assignment, s::Swap) = _slow_swap!(a, s) - function apply_swap!(a::Assignment, s::Swap) u, v = s.u, s.v gu = a.node_labels[u] gv = a.node_labels[v] - # Pre-allocate with reasonable capacity to avoid resizing - # Most swaps affect at most degree(u) + degree(v) + 1 group pairs - groups_concerned = Set{Tuple{Int, Int}}() - sizehint!(groups_concerned, 16) # Reasonable default + # Reuse pre-allocated buffer instead of allocating new Set each time + groups_concerned = s.workspace.groups_buffer + empty!(groups_concerned) push!(groups_concerned, minmax(gu, gv)) @inbounds for (node, d) in iterate_neighbors(a.dists, u) diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index df39ad2..0b97103 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -56,7 
+56,7 @@ sym = SymArray(5, 0.0) # 5×5 matrix of zeros ``` """ function SymArray(k::T, d::F) where {F, T <: Real} - @assert k > 0 + k > 0 || throw(ArgumentError("Matrix dimension k=$k must be positive")) return SymArray{F}( Dict{Tuple{Int, Int}, F}(minmax(i, j) => deepcopy(d) for i in 1:k for j in i:k), @@ -64,7 +64,7 @@ function SymArray(k::T, d::F) where {F, T <: Real} end function SymArray(k::T, d::AbstractArray) where {T <: Real} - @assert k > 0 + k > 0 || throw(ArgumentError("Matrix dimension k=$k must be positive")) return SymArray{typeof(d)}( Dict{Tuple{Int, Int}, typeof(d)}(minmax(i, j) => deepcopy(d) for i in 1:k @@ -76,8 +76,25 @@ end SymArray(d::AbstractMatrix{F}) Create a SymArray from an existing matrix. The matrix should be symmetric. +Validates symmetry with a tolerance for floating-point errors. """ function SymArray(d::AbstractMatrix{F}) where {F} + size(d, 1) == size(d, 2) || throw(ArgumentError( + "Input matrix must be square, got size $(size(d))")) + + # Validate symmetry for floating-point types + if F <: AbstractFloat + k = size(d, 1) + max_asymmetry = zero(F) + for j in 1:k, i in 1:(j - 1) + max_asymmetry = max(max_asymmetry, abs(d[i, j] - d[j, i])) + end + tol = sqrt(eps(F)) * maximum(abs, d) + if max_asymmetry > tol + @warn "Input matrix has asymmetry up to $max_asymmetry (tolerance: $tol). Using upper triangle." 
+ end + end + return convert(SymArray{F}, d) end @@ -87,12 +104,12 @@ end Base.@propagate_inbounds function Base.getindex(a::SymArray, i, j) @boundscheck checkbounds(a, i, j) - return a.d[minmax(i, j)] + @inbounds return a.d[minmax(i, j)] end Base.@propagate_inbounds function Base.setindex!(a::SymArray, v, i, j) @boundscheck checkbounds(a, i, j) - a.d[minmax(i, j)] = v + @inbounds a.d[minmax(i, j)] = v end """ diff --git a/test/runtests.jl b/test/runtests.jl index 9f8eb38..c2e2e1c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,26 +2,10 @@ using Test using LinearAlgebra, SparseArrays using NetworkHistogram -# Check if BenchmarkTools is available (it's not required for basic tests) -const RUN_BENCHMARKS = try - using BenchmarkTools - true -catch - @warn "BenchmarkTools not available, skipping performance regression tests" - false -end - @testset "Tests" begin include("test_data_format.jl") include("test_distributions_type.jl") include("test_swap_workspace.jl") include("test_cat_case.jl") include("test_get_edges_in_groups.jl") - - # Only run performance tests if BenchmarkTools is available - if RUN_BENCHMARKS - @testset "Performance Regression" begin - include("test_performance_regression.jl") - end - end end diff --git a/test/test_performance_regression.jl b/test/test_performance_regression.jl deleted file mode 100644 index 02c09ca..0000000 --- a/test/test_performance_regression.jl +++ /dev/null @@ -1,308 +0,0 @@ -using Test -using NetworkHistogram -using StatsBase -using Random -using Distributions -using StaticArrays -using BenchmarkTools - -""" -Performance regression test suite for NetworkHistogram optimization. - -This file contains benchmarks for the key optimization operations, designed to: -1. Track performance improvements/regressions over time -2. Identify bottlenecks in the optimization workflow -3. 
Ensure optimization changes maintain correctness - -Based on the workflow in test_decorated_paper.jl -""" - -# Helper function to create test networks -function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed = 42) - Random.seed!(seed) - d = NetworkHistogram.Bernoulli(0.5) - sbm = NetworkHistogram.BlockModel(n_groups, d) - - # Create varied probabilities between groups - for g1 in 1:n_groups - for g2 in g1:n_groups - p = 0.1 + 0.7 * rand() - sbm[g1, g2] = NetworkHistogram.Bernoulli(p) - end - end - - # Ensure labels has exactly n_nodes elements - base_size = n_nodes ÷ n_groups - remainder = n_nodes % n_groups - sizes = fill(base_size, n_groups) - sizes[1:remainder] .+= 1 # Distribute remainder to first groups - labels = StatsBase.inverse_rle(1:n_groups, sizes) - @assert length(labels) == n_nodes - A = NetworkHistogram.sample(sbm, labels) - return A, labels, d -end - -function create_test_sbm_categorical( - n_groups::Int, n_nodes::Int, n_categories::Int; seed = 42) - Random.seed!(seed) - ps = SVector{n_categories}(fill(1 / n_categories, n_categories)) - d = NetworkHistogram.Cat(ps) - sbm = NetworkHistogram.BlockModel(n_groups, d) - - # Create varied probability distributions between groups - for g1 in 1:n_groups - for g2 in g1:n_groups - probs = rand(n_categories) - probs ./= sum(probs) - sbm[g1, g2] = NetworkHistogram.Cat(SVector{n_categories}(probs)) - end - end - - # Ensure labels has exactly n_nodes elements - base_size = n_nodes ÷ n_groups - remainder = n_nodes % n_groups - sizes = fill(base_size, n_groups) - sizes[1:remainder] .+= 1 # Distribute remainder to first groups - labels = StatsBase.inverse_rle(1:n_groups, sizes) - @assert length(labels) == n_nodes - A = NetworkHistogram.sample(sbm, labels) - return A, labels, d -end - -@testset "Performance Regression Tests" begin - @testset "Bernoulli Networks" begin - @testset "Small network (n=50, k=2)" begin - A, labels, d = create_test_sbm_bernoulli(2, 50) - edgelist = NetworkHistogram.EdgeList(A) - 
assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - - # Benchmark single swap operation - swap = NetworkHistogram.make_swap(assignment, (1, 50)) - ll_before = NetworkHistogram.loglikelihood(assignment) - - b_swap = @benchmark begin - NetworkHistogram.apply_swap!($assignment, $swap) - NetworkHistogram.revert_swap!($assignment, $swap) - end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=100 evals=1 - - # Verify correctness - ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol = 1e-10) - - @info "Bernoulli (n=50, k=2) - Single swap" median=median(b_swap.times) / 1e6 mean=mean(b_swap.times) / - 1e6 - end - - @testset "Medium network (n=200, k=3)" begin - A, labels, d = create_test_sbm_bernoulli(3, 200) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - - swap = NetworkHistogram.make_swap(assignment, (1, 200)) - ll_before = NetworkHistogram.loglikelihood(assignment) - - b_swap = @benchmark begin - NetworkHistogram.apply_swap!($assignment, $swap) - NetworkHistogram.revert_swap!($assignment, $swap) - end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=100 evals=1 - - ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol = 1e-10) - - @info "Bernoulli (n=200, k=3) - Single swap" median=median(b_swap.times) / 1e6 mean=mean(b_swap.times) / - 1e6 - end - - @testset "Large network (n=500, k=5)" begin - A, labels, d = create_test_sbm_bernoulli(5, 500) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - - swap = NetworkHistogram.make_swap(assignment, (1, 500)) - ll_before = NetworkHistogram.loglikelihood(assignment) - - b_swap = @benchmark begin - NetworkHistogram.apply_swap!($assignment, $swap) - 
NetworkHistogram.revert_swap!($assignment, $swap) - end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=50 evals=1 - - ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol = 1e-10) - - @info "Bernoulli (n=500, k=5) - Single swap" median=median(b_swap.times) / 1e6 mean=mean(b_swap.times) / - 1e6 - end - end - - @testset "Categorical Networks" begin - @testset "Small network (n=50, k=2, m=3)" begin - A, labels, d = create_test_sbm_categorical(2, 50, 3) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - - swap = NetworkHistogram.make_swap(assignment, (1, 50)) - ll_before = NetworkHistogram.loglikelihood(assignment) - - b_swap = @benchmark begin - NetworkHistogram.apply_swap!($assignment, $swap) - NetworkHistogram.revert_swap!($assignment, $swap) - end samples=100 evals=1 - - ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol = 1e-10) - - @info "Categorical (n=50, k=2, m=3) - Single swap" median=median(b_swap.times) / - 1e6 mean=mean(b_swap.times) / - 1e6 - end - - @testset "Medium network (n=200, k=3, m=4)" begin - A, labels, d = create_test_sbm_categorical(3, 200, 4) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - - swap = NetworkHistogram.make_swap(assignment, (1, 200)) - ll_before = NetworkHistogram.loglikelihood(assignment) - - b_swap = @benchmark begin - NetworkHistogram.apply_swap!($assignment, $swap) - NetworkHistogram.revert_swap!($assignment, $swap) - end samples=100 evals=1 - - ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol = 1e-10) - - @info "Categorical (n=200, k=3, m=4) - Single swap" median=median(b_swap.times) / - 1e6 mean=mean(b_swap.times) / - 1e6 - end - - @testset "Large network (n=500, k=5, m=5)" begin - A, 
labels, d = create_test_sbm_categorical(5, 500, 5) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - - swap = NetworkHistogram.make_swap(assignment, (1, 500)) - ll_before = NetworkHistogram.loglikelihood(assignment) - - b_swap = @benchmark begin - NetworkHistogram.apply_swap!($assignment, $swap) - NetworkHistogram.revert_swap!($assignment, $swap) - end samples=50 evals=1 - - ll_after = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_before, ll_after, atol = 1e-10) - - @info "Categorical (n=500, k=5, m=5) - Single swap" median=median(b_swap.times) / - 1e6 mean=mean(b_swap.times) / - 1e6 - end - end - - @testset "Full Optimization Workflow" begin - @testset "Bernoulli - Short optimization (n=100, k=3)" begin - A, labels, d = create_test_sbm_bernoulli(3, 100) - - # Randomize initial labels - initial_labels = rand(1:3, 100) - - b_optimize = @benchmark begin - # Create fresh params for each benchmark iteration - params = NetworkHistogram.GreedyParams( - 1_000, # Small number for testing - NetworkHistogram.RandomNodeSwap(), - NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(500), - false # No progress bar for benchmarking - ) - NetworkHistogram.nethist($A, $d, $initial_labels, params) - end samples=10 evals=1 - - @info "Bernoulli full optimization (n=100, 1k iters)" median=median(b_optimize.times) / - 1e6 mean=mean(b_optimize.times) / - 1e6 - end - - @testset "Categorical - Short optimization (n=100, k=3, m=3)" begin - A, labels, d = create_test_sbm_categorical(3, 100, 3) - - # Randomize initial labels - initial_labels = rand(1:3, 100) - - b_optimize = @benchmark begin - # Create fresh params for each benchmark iteration - params = NetworkHistogram.GreedyParams( - 1_000, - NetworkHistogram.RandomNodeSwap(), - NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(500), - false - ) - NetworkHistogram.nethist($A, $d, $initial_labels, params) - end 
samples=10 evals=1 - - @info "Categorical full optimization (n=100, 1k iters)" median=median(b_optimize.times) / - 1e6 mean=mean(b_optimize.times) / - 1e6 - end - end - - @testset "Component Benchmarks" begin - @testset "Assignment creation (n=200, k=3)" begin - A, labels, d = create_test_sbm_bernoulli(3, 200) - edgelist = NetworkHistogram.EdgeList(A) - - b_assignment = @benchmark begin - NetworkHistogram.Assignment($labels, $edgelist, NetworkHistogram.Dist($d)) - end samples=100 - - @info "Assignment creation (n=200)" median=median(b_assignment.times) / 1e6 mean=mean(b_assignment.times) / - 1e6 - end - - @testset "EdgeList creation (n=200)" begin - A, _, _ = create_test_sbm_bernoulli(3, 200) - - b_edgelist = @benchmark begin - NetworkHistogram.EdgeList($A) - end samples=100 - - @info "EdgeList creation (n=200)" median=median(b_edgelist.times) / 1e6 mean=mean(b_edgelist.times) / - 1e6 - end - - @testset "Loglikelihood computation (n=200, k=3)" begin - A, labels, d = create_test_sbm_bernoulli(3, 200) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - - b_ll = @benchmark begin - NetworkHistogram.loglikelihood($assignment) - end samples=1000 - - @info "Loglikelihood computation (n=200)" median=median(b_ll.times) / 1e3 mean=mean(b_ll.times) / - 1e3 - end - - @testset "Get edges in groups (n=200, k=3)" begin - A, labels, d = create_test_sbm_bernoulli(3, 200) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - - b_get_edges = @benchmark begin - NetworkHistogram.get_edges_in_groups($assignment, 1, 2) - end samples=1000 - - @info "Get edges in groups (n=200)" median=median(b_get_edges.times) / 1e3 mean=mean(b_get_edges.times) / - 1e3 - end - end -end From 49818b33382fba66cd0aa732293603b1d339d672 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 17 Oct 2025 09:31:06 +0200 Subject: [PATCH 192/266] make symarray 
based on sparse array --- Project.toml | 5 + benchmark/BENCHMARKING.md | 329 ----------------- benchmark/BENCHMARK_SUMMARY.md | 330 ------------------ benchmark/README.md | 181 +++++----- benchmark/benchmark_results/baseline.json | 112 +++--- .../benchmark_2025-10-15T22-51-21.json | 83 ----- .../benchmark_2025-10-15T22-52-50.json | 83 ----- .../benchmark_2025-10-15T22-55-28.json | 83 ----- .../benchmark_2025-10-15T22-56-00.json | 83 ----- .../benchmark_2025-10-15T22-58-43.json | 83 ----- .../benchmark_2025-10-15T23-27-10.json | 83 ----- benchmark/benchmark_results/optimized.json | 83 ----- benchmark/run_benchmarks.jl | 17 +- src/NetworkHistogram.jl | 1 + src/assignment.jl | 4 +- src/optimization/swap_categorical.jl | 75 ++-- src/optimization/swap_workspace.jl | 17 +- src/utils/SymArray.jl | 246 +++++++++++-- test/runtests.jl | 1 + test/test_symarray.jl | 296 ++++++++++++++++ 20 files changed, 721 insertions(+), 1474 deletions(-) delete mode 100644 benchmark/BENCHMARKING.md delete mode 100644 benchmark/BENCHMARK_SUMMARY.md delete mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json delete mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-52-50.json delete mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-55-28.json delete mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-56-00.json delete mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T22-58-43.json delete mode 100644 benchmark/benchmark_results/benchmark_2025-10-15T23-27-10.json delete mode 100644 benchmark/benchmark_results/optimized.json create mode 100644 test/test_symarray.jl diff --git a/Project.toml b/Project.toml index 927c3f8..7c67a0a 100644 --- a/Project.toml +++ b/Project.toml @@ -5,12 +5,15 @@ authors = ["Charles Dufour", "Jake Grainger"] [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = 
"e0c12bfd-47d7-434e-afb7-632611640ca5" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" @@ -29,7 +32,9 @@ LightMCExt = "LightMC" MakieExt = "Makie" [compat] +Dates = "1.11.0" Graphons = "0.1.0" +LinearAlgebra = "1.12.0" Printf = "1.11.0" [extras] diff --git a/benchmark/BENCHMARKING.md b/benchmark/BENCHMARKING.md deleted file mode 100644 index 3e19a0c..0000000 --- a/benchmark/BENCHMARKING.md +++ /dev/null @@ -1,329 +0,0 @@ -# NetworkHistogram Performance Benchmarking Suite - -This directory contains tools for measuring and tracking the performance of -NetworkHistogram's optimization algorithms. - -## Overview - -The benchmarking suite is designed to: - -1. **Track performance improvements/regressions** over time -2. **Identify bottlenecks** in the optimization workflow -3. **Ensure optimization changes maintain correctness** -4. **Compare performance** before and after code changes - -## Files - -- `benchmark_optimization.jl` - Standalone benchmarking script that runs all - benchmarks -- `test_performance_regression.jl` - Test suite that can be run with - `Pkg.test()` -- `benchmark_results/` - Directory for storing benchmark results (created - automatically) - -## Quick Start - -### Running Benchmarks - -```bash -# From the repository root -julia --project=. benchmark_optimization.jl - -# With custom output file -julia --project=. benchmark_optimization.jl results/my_benchmark.json - -# Compare with baseline -julia --project=. 
benchmark_optimization.jl results/current.json results/baseline.json -``` - -### Running as Tests - -```bash -# Run all tests including performance tests -julia --project=. -e 'using Pkg; Pkg.test()' - -# Run only performance tests -julia --project=test test/test_performance_regression.jl -``` - -## Workflow for Performance Optimization - -### 1. Establish Baseline - -Before making any changes, establish a baseline: - -```bash -julia --project=. benchmark_optimization.jl benchmark_results/baseline.json -``` - -### 2. Make Your Changes - -Edit the source code to improve performance (e.g., optimize `apply_swap!`, -reduce allocations, etc.). - -### 3. Run Benchmarks - -```bash -julia --project=. benchmark_optimization.jl benchmark_results/after_changes.json -``` - -### 4. Compare Results - -The script will automatically compare with `baseline.json` if it exists, or you -can manually compare: - -```bash -julia --project=. benchmark_optimization.jl \ - benchmark_results/after_changes.json \ - benchmark_results/baseline.json -``` - -### 5. Verify Correctness - -Run the full test suite to ensure your changes don't break anything: - -```bash -julia --project=. -e 'using Pkg; Pkg.test()' -``` - -## Benchmark Categories - -### Single Swap Operations - -Measures the performance of a single node swap operation (apply + revert): - -- **Bernoulli networks**: Binary edge weights (0/1) - - - Small: n=50, k=2 - - Medium: n=200, k=3 - - Large: n=500, k=5 - -- **Categorical networks**: Multi-valued edge weights - - Small: n=50, k=2, m=3 - - Medium: n=200, k=3, m=4 - - Large: n=500, k=5, m=5 - -**Why it matters**: Swap operations are the core of the greedy optimization -algorithm and are called millions of times. - -### Full Optimization Workflow - -Measures end-to-end performance of the optimization process: - -- Bernoulli: n=100, 1,000 iterations -- Categorical: n=100, 1,000 iterations - -**Why it matters**: Shows real-world performance for typical use cases. 
- -### Component Benchmarks - -Measures individual components: - -- **Assignment creation**: Time to create initial assignment -- **EdgeList creation**: Time to convert adjacency matrix to edge list -- **Loglikelihood computation**: Time to compute total log-likelihood -- **Get edges in groups**: Time to extract edges between two groups - -**Why it matters**: Identifies which components are bottlenecks. - -## Interpreting Results - -### Benchmark Output - -``` -Benchmarking Bernoulli swap (n=50, k=2)... - Median: 0.234 ms -``` - -- **Median**: The middle value (most representative of typical performance) -- **Mean**: Average value (affected by outliers) -- **Min/Max**: Best and worst case performance -- **Std**: Standard deviation (consistency of performance) - -### Performance Comparison - -``` -✓ FASTER bernoulli_swap_n50_k2: 1.23x (23.0%) - Current: 0.190 ms | Baseline: 0.234 ms - -✗ SLOWER categorical_swap_n200_k3_m4: 0.87x (-13.0%) - Current: 1.450 ms | Baseline: 1.260 ms - -≈ SIMILAR bernoulli_optimize_n100_1k: 1.02x (2.0%) - Current: 123.4 ms | Baseline: 125.9 ms -``` - -- **✓ FASTER**: >5% improvement -- **✗ SLOWER**: >5% regression -- **≈ SIMILAR**: Within ±5% - -## Key Performance Hotspots - -Based on the codebase analysis, these are the most critical areas for -optimization: - -### 1. `apply_swap!` (swap_workspace.jl, swap_categorical.jl) - -**Impact**: Called once per iteration in greedy search - -**Current approach**: - -- Iterates over all neighbors of swapped nodes -- Updates θ parameters and log-likelihoods incrementally -- Uses `deepcopy` for categorical distributions - -**Optimization opportunities**: - -- Reduce allocations in the hot path -- Optimize neighbor iteration -- Pre-allocate workspace for intermediate computations - -### 2. 
`get_edges_in_groups` (assignment.jl) - -**Impact**: Called during likelihood recomputation - -**Current approach**: - -- Allocates new vector for each call -- Uses `findall` to identify nodes in groups -- Iterates over all edges - -**Optimization opportunities**: - -- Pre-compute and cache group membership -- Use pre-allocated buffers -- Use views instead of copying data - -### 3. Edge iteration (EdgeList.jl) - -**Impact**: Used throughout the codebase - -**Current approach**: - -- Iterator-based access to edges - -**Optimization opportunities**: - -- Ensure type stability -- Minimize bounds checking -- Cache frequently accessed data - -### 4. Log-likelihood computation - -**Impact**: Called after every swap - -**Current approach**: - -- Recomputes for affected groups only (good!) -- Calls `logpdf` for each edge - -**Optimization opportunities**: - -- Batch logpdf computations -- Use SIMD operations where possible -- Cache intermediate results - -## Example: Optimizing a Function - -Let's say you want to optimize `apply_swap!`: - -```julia -# 1. Add profiling annotations -using Profile - -@profile begin - for i in 1:1000 - apply_swap!(assignment, swap) - revert_swap!(assignment, swap) - end -end - -Profile.print() - -# 2. Identify hot spots from profiling output - -# 3. Make targeted changes (e.g., reduce allocations) - -# 4. Benchmark before and after -julia benchmark_optimization.jl -``` - -## Tips for Performance Optimization - -1. **Start with profiling**: Use `@profile` to identify actual bottlenecks -2. **Benchmark incrementally**: Make one change at a time -3. **Check allocations**: Use `@btime` with `samples=1 evals=1` to see - allocations -4. **Maintain correctness**: Always run tests after changes -5. 
**Consider trade-offs**: Sometimes slight speedups aren't worth added - complexity - -## Advanced Usage - -### Custom Benchmarks - -Add your own benchmarks to `benchmark_optimization.jl`: - -```julia -function benchmark_my_function() - # Setup - data = create_test_data() - - # Benchmark - b = @benchmark my_function($data) samples=100 - - return Dict( - "median_ms" => median(b.times) / 1e6, - "mean_ms" => mean(b.times) / 1e6 - ) -end -``` - -### Continuous Integration - -To track performance over time in CI: - -```yaml -# .github/workflows/benchmark.yml -- name: Run benchmarks - run: julia --project=. benchmark_optimization.jl results/current.json - -- name: Compare with main - run: | - git checkout main - julia --project=. benchmark_optimization.jl results/baseline.json - git checkout - - julia --project=. benchmark_optimization.jl results/current.json results/baseline.json -``` - -## Troubleshooting - -### Inconsistent Results - -If you see high variance in results: - -- Close other applications -- Run with `--threads=1` to avoid threading variability -- Increase the number of samples -- Let the system warm up with a few iterations first - -### Out of Memory - -For large benchmarks: - -- Reduce the number of samples -- Run benchmarks separately instead of all at once -- Use smaller test networks - -### Compilation Effects - -Julia's JIT compilation can affect first-run timing: - -- BenchmarkTools automatically handles warmup -- For manual timing, always run at least once before measuring - -## Resources - -- [BenchmarkTools.jl documentation](https://juliaci.github.io/BenchmarkTools.jl/stable/) -- [Julia Performance Tips](https://docs.julialang.org/en/v1/manual/performance-tips/) -- [Profile module documentation](https://docs.julialang.org/en/v1/stdlib/Profile/) diff --git a/benchmark/BENCHMARK_SUMMARY.md b/benchmark/BENCHMARK_SUMMARY.md deleted file mode 100644 index a576392..0000000 --- a/benchmark/BENCHMARK_SUMMARY.md +++ /dev/null @@ -1,330 +0,0 @@ -# 
Performance Regression Test Suite - Summary - -## What Was Created - -A comprehensive performance benchmarking and profiling suite for -NetworkHistogram optimization, consisting of: - -### 1. **Test Files** - -- `test/test_performance_regression.jl` - Performance regression tests that run - with `Pkg.test()` - - Tests for Bernoulli and Categorical networks - - Multiple network sizes (50, 200, 500 nodes) - - Single swap operations and full optimization workflows - - Component-level benchmarks - -### 2. **Standalone Benchmarking** - -- `benchmark_optimization.jl` - Comprehensive standalone benchmark suite - - Saves results to JSON with timestamps - - Automatic comparison with baseline - - Detailed performance metrics (median, mean, std, min, max) - -### 3. **Easy-to-Use Runner** - -- `dev/run_benchmarks.jl` - User-friendly command-line interface - - Simple commands: `baseline`, `current`, `compare`, `clean` - - Handles dependencies automatically - - Interactive confirmations for destructive operations - -### 4. **Profiling Tools** - -- `dev/profile_optimization.jl` - Profiling helper - - Profile swap operations, full optimization, or components - - Integrated flamegraph support - - Configurable network sizes and iteration counts - -### 5. **Documentation** - -- `PERFORMANCE.md` - Main performance guide -- `benchmark/BENCHMARKING.md` - Detailed benchmarking documentation -- This summary document - -## How to Use - -### Quick Start (5 minutes) - -```bash -# 1. Create baseline -julia dev/run_benchmarks.jl baseline - -# 2. Make your optimizations in src/optimization/ - -# 3. Test performance -julia dev/run_benchmarks.jl current - -# 4. Verify correctness -julia --project=. -e 'using Pkg; Pkg.test()' -``` - -### Example Output - -``` ---- Single Swap Operations (Bernoulli) --- -Benchmarking Bernoulli swap (n=50, k=2)... 
- Median: 0.234 ms - -======================================== -Performance Comparison vs Baseline -======================================== -✓ FASTER bernoulli_swap_n50_k2: 1.23x (23.0%) - Current: 0.190 ms | Baseline: 0.234 ms -``` - -## Key Insights from Code Analysis - -Based on analysis of `test_decorated_paper.jl` and the source code: - -### Primary Bottlenecks - -1. **`apply_swap!`** (called millions of times) - - - Location: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` - - Issues: Uses `deepcopy`, allocates during iteration - - Impact: 🔴 CRITICAL - dominates runtime - -2. **`get_edges_in_groups`** (called during LL updates) - - - Location: `src/assignment.jl` - - Issues: Uses `findall`, allocates new vectors - - Impact: 🟡 MODERATE - called less frequently - -3. **Edge iteration** (used throughout) - - Location: `src/EdgeList.jl` - - Issues: Iterator overhead - - Impact: 🟢 LOW - but cumulative - -### Workflow from test_decorated_paper.jl - -The typical optimization workflow: - -1. Create SBM (Stochastic Block Model) -2. Sample network from SBM -3. Initialize node labels -4. Run greedy optimization with `nethist()` - - Iteratively swap nodes between groups - - Accept swaps that improve log-likelihood -5. 
Measure convergence via log-likelihood - -## Benchmarked Scenarios - -### Network Sizes - -- **Small**: n=50 nodes, k=2 groups (quick iteration) -- **Medium**: n=200 nodes, k=3 groups (realistic size) -- **Large**: n=500 nodes, k=5 groups (stress test) - -### Network Types - -- **Bernoulli**: Binary edges (0/1) - simpler, faster -- **Categorical**: Multi-valued edges (m categories) - more complex - -### Benchmark Types - -- **Single swap**: Apply + revert one node swap -- **Full optimization**: Complete optimization run (1k iterations) -- **Components**: Individual function benchmarks - -## Files and Their Purpose - -``` -NetworkHistogram/ -├── PERFORMANCE.md # Main guide (START HERE) -├── test/ -│ ├── test_performance_regression.jl # CI-friendly tests -│ └── Project.toml # Added BenchmarkTools dependency -├── dev/ -│ ├── run_benchmarks.jl # 👈 Easy CLI (USE THIS) -│ ├── benchmark_optimization.jl # Core benchmarking logic -│ ├── profile_optimization.jl # Profiling helper -│ ├── BENCHMARKING.md # Detailed docs -│ ├── benchmark_results/ # Stored results (auto-created) -│ │ └── baseline.json # Your reference baseline -│ └── test_decorated_paper.jl # Original workflow example -└── src/optimization/ # 🎯 Optimize these files - ├── greedy.jl - ├── swap_workspace.jl - ├── swap_categorical.jl - └── config_rules/ -``` - -## Common Workflows - -### A. Making Performance Improvements - -```bash -# Step 1: Baseline -julia dev/run_benchmarks.jl baseline - -# Step 2: Profile to find bottlenecks -julia dev/profile_optimization.jl swap - -# Step 3: Make changes to src/optimization/ - -# Step 4: Benchmark -julia dev/run_benchmarks.jl current - -# Step 5: Test correctness -julia --project=. -e 'using Pkg; Pkg.test()' - -# Step 6: Repeat steps 2-5 until satisfied -``` - -### B. 
Comparing Two Versions - -```bash -# Benchmark version A -git checkout feature-A -julia dev/run_benchmarks.jl results_A.json - -# Benchmark version B -git checkout feature-B -julia dev/run_benchmarks.jl results_B.json - -# Compare -julia dev/run_benchmarks.jl compare results_A.json results_B.json -``` - -### C. Debugging Performance Regression - -```bash -# Find when regression occurred -git bisect start -git bisect bad HEAD -git bisect good v1.0.0 - -# For each commit -julia dev/run_benchmarks.jl -# Mark good/bad based on results -git bisect good # or bad -``` - -## Optimization Strategies - -### 1. Profile First - -Don't guess - use `profile_optimization.jl` to see what's actually slow. - -### 2. Reduce Allocations - -The biggest wins usually come from eliminating allocations in hot paths. - -**Check allocations**: - -```julia -using BenchmarkTools -@btime apply_swap!($assignment, $swap) samples=1 evals=1 -# ^^^^^ This shows allocations -``` - -**Common fixes**: - -- Pre-allocate buffers -- Use `@inbounds` (after bounds checking once) -- Avoid `deepcopy` when possible -- Use views instead of copies - -### 3. Type Stability - -Julia is fast when types are known at compile time. - -**Check type stability**: - -```julia -using Cthulhu -@descend apply_swap!(assignment, swap) -# Look for red (runtime dispatch) -``` - -### 4. SIMD/Vectorization - -For bulk operations on arrays, help the compiler vectorize. - -### 5. Cache-Friendly Access - -Access memory in order when possible (column-major for Julia). - -## Expected Performance Gains - -Based on typical optimization opportunities in similar codebases: - -- **Low-hanging fruit** (reduce allocations): 20-50% speedup -- **Algorithm improvements** (better data structures): 2-10x speedup -- **SIMD/vectorization**: 2-4x speedup (for vectorizable operations) -- **Type stability fixes**: 2-5x speedup (if unstable) - -The swap operation is called O(iterations × n) times, so even small -improvements compound significantly. 
- -## Integration with CI/CD - -Add to `.github/workflows/benchmark.yml`: - -```yaml -name: Benchmark -on: [pull_request] - -jobs: - benchmark: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - uses: julia-actions/setup-julia@v1 - - - name: Benchmark PR - run: julia benchmark_optimization.jl pr_results.json - - - name: Benchmark main - run: | - git fetch origin main - git checkout origin/main - julia benchmark_optimization.jl main_results.json - - - name: Compare - run: - julia dev/run_benchmarks.jl compare pr_results.json main_results.json -``` - -## Troubleshooting - -### "BenchmarkTools not found" - -```bash -julia --project=test -e 'using Pkg; Pkg.add("BenchmarkTools")' -``` - -### Results vary too much - -- Close other applications -- Disable CPU frequency scaling -- Run with `--threads=1` -- Increase sample count - -### Benchmark takes too long - -- Reduce `samples` parameter -- Use smaller networks -- Run specific benchmarks only - -## Next Steps - -1. **Establish your baseline**: `julia dev/run_benchmarks.jl baseline` -2. **Read the detailed docs**: See `benchmark/BENCHMARKING.md` -3. **Profile the code**: `julia dev/profile_optimization.jl swap` -4. **Start optimizing**: Focus on `apply_swap!` first -5. **Measure improvements**: `julia dev/run_benchmarks.jl current` -6. **Share results**: Open PR with before/after benchmarks - -## Questions? - -- Check `PERFORMANCE.md` for main guide -- Check `benchmark/BENCHMARKING.md` for detailed docs -- Run `julia dev/run_benchmarks.jl help` -- Run `julia dev/profile_optimization.jl help` - ---- - -Happy optimizing! The suite is designed to make performance work systematic and -data-driven. 🚀 diff --git a/benchmark/README.md b/benchmark/README.md index 755e28d..4e21258 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -3,134 +3,151 @@ This directory contains benchmarking and profiling tools for NetworkHistogram.jl performance analysis. 
-## Files - -### `benchmark_optimization.jl` - -Main benchmarking script that runs comprehensive performance tests. - -**Usage:** +## Quick Start ```bash -julia --project=. benchmark/benchmark_optimization.jl [output_file] +# Run all benchmarks (saves to benchmark_results/ with timestamp) +julia --project=. benchmark/benchmark_optimization.jl + +# Profile to find bottlenecks +julia --project=. benchmark/profile_optimization.jl swap ``` -**Features:** +## Files + +| File | Purpose | +| --------------------------- | ----------------------------------------------------- | +| `benchmark_optimization.jl` | Main benchmarking script - runs all performance tests | +| `profile_optimization.jl` | Profile code to identify bottlenecks | +| `visualize_benchmarks.jl` | Compare benchmark results over time | +| `run_benchmarks.jl` | Convenience wrapper with baseline management | +| `benchmark_results/` | Stored benchmark results (JSON, auto-created) | -- Single swap operations (Bernoulli and Categorical networks) -- Full optimization workflows -- Component benchmarks (Assignment, EdgeList, loglikelihood) -- Automatic comparison with baseline -- Results saved as timestamped JSON files +## Usage Examples -**Example:** +### Running Benchmarks ```bash -# Run benchmarks and save to default location +# Basic usage julia --project=. benchmark/benchmark_optimization.jl # Save to specific file julia --project=. benchmark/benchmark_optimization.jl my_results.json -# Compare with custom baseline -julia --project=. benchmark/benchmark_optimization.jl current.json baseline.json +# Compare with baseline (auto-detects baseline.json) +julia --project=. benchmark/benchmark_optimization.jl ``` -### `visualize_benchmarks.jl` +### Profiling -Compare and visualize benchmark results over time. +```bash +# Profile swap operations +julia --project=. benchmark/profile_optimization.jl swap -**Usage:** +# Profile full optimization +julia --project=. 
benchmark/profile_optimization.jl optimize -```bash -julia --project=. benchmark/visualize_benchmarks.jl [files...] +# Profile individual components +julia --project=. benchmark/profile_optimization.jl components ``` -**Example:** +### Baseline Management ```bash -# Compare two benchmark runs -julia --project=. benchmark/visualize_benchmarks.jl \ - benchmark/benchmark_results/baseline.json \ - benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json - -# Use all files in benchmark_results/ -julia --project=. benchmark/visualize_benchmarks.jl --all +# Set current run as baseline +cp benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json \ + benchmark/benchmark_results/baseline.json ``` -### `profile_optimization.jl` +## What Gets Benchmarked -Profile code to identify performance bottlenecks. +### Single Swap Operations -**Usage:** +Tests the core swap operation (apply + revert): -```bash -julia --project=. benchmark/profile_optimization.jl [scenario] -``` +- **Bernoulli networks**: Binary edges (0/1) + - Small: n=50, k=2 + - Medium: n=200, k=3 + - Large: n=500, k=5 +- **Categorical networks**: Multi-valued edges + - Small: n=50, k=2, m=3 + - Medium: n=200, k=3, m=4 + - Large: n=500, k=5, m=5 -**Scenarios:** +### Full Optimization -- `swap` - Profile single swap operations -- `optimize` - Profile full optimization run -- `components` - Profile individual components +End-to-end optimization performance: -**Example:** - -```bash -julia --project=. benchmark/profile_optimization.jl swap -``` +- Bernoulli: n=100, 1,000 iterations +- Categorical: n=100, 1,000 iterations -## Benchmark Results +### Component Benchmarks -Results are stored in `benchmark/benchmark_results/` as JSON files with -timestamps. 
+Individual function performance: -### Setting a Baseline +- Assignment creation +- EdgeList creation +- Log-likelihood computation +- Edge extraction between groups -To set a benchmark run as the baseline for future comparisons: +## Current Performance (October 2025) -```bash -cp benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json \ - benchmark/benchmark_results/baseline.json -``` +**After Phase 1 Optimizations:** -## Performance Metrics +| Operation | Time | vs Baseline | +| ---------------------------- | ------- | ----------- | +| Bernoulli swap (n=500) | 2.56 ms | 6.1x faster | +| Bernoulli swap (n=200) | 0.54 ms | 3.2x faster | +| Categorical swap (n=200) | 0.04 ms | 1.4x faster | +| Bernoulli optimize (n=100) | 92 ms | - | +| Categorical optimize (n=100) | 15 ms | - | -The benchmarks track: +## Optimization Workflow -- **Median time**: Most representative performance measurement -- **Mean time**: Average across all samples -- **Min/Max time**: Best and worst case performance -- **Standard deviation**: Performance consistency +1. **Establish baseline**: Run benchmarks before changes +2. **Profile**: Use `profile_optimization.jl` to find hotspots +3. **Optimize**: Edit source code (usually `src/optimization/`) +4. **Benchmark**: Run benchmarks again +5. **Verify**: Run tests to ensure correctness +6. **Repeat**: Continue until satisfied -### Current Performance (October 2025) +## Key Hotspots -**Bernoulli Swap Operations:** +Focus optimization efforts on: -- n=50, k=2: ~0.038 ms -- n=200, k=3: ~0.52 ms -- n=500, k=5: ~2.4 ms (6.3x faster than pre-optimization baseline) +1. **`apply_swap!`** - Called millions of times (biggest impact) +2. **`get_edges_in_groups`** - Called during likelihood updates +3. **Edge iteration** - Used throughout, cumulative effect -**Categorical Swap Operations:** +See `dev/CODE_REVIEW_2025-10-15.md` for detailed analysis. 
-- n=50, k=2, m=3: ~0.009 ms -- n=200, k=3, m=4: ~0.04 ms -- n=500, k=5, m=5: ~0.10 ms +## Output Format -**Full Optimization:** +``` +--- Single Swap Operations (Bernoulli) --- +Benchmarking Bernoulli swap (n=200, k=3)... + Median: 0.538 ms + +====================================================================== +Performance Comparison vs Baseline +====================================================================== +✓ FASTER bernoulli_swap_n200_k3: 3.23x (223.0%) + Current: 0.54 ms | Baseline: 1.74 ms +``` -- Bernoulli (n=100, 1k iters): ~90 ms -- Categorical (n=100, 1k iters): ~20 ms +- **✓ FASTER**: >5% improvement +- **✗ SLOWER**: >5% regression +- **≈ SIMILAR**: Within ±5% -## Optimization History +## Tips -Major optimizations implemented: +- **Close other apps** for consistent results +- **Warm up manual timings** by running the code once before measuring (BenchmarkTools handles warm-up automatically) +- **Check allocations** with `@btime ... samples=1 evals=1` +- **Profile first** before optimizing +- **Test after** every optimization -1. Eliminated `deepcopy` in swap operations (replaced with in-place - `copy_symarray!`) -2. Optimized `get_edges_in_groups` (removed `findall`, added direct iteration) -3. Added `@inbounds` annotations to hot paths -4. Pre-sized Set allocations to avoid resizing +## Documentation -**Result:** 6.3x speedup for large Bernoulli networks while maintaining full -correctness. 
+- Full details: See `PERFORMANCE.md` (root directory) +- Code review: See `dev/CODE_REVIEW_2025-10-15.md` diff --git a/benchmark/benchmark_results/baseline.json b/benchmark/benchmark_results/baseline.json index 4a58148..a7d34fb 100644 --- a/benchmark/benchmark_results/baseline.json +++ b/benchmark/benchmark_results/baseline.json @@ -1,83 +1,83 @@ { "julia_version": "1.12.0", - "timestamp": "2025-10-15 15:54:06", + "timestamp": "2025-10-17 09:29:56", "benchmarks": { "bernoulli_swap_n500_k5": { - "max_ms": 17.386917, - "min_ms": 15.523083, - "mean_ms": 15.75884578, - "median_ms": 15.6418545, - "std_ms": 0.3105241491716007 + "max_ms": 14.133584, + "min_ms": 2.259333, + "mean_ms": 2.57733328, + "median_ms": 2.294333, + "std_ms": 1.6774071498610286 }, "assignment_creation_n200": { - "max_us": 958.667, - "min_us": 856.458, - "median_us": 891.771, - "mean_us": 892.07211, - "std_us": 18.802076604038753 + "max_us": 1090.334, + "min_us": 810.875, + "median_us": 1002.0835, + "mean_us": 979.6716700000001, + "std_us": 77.08177811411355 }, "categorical_swap_n50_k2_m3": { - "max_ms": 0.060375, - "min_ms": 0.008958, - "mean_ms": 0.00995414, - "median_ms": 0.009209, - "std_ms": 0.005302552623372667 + "max_ms": 0.02725, + "min_ms": 0.007125, + "mean_ms": 0.00747457, + "median_ms": 0.00725, + "std_ms": 0.002000665006890953 }, "edgelist_creation_n200": { - "max_us": 3982.25, - "min_us": 191.292, - "median_us": 250.729, - "mean_us": 287.22586, - "std_us": 373.70431110094887 + "max_us": 3194.709, + "min_us": 196.959, + "median_us": 257.6665, + "mean_us": 287.92171, + "std_us": 295.18723492986845 }, "loglikelihood_n200": { - "max_us": 0.04755310621242485, - "min_us": 0.014654308617234468, - "median_us": 0.015947895791583167, - "mean_us": 0.01565783266533066, - "std_us": 0.0015483768072472187 + "max_us": 0.00975, + "min_us": 0.005208999999999999, + "median_us": 0.005333, + "mean_us": 0.005361388, + "std_us": 0.0001773114909441032 }, "bernoulli_optimize_n100_1k": { - "max_ms": 260.846875, 
- "min_ms": 0.562458, - "mean_ms": 56.7193917, - "median_ms": 0.7415835, - "std_ms": 107.19352380450401 + "max_ms": 109.028, + "min_ms": 53.956375, + "mean_ms": 80.9743292, + "median_ms": 78.6800415, + "std_ms": 25.923985956866588 }, "categorical_swap_n200_k3_m4": { - "max_ms": 0.05275, - "min_ms": 0.023833, - "mean_ms": 0.02478879, - "median_ms": 0.024167, - "std_ms": 0.0034973870832379988 + "max_ms": 0.049458, + "min_ms": 0.020292, + "mean_ms": 0.02077412, + "median_ms": 0.020417, + "std_ms": 0.0029010176211081164 }, "bernoulli_swap_n50_k2": { - "max_ms": 0.078083, - "min_ms": 0.060125, - "mean_ms": 0.06149838, - "median_ms": 0.0605625, - "std_ms": 0.002599585086474028 + "max_ms": 0.042708, + "min_ms": 0.032125, + "mean_ms": 0.032648790000000004, + "median_ms": 0.032375, + "std_ms": 0.001290518346215656 }, "categorical_swap_n500_k5_m5": { - "max_ms": 0.125292, - "min_ms": 0.092625, - "mean_ms": 0.09619675999999999, - "median_ms": 0.093375, - "std_ms": 0.006499953084791443 + "max_ms": 0.081875, + "min_ms": 0.056125, + "mean_ms": 0.057171599999999996, + "median_ms": 0.056375, + "std_ms": 0.0037585196771156197 }, "categorical_optimize_n100_1k": { - "max_ms": 21.680917, - "min_ms": 0.48575, - "mean_ms": 3.8231835, - "median_ms": 0.5199585, - "std_ms": 7.301765661997686 + "max_ms": 16.551667, + "min_ms": 8.196416, + "mean_ms": 12.5445417, + "median_ms": 13.474271, + "std_ms": 3.0889250452015777 }, "bernoulli_swap_n200_k3": { - "max_ms": 1.895959, - "min_ms": 1.728084, - "mean_ms": 1.74944463, - "median_ms": 1.739271, - "std_ms": 0.02638663709158731 + "max_ms": 0.514583, + "min_ms": 0.437791, + "mean_ms": 0.44661872999999996, + "median_ms": 0.441792, + "std_ms": 0.015195061493625175 } } } \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json deleted file mode 100644 index ee2cc68..0000000 --- a/benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json 
+++ /dev/null @@ -1,83 +0,0 @@ -{ - "julia_version": "1.12.0", - "timestamp": "2025-10-15 22:51:00", - "benchmarks": { - "bernoulli_swap_n500_k5": { - "max_ms": 3.227041, - "min_ms": 2.389583, - "mean_ms": 2.57204906, - "median_ms": 2.5154585, - "std_ms": 0.18953309587998995 - }, - "assignment_creation_n200": { - "max_us": 978.292, - "min_us": 851.25, - "median_us": 890.166, - "mean_us": 893.5574399999999, - "std_us": 27.32222851667806 - }, - "categorical_swap_n50_k2_m3": { - "max_ms": 0.027917, - "min_ms": 0.009125, - "mean_ms": 0.010145870000000001, - "median_ms": 0.009542, - "std_ms": 0.002766908359851873 - }, - "edgelist_creation_n200": { - "max_us": 3529.958, - "min_us": 198.958, - "median_us": 251.083, - "mean_us": 285.34033, - "std_us": 328.14675494182416 - }, - "loglikelihood_n200": { - "max_us": 0.03218937875751503, - "min_us": 0.014654308617234468, - "median_us": 0.015947895791583167, - "mean_us": 0.015587750501002001, - "std_us": 0.001110091729142502 - }, - "bernoulli_optimize_n100_1k": { - "max_ms": 125.034125, - "min_ms": 0.47775, - "mean_ms": 20.107170699999998, - "median_ms": 0.5687915, - "std_ms": 42.12092585974448 - }, - "categorical_swap_n200_k3_m4": { - "max_ms": 0.055292, - "min_ms": 0.036333, - "mean_ms": 0.03703418, - "median_ms": 0.036833, - "std_ms": 0.0018664594223699422 - }, - "bernoulli_swap_n50_k2": { - "max_ms": 0.105291, - "min_ms": 0.038791, - "mean_ms": 0.040879559999999995, - "median_ms": 0.039333, - "std_ms": 0.007915583005028433 - }, - "categorical_swap_n500_k5_m5": { - "max_ms": 0.127208, - "min_ms": 0.0955, - "mean_ms": 0.09864, - "median_ms": 0.0968955, - "std_ms": 0.006224672275612228 - }, - "categorical_optimize_n100_1k": { - "max_ms": 21.485209, - "min_ms": 0.485375, - "mean_ms": 3.8293791, - "median_ms": 0.526625, - "std_ms": 7.277684487244635 - }, - "bernoulli_swap_n200_k3": { - "max_ms": 0.883125, - "min_ms": 0.510125, - "mean_ms": 0.54511338, - "median_ms": 0.528875, - "std_ms": 0.06696204429448495 - } - } -} \ No 
newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-52-50.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-52-50.json deleted file mode 100644 index b0e60bf..0000000 --- a/benchmark/benchmark_results/benchmark_2025-10-15T22-52-50.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "julia_version": "1.12.0", - "timestamp": "2025-10-15 22:52:28", - "benchmarks": { - "bernoulli_swap_n500_k5": { - "max_ms": 3.112333, - "min_ms": 2.408458, - "mean_ms": 2.49714256, - "median_ms": 2.465771, - "std_ms": 0.11591144798617362 - }, - "assignment_creation_n200": { - "max_us": 1056.667, - "min_us": 854.083, - "median_us": 891.854, - "mean_us": 895.33457, - "std_us": 27.603034723460127 - }, - "categorical_swap_n50_k2_m3": { - "max_ms": 0.033792, - "min_ms": 0.009041, - "mean_ms": 0.00967924, - "median_ms": 0.009417, - "std_ms": 0.002441718719560987 - }, - "edgelist_creation_n200": { - "max_us": 3570.541, - "min_us": 200.541, - "median_us": 253.0205, - "mean_us": 283.21247999999997, - "std_us": 332.34212178512064 - }, - "loglikelihood_n200": { - "max_us": 0.044714428857715434, - "min_us": 0.014612224448897794, - "median_us": 0.015947895791583167, - "mean_us": 0.01585511623246492, - "std_us": 0.0019358509774869575 - }, - "bernoulli_optimize_n100_1k": { - "max_ms": 124.224584, - "min_ms": 0.454917, - "mean_ms": 20.3278875, - "median_ms": 0.5847705, - "std_ms": 42.30671579701058 - }, - "categorical_swap_n200_k3_m4": { - "max_ms": 0.045042, - "min_ms": 0.025542, - "mean_ms": 0.026280889999999998, - "median_ms": 0.026, - "std_ms": 0.001935421087452954 - }, - "bernoulli_swap_n50_k2": { - "max_ms": 0.050166, - "min_ms": 0.037458, - "mean_ms": 0.03860959, - "median_ms": 0.038, - "std_ms": 0.0019394555890035667 - }, - "categorical_swap_n500_k5_m5": { - "max_ms": 0.126417, - "min_ms": 0.094791, - "mean_ms": 0.10100667999999999, - "median_ms": 0.0979795, - "std_ms": 0.006914122676472129 - }, - "categorical_optimize_n100_1k": { - "max_ms": 21.871542, - 
"min_ms": 0.491292, - "mean_ms": 3.8501959, - "median_ms": 0.514959, - "std_ms": 7.368805184415804 - }, - "bernoulli_swap_n200_k3": { - "max_ms": 0.610667, - "min_ms": 0.511, - "mean_ms": 0.52604377, - "median_ms": 0.5165625, - "std_ms": 0.01953388411306172 - } - } -} \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-55-28.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-55-28.json deleted file mode 100644 index 63a888f..0000000 --- a/benchmark/benchmark_results/benchmark_2025-10-15T22-55-28.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "julia_version": "1.12.0", - "timestamp": "2025-10-15 22:55:07", - "benchmarks": { - "bernoulli_swap_n500_k5": { - "max_ms": 2.710667, - "min_ms": 2.482959, - "mean_ms": 2.5396208799999997, - "median_ms": 2.5191045, - "std_ms": 0.05846034240358578 - }, - "assignment_creation_n200": { - "max_us": 3125.959, - "min_us": 885.666, - "median_us": 931.9585, - "mean_us": 1039.505, - "std_us": 339.0273008825351 - }, - "categorical_swap_n50_k2_m3": { - "max_ms": 0.045167, - "min_ms": 0.009416, - "mean_ms": 0.01004002, - "median_ms": 0.009666, - "std_ms": 0.0035524098208046737 - }, - "edgelist_creation_n200": { - "max_us": 3497.792, - "min_us": 191.166, - "median_us": 250.8955, - "mean_us": 285.86834000000005, - "std_us": 325.67732384836796 - }, - "loglikelihood_n200": { - "max_us": 0.04972444889779559, - "min_us": 0.014654308617234468, - "median_us": 0.015947895791583167, - "mean_us": 0.015710969939879756, - "std_us": 0.001805494529688457 - }, - "bernoulli_optimize_n100_1k": { - "max_ms": 124.640792, - "min_ms": 0.441125, - "mean_ms": 19.8177917, - "median_ms": 0.495354, - "std_ms": 41.869538317803716 - }, - "categorical_swap_n200_k3_m4": { - "max_ms": 0.043916, - "min_ms": 0.025291, - "mean_ms": 0.02627703, - "median_ms": 0.025709, - "std_ms": 0.0024766015508182085 - }, - "bernoulli_swap_n50_k2": { - "max_ms": 0.087459, - "min_ms": 0.038917, - "mean_ms": 0.04051669, - "median_ms": 0.039292, - 
"std_ms": 0.00566492637037033 - }, - "categorical_swap_n500_k5_m5": { - "max_ms": 0.11875, - "min_ms": 0.098333, - "mean_ms": 0.10018246, - "median_ms": 0.099583, - "std_ms": 0.00284072972236188 - }, - "categorical_optimize_n100_1k": { - "max_ms": 22.528667, - "min_ms": 0.462459, - "mean_ms": 3.9042627000000003, - "median_ms": 0.501917, - "std_ms": 7.566682937768953 - }, - "bernoulli_swap_n200_k3": { - "max_ms": 0.62325, - "min_ms": 0.530166, - "mean_ms": 0.54022791, - "median_ms": 0.5352085, - "std_ms": 0.01611283337453269 - } - } -} \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-56-00.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-56-00.json deleted file mode 100644 index ca15355..0000000 --- a/benchmark/benchmark_results/benchmark_2025-10-15T22-56-00.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "julia_version": "1.12.0", - "timestamp": "2025-10-15 22:55:38", - "benchmarks": { - "bernoulli_swap_n500_k5": { - "max_ms": 2.630917, - "min_ms": 2.406333, - "mean_ms": 2.48274666, - "median_ms": 2.4808125, - "std_ms": 0.043789408323134535 - }, - "assignment_creation_n200": { - "max_us": 4273.291, - "min_us": 879.833, - "median_us": 932.104, - "mean_us": 997.5184, - "std_us": 350.62550806514275 - }, - "categorical_swap_n50_k2_m3": { - "max_ms": 0.028625, - "min_ms": 0.00975, - "mean_ms": 0.01021751, - "median_ms": 0.01, - "std_ms": 0.001868408459522746 - }, - "edgelist_creation_n200": { - "max_us": 3616.333, - "min_us": 193.791, - "median_us": 258.8955, - "mean_us": 297.08418, - "std_us": 336.1381451871029 - }, - "loglikelihood_n200": { - "max_us": 0.044547094188376754, - "min_us": 0.014654308617234468, - "median_us": 0.015948897795591183, - "mean_us": 0.016109651302605204, - "std_us": 0.0018390735764153526 - }, - "bernoulli_optimize_n100_1k": { - "max_ms": 126.498292, - "min_ms": 0.422959, - "mean_ms": 20.0525292, - "median_ms": 0.4898955, - "std_ms": 42.46023333898911 - }, - "categorical_swap_n200_k3_m4": { - 
"max_ms": 0.052292, - "min_ms": 0.0245, - "mean_ms": 0.025403779999999997, - "median_ms": 0.025, - "std_ms": 0.002871652896008067 - }, - "bernoulli_swap_n50_k2": { - "max_ms": 0.048666, - "min_ms": 0.037333, - "mean_ms": 0.03801714, - "median_ms": 0.0375, - "std_ms": 0.0015056452358246997 - }, - "categorical_swap_n500_k5_m5": { - "max_ms": 0.124, - "min_ms": 0.095125, - "mean_ms": 0.09845666, - "median_ms": 0.09675, - "std_ms": 0.0052526714509701375 - }, - "categorical_optimize_n100_1k": { - "max_ms": 21.61125, - "min_ms": 0.464917, - "mean_ms": 3.7754376, - "median_ms": 0.477375, - "std_ms": 7.284791611432641 - }, - "bernoulli_swap_n200_k3": { - "max_ms": 0.628125, - "min_ms": 0.51325, - "mean_ms": 0.52672377, - "median_ms": 0.520625, - "std_ms": 0.01733130179578166 - } - } -} \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T22-58-43.json b/benchmark/benchmark_results/benchmark_2025-10-15T22-58-43.json deleted file mode 100644 index 00a806e..0000000 --- a/benchmark/benchmark_results/benchmark_2025-10-15T22-58-43.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "julia_version": "1.12.0", - "timestamp": "2025-10-15 22:58:20", - "benchmarks": { - "bernoulli_swap_n500_k5": { - "max_ms": 2.524833, - "min_ms": 2.393667, - "mean_ms": 2.4419925, - "median_ms": 2.4398125, - "std_ms": 0.028738214054297325 - }, - "assignment_creation_n200": { - "max_us": 1435.875, - "min_us": 905.084, - "median_us": 969.5415, - "mean_us": 996.6576, - "std_us": 92.09479767702254 - }, - "categorical_swap_n50_k2_m3": { - "max_ms": 0.045875, - "min_ms": 0.0095, - "mean_ms": 0.01018415, - "median_ms": 0.0097705, - "std_ms": 0.00361557269413376 - }, - "edgelist_creation_n200": { - "max_us": 4299.417, - "min_us": 208.25, - "median_us": 288.5, - "mean_us": 319.5867, - "std_us": 403.23297395167117 - }, - "loglikelihood_n200": { - "max_us": 0.06508817635270542, - "min_us": 0.015155310621242485, - "median_us": 0.016490981963927856, - "mean_us": 0.01614670541082165, - 
"std_us": 0.001998467439260285 - }, - "bernoulli_optimize_n100_1k": { - "max_ms": 129.661875, - "min_ms": 64.177334, - "mean_ms": 95.95533329999999, - "median_ms": 92.004916, - "std_ms": 30.315790347507026 - }, - "categorical_swap_n200_k3_m4": { - "max_ms": 0.140084, - "min_ms": 0.039208, - "mean_ms": 0.041837980000000004, - "median_ms": 0.039875, - "std_ms": 0.010841225036988497 - }, - "bernoulli_swap_n50_k2": { - "max_ms": 0.052, - "min_ms": 0.037542, - "mean_ms": 0.03809124, - "median_ms": 0.037708, - "std_ms": 0.0018499183226304992 - }, - "categorical_swap_n500_k5_m5": { - "max_ms": 0.127083, - "min_ms": 0.09925, - "mean_ms": 0.10115922000000001, - "median_ms": 0.1005625, - "std_ms": 0.003808675051049879 - }, - "categorical_optimize_n100_1k": { - "max_ms": 24.93025, - "min_ms": 11.742417, - "mean_ms": 17.8008251, - "median_ms": 19.0025625, - "std_ms": 4.622818379951839 - }, - "bernoulli_swap_n200_k3": { - "max_ms": 0.604917, - "min_ms": 0.511375, - "mean_ms": 0.52376955, - "median_ms": 0.5186875, - "std_ms": 0.013513957816500744 - } - } -} \ No newline at end of file diff --git a/benchmark/benchmark_results/benchmark_2025-10-15T23-27-10.json b/benchmark/benchmark_results/benchmark_2025-10-15T23-27-10.json deleted file mode 100644 index a1a5894..0000000 --- a/benchmark/benchmark_results/benchmark_2025-10-15T23-27-10.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "julia_version": "1.12.0", - "timestamp": "2025-10-15 23:26:47", - "benchmarks": { - "bernoulli_swap_n500_k5": { - "max_ms": 3.89675, - "min_ms": 2.490625, - "mean_ms": 2.67821422, - "median_ms": 2.5600625, - "std_ms": 0.30520369156501004 - }, - "assignment_creation_n200": { - "max_us": 1076.709, - "min_us": 918.375, - "median_us": 966.5625, - "mean_us": 967.24371, - "std_us": 27.151506778246986 - }, - "categorical_swap_n50_k2_m3": { - "max_ms": 0.031625, - "min_ms": 0.008, - "mean_ms": 0.00841167, - "median_ms": 0.008125, - "std_ms": 0.0023503062085265367 - }, - "edgelist_creation_n200": { - "max_us": 3455.5, 
- "min_us": 196.875, - "median_us": 253.354, - "mean_us": 289.45708, - "std_us": 320.40221672081117 - }, - "loglikelihood_n200": { - "max_us": 0.039495991983967936, - "min_us": 0.014612224448897794, - "median_us": 0.015947895791583167, - "mean_us": 0.015709479959919836, - "std_us": 0.001707683939625983 - }, - "bernoulli_optimize_n100_1k": { - "max_ms": 131.9495, - "min_ms": 63.998125, - "mean_ms": 96.4309377, - "median_ms": 92.050771, - "std_ms": 31.092325846843277 - }, - "categorical_swap_n200_k3_m4": { - "max_ms": 0.070375, - "min_ms": 0.034333, - "mean_ms": 0.03502833, - "median_ms": 0.034666, - "std_ms": 0.0035768376333930193 - }, - "bernoulli_swap_n50_k2": { - "max_ms": 0.057292, - "min_ms": 0.042458, - "mean_ms": 0.04351547, - "median_ms": 0.043292, - "std_ms": 0.001701379543966091 - }, - "categorical_swap_n500_k5_m5": { - "max_ms": 0.112416, - "min_ms": 0.088375, - "mean_ms": 0.08935744, - "median_ms": 0.088833, - "std_ms": 0.003347376689350657 - }, - "categorical_optimize_n100_1k": { - "max_ms": 17.914792, - "min_ms": 8.9885, - "mean_ms": 13.7015751, - "median_ms": 14.930396, - "std_ms": 3.4717329142617093 - }, - "bernoulli_swap_n200_k3": { - "max_ms": 0.774791, - "min_ms": 0.534542, - "mean_ms": 0.5486900100000001, - "median_ms": 0.5379165, - "std_ms": 0.03468300186071855 - } - } -} \ No newline at end of file diff --git a/benchmark/benchmark_results/optimized.json b/benchmark/benchmark_results/optimized.json deleted file mode 100644 index e2d21d0..0000000 --- a/benchmark/benchmark_results/optimized.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "julia_version": "1.12.0", - "timestamp": "2025-10-15 22:35:21", - "benchmarks": { - "bernoulli_swap_n500_k5": { - "max_ms": 2.49025, - "min_ms": 2.412167, - "mean_ms": 2.4460266, - "median_ms": 2.440625, - "std_ms": 0.018879105065697174 - }, - "assignment_creation_n200": { - "max_us": 1020.75, - "min_us": 850.584, - "median_us": 895.833, - "mean_us": 899.73671, - "std_us": 26.985610491180616 - }, - 
"categorical_swap_n50_k2_m3": { - "max_ms": 0.028041, - "min_ms": 0.009167, - "mean_ms": 0.00977582, - "median_ms": 0.009459, - "std_ms": 0.002031678931426922 - }, - "edgelist_creation_n200": { - "max_us": 3978.25, - "min_us": 201.041, - "median_us": 255.0205, - "mean_us": 293.98788, - "std_us": 372.7416273862515 - }, - "loglikelihood_n200": { - "max_us": 0.04283567134268537, - "min_us": 0.014612224448897794, - "median_us": 0.015947895791583167, - "mean_us": 0.01562773947895791, - "std_us": 0.0013940979567484434 - }, - "bernoulli_optimize_n100_1k": { - "max_ms": 125.829375, - "min_ms": 0.489416, - "mean_ms": 28.099379199999998, - "median_ms": 0.5702295, - "std_ms": 51.02862436095698 - }, - "categorical_swap_n200_k3_m4": { - "max_ms": 0.044625, - "min_ms": 0.024375, - "mean_ms": 0.02514243, - "median_ms": 0.024875, - "std_ms": 0.0019936978244589937 - }, - "bernoulli_swap_n50_k2": { - "max_ms": 0.049542, - "min_ms": 0.037541, - "mean_ms": 0.03824543, - "median_ms": 0.03775, - "std_ms": 0.0019822861674329366 - }, - "categorical_swap_n500_k5_m5": { - "max_ms": 0.131541, - "min_ms": 0.093667, - "mean_ms": 0.09694180000000001, - "median_ms": 0.095166, - "std_ms": 0.0059769960716748284 - }, - "categorical_optimize_n100_1k": { - "max_ms": 22.4595, - "min_ms": 0.516042, - "mean_ms": 4.0880541, - "median_ms": 0.627167, - "std_ms": 7.622196654548731 - }, - "bernoulli_swap_n200_k3": { - "max_ms": 0.595833, - "min_ms": 0.516792, - "mean_ms": 0.52424699, - "median_ms": 0.5198955, - "std_ms": 0.01275029263319953 - } - } -} \ No newline at end of file diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl index 49035fc..3bdab74 100644 --- a/benchmark/run_benchmarks.jl +++ b/benchmark/run_benchmarks.jl @@ -20,6 +20,7 @@ Examples: """ using Pkg +using Dates # Ensure we're in the right directory cd(dirname(@__DIR__)) @@ -65,7 +66,7 @@ end function run_baseline() ensure_dependencies() - baseline_file = joinpath("dev", "benchmark_results", "baseline.json") + 
baseline_file = joinpath("benchmark", "benchmark_results", "baseline.json") if isfile(baseline_file) print("Baseline already exists. Overwrite? (y/N): ") @@ -79,7 +80,7 @@ function run_baseline() println("\nRunning baseline benchmarks...") println("This may take several minutes...\n") - run(`julia --project=. benchmark_optimization.jl $baseline_file`) + run(`julia --project=. benchmark/benchmark_optimization.jl $baseline_file`) println("\n✓ Baseline established at: $baseline_file") println("\nNext steps:") @@ -91,7 +92,7 @@ end function run_current() ensure_dependencies() - baseline_file = joinpath("dev", "benchmark_results", "baseline.json") + baseline_file = joinpath("benchmark", "benchmark_results", "baseline.json") if !isfile(baseline_file) println("⚠ Warning: No baseline found!") @@ -100,15 +101,15 @@ function run_current() end timestamp = Dates.format(Dates.now(), "yyyy-mm-ddTHH-MM-SS") - current_file = joinpath("dev", "benchmark_results", "current_$timestamp.json") + current_file = joinpath("benchmark", "benchmark_results", "current_$timestamp.json") println("Running current benchmarks...") println("This may take several minutes...\n") if isfile(baseline_file) - run(`julia --project=. benchmark_optimization.jl $current_file $baseline_file`) + run(`julia --project=. benchmark/benchmark_optimization.jl $current_file $baseline_file`) else - run(`julia --project=. benchmark_optimization.jl $current_file`) + run(`julia --project=. benchmark/benchmark_optimization.jl $current_file`) end println("\n✓ Results saved to: $current_file") @@ -132,11 +133,11 @@ function compare_benchmarks(file1, file2) println(" Current: $file1\n") # Re-run comparison - run(`julia --project=. benchmark_optimization.jl $file1 $file2`) + run(`julia --project=. 
benchmark/benchmark_optimization.jl $file1 $file2`) end function clean_results() - results_dir = joinpath("dev", "benchmark_results") + results_dir = joinpath("benchmark", "benchmark_results") if !isdir(results_dir) println("No results directory found.") diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 6c145da..acbccf4 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -5,6 +5,7 @@ using ProgressMeter import StatsAPI: loglikelihood, fit, params import Base: convert, eltype, zero using Distributions +using LinearAlgebra include("utils/include.jl") using .FastSymArray diff --git a/src/assignment.jl b/src/assignment.jl index 57115c3..3a28ce0 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -85,7 +85,7 @@ node partition) fits the observed network data. # Returns - `Float64`: Total log-likelihood value """ -@inline function loglikelihood(a::Assignment) +function loglikelihood(a::Assignment) return FastSymArray.sum_tri_with_diag(a.log_likelihood) end @@ -101,7 +101,7 @@ Get the group label for a specific node. # Returns - `Int`: Group index that the node belongs to """ -@inline function group(a::Assignment, node::Int) +function group(a::Assignment, node::Int) @boundscheck checkbounds(a.node_labels, node) @inbounds return a.node_labels[node] end diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl index f816295..c29e78d 100644 --- a/src/optimization/swap_categorical.jl +++ b/src/optimization/swap_categorical.jl @@ -21,8 +21,8 @@ function Assignment( for (v, e) in iterate_neighbors(edge_list, u) g2 = node_labels[v] if v < u - counts[minmax(g1, g2)...] += 1 - realized[minmax(g1, g2)...][e] += 1 + counts[g1, g2] += 1 + realized[g1, g2][e] += 1 else break end @@ -30,8 +30,6 @@ function Assignment( end for g2 in 1:n_groups, g1 in g2:n_groups - counts[g1, g2] = counts[minmax(g1, g2)...] - realized[g1, g2] = realized[minmax(g1, g2)...] 
_fast_normalization!( estimated[g1, g2], realized[g1, g2], counts[g1, g2]) end @@ -86,20 +84,24 @@ end function copy_categorical_workspace!( dest::WorkspaceDiscreteSwap, src_assignment::Assignment) # In-place copy without allocation - copy_symarray!(dest.log_likelihood_per_group, src_assignment.log_likelihood) + copy!(dest.log_likelihood_per_group, src_assignment.log_likelihood) - src_ws = src_assignment.additional_workspace # Copy counts (scalars) - copy_symarray!(dest.counts, src_ws.counts) + copy!(dest.counts, src_assignment.additional_workspace.counts) # Copy vector-valued SymArrays element by element - @inbounds for key in keys(src_ws.realized.d) - copyto!(dest.realized.d[key], src_ws.realized.d[key]) - end - - @inbounds for key in keys(src_ws.estimated.d) - copyto!(dest.estimated.d[key], src_ws.estimated.d[key]) - end + # Use sparse matrix iteration instead of .d dictionary + k = size(dest.realized, 1) + copy_with_array!(dest.realized, src_assignment.additional_workspace.realized) + copy_with_array!(dest.estimated, src_assignment.additional_workspace.estimated) + + # @inbounds for j in 1:k, i in 1:j + # copyto!(dest.realized[i, j], src_ws.realized[i, j]) + # end + + # @inbounds for j in 1:k, i in 1:j + # copyto!(dest.estimated[i, j], src_ws.estimated[i, j]) + # end end function make_swap_workspace!(ws::WorkspaceDiscreteSwap, a::Assignment) @@ -109,20 +111,23 @@ end function revert_swap_workspace!(a::Assignment, ws::WorkspaceDiscreteSwap) # Use in-place copy instead of deepcopy - copy_symarray!(a.log_likelihood, ws.log_likelihood_per_group) + copy!(a.log_likelihood, ws.log_likelihood_per_group) - as = a.additional_workspace - copy_symarray!(as.log_likelihood_per_group, ws.log_likelihood_per_group) - copy_symarray!(as.counts, ws.counts) + copy!(a.additional_workspace.log_likelihood_per_group, ws.log_likelihood_per_group) + copy!(a.additional_workspace.counts, ws.counts) # Copy vector-valued SymArrays element by element - @inbounds for key in keys(ws.realized.d) 
- copyto!(as.realized.d[key], ws.realized.d[key]) - end - - @inbounds for key in keys(ws.estimated.d) - copyto!(as.estimated.d[key], ws.estimated.d[key]) - end + # Use sparse matrix iteration instead of .d dictionary + k = size(ws.realized, 1) + copy_with_array!(a.additional_workspace.realized, ws.realized) + copy_with_array!(a.additional_workspace.estimated, ws.estimated) + # @inbounds for j in 1:k, i in 1:j + # copyto!(as.realized[i, j], ws.realized[i, j]) + # end + + # @inbounds for j in 1:k, i in 1:j + # copyto!(as.estimated[i, j], ws.estimated[i, j]) + # end end function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) @@ -135,20 +140,20 @@ function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) continue end g_inter = as.node_labels[node] - as.additional_workspace.counts[minmax(gu, g_inter)...] -= 1 - as.additional_workspace.realized[minmax(gu, g_inter)...][e] -= 1 - as.additional_workspace.counts[minmax(gv, g_inter)...] += 1 - as.additional_workspace.realized[minmax(gv, g_inter)...][e] += 1 + as.additional_workspace.counts[gu, g_inter] -= 1 + as.additional_workspace.realized[gu, g_inter][e] -= 1 + as.additional_workspace.counts[gv, g_inter] += 1 + as.additional_workspace.realized[gv, g_inter][e] += 1 end for (node, e) in iterate_neighbors(as.edges, v) if node == u continue end g_inter = as.node_labels[node] - as.additional_workspace.counts[minmax(gv, g_inter)...] -= 1 - as.additional_workspace.realized[minmax(gv, g_inter)...][e] -= 1 - as.additional_workspace.counts[minmax(gu, g_inter)...] 
+= 1 - as.additional_workspace.realized[minmax(gu, g_inter)...][e] += 1 + as.additional_workspace.counts[gv, g_inter] -= 1 + as.additional_workspace.realized[gv, g_inter][e] -= 1 + as.additional_workspace.counts[gu, g_inter] += 1 + as.additional_workspace.realized[gu, g_inter][e] += 1 end _fast_normalization!.(as.additional_workspace.estimated, as.additional_workspace.realized, as.additional_workspace.counts) @@ -164,8 +169,8 @@ function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) g1, g2]) end end - - as.log_likelihood = deepcopy(as.additional_workspace.log_likelihood_per_group) + copy!(as.log_likelihood, as.additional_workspace.log_likelihood_per_group) + # as.log_likelihood = deepcopy(as.additional_workspace.log_likelihood_per_group) end function _fast_normalization!(p::AbstractVector, r::AbstractVector, c::Real) diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl index 8774ef6..aa94c73 100644 --- a/src/optimization/swap_workspace.jl +++ b/src/optimization/swap_workspace.jl @@ -23,25 +23,16 @@ mutable struct Swap{W} workspace::W end -function copy_symarray!(dest::SymArray, src::SymArray) - # In-place copy without allocation - # SymArray stores data in a dictionary .d - # Just overwrite the values - don't empty first! 
- @inbounds for key in keys(src.d) - dest.d[key] = src.d[key] - end -end - function make_swap_workspace!(ws, a::Assignment) # Use in-place copy instead of deepcopy - copy_symarray!(ws.θ, a.θ) - copy_symarray!(ws.log_likelihood_per_group, a.log_likelihood) + copy!(ws.θ, a.θ) + copy!(ws.log_likelihood_per_group, a.log_likelihood) end function revert_swap_workspace!(a::Assignment, ws) # Use in-place copy instead of deepcopy - copy_symarray!(a.θ, ws.θ) - copy_symarray!(a.log_likelihood, ws.log_likelihood_per_group) + copy!(a.θ, ws.θ) + copy!(a.log_likelihood, ws.log_likelihood_per_group) end function make_swap(a::Assignment, id) diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 0b97103..e9922a4 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -2,23 +2,26 @@ FastSymArray - Efficient symmetric matrix storage This module provides `SymArray`, a memory-efficient storage for symmetric matrices -that only stores the upper triangle (including diagonal) of the matrix. +that only stores the upper triangle (including diagonal) of the matrix using a sparse matrix. """ module FastSymArray -import Base: eltype, convert -export SymArray, eltype +using SparseArrays +using LinearAlgebra +import Base: eltype, convert, size, getindex, setindex!, copy!, similar, + IndexStyle, axes, length, iterate, copyto! +export SymArray, eltype, copy_with_array!, sum_tri_with_diag """ SymArray{F} <: AbstractArray{F, 2} -A symmetric matrix that stores only the upper triangle to save memory. +A symmetric matrix that stores only the upper triangle using a sparse matrix. For a k×k symmetric matrix, only k(k+1)/2 elements are stored instead of k². +This implementation uses Julia's SparseMatrixCSC for efficient storage and access. 
# Fields -- `d::Dict{Tuple{Int, Int}, F}`: Dictionary storing (i,j) → value for i ≤ j -- `k::Int`: Dimension of the square matrix +- `uppertrian::SparseMatrixCSC{F, Int}`: Sparse matrix storing the upper triangle (i ≤ j) # Examples ```julia @@ -36,9 +39,8 @@ sym = SymArray(A) See also: [`sum_tri_with_diag`](@ref) """ -mutable struct SymArray{F} <: AbstractArray{F, 2} - d::Dict{Tuple{Int, Int}, F} - k::Int +mutable struct SymArray{F} <: AbstractSparseMatrix{F, Int} + uppertrian::SparseMatrixCSC{F, Int} end """ @@ -57,19 +59,48 @@ sym = SymArray(5, 0.0) # 5×5 matrix of zeros """ function SymArray(k::T, d::F) where {F, T <: Real} k > 0 || throw(ArgumentError("Matrix dimension k=$k must be positive")) - return SymArray{F}( - Dict{Tuple{Int, Int}, F}(minmax(i, j) => deepcopy(d) for i in 1:k - for j in i:k), - k) + + # Pre-allocate arrays with exact size needed for upper triangle + n_elements = div(k * (k + 1), 2) + I_indices = Vector{Int}(undef, n_elements) + J_indices = Vector{Int}(undef, n_elements) + values = Vector{F}(undef, n_elements) + + idx = 1 + for j in 1:k + for i in 1:j + I_indices[idx] = i + J_indices[idx] = j + values[idx] = deepcopy(d) + idx += 1 + end + end + + uppertrian = sparse(I_indices, J_indices, values, k, k) + return SymArray{F}(uppertrian) end function SymArray(k::T, d::AbstractArray) where {T <: Real} k > 0 || throw(ArgumentError("Matrix dimension k=$k must be positive")) - return SymArray{typeof(d)}( - Dict{Tuple{Int, Int}, typeof(d)}(minmax(i, j) => deepcopy(d) - for i in 1:k - for j in i:k), - k) + + # Pre-allocate arrays with exact size needed for upper triangle + n_elements = div(k * (k + 1), 2) + I_indices = Vector{Int}(undef, n_elements) + J_indices = Vector{Int}(undef, n_elements) + values = Vector{typeof(d)}(undef, n_elements) + + idx = 1 + for j in 1:k + for i in 1:j + I_indices[idx] = i + J_indices[idx] = j + values[idx] = deepcopy(d) + idx += 1 + end + end + + uppertrian = sparse(I_indices, J_indices, values, k, k) + return 
SymArray{typeof(d)}(uppertrian) end """ @@ -98,18 +129,61 @@ function SymArray(d::AbstractMatrix{F}) where {F} return convert(SymArray{F}, d) end -function Base.size(a::SymArray) - return (a.k, a.k) +function size(a::SymArray) + return size(a.uppertrian) end -Base.@propagate_inbounds function Base.getindex(a::SymArray, i, j) +# IndexStyle trait - use CartesianIndex for 2D arrays +Base.IndexStyle(::Type{<:SymArray}) = IndexCartesian() + +# axes function +function axes(a::SymArray) + return axes(a.uppertrian) +end + +# length function +function length(a::SymArray) + return length(a.uppertrian) +end + +Base.@propagate_inbounds function getindex(a::SymArray{F}, i::Int, j::Int) where {F} @boundscheck checkbounds(a, i, j) - @inbounds return a.d[minmax(i, j)] + if i <= j + @inbounds return a.uppertrian[i, j] + else + @inbounds return a.uppertrian[j, i] + end end -Base.@propagate_inbounds function Base.setindex!(a::SymArray, v, i, j) +Base.@propagate_inbounds function setindex!(a::SymArray{F}, v, i::Int, j::Int) where {F} @boundscheck checkbounds(a, i, j) - @inbounds a.d[minmax(i, j)] = v + if i <= j + @inbounds a.uppertrian[i, j] = v + else + @inbounds a.uppertrian[j, i] = v + end +end + +# similar function for creating similar arrays +function similar(a::SymArray{F}) where {F} + k = size(a, 1) + return SymArray(k, zero(F)) +end + +function similar(a::SymArray, ::Type{T}) where {T} + k = size(a, 1) + return SymArray(k, zero(T)) +end + +function similar(a::SymArray, ::Type{T}, dims::Dims{2}) where {T} + dims[1] == dims[2] || throw(ArgumentError("SymArray must be square")) + return SymArray(dims[1], zero(T)) +end + +function copyto!(dest::SymArray{F}, src::SymArray{F}) where {F} + size(dest) == size(src) || throw(DimensionMismatch("arrays must have the same size")) + copyto!(dest.uppertrian, src.uppertrian) + return dest end """ @@ -124,7 +198,7 @@ Efficiently sum all elements in the symmetric matrix (counting each off-diagonal This is more efficient than `sum(a)` because 
it only sums stored elements. """ function sum_tri_with_diag(a::SymArray) - return sum(values(a.d)) + return sum(a.uppertrian.nzval) end function eltype(::SymArray{F}) where {F} @@ -134,26 +208,122 @@ end function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} @assert size(a, 1) == size(a, 2) k = size(a, 1) - res = SymArray(k, a[1, 1]) - for j in axes(a, 2) - for i in axes(a, 1) - if i <= j - res[i, j] = a[i, j] - end + + # Directly build upper triangle sparse matrix + # Pre-allocate with exact size needed + I_indices = Vector{Int}(undef, div(k * (k + 1), 2)) + J_indices = Vector{Int}(undef, div(k * (k + 1), 2)) + values = Vector{F}(undef, div(k * (k + 1), 2)) + + idx = 1 + for j in 1:k + for i in 1:j + I_indices[idx] = i + J_indices[idx] = j + values[idx] = a[i, j] + idx += 1 end end - return res + + uppertrian = sparse(I_indices, J_indices, values, k, k) + return SymArray{F}(uppertrian) end function convert(::Type{AbstractMatrix{F}}, a::SymArray{F}) where {F} - k = a.k - m = zeros(F, k, k) - for i in 1:k - for j in i:k - m[i, j] = a[i, j] + # Reconstruct full symmetric matrix from upper triangle + # m = upper + upper' - Diagonal(upper) creates the full symmetric matrix + m = a.uppertrian + transpose(a.uppertrian) - + SparseArrays.spdiagm(0 => diag(a.uppertrian)) + return Matrix(m) +end + +function copy!(dest::SymArray{F}, src::SymArray{F}) where {F <: Real} + copyto!(dest, src) + return dest +end + +function copy_with_array!(dest::SymArray{F}, src::SymArray{F}) where {F <: AbstractArray} + @inbounds for index in eachindex(dest) + copyto!(dest[index], src[index]) + end + return dest +end + +# Broadcasting support - custom style to maintain symmetric structure +struct SymArrayStyle <: Broadcast.AbstractArrayStyle{2} end +SymArrayStyle(::Val{2}) = SymArrayStyle() + +Base.BroadcastStyle(::Type{<:SymArray}) = SymArrayStyle() + +# When broadcasting with scalars or other styles, keep SymArrayStyle +Base.BroadcastStyle(::SymArrayStyle, 
::Broadcast.DefaultArrayStyle{0}) = SymArrayStyle() +Base.BroadcastStyle(::Broadcast.DefaultArrayStyle{0}, ::SymArrayStyle) = SymArrayStyle() + +# When broadcasting with other arrays, use default array style +function Base.BroadcastStyle(::SymArrayStyle, ::Broadcast.DefaultArrayStyle) + Broadcast.DefaultArrayStyle{2}() +end +function Base.BroadcastStyle(::Broadcast.DefaultArrayStyle, ::SymArrayStyle) + Broadcast.DefaultArrayStyle{2}() +end + +# When broadcasting between SymArrays, keep SymArrayStyle +Base.BroadcastStyle(::SymArrayStyle, ::SymArrayStyle) = SymArrayStyle() + +# Custom similar for broadcasted SymArrays +function Base.similar( + bc::Broadcast.Broadcasted{SymArrayStyle}, ::Type{ElType}) where {ElType} + # For mutating functions that return Nothing, don't allocate a SymArray + if ElType === Nothing + # Find the first SymArray in the broadcast expression + A = find_first_symarray(bc) + # Return a similar array with the same element type as the input + # This allows the broadcast to work but the result won't be used + return similar(Array{ElType}, axes(bc)) + end + # Find the first SymArray in the broadcast expression to get dimensions + A = find_first_symarray(bc) + return SymArray(size(A, 1), zero(ElType)) +end + +# Helper function to find a SymArray in the broadcast tree +find_first_symarray(bc::Broadcast.Broadcasted) = find_first_symarray(bc.args) +find_first_symarray(args::Tuple{}) = error("No SymArray found in broadcast") +find_first_symarray(args::Tuple) = find_first_symarray_in_args(args[1], Base.tail(args)) + +# Handle direct SymArray +find_first_symarray_in_args(x::SymArray, rest) = x +# Handle Extruded SymArray (from broadcasting) +find_first_symarray_in_args(x::Broadcast.Extruded{<:SymArray}, rest) = x.x +# Handle nested broadcasts +find_first_symarray_in_args(x::Broadcast.Broadcasted, rest) = find_first_symarray(x) +# Keep searching +find_first_symarray_in_args(x, rest) = find_first_symarray(rest) + +# Custom copyto! 
for efficient broadcasting +function Base.copyto!(dest::SymArray, bc::Broadcast.Broadcasted{SymArrayStyle}) + # Broadcast only over the upper triangle for efficiency + axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc)) + bc′ = Broadcast.preprocess(dest, bc) + + # Only compute upper triangle + k = size(dest, 1) + @inbounds for j in 1:k + for i in 1:j + dest[i, j] = bc′[CartesianIndex(i, j)] end end - return m + return dest +end + +# For broadcasting that returns Nothing (like with mutating functions) +function Base.copyto!(dest::AbstractArray, bc::Broadcast.Broadcasted{SymArrayStyle}) + # Fall back to default behavior + Broadcast.materialize!(dest, bc) +end + +@inline function throwdm(axdest, axsrc) + throw(DimensionMismatch("destination axes $axdest are not compatible with source axes $axsrc")) end end diff --git a/test/runtests.jl b/test/runtests.jl index c2e2e1c..0767e79 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,6 +3,7 @@ using LinearAlgebra, SparseArrays using NetworkHistogram @testset "Tests" begin + include("test_symarray.jl") include("test_data_format.jl") include("test_distributions_type.jl") include("test_swap_workspace.jl") diff --git a/test/test_symarray.jl b/test/test_symarray.jl new file mode 100644 index 0000000..a2a1f53 --- /dev/null +++ b/test/test_symarray.jl @@ -0,0 +1,296 @@ +using Test +using NetworkHistogram +using NetworkHistogram.FastSymArray +using SparseArrays +using LinearAlgebra + +@testset "SymArray Array Interface" begin + @testset "Construction and basic properties" begin + # Test construction with scalar + a = SymArray(3, 1.0) + @test a isa AbstractArray{Float64, 2} + @test size(a) == (3, 3) + @test length(a) == 9 + @test axes(a) == (1:3, 1:3) + @test eltype(a) == Float64 + + # Test construction with zeros + b = SymArray(5, 0.0) + @test size(b) == (5, 5) + @test all(b[i, j] == 0.0 for i in 1:5 for j in 1:5) + + # Test dimension validation + @test_throws ArgumentError SymArray(0, 1.0) + @test_throws ArgumentError 
SymArray(-1, 1.0) + end + + @testset "Indexing - getindex and setindex!" begin + a = SymArray(4, 0.0) + + # Test setindex! in upper triangle + a[1, 2] = 5.0 + @test a[1, 2] == 5.0 + @test a[2, 1] == 5.0 # Symmetry + + # Test setindex! in lower triangle (should set upper) + a[3, 2] = 7.0 + @test a[2, 3] == 7.0 + @test a[3, 2] == 7.0 + + # Test diagonal + a[2, 2] = 3.0 + @test a[2, 2] == 3.0 + + # Test bounds checking + @test_throws BoundsError a[0, 1] + @test_throws BoundsError a[5, 1] + @test_throws BoundsError a[1, 5] + end + + @testset "Symmetry property" begin + a = SymArray(5, 0.0) + + # Set values and verify symmetry + for i in 1:5 + for j in 1:5 + val = i * 10 + j + a[i, j] = val + @test a[i, j] == a[j, i] + end + end + end + + @testset "Construction from matrix" begin + # Test from symmetric matrix + M = [1.0 2.0 3.0; + 2.0 4.0 5.0; + 3.0 5.0 6.0] + a = SymArray(M) + + @test size(a) == (3, 3) + for i in 1:3, j in 1:3 + @test a[i, j] == M[i, j] + end + + # Test non-square matrix throws error + @test_throws ArgumentError SymArray([1.0 2.0; 3.0 4.0; 5.0 6.0]) + end + + @testset "convert functions" begin + # Test conversion to SymArray + M = [1.0 2.0; 2.0 4.0] + a = convert(SymArray{Float64}, M) + @test a isa SymArray{Float64} + @test a[1, 1] == 1.0 + @test a[1, 2] == 2.0 + @test a[2, 2] == 4.0 + + # Test conversion to AbstractMatrix + b = convert(AbstractMatrix{Float64}, a) + @test b isa Matrix{Float64} + @test b == M + @test b[1, 2] == b[2, 1] # Verify symmetry + end + + @testset "similar function" begin + a = SymArray(3, 5.0) + + # Test similar without type + b = similar(a) + @test size(b) == size(a) + @test eltype(b) == eltype(a) + @test b isa SymArray{Float64} + + # Test similar with type + c = similar(a, Int) + @test size(c) == size(a) + @test eltype(c) == Int + @test c isa SymArray{Int} + + # Test similar with type and dimensions + d = similar(a, Float32, (4, 4)) + @test size(d) == (4, 4) + @test eltype(d) == Float32 + + # Test non-square dimensions throw 
error + @test_throws ArgumentError similar(a, Float64, (3, 4)) + end + + @testset "copyto! and copy!" begin + a = SymArray(3, 0.0) + a[1, 1] = 1.0 + a[1, 2] = 2.0 + a[2, 3] = 5.0 + + b = similar(a) + copyto!(b, a) + + @test b[1, 1] == 1.0 + @test b[1, 2] == 2.0 + @test b[2, 1] == 2.0 + @test b[2, 3] == 5.0 + @test b[3, 2] == 5.0 + + # Test copy! + c = similar(a) + copy!(c, a) + @test c[1, 1] == a[1, 1] + @test c[1, 2] == a[1, 2] + @test c[2, 3] == a[2, 3] + + # Test dimension mismatch + d = SymArray(4, 0.0) + @test_throws DimensionMismatch copyto!(d, a) + end + + @testset "Array operations" begin + a = SymArray(3, 2.0) + + # Test iteration + count = 0 + for val in a + @test val == 2.0 + count += 1 + end + @test count == 9 + + # Test sum + @test sum(a) == 18.0 + + # Test all/any + @test all(x -> x == 2.0, a) + @test any(x -> x == 2.0, a) + + # Test maximum/minimum + b = SymArray(3, 0.0) + b[1, 1] = 5.0 + b[2, 3] = -3.0 + @test maximum(b) == 5.0 + @test minimum(b) == -3.0 + end + + @testset "Mathematical operations" begin + a = SymArray(3, 2.0) + b = SymArray(3, 3.0) + + # Element-wise operations (using broadcasting) + c = a .+ b + @test c isa SymArray + @test all(c[i, j] == 5.0 for i in 1:3, j in 1:3) + + d = a .* 2 + @test d isa SymArray + @test all(d[i, j] == 4.0 for i in 1:3, j in 1:3) + + # Test subtraction + e = b .- a + @test e isa SymArray + @test all(e[i, j] == 1.0 for i in 1:3, j in 1:3) + + # Test division + f = b ./ 2.0 + @test f isa SymArray + @test all(f[i, j] == 1.5 for i in 1:3, j in 1:3) + + # Test unary operations + g = SymArray(3, -2.0) + h = abs.(g) + @test h isa SymArray + @test all(h[i, j] == 2.0 for i in 1:3, j in 1:3) + + # Test with mixed values + m = SymArray(3, 0.0) + m[1, 1] = 1.0 + m[1, 2] = 2.0 + m[2, 2] = 3.0 + m[1, 3] = 4.0 + m[2, 3] = 5.0 + m[3, 3] = 6.0 + + n = m .+ 10.0 + @test n isa SymArray + @test n[1, 1] == 11.0 + @test n[1, 2] == 12.0 + @test n[2, 1] == 12.0 # Symmetry + @test n[2, 2] == 13.0 + @test n[3, 3] == 16.0 + + # Test 
operations between two SymArrays with different values + p = SymArray(3, 0.0) + p[1, 1] = 10.0 + p[2, 2] = 20.0 + p[3, 3] = 30.0 + + q = m .+ p + @test q isa SymArray + @test q[1, 1] == 11.0 + @test q[2, 2] == 23.0 + @test q[3, 3] == 36.0 + @test q[1, 2] == 2.0 + @test q[2, 1] == 2.0 + end + + @testset "Special case: sum_tri_with_diag" begin + a = SymArray(3, 1.0) + # Only upper triangle is stored: 6 elements + # [1,1], [1,2], [1,3], [2,2], [2,3], [3,3] + @test sum_tri_with_diag(a) == 6.0 + + b = SymArray(4, 2.0) + # Upper triangle has 10 elements for 4x4 + @test sum_tri_with_diag(b) == 20.0 + + # Verify it's different from full sum (which counts off-diag twice) + # Full sum would be 2*n*(n-1)/2 + n for value v + # = v*(n^2-n+n) = v*n^2 + # While sum_tri_with_diag gives v*n*(n+1)/2 + end + + @testset "Type stability" begin + # Float64 + a = SymArray(3, 1.0) + @test typeof(a[1, 1]) == Float64 + + # Int + b = SymArray(3, 1) + @test typeof(b[1, 1]) == Int + + # Float32 + c = SymArray(3, 1.0f0) + @test typeof(c[1, 1]) == Float32 + end + + @testset "Sparse matrix properties" begin + a = SymArray(10, 0.0) + # Initially all elements are stored (including zeros) + # Set only a few elements to non-zero + a[1, 5] = 3.0 + a[3, 7] = 4.0 + a[9, 9] = 5.0 + + # Verify values are correct (symmetry) + @test a[1, 5] == 3.0 + @test a[5, 1] == 3.0 + @test a[3, 7] == 4.0 + @test a[7, 3] == 4.0 + @test a[9, 9] == 5.0 + @test a[2, 2] == 0.0 + end + + @testset "Edge cases" begin + # 1x1 matrix + a = SymArray(1, 5.0) + @test size(a) == (1, 1) + @test a[1, 1] == 5.0 + a[1, 1] = 10.0 + @test a[1, 1] == 10.0 + + # Large diagonal + b = SymArray(100, 0.0) + for i in 1:100 + b[i, i] = Float64(i) + end + @test b[50, 50] == 50.0 + @test b[99, 99] == 99.0 + end +end From adfbccade79b1d38a0849bb13f9520a01fcd15b3 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 17 Oct 2025 11:26:43 +0200 Subject: [PATCH 193/266] clean --- PERFORMANCE.md | 242 ---------------------- Project.toml | 7 - QUICKREF.md | 
138 ------------ benchmark/Project.toml | 9 + benchmark/benchmark_optimization.jl | 94 +++++++-- benchmark/benchmark_results/baseline.json | 112 +++++----- benchmark/profile_optimization.jl | 36 +--- benchmark/run_benchmarks.jl | 107 +++++----- benchmark/visualize_benchmarks.jl | 127 ++++++++---- src/distributions/distributions_type.jl | 8 +- 10 files changed, 290 insertions(+), 590 deletions(-) delete mode 100644 PERFORMANCE.md delete mode 100644 QUICKREF.md create mode 100644 benchmark/Project.toml diff --git a/PERFORMANCE.md b/PERFORMANCE.md deleted file mode 100644 index d68accc..0000000 --- a/PERFORMANCE.md +++ /dev/null @@ -1,242 +0,0 @@ -# Performance Optimization Guide for NetworkHistogram - -This repository now includes a comprehensive performance regression test suite -to help improve the optimization speed of NetworkHistogram algorithms. - -## 🎯 Quick Start - -### 1. Establish a Baseline - -Before making any changes: - -```bash -julia dev/run_benchmarks.jl baseline -``` - -### 2. Make Your Changes - -Edit the optimization code (e.g., in `src/optimization/`) - -### 3. Test Performance - -```bash -julia dev/run_benchmarks.jl current -``` - -This will automatically compare against your baseline and show: - -- Which operations got faster/slower -- By how much (speedup factor and percentage) -- Detailed timing statistics - -### 4. Verify Correctness - -```bash -julia --project=. 
-e 'using Pkg; Pkg.test()' -``` - -## 📊 What Gets Benchmarked - -### Core Operations - -- **Single swap operations** (Bernoulli & Categorical networks) - - Small networks: n=50 nodes - - Medium networks: n=200 nodes - - Large networks: n=500 nodes - -### Full Workflows - -- Complete optimization runs (1,000 iterations) -- End-to-end performance measurement - -### Components - -- Assignment creation -- EdgeList creation -- Log-likelihood computation -- Edge extraction - -## 🔍 Key Hotspots for Optimization - -Based on the workflow in `test_decorated_paper.jl`, these are the critical -bottlenecks: - -### 1. `apply_swap!` Function - -**Location**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` - -**Why it matters**: Called millions of times during optimization (once per -iteration) - -**Current bottlenecks**: - -- Uses `deepcopy` for state management -- Iterates over all neighbors repeatedly -- Allocates temporary arrays - -**Optimization ideas**: - -- Pre-allocate workspace buffers -- Use in-place operations -- Cache neighbor lists -- Reduce `deepcopy` usage - -### 2. `get_edges_in_groups` Function - -**Location**: `src/assignment.jl` - -**Why it matters**: Called during log-likelihood recomputation - -**Current bottlenecks**: - -- Uses `findall` (allocates) -- Creates new vector each time -- Linear search through nodes - -**Optimization ideas**: - -- Pre-compute group membership indices -- Use pre-allocated output buffers -- Cache results for frequently accessed group pairs - -### 3. Log-likelihood Updates - -**Location**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` - -**Why it matters**: Must be computed after each swap - -**Current approach**: Recomputes only affected group pairs (good!) 
- -**Optimization ideas**: - -- Batch `logpdf` computations -- Use vectorized operations -- Cache intermediate calculations - -## 📁 File Structure - -``` -dev/ - ├── run_benchmarks.jl # Easy-to-use benchmark runner - ├── benchmark_optimization.jl # Standalone benchmarking script - ├── BENCHMARKING.md # Detailed documentation - └── benchmark_results/ # Stored benchmark results - └── baseline.json # Reference baseline - -test/ - └── test_performance_regression.jl # Performance tests for CI -``` - -## 📈 Example Output - -``` ---- Single Swap Operations (Bernoulli) --- -Benchmarking Bernoulli swap (n=50, k=2)... - Median: 0.234 ms -Benchmarking Bernoulli swap (n=200, k=3)... - Median: 1.567 ms - -======================================== -Performance Comparison vs Baseline -Baseline: 2024-10-15 14:30:00 -======================================== -✓ FASTER bernoulli_swap_n50_k2: 1.23x (23.0%) - Current: 0.190 ms | Baseline: 0.234 ms - -≈ SIMILAR bernoulli_swap_n200_k3: 1.02x (2.0%) - Current: 1.537 ms | Baseline: 1.567 ms -``` - -## 🔧 Advanced Usage - -### Run Only Specific Benchmarks - -Edit `dev/benchmark_optimization.jl` to comment out benchmarks you don't need. - -### Compare Two Specific Benchmark Files - -```bash -julia dev/run_benchmarks.jl compare results/v1.json results/v2.json -``` - -### Profile Your Code - -```julia -using Profile - -include("dev/test_decorated_paper.jl") - -# Profile a specific function -@profile main(500:500:1000, 2) - -Profile.print() -# Or for a flamegraph: -using ProfileView -ProfileView.view() -``` - -### Check Allocations - -```julia -using BenchmarkTools - -# See allocations for a single operation -@btime apply_swap!($assignment, $swap) samples=1 evals=1 -``` - -## 🎓 Best Practices - -1. **Always establish a baseline first** - You need a reference point -2. **Make incremental changes** - Change one thing at a time -3. **Profile before optimizing** - Don't guess where the bottleneck is -4. 
**Test correctness** - Fast but wrong is useless -5. **Document your changes** - Explain why you made each optimization -6. **Consider maintainability** - Don't sacrifice readability for tiny gains - -## 📚 Resources - -- **Detailed benchmarking guide**: See `dev/BENCHMARKING.md` -- **Julia Performance Tips**: - https://docs.julialang.org/en/v1/manual/performance-tips/ -- **BenchmarkTools.jl**: https://juliaci.github.io/BenchmarkTools.jl/stable/ -- **Profile module**: https://docs.julialang.org/en/v1/stdlib/Profile/ - -## 🤝 Contributing Performance Improvements - -When submitting a PR with performance improvements: - -1. Include before/after benchmark results -2. Explain what you optimized and why -3. Ensure all tests still pass -4. Document any trade-offs made -5. Consider adding new benchmarks for your changes - -## ❓ Troubleshooting - -### "BenchmarkTools not available" - -```bash -julia --project=test -e 'using Pkg; Pkg.add("BenchmarkTools")' -``` - -### High variance in results - -- Close other applications -- Run benchmarks multiple times -- Use `--threads=1` flag for consistency - -### Benchmark takes too long - -- Reduce the `samples` parameter -- Use smaller test networks -- Run individual benchmark categories instead of all at once - -## 📞 Getting Help - -- Open an issue with benchmark results -- Include your system specs (OS, Julia version, CPU) -- Describe what you're trying to optimize - ---- - -Happy optimizing! 
🚀 diff --git a/Project.toml b/Project.toml index 7c67a0a..affc7d6 100644 --- a/Project.toml +++ b/Project.toml @@ -4,20 +4,15 @@ version = "0.5.2" authors = ["Charles Dufour", "Jake Grainger"] [deps] -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" -JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" [weakdeps] Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" @@ -32,10 +27,8 @@ LightMCExt = "LightMC" MakieExt = "Makie" [compat] -Dates = "1.11.0" Graphons = "0.1.0" LinearAlgebra = "1.12.0" -Printf = "1.11.0" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/QUICKREF.md b/QUICKREF.md deleted file mode 100644 index d0e73eb..0000000 --- a/QUICKREF.md +++ /dev/null @@ -1,138 +0,0 @@ -# Performance Optimization Quick Reference - -## 🚀 Quick Commands - -```bash -# Establish baseline -julia dev/run_benchmarks.jl baseline - -# Benchmark current code -julia dev/run_benchmarks.jl current - -# Profile to find bottlenecks -julia dev/profile_optimization.jl swap - -# Run tests -julia --project=. -e 'using Pkg; Pkg.test()' - -# Visualize results -julia dev/visualize_benchmarks.jl --all -``` - -## 📊 Understanding Output - -``` -✓ FASTER = >5% improvement -✗ SLOWER = >5% regression -≈ SIMILAR = Within ±5% -``` - -## 🎯 Priority Hotspots - -### 1. 
`apply_swap!` 🔴 CRITICAL - -- **File**: `src/optimization/swap_workspace.jl`, `swap_categorical.jl` -- **Why**: Called ~1M times per run -- **Fix**: Reduce allocations, avoid `deepcopy` - -### 2. `get_edges_in_groups` 🟡 MODERATE - -- **File**: `src/assignment.jl` -- **Why**: Called during LL updates -- **Fix**: Pre-allocate, cache group membership - -### 3. Edge iteration 🟢 LOW - -- **File**: `src/EdgeList.jl` -- **Why**: Used everywhere -- **Fix**: Ensure type stability - -## 🛠️ Common Optimizations - -### Check Allocations - -```julia -using BenchmarkTools -@btime my_function($args) samples=1 evals=1 -# Look for allocations in output -``` - -### Profile Code - -```julia -using Profile -@profile my_function(args) -Profile.print(maxdepth=15) -``` - -### Type Stability - -```julia -using Cthulhu -@descend my_function(args) -# Red = type unstable (BAD) -``` - -## 📁 Key Files - -``` -├── PERFORMANCE.md # Main guide -├── dev/ -│ ├── run_benchmarks.jl # 👈 USE THIS -│ ├── profile_optimization.jl # For profiling -│ ├── visualize_benchmarks.jl # View results -│ └── BENCHMARKING.md # Details -└── src/optimization/ # 🎯 Optimize here - ├── swap_workspace.jl - └── swap_categorical.jl -``` - -## 📈 Expected Gains - -- Reduce allocations: **20-50%** speedup -- Better data structures: **2-10x** speedup -- SIMD/vectorization: **2-4x** speedup -- Fix type instability: **2-5x** speedup - -## 🔄 Workflow - -1. **Baseline** → 2. **Profile** → 3. **Optimize** → 4. **Benchmark** → 5. - **Test** → Repeat - -## 💡 Tips - -- Focus on **hot paths** (profile first!) 
-- Measure **before and after** every change -- Keep changes **small and focused** -- Always **test correctness** -- Document **what and why** - -## 🆘 Troubleshooting - -### "BenchmarkTools not found" - -```bash -julia --project=test -e 'using Pkg; Pkg.add("BenchmarkTools")' -``` - -### Results vary - -- Close other apps -- Use `--threads=1` -- Increase samples - -### Too slow - -- Reduce samples -- Use smaller networks -- Run specific benchmarks - -## 📚 Learn More - -- `PERFORMANCE.md` - Full guide -- `dev/BENCHMARKING.md` - Detailed docs -- `julia dev/run_benchmarks.jl help` - CLI help - ---- - -**Remember**: Profile → Optimize → Benchmark → Test → Repeat 🔁 diff --git a/benchmark/Project.toml b/benchmark/Project.toml new file mode 100644 index 0000000..691e4d7 --- /dev/null +++ b/benchmark/Project.toml @@ -0,0 +1,9 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" +NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" +PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/benchmark/benchmark_optimization.jl b/benchmark/benchmark_optimization.jl index c48a1da..89feebc 100644 --- a/benchmark/benchmark_optimization.jl +++ b/benchmark/benchmark_optimization.jl @@ -17,7 +17,9 @@ using StaticArrays using BenchmarkTools using JSON3 using Dates +using PrettyTables using NetworkHistogram +using LoggingExtras # Create output directory if it doesn't exist const BENCHMARK_DIR = joinpath(@__DIR__, "benchmark_results") @@ -84,7 +86,7 @@ function benchmark_single_swap( b = @benchmark begin NetworkHistogram.apply_swap!($assignment, $swap) NetworkHistogram.revert_swap!($assignment, $swap) - end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=samples 
evals=1 + end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=samples #evals=1 return Dict( "median_ms" => median(b.times) / 1e6, @@ -112,11 +114,11 @@ function benchmark_full_optimization( $max_iter, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue($max_iter ÷ 2), + NetworkHistogram.PreviousBestValue($max_iter), false ) NetworkHistogram.nethist($A, $d, $initial_labels, params) - end samples=samples evals=1 + end #samples=samples evals=1 return Dict( "median_ms" => median(b.times) / 1e6, @@ -261,12 +263,15 @@ function compare_with_baseline(results, baseline_file) baseline = JSON3.read(read(baseline_file, String)) - println("\n" * "="^70) + println("\n" * "="^80) println("Performance Comparison vs Baseline") println("Baseline: $(baseline["timestamp"])") - println("="^70) + println("="^80 * "\n") + + # Prepare data for table + table_data = [] - for (key, value) in results["benchmarks"] + for (key, value) in sort(collect(results["benchmarks"]), by = x -> string(x[1])) if haskey(baseline["benchmarks"], key) baseline_val = baseline["benchmarks"][key] @@ -301,18 +306,67 @@ function compare_with_baseline(results, baseline_file) speedup = baseline_median / current_median change_pct = (speedup - 1) * 100 - status = if speedup > 1.05 - "✓ FASTER" - elseif speedup < 0.95 - "✗ SLOWER" - else - "≈ SIMILAR" - end - - println("$status $key: $(round(speedup, digits=2))x ($(round(change_pct, sigdigits=3))%)") - println(" Current: $(round(current_median, digits=2)) $unit | Baseline: $(round(baseline_median, digits=2)) $unit") + push!(table_data, + ( + string(key), + baseline_median, + current_median, + unit, + speedup, + change_pct + )) end end + + if isempty(table_data) + println("No comparable benchmarks found.") + return + end + + # Create table with headers + headers = ["Benchmark", "Baseline", "Current", "Unit", "Speedup", "Change (%)"] + + # Extract data into columns + benchmark_names = 
[row[1] for row in table_data] + baseline_vals = [round(row[2], digits = 3) for row in table_data] + current_vals = [round(row[3], digits = 3) for row in table_data] + units = [row[4] for row in table_data] + speedups = [round(row[5], digits = 3) for row in table_data] + changes = [round(row[6], digits = 2) for row in table_data] + + # Create highlighters for improvements (green) and regressions (red) + # These highlight entire rows based on speedup value + hl_improvement = TextHighlighter( + (data, i, j) -> data[i, 5] > 1.05, # Speedup column 5, >5% improvement + crayon"green" + ) + + hl_regression = TextHighlighter( + (data, i, j) -> data[i, 5] < 0.95, # Speedup column 5, >5% regression + crayon"red" + ) + + # Print the table + pretty_table( + hcat(benchmark_names, baseline_vals, current_vals, units, speedups, changes); + column_labels = headers, + highlighters = [hl_improvement, hl_regression], + alignment = [:l, :r, :r, :c, :r, :r], + table_format = TextTableFormat(borders = text_table_borders__unicode_rounded) + ) + + # Print summary statistics + all_speedups = [row[5] for row in table_data] + n_improved = count(s -> s > 1.05, all_speedups) + n_regressed = count(s -> s < 0.95, all_speedups) + n_similar = length(all_speedups) - n_improved - n_regressed + geomean_speedup = exp(sum(log.(all_speedups)) / length(all_speedups)) + + println("\nSummary:") + println(" Geometric mean speedup: $(round(geomean_speedup, digits=3))x") + println(" Benchmarks improved: $n_improved") + println(" Benchmarks regressed: $n_regressed") + println(" Benchmarks similar: $n_similar") end # Main execution @@ -332,5 +386,11 @@ function main() end if abspath(PROGRAM_FILE) == @__FILE__ - main() + # Filter logs from NetworkHistogram module + logger_filter = EarlyFilteredLogger(global_logger()) do args + return !(args._module === NetworkHistogram) + end + with_logger(logger_filter) do + main() + end end diff --git a/benchmark/benchmark_results/baseline.json 
b/benchmark/benchmark_results/baseline.json index a7d34fb..7a6aade 100644 --- a/benchmark/benchmark_results/baseline.json +++ b/benchmark/benchmark_results/baseline.json @@ -1,83 +1,83 @@ { "julia_version": "1.12.0", - "timestamp": "2025-10-17 09:29:56", + "timestamp": "2025-10-17 10:48:29", "benchmarks": { "bernoulli_swap_n500_k5": { - "max_ms": 14.133584, - "min_ms": 2.259333, - "mean_ms": 2.57733328, - "median_ms": 2.294333, - "std_ms": 1.6774071498610286 + "max_ms": 2.400625, + "min_ms": 2.256625, + "mean_ms": 2.31981594, + "median_ms": 2.3136875, + "std_ms": 0.025263586533731647 }, "assignment_creation_n200": { - "max_us": 1090.334, - "min_us": 810.875, - "median_us": 1002.0835, - "mean_us": 979.6716700000001, - "std_us": 77.08177811411355 + "max_us": 1476.625, + "min_us": 803.875, + "median_us": 1040.75, + "mean_us": 1023.70714, + "std_us": 102.75691494911177 }, "categorical_swap_n50_k2_m3": { - "max_ms": 0.02725, - "min_ms": 0.007125, - "mean_ms": 0.00747457, - "median_ms": 0.00725, - "std_ms": 0.002000665006890953 + "max_ms": 0.0126875, + "min_ms": 0.007677, + "mean_ms": 0.007845412500000001, + "median_ms": 0.0077395, + "std_ms": 0.0005249944761313536 }, "edgelist_creation_n200": { - "max_us": 3194.709, - "min_us": 196.959, - "median_us": 257.6665, - "mean_us": 287.92171, - "std_us": 295.18723492986845 + "max_us": 3618.958, + "min_us": 200.166, + "median_us": 258.3545, + "mean_us": 295.56669, + "std_us": 337.15490001630405 }, "loglikelihood_n200": { - "max_us": 0.00975, - "min_us": 0.005208999999999999, - "median_us": 0.005333, - "mean_us": 0.005361388, - "std_us": 0.0001773114909441032 + "max_us": 0.017709, + "min_us": 0.00525, + "median_us": 0.005417, + "mean_us": 0.005481977999999998, + "std_us": 0.0005442266588216458 }, "bernoulli_optimize_n100_1k": { - "max_ms": 109.028, - "min_ms": 53.956375, - "mean_ms": 80.9743292, - "median_ms": 78.6800415, - "std_ms": 25.923985956866588 + "max_ms": 113.915708, + "min_ms": 105.567625, + "mean_ms": 
109.5839403478261, + "median_ms": 109.3843125, + "std_ms": 1.665303524274982 }, "categorical_swap_n200_k3_m4": { - "max_ms": 0.049458, - "min_ms": 0.020292, - "mean_ms": 0.02077412, - "median_ms": 0.020417, - "std_ms": 0.0029010176211081164 + "max_ms": 0.0485, + "min_ms": 0.021917, + "mean_ms": 0.02255671, + "median_ms": 0.022125, + "std_ms": 0.0026733226036775534 }, "bernoulli_swap_n50_k2": { - "max_ms": 0.042708, - "min_ms": 0.032125, - "mean_ms": 0.032648790000000004, - "median_ms": 0.032375, - "std_ms": 0.001290518346215656 + "max_ms": 0.043833, + "min_ms": 0.031958, + "mean_ms": 0.032637479999999996, + "median_ms": 0.032416, + "std_ms": 0.0015867087750969608 }, "categorical_swap_n500_k5_m5": { - "max_ms": 0.081875, - "min_ms": 0.056125, - "mean_ms": 0.057171599999999996, - "median_ms": 0.056375, - "std_ms": 0.0037585196771156197 + "max_ms": 0.080292, + "min_ms": 0.06, + "mean_ms": 0.06298337999999999, + "median_ms": 0.06225, + "std_ms": 0.003947748006400714 }, "categorical_optimize_n100_1k": { - "max_ms": 16.551667, - "min_ms": 8.196416, - "mean_ms": 12.5445417, - "median_ms": 13.474271, - "std_ms": 3.0889250452015777 + "max_ms": 34.224917, + "min_ms": 16.19525, + "mean_ms": 17.52069984965035, + "median_ms": 16.7704585, + "std_ms": 1.8944621490608184 }, "bernoulli_swap_n200_k3": { - "max_ms": 0.514583, - "min_ms": 0.437791, - "mean_ms": 0.44661872999999996, - "median_ms": 0.441792, - "std_ms": 0.015195061493625175 + "max_ms": 0.596417, + "min_ms": 0.438375, + "mean_ms": 0.45312872, + "median_ms": 0.443208, + "std_ms": 0.025498219537002414 } } } \ No newline at end of file diff --git a/benchmark/profile_optimization.jl b/benchmark/profile_optimization.jl index 804fc80..24083de 100644 --- a/benchmark/profile_optimization.jl +++ b/benchmark/profile_optimization.jl @@ -35,7 +35,11 @@ function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed = 42) end end - labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + base_size = 
n_nodes ÷ n_groups + remainder = n_nodes % n_groups + sizes = fill(base_size, n_groups) + sizes[1:remainder] .+= 1 + labels = StatsBase.inverse_rle(1:n_groups, sizes) A = NetworkHistogram.sample(sbm, labels) return A, labels, d end @@ -55,7 +59,11 @@ function create_test_sbm_categorical( end end - labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) + base_size = n_nodes ÷ n_groups + remainder = n_nodes % n_groups + sizes = fill(base_size, n_groups) + sizes[1:remainder] .+= 1 + labels = StatsBase.inverse_rle(1:n_groups, sizes) A = NetworkHistogram.sample(sbm, labels) return A, labels, d end @@ -108,7 +116,7 @@ function profile_full_optimization( max_iter, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(max_iter ÷ 2), + NetworkHistogram.PreviousBestValue(max_iter), false ) @@ -192,28 +200,6 @@ function print_results() Profile.print(maxdepth = 15) println("\n" * "="^70) - println("Generating flamegraph...") - println("="^70) - - # Try to use ProfileView if available - try - @eval using ProfileView - println("\nOpening ProfileView (flamegraph)...") - ProfileView.view() - println("Close the ProfileView window to continue...") - catch - # Try PProf - try - @eval using PProf - println("\nGenerating flamegraph with PProf...") - PProf.pprof() - catch - println("\nNo flamegraph viewer available.") - println("To visualize results, install ProfileView.jl or PProf.jl:") - println(" julia> using Pkg") - println(" julia> Pkg.add(\"ProfileView\") # or \"PProf\"") - end - end end function print_help() diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl index 3bdab74..3cf5956 100644 --- a/benchmark/run_benchmarks.jl +++ b/benchmark/run_benchmarks.jl @@ -46,136 +46,128 @@ function print_help() """) end -function ensure_dependencies() - println("Checking dependencies...") - - # Check if BenchmarkTools and JSON3 are available +function ensure_dependencies(tries = 0) + tries == 0 && @info 
"Checking dependencies..." + # Check if benchmark dependencies are installed try + @eval using StaticArrays @eval using BenchmarkTools @eval using JSON3 + @eval using PrettyTables + @eval using LoggingExtras + + @info "Dependencies OK ✓" catch - println("Installing required dependencies...") - Pkg.activate("test") - Pkg.add(["BenchmarkTools", "JSON3"]) - Pkg.activate(".") + if tries >= 2 + error("Failed to install dependencies after multiple attempts.") + elseif tries == 1 + @info "Trying to instantiate project..." + Pkg.instantiate() + ensure_dependencies(tries + 1) + else + @info "Activating benchmark project" + Pkg.activate("benchmark") + ensure_dependencies(tries + 1) + end end - - println("Dependencies OK ✓") end function run_baseline() - ensure_dependencies() - baseline_file = joinpath("benchmark", "benchmark_results", "baseline.json") if isfile(baseline_file) - print("Baseline already exists. Overwrite? (y/N): ") + printstyled( + "Baseline already exists. Overwrite? (y/N): ", color = :light_yellow, blink = true) response = readline() if lowercase(strip(response)) != "y" - println("Aborted.") + @info "Aborted." return end end - println("\nRunning baseline benchmarks...") - println("This may take several minutes...\n") + @info "Running baseline benchmarks... \nThis may take several minutes...\n" - run(`julia --project=. benchmark/benchmark_optimization.jl $baseline_file`) + run(`julia --project=benchmark benchmark/benchmark_optimization.jl $baseline_file`) - println("\n✓ Baseline established at: $baseline_file") - println("\nNext steps:") - println(" 1. Make your performance improvements") - println(" 2. Run: julia run_benchmarks.jl current") - println(" 3. Review the performance comparison") + @info "\n✓ Baseline established at: $baseline_file" * + "\n Next steps: " * + "\n 1. Make your performance improvements" * + "\n 2. Run: julia run_benchmarks.jl current" * + "\n 3. 
Review the performance comparison" end function run_current() - ensure_dependencies() - baseline_file = joinpath("benchmark", "benchmark_results", "baseline.json") if !isfile(baseline_file) - println("⚠ Warning: No baseline found!") - println("Consider running: julia run_benchmarks.jl baseline") - println("\nContinuing anyway...\n") + @warn "⚠ Warning: No baseline found! \n Consider running: julia run_benchmarks.jl baseline" + @info "\nContinuing anyway...\n" end timestamp = Dates.format(Dates.now(), "yyyy-mm-ddTHH-MM-SS") current_file = joinpath("benchmark", "benchmark_results", "current_$timestamp.json") - println("Running current benchmarks...") - println("This may take several minutes...\n") + @info "Running current benchmarks... \n This may take several minutes...\n" if isfile(baseline_file) - run(`julia --project=. benchmark/benchmark_optimization.jl $current_file $baseline_file`) + run(`julia --project=benchmark benchmark/benchmark_optimization.jl $current_file $baseline_file`) else - run(`julia --project=. benchmark/benchmark_optimization.jl $current_file`) + run(`julia --project=benchmark benchmark/benchmark_optimization.jl $current_file`) end - println("\n✓ Results saved to: $current_file") + @info "✓ Results saved to: $current_file" end function compare_benchmarks(file1, file2) - ensure_dependencies() - if !isfile(file1) - println("Error: File not found: $file1") + @error "Error: File not found: $file1" return end if !isfile(file2) - println("Error: File not found: $file2") + @error "Error: File not found: $file2" return end - println("Comparing benchmarks...") - println(" Baseline: $file2") - println(" Current: $file1\n") + @info "Comparing benchmarks... \n Baseline: $file2 \n Current: $file1\n" # Re-run comparison - run(`julia --project=. 
benchmark/benchmark_optimization.jl $file1 $file2`) + run(`julia --project=benchmark benchmark/benchmark_optimization.jl $file1 $file2`) end function clean_results() results_dir = joinpath("benchmark", "benchmark_results") if !isdir(results_dir) - println("No results directory found.") + @info "No results directory found." return end files = filter(f -> endswith(f, ".json") && f != "baseline.json", readdir(results_dir)) if isempty(files) - println("No benchmark results to clean.") + @info "No benchmark results to clean." return end - println("Found $(length(files)) benchmark result file(s):") + @info "Found $(length(files)) benchmark result file(s):" for f in files - println(" - $f") + @info " - $f" end - print("\nDelete these files? (y/N): ") + printstyled("\nDelete these files? (y/N): ", color = :light_yellow, blink = true) response = readline() if lowercase(strip(response)) == "y" for f in files rm(joinpath(results_dir, f)) end - println("✓ Cleaned $(length(files)) file(s)") + @info "✓ Cleaned $(length(files)) file(s)" else - println("Aborted.") + @info "Aborted." 
end end -function run_tests() - ensure_dependencies() - - println("Running full test suite...") - Pkg.test() -end - # Main execution function main() if length(ARGS) == 0 || ARGS[1] == "help" || ARGS[1] == "-h" || ARGS[1] == "--help" @@ -183,6 +175,8 @@ function main() return end + ensure_dependencies() + command = ARGS[1] if command == "baseline" @@ -191,8 +185,7 @@ function main() run_current() elseif command == "compare" if length(ARGS) < 3 - println("Error: compare requires two file arguments") - println("Usage: julia run_benchmarks.jl compare FILE1 FILE2") + @error "Error: compare requires two file arguments \n Usage: julia run_benchmarks.jl compare FILE1 FILE2" return end compare_benchmarks(ARGS[2], ARGS[3]) @@ -201,7 +194,7 @@ function main() elseif command == "test" run_tests() else - println("Error: Unknown command '$command'") + @info "Error: Unknown command '$command'" print_help() end end diff --git a/benchmark/visualize_benchmarks.jl b/benchmark/visualize_benchmarks.jl index adde75d..4a1a3a0 100644 --- a/benchmark/visualize_benchmarks.jl +++ b/benchmark/visualize_benchmarks.jl @@ -18,6 +18,7 @@ using JSON3 using Dates using Printf using Statistics +using PrettyTables function load_benchmark(filepath) if !isfile(filepath) @@ -82,26 +83,12 @@ function compare_multiple(files) # Print header println("\n" * "="^100) println("Benchmark Comparison Across Versions") - println("="^100) + println("="^100 * "\n") - # Create table - header = ["Benchmark", [b.file for b in benchmarks]...] - col_widths = [40, fill(18, length(benchmarks))...] 
+ # Prepare data for table + table_data = [] + baseline_vals = Dict{String, Float64}() - # Print header - print(rpad("Benchmark", col_widths[1])) - for (i, b) in enumerate(benchmarks) - print(rpad(b.file[1:min(end, 16)], col_widths[i + 1])) - end - println() - print(rpad("", col_widths[1])) - for (i, b) in enumerate(benchmarks) - print(rpad(b.timestamp[1:min(end, 16)], col_widths[i + 1])) - end - println() - println("-"^sum(col_widths)) - - # Print each metric for metric in all_metrics # Skip if metric has no values values = [haskey(b.metrics, metric) ? b.metrics[metric] : NaN for b in benchmarks] @@ -109,42 +96,90 @@ function compare_multiple(files) continue end - # Shorten metric name for display - display_name = metric - if length(display_name) > col_widths[1] - 2 - display_name = metric[1:(col_widths[1] - 5)] * "..." - end - - print(rpad(display_name, col_widths[1])) - + row = Any[metric] baseline_val = values[1] + baseline_vals[metric] = baseline_val + for (i, val) in enumerate(values) if isnan(val) - print(rpad("N/A", col_widths[i + 1])) + push!(row, "N/A") else - speedup = if !isnan(baseline_val) && baseline_val > 0 && i > 1 - baseline_val / val - else - 1.0 - end - - # Format with speedup indicator - val_str = @sprintf("%.2f ms", val) - if i > 1 && !isnan(baseline_val) - if speedup > 1.05 - val_str *= " ✓" - elseif speedup < 0.95 - val_str *= " ✗" - end - end - print(rpad(val_str, col_widths[i + 1])) + push!(row, round(val, digits = 2)) end end - println() + + push!(table_data, row) + end + + if isempty(table_data) + println("No metrics to display") + return + end + + # Create headers + headers = ["Benchmark"] + for b in benchmarks + short_name = length(b.file) > 16 ? b.file[1:13] * "..." : b.file + push!(headers, short_name) end - println("="^100) - println("\nLegend: ✓ = >5% faster, ✗ = >5% slower") + # Create subheaders with timestamps + subheaders = [""] + for b in benchmarks + short_ts = length(b.timestamp) > 16 ? 
b.timestamp[1:16] : b.timestamp + push!(subheaders, short_ts) + end + + # Create highlighters for improvements and regressions + # We'll color entire rows based on whether the value improved or regressed vs baseline + hl_improvement = TextHighlighter( + (data, i, j) -> begin + # Check if current value (in any column after baseline) shows improvement + baseline_idx = 2 # First value column + baseline_val = data[i, baseline_idx] + + if j > 2 && baseline_val isa Number && baseline_val > 0 + current_val = data[i, j] + if current_val isa Number + speedup = baseline_val / current_val + return speedup > 1.05 # >5% improvement + end + end + return false + end, + crayon"green" + ) + + hl_regression = TextHighlighter( + (data, i, j) -> begin + # Check if current value (in any column after baseline) shows regression + baseline_idx = 2 # First value column + baseline_val = data[i, baseline_idx] + + if j > 2 && baseline_val isa Number && baseline_val > 0 + current_val = data[i, j] + if current_val isa Number + speedup = baseline_val / current_val + return speedup < 0.95 # >5% regression + end + end + return false + end, + crayon"red" + ) # Convert table_data to matrix + data_matrix = permutedims(hcat([vcat(row...) 
for row in table_data]...)) + + # Print table + pretty_table( + data_matrix; + column_labels = headers, + highlighters = [hl_improvement, hl_regression], + alignment = vcat(:l, fill(:r, length(benchmarks))), + table_format = TextTableFormat(borders = text_table_borders__unicode_rounded) + ) + + println("\nLegend: Green = >5% faster, Red = >5% slower (compared to first column)") + println("All values in milliseconds (ms)") println() # Calculate aggregate statistics diff --git a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl index 2fc8fd4..bec1f42 100644 --- a/src/distributions/distributions_type.jl +++ b/src/distributions/distributions_type.jl @@ -247,12 +247,16 @@ struct Bernoulli{T <: Real} end zero(d::Bernoulli) = Bernoulli(zero(d.p)) +zero(::Type{Bernoulli{T}}) where {T} = Bernoulli(zero(T)) function agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) - Bernoulli(w1 * d1.p + w2 * d2.p) + p = w1 * d1.p + w2 * d2.p + # Clamp to [0, 1] to handle floating-point arithmetic errors + p = clamp(p, 0.0, 1.0) + Bernoulli(p) end fit(::Bernoulli, x) = Bernoulli(mean(x)) distance(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) params(d::Bernoulli) = (d.p,) eltype(d::Bernoulli) = Bool -sample(d::Bernoulli) = Bool(rand() <= d.p) +sample(d::Bernoulli) = rand() <= d.p From b71700edf7bf548ee1bf50fbc62f76df24f487a3 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 17 Oct 2025 17:09:31 +0200 Subject: [PATCH 194/266] start adding more extensible design --- src/api.jl | 4 +- src/estimator/abstractEstimator.jl | 134 +++++++++++++++++++++++++++ src/optimization/swap_categorical.jl | 8 +- src/utils/SymArray.jl | 4 +- 4 files changed, 142 insertions(+), 8 deletions(-) create mode 100644 src/estimator/abstractEstimator.jl diff --git a/src/api.jl b/src/api.jl index 7806e30..4411237 100644 --- a/src/api.jl +++ b/src/api.jl @@ -22,6 +22,7 @@ preprocessing, optimization, and returns an Assignment 
representing the estimate # Examples ```julia using NetworkHistogram, LinearAlgebra +import NetworkHistogram: nethist, GreedyParams # Binary network A = Symmetric(rand(0:1, 100, 100)) @@ -31,8 +32,7 @@ A[diagind(A)] .= 0 initial_labels = rand(1:3, 100) # Fit network histogram -params = GreedyParams() -result = nethist(A, Bernoulli(0.5), initial_labels, params) +result = nethist(A, Bernoulli(0.5), initial_labels, GreedyParams()) # Extract results block_matrix = result.θ diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl new file mode 100644 index 0000000..ada619a --- /dev/null +++ b/src/estimator/abstractEstimator.jl @@ -0,0 +1,134 @@ +abstract type SBMEstimator end + +struct SumGreedyEstimator{C, S, P} <: SBMEstimator + counts::C # counts of possible edges between groups + counts_swap::C + realized::S # sums of observed Λ between groups + realized_swap::S + max_iter::Int + stop_rule::P +end + +function init!(es::SumGreedyEstimator, data, initial_labels) + for j in axes(data, 2) + label_j = initial_labels[j] + for i in axes(data, 1) # double counting edges in undirected graphs + if !isnothing(data[i, j]) && i < j + add_counts!(es.realized[initial_labels[i], label_j], data[i, j]) + add_counts!(es.realized_swap[initial_labels[i], label_j], data[i, j]) + es.counts[initial_labels[i], label_j] += 1 + es.counts_swap[initial_labels[i], label_j] += 1 + end + end + end +end + +function estimate(es::SumGreedyEstimator, data, initial_labels) + init!(es, data, initial_labels) + loss = loss_function(es.realized, es.counts) + es.stop_rule.previous_best_value = loss + ## optim + node_labels = copy(initial_labels) + k = length(unique(node_labels)) + pbar = ProgressUnknown(enabled = true, showspeed = true, desc = "Greedy search: ") + for iter in 1:(es.max_iter) + next!(pbar) + groups = StatsBase.sample(1:k, 2; replace = false) + index1 = rand(findall(x -> x == groups[1], node_labels)) + index2 = rand(findall(x -> x == groups[2], node_labels)) + 
#index1, index2 = StatsBase.sample(1:length(node_labels), 2; replace = false) + g1 = node_labels[index1] + g2 = node_labels[index2] + if g1 == g2 + continue + end + edges_index1 = view(data, :, index1) + edges_index2 = view(data, :, index2) + for j in axes(data, 1) + if j == index1 || j == index2 + continue + end + gj = node_labels[j] + remove_counts!(es.realized_swap[g1, gj], edges_index1[j]) + es.counts_swap[g1, gj] -= 1 + add_counts!(es.realized_swap[g2, gj], edges_index1[j]) + es.counts_swap[g2, gj] += 1 + remove_counts!(es.realized_swap[g2, gj], edges_index2[j]) + es.counts_swap[g2, gj] -= 1 + add_counts!(es.realized_swap[g1, gj], edges_index2[j]) + es.counts_swap[g1, gj] += 1 + end + node_labels[index1] = g2 + node_labels[index2] = g1 + + loss_new = loss_function(es.realized_swap, es.counts_swap) + if loss_new < es.stop_rule.previous_best_value + es.stop_rule.previous_best_value = loss_new + es.stop_rule.iterations_since_best = 0 + deepcopy!(es.realized, es.realized_swap) + copy!(es.counts, es.counts_swap) + loss = loss_new + else + # revert swap + node_labels[index1] = g1 + node_labels[index2] = g2 + deepcopy!(es.realized_swap, es.realized) + copy!(es.counts_swap, es.counts) + es.stop_rule.iterations_since_best += 1 + end + + if es.stop_rule.iterations_since_best >= es.stop_rule.k + @info "Stopping criterion met at iteration $iter" + finish!(pbar) + break + end + end + return node_labels, losses +end + +function loss_function(realized, counts) + loss = 0.0 + @inbounds for j in axes(realized, 2) + for i in axes(realized, 1) + if i <= j + # θ = realized[i, j] ./ counts[i, j] + # loss += sum(xlogx, θ) * counts[i, j] + loss += sum(counts[i, j] - sum(abs2, realized[i, j]) / counts[i, j]) + end + end + end + return loss +end + +function add_counts!(parameter::AbstractArray, data_value::AbstractArray) + @inbounds parameter .+= data_value +end + +function remove_counts!(parameter::AbstractArray, data_value::AbstractArray) + @inbounds parameter .-= data_value +end + 
+function add_counts!(parameter::AbstractArray, data_value::Real) + @inbounds parameter[data_value] += 1 +end + +function remove_counts!(parameter::AbstractArray, data_value::Real) + @inbounds parameter[data_value] -= 1 +end + +## +using Makie + +function Makie.convert_arguments( + ::Type{<:AbstractPlot}, graphon::DecoratedSBM, k::Int = 1) + x = collect(0:0.01:1) + return (x, x, [_extract_param(graphon(xi, yi), k) for xi in x, yi in x]) +end + +function _extract_param(d::Distribution, k::Int) + return params(d)[k] +end + +function _extract_param(d::DiscreteNonParametric, k::Int) + return params(d)[2][k] +end diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl index c29e78d..e72ea85 100644 --- a/src/optimization/swap_categorical.jl +++ b/src/optimization/swap_categorical.jl @@ -92,8 +92,8 @@ function copy_categorical_workspace!( # Copy vector-valued SymArrays element by element # Use sparse matrix iteration instead of .d dictionary k = size(dest.realized, 1) - copy_with_array!(dest.realized, src_assignment.additional_workspace.realized) - copy_with_array!(dest.estimated, src_assignment.additional_workspace.estimated) + deepcopy!(dest.realized, src_assignment.additional_workspace.realized) + deepcopy!(dest.estimated, src_assignment.additional_workspace.estimated) # @inbounds for j in 1:k, i in 1:j # copyto!(dest.realized[i, j], src_ws.realized[i, j]) @@ -119,8 +119,8 @@ function revert_swap_workspace!(a::Assignment, ws::WorkspaceDiscreteSwap) # Copy vector-valued SymArrays element by element # Use sparse matrix iteration instead of .d dictionary k = size(ws.realized, 1) - copy_with_array!(a.additional_workspace.realized, ws.realized) - copy_with_array!(a.additional_workspace.estimated, ws.estimated) + deepcopy!(a.additional_workspace.realized, ws.realized) + deepcopy!(a.additional_workspace.estimated, ws.estimated) # @inbounds for j in 1:k, i in 1:j # copyto!(as.realized[i, j], ws.realized[i, j]) # end diff --git 
a/src/utils/SymArray.jl b/src/utils/SymArray.jl index e9922a4..e80ee6c 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -10,7 +10,7 @@ using SparseArrays using LinearAlgebra import Base: eltype, convert, size, getindex, setindex!, copy!, similar, IndexStyle, axes, length, iterate, copyto! -export SymArray, eltype, copy_with_array!, sum_tri_with_diag +export SymArray, eltype, deepcopy!, sum_tri_with_diag """ SymArray{F} <: AbstractArray{F, 2} @@ -242,7 +242,7 @@ function copy!(dest::SymArray{F}, src::SymArray{F}) where {F <: Real} return dest end -function copy_with_array!(dest::SymArray{F}, src::SymArray{F}) where {F <: AbstractArray} +function deepcopy!(dest::SymArray{F}, src::SymArray{F}) where {F <: AbstractArray} @inbounds for index in eachindex(dest) copyto!(dest[index], src[index]) end From 6b637085a1c47cab913691970b49d1c818f96f46 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 19 Oct 2025 17:31:28 +0200 Subject: [PATCH 195/266] progress towards better package --- Project.toml | 2 + src/NetworkHistogram.jl | 4 + src/estimator/SpectralEstimator.jl | 413 ++++++++++++++++++++ src/estimator/abstractEstimator.jl | 414 +++++++++++++++++---- src/optimization/config_rules/stop_rule.jl | 35 +- src/optimization/config_rules/swap_rule.jl | 22 +- src/utils/include.jl | 13 + 7 files changed, 804 insertions(+), 99 deletions(-) create mode 100644 src/estimator/SpectralEstimator.jl diff --git a/Project.toml b/Project.toml index affc7d6..20c7b7a 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ version = "0.5.2" authors = ["Charles Dufour", "Jake Grainger"] [deps] +ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -27,6 +28,7 @@ LightMCExt = "LightMC" MakieExt = "Makie" [compat] +ArgCheck = "2.5.0" Graphons = "0.1.0" LinearAlgebra = "1.12.0" diff --git a/src/NetworkHistogram.jl 
b/src/NetworkHistogram.jl index acbccf4..7b0e54d 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -6,6 +6,8 @@ import StatsAPI: loglikelihood, fit, params import Base: convert, eltype, zero using Distributions using LinearAlgebra +using ArgCheck +using Random: randperm include("utils/include.jl") using .FastSymArray @@ -15,6 +17,8 @@ include("EdgeList.jl") include("assignment.jl") include("block_model.jl") include("optimization/greedy.jl") +include("estimator/abstractEstimator.jl") +include("estimator/SpectralEstimator.jl") include("api.jl") export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf diff --git a/src/estimator/SpectralEstimator.jl b/src/estimator/SpectralEstimator.jl new file mode 100644 index 0000000..fd46d6e --- /dev/null +++ b/src/estimator/SpectralEstimator.jl @@ -0,0 +1,413 @@ +""" + SpectralEstimator{T} + +Spectral clustering estimator for Stochastic Block Models (SBM). + +This estimator uses spectral clustering to partition nodes into groups based on +the graph structure. It computes the normalized Laplacian and performs k-means +clustering on the top k eigenvectors. 
+ +# Fields +- `k::Int`: Number of groups/communities to detect +- `eig_size::Int`: Number of eigenvectors to use (default: k) +- `adjacency_type::Symbol`: Type of adjacency matrix to use (`:binary`, `:weighted`) +- `laplacian_type::Symbol`: Type of Laplacian (`:normalized`, `:unnormalized`) +- `max_kmeans_iter::Int`: Maximum iterations for k-means clustering +- `balanced::Bool`: If true, forces balanced community sizes (default: false) + +# Example +```julia +# Binary adjacency matrix with balanced communities +A = [0 1 1 0; 1 0 1 0; 1 1 0 1; 0 0 1 0] +estimator = SpectralEstimator(2, balanced=true) +labels = estimate(estimator, A) +``` +""" +struct SpectralEstimator{T <: Real} <: SBMEstimator + k::Int + eig_size::Int + adjacency_type::Symbol + laplacian_type::Symbol + max_kmeans_iter::Int + balanced::Bool + + function SpectralEstimator(k::Int; + eig_size::Int = k, + adjacency_type::Symbol = :binary, + laplacian_type::Symbol = :normalized, + max_kmeans_iter::Int = 100, + balanced::Bool = false) + @argcheck k>0 "Number of groups k must be positive" + @argcheck adjacency_type in [:binary, :weighted] "adjacency_type must be :binary or :weighted" + @argcheck laplacian_type in [:normalized, :unnormalized] "laplacian_type must be :normalized or :unnormalized" + @argcheck max_kmeans_iter>0 "max_kmeans_iter must be positive" + new{Float64}(k, eig_size, adjacency_type, laplacian_type, max_kmeans_iter, balanced) + end +end + +""" + estimate(estimator::SpectralEstimator, data; progress = true) + +Perform spectral clustering on the network data. + +# Arguments +- `estimator::SpectralEstimator`: The spectral estimator configuration +- `data`: The adjacency matrix or network data +- `progress::Bool`: Whether to show progress information (for compatibility with other estimators) + +# Returns +- `labels::Vector{Int}`: Node group assignments (1 to k) + +# Algorithm +1. Construct adjacency matrix from data +2. Compute the Laplacian matrix (normalized or unnormalized) +3. 
Compute eigenvectors corresponding to smallest eigenvalues +4. Perform k-means clustering on the eigenvectors +5. Return cluster assignments +""" +function estimate(estimator::SpectralEstimator, data; progress = true) + progress && @info "Starting spectral clustering with k=$(estimator.k)" + + # Convert data to adjacency matrix + A = construct_adjacency(data, estimator.adjacency_type) + n = size(A, 1) + + @argcheck n>=estimator.k "Number of nodes ($n) must be >= number of groups ($(estimator.k))" + + # Compute Laplacian + L = compute_laplacian(A, estimator.laplacian_type) + + # Compute eigenvectors + progress && @info "Computing eigenvectors..." + eigvals, eigvecs = compute_spectral_embedding(L, estimator.eig_size) + + # Normalize rows for normalized spectral clustering + if estimator.laplacian_type == :normalized + eigvecs = normalize_rows(eigvecs) + end + + # Perform k-means clustering + progress && @info "Performing k-means clustering..." + if estimator.balanced + labels = balanced_kmeans_clustering(eigvecs, estimator.k, estimator.max_kmeans_iter) + else + labels = kmeans_clustering(eigvecs, estimator.k, estimator.max_kmeans_iter) + end + + progress && @info "Spectral clustering complete" + return labels +end + +""" + estimate(estimator::SpectralEstimator, data, initial_labels; progress = true) + +Perform spectral clustering on the network data. The initial_labels are ignored +as spectral clustering doesn't use an initialization. + +This method signature is provided for compatibility with other estimators. +""" +function estimate(estimator::SpectralEstimator, data, initial_labels; progress = true) + return estimate(estimator, data; progress = progress) +end + +""" + construct_adjacency(data, adjacency_type::Symbol) + +Construct an adjacency matrix from the data. 
+ +# Arguments +- `data`: Network data (can be a matrix with various edge types) +- `adjacency_type`: Either `:binary` or `:weighted` + +# Returns +- Symmetric adjacency matrix +""" +function construct_adjacency(data::AbstractMatrix, adjacency_type::Symbol) + n = size(data, 1) + A = zeros(Float64, n, n) + + if adjacency_type == :binary + # Binary adjacency: edge exists if data is not nothing/zero + for i in 1:n + for j in (i + 1):n + if !isnothing(data[i, j]) && data[i, j] != 0 + A[i, j] = 1.0 + A[j, i] = 1.0 + end + end + end + elseif adjacency_type == :weighted + # Weighted adjacency: use the actual values + for i in 1:n + for j in (i + 1):n + if !isnothing(data[i, j]) + if data[i, j] isa AbstractArray + # For categorical data, use sum or count + weight = sum(data[i, j]) + else + weight = float(data[i, j]) + end + A[i, j] = weight + A[j, i] = weight + end + end + end + end + + return A +end + +""" + compute_laplacian(A::AbstractMatrix, laplacian_type::Symbol) + +Compute the graph Laplacian matrix. + +# Arguments +- `A`: Adjacency matrix +- `laplacian_type`: Either `:normalized` or `:unnormalized` + +# Returns +- Laplacian matrix +""" +function compute_laplacian(A::AbstractMatrix, laplacian_type::Symbol) + n = size(A, 1) + d = vec(sum(A, dims = 2)) # Degree vector + + if laplacian_type == :unnormalized + # L = D - A + D = Diagonal(d) + return D - A + elseif laplacian_type == :normalized + # L = I - D^{-1/2} A D^{-1/2} + # Handle zero degrees + d_inv_sqrt = zeros(n) + for i in 1:n + d_inv_sqrt[i] = d[i] > 0 ? 1.0 / sqrt(d[i]) : 0.0 + end + D_inv_sqrt = Diagonal(d_inv_sqrt) + return I - D_inv_sqrt * A * D_inv_sqrt + end +end + +""" + compute_spectral_embedding(L::AbstractMatrix, k::Int) + +Compute the spectral embedding by finding eigenvectors corresponding to +the k smallest eigenvalues of the Laplacian. 
+ +# Arguments +- `L`: Laplacian matrix +- `k`: Number of eigenvectors to compute + +# Returns +- `eigvals`: The k smallest eigenvalues +- `eigvecs`: Matrix where each row is a node and columns are eigenvector components +""" +function compute_spectral_embedding(L::AbstractMatrix, k::Int) + # Compute smallest k eigenvalues and eigenvectors + # Use eigen for small matrices, could use iterative methods for large ones + n = size(L, 1) + + if n <= 1000 + # For small matrices, compute all eigenvalues + F = eigen(Symmetric(L)) + idx = sortperm(F.values)[1:k] + return F.values[idx], F.vectors[:, idx] + else + # For larger matrices, use iterative solver (if available) + # For now, still use full eigen but this could be optimized + F = eigen(Symmetric(L)) + idx = sortperm(F.values)[1:k] + return F.values[idx], F.vectors[:, idx] + end +end + +""" + normalize_rows(X::AbstractMatrix) + +Normalize each row of the matrix to unit length. + +# Arguments +- `X`: Matrix to normalize + +# Returns +- Matrix with normalized rows +""" +function normalize_rows(X::AbstractMatrix) + n, k = size(X) + X_norm = similar(X) + + for i in 1:n + row_norm = norm(X[i, :]) + if row_norm > 0 + X_norm[i, :] = X[i, :] / row_norm + else + X_norm[i, :] = X[i, :] + end + end + + return X_norm +end + +""" + kmeans_clustering(X::AbstractMatrix, k::Int, max_iter::Int) + +Perform k-means clustering on the rows of X. 
+ +# Arguments +- `X`: Data matrix (n × d), where n is number of points, d is dimensionality +- `k`: Number of clusters +- `max_iter`: Maximum number of iterations + +# Returns +- `labels::Vector{Int}`: Cluster assignments (1 to k) +""" +function kmeans_clustering(X::AbstractMatrix, k::Int, max_iter::Int) + n, d = size(X) + + # Initialize centers by randomly selecting k rows + center_indices = randperm(n)[1:k] + centers = X[center_indices, :] + labels = zeros(Int, n) + + converged = false + for iter in 1:max_iter + # Assignment step + old_labels = copy(labels) + for i in 1:n + min_dist = Inf + best_cluster = 1 + for j in 1:k + dist = sum(abs2, X[i, :] - centers[j, :]) + if dist < min_dist + min_dist = dist + best_cluster = j + end + end + labels[i] = best_cluster + end + + # Check convergence + if labels == old_labels + converged = true + break + end + + # Update step + for j in 1:k + cluster_points = findall(labels .== j) + if !isempty(cluster_points) + centers[j, :] = vec(mean(X[cluster_points, :], dims = 1)) + end + end + end + + return labels +end + +""" + balanced_kmeans_clustering(X::AbstractMatrix, k::Int, max_iter::Int) + +Perform balanced k-means clustering on the rows of X, ensuring approximately equal-sized clusters. + +This uses a greedy assignment approach where each cluster is filled to its target size +by assigning the closest points to each cluster's center, respecting size constraints. 
+ +# Arguments +- `X`: Data matrix (n × d), where n is number of points, d is dimensionality +- `k`: Number of clusters +- `max_iter`: Maximum number of iterations + +# Returns +- `labels::Vector{Int}`: Cluster assignments (1 to k) with balanced sizes +""" +function balanced_kmeans_clustering(X::AbstractMatrix, k::Int, max_iter::Int) + n, d = size(X) + target_size = n ÷ k + remainder = n % k + + # Initialize centers by randomly selecting k rows + center_indices = randperm(n)[1:k] + centers = X[center_indices, :] + labels = zeros(Int, n) + + for iter in 1:max_iter + old_labels = copy(labels) + + # Compute all distances + distances = zeros(n, k) + for i in 1:n + for j in 1:k + distances[i, j] = sum(abs2, X[i, :] - centers[j, :]) + end + end + + # Balanced assignment using greedy approach + labels = balanced_assignment(distances, k, target_size, remainder) + + # Check convergence + if labels == old_labels + break + end + + # Update centers + for j in 1:k + cluster_points = findall(labels .== j) + if !isempty(cluster_points) + centers[j, :] = vec(mean(X[cluster_points, :], dims = 1)) + end + end + end + + return labels +end + +""" + balanced_assignment(distances::Matrix, k::Int, target_size::Int, remainder::Int) + +Assign points to clusters in a balanced way using a greedy approach. + +Each cluster gets exactly `target_size` or `target_size + 1` points (depending on remainder). 
+ +# Arguments +- `distances`: Matrix of distances from each point to each cluster center (n × k) +- `k`: Number of clusters +- `target_size`: Base number of points per cluster +- `remainder`: Number of clusters that get one extra point + +# Returns +- `labels::Vector{Int}`: Balanced cluster assignments +""" +function balanced_assignment(distances::Matrix, k::Int, target_size::Int, remainder::Int) + n = size(distances, 1) + labels = zeros(Int, n) + cluster_sizes = zeros(Int, k) + max_sizes = fill(target_size, k) + max_sizes[1:remainder] .+= 1 + + # Create a list of (distance, point_idx, cluster_idx) tuples + assignments = [] + for i in 1:n + for j in 1:k + push!(assignments, (distances[i, j], i, j)) + end + end + + # Sort by distance (greedy: assign closest points first) + sort!(assignments, by = x -> x[1]) + + # Assign points greedily while respecting size constraints + assigned = falses(n) + for (dist, i, j) in assignments + if !assigned[i] && cluster_sizes[j] < max_sizes[j] + labels[i] = j + cluster_sizes[j] += 1 + assigned[i] = true + end + if all(assigned) + break + end + end + + return labels +end diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index ada619a..440e4b5 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -1,134 +1,388 @@ +""" + SBMEstimator + +Abstract base type for all Stochastic Block Model (SBM) estimators. + +All concrete estimator types should implement: +- `estimate(estimator, data, initial_labels; progress=true)`: Main estimation function +- `score(estimator)`: Return current objective value (if applicable) +""" abstract type SBMEstimator end -struct SumGreedyEstimator{C, S, P} <: SBMEstimator - counts::C # counts of possible edges between groups +""" + SumGreedyEstimator{C, S, NodeR, StopR} + +Greedy optimization estimator for Stochastic Block Models using sum-of-squares loss. 
+ +This estimator uses a greedy node-swapping algorithm to minimize the loss function: + L = (1/n_edges) * Σᵢⱼ [count(i,j) - ||realized(i,j)||²/count(i,j)] + +The algorithm iteratively swaps nodes between groups to improve the block model fit. + +# Type Parameters +- `C`: Type for count matrices (usually symmetric array of integers) +- `S`: Type for realized value matrices (usually symmetric array of vectors) +- `NodeR <: NodeSwapRule`: Rule for selecting which nodes to swap +- `StopR <: StopRule`: Rule for determining when to stop optimization + +# Fields +- `counts::C`: Number of possible edges between each pair of groups +- `counts_swap::C`: Working copy of counts for swap evaluation +- `realized::S`: Sum of observed edge values between each pair of groups +- `realized_swap::S`: Working copy of realized values for swap evaluation +- `max_iter::Int`: Maximum number of iterations +- `node_swap_rule::NodeR`: Strategy for selecting nodes to swap +- `stop_rule::StopR`: Criterion for early stopping + +# Example +```julia +k = 5 # number of groups +counts = SymArray(k, 0) +counts_swap = SymArray(k, 0) +realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) +realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) + +estimator = SumGreedyEstimator( + counts, counts_swap, realized, realized_swap, + max_iter=100_000, + node_swap_rule=RandomGroupSwap(), + stop_rule=PreviousBestValue(1000, Inf, :min) +) + +labels = estimate(estimator, data, initial_labels) +``` +""" +struct SumGreedyEstimator{C, S, NodeR <: NodeSwapRule, StopR <: StopRule} <: SBMEstimator + counts::C counts_swap::C - realized::S # sums of observed Λ between groups + realized::S realized_swap::S max_iter::Int - stop_rule::P + node_swap_rule::NodeR + stop_rule::StopR end -function init!(es::SumGreedyEstimator, data, initial_labels) +""" + score(estimator::SumGreedyEstimator) + +Compute the current objective value (loss) for the estimator. 
+ +Lower values indicate better fit to a block model structure. +""" +function score(estimator::SumGreedyEstimator) + return loss_function(estimator.realized, estimator.counts) +end + +""" + init!(estimator::SumGreedyEstimator, data, initial_labels) + +Initialize the estimator's count and realized value matrices from data. + +Iterates through the upper triangle of the adjacency matrix (i < j) to avoid +double-counting edges in undirected graphs. Updates both the main and swap +workspace matrices. + +# Arguments +- `estimator::SumGreedyEstimator`: The estimator to initialize +- `data::AbstractMatrix`: Network adjacency matrix +- `initial_labels::Vector{Int}`: Initial group assignments for nodes +""" +function init!(estimator::SumGreedyEstimator, data, initial_labels) + # Iterate over upper triangle to avoid double-counting edges for j in axes(data, 2) label_j = initial_labels[j] - for i in axes(data, 1) # double counting edges in undirected graphs + for i in axes(data, 1) + # Only process upper triangle (i < j) for undirected graphs if !isnothing(data[i, j]) && i < j - add_counts!(es.realized[initial_labels[i], label_j], data[i, j]) - add_counts!(es.realized_swap[initial_labels[i], label_j], data[i, j]) - es.counts[initial_labels[i], label_j] += 1 - es.counts_swap[initial_labels[i], label_j] += 1 + label_i = initial_labels[i] + edge_value = data[i, j] + + # Update both main and swap workspaces + add_counts!(estimator.realized[label_i, label_j], edge_value) + add_counts!(estimator.realized_swap[label_i, label_j], edge_value) + estimator.counts[label_i, label_j] += 1 + estimator.counts_swap[label_i, label_j] += 1 end end end end -function estimate(es::SumGreedyEstimator, data, initial_labels) - init!(es, data, initial_labels) - loss = loss_function(es.realized, es.counts) - es.stop_rule.previous_best_value = loss - ## optim +""" + estimate(estimator::SumGreedyEstimator, data, initial_labels; progress=true) + +Estimate node group assignments using greedy optimization 
with node swapping. + +# Algorithm +The algorithm proceeds as follows: +1. Initialize count and realized value matrices from data and initial labels +2. For each iteration: + a. Select two nodes to swap according to the swap rule + b. Tentatively swap them and update statistics + c. Accept swap if it improves the loss, otherwise revert + d. Check stopping criterion +3. Return final node labels + +# Arguments +- `estimator::SumGreedyEstimator`: The estimator with configuration +- `data::AbstractMatrix`: Network adjacency matrix (n × n) +- `initial_labels::Vector{Int}`: Initial group assignments (length n) +- `progress::Bool`: Whether to show progress bar (default: true) + +# Returns +- `node_labels::Vector{Int}`: Optimized group assignments for each node + +# Performance Notes +- Uses views to avoid allocating temporary arrays +- Swap workspace allows O(n) evaluation of swap quality +- Early stopping can significantly reduce computation time +""" +function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress = true) + # Initialize counts and realized values from data + init!(estimator, data, initial_labels) + initialise_stop_rule!(estimator.stop_rule, estimator) + + # Compute initial loss + current_loss = score(estimator) + + # Start with initial labeling node_labels = copy(initial_labels) - k = length(unique(node_labels)) - pbar = ProgressUnknown(enabled = true, showspeed = true, desc = "Greedy search: ") - for iter in 1:(es.max_iter) - next!(pbar) - groups = StatsBase.sample(1:k, 2; replace = false) - index1 = rand(findall(x -> x == groups[1], node_labels)) - index2 = rand(findall(x -> x == groups[2], node_labels)) - #index1, index2 = StatsBase.sample(1:length(node_labels), 2; replace = false) - g1 = node_labels[index1] - g2 = node_labels[index2] - if g1 == g2 - continue - end - edges_index1 = view(data, :, index1) - edges_index2 = view(data, :, index2) - for j in axes(data, 1) - if j == index1 || j == index2 - continue + n_groups = 
length(unique(node_labels)) + + # Progress tracking + pbar = ProgressUnknown( + enabled = progress, + showspeed = true, + desc = "Greedy search: " + ) + + # Main optimization loop + for iter in 1:(estimator.max_iter) + # Select two nodes to potentially swap + index1, index2 = select_indices_swap(node_labels, estimator.node_swap_rule) + + group1 = node_labels[index1] + group2 = node_labels[index2] + + # Only process if nodes are in different groups + if group1 != group2 + # Get edge lists for both nodes (views for performance) + edges_node1 = view(data, :, index1) + edges_node2 = view(data, :, index2) + + # Update swap workspace to reflect the proposed swap + for j in axes(data, 1) + # Skip the swapped nodes themselves + if j == index1 || j == index2 + continue + end + + group_j = node_labels[j] + + # Update for node1: remove from group1, add to group2 + remove_counts!(estimator.realized_swap[group1, group_j], edges_node1[j]) + estimator.counts_swap[group1, group_j] -= 1 + add_counts!(estimator.realized_swap[group2, group_j], edges_node1[j]) + estimator.counts_swap[group2, group_j] += 1 + + # Update for node2: remove from group2, add to group1 + remove_counts!(estimator.realized_swap[group2, group_j], edges_node2[j]) + estimator.counts_swap[group2, group_j] -= 1 + add_counts!(estimator.realized_swap[group1, group_j], edges_node2[j]) + estimator.counts_swap[group1, group_j] += 1 + end + + # Tentatively apply swap + node_labels[index1] = group2 + node_labels[index2] = group1 + + # Compute new loss + new_loss = loss_function(estimator.realized_swap, estimator.counts_swap) + + # Accept or reject swap + if new_loss < current_loss + # Accept: commit swap to main workspace + deepcopy!(estimator.realized, estimator.realized_swap) + copy!(estimator.counts, estimator.counts_swap) + current_loss = new_loss + else + # Reject: revert labels and workspace + node_labels[index1] = group1 + node_labels[index2] = group2 + deepcopy!(estimator.realized_swap, estimator.realized) + 
copy!(estimator.counts_swap, estimator.counts) end - gj = node_labels[j] - remove_counts!(es.realized_swap[g1, gj], edges_index1[j]) - es.counts_swap[g1, gj] -= 1 - add_counts!(es.realized_swap[g2, gj], edges_index1[j]) - es.counts_swap[g2, gj] += 1 - remove_counts!(es.realized_swap[g2, gj], edges_index2[j]) - es.counts_swap[g2, gj] -= 1 - add_counts!(es.realized_swap[g1, gj], edges_index2[j]) - es.counts_swap[g1, gj] += 1 - end - node_labels[index1] = g2 - node_labels[index2] = g1 - - loss_new = loss_function(es.realized_swap, es.counts_swap) - if loss_new < es.stop_rule.previous_best_value - es.stop_rule.previous_best_value = loss_new - es.stop_rule.iterations_since_best = 0 - deepcopy!(es.realized, es.realized_swap) - copy!(es.counts, es.counts_swap) - loss = loss_new - else - # revert swap - node_labels[index1] = g1 - node_labels[index2] = g2 - deepcopy!(es.realized_swap, es.realized) - copy!(es.counts_swap, es.counts) - es.stop_rule.iterations_since_best += 1 end - - if es.stop_rule.iterations_since_best >= es.stop_rule.k + + # Update progress bar + next!(pbar; showvalues = [ + ("loss", current_loss), + info_to_print(estimator.stop_rule) + ]) + + # Check stopping criterion + if stopping_rule(current_loss, estimator.stop_rule) @info "Stopping criterion met at iteration $iter" finish!(pbar) break end end - return node_labels, losses + + return node_labels end +""" + loss_function(realized, counts) + +Compute the normalized sum-of-squares loss for block model fitting. + +The loss measures how well a block model fits the data by computing: + L = (1/N) * Σᵢⱼ [count(i,j) - ||realized(i,j)||²/count(i,j)] + +where the sum is over the upper triangle (i ≤ j) to avoid double-counting. 
+ +# Mathematical Interpretation +For each pair of groups (i,j): +- `count(i,j)` is the number of edges between groups i and j +- `realized(i,j)` is a vector of observed edge values +- The term `||realized(i,j)||²/count(i,j)` measures concentration of values +- Lower loss indicates better block structure (more homogeneous within blocks) + +# Arguments +- `realized`: Symmetric array of realized edge value sums between groups +- `counts`: Symmetric array of edge counts between groups + +# Returns +- Normalized loss value (lower is better) + +# Performance +Uses @inbounds for speed. Assumes symmetric structure. +""" function loss_function(realized, counts) - loss = 0.0 + total_loss = 0.0 + total_edges = 0.0 + + # Iterate over upper triangle to avoid double-counting @inbounds for j in axes(realized, 2) for i in axes(realized, 1) if i <= j - # θ = realized[i, j] ./ counts[i, j] - # loss += sum(xlogx, θ) * counts[i, j] - loss += sum(counts[i, j] - sum(abs2, realized[i, j]) / counts[i, j]) + n_edges = counts[i, j] + if n_edges > 0 + # Compute sum of squares of realized values + sum_squares = sum(abs2, realized[i, j]) + # Add variance-like term to loss + total_loss += n_edges - sum_squares / n_edges + total_edges += n_edges + end end end end - return loss + + return total_edges > 0 ? total_loss / total_edges : 0.0 end +# ============================================================================ +# Count manipulation helpers +# ============================================================================ + +""" + add_counts!(parameter::AbstractArray, data_value::AbstractArray) + +Add array data value to parameter array (for categorical edge values). +""" function add_counts!(parameter::AbstractArray, data_value::AbstractArray) @inbounds parameter .+= data_value end +""" + remove_counts!(parameter::AbstractArray, data_value::AbstractArray) + +Remove array data value from parameter array (for categorical edge values). 
+""" function remove_counts!(parameter::AbstractArray, data_value::AbstractArray) @inbounds parameter .-= data_value end +""" + add_counts!(parameter::AbstractArray, data_value::Real) + +Increment the count for a specific category (for categorical edge values). +""" function add_counts!(parameter::AbstractArray, data_value::Real) @inbounds parameter[data_value] += 1 end +""" + remove_counts!(parameter::AbstractArray, data_value::Real) + +Decrement the count for a specific category (for categorical edge values). +""" function remove_counts!(parameter::AbstractArray, data_value::Real) @inbounds parameter[data_value] -= 1 end -## -using Makie +# ============================================================================ +# Data preparation utilities +# ============================================================================ -function Makie.convert_arguments( - ::Type{<:AbstractPlot}, graphon::DecoratedSBM, k::Int = 1) - x = collect(0:0.01:1) - return (x, x, [_extract_param(graphon(xi, yi), k) for xi in x, yi in x]) -end +""" + prepare_data_cat(A::AbstractMatrix{<:Real}, k; m=length(unique(A)), has_zero=zero(eltype(A)) in A) -function _extract_param(d::Distribution, k::Int) - return params(d)[k] -end +Prepare categorical network data for SumGreedyEstimator. + +Creates the necessary data structures (count matrices and realized value tensors) +for estimating a categorical Stochastic Block Model with k groups. 
-function _extract_param(d::DiscreteNonParametric, k::Int) - return params(d)[2][k] +# Arguments +- `A::AbstractMatrix{<:Real}`: Adjacency matrix with categorical edge values +- `k::Int`: Number of groups to partition nodes into +- `m::Int`: Number of edge categories (default: inferred from unique values in A) +- `has_zero::Bool`: Whether the data contains zero values (default: auto-detected) + +# Returns +A tuple containing: +- `data`: Preprocessed adjacency matrix (shifted if zero-indexed) +- `counts`: Symmetric k×k array for edge counts (initialized to 0) +- `counts_swap`: Workspace copy of counts for swap evaluation +- `realized`: Symmetric k×k array of m-dimensional count vectors (initialized to 0) +- `realized_swap`: Workspace copy of realized for swap evaluation + +# Example +```julia +# Network with 3 edge types (0, 1, 2) for no edge, layer 1, layer 2 +A = rand(0:2, 100, 100) +A = (A + A') .÷ 2 # Make symmetric + +data, counts, counts_swap, realized, realized_swap = prepare_data_cat(A, k=5) +``` + +# Notes +- If data contains zeros, they are shifted to 1-indexing for categorical representation +- The realized arrays use StaticArrays.MVector for performance +- The symmetric array structure avoids redundant storage +""" +function prepare_data_cat( + A::AbstractMatrix{<:Real}, + k::Int; + m::Int = length(unique(A)), + has_zero::Bool = zero(eltype(A)) in A + ) + @info "Preparing data for categorical SBM with $m categories and $k groups." + + # Adjust data if zero-indexed (shift to 1-indexing for Julia) + if has_zero + @info "Data contains zero values, using 1-based indexing." 
+ data = A .+ 1 + else + data = A + end + + # Initialize count matrices + counts = SymArray(k, 0) + counts_swap = SymArray(k, 0) + + # Initialize realized value tensors (k×k matrices of m-dimensional vectors) + realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) + realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) + + return data, counts, counts_swap, realized, realized_swap end diff --git a/src/optimization/config_rules/stop_rule.jl b/src/optimization/config_rules/stop_rule.jl index 8f8ec0b..dbaab9a 100644 --- a/src/optimization/config_rules/stop_rule.jl +++ b/src/optimization/config_rules/stop_rule.jl @@ -7,27 +7,37 @@ end function initialise_stop_rule!(stop_rule::StopRule, a, g) end -# default score is the log likelihood function score(a::Assignment) - return loglikelihood(a) #/ binomial(number_nodes(a), 2) + return loglikelihood(a) end -mutable struct PreviousBestValue{T} <: StopRule +mutable struct PreviousBestValue{T, S} <: StopRule k::Int previous_best_value::T iterations_since_best::Int - function PreviousBestValue(k::Int, x::T = -Inf) where {T <: Real} - @assert k > 0 - # queue stores the best values and at most k subsequent values - new{T}(k, x, 0) - end end +function PreviousBestValue(k::Int, x::T = -Inf, best = :max) where {T <: Real} + @argcheck k > 0 + PreviousBestValue{T, Val(best)}(k, x, 0) +end + +const PreviousMaxValue{T} = PreviousBestValue{T, Val(:max)} +const PreviousMinValue{T} = PreviousBestValue{T, Val(:min)} + function initialise_stop_rule!(stop_rule::PreviousBestValue, a) score_value = score(a) stop_rule.previous_best_value = score_value end +function compare_to_best(current, past, ::PreviousMaxValue) + return current > past +end + +function compare_to_best(current, past, ::PreviousMinValue) + return current < past +end + """ stopping_rule(assignment::Assignment,g, stop_rule::StopRule) @@ -39,10 +49,9 @@ Returns a Bool with true if we should stop the optimization based on the `stop_r """ stopping_rule -function 
stopping_rule(assignment::Assignment, stop_rule::PreviousBestValue) - score_value = score(assignment) - if score_value > stop_rule.previous_best_value - stop_rule.previous_best_value = score_value +function stopping_rule(loss::T, stop_rule::PreviousBestValue{T}) where {T <: Real} + if compare_to_best(loss, stop_rule.previous_best_value, stop_rule) + stop_rule.previous_best_value = loss stop_rule.iterations_since_best = 0 else stop_rule.iterations_since_best += 1 @@ -50,6 +59,8 @@ function stopping_rule(assignment::Assignment, stop_rule::PreviousBestValue) return stop_rule.iterations_since_best >= stop_rule.k end +stopping_rule(a, stop_rule::StopRule) = stopping_rule(score(a), stop_rule) + function info_to_print(stop_rule::PreviousBestValue) ("stalled iter: ", stop_rule.iterations_since_best) end diff --git a/src/optimization/config_rules/swap_rule.jl b/src/optimization/config_rules/swap_rule.jl index 19c3dca..fb25f72 100644 --- a/src/optimization/config_rules/swap_rule.jl +++ b/src/optimization/config_rules/swap_rule.jl @@ -14,14 +14,22 @@ current assignment `node_assignment`. 
""" select_swap -function select_indices_swap(assignment::Assignment, ::RandomNodeSwap) - return Tuple(StatsBase.sample(1:number_nodes(assignment), 2; replace = false)) +function select_indices_swap(node_labels::AbstractVector{Int}, ::RandomNodeSwap) + return Tuple(StatsBase.sample(1:length(node_labels), 2; replace = false)) end -function select_indices_swap(assignment::Assignment, ::RandomGroupSwap) - groups = StatsBase.sample( - 1:number_groups(assignment), 2; replace = false) - index1 = rand(findall(x -> x == groups[1], assignment.node_labels)) - index2 = rand(findall(x -> x == groups[2], assignment.node_labels)) +function select_indices_swap(node_labels::AbstractVector{Int}, ::RandomGroupSwap, + k::Int = length(unique(node_labels))) + groups = StatsBase.sample(1:k, 2; replace = false) + index1 = rand(findall(x -> x == groups[1], node_labels)) + index2 = rand(findall(x -> x == groups[2], node_labels)) return index1, index2 end + +function select_indices_swap(a::Assignment, rule::NodeSwapRule) + select_indices_swap(a.node_labels, rule) +end + +function select_indices_swap(assignment::Assignment, rule::RandomGroupSwap) + return select_indices_swap(assignment.node_labels, rule, number_groups(assignment)) +end diff --git a/src/utils/include.jl b/src/utils/include.jl index 2e9c8ad..94fb077 100644 --- a/src/utils/include.jl +++ b/src/utils/include.jl @@ -1 +1,14 @@ include("SymArray.jl") + +function ordered_start_labels(n::Int, k::Int) + labels = Vector{Int}(undef, n) + base_size = n ÷ k + remainder = n % k + for group in 1:k + fill!(view(labels, ((group - 1) * base_size + 1):(group * base_size)), group) + end + if remainder > 0 + fill!(view(labels, (k * base_size + 1):(k * base_size + remainder)), k) + end + return labels +end From 93b3c44362c3121e6d811414f6acc289e14c9e4a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 19 Oct 2025 17:47:36 +0200 Subject: [PATCH 196/266] typo --- src/estimator/abstractEstimator.jl | 79 ++++++++++++++---------------- 1 file changed, 
38 insertions(+), 41 deletions(-) diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index 440e4b5..02a3b45 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -96,7 +96,7 @@ function init!(estimator::SumGreedyEstimator, data, initial_labels) if !isnothing(data[i, j]) && i < j label_i = initial_labels[i] edge_value = data[i, j] - + # Update both main and swap workspaces add_counts!(estimator.realized[label_i, label_j], edge_value) add_counts!(estimator.realized_swap[label_i, label_j], edge_value) @@ -130,74 +130,68 @@ The algorithm proceeds as follows: # Returns - `node_labels::Vector{Int}`: Optimized group assignments for each node - -# Performance Notes -- Uses views to avoid allocating temporary arrays -- Swap workspace allows O(n) evaluation of swap quality -- Early stopping can significantly reduce computation time """ function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress = true) # Initialize counts and realized values from data init!(estimator, data, initial_labels) initialise_stop_rule!(estimator.stop_rule, estimator) - + # Compute initial loss current_loss = score(estimator) - + # Start with initial labeling node_labels = copy(initial_labels) - n_groups = length(unique(node_labels)) - + # Progress tracking pbar = ProgressUnknown( - enabled = progress, - showspeed = true, + enabled = progress, + showspeed = true, desc = "Greedy search: " ) - + # Main optimization loop for iter in 1:(estimator.max_iter) # Select two nodes to potentially swap index1, index2 = select_indices_swap(node_labels, estimator.node_swap_rule) - + group1 = node_labels[index1] group2 = node_labels[index2] - + # Only process if nodes are in different groups if group1 != group2 # Get edge lists for both nodes (views for performance) edges_node1 = view(data, :, index1) edges_node2 = view(data, :, index2) - + # Update swap workspace to reflect the proposed swap for j in axes(data, 1) # Skip 
the swapped nodes themselves if j == index1 || j == index2 continue end - + group_j = node_labels[j] - + # Update for node1: remove from group1, add to group2 remove_counts!(estimator.realized_swap[group1, group_j], edges_node1[j]) estimator.counts_swap[group1, group_j] -= 1 add_counts!(estimator.realized_swap[group2, group_j], edges_node1[j]) estimator.counts_swap[group2, group_j] += 1 - + # Update for node2: remove from group2, add to group1 remove_counts!(estimator.realized_swap[group2, group_j], edges_node2[j]) estimator.counts_swap[group2, group_j] -= 1 add_counts!(estimator.realized_swap[group1, group_j], edges_node2[j]) estimator.counts_swap[group1, group_j] += 1 end - + # Tentatively apply swap node_labels[index1] = group2 node_labels[index2] = group1 - + # Compute new loss new_loss = loss_function(estimator.realized_swap, estimator.counts_swap) - + # Accept or reject swap if new_loss < current_loss # Accept: commit swap to main workspace @@ -212,13 +206,14 @@ function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress copy!(estimator.counts_swap, estimator.counts) end end - + # Update progress bar - next!(pbar; showvalues = [ - ("loss", current_loss), - info_to_print(estimator.stop_rule) - ]) - + next!( + pbar; showvalues = [ + ("loss", current_loss), + info_to_print(estimator.stop_rule) + ]) + # Check stopping criterion if stopping_rule(current_loss, estimator.stop_rule) @info "Stopping criterion met at iteration $iter" @@ -226,7 +221,7 @@ function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress break end end - + return node_labels end @@ -260,7 +255,7 @@ Uses @inbounds for speed. Assumes symmetric structure. 
function loss_function(realized, counts) total_loss = 0.0 total_edges = 0.0 - + # Iterate over upper triangle to avoid double-counting @inbounds for j in axes(realized, 2) for i in axes(realized, 1) @@ -273,10 +268,12 @@ function loss_function(realized, counts) total_loss += n_edges - sum_squares / n_edges total_edges += n_edges end + else + break end end end - + return total_edges > 0 ? total_loss / total_edges : 0.0 end @@ -361,28 +358,28 @@ data, counts, counts_swap, realized, realized_swap = prepare_data_cat(A, k=5) - The symmetric array structure avoids redundant storage """ function prepare_data_cat( - A::AbstractMatrix{<:Real}, - k::Int; - m::Int = length(unique(A)), + A::AbstractMatrix{<:Real}, + k::Int; + m::Int = length(unique(A)), has_zero::Bool = zero(eltype(A)) in A - ) - @info "Preparing data for categorical SBM with $m categories and $k groups." - +) + @debug "Preparing data for categorical SBM with $m categories and $k groups." + # Adjust data if zero-indexed (shift to 1-indexing for Julia) if has_zero - @info "Data contains zero values, using 1-based indexing." + @debug "Data contains zero values, using 1-based indexing." 
data = A .+ 1 else data = A end - + # Initialize count matrices counts = SymArray(k, 0) counts_swap = SymArray(k, 0) - + # Initialize realized value tensors (k×k matrices of m-dimensional vectors) realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) - + return data, counts, counts_swap, realized, realized_swap end From ffa17ab91c9885038c03e441e221e3fd581cb887 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 19 Oct 2025 17:56:33 +0200 Subject: [PATCH 197/266] minor performance improvements --- src/estimator/abstractEstimator.jl | 76 ++++++++++++++++-------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index 02a3b45..b79728b 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -89,13 +89,12 @@ workspace matrices. """ function init!(estimator::SumGreedyEstimator, data, initial_labels) # Iterate over upper triangle to avoid double-counting edges - for j in axes(data, 2) + @inbounds for j in axes(data, 2) label_j = initial_labels[j] - for i in axes(data, 1) - # Only process upper triangle (i < j) for undirected graphs - if !isnothing(data[i, j]) && i < j + for i in 1:(j - 1) # More efficient than i < j check inside loop + edge_value = data[i, j] + if !isnothing(edge_value) label_i = initial_labels[i] - edge_value = data[i, j] # Update both main and swap workspaces add_counts!(estimator.realized[label_i, label_j], edge_value) @@ -149,6 +148,9 @@ function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress desc = "Greedy search: " ) + # Update progress bar only every N iterations to reduce overhead + progress_update_interval = max(1, estimator.max_iter ÷ 1000) + # Main optimization loop for iter in 1:(estimator.max_iter) # Select two nodes to potentially swap @@ -159,29 +161,28 @@ function estimate(estimator::SumGreedyEstimator, data, 
initial_labels; progress # Only process if nodes are in different groups if group1 != group2 - # Get edge lists for both nodes (views for performance) - edges_node1 = view(data, :, index1) - edges_node2 = view(data, :, index2) - # Update swap workspace to reflect the proposed swap - for j in axes(data, 1) + # Using @inbounds for performance - loop bounds are guaranteed safe + @inbounds for j in axes(data, 1) # Skip the swapped nodes themselves if j == index1 || j == index2 continue end group_j = node_labels[j] + edge_val_1 = data[j, index1] + edge_val_2 = data[j, index2] # Update for node1: remove from group1, add to group2 - remove_counts!(estimator.realized_swap[group1, group_j], edges_node1[j]) + remove_counts!(estimator.realized_swap[group1, group_j], edge_val_1) estimator.counts_swap[group1, group_j] -= 1 - add_counts!(estimator.realized_swap[group2, group_j], edges_node1[j]) + add_counts!(estimator.realized_swap[group2, group_j], edge_val_1) estimator.counts_swap[group2, group_j] += 1 # Update for node2: remove from group2, add to group1 - remove_counts!(estimator.realized_swap[group2, group_j], edges_node2[j]) + remove_counts!(estimator.realized_swap[group2, group_j], edge_val_2) estimator.counts_swap[group2, group_j] -= 1 - add_counts!(estimator.realized_swap[group1, group_j], edges_node2[j]) + add_counts!(estimator.realized_swap[group1, group_j], edge_val_2) estimator.counts_swap[group1, group_j] += 1 end @@ -208,11 +209,16 @@ function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress end # Update progress bar - next!( - pbar; showvalues = [ - ("loss", current_loss), - info_to_print(estimator.stop_rule) - ]) + + # Update progress bar only periodically to reduce overhead + if progress && (iter % progress_update_interval == 0 || iter == estimator.max_iter) + update!( + pbar, iter; + showvalues = [ + ("loss", current_loss), + info_to_print(estimator.stop_rule) + ]) + end # Check stopping criterion if stopping_rule(current_loss, 
estimator.stop_rule) @@ -252,24 +258,22 @@ For each pair of groups (i,j): # Performance Uses @inbounds for speed. Assumes symmetric structure. """ -function loss_function(realized, counts) +@inline function loss_function(realized, counts) total_loss = 0.0 total_edges = 0.0 # Iterate over upper triangle to avoid double-counting + # Reorder loops for better cache locality (j outer, i inner) @inbounds for j in axes(realized, 2) - for i in axes(realized, 1) - if i <= j - n_edges = counts[i, j] - if n_edges > 0 - # Compute sum of squares of realized values - sum_squares = sum(abs2, realized[i, j]) - # Add variance-like term to loss - total_loss += n_edges - sum_squares / n_edges - total_edges += n_edges - end - else - break + for i in 1:j # More efficient iteration pattern + n_edges = counts[i, j] + # Combine conditions to reduce branching + if n_edges > 0 + # Compute sum of squares of realized values inline + sum_squares = sum(abs2, realized[i, j]) + # Add variance-like term to loss + total_loss += n_edges - sum_squares / n_edges + total_edges += n_edges end end end @@ -286,7 +290,7 @@ end Add array data value to parameter array (for categorical edge values). """ -function add_counts!(parameter::AbstractArray, data_value::AbstractArray) +@inline function add_counts!(parameter::AbstractArray, data_value::AbstractArray) @inbounds parameter .+= data_value end @@ -295,7 +299,7 @@ end Remove array data value from parameter array (for categorical edge values). """ -function remove_counts!(parameter::AbstractArray, data_value::AbstractArray) +@inline function remove_counts!(parameter::AbstractArray, data_value::AbstractArray) @inbounds parameter .-= data_value end @@ -304,7 +308,7 @@ end Increment the count for a specific category (for categorical edge values). 
""" -function add_counts!(parameter::AbstractArray, data_value::Real) +@inline function add_counts!(parameter::AbstractArray, data_value::Real) @inbounds parameter[data_value] += 1 end @@ -313,7 +317,7 @@ end Decrement the count for a specific category (for categorical edge values). """ -function remove_counts!(parameter::AbstractArray, data_value::Real) +@inline function remove_counts!(parameter::AbstractArray, data_value::Real) @inbounds parameter[data_value] -= 1 end From 674fb3c5d8c8b7f9b4e4bb2bad773935b33fc2c2 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 19 Oct 2025 18:44:50 +0200 Subject: [PATCH 198/266] generalise add counts and realised for funkier use cases (e.g. MC) --- src/estimator/abstractEstimator.jl | 52 ++++++++++++++++++------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index b79728b..1e472ac 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -97,10 +97,10 @@ function init!(estimator::SumGreedyEstimator, data, initial_labels) label_i = initial_labels[i] # Update both main and swap workspaces - add_counts!(estimator.realized[label_i, label_j], edge_value) - add_counts!(estimator.realized_swap[label_i, label_j], edge_value) - estimator.counts[label_i, label_j] += 1 - estimator.counts_swap[label_i, label_j] += 1 + add_realized(estimator.realized[label_i, label_j], edge_value) + add_realized(estimator.realized_swap[label_i, label_j], edge_value) + add_counts!(estimator.counts, edge_value, label_i, label_j) + add_counts!(estimator.counts_swap, edge_value, label_i, label_j) end end end @@ -174,16 +174,16 @@ function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress edge_val_2 = data[j, index2] # Update for node1: remove from group1, add to group2 - remove_counts!(estimator.realized_swap[group1, group_j], edge_val_1) - estimator.counts_swap[group1, group_j] -= 1 - 
add_counts!(estimator.realized_swap[group2, group_j], edge_val_1) - estimator.counts_swap[group2, group_j] += 1 + remove_realized(estimator.realized_swap[group1, group_j], edge_val_1) + remove_counts!(estimator.counts_swap, edge_val_1, group1, group_j) + add_realized(estimator.realized_swap[group2, group_j], edge_val_1) + add_counts!(estimator.counts_swap, edge_val_1, group2, group_j) # Update for node2: remove from group2, add to group1 - remove_counts!(estimator.realized_swap[group2, group_j], edge_val_2) - estimator.counts_swap[group2, group_j] -= 1 - add_counts!(estimator.realized_swap[group1, group_j], edge_val_2) - estimator.counts_swap[group1, group_j] += 1 + remove_realized(estimator.realized_swap[group2, group_j], edge_val_2) + remove_counts!(estimator.counts_swap, edge_val_2, group2, group_j) + add_realized(estimator.realized_swap[group1, group_j], edge_val_2) + add_counts!(estimator.counts_swap, edge_val_2, group1, group_j) end # Tentatively apply swap @@ -286,41 +286,53 @@ end # ============================================================================ """ - add_counts!(parameter::AbstractArray, data_value::AbstractArray) + add_realized(parameter::AbstractArray, data_value::AbstractArray) Add array data value to parameter array (for categorical edge values). """ -@inline function add_counts!(parameter::AbstractArray, data_value::AbstractArray) +@inline function add_realized(parameter::AbstractArray, data_value::AbstractArray) @inbounds parameter .+= data_value end """ - remove_counts!(parameter::AbstractArray, data_value::AbstractArray) + remove_realized(parameter::AbstractArray, data_value::AbstractArray) Remove array data value from parameter array (for categorical edge values). 
""" -@inline function remove_counts!(parameter::AbstractArray, data_value::AbstractArray) +@inline function remove_realized(parameter::AbstractArray, data_value::AbstractArray) @inbounds parameter .-= data_value end """ - add_counts!(parameter::AbstractArray, data_value::Real) + add_realized(parameter::AbstractArray, data_value::Real) Increment the count for a specific category (for categorical edge values). """ -@inline function add_counts!(parameter::AbstractArray, data_value::Real) +@inline function add_realized(parameter::AbstractArray, data_value::Real) @inbounds parameter[data_value] += 1 end """ - remove_counts!(parameter::AbstractArray, data_value::Real) + remove_realized(parameter::AbstractArray, data_value::Real) Decrement the count for a specific category (for categorical edge values). """ -@inline function remove_counts!(parameter::AbstractArray, data_value::Real) +@inline function remove_realized(parameter::AbstractArray, data_value::Real) @inbounds parameter[data_value] -= 1 end +@inline function add_counts!( + counts::AbstractArray{T}, data_value::Real, group_i::Int, group_j::Int) where {T <: + Real} + @inbounds counts[group_i, group_j] += one(T) +end + +@inline function remove_counts!( + counts::AbstractArray{T}, data_value::Real, group_i::Int, group_j::Int) where {T <: + Real} + @inbounds counts[group_i, group_j] -= one(T) +end + # ============================================================================ # Data preparation utilities # ============================================================================ From 401dd967fd39feb0acd53c9106642e6dde75cc2f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 19 Oct 2025 19:43:33 +0200 Subject: [PATCH 199/266] make loss compatible with extensions --- src/estimator/abstractEstimator.jl | 45 ++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index 1e472ac..5f09f25 100644 --- 
a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -255,23 +255,18 @@ For each pair of groups (i,j): # Returns - Normalized loss value (lower is better) -# Performance -Uses @inbounds for speed. Assumes symmetric structure. +# !warning + This will need to be modified for other data types! """ -@inline function loss_function(realized, counts) +@inline function loss_function(realized, counts::AbstractArray{<:Real}) total_loss = 0.0 total_edges = 0.0 - # Iterate over upper triangle to avoid double-counting - # Reorder loops for better cache locality (j outer, i inner) @inbounds for j in axes(realized, 2) - for i in 1:j # More efficient iteration pattern + for i in 1:j n_edges = counts[i, j] - # Combine conditions to reduce branching if n_edges > 0 - # Compute sum of squares of realized values inline sum_squares = sum(abs2, realized[i, j]) - # Add variance-like term to loss total_loss += n_edges - sum_squares / n_edges total_edges += n_edges end @@ -281,6 +276,28 @@ Uses @inbounds for speed. Assumes symmetric structure. return total_edges > 0 ? 
total_loss / total_edges : 0.0 end +# this assumes that sum realized = counts +@inline function loss_function(realized, counts) + total_loss = 0.0 + total_edges = 0.0 + @inbounds for j in axes(realized, 2) + for i in 1:j + for m in eachindex(realized[i, j]) + total_edges += realized[i, j][m] + total_loss += realized[i, j][m] * + (1 - + _fast_div_(realized[i, j][m], counts[i, j][m])) + end + end + end + return total_loss / total_edges +end + +@inline function _fast_div_(num::Real, denom::Real) + num == 0.0 && denom == 0.0 && return 0.0 + return num / denom +end + # ============================================================================ # Count manipulation helpers # ============================================================================ @@ -333,6 +350,16 @@ end @inbounds counts[group_i, group_j] -= one(T) end +@inline function add_counts!( + counts::AbstractArray, data_value, group_i::Int, group_j::Int) + @inbounds counts[group_i, group_j] .+= 1#data_value +end + +@inline function remove_counts!( + counts::AbstractArray, data_value, group_i::Int, group_j::Int) + @inbounds counts[group_i, group_j] .-= 1#data_value +end + # ============================================================================ # Data preparation utilities # ============================================================================ From 7349eaed7c272969c07d4bdf4c35b0b09eb9e64c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 24 Oct 2025 12:43:48 +0200 Subject: [PATCH 200/266] add clustering --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 20c7b7a..959a0fa 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ authors = ["Charles Dufour", "Jake Grainger"] [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ 
-29,6 +30,7 @@ MakieExt = "Makie" [compat] ArgCheck = "2.5.0" +Clustering = "0.15.8" Graphons = "0.1.0" LinearAlgebra = "1.12.0" From 02427fb47eb67f41a44e8d8dc2d3adb609788613 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 24 Oct 2025 14:28:00 +0200 Subject: [PATCH 201/266] use Graphons.jl to generate graphs for test --- test/Project.toml | 1 - test/test_cat_case.jl | 13 +++++++------ test/test_swap_workspace.jl | 21 ++++++++++++--------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index 1c5fcb0..e35dbe4 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,6 +1,5 @@ [deps] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" diff --git a/test/test_cat_case.jl b/test/test_cat_case.jl index 51e6bfd..deca73d 100644 --- a/test/test_cat_case.jl +++ b/test/test_cat_case.jl @@ -12,14 +12,15 @@ using StaticArrays m = 3 ps = SVector{m}(fill(1 / m, m)) d_mine = NetworkHistogram.Cat(ps) - # Create a block model with two groups - sbm = NetworkHistogram.BlockModel(k, d_mine) - sbm[1, 1] = NetworkHistogram.Cat(SVector{3}([0.7, 0.2, 0.1])) - sbm[2, 2] = NetworkHistogram.Cat(SVector{3}([0.1, 0.3, 0.6])) - sbm[1, 2] = NetworkHistogram.Cat(SVector{3}([0.3, 0.4, 0.3])) + + θ = [NetworkHistogram.Cat(SVector{3}([0.7, 0.2, 0.1])) NetworkHistogram.Cat(SVector{3}([0.1, 0.3, 0.6])); + NetworkHistogram.Cat(SVector{3}([0.1, 0.3, 0.6])) NetworkHistogram.Cat(SVector{3}([0.3, 0.4, 0.3]))] + sbm = DecoratedSBM(θ, [0.5, 0.5]) labels = StatsBase.inverse_rle(1:k, fill(n ÷ k, k)) - A = NetworkHistogram.sample(sbm, labels) + latents = vcat(repeat([0.2], n ÷ 2), repeat([0.8], n ÷ 2)) + A = sample_graph(sbm, latents) + edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment( labels, edgelist, 
NetworkHistogram.Dist(d_mine)) diff --git a/test/test_swap_workspace.jl b/test/test_swap_workspace.jl index 6401e49..367d0e9 100644 --- a/test/test_swap_workspace.jl +++ b/test/test_swap_workspace.jl @@ -9,14 +9,14 @@ function manual_loglikelihood(A, node_labels, θ) ll = 0.0 for j in 1:n for i in 1:n - if i!=j + if i != j g1 = node_labels[i] g2 = node_labels[j] ll += NetworkHistogram.logpdf(θ[g1, g2], A[i, j]) end end end - return ll/2 + return ll / 2 end function slow_swap(a::NetworkHistogram.Assignment, s::NetworkHistogram.Swap) @@ -29,16 +29,19 @@ end Random.seed!(42) n = 6 k = 2 - p1, p2 = 0.8, 0.3 + #p1, p2 = 0.8, 0.3 d = NetworkHistogram.Bernoulli(0.5) # Create a block model with two groups - sbm = NetworkHistogram.BlockModel(k, d) - sbm[1, 1] = NetworkHistogram.Bernoulli(p1) - sbm[2, 2] = NetworkHistogram.Bernoulli(p2) - sbm[1, 2] = NetworkHistogram.Bernoulli(0.1) + # sbm = NetworkHistogram.BlockModel(k, d) + # sbm[1, 1] = NetworkHistogram.Bernoulli(p1) + # sbm[2, 2] = NetworkHistogram.Bernoulli(p2) + # sbm[1, 2] = NetworkHistogram.Bernoulli(0.1) - labels = StatsBase.inverse_rle(1:k, fill(n÷k, k)) - A = NetworkHistogram.sample(sbm, labels) + sbm = SBM([0.8 0.3; 0.3 0.8], [0.5, 0.5]) + + labels = StatsBase.inverse_rle(1:k, fill(n ÷ k, k)) + latents = [0.1, 0.1, 0.1, 0.9, 0.9, 0.9] + A = sample_graph(sbm, latents) edgelist = NetworkHistogram.EdgeList(A) assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) From 2930519f05986093c173ba0754c5ee8a70a63d82 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 24 Oct 2025 14:28:31 +0200 Subject: [PATCH 202/266] reexport Graphons.jl export in NetworkHistogram --- Project.toml | 2 ++ src/NetworkHistogram.jl | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 959a0fa..847dbe6 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" @@ -33,6 +34,7 @@ ArgCheck = "2.5.0" Clustering = "0.15.8" Graphons = "0.1.0" LinearAlgebra = "1.12.0" +Reexport = "1.2.2" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 7b0e54d..19a2633 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -9,13 +9,15 @@ using LinearAlgebra using ArgCheck using Random: randperm +using Reexport +@reexport using Graphons + include("utils/include.jl") using .FastSymArray include("distributions/include.jl") include("EdgeList.jl") include("assignment.jl") -include("block_model.jl") include("optimization/greedy.jl") include("estimator/abstractEstimator.jl") include("estimator/SpectralEstimator.jl") From f1462fe96368dc3d164eb93e8d98b4612a20d439 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 24 Oct 2025 20:49:09 +0200 Subject: [PATCH 203/266] fix symarray --- src/NetworkHistogram.jl | 6 +- src/utils/SymArray.jl | 217 +++++++++++++++++++--------------------- test/test_symarray.jl | 41 ++++++-- 3 files changed, 136 insertions(+), 128 deletions(-) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 19a2633..e00f37f 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -13,7 +13,8 @@ using Reexport @reexport using Graphons include("utils/include.jl") -using .FastSymArray + +@reexport using .FastSymArray include("distributions/include.jl") include("EdgeList.jl") @@ -30,4 +31,7 @@ function from_adjs_to_decorated end function heatmap_params end export from_adjs_to_decorated, heatmap_params + +export NethistResult + end diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index e80ee6c..1268835 100644 --- 
a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -9,11 +9,12 @@ module FastSymArray using SparseArrays using LinearAlgebra import Base: eltype, convert, size, getindex, setindex!, copy!, similar, - IndexStyle, axes, length, iterate, copyto! -export SymArray, eltype, deepcopy!, sum_tri_with_diag + IndexStyle, axes, length, iterate, copyto!, fill! +export SymArray, eltype, deepcopy!, sum_tri_with_diag, fast_getindex, fast_setindex! +import SparseArrays: getcolptr, nonzeros, FixedSparseCSC """ - SymArray{F} <: AbstractArray{F, 2} + SymArray{F} <: AbstractSparseMatrix{F, 2} A symmetric matrix that stores only the upper triangle using a sparse matrix. @@ -43,6 +44,42 @@ mutable struct SymArray{F} <: AbstractSparseMatrix{F, Int} uppertrian::SparseMatrixCSC{F, Int} end +SymArray(::Type{F}, dims::Int...) where {F} = SymArray(F, dims) +function SymArray(::Type{F}, dims::NTuple{2, Int}) where {F} + if dims[1] != dims[2] + throw(ArgumentError("SymArray must be square, got dims=$(dims)")) + end + SymArray{F}(SparseMatrixCSC{F, Int}(make_csc_format(dims[1], F)...)) +end + +SymArray{F}(::UndefInitializer, dims::Int...) 
where {F} = SymArray{F}(undef, dims) +function SymArray{F}(::UndefInitializer, dims::NTuple{2, Int}) where {F} + return SymArray(F, dims) +end + +function make_csc_format(k::Int, ::Type{F}) where {F} + k > 0 || throw(ArgumentError("Matrix dimension k=$k must be positive")) + + n_elements = div(k * (k + 1), 2) # Number of non-zeros in upper triangle + + colptr = Vector{Int}(undef, k + 1) + rowval = Vector{Int}(undef, n_elements) + nzval = Vector{F}(undef, n_elements) + + @inbounds for j in 1:(k + 1) + colptr[j] = div((j - 1) * j, 2) + 1 + end + + idx = 1 + @inbounds for j in 1:k + for i in 1:j + rowval[idx] = i + idx += 1 + end + end + return k, k, colptr, rowval, nzval +end + """ SymArray(k::Int, d::F) @@ -133,9 +170,6 @@ function size(a::SymArray) return size(a.uppertrian) end -# IndexStyle trait - use CartesianIndex for 2D arrays -Base.IndexStyle(::Type{<:SymArray}) = IndexCartesian() - # axes function function axes(a::SymArray) return axes(a.uppertrian) @@ -146,44 +180,47 @@ function length(a::SymArray) return length(a.uppertrian) end -Base.@propagate_inbounds function getindex(a::SymArray{F}, i::Int, j::Int) where {F} - @boundscheck checkbounds(a, i, j) - if i <= j - @inbounds return a.uppertrian[i, j] - else - @inbounds return a.uppertrian[j, i] - end -end - -Base.@propagate_inbounds function setindex!(a::SymArray{F}, v, i::Int, j::Int) where {F} - @boundscheck checkbounds(a, i, j) - if i <= j - @inbounds a.uppertrian[i, j] = v - else - @inbounds a.uppertrian[j, i] = v - end +# Base.@propagate_inbounds function getindex(a::SymArray{F}, i::Int, j::Int) where {F} +# @boundscheck checkbounds(a, i, j) +# if i <= j +# @inbounds return a.uppertrian[i, j] +# else +# @inbounds return a.uppertrian[j, i] +# end +# end + +# Base.@propagate_inbounds function setindex!(a::SymArray{F}, v, i::Int, j::Int) where {F} +# @boundscheck checkbounds(a, i, j) +# if i <= j +# @inbounds a.uppertrian[i, j] = v +# else +# @inbounds a.uppertrian[j, i] = v +# end +# end + +# faster 
indexing by avoiding search +Base.@propagate_inbounds function getindex(A::SymArray, i0::Integer, i1::Integer) + i0, i1 = minmax(i0, i1) + @boundscheck checkbounds(A, i0, i1) + r1 = Int(@inbounds getcolptr(A.uppertrian)[i1]) + nonzeros(A.uppertrian)[r1 + i0 - 1] end -# similar function for creating similar arrays -function similar(a::SymArray{F}) where {F} - k = size(a, 1) - return SymArray(k, zero(F)) +Base.@propagate_inbounds function setindex!(A::SymArray, v, i::Int, j::Int) + i, j = minmax(i, j) + @boundscheck checkbounds(A, i, j) + r1 = Int(@inbounds getcolptr(A.uppertrian)[j]) + nonzeros(A.uppertrian)[r1 + i - 1] = v end -function similar(a::SymArray, ::Type{T}) where {T} - k = size(a, 1) - return SymArray(k, zero(T)) +function similar(a::SymArray, ::Type{T} = eltype(a), dims::Dims{2} = size(a)) where {T} + return SymArray{T}(undef, dims) end -function similar(a::SymArray, ::Type{T}, dims::Dims{2}) where {T} - dims[1] == dims[2] || throw(ArgumentError("SymArray must be square")) - return SymArray(dims[1], zero(T)) -end - -function copyto!(dest::SymArray{F}, src::SymArray{F}) where {F} +function copy!(dest::SymArray{F}, src::SymArray{F}) where {F} size(dest) == size(src) || throw(DimensionMismatch("arrays must have the same size")) - copyto!(dest.uppertrian, src.uppertrian) - return dest + copy!(dest.uppertrian.nzval, src.uppertrian.nzval) + return nothing end """ @@ -198,11 +235,7 @@ Efficiently sum all elements in the symmetric matrix (counting each off-diagonal This is more efficient than `sum(a)` because it only sums stored elements. 
""" function sum_tri_with_diag(a::SymArray) - return sum(a.uppertrian.nzval) -end - -function eltype(::SymArray{F}) where {F} - return F + return sum(a.uppertrian) end function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} @@ -211,44 +244,32 @@ function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} # Directly build upper triangle sparse matrix # Pre-allocate with exact size needed - I_indices = Vector{Int}(undef, div(k * (k + 1), 2)) - J_indices = Vector{Int}(undef, div(k * (k + 1), 2)) - values = Vector{F}(undef, div(k * (k + 1), 2)) - + m, n, colptr, rowval, nzval = make_csc_format(k, F) idx = 1 - for j in 1:k + @inbounds for j in 1:k for i in 1:j - I_indices[idx] = i - J_indices[idx] = j - values[idx] = a[i, j] + nzval[idx] = a[i, j] idx += 1 end end - - uppertrian = sparse(I_indices, J_indices, values, k, k) - return SymArray{F}(uppertrian) -end - -function convert(::Type{AbstractMatrix{F}}, a::SymArray{F}) where {F} - # Reconstruct full symmetric matrix from upper triangle - # m = upper + upper' - Diagonal(upper) creates the full symmetric matrix - m = a.uppertrian + transpose(a.uppertrian) - - SparseArrays.spdiagm(0 => diag(a.uppertrian)) - return Matrix(m) -end - -function copy!(dest::SymArray{F}, src::SymArray{F}) where {F <: Real} - copyto!(dest, src) - return dest + return SymArray(SparseMatrixCSC{F, Int}(m, n, colptr, rowval, nzval)) end function deepcopy!(dest::SymArray{F}, src::SymArray{F}) where {F <: AbstractArray} - @inbounds for index in eachindex(dest) - copyto!(dest[index], src[index]) + dest_ = dest.uppertrian.nzval + src_ = src.uppertrian.nzval + @inbounds for index in eachindex(src_) + if isassigned(dest_, index) + copy!(dest_[index], src_[index]) + else + dest_[index] = copy(src_[index]) + end end return dest end +deepcopy!(dest::SymArray{F}, src::SymArray{F}) where {F <: Real} = copy!(dest, src) + # Broadcasting support - custom style to maintain symmetric structure struct SymArrayStyle <: 
Broadcast.AbstractArrayStyle{2} end SymArrayStyle(::Val{2}) = SymArrayStyle() @@ -273,57 +294,21 @@ Base.BroadcastStyle(::SymArrayStyle, ::SymArrayStyle) = SymArrayStyle() # Custom similar for broadcasted SymArrays function Base.similar( bc::Broadcast.Broadcasted{SymArrayStyle}, ::Type{ElType}) where {ElType} - # For mutating functions that return Nothing, don't allocate a SymArray - if ElType === Nothing - # Find the first SymArray in the broadcast expression - A = find_first_symarray(bc) - # Return a similar array with the same element type as the input - # This allows the broadcast to work but the result won't be used - return similar(Array{ElType}, axes(bc)) - end - # Find the first SymArray in the broadcast expression to get dimensions - A = find_first_symarray(bc) - return SymArray(size(A, 1), zero(ElType)) -end - -# Helper function to find a SymArray in the broadcast tree -find_first_symarray(bc::Broadcast.Broadcasted) = find_first_symarray(bc.args) -find_first_symarray(args::Tuple{}) = error("No SymArray found in broadcast") -find_first_symarray(args::Tuple) = find_first_symarray_in_args(args[1], Base.tail(args)) - -# Handle direct SymArray -find_first_symarray_in_args(x::SymArray, rest) = x -# Handle Extruded SymArray (from broadcasting) -find_first_symarray_in_args(x::Broadcast.Extruded{<:SymArray}, rest) = x.x -# Handle nested broadcasts -find_first_symarray_in_args(x::Broadcast.Broadcasted, rest) = find_first_symarray(x) -# Keep searching -find_first_symarray_in_args(x, rest) = find_first_symarray(rest) - -# Custom copyto! 
for efficient broadcasting -function Base.copyto!(dest::SymArray, bc::Broadcast.Broadcasted{SymArrayStyle}) - # Broadcast only over the upper triangle for efficiency - axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc)) - bc′ = Broadcast.preprocess(dest, bc) - - # Only compute upper triangle - k = size(dest, 1) - @inbounds for j in 1:k - for i in 1:j - dest[i, j] = bc′[CartesianIndex(i, j)] - end - end - return dest + A = find_symarray(bc) + return SymArray(similar(A.uppertrian, ElType)) end -# For broadcasting that returns Nothing (like with mutating functions) -function Base.copyto!(dest::AbstractArray, bc::Broadcast.Broadcasted{SymArrayStyle}) - # Fall back to default behavior - Broadcast.materialize!(dest, bc) +# I don't understand this, but it's needed to avoid errors somehow?? +function Base.similar(bc::Broadcast.Broadcasted{SymArrayStyle}, ::Type{Nothing}) + return similar(Array{Nothing}, axes(bc)) end -@inline function throwdm(axdest, axsrc) - throw(DimensionMismatch("destination axes $axdest are not compatible with source axes $axsrc")) -end +# Helper function to find a SymArray in the broadcast tree +find_symarray(bc::Broadcast.Broadcasted) = find_symarray(bc.args) +find_symarray(args::Tuple) = find_symarray(args[1], Base.tail(args)) +find_symarray(x) = x +find_symarray(args::Tuple{}) = nothing +find_symarray(a::SymArray, rest) = a +find_symarray(::Any, rest) = find_symarray(rest) end diff --git a/test/test_symarray.jl b/test/test_symarray.jl index a2a1f53..7d36971 100644 --- a/test/test_symarray.jl +++ b/test/test_symarray.jl @@ -3,6 +3,7 @@ using NetworkHistogram using NetworkHistogram.FastSymArray using SparseArrays using LinearAlgebra +using StaticArrays @testset "SymArray Array Interface" begin @testset "Construction and basic properties" begin @@ -86,7 +87,7 @@ using LinearAlgebra @test a[2, 2] == 4.0 # Test conversion to AbstractMatrix - b = convert(AbstractMatrix{Float64}, a) + b = convert(Matrix{Float64}, a) @test b isa Matrix{Float64} @test 
b == M @test b[1, 2] == b[2, 1] # Verify symmetry @@ -116,14 +117,14 @@ using LinearAlgebra @test_throws ArgumentError similar(a, Float64, (3, 4)) end - @testset "copyto! and copy!" begin + @testset "copy! and deepcopy!" begin a = SymArray(3, 0.0) a[1, 1] = 1.0 a[1, 2] = 2.0 a[2, 3] = 5.0 b = similar(a) - copyto!(b, a) + copy!(b, a) @test b[1, 1] == 1.0 @test b[1, 2] == 2.0 @@ -131,16 +132,34 @@ using LinearAlgebra @test b[2, 3] == 5.0 @test b[3, 2] == 5.0 - # Test copy! - c = similar(a) - copy!(c, a) - @test c[1, 1] == a[1, 1] - @test c[1, 2] == a[1, 2] - @test c[2, 3] == a[2, 3] - # Test dimension mismatch d = SymArray(4, 0.0) - @test_throws DimensionMismatch copyto!(d, a) + @test_throws DimensionMismatch copy!(d, a) + + # Test deepcopy! + src = SymArray{Vector{Int}}(undef, 4, 4) + for j in 1:4, i in j:4 + src[i, j] = [i, j] + end + + # on unassigned dest + dest = similar(src) + deepcopy!(dest, src) + for j in 1:4, i in j:4 + @test dest[i, j] == src[i, j] + @test !(dest[i, j] === src[i, j]) # Ensure deep copy + end + + # on assigned dest + dest2 = similar(src) + for j in 1:4, i in j:4 + dest2[i, j] = [-1, -1] + end + deepcopy!(dest2, src) + for j in 1:4, i in j:4 + @test dest2[i, j] == src[i, j] + @test !(dest2[i, j] === src[i, j]) # Ensure deep copy + end end @testset "Array operations" begin From 6413ec9010e6806b3cdf9d5e362bc2761defb02e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 24 Oct 2025 20:52:13 +0200 Subject: [PATCH 204/266] fix some weird similar broadcast shenanigans --- src/optimization/swap_categorical.jl | 18 ++++++++++-------- src/utils/SymArray.jl | 5 ----- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl index e72ea85..8f4d89c 100644 --- a/src/optimization/swap_categorical.jl +++ b/src/optimization/swap_categorical.jl @@ -60,8 +60,8 @@ function Assignment( for g2 in 1:n_groups, g1 in g2:n_groups w.log_likelihood_per_group[g1, g2] = 
log_likelihood_per_group[g1, g2] w.counts[g1, g2] = counts[g1, g2] - copyto!(w.realized[g1, g2], realized[g1, g2]) - copyto!(w.estimated[g1, g2], estimated[g1, g2]) + copy!(w.realized[g1, g2], realized[g1, g2]) + copy!(w.estimated[g1, g2], estimated[g1, g2]) end return assignment @@ -96,11 +96,11 @@ function copy_categorical_workspace!( deepcopy!(dest.estimated, src_assignment.additional_workspace.estimated) # @inbounds for j in 1:k, i in 1:j - # copyto!(dest.realized[i, j], src_ws.realized[i, j]) + # copy!(dest.realized[i, j], src_ws.realized[i, j]) # end # @inbounds for j in 1:k, i in 1:j - # copyto!(dest.estimated[i, j], src_ws.estimated[i, j]) + # copy!(dest.estimated[i, j], src_ws.estimated[i, j]) # end end @@ -122,11 +122,11 @@ function revert_swap_workspace!(a::Assignment, ws::WorkspaceDiscreteSwap) deepcopy!(a.additional_workspace.realized, ws.realized) deepcopy!(a.additional_workspace.estimated, ws.estimated) # @inbounds for j in 1:k, i in 1:j - # copyto!(as.realized[i, j], ws.realized[i, j]) + # copy!(as.realized[i, j], ws.realized[i, j]) # end # @inbounds for j in 1:k, i in 1:j - # copyto!(as.estimated[i, j], ws.estimated[i, j]) + # copy!(as.estimated[i, j], ws.estimated[i, j]) # end end @@ -155,8 +155,10 @@ function apply_swap!(as::Assignment, s::Swap{<:WorkspaceDiscreteSwap}) as.additional_workspace.counts[gu, g_inter] += 1 as.additional_workspace.realized[gu, g_inter][e] += 1 end - _fast_normalization!.(as.additional_workspace.estimated, - as.additional_workspace.realized, as.additional_workspace.counts) + @inbounds for index in eachindex(as.additional_workspace.estimated) + _fast_normalization!(as.additional_workspace.estimated[index], + as.additional_workspace.realized[index], as.additional_workspace.counts[index]) + end swap_node_labels!(as, u, v) m = size(as.additional_workspace.estimated[1, 1], 1) for g2 in 1:n_groups diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 1268835..799e37f 100644 --- a/src/utils/SymArray.jl +++ 
b/src/utils/SymArray.jl @@ -298,11 +298,6 @@ function Base.similar( return SymArray(similar(A.uppertrian, ElType)) end -# I don't understand this, but it's needed to avoid errors somehow?? -function Base.similar(bc::Broadcast.Broadcasted{SymArrayStyle}, ::Type{Nothing}) - return similar(Array{Nothing}, axes(bc)) -end - # Helper function to find a SymArray in the broadcast tree find_symarray(bc::Broadcast.Broadcasted) = find_symarray(bc.args) find_symarray(args::Tuple) = find_symarray(args[1], Base.tail(args)) From 1d56733f57b3d87e48a1cd139b8367f3722efd2f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 24 Oct 2025 21:05:45 +0200 Subject: [PATCH 205/266] switch tutorials to use graphons.jl --- docs/Project.toml | 3 + docs/literate/tutorials/simple_graph.jl | 102 ++++++++++++++++-------- docs/src/tutorials/simple_graph.md | 75 ++++++++++++++--- 3 files changed, 138 insertions(+), 42 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index d1d0c6f..53fbe89 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,10 +1,13 @@ [deps] CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253" +Kneedle = "4ef9287f-f14a-4b13-b4c1-9bb5ae54399a" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index 0543fcd..311bc43 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -23,17 +23,16 @@ using NetworkHistogram using Distributions h = 300; # 
hide +Random.seed!(1234); # Define a simple step-function graphon -W(u, v) = u * v +w = SimpleContinuousGraphon((x, y) -> x * y) # We can visualize this graphon as a heatmap. let - grid = 0:0.01:1 fig = Mke.Figure(size = (h + 20, h)) - ax = Mke.Axis(fig[1, 1], title = "True Graphon W(u,v)", - xlabel = "u", ylabel = "v", aspect = Mke.DataAspect()) - hm = Mke.heatmap!(ax, grid, grid, W, colormap = :binary, colorrange = (0, 1)) + ax = Mke.Axis(fig[1, 1], title = "True Graphon W(u,v)") + hm = Mke.heatmap!(ax, w, colormap = :binary, colorrange = (0, 1)) Mke.Colorbar(fig[1, 2], hm) fig end @@ -44,26 +43,10 @@ end # To generate a random graph from a graphon, we follow these steps: # 1. **Assign latent positions:** For a graph with `n` nodes, we sample `n` independent and identically distributed random variables $u_1, u_2, \dots, u_n$ from a Uniform(0, 1) distribution. These are the latent positions of our nodes. # 2. **Generate edges:** For each pair of nodes `(i, j)` with `i < j`, we generate a random number from a Bernoulli distribution with probability $W(u_i, u_j)$. This determines whether an edge exists between them. The resulting adjacency matrix `A` will be symmetric. - -# Let's write a function to do this. - -function sample_graph(W_func, n::Int; seed = 123) - Random.seed!(seed) - u = rand(n) # Latent positions - A = zeros(Int, n, n) - for i in 1:n - for j in (i + 1):n - if rand() < W_func(u[i], u[j]) - A[i, j] = A[j, i] = 1 - end - end - end - return A, u -end - -# Now, let's sample a graph with 400 nodes from our graphon `W`. +# Let's sample a graph with 400 nodes from our graphon `W`. n = 400 -A, u_true = sample_graph(W, n); +u_true = rand(n); # Latent positions +A = sample_graph(w, u_true); # We can visualize the adjacency matrix of the sampled graph. # To make the block structure visible, we sort the nodes by their latent positions. 
@@ -112,7 +95,8 @@ initial_assignment = shuffle(oracle_labels); # Now, we create an `Assignment` object, which holds all the information # about the model and the current state of the node groupings. oracle_estimator = Assignment(oracle_labels, edge_list, Dist(dist)); -heatmap_params(oracle_estimator, ordering = false, colorrange = (0, 1)) +sbm_oracle = NetworkHistogram.to_block_model(oracle_estimator); +Mke.heatmap(sbm_oracle, colormap = :binary, colorrange = (0, 1)) println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) # `NetworkHistogram.jl` provides optimization algorithms to improve the initial assignment. @@ -121,7 +105,7 @@ println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) params_opti = NetworkHistogram.GreedyParams( 100_000, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(2_000), false) + NetworkHistogram.PreviousBestValue(2_000), false); a = nethist(A, dist, initial_assignment, params_opti, false); @@ -131,20 +115,72 @@ a = nethist(A, dist, initial_assignment, params_opti, false); # We can visualize the fitted histogram. heatmap_params(a, ordering = false, colorrange = (0, 1)) -# And we can look at the estimated block model. -sbm_fitted = NetworkHistogram.BlockModel(a); -# We first align the groups to the true latent positions. -NetworkHistogram.align_sbm_true_latents!(sbm_fitted, a, oracle_estimator.node_labels); +# We can convert it to a block model for easier interpretation. 
+ +res = NethistResult(a); + +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], sbm_oracle, + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end + +# the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. + +NetworkHistogram.align_res_true_latents!(res, a, oracle_estimator.node_labels); # and display the true function, the oracle estimator, and the fitted model let fig = Mke.Figure(size = (1220, 400)) titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] - Mke.heatmap!(axes[1], 0:0.01:1, 0:0.01:1, W, colormap = :binary, colorrange = (0, 1)) - Mke.heatmap!(axes[2], NetworkHistogram.BlockModel(oracle_estimator), + Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], sbm_oracle, + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end + +# We can even fit a Stochastic Shape Model quite easily from the fitted SBM. 
+ +using Clustering + +ξ = NetworkHistogram.node_labels_to_latents(res.node_labels, res.model); +shape_range = 1:(k * (k + 1) ÷ 2 - 1) +ssm_estimated, criterion_values = Graphons.estimate_ssm( + res.model, A, ξ, shape_range) + +using Kneedle +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +# Let's extract the optimal number of shapes using the Kneedle algorithm: + +k_knee = knees(kr)[1] +ssm_knee = SSM(res.model, k_knee) + +Mke.heatmap(ssm_estimated, colormap = :binary, colorrange = (0, 1)) +println("Number of shapes in SSM argmin: ", length(ssm_estimated.θ)) +println("Number of shapes in SSM knee: ", length(ssm_knee.θ)) +println("Number of shapes in SBM: ", length(res.model.θ)) + +# We greatly reduced the number of parameters from the original SBM estimate while preserving much of the structure of the estimated graphon as seen below: + +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["SBM", "SSM argmin", "SSM knee"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], res.model, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], ssm_estimated, colormap = :binary, colorrange = (0, 1)) - Mke.heatmap!(axes[3], sbm_fitted, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], ssm_knee, colormap = :binary, colorrange = (0, 1)) Mke.Colorbar(fig[1, 4], colormap = :binary, limits = (0, 1), label = "Edge Probability", width = 20) fig diff --git a/docs/src/tutorials/simple_graph.md b/docs/src/tutorials/simple_graph.md index 1551ad7..2631db4 100644 --- a/docs/src/tutorials/simple_graph.md +++ b/docs/src/tutorials/simple_graph.md @@ -26,6 +26,7 @@ using NetworkHistogram using Distributions h = 300; # hide +Random.seed!(1234); nothing #hide ```` @@ -147,7 +148,8 @@ about the model and the current state of the node groupings. 
````@example simple_graph oracle_estimator = Assignment(oracle_labels, edge_list, Dist(dist)); -heatmap_params(oracle_estimator, ordering = false, colorrange = (0, 1)) +sbm_oracle = NetworkHistogram.to_block_model(oracle_estimator); +Mke.heatmap(sbm_oracle, colormap = :binary, colorrange = (0, 1)) println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) ```` @@ -159,7 +161,7 @@ groups to maximize the log-likelihood. ````@example simple_graph params_opti = NetworkHistogram.GreedyParams( 100_000, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(2_000), false) + NetworkHistogram.PreviousBestValue(2_000), false); a = nethist(A, dist, initial_assignment, params_opti, false); nothing #hide @@ -174,17 +176,29 @@ We can visualize the fitted histogram. heatmap_params(a, ordering = false, colorrange = (0, 1)) ```` -And we can look at the estimated block model. +We can convert it to a block model for easier interpretation. ````@example simple_graph -sbm_fitted = NetworkHistogram.BlockModel(a); -nothing #hide +res = NethistResult(a); + +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], 0:0.01:1, 0:0.01:1, W, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], sbm_oracle, + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end ```` -We first align the groups to the true latent positions. +the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. 
````@example simple_graph -NetworkHistogram.align_sbm_true_latents!(sbm_fitted, a, oracle_estimator.node_labels); +NetworkHistogram.align_res_true_latents!(res, a, oracle_estimator.node_labels); nothing #hide ```` @@ -196,9 +210,52 @@ let titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] Mke.heatmap!(axes[1], 0:0.01:1, 0:0.01:1, W, colormap = :binary, colorrange = (0, 1)) - Mke.heatmap!(axes[2], NetworkHistogram.BlockModel(oracle_estimator), + Mke.heatmap!(axes[2], sbm_oracle, + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end +```` + +We can even fit a Stochastic Shape Model quite easily from the fitted SBM. + +````@example simple_graph +using Clustering + +ξ = NetworkHistogram.node_labels_to_latents(res.node_labels, res.model); +shape_range = 1:(k * (k + 1) ÷ 2 - 1) +ssm_estimated, criterion_values = Graphons.estimate_ssm( + res.model, A, ξ, shape_range) + +using Kneedle +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +```` + + Let's extract the optimal number of shapes using the Kneedle algorithm: + +````@example simple_graph +k_knee = knees(kr)[1] +ssm_knee = SSM(res.model, k_knee) + +Mke.heatmap(ssm_estimated, colormap = :binary, colorrange = (0, 1)) +println("Number of shapes in SSM argmin: ", length(ssm_estimated.θ)) +println("Number of shapes in SSM knee: ", length(ssm_knee.θ)) +println("Number of shapes in SBM: ", length(res.model.θ)) +```` + +We greatly reduced the number of parameters from the original SBM estimate while preserving much of the structure of the estimated graphon as seen below: + +````@example simple_graph +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["SBM", "SSM argmin", "SSM knee"] + axes = 
[Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], res.model, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], ssm_estimated, colormap = :binary, colorrange = (0, 1)) - Mke.heatmap!(axes[3], sbm_fitted, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], ssm_knee, colormap = :binary, colorrange = (0, 1)) Mke.Colorbar(fig[1, 4], colormap = :binary, limits = (0, 1), label = "Edge Probability", width = 20) fig From 7f050cf77e951b3f958a65979f51bfbb18ce5946 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 24 Oct 2025 21:07:54 +0200 Subject: [PATCH 206/266] remove old BlockModel --- ext/MakieExt.jl | 56 +---- src/api.jl | 63 ++++++ src/assignment.jl | 28 +++ src/block_model.jl | 339 ----------------------------- src/estimator/abstractEstimator.jl | 20 +- 5 files changed, 104 insertions(+), 402 deletions(-) delete mode 100644 src/block_model.jl diff --git a/ext/MakieExt.jl b/ext/MakieExt.jl index ce0a3fe..8b5dc46 100644 --- a/ext/MakieExt.jl +++ b/ext/MakieExt.jl @@ -10,7 +10,7 @@ using Makie using StatsBase: countmap import NetworkHistogram: get_probability_matrix, Assignment, heatmap_params, - number_nodes, number_groups, Dist, BlockModel + number_nodes, number_groups, Dist, unwrap import Distributions import StatsAPI @@ -32,17 +32,6 @@ function Makie.convert_arguments(::Type{<:AbstractPlot}, a::Assignment) return ps end -""" - Makie.convert_arguments(::Type{<:Heatmap}, sbm::BlockModel) - -Convert a BlockModel to heatmap arguments for Bernoulli distributions. 
-""" -function Makie.convert_arguments(::Type{<:Heatmap}, - sbm::BlockModel{D}) where {D <: Union{ - Dist{T}, T} where {T <: NetworkHistogram.Bernoulli}} - return (0:0.01:1, 0:0.01:1, (x, y) -> first(StatsAPI.params(sbm[x, y]))) -end - """ heatmap_params(a; colormap=:binary, ordering=false, colorrange=nothing, group_match=1:number_groups(a)) @@ -106,48 +95,5 @@ function heatmap_params(a; colormap = :binary, ordering = false, return fig end -""" - order_groups(a::Assignment, latents::AbstractVector) - -Order groups based on true latent variables (heuristic alignment). - -This is a heuristic approach to match estimated groups to ground truth orderings -by analyzing the overlap between sorted latents and group assignments. - -# Arguments -- `a::Assignment`: The assignment with estimated groups -- `latents::AbstractVector`: True latent variables (e.g., block memberships) - -# Returns -- Permutation vector for reordering groups -""" -function order_groups(a::Assignment, latents::AbstractVector) - n = number_nodes(a) - k = number_groups(a) - sort_perm = sortperm(latents) - sorted_group_labels = a.node_labels[sort_perm] - dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] - counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) - for group in 1:k) - return sort(1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) -end - -""" - align_sbm_true_latents!(sbm::NetworkHistogram.BlockModel, a::Assignment, latents) - -Align a BlockModel's groups to match true latent variables. - -# Arguments -- `sbm::BlockModel`: The block model to align (modified in-place) -- `a::Assignment`: The assignment -- `latents`: True latent variables - -# Note -This modifies `sbm` in-place to reorder its blocks. 
-""" -function align_sbm_true_latents!(sbm::NetworkHistogram.BlockModel, a::Assignment, latents) - NetworkHistogram.align_sbm!(sbm, order_groups(a, latents)) -end - export heatmap_params end diff --git a/src/api.jl b/src/api.jl index 4411237..2c152b0 100644 --- a/src/api.jl +++ b/src/api.jl @@ -112,3 +112,66 @@ end function postprocess(out) return out end + +# functions for postprocessing + +struct NethistResult{S} + node_labels::Vector{Int} + model::S +end + +function NethistResult(a::Assignment) + return NethistResult(copy(a.node_labels), to_block_model(a)) +end + +function to_block_model(a::Assignment{ + E, Dist{D}}) where {E, D <: Union{Bernoulli, Distributions.Bernoulli}} + sizes = counts(a.node_labels) ./ length(a.node_labels) + θ::Matrix{Float64} = map(x -> first(params(unwrap(x))), a.θ) + return SBM(θ, sizes) +end + +function to_block_model(a::Assignment) + @info "Converting Assignment to DecoratedSBM" + sizes = counts(a.node_labels) ./ length(a.node_labels) + return DecoratedSBM(unwrap.(a.θ), sizes) +end + +function node_labels_to_latents(node_labels::AbstractVector{Int}, sbm) + return map(label -> _label_to_latent(label, sbm), node_labels) +end + +function _label_to_latent(label::Int, sbm) + return sbm.cumsize[label] - eps() +end + +function align_res_true_latents!(res, a::Assignment, latents) + perm = order_groups(a, latents) + permute!(res.model, perm) + res.node_labels .= map(x -> findfirst(==(x), perm), a.node_labels) +end + +function permute!(sbm, perm) + permuted_theta = copy(sbm.θ) + sbm.θ .= permuted_theta[perm, perm] + sbm.size .= sbm.size[perm] + sbm.cumsize .= cumsum(sbm.size) +end + +""" + order_groups(a::Assignment, latents::AbstractVector) + +Order the groups of an assignment according to the true latents. This is an heuristic +approach, which is not guaranteed to find the true ordering of the groups. 
+""" +function order_groups(a::Assignment, latents::AbstractVector) + n = number_nodes(a) + k = number_groups(a) + sort_perm = sortperm(latents) + sorted_group_labels = a.node_labels[sort_perm] + dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] + counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) + for group in 1:k) + return sort( + 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) +end diff --git a/src/assignment.jl b/src/assignment.jl index 3a28ce0..de8647e 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -204,3 +204,31 @@ function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, end return θ, log_likelihood end + +function get_probability_matrix( + a::Assignment, default_dist = nothing, node_labels = a.node_labels) + θ = unwrap.(a.θ) + D = typeof(first(θ)) + if isnothing(default_dist) + try + default_dist = zero(θ[1, 1]) + catch e + if !isa(e, MethodError) + rethrow(e) + end + error("Please provide a default distribution for the diagonal as it could not be inferred") + end + end + n = length(node_labels) + A = Array{D, 2}(undef, n, n) + for j in 1:n + for i in 1:n + if i == j + A[i, i] = default_dist + else + A[i, j] = θ[node_labels[i], node_labels[j]] + end + end + end + return A +end diff --git a/src/block_model.jl b/src/block_model.jl deleted file mode 100644 index ab1cc7d..0000000 --- a/src/block_model.jl +++ /dev/null @@ -1,339 +0,0 @@ -""" - BlockModel{D, V, M <: AbstractMatrix{D}} - -A stochastic block model representation for network generation and analysis. - -A block model is a piecewise constant graphon approximation where nodes are divided -into K blocks, and edges between blocks follow specific distributions. 
- -# Fields -- `_dists::M`: Symmetric K×K matrix of edge distributions between blocks -- `sizes::V`: Proportions of nodes in each block (sums to 1.0) -- `cum_sizes::V`: Cumulative proportions for mapping latent variables to blocks - -# Type Parameters -- `D`: Distribution type for edges (e.g., Bernoulli, Categorical, etc.) -- `V`: Vector type for storing proportions -- `M`: Matrix type for storing distributions - -# Constructors - -```julia -# Uniform block sizes with k blocks -BlockModel(k::Int, d::D) - -# From an Assignment -BlockModel(a::Assignment) - -# From node labels and parameter matrix -BlockModel(node_labels, θ) - -# From a distribution matrix (infers uniform block sizes) -BlockModel(θ::AbstractMatrix) -``` - -# Examples -```julia -# Create a 3-block model with Bernoulli edges -bm = BlockModel(3, Bernoulli(0.5)) - -# Sample a network from the block model -latents, A = sample(bm, 100) # 100 nodes - -# Access block-to-block distribution -dist_12 = bm[1, 2] - -# Map a latent variable to a block -block = map_ξ_to_block(bm, 0.3) -``` - -See also: [`Assignment`](@ref), [`sample`](@ref), [`get_probability_matrix`](@ref) -""" -struct BlockModel{D, V, M <: AbstractMatrix{D}} - _dists::M - sizes::V - cum_sizes::V -end - -""" - BlockModel(k::Int, d::D) where {D} - -Create a block model with `k` uniform-sized blocks, each initialized with distribution `d`. -""" -function BlockModel(k::Int, d::D) where {D} - k > 0 || throw(ArgumentError("Number of blocks k=$k must be positive")) - sizes = fill(1 / k, k) - cumulative_sizes = cumsum(sizes) - _dists = SymArray(k, d) - return BlockModel(_dists, sizes, cumulative_sizes) -end - -""" - BlockModel(a::Assignment) - -Create a BlockModel from an Assignment, extracting the block proportions and -fitted distributions. 
-""" -function BlockModel(a::Assignment) - k = length(unique(a.node_labels)) - sizes = proportions(a) - cumulative_sizes = cumsum(sizes) - _dists = unwrap.(a.θ) - - # Validate that sizes sum to approximately 1.0 - size_sum = sum(sizes) - abs(size_sum - 1.0) < 1e-10 || @warn "Block sizes sum to $size_sum, expected 1.0" - - return BlockModel(_dists, sizes, cumulative_sizes) -end - -""" - BlockModel(nodes_labels, θ) - -Create a BlockModel from node labels and a distribution matrix θ. -""" -function BlockModel(nodes_labels, θ) - k = length(unique(nodes_labels)) - sizes = counts(nodes_labels) / length(nodes_labels) - cumulative_sizes = cumsum(sizes) - _dists = unwrap.(θ) - - # Validate that sizes sum to approximately 1.0 - size_sum = sum(sizes) - abs(size_sum - 1.0) < 1e-10 || @warn "Block sizes sum to $size_sum, expected 1.0" - - return BlockModel(_dists, sizes, cumulative_sizes) -end - -""" - BlockModel(θ::AbstractMatrix{D}) where {D} - -Create a BlockModel from a distribution matrix, assuming uniform block sizes. -""" -function BlockModel(θ::AbstractMatrix{D}) where {D} - k = size(θ, 1) - sizes = fill(1 / k, k) - cumulative_sizes = cumsum(sizes) - _dists = convert(SymArray{D}, θ) - return BlockModel(_dists, sizes, cumulative_sizes) -end - -""" - map_ξ_to_block(bm::BlockModel, ξ::Real) - -Map a latent variable ξ ∈ [0,1] to its corresponding block index. - -# Arguments -- `bm::BlockModel`: The block model -- `ξ::Real`: Latent variable in [0, 1] - -# Returns -- `Int`: Block index (1 to k) -""" -function map_ξ_to_block(bm::BlockModel, ξ::T) where {T <: Real} - return findfirst(x -> x >= ξ, bm.cum_sizes) -end - -""" - sample(bm::BlockModel, latents::Int, args...) - -Sample a network from the block model by first generating `latents` random latent -variables, then sampling edges according to the block distributions. 
- -# Arguments -- `bm::BlockModel`: The block model to sample from -- `latents::Int`: Number of nodes to generate -- `args...`: Additional arguments passed to edge sampling - -# Returns -- Tuple of (latent_assignments, adjacency_matrix) -""" -function sample(bm::BlockModel, latents::Int, args...) - latents = map(x -> map_ξ_to_block(bm, x), rand(latents)) - return latents, sample(bm, latents, args...) -end - -""" - sample(bm::BlockModel, latents::Vector, args...) - -Sample a network from the block model given specific latent block assignments. - -# Arguments -- `bm::BlockModel`: The block model to sample from -- `latents::Vector`: Block assignments for each node -- `args...`: Additional arguments passed to edge sampling - -# Returns -- Adjacency matrix with sampled edges -""" -function sample(bm::BlockModel, latents::Vector{T}, args...) where {T} - A = Array{eltype(bm[1, 1]), 2}(undef, length(latents), length(latents)) - for j in 1:length(latents) - for i in 1:(j - 1) - A[i, j] = A[j, i] - end - for i in (j + 1):length(latents) - A[i, j] = sample(bm[latents[i], latents[j]], args...) - A[j, i] = A[i, j] - end - end - # Fill diagonal with zeros (no self-loops) - for i in 1:length(latents) - A[i, i] = zero(A[1, 2]) - end - return A -end - -# Base interface implementations for BlockModel - -function Base.getindex(s::BlockModel, i::Int, j::Int) - return s._dists[i, j] -end - -function Base.setindex!(s::BlockModel, v, i::Int, j::Int) - s._dists[i, j] = v -end - -function Base.size(s::BlockModel) - return (s._dists.k, s._dists.k) -end - -""" - getindex(bm::BlockModel, i::Real, j::Real) - -Index into the block model using latent variables ξᵢ, ξⱼ ∈ [0,1]. - -Maps latent variables to their corresponding blocks and returns the -distribution between those blocks. 
-""" -function Base.getindex(s::BlockModel, i::Real, j::Real) - k = findfirst(x -> x ≥ i, s.cum_sizes) - l = findfirst(x -> x ≥ j, s.cum_sizes) - return s._dists[k, l] -end - -function Base.setindex!(s::BlockModel, v, i::Real, j::Real) - k = findfirst(x -> x ≥ i, s.cum_sizes) - l = findfirst(x -> x ≥ j, s.cum_sizes) - s._dists[k, l] = v -end - -""" - ordered_latents(bm::BlockModel, n::Int) - -Generate `n` ordered (sorted) latent block assignments from the block model. - -# Returns -- Sorted vector of block assignments -""" -function ordered_latents(bm::BlockModel, n::Int) - return sort(map(x -> map_ξ_to_block(bm, x), rand(n))) -end - -""" - get_probability_matrix(bm::BlockModel, latents::AbstractVector, default_dist=nothing) - -Generate a node-level probability matrix from a block model and latent assignments. - -Creates an n×n matrix where entry (i,j) contains the distribution for the edge -between nodes i and j, based on their block assignments. - -# Arguments -- `bm::BlockModel`: The block model -- `latents::AbstractVector`: Block assignment for each node -- `default_dist`: Distribution for diagonal entries (defaults to zero(bm[1,1]) if not provided) - -# Returns -- `Matrix`: n×n matrix of distributions - -# Example -```julia -bm = BlockModel(3, Bernoulli(0.5)) -latents = [1, 1, 2, 2, 3] -prob_matrix = get_probability_matrix(bm, latents) -``` -""" -function get_probability_matrix( - bm::BlockModel{D}, latents::AbstractVector, default_dist = nothing) where {D} - # Set default distribution for diagonal (no self-loops) - if isnothing(default_dist) - try - default_dist = zero(bm[1, 1]) - catch e - if !isa(e, MethodError) - rethrow(e) - end - error("Please provide a default distribution for the diagonal as it could not be inferred") - end - end - n = length(latents) - A = Array{D, 2}(undef, n, n) - for j in 1:n - for i in 1:n - if i == j - A[i, i] = default_dist - else - A[i, j] = bm[latents[i], latents[j]] - end - end - end - return A -end - -""" - 
get_probability_matrix(a::Assignment, default_dist=nothing, node_labels=a.node_labels) - -Generate a node-level probability matrix from an Assignment. - -# Arguments -- `a::Assignment`: The assignment -- `default_dist`: Distribution for diagonal entries (default: nothing) -- `node_labels`: Custom node labels to use (default: a.node_labels) - -# Returns -- `Matrix`: Probability matrix based on the assignment's block structure -""" -function get_probability_matrix( - a::Assignment, default_dist = nothing, node_labels = a.node_labels) - return get_probability_matrix(BlockModel(a.θ), node_labels, default_dist) -end - -""" - align_sbm!(sbm::BlockModel, perm) - -Permute the blocks of a stochastic block model according to permutation `perm`. - -This modifies the block model in-place, reordering blocks and updating the -cumulative sizes accordingly. - -# Arguments -- `sbm::BlockModel`: The block model to modify (modified in-place) -- `perm`: Permutation vector for reordering blocks -""" -function align_sbm!(sbm::BlockModel, perm) - sbm._dists .= sbm._dists[perm, perm] - sbm.sizes .= sbm.sizes[perm] - sbm.cum_sizes .= cumsum(sbm.sizes) -end - -""" - order_groups(a::Assignment, latents::AbstractVector) - -Order the groups of an assignment according to the true latents. This is an heuristic -approach, which is not guaranteed to find the true ordering of the groups. 
-""" -function order_groups(a::Assignment, latents::AbstractVector) - n = number_nodes(a) - k = number_groups(a) - sort_perm = sortperm(latents) - sorted_group_labels = a.node_labels[sort_perm] - dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] - counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) - for group in 1:k) - return sort( - 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) -end - -function align_sbm_true_latents!(sbm::BlockModel, a::Assignment, latents) - align_sbm!(sbm, order_groups(a, latents)) -end diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index 5f09f25..f976ca6 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -174,16 +174,20 @@ function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress edge_val_2 = data[j, index2] # Update for node1: remove from group1, add to group2 - remove_realized(estimator.realized_swap[group1, group_j], edge_val_1) - remove_counts!(estimator.counts_swap, edge_val_1, group1, group_j) - add_realized(estimator.realized_swap[group2, group_j], edge_val_1) - add_counts!(estimator.counts_swap, edge_val_1, group2, group_j) + if !isnothing(edge_val_1) + remove_realized(estimator.realized_swap[group1, group_j], edge_val_1) + remove_counts!(estimator.counts_swap, edge_val_1, group1, group_j) + add_realized(estimator.realized_swap[group2, group_j], edge_val_1) + add_counts!(estimator.counts_swap, edge_val_1, group2, group_j) + end # Update for node2: remove from group2, add to group1 - remove_realized(estimator.realized_swap[group2, group_j], edge_val_2) - remove_counts!(estimator.counts_swap, edge_val_2, group2, group_j) - add_realized(estimator.realized_swap[group1, group_j], edge_val_2) - add_counts!(estimator.counts_swap, edge_val_2, group1, group_j) + if !isnothing(edge_val_2) + remove_realized(estimator.realized_swap[group2, group_j], edge_val_2) + 
remove_counts!(estimator.counts_swap, edge_val_2, group2, group_j) + add_realized(estimator.realized_swap[group1, group_j], edge_val_2) + add_counts!(estimator.counts_swap, edge_val_2, group1, group_j) + end end # Tentatively apply swap From 920a5aebfaad567c4444d4099045c7380fff7176 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 24 Oct 2025 21:59:36 +0200 Subject: [PATCH 207/266] butcher api to use fast estimator for now --- docs/literate/tutorials/simple_graph.jl | 30 ++++++++++++++------ src/NetworkHistogram.jl | 4 ++- src/api.jl | 37 +++++++++++++++++++++++++ src/estimator/abstractEstimator.jl | 2 +- src/optimization/greedy.jl | 2 +- src/utils/SymArray.jl | 7 +++++ 6 files changed, 70 insertions(+), 12 deletions(-) diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index 311bc43..08be92f 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -43,8 +43,8 @@ end # To generate a random graph from a graphon, we follow these steps: # 1. **Assign latent positions:** For a graph with `n` nodes, we sample `n` independent and identically distributed random variables $u_1, u_2, \dots, u_n$ from a Uniform(0, 1) distribution. These are the latent positions of our nodes. # 2. **Generate edges:** For each pair of nodes `(i, j)` with `i < j`, we generate a random number from a Bernoulli distribution with probability $W(u_i, u_j)$. This determines whether an edge exists between them. The resulting adjacency matrix `A` will be symmetric. -# Let's sample a graph with 400 nodes from our graphon `W`. -n = 400 +# Let's sample a graph with 2000 nodes from our graphon `W`. +n = 2000 u_true = rand(n); # Latent positions A = sample_graph(w, u_true); @@ -87,8 +87,8 @@ import NetworkHistogram: Dist, Assignment, nethist dist = NetworkHistogram.Bernoulli(0.5) # The initial probability doesn't matter much. # We start with a random initial assignment of nodes to `k=5` groups. 
-k = floor(Int, sqrt(n)) -oracle_labels = inverse_rle(1:k, fill(n ÷ k, k)) +k = 10 +oracle_labels = ordered_start_labels(n, k); initial_assignment = shuffle(oracle_labels); @@ -103,11 +103,23 @@ println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) # Let's use the `nethist` function with `GreedyParams`, which iteratively moves nodes between # groups to maximize the log-likelihood. -params_opti = NetworkHistogram.GreedyParams( - 100_000, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(2_000), false); +# params_opti = NetworkHistogram.GreedyParams( +# 100_000, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), +# NetworkHistogram.PreviousBestValue(2_000), false); -a = nethist(A, dist, initial_assignment, params_opti, false); +# a = nethist(A, dist, initial_assignment, params_opti, false); + +res = NetworkHistogram.nethist_binary_edges(A, + initial_assignment, GreedyParams( + 1_000_000, + RandomGroupSwap(), + Strict(), + PreviousBestValue(1_000, Inf, :min), + true + )); + +a = Assignment(res.node_labels, edge_list, Dist(dist)); +println("Log-likelihood after optimization: ", loglikelihood(a)) # The `Assignment` object `a` now contains the optimized node groupings and # the fitted network histogram parameters. @@ -117,7 +129,7 @@ heatmap_params(a, ordering = false, colorrange = (0, 1)) # We can convert it to a block model for easier interpretation. 
-res = NethistResult(a); +# res = NethistResult(a); let fig = Mke.Figure(size = (1220, 400)) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index e00f37f..8287509 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -24,7 +24,9 @@ include("estimator/abstractEstimator.jl") include("estimator/SpectralEstimator.jl") include("api.jl") -export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf +export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf, + GreedyParams, nethist, nethist_discrete_edges, ordered_start_labels, RandomGroupSwap, + Strict, PreviousBestValue, nethist_binary_edges function from_adjs_to_decorated end diff --git a/src/api.jl b/src/api.jl index 2c152b0..e409ab5 100644 --- a/src/api.jl +++ b/src/api.jl @@ -113,6 +113,43 @@ function postprocess(out) return out end +function nethist_discrete_edges(A, initial_node_labels, params::GreedyParams, + k = length(unique(initial_node_labels))) + data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) + m = length(unique(data)) + es = SumGreedyEstimator( + counts_main, counts_swap, realized, realized_swap, + params.max_iter, params.swap_rule, params.stop_rule) + node_labels = estimate(es, data, initial_node_labels) + sizes = counts(node_labels) ./ length(node_labels) + + parameters = similar(es.realized) + @inbounds for j in 1:k, i in 1:k + parameters[i, j] = [es.realized[i, j][c] / es.counts[i, j] for c in 1:m] + end + model = DecoratedSBM(Categorical.(parameters), sizes) + return NethistResult(node_labels, model) +end + +function nethist_binary_edges(A, initial_node_labels, params::GreedyParams, + k = length(unique(initial_node_labels))) + data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) + + es = SumGreedyEstimator( + counts_main, counts_swap, realized, realized_swap, + params.max_iter, params.swap_rule, params.stop_rule) + node_labels = estimate(es, data, initial_node_labels) + 
sizes = counts(node_labels) ./ length(node_labels) + + θ = Matrix{Float64}(undef, k, k) + @inbounds for j in 1:k, i in 1:k + θ[i, j] = es.realized[i, j][2] / es.counts[i, j] + end + model = SBM(θ, sizes) + + return NethistResult(node_labels, model) +end + # functions for postprocessing struct NethistResult{S} diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index f976ca6..aa83f08 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -226,7 +226,7 @@ function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress # Check stopping criterion if stopping_rule(current_loss, estimator.stop_rule) - @info "Stopping criterion met at iteration $iter" + @info "Stopping criterion met at iteration $iter with loss $current_loss" finish!(pbar) break end diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl index 7dcec74..1cabc85 100644 --- a/src/optimization/greedy.jl +++ b/src/optimization/greedy.jl @@ -21,7 +21,7 @@ params = GreedyParams() # Custom parameters with stricter stopping params = GreedyParams( - 50_000, # max iterations + 1_000_000, # max iterations RandomNodeSwap(), # random node selection Strict(), # only accept improvements PreviousBestValue(5000), # stop after 5000 iterations without improvement diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 799e37f..fccb067 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -298,6 +298,13 @@ function Base.similar( return SymArray(similar(A.uppertrian, ElType)) end +# Custom similar for broadcasted SymArrays +function Base.similar( + bc::Broadcast.Broadcasted{SymArrayStyle}, ::Type{Nothing}) + A = find_symarray(bc) + return similar(Array{Nothing}, axes(bc)) +end + # Helper function to find a SymArray in the broadcast tree find_symarray(bc::Broadcast.Broadcasted) = find_symarray(bc.args) find_symarray(args::Tuple) = find_symarray(args[1], Base.tail(args)) From 
ee2e80d4c69471675c4c2980113081716c9424a7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sat, 25 Oct 2025 00:47:39 +0200 Subject: [PATCH 208/266] add bare version of multiplex --- docs/Project.toml | 1 + docs/literate/tutorials/multiplex_network.jl | 87 +++- docs/literate/tutorials/simple_graph.jl | 2 +- src/NetworkHistogram.jl | 2 + src/api.jl | 28 +- src/estimator/GreedyAverage.jl | 355 ++++++++++++++++ src/estimator/abstractEstimator.jl | 425 +------------------ 7 files changed, 463 insertions(+), 437 deletions(-) create mode 100644 src/estimator/GreedyAverage.jl diff --git a/docs/Project.toml b/docs/Project.toml index 53fbe89..e053bd3 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -11,4 +11,5 @@ Kneedle = "4ef9287f-f14a-4b13-b4c1-9bb5ae54399a" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index cd85fb2..1fffc59 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -1,6 +1,91 @@ #= # Decorated Graphon Tutorial for Multiplex Networks =# +using NetworkHistogram +using Distributions +using StaticArrays +import CairoMakie as Mke +using Random +Random.seed!(1234); +h = 300; -# # How to use NetworkHistogram.jl for Multiplex Networks +function W_multiplex(x, y) + ps = zeros(4) + ps[2] = sqrt(abs(x - y)) / 2 # layer 1 only + ps[3] = abs(sin(2π * x) * sin(2π * y)) / 2 # layer 2 only + ps[4] = min(x, y) / 4 # both layers + ps[1] = 1 - sum(ps[2:4]) # no edge + return DiscreteNonParametric(0:3, SVector{4}(ps)) +end + +graphon = DecoratedGraphon(W_multiplex) + +let + fig = Mke.Figure(size = (4 * h, h)) + for m in 1:4 + ax = Mke.Axis(fig[1, m], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, 
graphon, k = m, colormap = :binary, colorrange = (0, 1)) + end + fig +end + +n = 1000 +true_latents = range(0, 1; length = n) +A = sample_graph(graphon, true_latents); + +k = 20 +oracle_labels = ordered_start_labels(n, k); +initial_labels = shuffle(oracle_labels); + +res = NetworkHistogram.nethist_discrete_edges(A, + initial_labels, GreedyParams( + 1_000_000, + RandomGroupSwap(), + Strict(), + PreviousBestValue(5_000, Inf, :min), + true + )); + +let + fig = Mke.Figure(size = (4 * h, h)) + for m in 1:4 + ax = Mke.Axis(fig[1, m], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, res.model, k = m, colormap = :binary, colorrange = (0, 1)) + end + fig +end + +NetworkHistogram.align_res_true_latents!(res, oracle_labels); +let + fig = Mke.Figure(size = (4 * h, h)) + for m in 1:4 + ax = Mke.Axis(fig[1, m], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, res.model, k = m, colormap = :binary, colorrange = (0, 1)) + end + fig +end + +using Clustering +shape_range = 1:20 +ssm_estimated, criterion_values = Graphons.estimate_ssm( + res.model, A, true_latents, shape_range); + +using Kneedle +# kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +# # Let's extract the optimal number of shapes using the Kneedle algorithm: + +# k_knee = knees(kr)[1] +# k_knee = 10 +ssm = SSM(res.model, k_knee) + +let + fig = Mke.Figure(size = (4 * h, 3 * h)) + for (i, model) in enumerate([graphon, res.model, ssm]) + for m in 1:4 + ax = Mke.Axis(fig[i, m], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, model, k = m, colormap = :binary, colorrange = (0, 1)) + end + end + fig +end diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index 08be92f..0e93a2d 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -146,7 +146,7 @@ end # the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. 
-NetworkHistogram.align_res_true_latents!(res, a, oracle_estimator.node_labels); +NetworkHistogram.align_res_true_latents!(res, oracle_estimator.node_labels); # and display the true function, the oracle estimator, and the fitted model let diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 8287509..6cc353b 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -20,6 +20,8 @@ include("distributions/include.jl") include("EdgeList.jl") include("assignment.jl") include("optimization/greedy.jl") +include("preprocessor/categorical.jl") +include("preprocessor/continuous.jl") include("estimator/abstractEstimator.jl") include("estimator/SpectralEstimator.jl") include("api.jl") diff --git a/src/api.jl b/src/api.jl index e409ab5..8db95bb 100644 --- a/src/api.jl +++ b/src/api.jl @@ -117,17 +117,17 @@ function nethist_discrete_edges(A, initial_node_labels, params::GreedyParams, k = length(unique(initial_node_labels))) data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) m = length(unique(data)) - es = SumGreedyEstimator( + es = GreedyAverage( counts_main, counts_swap, realized, realized_swap, params.max_iter, params.swap_rule, params.stop_rule) node_labels = estimate(es, data, initial_node_labels) sizes = counts(node_labels) ./ length(node_labels) - parameters = similar(es.realized) + parameters = Matrix{SVector{m, Float64}}(undef, k, k) @inbounds for j in 1:k, i in 1:k parameters[i, j] = [es.realized[i, j][c] / es.counts[i, j] for c in 1:m] end - model = DecoratedSBM(Categorical.(parameters), sizes) + model = DecoratedSBM(DiscreteNonParametric.(Ref(0:(m - 1)), parameters), sizes) return NethistResult(node_labels, model) end @@ -135,7 +135,7 @@ function nethist_binary_edges(A, initial_node_labels, params::GreedyParams, k = length(unique(initial_node_labels))) data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) - es = SumGreedyEstimator( + es = GreedyAverage( counts_main, counts_swap, 
realized, realized_swap, params.max_iter, params.swap_rule, params.stop_rule) node_labels = estimate(es, data, initial_node_labels) @@ -182,10 +182,10 @@ function _label_to_latent(label::Int, sbm) return sbm.cumsize[label] - eps() end -function align_res_true_latents!(res, a::Assignment, latents) - perm = order_groups(a, latents) +function align_res_true_latents!(res, latents) + perm = order_groups(res.node_labels, latents) permute!(res.model, perm) - res.node_labels .= map(x -> findfirst(==(x), perm), a.node_labels) + res.node_labels .= map(x -> findfirst(==(x), perm), res.node_labels) end function permute!(sbm, perm) @@ -195,17 +195,13 @@ function permute!(sbm, perm) sbm.cumsize .= cumsum(sbm.size) end -""" - order_groups(a::Assignment, latents::AbstractVector) +order_groups(a::Assignment, latents::AbstractVector) = order_groups(a.node_labels, latents) -Order the groups of an assignment according to the true latents. This is an heuristic -approach, which is not guaranteed to find the true ordering of the groups. -""" -function order_groups(a::Assignment, latents::AbstractVector) - n = number_nodes(a) - k = number_groups(a) +function order_groups(node_labels, latents::AbstractVector) + n = length(node_labels) + k = length(unique(node_labels)) sort_perm = sortperm(latents) - sorted_group_labels = a.node_labels[sort_perm] + sorted_group_labels = node_labels[sort_perm] dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) for group in 1:k) diff --git a/src/estimator/GreedyAverage.jl b/src/estimator/GreedyAverage.jl new file mode 100644 index 0000000..68a591d --- /dev/null +++ b/src/estimator/GreedyAverage.jl @@ -0,0 +1,355 @@ +""" + GreedyAverage{C, S, NodeR, StopR} + +Greedy optimization estimator for Stochastic Block Models using sum-of-squares loss. 
+ +This estimator uses a greedy node-swapping algorithm to minimize the loss function: + L = (1/n_edges) * Σᵢⱼ [count(i,j) - ||realized(i,j)||²/count(i,j)] + +The algorithm iteratively swaps nodes between groups to improve the block model fit. + +# Type Parameters +- `C`: Type for count matrices (usually symmetric array of integers) +- `S`: Type for realized value matrices (usually symmetric array of vectors) +- `NodeR <: NodeSwapRule`: Rule for selecting which nodes to swap +- `StopR <: StopRule`: Rule for determining when to stop optimization + +# Fields +- `counts::C`: Number of possible edges between each pair of groups +- `counts_swap::C`: Working copy of counts for swap evaluation +- `realized::S`: Sum of observed edge values between each pair of groups +- `realized_swap::S`: Working copy of realized values for swap evaluation +- `max_iter::Int`: Maximum number of iterations +- `node_swap_rule::NodeR`: Strategy for selecting nodes to swap +- `stop_rule::StopR`: Criterion for early stopping + +# Example +```julia +k = 5 # number of groups +counts = SymArray(k, 0) +counts_swap = SymArray(k, 0) +realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) +realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) + +estimator = GreedyAverage( + counts, counts_swap, realized, realized_swap, + max_iter=100_000, + node_swap_rule=RandomGroupSwap(), + stop_rule=PreviousBestValue(1000, Inf, :min) +) + +labels = estimate(estimator, data, initial_labels) +``` +""" +struct GreedyAverage{C, S, NodeR <: NodeSwapRule, StopR <: StopRule} <: SBMEstimator + counts::C + counts_swap::C + realized::S + realized_swap::S + max_iter::Int + node_swap_rule::NodeR + stop_rule::StopR +end + +""" + score(estimator::GreedyAverage) + +Compute the current objective value (loss) for the estimator. + +Lower values indicate better fit to a block model structure. 
+""" +function score(estimator::GreedyAverage) + return loss_function(estimator.realized, estimator.counts) +end + +""" + init!(estimator::GreedyAverage, data, initial_labels) + +Initialize the estimator's count and realized value matrices from data. + +Iterates through the upper triangle of the adjacency matrix (i < j) to avoid +double-counting edges in undirected graphs. Updates both the main and swap +workspace matrices. + +# Arguments +- `estimator::GreedyAverage`: The estimator to initialize +- `data::AbstractMatrix`: Network adjacency matrix +- `initial_labels::Vector{Int}`: Initial group assignments for nodes +""" +function init!(estimator::GreedyAverage, data, initial_labels) + # Iterate over upper triangle to avoid double-counting edges + @inbounds for j in axes(data, 2) + label_j = initial_labels[j] + for i in 1:(j - 1) # More efficient than i < j check inside loop + edge_value = data[i, j] + if !isnothing(edge_value) + label_i = initial_labels[i] + + # Update both main and swap workspaces + add_realized(estimator.realized[label_i, label_j], edge_value) + add_realized(estimator.realized_swap[label_i, label_j], edge_value) + add_counts!(estimator.counts, edge_value, label_i, label_j) + add_counts!(estimator.counts_swap, edge_value, label_i, label_j) + end + end + end +end + +""" + estimate(estimator::GreedyAverage, data, initial_labels; progress=true) + +Estimate node group assignments using greedy optimization with node swapping. + +# Algorithm +The algorithm proceeds as follows: +1. Initialize count and realized value matrices from data and initial labels +2. For each iteration: + a. Select two nodes to swap according to the swap rule + b. Tentatively swap them and update statistics + c. Accept swap if it improves the loss, otherwise revert + d. Check stopping criterion +3. 
Return final node labels + +# Arguments +- `estimator::GreedyAverage`: The estimator with configuration +- `data::AbstractMatrix`: Network adjacency matrix (n × n) +- `initial_labels::Vector{Int}`: Initial group assignments (length n) +- `progress::Bool`: Whether to show progress bar (default: true) + +# Returns +- `node_labels::Vector{Int}`: Optimized group assignments for each node +""" +function estimate(estimator::GreedyAverage, data, initial_labels; progress = true) + # Initialize counts and realized values from data + init!(estimator, data, initial_labels) + initialise_stop_rule!(estimator.stop_rule, estimator) + + # Compute initial loss + current_loss = score(estimator) + + # Start with initial labeling + node_labels = copy(initial_labels) + + # Progress tracking + pbar = ProgressUnknown( + enabled = progress, + showspeed = true, + desc = "Greedy search: " + ) + + # Update progress bar only every N iterations to reduce overhead + progress_update_interval = max(1, estimator.max_iter ÷ 1000) + + # Main optimization loop + for iter in 1:(estimator.max_iter) + # Select two nodes to potentially swap + index1, index2 = select_indices_swap(node_labels, estimator.node_swap_rule) + + group1 = node_labels[index1] + group2 = node_labels[index2] + + # Only process if nodes are in different groups + if group1 != group2 + # Update swap workspace to reflect the proposed swap + # Using @inbounds for performance - loop bounds are guaranteed safe + @inbounds for j in axes(data, 1) + # Skip the swapped nodes themselves + if j == index1 || j == index2 + continue + end + + group_j = node_labels[j] + edge_val_1 = data[j, index1] + edge_val_2 = data[j, index2] + + # Update for node1: remove from group1, add to group2 + # TODO: duplicate for each edge and only iterate over non-zeros (i.e. edge with value and nothing!) 
+ if !isnothing(edge_val_1) + remove_realized(estimator.realized_swap[group1, group_j], edge_val_1) + remove_counts!(estimator.counts_swap, edge_val_1, group1, group_j) + add_realized(estimator.realized_swap[group2, group_j], edge_val_1) + add_counts!(estimator.counts_swap, edge_val_1, group2, group_j) + end + + # Update for node2: remove from group2, add to group1 + if !isnothing(edge_val_2) + remove_realized(estimator.realized_swap[group2, group_j], edge_val_2) + remove_counts!(estimator.counts_swap, edge_val_2, group2, group_j) + add_realized(estimator.realized_swap[group1, group_j], edge_val_2) + add_counts!(estimator.counts_swap, edge_val_2, group1, group_j) + end + end + + # Tentatively apply swap + node_labels[index1] = group2 + node_labels[index2] = group1 + + # Compute new loss + new_loss = loss_function(estimator.realized_swap, estimator.counts_swap) + + # Accept or reject swap + if new_loss < current_loss + # Accept: commit swap to main workspace + deepcopy!(estimator.realized, estimator.realized_swap) + copy!(estimator.counts, estimator.counts_swap) + current_loss = new_loss + else + # Reject: revert labels and workspace + node_labels[index1] = group1 + node_labels[index2] = group2 + deepcopy!(estimator.realized_swap, estimator.realized) + copy!(estimator.counts_swap, estimator.counts) + end + end + + # Update progress bar + + # Update progress bar only periodically to reduce overhead + if progress && (iter % progress_update_interval == 0 || iter == estimator.max_iter) + update!( + pbar, iter; + showvalues = [ + ("loss", current_loss), + info_to_print(estimator.stop_rule) + ]) + end + + # Check stopping criterion + if stopping_rule(current_loss, estimator.stop_rule) + @info "Stopping criterion met at iteration $iter with loss $current_loss" + finish!(pbar) + break + end + end + + return node_labels +end + +""" + loss_function(realized, counts) + +Compute the normalized sum-of-squares loss for block model fitting. 
+ +The loss measures how well a block model fits the data by computing: + L = (1/N) * Σᵢⱼ [count(i,j) - ||realized(i,j)||²/count(i,j)] + +where the sum is over the upper triangle (i ≤ j) to avoid double-counting. + +# Mathematical Interpretation +For each pair of groups (i,j): +- `count(i,j)` is the number of edges between groups i and j +- `realized(i,j)` is a vector of observed edge values +- The term `||realized(i,j)||²/count(i,j)` measures concentration of values +- Lower loss indicates better block structure (more homogeneous within blocks) + +# Arguments +- `realized`: Symmetric array of realized edge value sums between groups +- `counts`: Symmetric array of edge counts between groups + +# Returns +- Normalized loss value (lower is better) + +# !warning + This will need to be modified for other data types! +""" +@inline function loss_function(realized, counts::AbstractArray{<:Real}) + total_loss = 0.0 + total_edges = 0.0 + + @inbounds for j in axes(realized, 2) + for i in 1:j + n_edges = counts[i, j] + if n_edges > 0 + sum_squares = sum(abs2, realized[i, j]) + total_loss += n_edges - sum_squares / n_edges + total_edges += n_edges + end + end + end + + return total_edges > 0 ? 
total_loss / total_edges : 0.0 +end + +# this assumes that sum realized = counts +@inline function loss_function(realized, counts) + total_loss = 0.0 + total_edges = 0.0 + @inbounds for j in axes(realized, 2) + for i in 1:j + for m in eachindex(realized[i, j]) + total_edges += realized[i, j][m] + total_loss += realized[i, j][m] * + (1 - + _fast_div_(realized[i, j][m], counts[i, j][m])) + end + end + end + return total_loss / total_edges +end + +@inline function _fast_div_(num::Real, denom::Real) + num == 0.0 && denom == 0.0 && return 0.0 + return num / denom +end + +# ============================================================================ +# Count manipulation helpers +# ============================================================================ + +""" + add_realized(parameter::AbstractArray, data_value::AbstractArray) + +Add array data value to parameter array (for categorical edge values). +""" +@inline function add_realized(parameter::AbstractArray, data_value::AbstractArray) + @inbounds parameter .+= data_value +end + +""" + remove_realized(parameter::AbstractArray, data_value::AbstractArray) + +Remove array data value from parameter array (for categorical edge values). +""" +@inline function remove_realized(parameter::AbstractArray, data_value::AbstractArray) + @inbounds parameter .-= data_value +end + +""" + add_realized(parameter::AbstractArray, data_value::Real) + +Increment the count for a specific category (for categorical edge values). +""" +@inline function add_realized(parameter::AbstractArray, data_value::Real) + @inbounds parameter[data_value] += 1 +end + +""" + remove_realized(parameter::AbstractArray, data_value::Real) + +Decrement the count for a specific category (for categorical edge values). 
+""" +@inline function remove_realized(parameter::AbstractArray, data_value::Real) + @inbounds parameter[data_value] -= 1 +end + +@inline function add_counts!( + counts::AbstractArray{T}, data_value::Real, group_i::Int, group_j::Int) where {T <: + Real} + @inbounds counts[group_i, group_j] += one(T) +end + +@inline function remove_counts!( + counts::AbstractArray{T}, data_value::Real, group_i::Int, group_j::Int) where {T <: + Real} + @inbounds counts[group_i, group_j] -= one(T) +end + +@inline function add_counts!( + counts::AbstractArray, data_value, group_i::Int, group_j::Int) + @inbounds counts[group_i, group_j] .+= 1#data_value +end + +@inline function remove_counts!( + counts::AbstractArray, data_value, group_i::Int, group_j::Int) + @inbounds counts[group_i, group_j] .-= 1#data_value +end diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index aa83f08..75374f1 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -9,424 +9,11 @@ All concrete estimator types should implement: """ abstract type SBMEstimator end -""" - SumGreedyEstimator{C, S, NodeR, StopR} - -Greedy optimization estimator for Stochastic Block Models using sum-of-squares loss. - -This estimator uses a greedy node-swapping algorithm to minimize the loss function: - L = (1/n_edges) * Σᵢⱼ [count(i,j) - ||realized(i,j)||²/count(i,j)] - -The algorithm iteratively swaps nodes between groups to improve the block model fit. 
- -# Type Parameters -- `C`: Type for count matrices (usually symmetric array of integers) -- `S`: Type for realized value matrices (usually symmetric array of vectors) -- `NodeR <: NodeSwapRule`: Rule for selecting which nodes to swap -- `StopR <: StopRule`: Rule for determining when to stop optimization - -# Fields -- `counts::C`: Number of possible edges between each pair of groups -- `counts_swap::C`: Working copy of counts for swap evaluation -- `realized::S`: Sum of observed edge values between each pair of groups -- `realized_swap::S`: Working copy of realized values for swap evaluation -- `max_iter::Int`: Maximum number of iterations -- `node_swap_rule::NodeR`: Strategy for selecting nodes to swap -- `stop_rule::StopR`: Criterion for early stopping - -# Example -```julia -k = 5 # number of groups -counts = SymArray(k, 0) -counts_swap = SymArray(k, 0) -realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) -realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) - -estimator = SumGreedyEstimator( - counts, counts_swap, realized, realized_swap, - max_iter=100_000, - node_swap_rule=RandomGroupSwap(), - stop_rule=PreviousBestValue(1000, Inf, :min) -) - -labels = estimate(estimator, data, initial_labels) -``` -""" -struct SumGreedyEstimator{C, S, NodeR <: NodeSwapRule, StopR <: StopRule} <: SBMEstimator - counts::C - counts_swap::C - realized::S - realized_swap::S - max_iter::Int - node_swap_rule::NodeR - stop_rule::StopR -end - -""" - score(estimator::SumGreedyEstimator) - -Compute the current objective value (loss) for the estimator. - -Lower values indicate better fit to a block model structure. -""" -function score(estimator::SumGreedyEstimator) - return loss_function(estimator.realized, estimator.counts) -end - -""" - init!(estimator::SumGreedyEstimator, data, initial_labels) - -Initialize the estimator's count and realized value matrices from data. 
- -Iterates through the upper triangle of the adjacency matrix (i < j) to avoid -double-counting edges in undirected graphs. Updates both the main and swap -workspace matrices. - -# Arguments -- `estimator::SumGreedyEstimator`: The estimator to initialize -- `data::AbstractMatrix`: Network adjacency matrix -- `initial_labels::Vector{Int}`: Initial group assignments for nodes -""" -function init!(estimator::SumGreedyEstimator, data, initial_labels) - # Iterate over upper triangle to avoid double-counting edges - @inbounds for j in axes(data, 2) - label_j = initial_labels[j] - for i in 1:(j - 1) # More efficient than i < j check inside loop - edge_value = data[i, j] - if !isnothing(edge_value) - label_i = initial_labels[i] - - # Update both main and swap workspaces - add_realized(estimator.realized[label_i, label_j], edge_value) - add_realized(estimator.realized_swap[label_i, label_j], edge_value) - add_counts!(estimator.counts, edge_value, label_i, label_j) - add_counts!(estimator.counts_swap, edge_value, label_i, label_j) - end - end - end -end - -""" - estimate(estimator::SumGreedyEstimator, data, initial_labels; progress=true) - -Estimate node group assignments using greedy optimization with node swapping. - -# Algorithm -The algorithm proceeds as follows: -1. Initialize count and realized value matrices from data and initial labels -2. For each iteration: - a. Select two nodes to swap according to the swap rule - b. Tentatively swap them and update statistics - c. Accept swap if it improves the loss, otherwise revert - d. Check stopping criterion -3. 
Return final node labels - -# Arguments -- `estimator::SumGreedyEstimator`: The estimator with configuration -- `data::AbstractMatrix`: Network adjacency matrix (n × n) -- `initial_labels::Vector{Int}`: Initial group assignments (length n) -- `progress::Bool`: Whether to show progress bar (default: true) - -# Returns -- `node_labels::Vector{Int}`: Optimized group assignments for each node -""" -function estimate(estimator::SumGreedyEstimator, data, initial_labels; progress = true) - # Initialize counts and realized values from data - init!(estimator, data, initial_labels) - initialise_stop_rule!(estimator.stop_rule, estimator) - - # Compute initial loss - current_loss = score(estimator) - - # Start with initial labeling - node_labels = copy(initial_labels) - - # Progress tracking - pbar = ProgressUnknown( - enabled = progress, - showspeed = true, - desc = "Greedy search: " - ) - - # Update progress bar only every N iterations to reduce overhead - progress_update_interval = max(1, estimator.max_iter ÷ 1000) - - # Main optimization loop - for iter in 1:(estimator.max_iter) - # Select two nodes to potentially swap - index1, index2 = select_indices_swap(node_labels, estimator.node_swap_rule) - - group1 = node_labels[index1] - group2 = node_labels[index2] - - # Only process if nodes are in different groups - if group1 != group2 - # Update swap workspace to reflect the proposed swap - # Using @inbounds for performance - loop bounds are guaranteed safe - @inbounds for j in axes(data, 1) - # Skip the swapped nodes themselves - if j == index1 || j == index2 - continue - end - - group_j = node_labels[j] - edge_val_1 = data[j, index1] - edge_val_2 = data[j, index2] - - # Update for node1: remove from group1, add to group2 - if !isnothing(edge_val_1) - remove_realized(estimator.realized_swap[group1, group_j], edge_val_1) - remove_counts!(estimator.counts_swap, edge_val_1, group1, group_j) - add_realized(estimator.realized_swap[group2, group_j], edge_val_1) - 
add_counts!(estimator.counts_swap, edge_val_1, group2, group_j) - end - - # Update for node2: remove from group2, add to group1 - if !isnothing(edge_val_2) - remove_realized(estimator.realized_swap[group2, group_j], edge_val_2) - remove_counts!(estimator.counts_swap, edge_val_2, group2, group_j) - add_realized(estimator.realized_swap[group1, group_j], edge_val_2) - add_counts!(estimator.counts_swap, edge_val_2, group1, group_j) - end - end - - # Tentatively apply swap - node_labels[index1] = group2 - node_labels[index2] = group1 - - # Compute new loss - new_loss = loss_function(estimator.realized_swap, estimator.counts_swap) - - # Accept or reject swap - if new_loss < current_loss - # Accept: commit swap to main workspace - deepcopy!(estimator.realized, estimator.realized_swap) - copy!(estimator.counts, estimator.counts_swap) - current_loss = new_loss - else - # Reject: revert labels and workspace - node_labels[index1] = group1 - node_labels[index2] = group2 - deepcopy!(estimator.realized_swap, estimator.realized) - copy!(estimator.counts_swap, estimator.counts) - end - end - - # Update progress bar - - # Update progress bar only periodically to reduce overhead - if progress && (iter % progress_update_interval == 0 || iter == estimator.max_iter) - update!( - pbar, iter; - showvalues = [ - ("loss", current_loss), - info_to_print(estimator.stop_rule) - ]) - end - - # Check stopping criterion - if stopping_rule(current_loss, estimator.stop_rule) - @info "Stopping criterion met at iteration $iter with loss $current_loss" - finish!(pbar) - break - end - end - - return node_labels -end - -""" - loss_function(realized, counts) - -Compute the normalized sum-of-squares loss for block model fitting. - -The loss measures how well a block model fits the data by computing: - L = (1/N) * Σᵢⱼ [count(i,j) - ||realized(i,j)||²/count(i,j)] - -where the sum is over the upper triangle (i ≤ j) to avoid double-counting. 
- -# Mathematical Interpretation -For each pair of groups (i,j): -- `count(i,j)` is the number of edges between groups i and j -- `realized(i,j)` is a vector of observed edge values -- The term `||realized(i,j)||²/count(i,j)` measures concentration of values -- Lower loss indicates better block structure (more homogeneous within blocks) - -# Arguments -- `realized`: Symmetric array of realized edge value sums between groups -- `counts`: Symmetric array of edge counts between groups - -# Returns -- Normalized loss value (lower is better) - -# !warning - This will need to be modified for other data types! -""" -@inline function loss_function(realized, counts::AbstractArray{<:Real}) - total_loss = 0.0 - total_edges = 0.0 - - @inbounds for j in axes(realized, 2) - for i in 1:j - n_edges = counts[i, j] - if n_edges > 0 - sum_squares = sum(abs2, realized[i, j]) - total_loss += n_edges - sum_squares / n_edges - total_edges += n_edges - end - end - end - - return total_edges > 0 ? total_loss / total_edges : 0.0 -end - -# this assumes that sum realized = counts -@inline function loss_function(realized, counts) - total_loss = 0.0 - total_edges = 0.0 - @inbounds for j in axes(realized, 2) - for i in 1:j - for m in eachindex(realized[i, j]) - total_edges += realized[i, j][m] - total_loss += realized[i, j][m] * - (1 - - _fast_div_(realized[i, j][m], counts[i, j][m])) - end - end - end - return total_loss / total_edges -end - -@inline function _fast_div_(num::Real, denom::Real) - num == 0.0 && denom == 0.0 && return 0.0 - return num / denom -end - -# ============================================================================ -# Count manipulation helpers -# ============================================================================ - -""" - add_realized(parameter::AbstractArray, data_value::AbstractArray) - -Add array data value to parameter array (for categorical edge values). 
-""" -@inline function add_realized(parameter::AbstractArray, data_value::AbstractArray) - @inbounds parameter .+= data_value -end - -""" - remove_realized(parameter::AbstractArray, data_value::AbstractArray) - -Remove array data value from parameter array (for categorical edge values). -""" -@inline function remove_realized(parameter::AbstractArray, data_value::AbstractArray) - @inbounds parameter .-= data_value -end - -""" - add_realized(parameter::AbstractArray, data_value::Real) - -Increment the count for a specific category (for categorical edge values). -""" -@inline function add_realized(parameter::AbstractArray, data_value::Real) - @inbounds parameter[data_value] += 1 -end - -""" - remove_realized(parameter::AbstractArray, data_value::Real) - -Decrement the count for a specific category (for categorical edge values). -""" -@inline function remove_realized(parameter::AbstractArray, data_value::Real) - @inbounds parameter[data_value] -= 1 -end - -@inline function add_counts!( - counts::AbstractArray{T}, data_value::Real, group_i::Int, group_j::Int) where {T <: - Real} - @inbounds counts[group_i, group_j] += one(T) -end - -@inline function remove_counts!( - counts::AbstractArray{T}, data_value::Real, group_i::Int, group_j::Int) where {T <: - Real} - @inbounds counts[group_i, group_j] -= one(T) -end - -@inline function add_counts!( - counts::AbstractArray, data_value, group_i::Int, group_j::Int) - @inbounds counts[group_i, group_j] .+= 1#data_value -end - -@inline function remove_counts!( - counts::AbstractArray, data_value, group_i::Int, group_j::Int) - @inbounds counts[group_i, group_j] .-= 1#data_value -end - -# ============================================================================ -# Data preparation utilities -# ============================================================================ - -""" - prepare_data_cat(A::AbstractMatrix{<:Real}, k; m=length(unique(A)), has_zero=zero(eltype(A)) in A) - -Prepare categorical network data for 
SumGreedyEstimator. - -Creates the necessary data structures (count matrices and realized value tensors) -for estimating a categorical Stochastic Block Model with k groups. - -# Arguments -- `A::AbstractMatrix{<:Real}`: Adjacency matrix with categorical edge values -- `k::Int`: Number of groups to partition nodes into -- `m::Int`: Number of edge categories (default: inferred from unique values in A) -- `has_zero::Bool`: Whether the data contains zero values (default: auto-detected) - -# Returns -A tuple containing: -- `data`: Preprocessed adjacency matrix (shifted if zero-indexed) -- `counts`: Symmetric k×k array for edge counts (initialized to 0) -- `counts_swap`: Workspace copy of counts for swap evaluation -- `realized`: Symmetric k×k array of m-dimensional count vectors (initialized to 0) -- `realized_swap`: Workspace copy of realized for swap evaluation - -# Example -```julia -# Network with 3 edge types (0, 1, 2) for no edge, layer 1, layer 2 -A = rand(0:2, 100, 100) -A = (A + A') .÷ 2 # Make symmetric - -data, counts, counts_swap, realized, realized_swap = prepare_data_cat(A, k=5) -``` - -# Notes -- If data contains zeros, they are shifted to 1-indexing for categorical representation -- The realized arrays use StaticArrays.MVector for performance -- The symmetric array structure avoids redundant storage -""" -function prepare_data_cat( - A::AbstractMatrix{<:Real}, - k::Int; - m::Int = length(unique(A)), - has_zero::Bool = zero(eltype(A)) in A -) - @debug "Preparing data for categorical SBM with $m categories and $k groups." - - # Adjust data if zero-indexed (shift to 1-indexing for Julia) - if has_zero - @debug "Data contains zero values, using 1-based indexing." 
- data = A .+ 1 - else - data = A - end - - # Initialize count matrices - counts = SymArray(k, 0) - counts_swap = SymArray(k, 0) +abstract type Result end - # Initialize realized value tensors (k×k matrices of m-dimensional vectors) - realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) - realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) +# struct NethistResult{L, M} <: Result +# labels::L +# model::M +# end - return data, counts, counts_swap, realized, realized_swap -end +include("GreedyAverage.jl") From 40a94ed8fe65bf5b28007fd4016581dbf1cac573 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 26 Oct 2025 13:31:57 +0100 Subject: [PATCH 209/266] add suff stats greedy --- Project.toml | 9 +- docs/literate/tutorials/multiplex_network.jl | 29 +- docs/literate/tutorials/weighted_network.jl | 59 +++- docs/src/tutorials/multiplex_network.md | 97 +++++- docs/src/tutorials/simple_graph.md | 61 ++-- docs/src/tutorials/weighted_network.md | 32 +- ext/DiscretizeExt.jl | 15 - src/NetworkHistogram.jl | 11 +- src/api.jl | 49 ++-- src/distributions/hist_dist.jl | 61 ++++ src/estimator/GreedyAverage.jl | 46 ++- src/estimator/GreedySuffStats.jl | 292 +++++++++++++++++++ src/estimator/abstractEstimator.jl | 1 + src/optimization/config_rules/swap_rule.jl | 2 +- src/preprocessor/abstractConvertor.jl | 55 ++++ src/preprocessor/categorical.jl | 67 +++++ src/preprocessor/continuous.jl | 0 src/utils/SymArray.jl | 24 +- test/test_symarray.jl | 17 +- 19 files changed, 790 insertions(+), 137 deletions(-) delete mode 100644 ext/DiscretizeExt.jl create mode 100644 src/distributions/hist_dist.jl create mode 100644 src/estimator/GreedySuffStats.jl create mode 100644 src/preprocessor/abstractConvertor.jl create mode 100644 src/preprocessor/categorical.jl create mode 100644 src/preprocessor/continuous.jl diff --git a/Project.toml b/Project.toml index 847dbe6..ee20ed2 100644 --- a/Project.toml +++ b/Project.toml @@ -4,10 +4,13 @@ version = "0.5.2" authors = ["Charles 
Dufour", "Jake Grainger"] [deps] +Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" +IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -19,20 +22,20 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [weakdeps] Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" -DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" LightMC = "b58f5c6e-c887-41d6-b553-02118416cd5d" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" [extensions] BootstrapExt = "Bootstrap" -DiscretizeExt = "DiscretizeDistributions" LightMCExt = "LightMC" MakieExt = "Makie" [compat] +Accessors = "0.1.42" ArgCheck = "2.5.0" Clustering = "0.15.8" -Graphons = "0.1.0" +DiscretizeDistributions = "0.1.2" +IntervalArithmetic = "1.0.1" LinearAlgebra = "1.12.0" Reexport = "1.2.2" diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index 1fffc59..989424d 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -14,7 +14,7 @@ function W_multiplex(x, y) ps = zeros(4) ps[2] = sqrt(abs(x - y)) / 2 # layer 1 only ps[3] = abs(sin(2π * x) * sin(2π * y)) / 2 # layer 2 only - ps[4] = min(x, y) / 4 # both layers + ps[4] = min(x, y) / 2 # both layers ps[1] = 1 - sum(ps[2:4]) # no edge return DiscreteNonParametric(0:3, SVector{4}(ps)) end @@ -44,17 +44,10 @@ res = NetworkHistogram.nethist_discrete_edges(A, RandomGroupSwap(), Strict(), PreviousBestValue(5_000, Inf, :min), - true + false # progress bar )); -let - fig = Mke.Figure(size = (4 * h, h)) - for m in 1:4 - ax = 
Mke.Axis(fig[1, m], aspect = Mke.DataAspect()) - Mke.heatmap!(ax, res.model, k = m, colormap = :binary, colorrange = (0, 1)) - end - fig -end +# Visualize the fitted models for different numbers of groups after aligning with true latents NetworkHistogram.align_res_true_latents!(res, oracle_labels); let @@ -66,17 +59,18 @@ let fig end +# The fitted network histogram can be further processed to obtain a smoother estimate of the underlying graphon. + using Clustering -shape_range = 1:20 +shape_range = 1:30 ssm_estimated, criterion_values = Graphons.estimate_ssm( res.model, A, true_latents, shape_range); using Kneedle -# kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) -# # Let's extract the optimal number of shapes using the Kneedle algorithm: +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing); +# Let's extract the optimal number of shapes using the Kneedle algorithm: -# k_knee = knees(kr)[1] -# k_knee = 10 +k_knee = knees(kr)[1] ssm = SSM(res.model, k_knee) let @@ -84,8 +78,11 @@ let for (i, model) in enumerate([graphon, res.model, ssm]) for m in 1:4 ax = Mke.Axis(fig[i, m], aspect = Mke.DataAspect()) - Mke.heatmap!(ax, model, k = m, colormap = :binary, colorrange = (0, 1)) + Mke.hidedecorations!(ax) + Mke.heatmap!(ax, model, k = m, colormap = :lipari, colorrange = (0, 1)) end end + Mke.Colorbar(fig[2, end + 1], colormap = :lipari, + limits = (0, 1), width = 0.05 * h) fig end diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index d2061f7..3fdb435 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -1,6 +1,63 @@ #= # Decorated Graphon Tutorial for Weighted Networks =# +using Clustering +using NetworkHistogram +using Distributions +using LinearAlgebra +using Random +graphon = DecoratedGraphon((x, y) -> Kumaraswamy( + 4 * (cos(π * (x - y)) + 1) + 1, max(x, y) * 8 + 1)) -# # How to use 
NetworkHistogram.jl for Weighted Networks +import CairoMakie as Mke +let + fig = Mke.Figure() + ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect()) + hm = Mke.heatmap!(ax, graphon, colormap = :viridis) + Mke.Colorbar(fig[1, 2], hm) + ax2 = Mke.Axis(fig[1, 3], aspect = Mke.DataAspect()) + hm2 = Mke.heatmap!(ax2, graphon, k = 2, colormap = :viridis) + Mke.Colorbar(fig[1, 4], hm2) + fig +end + +n = 5000 +k = 15 +A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(0.9), n, n)); +oracle_latents = ordered_start_labels(n, k); +starting_labels = copy(oracle_latents); +# shuffle!(starting_labels); +p_shuffle = 1 - 1.5 / k +@info "Shuffling $(p_shuffle*100)% labels for starting point" +indices_to_shuffle = sample(1:n, floor(Int, n * p_shuffle), replace = false); +starting_labels[indices_to_shuffle] .= shuffle(starting_labels[indices_to_shuffle]); +@assert starting_labels != oracle_latents + +res, res_cat, A_cat = NetworkHistogram.nethist_continuous_edges(A, + starting_labels, GreedyParams( + 1_000_000, + RandomGroupSwap(), + Strict(), + PreviousBestValue(10_000, Inf, :min), + true # progress bar + ); + num_bins_ = 10, lower_bound = eps(), upper_bound = 1); + +latents = range(0, 1; length = n); + +ssm_test = SSM(res.model, k) + +shape_range = 1:min(30, k * (k + 1) ÷ 2 - 1) +ssm_estimated, criterion_values = Graphons.estimate_ssm( + res_cat.model, A_cat, latents, shape_range) + +Mke.lines(shape_range, criterion_values) + +## +using Kneedle +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +# Let's extract the optimal number of shapes using the Kneedle algorithm: + +k_knee = knees(kr)[1] +ssm_knee = SSM(res.model, k_knee) diff --git a/docs/src/tutorials/multiplex_network.md b/docs/src/tutorials/multiplex_network.md index 92da870..7e2da12 100644 --- a/docs/src/tutorials/multiplex_network.md +++ b/docs/src/tutorials/multiplex_network.md @@ -4,7 +4,102 @@ EditURL = "../../literate/tutorials/multiplex_network.jl" # Decorated Graphon Tutorial 
for Multiplex Networks -# How to use NetworkHistogram.jl for Multiplex Networks +````@example multiplex_network +using NetworkHistogram +using Distributions +using StaticArrays +import CairoMakie as Mke + +using Random +Random.seed!(1234); +h = 300; + +function W_multiplex(x, y) + ps = zeros(4) + ps[2] = sqrt(abs(x - y)) / 2 # layer 1 only + ps[3] = abs(sin(2π * x) * sin(2π * y)) / 2 # layer 2 only + ps[4] = min(x, y) / 2 # both layers + ps[1] = 1 - sum(ps[2:4]) # no edge + return DiscreteNonParametric(0:3, SVector{4}(ps)) +end + +graphon = DecoratedGraphon(W_multiplex) + +let + fig = Mke.Figure(size = (4 * h, h)) + for m in 1:4 + ax = Mke.Axis(fig[1, m], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, graphon, k = m, colormap = :binary, colorrange = (0, 1)) + end + fig +end + +n = 1000 +true_latents = range(0, 1; length = n) +A = sample_graph(graphon, true_latents); + +k = 20 +oracle_labels = ordered_start_labels(n, k); +initial_labels = shuffle(oracle_labels); + +res = NetworkHistogram.nethist_discrete_edges(A, + initial_labels, GreedyParams( + 1_000_000, + RandomGroupSwap(), + Strict(), + PreviousBestValue(5_000, Inf, :min), + false # progress bar + )); +nothing #hide +```` + +Visualize the fitted models for different numbers of groups after aligning with true latents + +````@example multiplex_network +NetworkHistogram.align_res_true_latents!(res, oracle_labels); +let + fig = Mke.Figure(size = (4 * h, h)) + for m in 1:4 + ax = Mke.Axis(fig[1, m], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, res.model, k = m, colormap = :binary, colorrange = (0, 1)) + end + fig +end +```` + +The fitted network histogram can be further processed to obtain a smoother estimate of the underlying graphon. 
+ +````@example multiplex_network +using Clustering +shape_range = 1:30 +ssm_estimated, criterion_values = Graphons.estimate_ssm( + res.model, A, true_latents, shape_range); + +using Kneedle +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing); +nothing #hide +```` + + Let's extract the optimal number of shapes using the Kneedle algorithm: + +````@example multiplex_network +k_knee = knees(kr)[1] +ssm = SSM(res.model, k_knee) + +let + fig = Mke.Figure(size = (4 * h, 3 * h)) + for (i, model) in enumerate([graphon, res.model, ssm]) + for m in 1:4 + ax = Mke.Axis(fig[i, m], aspect = Mke.DataAspect()) + Mke.hidedecorations!(ax) + Mke.heatmap!(ax, model, k = m, colormap = :lipari, colorrange = (0, 1)) + end + end + Mke.Colorbar(fig[2, end + 1], colormap = :lipari, + limits = (0, 1), width = 0.05 * h) + fig +end +```` --- diff --git a/docs/src/tutorials/simple_graph.md b/docs/src/tutorials/simple_graph.md index 2631db4..2cd6515 100644 --- a/docs/src/tutorials/simple_graph.md +++ b/docs/src/tutorials/simple_graph.md @@ -33,18 +33,16 @@ nothing #hide Define a simple step-function graphon ````@example simple_graph -W(u, v) = u * v +w = SimpleContinuousGraphon((x, y) -> x * y) ```` We can visualize this graphon as a heatmap. ````@example simple_graph let - grid = 0:0.01:1 fig = Mke.Figure(size = (h + 20, h)) - ax = Mke.Axis(fig[1, 1], title = "True Graphon W(u,v)", - xlabel = "u", ylabel = "v", aspect = Mke.DataAspect()) - hm = Mke.heatmap!(ax, grid, grid, W, colormap = :binary, colorrange = (0, 1)) + ax = Mke.Axis(fig[1, 1], title = "True Graphon W(u,v)") + hm = Mke.heatmap!(ax, w, colormap = :binary, colorrange = (0, 1)) Mke.Colorbar(fig[1, 2], hm) fig end @@ -57,30 +55,12 @@ end To generate a random graph from a graphon, we follow these steps: 1. **Assign latent positions:** For a graph with `n` nodes, we sample `n` independent and identically distributed random variables $u_1, u_2, \dots, u_n$ from a Uniform(0, 1) distribution. 
These are the latent positions of our nodes. 2. **Generate edges:** For each pair of nodes `(i, j)` with `i < j`, we generate a random number from a Bernoulli distribution with probability $W(u_i, u_j)$. This determines whether an edge exists between them. The resulting adjacency matrix `A` will be symmetric. - -Let's write a function to do this. - -````@example simple_graph -function sample_graph(W_func, n::Int; seed = 123) - Random.seed!(seed) - u = rand(n) # Latent positions - A = zeros(Int, n, n) - for i in 1:n - for j in (i + 1):n - if rand() < W_func(u[i], u[j]) - A[i, j] = A[j, i] = 1 - end - end - end - return A, u -end -```` - -Now, let's sample a graph with 400 nodes from our graphon `W`. +Let's sample a graph with 2000 nodes from our graphon `W`. ````@example simple_graph -n = 400 -A, u_true = sample_graph(W, n); +n = 2000 +u_true = rand(n); # Latent positions +A = sample_graph(w, u_true); nothing #hide ```` @@ -136,8 +116,8 @@ dist = NetworkHistogram.Bernoulli(0.5) # The initial probability doesn't matter We start with a random initial assignment of nodes to `k=5` groups. ````@example simple_graph -k = floor(Int, sqrt(n)) -oracle_labels = inverse_rle(1:k, fill(n ÷ k, k)) +k = 10 +oracle_labels = ordered_start_labels(n, k); initial_assignment = shuffle(oracle_labels); nothing #hide @@ -158,13 +138,24 @@ println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) Let's use the `nethist` function with `GreedyParams`, which iteratively moves nodes between groups to maximize the log-likelihood. 
-````@example simple_graph params_opti = NetworkHistogram.GreedyParams( 100_000, NetworkHistogram.RandomNodeSwap(), NetworkHistogram.Strict(), NetworkHistogram.PreviousBestValue(2_000), false); a = nethist(A, dist, initial_assignment, params_opti, false); -nothing #hide + +````@example simple_graph +res = NetworkHistogram.nethist_binary_edges(A, + initial_assignment, GreedyParams( + 1_000_000, + RandomGroupSwap(), + Strict(), + PreviousBestValue(1_000, Inf, :min), + true + )); + +a = Assignment(res.node_labels, edge_list, Dist(dist)); +println("Log-likelihood after optimization: ", loglikelihood(a)) ```` The `Assignment` object `a` now contains the optimized node groupings and @@ -178,14 +169,14 @@ heatmap_params(a, ordering = false, colorrange = (0, 1)) We can convert it to a block model for easier interpretation. -````@example simple_graph res = NethistResult(a); +````@example simple_graph let fig = Mke.Figure(size = (1220, 400)) titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] - Mke.heatmap!(axes[1], 0:0.01:1, 0:0.01:1, W, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) Mke.heatmap!(axes[2], sbm_oracle, colormap = :binary, colorrange = (0, 1)) Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) @@ -198,7 +189,7 @@ end the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. 
````@example simple_graph -NetworkHistogram.align_res_true_latents!(res, a, oracle_estimator.node_labels); +NetworkHistogram.align_res_true_latents!(res, oracle_estimator.node_labels); nothing #hide ```` @@ -209,7 +200,7 @@ let fig = Mke.Figure(size = (1220, 400)) titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] - Mke.heatmap!(axes[1], 0:0.01:1, 0:0.01:1, W, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) Mke.heatmap!(axes[2], sbm_oracle, colormap = :binary, colorrange = (0, 1)) Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) diff --git a/docs/src/tutorials/weighted_network.md b/docs/src/tutorials/weighted_network.md index 3b22b71..a5420ab 100644 --- a/docs/src/tutorials/weighted_network.md +++ b/docs/src/tutorials/weighted_network.md @@ -4,7 +4,37 @@ EditURL = "../../literate/tutorials/weighted_network.jl" # Decorated Graphon Tutorial for Weighted Networks -# How to use NetworkHistogram.jl for Weighted Networks +````@example weighted_network +using NetworkHistogram +using Distributions +import CairoMakie as Mke +using LinearAlgebra + +graphon = DecoratedGraphon((x, y) -> Exponential(3 * x * y + 1)) + +let + fig = Mke.Figure(size = (600, 300)) + ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, graphon, colormap = :viridis) + fig +end + +n = 500 +k = 10 +A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(0.7), n, n)); +oracle_latents = ordered_start_labels(n, k); +starting_labels = shuffle(oracle_latents); + +res = NetworkHistogram.nethist_continuous_edges(A, + starting_labels, GreedyParams( + 1_000_000, + RandomGroupSwap(), + Strict(), + PreviousBestValue(5_000, Inf, :min), + true # progress bar + )); +nothing #hide +```` --- diff --git a/ext/DiscretizeExt.jl b/ext/DiscretizeExt.jl deleted file mode 100644 index 4cd5ddf..0000000 --- 
a/ext/DiscretizeExt.jl +++ /dev/null @@ -1,15 +0,0 @@ -module DiscretizeExt - -using NetworkHistogram -import NetworkHistogram: get_ref_dist, Dist, ZeroInflated -import Distributions: ContinuousUnivariateDistribution -using DiscretizeDistributions - -function get_ref_dist(dist::D, ::Val{true}) where {D <: ContinuousUnivariateDistribution} - return Dist(ZeroInflated(dist)) -end -function get_ref_dist(dist::D, ::Val{false}) where {D <: ContinuousUnivariateDistribution} - return Dist(dist) -end - -end diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 6cc353b..72a5e1f 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -1,4 +1,5 @@ module NetworkHistogram +using Accessors using StatsBase using StaticArrays using ProgressMeter @@ -7,11 +8,17 @@ import Base: convert, eltype, zero using Distributions using LinearAlgebra using ArgCheck -using Random: randperm +import Random: randperm, AbstractRNG, rand +import Distributions: logpdf +export logpdf + +using IntervalArithmetic using Reexport @reexport using Graphons +import Graphons: _extract_param, convert_to_params + include("utils/include.jl") @reexport using .FastSymArray @@ -20,10 +27,12 @@ include("distributions/include.jl") include("EdgeList.jl") include("assignment.jl") include("optimization/greedy.jl") +include("preprocessor/abstractConvertor.jl") include("preprocessor/categorical.jl") include("preprocessor/continuous.jl") include("estimator/abstractEstimator.jl") include("estimator/SpectralEstimator.jl") +include("distributions/hist_dist.jl") include("api.jl") export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf, diff --git a/src/api.jl b/src/api.jl index 8db95bb..6ebb75d 100644 --- a/src/api.jl +++ b/src/api.jl @@ -113,41 +113,54 @@ function postprocess(out) return out end -function nethist_discrete_edges(A, initial_node_labels, params::GreedyParams, - k = length(unique(initial_node_labels))) +function nethist_binary_edges(A, initial_node_labels, 
params::GreedyParams) + k = length(unique(initial_node_labels)) data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) - m = length(unique(data)) es = GreedyAverage( counts_main, counts_swap, realized, realized_swap, params.max_iter, params.swap_rule, params.stop_rule) - node_labels = estimate(es, data, initial_node_labels) + node_labels = estimate(es, data, initial_node_labels, progress = params.progress_bar) sizes = counts(node_labels) ./ length(node_labels) - parameters = Matrix{SVector{m, Float64}}(undef, k, k) + θ = Matrix{Float64}(undef, k, k) @inbounds for j in 1:k, i in 1:k - parameters[i, j] = [es.realized[i, j][c] / es.counts[i, j] for c in 1:m] + θ[i, j] = es.realized[i, j][2] / max(1, es.counts[i, j]) end - model = DecoratedSBM(DiscreteNonParametric.(Ref(0:(m - 1)), parameters), sizes) + model = SBM(θ, sizes) + return NethistResult(node_labels, model) end -function nethist_binary_edges(A, initial_node_labels, params::GreedyParams, - k = length(unique(initial_node_labels))) - data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) - +function nethist_discrete_edges( + A, initial_node_labels, params::GreedyParams, m = length(unique(A))) + k = length(unique(initial_node_labels)) + data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k, m = m) es = GreedyAverage( counts_main, counts_swap, realized, realized_swap, params.max_iter, params.swap_rule, params.stop_rule) - node_labels = estimate(es, data, initial_node_labels) + node_labels = estimate(es, data, initial_node_labels, progress = params.progress_bar) sizes = counts(node_labels) ./ length(node_labels) - - θ = Matrix{Float64}(undef, k, k) + parameters = Matrix{SVector{m, Float64}}(undef, k, k) @inbounds for j in 1:k, i in 1:k - θ[i, j] = es.realized[i, j][2] / es.counts[i, j] + parameters[i, j] = [es.realized[i, j][c] / max(es.counts[i, j], 1) for c in 1:m] end - model = SBM(θ, sizes) - - return NethistResult(node_labels, model) 
+ s = zero(eltype(A)) in A ? collect(0:(m - 1)) : 1:m + model = DecoratedSBM(DiscreteNonParametric.(Ref(s), parameters), sizes) + return NethistResult(node_labels, model), es +end + +function nethist_continuous_edges(A_cont, initial_node_labels, params::GreedyParams; + num_bins_::Int = 10, lower_bound = quantile(A_cont[:], 0.01), upper_bound = quantile( + A_cont[:], 0.99)) + convertor = ContinuousConvertor(lower_bound, upper_bound, num_bins_) + A = convertor.(A_cont) + @info "Discretized continuous edge values into $(num_bins(convertor)) bins" + res_cat = nethist_discrete_edges( + A, initial_node_labels, params, num_bins(convertor)) + parameters = NetworkHistogram.HistDistribution.( + Graphons._extract_param.(res_cat.model.θ), convertor) + model = DecoratedSBM(parameters, res_cat.model.size) + return NethistResult(res_cat.node_labels, model), res_cat, A end # functions for postprocessing diff --git a/src/distributions/hist_dist.jl b/src/distributions/hist_dist.jl new file mode 100644 index 0000000..ab9bb29 --- /dev/null +++ b/src/distributions/hist_dist.jl @@ -0,0 +1,61 @@ +struct HistDistribution{B, P, P2, T} <: ContinuousUnivariateDistribution + bins::B + probs::P + cum_probs::P2 + lower_bound::T + upper_bound::T +end + +Base.broadcastable(d::HistDistribution) = Ref(d) + +params(d::HistDistribution) = (d.bins, d.probs) + +eltype(::HistDistribution{F}) where {F} = F + +function rand(rng::AbstractRNG, d::HistDistribution) + u = rand(rng) + bin_idx = searchsortedfirst(d.cum_probs, u) + return uniform_in(rng, d.bins, bin_idx) +end + +# assume bins are sorted and encoded by their upper bounds +function uniform_in(rng::AbstractRNG, bins::AbstractVector{B}, + bin_idx::Int) where {B <: Union{Interval, BareInterval}} + l, u = inf(bins[bin_idx]), sup(bins[bin_idx]) + return rand(rng) * (u - l) + l +end + +function HistDistribution(ps, c::ContinuousConvertor) + cum_ps = cumsum(ps) + intervals = vcat(bareinterval(0.0), c.bins) + return HistDistribution(intervals, ps, cum_ps, 
inf(c.bins[1]), sup(c.bins[end])) +end + +function HistDistribution(bins, ps) + cum_ps = cumsum(ps) + return HistDistribution(bins, ps, cum_ps, inf(bins[2]), sup(bins[end])) +end + +### For Graphons compatibility +support(d::HistDistribution) = d.bins +_extract_param(d::HistDistribution, k) = d.probs[k] +# specialization for DiscreteNonParametric as it requires support to be specified +function convert_to_params(centers, + sbm::DecoratedSBM{HistDistribution{B, P, P2, T}}) where {B, P, P2, T} + s = support(sbm.θ[1, 1]) + return [HistDistribution(s, centers[:, i]) for i in axes(centers, 2)] +end + +function logpdf(d::HistDistribution, x::Real) + if x < d.lower_bound + x = d.lower_bound + eps() + elseif x >= d.upper_bound + x = d.upper_bound - eps() + end + bin_idx = findfirst(b -> in_interval(x, b), d.bins) # potentially slow + p = d.probs[bin_idx] + # return sum(d.probs .^ 2) - p^2 + (1 - p)^2 + bin_idx == 1 && return log(p + eps()) + bin_width = sup(d.bins[bin_idx]) - inf(d.bins[bin_idx]) + return log(eps() + p / bin_width) +end diff --git a/src/estimator/GreedyAverage.jl b/src/estimator/GreedyAverage.jl index 68a591d..e679119 100644 --- a/src/estimator/GreedyAverage.jl +++ b/src/estimator/GreedyAverage.jl @@ -119,7 +119,7 @@ The algorithm proceeds as follows: # Returns - `node_labels::Vector{Int}`: Optimized group assignments for each node """ -function estimate(estimator::GreedyAverage, data, initial_labels; progress = true) +function estimate(estimator::GreedyAverage, data, initial_labels; progress = false) # Initialize counts and realized values from data init!(estimator, data, initial_labels) initialise_stop_rule!(estimator.stop_rule, estimator) @@ -138,7 +138,7 @@ function estimate(estimator::GreedyAverage, data, initial_labels; progress = tru ) # Update progress bar only every N iterations to reduce overhead - progress_update_interval = max(1, estimator.max_iter ÷ 1000) + progress_update_interval = max(1, estimator.max_iter ÷ 5000) # Main optimization loop 
for iter in 1:(estimator.max_iter) @@ -216,12 +216,11 @@ function estimate(estimator::GreedyAverage, data, initial_labels; progress = tru # Check stopping criterion if stopping_rule(current_loss, estimator.stop_rule) - @info "Stopping criterion met at iteration $iter with loss $current_loss" - finish!(pbar) break end end - + finish!(pbar) + @info "Optimization finished. Final loss: $current_loss" return node_labels end @@ -260,32 +259,31 @@ For each pair of groups (i,j): for i in 1:j n_edges = counts[i, j] if n_edges > 0 - sum_squares = sum(abs2, realized[i, j]) - total_loss += n_edges - sum_squares / n_edges + inter = n_edges - sum(abs2, realized[i, j]) / n_edges + total_loss += inter total_edges += n_edges end end end - return total_edges > 0 ? total_loss / total_edges : 0.0 end -# this assumes that sum realized = counts -@inline function loss_function(realized, counts) - total_loss = 0.0 - total_edges = 0.0 - @inbounds for j in axes(realized, 2) - for i in 1:j - for m in eachindex(realized[i, j]) - total_edges += realized[i, j][m] - total_loss += realized[i, j][m] * - (1 - - _fast_div_(realized[i, j][m], counts[i, j][m])) - end - end - end - return total_loss / total_edges -end +# # this assumes that sum realized = counts +# @inline function loss_function(realized, counts) +# total_loss = 0.0 +# total_edges = 0.0 +# @inbounds for j in axes(realized, 2) +# for i in 1:j +# for m in eachindex(realized[i, j]) +# total_edges += realized[i, j][m] +# total_loss += realized[i, j][m] * +# (1 - +# _fast_div_(realized[i, j][m], counts[i, j][m])) +# end +# end +# end +# return total_loss / total_edges +# end @inline function _fast_div_(num::Real, denom::Real) num == 0.0 && denom == 0.0 && return 0.0 diff --git a/src/estimator/GreedySuffStats.jl b/src/estimator/GreedySuffStats.jl new file mode 100644 index 0000000..870f5f6 --- /dev/null +++ b/src/estimator/GreedySuffStats.jl @@ -0,0 +1,292 @@ +abstract type SuffStats end + +function add_sample(ss::SuffStats, sample) + 
@error("add_sample not implemented for $(typeof(ss)) and sample $(typeof(sample)) \n + you may need to implement a custom sufficient statistics type") +end +function remove_sample(ss::SuffStats, sample) + @error("remove_sample not implemented for $(typeof(ss)) and sample $(typeof(sample)) \n + you may need to implement a custom sufficient statistics type") +end + +function make_k_block(k, suff_stats_type; kwargs...) + @error("make_k_block not implemented for sufficient statistics type $(suff_stats_type) \n + you may need to implement a custom sufficient statistics type") +end + +function score(ss::SuffStats; kwargs...) + @error("score not implemented for sufficient statistics type $(typeof(ss)) \n + you may need to implement a custom sufficient statistics type") +end + +### ======================================================================================== + +struct CategoricalSuffStats{M, T} <: SuffStats + h::SVector{M, T} + n::Int +end + +function CategoricalSuffStats(num_categories::Int) + h = SVector{num_categories, Int}(zeros(Int, num_categories)) + return CategoricalSuffStats{num_categories, Int}(h, 0) +end + +function add_sample(ss::CategoricalSuffStats, sample::Int) + ss = @set ss.h[sample] += 1 + ss = @set ss.n += 1 + return ss +end + +function add_sample(ss::CategoricalSuffStats, ::Nothing) + @reset ss.n += 1 + return ss +end + +function remove_sample(ss::CategoricalSuffStats, sample::Int) + ss = @set ss.h[sample] -= 1 + ss = @set ss.n -= 1 + return ss +end + +function remove_sample(ss::CategoricalSuffStats, ::Nothing) + @reset ss.n -= 1 + return ss +end + +function make_k_block(k, ::Val{:categorical}; num_categories, kwargs...) + k_block = SymArray{CategoricalSuffStats{num_categories, Int}}(undef, k, k) + fill!(k_block, CategoricalSuffStats(num_categories)) + return k_block +end + +@inline function score(ss::CategoricalSuffStats; kwargs...) 
+ n = max(ss.n, 1) + return n - sum(abs2, ss.h) / n +end + +### ======================================================================================== + +struct BernoulliSuffStats{T} <: SuffStats + h::T + n::T +end + +function BernoulliSuffStats() + return BernoulliSuffStats{Int}(0, 0) +end + +function add_sample(ss::BernoulliSuffStats, sample::Bool) + sample && (@reset ss.h += 1) + @reset ss.n += 1 + return ss +end + +function add_sample(ss::BernoulliSuffStats, ::Nothing) + @reset ss.n += 1 + return ss +end + +function remove_sample(ss::BernoulliSuffStats, sample::Bool) + sample && (@reset ss.h -= 1) + @reset ss.n -= 1 + return ss +end + +function remove_sample(ss::BernoulliSuffStats, ::Nothing) + @reset ss.n -= 1 + return ss +end + +function make_k_block(k, ::Val{:binary}; kwargs...) + k_block = SymArray{BernoulliSuffStats{Int}}(undef, k, k) + fill!(k_block, BernoulliSuffStats()) + return k_block +end + +@inline function score(ss::BernoulliSuffStats; kwargs...) + n = max(ss.n, 1) + p = ss.h / n + return n * (xlogx(1 - p) + xlogx(p)) +end + +### ======================================================================================== + +struct GenericSuffStats{T} <: SuffStats + samples::Vector{T} +end + +function GenericSuffStats{T}() where {T} + return GenericSuffStats{T}(Vector{T}()) +end + +function add_sample(ss::GenericSuffStats, sample) + append!(ss.samples, sample) + return ss +end + +function remove_sample(ss::GenericSuffStats, sample) + index = findfirst(==(sample), ss.samples) + if index !== nothing + deleteat!(ss.samples, index) + end + return ss +end + +function make_k_block(k, generic; data::AbstractArray, kwargs...) + @warn "Using GenericSuffStats may lead to high memory usage for large datasets. + Consider using more specialized sufficient statistics types when possible." 
+ k_block = SymArray{GenericSuffStats{eltype(data)}}(undef, k, k) + for j in 1:k, i in 1:k + k_block[i, j] = GenericSuffStats{eltype(data)}() + end + return k_block +end + +function score(ss::GenericSuffStats; dist::D, kwargs...) where {D} + if dist === nothing + @error("No distribution provided for scoring GenericSuffStats") + end + d = fit(D, ss.samples) + return -sum(logpdf.(Ref(d), ss.samples)) +end + +### ======================================================================================== + +struct GreedySuffStats{M, NodeR <: NodeSwapRule, StopR <: StopRule} <: SBMEstimator + block_ss::M + block_ss_swap::M + node_swap_rule::NodeR + stop_rule::StopR + max_iter::Int +end + +function init!(es::GreedySuffStats, data, node_labels) + # Initialize the sufficient statistics for each block + @inbounds for j in axes(data, 2) + gj = node_labels[j] + for i in 1:(j - 1) # More efficient than i < j check inside loop + edge_value = data[i, j] + gi = node_labels[i] + es.block_ss[gi, gj] = add_sample(es.block_ss[gi, gj], edge_value) + es.block_ss_swap[gi, gj] = add_sample(es.block_ss_swap[gi, gj], edge_value) + end + end +end + +# TODO: allow for non-symmetric data +function score(matrix_ss, data, node_labels; dist = nothing, norm = 1.0) + total_loss = 0.0 + @inbounds for j in axes(matrix_ss, 2) + for i in 1:j + inter = score( + matrix_ss[i, j]; dist = dist, data = data, node_labels = node_labels) + total_loss += inter + end + end + return total_loss / norm +end + +function GreedySuffStats( + data, node_labels; type_suff_stats = :categorical, max_iter = 10000, + node_swap_rule = RandomGroupSwap(), stop_rule = PreviousBestValue(5_000, Inf, :min), + kwargs...) + # derive user input + k = length(unique(node_labels)) + + # allocate sufficient statistics blocks + block_ss = make_k_block(k, Val(type_suff_stats); data = data, kwargs...) + block_ss_swap = make_k_block(k, Val(type_suff_stats); data = data, kwargs...) 
+ + # create estimator + return GreedySuffStats{typeof(block_ss), typeof(node_swap_rule), typeof(stop_rule)}( + block_ss, block_ss_swap, node_swap_rule, stop_rule, max_iter) + return es +end + +function estimate( + es::GreedySuffStats, + data, + node_labels_init; + progress = true, + dist = nothing, + iter_progress = 5000 +) + # Initialize node labels + node_labels = copy(node_labels_init) + n = length(node_labels) + n_edges = n * (n - 1) / 2 + init!(es, data, node_labels) + + # Progress tracking + pbar = ProgressUnknown( + enabled = progress, + showspeed = true, + desc = "Greedy search: " + ) + + # Update progress bar only every N iterations to reduce overhead + progress_update_interval = max(1, es.max_iter ÷ iter_progress) + # Initial log-likelihood + + current_loss = score(es.block_ss, data, node_labels, dist = dist, norm = n_edges) + es.stop_rule.previous_best_value = current_loss + # Main optimization loop + for iter in 1:(es.max_iter) + # Select two nodes to potentially swap + index1, index2 = select_indices_swap(node_labels, es.node_swap_rule) + + group1 = node_labels[index1] + group2 = node_labels[index2] + + for j in axes(data, 2) + + # extract data + groupj = node_labels[j] + edge_value_1 = data[j, index1] + edge_value_2 = data[j, index2] + + es.block_ss_swap[group1, groupj] = remove_sample( + es.block_ss_swap[group1, groupj], edge_value_1) + es.block_ss_swap[group2, groupj] = add_sample( + es.block_ss_swap[group2, groupj], edge_value_1) + + es.block_ss_swap[group2, groupj] = remove_sample( + es.block_ss_swap[group2, groupj], edge_value_2) + es.block_ss_swap[group1, groupj] = add_sample( + es.block_ss_swap[group1, groupj], edge_value_2) + end + + # tentative swap + node_labels[index1], node_labels[index2] = group2, group1 + new_loss = score(es.block_ss_swap, data, node_labels, dist = dist, norm = n_edges) + + if new_loss < current_loss + # apply swap + copy!(es.block_ss, es.block_ss_swap) + current_loss = new_loss + else + # revert swap + 
node_labels[index1], node_labels[index2] = group1, group2 + # revert sufficient statistics + copy!(es.block_ss_swap, es.block_ss) + end + + if progress && (iter % progress_update_interval == 0 || iter == es.max_iter) + update!( + pbar, iter; + showvalues = [ + ("loss", current_loss), + info_to_print(es.stop_rule) + ]) + end + + # Check stopping criterion + if stopping_rule(current_loss, es.stop_rule) + break + end + end + finish!(pbar) + @info "Optimization finished. Final loss: $current_loss" + + return node_labels +end diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index 75374f1..e8d8b04 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -17,3 +17,4 @@ abstract type Result end # end include("GreedyAverage.jl") +include("GreedySuffStats.jl") diff --git a/src/optimization/config_rules/swap_rule.jl b/src/optimization/config_rules/swap_rule.jl index fb25f72..2e4cb0f 100644 --- a/src/optimization/config_rules/swap_rule.jl +++ b/src/optimization/config_rules/swap_rule.jl @@ -15,7 +15,7 @@ current assignment `node_assignment`. select_swap function select_indices_swap(node_labels::AbstractVector{Int}, ::RandomNodeSwap) - return Tuple(StatsBase.sample(1:length(node_labels), 2; replace = false)) + return Tuple(StatsBase.samplepair(1:length(node_labels))) end function select_indices_swap(node_labels::AbstractVector{Int}, ::RandomGroupSwap, diff --git a/src/preprocessor/abstractConvertor.jl b/src/preprocessor/abstractConvertor.jl new file mode 100644 index 0000000..cd28a5c --- /dev/null +++ b/src/preprocessor/abstractConvertor.jl @@ -0,0 +1,55 @@ +abstract type AbstractConvertor end + +Base.broadcastable(o::T) where {T <: AbstractConvertor} = Ref(o) + +""" + Convert data from its original form to a processed form suitable for SBM estimation. 
+""" +function convert end + +struct CategoricalConvertor{T} <: AbstractConvertor + m::Int # number of categories + has_zero::Bool # whether data contains zero values + map::Dict{T, Int} +end + +function num_bins(c::CategoricalConvertor) + return c.m +end + +function convert(c::CategoricalConvertor{T}, A::AbstractMatrix{T}) where {T} + # Map original values to 1-based indices + @error "to be implemented" +end + +struct ContinuousConvertor{B, N, V <: AbstractVector{B}} <: AbstractConvertor + zero_index::Int + bins::V +end + +function num_bins(c::ContinuousConvertor{B, N}) where {B, N} + return N +end + +## assume no singleton bins +function ContinuousConvertor(bins::AbstractVector{B}) where {B <: + Union{Interval, BareInterval}} + bins = sort(bins, lt = lt = strictprecedes) + N = length(bins) + 1 + zero_index = 1 + ContinuousConvertor{B, N, typeof(bins)}(zero_index, bins) +end + +# assume bins are sorted and correctly cover the whole support +function (c::ContinuousConvertor{<:Union{Interval, BareInterval}})(x) + iszero(x) && return c.zero_index + x >= sup(c.bins[end]) && return length(c.bins) + 1 + x <= inf(c.bins[1]) && return c.zero_index + 1 + return findfirst(b -> in_interval(x, b), c.bins) + 1 +end + +function ContinuousConvertor(l, u, num_bins::Int) + edges = collect(range(l, stop = u, length = num_bins + 1)) + bins = [bareinterval(edges[i], edges[i + 1]) for i in 1:num_bins] + ContinuousConvertor(bins) +end diff --git a/src/preprocessor/categorical.jl b/src/preprocessor/categorical.jl new file mode 100644 index 0000000..4030aad --- /dev/null +++ b/src/preprocessor/categorical.jl @@ -0,0 +1,67 @@ + +# ============================================================================ +# Data preparation utilities +# ============================================================================ + +""" + prepare_data_cat(A::AbstractMatrix{<:Real}, k; m=length(unique(A)), has_zero=zero(eltype(A)) in A) + +Prepare categorical network data for GreedyAverage. 
+ +Creates the necessary data structures (count matrices and realized value tensors) +for estimating a categorical Stochastic Block Model with k groups. + +# Arguments +- `A::AbstractMatrix{<:Real}`: Adjacency matrix with categorical edge values +- `k::Int`: Number of groups to partition nodes into +- `m::Int`: Number of edge categories (default: inferred from unique values in A) +- `has_zero::Bool`: Whether the data contains zero values (default: auto-detected) + +# Returns +A tuple containing: +- `data`: Preprocessed adjacency matrix (shifted if zero-indexed) +- `counts`: Symmetric k×k array for edge counts (initialized to 0) +- `counts_swap`: Workspace copy of counts for swap evaluation +- `realized`: Symmetric k×k array of m-dimensional count vectors (initialized to 0) +- `realized_swap`: Workspace copy of realized for swap evaluation + +# Example +```julia +# Network with 3 edge types (0, 1, 2) for no edge, layer 1, layer 2 +A = rand(0:2, 100, 100) +A = (A + A') .÷ 2 # Make symmetric + +data, counts, counts_swap, realized, realized_swap = prepare_data_cat(A, k=5) +``` + +# Notes +- If data contains zeros, they are shifted to 1-indexing for categorical representation +- The realized arrays use StaticArrays.MVector for performance +- The symmetric array structure avoids redundant storage +""" +function prepare_data_cat( + A::AbstractMatrix{<:Real}, + k::Int; + m::Int = length(unique(A)), + has_zero::Bool = zero(eltype(A)) in A +) + @debug "Preparing data for categorical SBM with $m categories and $k groups." + + # Adjust data if zero-indexed (shift to 1-indexing for Julia) + if has_zero + @debug "Data contains zero values, using 1-based indexing." 
+ data = A .+ 1 + else + data = A + end + + # Initialize count matrices + counts = SymArray(k, 0) + counts_swap = SymArray(k, 0) + + # Initialize realized value tensors (k×k matrices of m-dimensional vectors) + realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) + realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) + + return data, counts, counts_swap, realized, realized_swap +end diff --git a/src/preprocessor/continuous.jl b/src/preprocessor/continuous.jl new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index fccb067..f62d406 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -10,9 +10,10 @@ using SparseArrays using LinearAlgebra import Base: eltype, convert, size, getindex, setindex!, copy!, similar, IndexStyle, axes, length, iterate, copyto!, fill! -export SymArray, eltype, deepcopy!, sum_tri_with_diag, fast_getindex, fast_setindex! import SparseArrays: getcolptr, nonzeros, FixedSparseCSC +export SymArray, eltype, deepcopy!, sum_tri_with_diag + """ SymArray{F} <: AbstractSparseMatrix{F, 2} @@ -180,25 +181,7 @@ function length(a::SymArray) return length(a.uppertrian) end -# Base.@propagate_inbounds function getindex(a::SymArray{F}, i::Int, j::Int) where {F} -# @boundscheck checkbounds(a, i, j) -# if i <= j -# @inbounds return a.uppertrian[i, j] -# else -# @inbounds return a.uppertrian[j, i] -# end -# end - -# Base.@propagate_inbounds function setindex!(a::SymArray{F}, v, i::Int, j::Int) where {F} -# @boundscheck checkbounds(a, i, j) -# if i <= j -# @inbounds a.uppertrian[i, j] = v -# else -# @inbounds a.uppertrian[j, i] = v -# end -# end - -# faster indexing by avoiding search +# faster indexing by avoiding search, modified from SparseArrays Base.@propagate_inbounds function getindex(A::SymArray, i0::Integer, i1::Integer) i0, i1 = minmax(i0, i1) @boundscheck checkbounds(A, i0, i1) @@ -206,6 +189,7 @@ Base.@propagate_inbounds function getindex(A::SymArray, i0::Integer, 
i1::Integer nonzeros(A.uppertrian)[r1 + i0 - 1] end +# faster indexing by avoiding search, modified from SparseArrays Base.@propagate_inbounds function setindex!(A::SymArray, v, i::Int, j::Int) i, j = minmax(i, j) @boundscheck checkbounds(A, i, j) diff --git a/test/test_symarray.jl b/test/test_symarray.jl index 7d36971..442ca88 100644 --- a/test/test_symarray.jl +++ b/test/test_symarray.jl @@ -1,6 +1,5 @@ using Test using NetworkHistogram -using NetworkHistogram.FastSymArray using SparseArrays using LinearAlgebra using StaticArrays @@ -312,4 +311,20 @@ using StaticArrays @test b[50, 50] == 50.0 @test b[99, 99] == 99.0 end + + @testset "Broadcasting" begin + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 2.0) + b = @. a + 2.0 + @test b isa SymArray + @test all(b[i, j] == 4.0 for i in 1:3, j in 1:3) + + c = b ./ a + @test c isa SymArray + @test all(c[i, j] == 2.0 for i in 1:3, j in 1:3) + + sin_a = @. sin(a) + @test sin_a isa SymArray + @test all(sin_a[i, j] == sin(2.0) for i in 1:3, j in 1:3) + end end From 18131418e09c6524d172a41b0ab7f0a2190bdc8f Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 26 Oct 2025 13:42:18 +0100 Subject: [PATCH 210/266] try to match messy perf --- src/estimator/GreedySuffStats.jl | 33 +++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/estimator/GreedySuffStats.jl b/src/estimator/GreedySuffStats.jl index 870f5f6..fd8fde2 100644 --- a/src/estimator/GreedySuffStats.jl +++ b/src/estimator/GreedySuffStats.jl @@ -31,24 +31,24 @@ function CategoricalSuffStats(num_categories::Int) return CategoricalSuffStats{num_categories, Int}(h, 0) end -function add_sample(ss::CategoricalSuffStats, sample::Int) +@inline function add_sample(ss::CategoricalSuffStats, sample::Int) ss = @set ss.h[sample] += 1 ss = @set ss.n += 1 return ss end -function add_sample(ss::CategoricalSuffStats, ::Nothing) +@inline function add_sample(ss::CategoricalSuffStats, ::Nothing) @reset ss.n += 1 return ss end -function 
remove_sample(ss::CategoricalSuffStats, sample::Int) +@inline function remove_sample(ss::CategoricalSuffStats, sample::Int) ss = @set ss.h[sample] -= 1 ss = @set ss.n -= 1 return ss end -function remove_sample(ss::CategoricalSuffStats, ::Nothing) +@inline function remove_sample(ss::CategoricalSuffStats, ::Nothing) @reset ss.n -= 1 return ss end @@ -174,15 +174,18 @@ function init!(es::GreedySuffStats, data, node_labels) end # TODO: allow for non-symmetric data -function score(matrix_ss, data, node_labels; dist = nothing, norm = 1.0) +@inline function score(matrix_ss::SymArray, data, node_labels; dist = nothing, norm = 1.0) total_loss = 0.0 - @inbounds for j in axes(matrix_ss, 2) - for i in 1:j - inter = score( - matrix_ss[i, j]; dist = dist, data = data, node_labels = node_labels) - total_loss += inter - end + for m in matrix_ss.uppertrian.nzval + total_loss += score(m; dist = dist, data = data, node_labels = node_labels) end + # @inbounds for j in axes(matrix_ss, 2) + # for i in 1:j + # inter = score( + # matrix_ss[i, j]; dist = dist, data = data, node_labels = node_labels) + # total_loss += inter + # end + # end return total_loss / norm end @@ -238,8 +241,12 @@ function estimate( group1 = node_labels[index1] group2 = node_labels[index2] - for j in axes(data, 2) + @inbounds for j in axes(data, 2) + # this check is slow ! 
(+ 6 μs per iteration on n=2000) + if j == index1 || j == index2 + continue + end # extract data groupj = node_labels[j] edge_value_1 = data[j, index1] @@ -257,7 +264,7 @@ function estimate( end # tentative swap - node_labels[index1], node_labels[index2] = group2, group1 + @inbounds node_labels[index1], node_labels[index2] = group2, group1 new_loss = score(es.block_ss_swap, data, node_labels, dist = dist, norm = n_edges) if new_loss < current_loss From 11baabe5d42a7665d32354e1d2792cbced849257 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 11:04:38 +0100 Subject: [PATCH 211/266] almost fix broadcasting for symarray --- src/utils/SymArray.jl | 117 +++++++++++++++++++++++++++++++++--------- test/test_symarray.jl | 81 ++++++++++++++++++++++++++++- 2 files changed, 173 insertions(+), 25 deletions(-) diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index f62d406..e1d5178 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -186,7 +186,7 @@ Base.@propagate_inbounds function getindex(A::SymArray, i0::Integer, i1::Integer i0, i1 = minmax(i0, i1) @boundscheck checkbounds(A, i0, i1) r1 = Int(@inbounds getcolptr(A.uppertrian)[i1]) - nonzeros(A.uppertrian)[r1 + i0 - 1] + A.uppertrian.nzval[r1 + i0 - 1] end # faster indexing by avoiding search, modified from SparseArrays @@ -194,7 +194,7 @@ Base.@propagate_inbounds function setindex!(A::SymArray, v, i::Int, j::Int) i, j = minmax(i, j) @boundscheck checkbounds(A, i, j) r1 = Int(@inbounds getcolptr(A.uppertrian)[j]) - nonzeros(A.uppertrian)[r1 + i - 1] = v + A.uppertrian.nzval[r1 + i - 1] = v end function similar(a::SymArray, ::Type{T} = eltype(a), dims::Dims{2} = size(a)) where {T} @@ -223,8 +223,8 @@ function sum_tri_with_diag(a::SymArray) end function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} - @assert size(a, 1) == size(a, 2) - k = size(a, 1) + k, n = size(a) + @assert k==n "Input matrix must be square, got size $(size(a))" # Directly build upper triangle sparse matrix 
# Pre-allocate with exact size needed @@ -255,38 +255,34 @@ end deepcopy!(dest::SymArray{F}, src::SymArray{F}) where {F <: Real} = copy!(dest, src) # Broadcasting support - custom style to maintain symmetric structure -struct SymArrayStyle <: Broadcast.AbstractArrayStyle{2} end -SymArrayStyle(::Val{2}) = SymArrayStyle() +# struct SymArrayStyle <: Broadcast.AbstractArrayStyle{2} end +# SymArrayStyle(::Val{2}) = SymArrayStyle() -Base.BroadcastStyle(::Type{<:SymArray}) = SymArrayStyle() +const SymArrayStyle = Broadcast.ArrayStyle{SymArray} + +Base.BroadcastStyle(::Type{<:SymArray}) = Broadcast.ArrayStyle{SymArray}() # SymArrayStyle() # When broadcasting with scalars or other styles, keep SymArrayStyle Base.BroadcastStyle(::SymArrayStyle, ::Broadcast.DefaultArrayStyle{0}) = SymArrayStyle() Base.BroadcastStyle(::Broadcast.DefaultArrayStyle{0}, ::SymArrayStyle) = SymArrayStyle() -# When broadcasting with other arrays, use default array style -function Base.BroadcastStyle(::SymArrayStyle, ::Broadcast.DefaultArrayStyle) - Broadcast.DefaultArrayStyle{2}() -end -function Base.BroadcastStyle(::Broadcast.DefaultArrayStyle, ::SymArrayStyle) - Broadcast.DefaultArrayStyle{2}() -end +# When broadcasting with regular arrays (not scalars), defer to the array's style +# This ensures SymArray .+ Matrix returns Matrix, not SymArray +Base.BroadcastStyle(::SymArrayStyle, s::Broadcast.DefaultArrayStyle) = s +Base.BroadcastStyle(s::Broadcast.DefaultArrayStyle, ::SymArrayStyle) = s # When broadcasting between SymArrays, keep SymArrayStyle Base.BroadcastStyle(::SymArrayStyle, ::SymArrayStyle) = SymArrayStyle() # Custom similar for broadcasted SymArrays function Base.similar( - bc::Broadcast.Broadcasted{SymArrayStyle}, ::Type{ElType}) where {ElType} - A = find_symarray(bc) - return SymArray(similar(A.uppertrian, ElType)) -end - -# Custom similar for broadcasted SymArrays -function Base.similar( - bc::Broadcast.Broadcasted{SymArrayStyle}, ::Type{Nothing}) + 
bc::Broadcast.Broadcasted{Broadcast.ArrayStyle{SymArray}}, ::Type{ElType}) where {ElType} A = find_symarray(bc) - return similar(Array{Nothing}, axes(bc)) + if A == nothing + return SymArray(similar(SparseMatrixCSC{ElType, Int}, axes(bc)...)) + else + return SymArray(similar(A.uppertrian, ElType)) + end end # Helper function to find a SymArray in the broadcast tree @@ -297,4 +293,79 @@ find_symarray(args::Tuple{}) = nothing find_symarray(a::SymArray, rest) = a find_symarray(::Any, rest) = find_symarray(rest) +# Override broadcasted to eagerly evaluate when SymArrayStyle is involved +# This prevents issues with nested broadcasts losing the SymArray type +# hack, needs to be fixed later +function Broadcast.broadcasted(::SymArrayStyle, f, args...) + # Eagerly materialize any nested Broadcasted{SymArrayStyle} to maintain type stability + materialized_args = map(args) do arg + if arg isa Broadcast.Broadcasted{SymArrayStyle} + # Materialize nested SymArray broadcasts immediately + return copy(arg) + else + return arg + end + end + # Now create the broadcast with materialized args + return Broadcast.Broadcasted{SymArrayStyle}(f, materialized_args) +end + +# Specialized copyto! for efficient in-place broadcasting into SymArray +# This maintains the symmetric structure during broadcast operations +function Base.copyto!(dest::SymArray, bc::Broadcast.Broadcasted{SymArrayStyle}) + axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc)) + + _copyto_nzval!(dest, bc) + return dest + # # Try to use optimized nzval path for simple operations + # if _can_use_nzval_broadcast(bc) + # return _copyto_nzval!(dest, bc) + # end + + # # Fallback: iterate using CartesianIndices but only over upper triangle + # bc′ = Broadcast.preprocess(dest, bc) + # @inbounds for j in 1:size(dest, 2) + # for i in 1:j + # dest[i, j] = bc′[i, j] + # end + # end + # return dest +end + +# Optimized copyto! 
that works directly on nzval arrays +function _copyto_nzval!( + dest::SymArray{T}, bc::Broadcast.Broadcasted{SymArrayStyle}) where {T} + # Replace SymArrays in the broadcast tree with their nzval arrays + bc_nzval = _replace_with_nzval(bc) + + # Broadcast directly on the nzval array + dest_nzval = nonzeros(dest.uppertrian) + copyto!(dest_nzval, bc_nzval) + + return dest +end + +# Replace SymArrays in broadcast tree with their nzval arrays +function _replace_with_nzval(bc::Broadcast.Broadcasted{SymArrayStyle}) + # Create new broadcasted with transformed arguments + new_args = map(_replace_with_nzval, bc.args) + # Don't specify style - let it be inferred + return Broadcast.Broadcasted(bc.f, new_args) +end + +function _replace_with_nzval(sa::SymArray) + return sa.uppertrian.nzval +end + +function _replace_with_nzval(bc::Broadcast.Broadcasted) + # Recursively process nested broadcasts + new_args = map(_replace_with_nzval, bc.args) + return Broadcast.Broadcasted(bc.f, new_args) +end + +function _replace_with_nzval(x) + # For scalars and other types, return as-is + return x +end + end diff --git a/test/test_symarray.jl b/test/test_symarray.jl index 442ca88..6c4bc6b 100644 --- a/test/test_symarray.jl +++ b/test/test_symarray.jl @@ -324,7 +324,84 @@ using StaticArrays @test all(c[i, j] == 2.0 for i in 1:3, j in 1:3) sin_a = @. 
sin(a) - @test sin_a isa SymArray - @test all(sin_a[i, j] == sin(2.0) for i in 1:3, j in 1:3) + sin_a_bis = sin.(a) + for sin_test in (sin_a, sin_a_bis) + @test sin_test isa SymArray + @test all(sin_test[i, j] == sin(2.0) for i in 1:3, j in 1:3) + end + end + + @testset "Broadcasting with regular arrays" begin + a = SymArray(3, 2.0) + M = [1.0 2.0 3.0; 4.0 5.0 6.0; 7.0 8.0 9.0] + + # SymArray + Matrix should return Matrix (follows Matrix type) + result1 = a .+ M + @test result1 isa Matrix{Float64} + @test !(result1 isa SymArray) + + # Matrix + SymArray should also return Matrix + result2 = M .+ a + @test result2 isa Matrix{Float64} + @test !(result2 isa SymArray) + + # Check values are correct + for i in 1:3, j in 1:3 + @test result1[i, j] ≈ 2.0 + M[i, j] + @test result2[i, j] ≈ M[i, j] + 2.0 + end + + # SymArray + scalar should still return SymArray + result3 = a .+ 5.0 + @test result3 isa SymArray + + # SymArray + SymArray should return SymArray + b = SymArray(3, 3.0) + result4 = a .+ b + @test result4 isa SymArray + end + + @testset "SymArray broadcast with Matrix returns Matrix" begin + # Create a SymArray and a regular Matrix + a = SymArray(3, 2.0) + M = [1.0 2.0 3.0; 4.0 5.0 6.0; 7.0 8.0 9.0] + + # SymArray + Matrix should return Matrix + result1 = a .+ M + @test result1 isa Matrix{Float64} + @test !(result1 isa SymArray) + @test size(result1) == (3, 3) + + # Matrix + SymArray should also return Matrix + result2 = M .+ a + @test result2 isa Matrix{Float64} + @test !(result2 isa SymArray) + + # Check values are correct + for i in 1:3, j in 1:3 + @test result1[i, j] ≈ 2.0 + M[i, j] + @test result2[i, j] ≈ M[i, j] + 2.0 + end + + # SymArray + scalar should still return SymArray + result3 = a .+ 5.0 + @test result3 isa SymArray + @test all(result3[i, j] ≈ 7.0 for i in 1:3, j in 1:3) + + # SymArray + SymArray should return SymArray + b = SymArray(3, 3.0) + result4 = a .+ b + @test result4 isa SymArray + @test all(result4[i, j] ≈ 5.0 for i in 1:3, j in 1:3) + + # 
Chained operations with scalars should still work + result5 = (a .+ 1) .* 2 + @test result5 isa SymArray + @test all(result5[i, j] ≈ 6.0 for i in 1:3, j in 1:3) + + a_ones = SymArray(3, 1.0) + result_sum_two_matrices = a_ones .+ M .+ M + @test result_sum_two_matrices isa Matrix{Float64} + @test all(result_sum_two_matrices[i, j] ≈ 1 + 2 * M[i, j] for i in 1:3, j in 1:3) end end From 2caf74861c9c18c3ffdb901efd544da635cd582c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 11:27:40 +0100 Subject: [PATCH 212/266] optimization + start new generic suffstats with indices instead --- src/estimator/GreedySuffStats.jl | 104 ++++++++++++++++++------------- 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/src/estimator/GreedySuffStats.jl b/src/estimator/GreedySuffStats.jl index fd8fde2..0725153 100644 --- a/src/estimator/GreedySuffStats.jl +++ b/src/estimator/GreedySuffStats.jl @@ -1,23 +1,12 @@ abstract type SuffStats end -function add_sample(ss::SuffStats, sample) - @error("add_sample not implemented for $(typeof(ss)) and sample $(typeof(sample)) \n - you may need to implement a custom sufficient statistics type") -end -function remove_sample(ss::SuffStats, sample) - @error("remove_sample not implemented for $(typeof(ss)) and sample $(typeof(sample)) \n - you may need to implement a custom sufficient statistics type") -end +function add_sample end +function remove_sample end -function make_k_block(k, suff_stats_type; kwargs...) - @error("make_k_block not implemented for sufficient statistics type $(suff_stats_type) \n - you may need to implement a custom sufficient statistics type") -end - -function score(ss::SuffStats; kwargs...) 
- @error("score not implemented for sufficient statistics type $(typeof(ss)) \n - you may need to implement a custom sufficient statistics type") -end +add_sample(suffstats::SuffStats, sample, i, j) = add_sample(suffstats, sample) +remove_sample(suffstats::SuffStats, sample, i, j) = remove_sample(suffstats, sample) +function make_k_block end +function score end ### ======================================================================================== @@ -60,8 +49,7 @@ function make_k_block(k, ::Val{:categorical}; num_categories, kwargs...) end @inline function score(ss::CategoricalSuffStats; kwargs...) - n = max(ss.n, 1) - return n - sum(abs2, ss.h) / n + return ss.n - sum(abs2, ss.h) / max(ss.n, 1) end ### ======================================================================================== @@ -75,24 +63,24 @@ function BernoulliSuffStats() return BernoulliSuffStats{Int}(0, 0) end -function add_sample(ss::BernoulliSuffStats, sample::Bool) +@inline function add_sample(ss::BernoulliSuffStats, sample::Bool) sample && (@reset ss.h += 1) @reset ss.n += 1 return ss end -function add_sample(ss::BernoulliSuffStats, ::Nothing) +@inline function add_sample(ss::BernoulliSuffStats, ::Nothing) @reset ss.n += 1 return ss end -function remove_sample(ss::BernoulliSuffStats, sample::Bool) +@inline function remove_sample(ss::BernoulliSuffStats, sample::Bool) sample && (@reset ss.h -= 1) @reset ss.n -= 1 return ss end -function remove_sample(ss::BernoulliSuffStats, ::Nothing) +@inline function remove_sample(ss::BernoulliSuffStats, ::Nothing) @reset ss.n -= 1 return ss end @@ -111,14 +99,20 @@ end ### ======================================================================================== -struct GenericSuffStats{T} <: SuffStats +abstract type GenericSuffStatsType <: SuffStats end + +struct GenericSuffStats{T} <: GenericSuffStatsType samples::Vector{T} end -function GenericSuffStats{T}() where {T} +function GenericSuffStats(::AbstractArray{T}) where {T} return 
GenericSuffStats{T}(Vector{T}()) end +function get_samples(ss::GenericSuffStats) + return ss.samples +end + function add_sample(ss::GenericSuffStats, sample) append!(ss.samples, sample) return ss @@ -137,17 +131,37 @@ function make_k_block(k, generic; data::AbstractArray, kwargs...) Consider using more specialized sufficient statistics types when possible." k_block = SymArray{GenericSuffStats{eltype(data)}}(undef, k, k) for j in 1:k, i in 1:k - k_block[i, j] = GenericSuffStats{eltype(data)}() + k_block[i, j] = GenericSuffStats(data) end return k_block end -function score(ss::GenericSuffStats; dist::D, kwargs...) where {D} +# use indices rather than pushing and deleting samples for better performance ? +# struct GenericSuffStatsIndex{T} <: GenericSuffStatsType +# indices::Vector{Tuple{Int, Int}} +# data::T +# end + +# function get_samples(ss::GenericSuffStatsIndex) +# return [ss.data[i, j] for (i, j) in ss.indices] +# end + +# function GenericSuffStatsIndex{T}(data::T) where {T} +# return GenericSuffStatsIndex{T}(Vector{Tuple{Int, Int}}(), data) +# end + +# function add_sample(ss::GenericSuffStatsIndex, sample, i, j) +# push!(ss.indices, (i, j)) +# return ss +# end + +function score(ss::GenericSuffStatsType; dist::D, kwargs...) 
where {D} if dist === nothing @error("No distribution provided for scoring GenericSuffStats") end - d = fit(D, ss.samples) - return -sum(logpdf.(Ref(d), ss.samples)) + samples = get_samples(ss) + d = fit(D, samples) + return -sum(logpdf.(d, samples)) end ### ======================================================================================== @@ -167,8 +181,9 @@ function init!(es::GreedySuffStats, data, node_labels) for i in 1:(j - 1) # More efficient than i < j check inside loop edge_value = data[i, j] gi = node_labels[i] - es.block_ss[gi, gj] = add_sample(es.block_ss[gi, gj], edge_value) - es.block_ss_swap[gi, gj] = add_sample(es.block_ss_swap[gi, gj], edge_value) + es.block_ss[gi, gj] = add_sample(es.block_ss[gi, gj], edge_value, i, j) + es.block_ss_swap[gi, gj] = add_sample( + es.block_ss_swap[gi, gj], edge_value, i, j) end end end @@ -179,13 +194,18 @@ end for m in matrix_ss.uppertrian.nzval total_loss += score(m; dist = dist, data = data, node_labels = node_labels) end - # @inbounds for j in axes(matrix_ss, 2) - # for i in 1:j - # inter = score( - # matrix_ss[i, j]; dist = dist, data = data, node_labels = node_labels) - # total_loss += inter - # end - # end + return total_loss / norm +end + +@inline function score(matrix_ss, data, node_labels; dist = nothing, norm = 1.0) + total_loss = 0.0 + @inbounds for j in axes(matrix_ss, 2) + for i in 1:j + inter = score( + matrix_ss[i, j]; dist = dist, data = data, node_labels = node_labels) + total_loss += inter + end + end return total_loss / norm end @@ -253,14 +273,14 @@ function estimate( edge_value_2 = data[j, index2] es.block_ss_swap[group1, groupj] = remove_sample( - es.block_ss_swap[group1, groupj], edge_value_1) + es.block_ss_swap[group1, groupj], edge_value_1, j, index1) es.block_ss_swap[group2, groupj] = add_sample( - es.block_ss_swap[group2, groupj], edge_value_1) + es.block_ss_swap[group2, groupj], edge_value_1, j, index1) es.block_ss_swap[group2, groupj] = remove_sample( - es.block_ss_swap[group2, 
groupj], edge_value_2) + es.block_ss_swap[group2, groupj], edge_value_2, j, index2) es.block_ss_swap[group1, groupj] = add_sample( - es.block_ss_swap[group1, groupj], edge_value_2) + es.block_ss_swap[group1, groupj], edge_value_2, j, index2) end # tentative swap From 5e34062a09d0ec0e6c02a8170bb2860efcab9174 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 12:54:37 +0100 Subject: [PATCH 213/266] slight improvement of loop --- src/estimator/GreedySuffStats.jl | 33 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/estimator/GreedySuffStats.jl b/src/estimator/GreedySuffStats.jl index 0725153..cd03f68 100644 --- a/src/estimator/GreedySuffStats.jl +++ b/src/estimator/GreedySuffStats.jl @@ -262,25 +262,22 @@ function estimate( group2 = node_labels[index2] @inbounds for j in axes(data, 2) - - # this check is slow ! (+ 6 μs per iteration on n=2000) - if j == index1 || j == index2 - continue + if j != index1 && j != index2 + # extract data + groupj = node_labels[j] + edge_value_1 = data[j, index1] + edge_value_2 = data[j, index2] + + es.block_ss_swap[group1, groupj] = remove_sample( + es.block_ss_swap[group1, groupj], edge_value_1, j, index1) + es.block_ss_swap[group2, groupj] = add_sample( + es.block_ss_swap[group2, groupj], edge_value_1, j, index1) + + es.block_ss_swap[group2, groupj] = remove_sample( + es.block_ss_swap[group2, groupj], edge_value_2, j, index2) + es.block_ss_swap[group1, groupj] = add_sample( + es.block_ss_swap[group1, groupj], edge_value_2, j, index2) end - # extract data - groupj = node_labels[j] - edge_value_1 = data[j, index1] - edge_value_2 = data[j, index2] - - es.block_ss_swap[group1, groupj] = remove_sample( - es.block_ss_swap[group1, groupj], edge_value_1, j, index1) - es.block_ss_swap[group2, groupj] = add_sample( - es.block_ss_swap[group2, groupj], edge_value_1, j, index1) - - es.block_ss_swap[group2, groupj] = remove_sample( - es.block_ss_swap[group2, groupj], edge_value_2, j, 
index2) - es.block_ss_swap[group1, groupj] = add_sample( - es.block_ss_swap[group1, groupj], edge_value_2, j, index2) end # tentative swap From f567fa61caae7131c461269a19874be56271d552 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 18:06:00 +0100 Subject: [PATCH 214/266] update constructors --- src/utils/SymArray.jl | 94 +++++++++---------------------------------- test/test_symarray.jl | 58 +++++++++++++------------- 2 files changed, 47 insertions(+), 105 deletions(-) diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index e1d5178..9377ee1 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -12,7 +12,7 @@ import Base: eltype, convert, size, getindex, setindex!, copy!, similar, IndexStyle, axes, length, iterate, copyto!, fill! import SparseArrays: getcolptr, nonzeros, FixedSparseCSC -export SymArray, eltype, deepcopy!, sum_tri_with_diag +export SymArray, eltype, deepcopy!, sum_tri_with_diag, make_sym_init """ SymArray{F} <: AbstractSparseMatrix{F, 2} @@ -27,8 +27,9 @@ This implementation uses Julia's SparseMatrixCSC for efficient storage and acces # Examples ```julia -# Create a 3×3 symmetric matrix initialized with zeros -sym = SymArray(3, 0.0) +# Create a 3×3 symmetric matrix +sym = SymArray{Float64}(undef, 3, 3) +sym .= 0.0 # Access elements (symmetric) sym[1, 2] = 5.0 @@ -81,89 +82,30 @@ function make_csc_format(k::Int, ::Type{F}) where {F} return k, k, colptr, rowval, nzval end -""" - SymArray(k::Int, d::F) - -Create a k×k symmetric matrix initialized with copies of value `d`. 
- -# Arguments -- `k::Int`: Dimension of the matrix (must be positive) -- `d::F`: Initial value for all entries - -# Example -```julia -sym = SymArray(5, 0.0) # 5×5 matrix of zeros -``` -""" -function SymArray(k::T, d::F) where {F, T <: Real} - k > 0 || throw(ArgumentError("Matrix dimension k=$k must be positive")) - - # Pre-allocate arrays with exact size needed for upper triangle - n_elements = div(k * (k + 1), 2) - I_indices = Vector{Int}(undef, n_elements) - J_indices = Vector{Int}(undef, n_elements) - values = Vector{F}(undef, n_elements) - - idx = 1 - for j in 1:k - for i in 1:j - I_indices[idx] = i - J_indices[idx] = j - values[idx] = deepcopy(d) - idx += 1 - end - end - - uppertrian = sparse(I_indices, J_indices, values, k, k) - return SymArray{F}(uppertrian) +function make_sym_init(k, d::Real) + a = SymArray{typeof(d)}(undef, k, k) + fill!(a, d) + return a end -function SymArray(k::T, d::AbstractArray) where {T <: Real} - k > 0 || throw(ArgumentError("Matrix dimension k=$k must be positive")) - - # Pre-allocate arrays with exact size needed for upper triangle - n_elements = div(k * (k + 1), 2) - I_indices = Vector{Int}(undef, n_elements) - J_indices = Vector{Int}(undef, n_elements) - values = Vector{typeof(d)}(undef, n_elements) - - idx = 1 - for j in 1:k - for i in 1:j - I_indices[idx] = i - J_indices[idx] = j - values[idx] = deepcopy(d) - idx += 1 - end +function make_sym_init(k, d) + a = SymArray{typeof(d)}(undef, k, k) + for j in 1:k, i in 1:j + a[i, j] = deepcopy(d) end - - uppertrian = sparse(I_indices, J_indices, values, k, k) - return SymArray{typeof(d)}(uppertrian) + return a end +@deprecate SymArray(k::Int, d::F) where {F} make_sym_init(k, d) + """ SymArray(d::AbstractMatrix{F}) -Create a SymArray from an existing matrix. The matrix should be symmetric. -Validates symmetry with a tolerance for floating-point errors. +Create a SymArray from an existing matrix.The matrix must be square and is assumed to be symmetric. 
""" function SymArray(d::AbstractMatrix{F}) where {F} - size(d, 1) == size(d, 2) || throw(ArgumentError( - "Input matrix must be square, got size $(size(d))")) - - # Validate symmetry for floating-point types - if F <: AbstractFloat - k = size(d, 1) - max_asymmetry = zero(F) - for j in 1:k, i in 1:(j - 1) - max_asymmetry = max(max_asymmetry, abs(d[i, j] - d[j, i])) - end - tol = sqrt(eps(F)) * maximum(abs, d) - if max_asymmetry > tol - @warn "Input matrix has asymmetry up to $max_asymmetry (tolerance: $tol). Using upper triangle." - end - end - + m, n = size(d) + m == n || throw(ArgumentError("Input matrix must be square, got size $(size(d))")) return convert(SymArray{F}, d) end diff --git a/test/test_symarray.jl b/test/test_symarray.jl index 6c4bc6b..2d4e5be 100644 --- a/test/test_symarray.jl +++ b/test/test_symarray.jl @@ -7,7 +7,7 @@ using StaticArrays @testset "SymArray Array Interface" begin @testset "Construction and basic properties" begin # Test construction with scalar - a = SymArray(3, 1.0) + a = make_sym_init(3, 1.0) @test a isa AbstractArray{Float64, 2} @test size(a) == (3, 3) @test length(a) == 9 @@ -15,17 +15,17 @@ using StaticArrays @test eltype(a) == Float64 # Test construction with zeros - b = SymArray(5, 0.0) + b = make_sym_init(5, 0.0) @test size(b) == (5, 5) @test all(b[i, j] == 0.0 for i in 1:5 for j in 1:5) # Test dimension validation - @test_throws ArgumentError SymArray(0, 1.0) - @test_throws ArgumentError SymArray(-1, 1.0) + @test_throws ArgumentError make_sym_init(0, 1.0) + @test_throws ArgumentError make_sym_init(-1, 1.0) end @testset "Indexing - getindex and setindex!" begin - a = SymArray(4, 0.0) + a = make_sym_init(4, 0.0) # Test setindex! 
in upper triangle a[1, 2] = 5.0 @@ -48,7 +48,7 @@ using StaticArrays end @testset "Symmetry property" begin - a = SymArray(5, 0.0) + a = make_sym_init(5, 0.0) # Set values and verify symmetry for i in 1:5 @@ -93,7 +93,7 @@ using StaticArrays end @testset "similar function" begin - a = SymArray(3, 5.0) + a = make_sym_init(3, 5.0) # Test similar without type b = similar(a) @@ -117,7 +117,7 @@ using StaticArrays end @testset "copy! and deepcopy!" begin - a = SymArray(3, 0.0) + a = make_sym_init(3, 0.0) a[1, 1] = 1.0 a[1, 2] = 2.0 a[2, 3] = 5.0 @@ -132,7 +132,7 @@ using StaticArrays @test b[3, 2] == 5.0 # Test dimension mismatch - d = SymArray(4, 0.0) + d = make_sym_init(4, 0.0) @test_throws DimensionMismatch copy!(d, a) # Test deepcopy! @@ -162,7 +162,7 @@ using StaticArrays end @testset "Array operations" begin - a = SymArray(3, 2.0) + a = make_sym_init(3, 2.0) # Test iteration count = 0 @@ -180,7 +180,7 @@ using StaticArrays @test any(x -> x == 2.0, a) # Test maximum/minimum - b = SymArray(3, 0.0) + b = make_sym_init(3, 0.0) b[1, 1] = 5.0 b[2, 3] = -3.0 @test maximum(b) == 5.0 @@ -188,8 +188,8 @@ using StaticArrays end @testset "Mathematical operations" begin - a = SymArray(3, 2.0) - b = SymArray(3, 3.0) + a = make_sym_init(3, 2.0) + b = make_sym_init(3, 3.0) # Element-wise operations (using broadcasting) c = a .+ b @@ -211,13 +211,13 @@ using StaticArrays @test all(f[i, j] == 1.5 for i in 1:3, j in 1:3) # Test unary operations - g = SymArray(3, -2.0) + g = make_sym_init(3, -2.0) h = abs.(g) @test h isa SymArray @test all(h[i, j] == 2.0 for i in 1:3, j in 1:3) # Test with mixed values - m = SymArray(3, 0.0) + m = make_sym_init(3, 0.0) m[1, 1] = 1.0 m[1, 2] = 2.0 m[2, 2] = 3.0 @@ -234,7 +234,7 @@ using StaticArrays @test n[3, 3] == 16.0 # Test operations between two SymArrays with different values - p = SymArray(3, 0.0) + p = make_sym_init(3, 0.0) p[1, 1] = 10.0 p[2, 2] = 20.0 p[3, 3] = 30.0 @@ -249,12 +249,12 @@ using StaticArrays end @testset "Special case: 
sum_tri_with_diag" begin - a = SymArray(3, 1.0) + a = make_sym_init(3, 1.0) # Only upper triangle is stored: 6 elements # [1,1], [1,2], [1,3], [2,2], [2,3], [3,3] @test sum_tri_with_diag(a) == 6.0 - b = SymArray(4, 2.0) + b = make_sym_init(4, 2.0) # Upper triangle has 10 elements for 4x4 @test sum_tri_with_diag(b) == 20.0 @@ -266,20 +266,20 @@ using StaticArrays @testset "Type stability" begin # Float64 - a = SymArray(3, 1.0) + a = make_sym_init(3, 1.0) @test typeof(a[1, 1]) == Float64 # Int - b = SymArray(3, 1) + b = make_sym_init(3, 1) @test typeof(b[1, 1]) == Int # Float32 - c = SymArray(3, 1.0f0) + c = make_sym_init(3, 1.0f0) @test typeof(c[1, 1]) == Float32 end @testset "Sparse matrix properties" begin - a = SymArray(10, 0.0) + a = make_sym_init(10, 0.0) # Initially all elements are stored (including zeros) # Set only a few elements to non-zero a[1, 5] = 3.0 @@ -297,14 +297,14 @@ using StaticArrays @testset "Edge cases" begin # 1x1 matrix - a = SymArray(1, 5.0) + a = make_sym_init(1, 5.0) @test size(a) == (1, 1) @test a[1, 1] == 5.0 a[1, 1] = 10.0 @test a[1, 1] == 10.0 # Large diagonal - b = SymArray(100, 0.0) + b = make_sym_init(100, 0.0) for i in 1:100 b[i, i] = Float64(i) end @@ -332,7 +332,7 @@ using StaticArrays end @testset "Broadcasting with regular arrays" begin - a = SymArray(3, 2.0) + a = make_sym_init(3, 2.0) M = [1.0 2.0 3.0; 4.0 5.0 6.0; 7.0 8.0 9.0] # SymArray + Matrix should return Matrix (follows Matrix type) @@ -356,14 +356,14 @@ using StaticArrays @test result3 isa SymArray # SymArray + SymArray should return SymArray - b = SymArray(3, 3.0) + b = make_sym_init(3, 3.0) result4 = a .+ b @test result4 isa SymArray end @testset "SymArray broadcast with Matrix returns Matrix" begin # Create a SymArray and a regular Matrix - a = SymArray(3, 2.0) + a = make_sym_init(3, 2.0) M = [1.0 2.0 3.0; 4.0 5.0 6.0; 7.0 8.0 9.0] # SymArray + Matrix should return Matrix @@ -389,7 +389,7 @@ using StaticArrays @test all(result3[i, j] ≈ 7.0 for i in 1:3, j in 1:3) 
# SymArray + SymArray should return SymArray - b = SymArray(3, 3.0) + b = make_sym_init(3, 3.0) result4 = a .+ b @test result4 isa SymArray @test all(result4[i, j] ≈ 5.0 for i in 1:3, j in 1:3) @@ -399,7 +399,7 @@ using StaticArrays @test result5 isa SymArray @test all(result5[i, j] ≈ 6.0 for i in 1:3, j in 1:3) - a_ones = SymArray(3, 1.0) + a_ones = make_sym_init(3, 1.0) result_sum_two_matrices = a_ones .+ M .+ M @test result_sum_two_matrices isa Matrix{Float64} @test all(result_sum_two_matrices[i, j] ≈ 1 + 2 * M[i, j] for i in 1:3, j in 1:3) From 8fea621576d611f0b6076f684022db512b968223 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 18:08:36 +0100 Subject: [PATCH 215/266] before the great cleanse --- Project.toml | 4 +- src/NetworkHistogram.jl | 9 +- src/distributions/hist_dist.jl | 53 ++++-------- src/estimator/GreedySuffStats.jl | 2 +- src/preprocessor/abstractConvertor.jl | 116 ++++++++++++++++++++------ 5 files changed, 112 insertions(+), 72 deletions(-) diff --git a/Project.toml b/Project.toml index ee20ed2..fa036a2 100644 --- a/Project.toml +++ b/Project.toml @@ -10,7 +10,7 @@ Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" -IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253" +IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -35,7 +35,7 @@ Accessors = "0.1.42" ArgCheck = "2.5.0" Clustering = "0.15.8" DiscretizeDistributions = "0.1.2" -IntervalArithmetic = "1.0.1" +IntervalSets = "0.7.11" LinearAlgebra = "1.12.0" Reexport = "1.2.2" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 72a5e1f..6ffd742 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -9,10 +9,9 @@ 
using Distributions using LinearAlgebra using ArgCheck import Random: randperm, AbstractRNG, rand -import Distributions: logpdf -export logpdf +import Distributions: logpdf, pdf -using IntervalArithmetic +using IntervalSets using Reexport @reexport using Graphons @@ -27,15 +26,15 @@ include("distributions/include.jl") include("EdgeList.jl") include("assignment.jl") include("optimization/greedy.jl") +include("distributions/hist_dist.jl") include("preprocessor/abstractConvertor.jl") include("preprocessor/categorical.jl") include("preprocessor/continuous.jl") include("estimator/abstractEstimator.jl") include("estimator/SpectralEstimator.jl") -include("distributions/hist_dist.jl") include("api.jl") -export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf, +export EdgeList, neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf, pdf, GreedyParams, nethist, nethist_discrete_edges, ordered_start_labels, RandomGroupSwap, Strict, PreviousBestValue, nethist_binary_edges diff --git a/src/distributions/hist_dist.jl b/src/distributions/hist_dist.jl index ab9bb29..60cd4af 100644 --- a/src/distributions/hist_dist.jl +++ b/src/distributions/hist_dist.jl @@ -1,61 +1,40 @@ -struct HistDistribution{B, P, P2, T} <: ContinuousUnivariateDistribution +struct HistDistribution{B, P, P2} <: ContinuousUnivariateDistribution bins::B probs::P cum_probs::P2 - lower_bound::T - upper_bound::T end Base.broadcastable(d::HistDistribution) = Ref(d) params(d::HistDistribution) = (d.bins, d.probs) -eltype(::HistDistribution{F}) where {F} = F - function rand(rng::AbstractRNG, d::HistDistribution) u = rand(rng) bin_idx = searchsortedfirst(d.cum_probs, u) - return uniform_in(rng, d.bins, bin_idx) + return rand(rng, d.bins[bin_idx]) end -# assume bins are sorted and encoded by their upper bounds -function uniform_in(rng::AbstractRNG, bins::AbstractVector{B}, - bin_idx::Int) where {B <: Union{Interval, BareInterval}} - l, u = inf(bins[bin_idx]), sup(bins[bin_idx]) - return 
rand(rng) * (u - l) + l +function HistDistribution(bins, ps) + cum_ps = SVector(cumsum(ps)...) + return HistDistribution{typeof(bins), typeof(ps), typeof(cum_ps)}(bins, ps, cum_ps) end -function HistDistribution(ps, c::ContinuousConvertor) - cum_ps = cumsum(ps) - intervals = vcat(bareinterval(0.0), c.bins) - return HistDistribution(intervals, ps, cum_ps, inf(c.bins[1]), sup(c.bins[end])) -end +logpdf(d::HistDistribution, x::Real) = log(pdf(d, x)) -function HistDistribution(bins, ps) - cum_ps = cumsum(ps) - return HistDistribution(bins, ps, cum_ps, inf(bins[2]), sup(bins[end])) +function pdf(d::HistDistribution, x::Real) + # potentially slow + bin_idx = findfirst(b -> x ∈ b, d.bins) + p = d.probs[bin_idx] + bin_idx == 1 && return p + return p / width(d.bins[bin_idx]) end ### For Graphons compatibility support(d::HistDistribution) = d.bins _extract_param(d::HistDistribution, k) = d.probs[k] -# specialization for DiscreteNonParametric as it requires support to be specified -function convert_to_params(centers, - sbm::DecoratedSBM{HistDistribution{B, P, P2, T}}) where {B, P, P2, T} - s = support(sbm.θ[1, 1]) - return [HistDistribution(s, centers[:, i]) for i in axes(centers, 2)] -end -function logpdf(d::HistDistribution, x::Real) - if x < d.lower_bound - x = d.lower_bound + eps() - elseif x >= d.upper_bound - x = d.upper_bound - eps() - end - bin_idx = findfirst(b -> in_interval(x, b), d.bins) # potentially slow - p = d.probs[bin_idx] - # return sum(d.probs .^ 2) - p^2 + (1 - p)^2 - bin_idx == 1 && return log(p + eps()) - bin_width = sup(d.bins[bin_idx]) - inf(d.bins[bin_idx]) - return log(eps() + p / bin_width) +function convert_to_params(centers, + sbm::DecoratedSBM{HistDistribution{B, P, P2}}) where {B, P, P2} + s = sbm.θ[1, 1].bins + return [HistDistribution(s, convert(P, centers[:, i])) for i in axes(centers, 2)] end diff --git a/src/estimator/GreedySuffStats.jl b/src/estimator/GreedySuffStats.jl index cd03f68..2dd04cb 100644 --- a/src/estimator/GreedySuffStats.jl 
+++ b/src/estimator/GreedySuffStats.jl @@ -226,7 +226,7 @@ function GreedySuffStats( return es end -function estimate( +function estimate!( es::GreedySuffStats, data, node_labels_init; diff --git a/src/preprocessor/abstractConvertor.jl b/src/preprocessor/abstractConvertor.jl index cd28a5c..32f040c 100644 --- a/src/preprocessor/abstractConvertor.jl +++ b/src/preprocessor/abstractConvertor.jl @@ -5,51 +5,113 @@ Base.broadcastable(o::T) where {T <: AbstractConvertor} = Ref(o) """ Convert data from its original form to a processed form suitable for SBM estimation. """ -function convert end +function (c::AbstractConvertor)(A; kwargs...) + @error "to be implemented" +end + +function to_distribution(c::AbstractConvertor, ps; kwargs...) + @error "to be implemented" +end + +### ======================================================================================= +### Categorical Convertor +### ======================================================================================= struct CategoricalConvertor{T} <: AbstractConvertor m::Int # number of categories - has_zero::Bool # whether data contains zero values map::Dict{T, Int} end +function CategoricalConvertor(data::AbstractArray{T}) where {T} + categories = sort(unique(data)) + m = length(categories) + map = Dict{T, Int}(categories[i] => i for i in 1:m) + return CategoricalConvertor{T}(m, map) +end + function num_bins(c::CategoricalConvertor) return c.m end -function convert(c::CategoricalConvertor{T}, A::AbstractMatrix{T}) where {T} - # Map original values to 1-based indices - @error "to be implemented" +function (c::CategoricalConvertor)(obs::T) where {T} + return c.map[obs] end -struct ContinuousConvertor{B, N, V <: AbstractVector{B}} <: AbstractConvertor - zero_index::Int - bins::V +function to_distribution( + c::CategoricalConvertor{T}, ps::AbstractVector{T2}; kwargs...) 
where {T, T2} + @argcheck length(ps)==c.m "Length of probabilities must match number of categories" + support = sort(collect(keys(c.map))) + probabilities = SVector{c.m, T2}(ps[c.map[s]] for s in support) + return DiscreteNonParametric(support, probabilities) end -function num_bins(c::ContinuousConvertor{B, N}) where {B, N} - return N +### ======================================================================================= +### [0,1] Continuous Convertor +### ======================================================================================= + +abstract type UnitIntervalConvertorType <: AbstractConvertor end + +struct UnitIntervalConvertor{B <: AbstractVector} <: UnitIntervalConvertorType + bins::B +end + +function UnitIntervalConvertor(n::Int) + zero_interval = Interval{:closed, :closed}(0.0, 0.0) + edges = range(0.0, stop = 1.0, length = n + 1) + bins = [Interval{:closed, :closed}(edges[i], edges[i + 1]) for i in 1:n] + bins = vcat(zero_interval, bins) + return UnitIntervalConvertor{typeof(bins)}(bins) end -## assume no singleton bins -function ContinuousConvertor(bins::AbstractVector{B}) where {B <: - Union{Interval, BareInterval}} - bins = sort(bins, lt = lt = strictprecedes) - N = length(bins) + 1 - zero_index = 1 - ContinuousConvertor{B, N, typeof(bins)}(zero_index, bins) +function num_bins(c::UnitIntervalConvertor) + return length(c.bins) end -# assume bins are sorted and correctly cover the whole support -function (c::ContinuousConvertor{<:Union{Interval, BareInterval}})(x) - iszero(x) && return c.zero_index - x >= sup(c.bins[end]) && return length(c.bins) + 1 - x <= inf(c.bins[1]) && return c.zero_index + 1 - return findfirst(b -> in_interval(x, b), c.bins) + 1 +function (c::UnitIntervalConvertor)(x::Real) + return findfirst(b -> x ∈ b, c.bins) end -function ContinuousConvertor(l, u, num_bins::Int) - edges = collect(range(l, stop = u, length = num_bins + 1)) - bins = [bareinterval(edges[i], edges[i + 1]) for i in 1:num_bins] - 
ContinuousConvertor(bins) +function to_distribution( + c::UnitIntervalConvertor, ps::AbstractVector{T}; kwargs...) where {T} + @argcheck length(ps)==length(c.bins) "Length of probabilities must match number of bins" + return HistDistribution(c.bins, SVector{length(ps), T}(ps)) end + +# struct RegularUnitIntervalConvertor{N} <: UnitIntervalConvertorType +# num_bins::Int +# end + +### ======================================================================================= +### Continuous Convertor +### ======================================================================================= +# struct ContinuousConvertor{B, N, V <: AbstractVector{B}} <: AbstractConvertor +# zero_index::Int +# bins::V +# end + +# function num_bins(c::ContinuousConvertor{B, N}) where {B, N} +# return N +# end + +# ## assume no singleton bins +# function ContinuousConvertor(bins::AbstractVector{B}) where {B <: +# Union{Interval, BareInterval}} +# bins = sort(bins, lt = lt = strictprecedes) +# N = length(bins) + 1 +# zero_index = 1 +# ContinuousConvertor{B, N, typeof(bins)}(zero_index, bins) +# end + +# # assume bins are sorted and correctly cover the whole support +# function (c::ContinuousConvertor{<:Union{Interval, BareInterval}})(x) +# iszero(x) && return c.zero_index +# x >= sup(c.bins[end]) && return length(c.bins) + 1 +# x <= inf(c.bins[1]) && return c.zero_index + 1 +# return findfirst(b -> in_interval(x, b), c.bins) + 1 +# end + +# function ContinuousConvertor(l, u, num_bins::Int) +# edges = collect(range(l, stop = u, length = num_bins + 1)) +# bins = [bareinterval(edges[i], edges[i + 1]) for i in 1:num_bins] +# ContinuousConvertor(bins) +# end From 2d3f9053d25ce7c605bb3ba2b258bcda46f29928 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 18:33:56 +0100 Subject: [PATCH 216/266] big cleanse --- Project.toml | 6 +- ext/LightMCExt.jl | 2 - ext/MakieExt.jl | 99 ----- src/EdgeList.jl | 195 --------- src/NetworkHistogram.jl | 18 +- src/api.jl | 134 ------ src/assignment.jl | 
234 ---------- src/distributions/cat.jl | 65 --- src/distributions/distributions_type.jl | 262 ----------- src/distributions/include.jl | 3 - src/distributions/zero_inflated.jl | 99 ----- src/estimator/GreedyAverage.jl | 353 --------------- src/estimator/GreedySuffStats.jl | 3 +- src/estimator/SpectralEstimator.jl | 413 ------------------ src/estimator/abstractEstimator.jl | 1 - src/optimization/config_rules/InitRule.jl | 61 --- src/optimization/config_rules/accept_rule.jl | 23 - .../config_rules/bandwidth_selection_rule.jl | 40 -- src/optimization/config_rules/include.jl | 5 - src/optimization/greedy.jl | 131 ------ src/optimization/swap_categorical.jl | 186 -------- src/optimization/swap_workspace.jl | 99 ----- src/preprocessor/abstractConvertor.jl | 104 +---- src/preprocessor/categorical.jl | 86 ++-- src/preprocessor/continuous.jl | 71 +++ src/utils/SymArray.jl | 2 - src/utils/config_rules/include.jl | 8 + .../config_rules/stop_rule.jl | 7 - .../config_rules/swap_rule.jl | 8 - src/utils/include.jl | 1 + test/runtests.jl | 5 - test/test_cat_case.jl | 63 --- test/test_data_format.jl | 21 - test/test_distributions_type.jl | 12 - test/test_get_edges_in_groups.jl | 33 -- test/test_swap_workspace.jl | 69 --- test/test_symarray.jl | 81 ++-- 37 files changed, 164 insertions(+), 2839 deletions(-) delete mode 100644 ext/MakieExt.jl delete mode 100644 src/EdgeList.jl delete mode 100644 src/assignment.jl delete mode 100644 src/distributions/cat.jl delete mode 100644 src/distributions/distributions_type.jl delete mode 100644 src/distributions/include.jl delete mode 100644 src/distributions/zero_inflated.jl delete mode 100644 src/estimator/GreedyAverage.jl delete mode 100644 src/estimator/SpectralEstimator.jl delete mode 100644 src/optimization/config_rules/InitRule.jl delete mode 100644 src/optimization/config_rules/accept_rule.jl delete mode 100644 src/optimization/config_rules/bandwidth_selection_rule.jl delete mode 100644 src/optimization/config_rules/include.jl delete 
mode 100644 src/optimization/greedy.jl delete mode 100644 src/optimization/swap_categorical.jl delete mode 100644 src/optimization/swap_workspace.jl create mode 100644 src/utils/config_rules/include.jl rename src/{optimization => utils}/config_rules/stop_rule.jl (93%) rename src/{optimization => utils}/config_rules/swap_rule.jl (77%) delete mode 100644 test/test_cat_case.jl delete mode 100644 test/test_data_format.jl delete mode 100644 test/test_distributions_type.jl delete mode 100644 test/test_get_edges_in_groups.jl delete mode 100644 test/test_swap_workspace.jl diff --git a/Project.toml b/Project.toml index fa036a2..7645576 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,12 @@ name = "NetworkHistogram" uuid = "7806f430-7229-459c-b2e6-df35e8e4eb5d" -version = "0.5.2" +version = "0.6.0" authors = ["Charles Dufour", "Jake Grainger"] [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" -DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" @@ -23,18 +22,15 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [weakdeps] Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" LightMC = "b58f5c6e-c887-41d6-b553-02118416cd5d" -Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" [extensions] BootstrapExt = "Bootstrap" LightMCExt = "LightMC" -MakieExt = "Makie" [compat] Accessors = "0.1.42" ArgCheck = "2.5.0" Clustering = "0.15.8" -DiscretizeDistributions = "0.1.2" IntervalSets = "0.7.11" LinearAlgebra = "1.12.0" Reexport = "1.2.2" diff --git a/ext/LightMCExt.jl b/ext/LightMCExt.jl index 36b6a5f..dea5384 100644 --- a/ext/LightMCExt.jl +++ b/ext/LightMCExt.jl @@ -3,8 +3,6 @@ module LightMCExt using NetworkHistogram using LightMC -import NetworkHistogram: agg_params, logpdf, sample, params, distance, 
_fast_compressed_obs, - from_adjs_to_decorated using LightMC: DiscreteMarkovChain, SampleChain, transition_matrix, ConvertBinaryMC logpdf(d::DiscreteMarkovChain, x) = LightMC.logpdf(d, x) diff --git a/ext/MakieExt.jl b/ext/MakieExt.jl deleted file mode 100644 index 8b5dc46..0000000 --- a/ext/MakieExt.jl +++ /dev/null @@ -1,99 +0,0 @@ -""" -MakieExt - Visualization extension for NetworkHistogram - -Provides plotting capabilities for Assignment and BlockModel objects using Makie.jl. -""" -module MakieExt - -using NetworkHistogram -using Makie -using StatsBase: countmap - -import NetworkHistogram: get_probability_matrix, Assignment, heatmap_params, - number_nodes, number_groups, Dist, unwrap -import Distributions -import StatsAPI - -# Helper functions to extract distribution parameters -vec_mine(x) = vec(x) -vec_mine(x::Real) = x - -_splatter_args(ps) = vcat(vec_mine.(ps)...) -_extract_params(d) = _splatter_args(StatsAPI.params(d)) - -""" - Makie.convert_arguments(::Type{<:AbstractPlot}, a::Assignment) - -Convert an Assignment to plottable data by extracting distribution parameters. -""" -function Makie.convert_arguments(::Type{<:AbstractPlot}, a::Assignment) - params_matrix = map(_extract_params, get_probability_matrix(a)) - ps = (getindex.(params_matrix, i) for i in 1:length(params_matrix[1, 2])) - return ps -end - -""" - heatmap_params(a; colormap=:binary, ordering=false, colorrange=nothing, group_match=1:number_groups(a)) - -Create a heatmap visualization of distribution parameters in an Assignment. 
- -# Arguments -- `a::Assignment`: The assignment to visualize -- `colormap`: Color scheme for the heatmap (default: :binary) -- `ordering::Bool`: Whether to sort nodes by their group labels (default: false) -- `colorrange`: Range for color mapping (default: auto-computed from data) -- `group_match`: Mapping for group indices (default: identity) - -# Returns -- `Figure`: Makie figure with heatmap(s) showing distribution parameters - -# Note -For multi-parameter distributions, creates a grid of heatmaps, one per parameter. -""" -function heatmap_params(a; colormap = :binary, ordering = false, - colorrange = nothing, group_match = 1:number_groups(a)) - node_labels_new = map(x -> group_match[x], a.node_labels) - - params_matrix = map( - _extract_params, get_probability_matrix(a, nothing, node_labels_new)) - - if ordering - perm = sortperm(a.node_labels) - else - perm = 1:number_nodes(a) - end - params_matrix = params_matrix[perm, perm] - - if isnothing(colorrange) - colorrange = extrema(_splatter_args(params_matrix)) - end - - num_params = length(params_matrix[1, 2]) - # Compute grid dimensions: make as square as possible - rows = floor(Int, sqrt(num_params)) - cols = ceil(Int, num_params / rows) - if rows * cols < num_params - rows += 1 - end - - default_size = 300 - fig = Figure() - - # Create grid of subplots - axes = [Axis(fig[i, j], width = default_size, height = default_size) - for i in 1:rows, j in 1:cols] - - for i in 1:num_params - heatmap!(axes[i], getindex.(params_matrix, i)[perm, perm], - colormap = colormap, colorrange = colorrange) - axes[i].title = "Parameter $i" - end - - Colorbar(fig[1:rows, cols + 1], limits = colorrange, colormap = colormap, - label = "Parameter value", width = ceil(Int, sqrt(default_size))) - resize_to_layout!(fig) - return fig -end - -export heatmap_params -end diff --git a/src/EdgeList.jl b/src/EdgeList.jl deleted file mode 100644 index 3a09a8b..0000000 --- a/src/EdgeList.jl +++ /dev/null @@ -1,195 +0,0 @@ -""" - EdgeList{E} - -A 
memory-efficient adjacency list representation for sparse networks. - -# Fields -- `data::Vector{Vector{E}}`: For each node, stores the edge values to its neighbors -- `name_list::Vector{Vector{Int}}`: For each node, stores the node indices of its neighbors - -# Type Parameters -- `E`: The type of edge values (e.g., Int, Float64, or custom distribution types) - -# Examples -```julia -# From an adjacency matrix -A = [0 1 0; 1 0 1; 0 1 0] -edges = EdgeList(A) - -# Access neighbors of node 1 -neighbor_indices, edge_values = neighbors(edges, 1) - -# Iterate through neighbors -for (neighbor, edge) in iterate_neighbors(edges, 1) - println("Edge to node ", neighbor, " with value ", edge) -end -``` - -See also: [`neighbors`](@ref), [`iterate_neighbors`](@ref), [`get_edge`](@ref) -""" -struct EdgeList{E} - data::Vector{Vector{E}} - name_list::Vector{Vector{Int}} -end - -""" - neighbors(A::EdgeList, i::Int) - -Get the neighbor indices and edge values for node `i`. - -Returns a tuple `(neighbor_indices, edge_values)` where each vector has the same length. - -# Example -```julia -edges = EdgeList(A) -neighbor_nodes, edge_vals = neighbors(edges, 1) -``` -""" -@inline function neighbors(A::EdgeList{E}, i::Int) where {E} - @boundscheck checkbounds(A.data, i) - @boundscheck checkbounds(A.name_list, i) - return A.name_list[i], A.data[i] -end - -""" - iterate_neighbors(A::EdgeList, i::Int) - -Returns an iterator over (neighbor_index, edge_value) pairs for node `i`. - -# Example -```julia -for (j, edge) in iterate_neighbors(edges, i) - # Process edge from i to j -end -``` -""" -@inline iterate_neighbors(A::EdgeList, i::Int) = zip(neighbors(A, i)...) - -""" - edge_type(A::EdgeList{E}) - -Get the element type `E` of edges stored in the EdgeList. -""" -@inline edge_type(A::EdgeList{E}) where {E} = E - -""" - nodes(edgelist::EdgeList) - number_nodes(edgelist::EdgeList) - -Return the number of nodes in the network. 
-""" -@inline nodes(edgelist::EdgeList) = length(edgelist.data) -@inline number_nodes(edgelist::EdgeList) = nodes(edgelist) - -""" - EdgeList(A::AbstractMatrix{<:Union{Missing, E}}) where {E} - -Construct an EdgeList from an adjacency matrix. Missing values are treated as absent edges, -and diagonal entries are excluded (no self-loops). - -# Arguments -- `A::AbstractMatrix`: Adjacency matrix where `missing` indicates absent edges - -# Example -```julia -A = [0 1 missing; 1 0 2; missing 2 0] -edges = EdgeList(A) -``` -""" -function EdgeList(A::AbstractMatrix{<:Union{Missing, E}}) where {E} - _from_adj_to_edge_list(A) -end -EdgeList(adj_list::EdgeList) = adj_list - -""" - get_edge(A::EdgeList{E}, i::Int, j::Int) where {E} - -Get the edge value between nodes `i` and `j`. Returns `zero(E)` if no edge exists or if `i == j`. - -# Arguments -- `A::EdgeList{E}`: The edge list -- `i::Int`: Source node index -- `j::Int`: Target node index - -# Returns -- Edge value of type `E`, or `zero(E)` if no edge exists -""" -function get_edge(A::EdgeList{E}, i::Int, j::Int) where {E} - if i == j - return zero(E) - end - if j ∉ A.name_list[i] && i ∉ A.name_list[j] - return zero(E) - end - for (k, e) in iterate_neighbors(A, i) - if k == j - return e - end - end - return zero(E) # If edge not found in the iteration -end - -# Internal function to convert adjacency matrix to EdgeList format -function _from_adj_to_edge_list( - A::AbstractMatrix, function_to_apply = identity) - n = size(A, 1) - input = findfirst(x -> !ismissing(x), A) - test = function_to_apply(A[input]) - data = Vector{Vector{typeof(test)}}(undef, n) - name_list = Vector{Vector{Int}}(undef, n) - for j in 1:n - data[j] = Vector{typeof(test)}(undef, 0) - name_list[j] = Vector{Int}(undef, 0) - for i in 1:n - # Exclude diagonal and missing edges - if !ismissing(A[i, j]) && i != j - push!(name_list[j], i) - push!(data[j], function_to_apply(A[i, j])) - end - end - end - return EdgeList(data, name_list) -end - -# Internal functions 
for preprocessing edge data -function _fast_compressed_obs(d::Dist, A::AbstractMatrix, zeroinflated) - _from_adj_to_edge_list(A, x -> _fast_compressed_obs(d, x, zeroinflated)) -end -function _fast_compressed_obs(d::Dist, A::EdgeList{E}, zeroinflated) where {E} - _make_shift_broadcast(A.data, x -> _fast_compressed_obs(d, x, zeroinflated)) -end - -# Internal function to apply a transformation to EdgeList data -function _make_shift_broadcast(A::EdgeList, f) - n = length(A.data) - test = f(A.data[1][1]) - data = Vector{Vector{typeof(test)}}(undef, n) - for j in 1:n - data[j] = f.(A.data[j]) - end - return EdgeList(data, A.name_list) -end - -""" - fit(d::Dist, A::EdgeList{E}) where {E} - -Fit the distribution `d` to each edge in the EdgeList `A`, returning a new EdgeList -where each edge is replaced by its fitted distribution. - -# Arguments -- `d::Dist`: The distribution type to fit -- `A::EdgeList{E}`: EdgeList containing edge observations - -# Returns -- `EdgeList{typeof(d)}`: New EdgeList with fitted distributions -""" -function fit(d::Dist, A::EdgeList{E}) where {E} - new_data = Vector{Vector{typeof(d)}}(undef, length(A.data)) - for j in 1:length(A.data) - new_data[j] = Vector{typeof(d)}(undef, length(A.data[j])) - for (k, e) in enumerate(A.data[j]) - new_data[j][k] = fit(d, e) - end - end - return EdgeList(new_data, A.name_list) -end diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 6ffd742..7d9d408 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -22,28 +22,12 @@ include("utils/include.jl") @reexport using .FastSymArray -include("distributions/include.jl") -include("EdgeList.jl") -include("assignment.jl") -include("optimization/greedy.jl") include("distributions/hist_dist.jl") include("preprocessor/abstractConvertor.jl") -include("preprocessor/categorical.jl") -include("preprocessor/continuous.jl") include("estimator/abstractEstimator.jl") -include("estimator/SpectralEstimator.jl") include("api.jl") -export EdgeList, 
neighbors, nodes, loglikelihood, zero, fit, agg_params, logpdf, pdf, - GreedyParams, nethist, nethist_discrete_edges, ordered_start_labels, RandomGroupSwap, +export GreedyParams, nethist, nethist_discrete_edges, ordered_start_labels, RandomGroupSwap, Strict, PreviousBestValue, nethist_binary_edges -function from_adjs_to_decorated end - -function heatmap_params end - -export from_adjs_to_decorated, heatmap_params - -export NethistResult - end diff --git a/src/api.jl b/src/api.jl index 6ebb75d..44a9f9b 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,118 +1,3 @@ -""" - nethist(data_input, dist_user, initial_node_labels, params::GreedyParams, zero_inflated::Bool = false) - -Estimate a network histogram (stochastic block model) from network data. - -This is the main entry point for fitting a network histogram to your data. It performs -preprocessing, optimization, and returns an Assignment representing the estimated model. - -# Arguments -- `data_input`: Network data (adjacency matrix or EdgeList) -- `dist_user`: Reference distribution for edge values (e.g., Bernoulli, Categorical) -- `initial_node_labels`: Initial group assignment for nodes (vector of integers 1:k) -- `params::GreedyParams`: Optimization parameters -- `zero_inflated::Bool`: Whether to use zero-inflated version of distribution (default: false) - -# Returns -- `Assignment`: The fitted network histogram with optimized node groups and parameters - -# Throws -- `ArgumentError`: If input validation fails - -# Examples -```julia -using NetworkHistogram, LinearAlgebra -import NetworkHistogram: nethist, GreedyParams - -# Binary network -A = Symmetric(rand(0:1, 100, 100)) -A[diagind(A)] .= 0 - -# Initial partition into 3 groups -initial_labels = rand(1:3, 100) - -# Fit network histogram -result = nethist(A, Bernoulli(0.5), initial_labels, GreedyParams()) - -# Extract results -block_matrix = result.θ -node_groups = result.node_labels -ll = loglikelihood(result) -``` - -See also: [`GreedyParams`](@ref), 
[`Assignment`](@ref), [`BlockModel`](@ref) -""" -function nethist(data_input, dist_user, initial_node_labels, - params::GreedyParams, zero_inflated::Bool = false) - # Input validation - if data_input isa AbstractMatrix - n_rows, n_cols = size(data_input) - if n_rows != n_cols - throw(ArgumentError("Adjacency matrix must be square, got size ($n_rows, $n_cols)")) - end - n = n_rows - elseif data_input isa EdgeList - n = number_nodes(data_input) - else - throw(ArgumentError("data_input must be an AbstractMatrix or EdgeList")) - end - - if length(initial_node_labels) != n - throw(ArgumentError("initial_node_labels length ($(length(initial_node_labels))) must match number of nodes ($n)")) - end - - k = length(unique(initial_node_labels)) - if k < 1 - throw(ArgumentError("Must have at least one group, got $k groups")) - end - if k > n - throw(ArgumentError("Number of groups ($k) cannot exceed number of nodes ($n)")) - end - - if !all(x -> x isa Integer && 1 <= x <= k, initial_node_labels) - throw(ArgumentError("initial_node_labels must contain integers in range 1:$k")) - end - - if params.max_iter < 1 - throw(ArgumentError("max_iter must be positive, got $(params.max_iter)")) - end - - return _nethist( - data_input, dist_user, initial_node_labels, params, Val(zero_inflated)) -end - -# Internal implementation with compile-time zero-inflation flag -function _nethist(data_input, dist_user, initial_node_labels, - params::GreedyParams, zero_inflated) - @debug "preprocessing data" - dist = get_ref_dist(dist_user, zero_inflated) - g = preprocess_data(data_input, dist, zero_inflated) - - @debug "started optimization" - out = greedy_optimize(g, initial_node_labels, params) - - @info "finished optimization with loglikelihood $(loglikelihood(out))" - return postprocess(out) -end - -# Helper functions for preprocessing - -function get_ref_dist(dist::D, ::Val{true}) where {D} - return Dist(ZeroInflated(dist)) -end -function get_ref_dist(dist::D, ::Val{false}) where {D} - return 
Dist(dist) -end - -function preprocess_data(data, dist::Dist, zero_inflated) - A = EdgeList(_fast_compressed_obs(dist, data, zero_inflated)) - return A, dist -end - -function postprocess(out) - return out -end - function nethist_binary_edges(A, initial_node_labels, params::GreedyParams) k = length(unique(initial_node_labels)) data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) @@ -170,23 +55,6 @@ struct NethistResult{S} model::S end -function NethistResult(a::Assignment) - return NethistResult(copy(a.node_labels), to_block_model(a)) -end - -function to_block_model(a::Assignment{ - E, Dist{D}}) where {E, D <: Union{Bernoulli, Distributions.Bernoulli}} - sizes = counts(a.node_labels) ./ length(a.node_labels) - θ::Matrix{Float64} = map(x -> first(params(unwrap(x))), a.θ) - return SBM(θ, sizes) -end - -function to_block_model(a::Assignment) - @info "Converting Assignment to DecoratedSBM" - sizes = counts(a.node_labels) ./ length(a.node_labels) - return DecoratedSBM(unwrap.(a.θ), sizes) -end - function node_labels_to_latents(node_labels::AbstractVector{Int}, sbm) return map(label -> _label_to_latent(label, sbm), node_labels) end @@ -208,8 +76,6 @@ function permute!(sbm, perm) sbm.cumsize .= cumsum(sbm.size) end -order_groups(a::Assignment, latents::AbstractVector) = order_groups(a.node_labels, latents) - function order_groups(node_labels, latents::AbstractVector) n = length(node_labels) k = length(unique(node_labels)) diff --git a/src/assignment.jl b/src/assignment.jl deleted file mode 100644 index de8647e..0000000 --- a/src/assignment.jl +++ /dev/null @@ -1,234 +0,0 @@ -""" - Assignment{E, D, F, W, V <: AbstractVector{Int}} - -Represents a network histogram: a partition of nodes into groups along with -edge distributions between groups. 
- -# Fields -- `node_labels::V`: Vector assigning each node to a group (1-indexed) -- `edges::EdgeList{E}`: The observed edge data -- `dists::EdgeList{D}`: Fitted distributions for each edge -- `θ::SymArray{D}`: Symmetric matrix of aggregated distributions between groups -- `log_likelihood::SymArray{F}`: Symmetric matrix of log-likelihoods for each group pair -- `additional_workspace::W`: Optional workspace for optimization algorithms - -# Type Parameters -- `E`: Type of edge observations -- `D`: Type of fitted distributions -- `F`: Type for log-likelihood values (typically Float64) -- `W`: Type for additional workspace data -- `V`: Vector type for node labels - -# Examples -```julia -# Create assignment from node labels and edge data -node_labels = [1, 1, 2, 2, 3] -edges = EdgeList(adjacency_matrix) -dist = Dist(Bernoulli(0.5)) -assignment = Assignment(node_labels, edges, dist) - -# Query assignment properties -k = number_groups(assignment) -n = number_nodes(assignment) -ll = loglikelihood(assignment) -group_i = group(assignment, node_i) -``` - -See also: [`BlockModel`](@ref), [`EdgeList`](@ref), [`Dist`](@ref) -""" -mutable struct Assignment{E, D, F, W, V <: AbstractVector{Int}} - node_labels::V - const edges::EdgeList{E} - const dists::EdgeList{D} - θ::SymArray{D} - log_likelihood::SymArray{F} - additional_workspace::W -end - -# Default capacity for edge collection - typical node degree in sparse networks -const DEFAULT_EDGE_CAPACITY = 32 - -""" - number_nodes(a::Assignment) - -Return the number of nodes in the network. -""" -@inline number_nodes(a::Assignment) = length(a.node_labels) - -""" - number_groups(a::Assignment) - -Return the number of groups (blocks) in the partition. -""" -@inline number_groups(a::Assignment) = size(a.θ, 1) - -""" - proportions(a::Assignment) - -Calculate the proportion of nodes in each group. 
- -# Returns -- Vector of proportions summing to 1.0 -""" -function proportions(a::Assignment) - return counts(a.node_labels) / number_nodes(a) -end - -""" - loglikelihood(a::Assignment) - -Calculate the total log-likelihood of the assignment. - -The log-likelihood measures how well the stochastic block model (with the current -node partition) fits the observed network data. - -# Returns -- `Float64`: Total log-likelihood value -""" -function loglikelihood(a::Assignment) - return FastSymArray.sum_tri_with_diag(a.log_likelihood) -end - -""" - group(a::Assignment, node::Int) - -Get the group label for a specific node. - -# Arguments -- `a::Assignment`: The assignment -- `node::Int`: Node index (1-indexed) - -# Returns -- `Int`: Group index that the node belongs to -""" -function group(a::Assignment, node::Int) - @boundscheck checkbounds(a.node_labels, node) - @inbounds return a.node_labels[node] -end - -""" - get_edges_in_groups(a::Assignment, g1::Int, g2::Int) - -Extract all edges between two groups. - -# Arguments -- `a::Assignment`: The assignment -- `g1::Int`: First group index -- `g2::Int`: Second group index - -# Returns -- `Vector{E}`: Vector of edge values between the two groups - -# Note -For within-group edges (g1 == g2), only returns edges where i < j to avoid duplicates. 
-""" -function get_edges_in_groups(a::Assignment, g1::Int, g2::Int) - return get_edges_in_groups(a.node_labels, a.edges, g1, g2) -end - -function get_edges_in_groups(node_labels, edges_all, g1, g2) - edges = Vector{edge_type(edges_all)}() - # Pre-size the vector to avoid repeated reallocations - sizehint!(edges, DEFAULT_EDGE_CAPACITY) - - @inbounds for u in eachindex(node_labels) - if node_labels[u] != g1 - continue - end - for (v, e) in iterate_neighbors(edges_all, u) - if node_labels[v] == g2 && ((g1 == g2 && u < v) || g1 != g2) - push!(edges, e) - end - end - end - return edges -end - -""" - Assignment(node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} - -Construct an Assignment from node labels, edge data, and a reference distribution. - -This constructor fits the distribution to the data, computes the block-level parameters -θ, and calculates the log-likelihood. - -# Arguments -- `node_labels`: Vector of group assignments for each node -- `edge_list::EdgeList{E}`: Edge observations -- `dist::Dist{D}`: Reference distribution to fit to the data - -# Example -```julia -node_labels = [1, 1, 2, 2] -edges = EdgeList(A) -dist = Dist(Bernoulli(0.5)) -assignment = Assignment(node_labels, edges, dist) -``` -""" -function Assignment( - node_labels, edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} - dists = fit(dist, edge_list) - θ, ll = _compute_theta_and_ll(node_labels, dists, edge_list, dist) - return Assignment(node_labels, edge_list, dists, θ, ll, nothing) -end - -# Internal function to compute θ parameters and log-likelihood for each group pair -function _compute_theta_and_ll(node_labels, dists::EdgeList{Dist{D}}, - edge_list::EdgeList{E}, dist::Dist{D}) where {E, D} - number_groups = length(unique(node_labels)) - θ = SymArray(number_groups, zero(dist)) - log_likelihood = SymArray(number_groups, 0.0) - - # Aggregate distributions for each group pair - for u in 1:nodes(dists) - g1 = node_labels[u] - for (v, d) in iterate_neighbors(dists, u) - g2 = 
node_labels[v] - if u < v - θ[g1, g2] = add_to(θ[g1, g2], d) - end - end - end - - # Compute log-likelihood for each group pair - for u in 1:nodes(dists) - g1 = node_labels[u] - for (v, e) in iterate_neighbors(edge_list, u) - g2 = node_labels[v] - if u > v - log_likelihood[g1, g2] += logpdf(θ[g1, g2], e) - else - break - end - end - end - return θ, log_likelihood -end - -function get_probability_matrix( - a::Assignment, default_dist = nothing, node_labels = a.node_labels) - θ = unwrap.(a.θ) - D = typeof(first(θ)) - if isnothing(default_dist) - try - default_dist = zero(θ[1, 1]) - catch e - if !isa(e, MethodError) - rethrow(e) - end - error("Please provide a default distribution for the diagonal as it could not be inferred") - end - end - n = length(node_labels) - A = Array{D, 2}(undef, n, n) - for j in 1:n - for i in 1:n - if i == j - A[i, i] = default_dist - else - A[i, j] = θ[node_labels[i], node_labels[j]] - end - end - end - return A -end diff --git a/src/distributions/cat.jl b/src/distributions/cat.jl deleted file mode 100644 index c902f7e..0000000 --- a/src/distributions/cat.jl +++ /dev/null @@ -1,65 +0,0 @@ -const Cat{M, T} = Categorical{T, SVector{M, T}} - -function Cat(p::SVector{M, T}) where {M, T} - return Categorical(p) -end - -function Base.show(io::IO, c::Cat) - Base.print(io, "Cat($(c.p))") -end - -num_categories(::Type{Cat{M, T}}) where {M, T} = M -num_categories(::Cat{M, T}) where {M, T} = M -zero(c::Cat{M, T}) where {M, T} = Cat(ones(typeof(c.p)) ./ M) -sample(c::Cat) = rand(c) -function fit(c::Cat{M, T}, xs::AbstractVector{Int}) where {M, T} - total = length(xs) - if total == 0 - return zero(c) - end - return Cat(SVector{M}(counts(xs, M) ./ total)) -end - -function fit(::Cat{M, T}, x::Int) where {M, T} - ps = zeros(T, M) - ps[x] = one(T) - return Cat(SVector{M}(ps)) -end - -function _xlogy(x, y) - if x == 0 - return zero(y) - end - return x * log(y) -end - -function logpdf_cat(p::AbstractVector, obs::Int) - return log(p[obs]) -end - -# Efficient 
log-likelihood computation for categorical observations -# Uses xlogy(x,y) = x*log(y) which handles edge cases properly -function logpdf_cat(p::AbstractVector, count_observed::AbstractVector) - return sum(_xlogy.(count_observed, p)) -end - -distance(c1::Cat{M, V}, c2::Cat{M, V}) where {M, V} = sum(abs.(c1.p .- c2.p)) - -function get_ref_dist(dist::Categorical, ::Val{true}) - return Dist(Cat(SVector{ncategories(dist) + 1}(0.0, dist.p...))) -end - -function get_ref_dist(dist::Categorical, ::Val{false}) - return Dist(Cat(SVector{ncategories(dist)}(dist.p))) -end - -_fast_compressed_obs(d::Categorical, x::Int, ::Val{true}) = x + one(x) -_fast_compressed_obs(d::Categorical, x::Int, ::Val{false}) = x - -function tv_distance(c1::Cat, c2::Cat) - return sum(abs.(c1.p .- c2.p)) / 2 -end - -function l2_distance(c1::Cat, c2::Cat) - return sqrt(sum((c1.p .- c2.p) .^ 2)) -end diff --git a/src/distributions/distributions_type.jl b/src/distributions/distributions_type.jl deleted file mode 100644 index bec1f42..0000000 --- a/src/distributions/distributions_type.jl +++ /dev/null @@ -1,262 +0,0 @@ -""" - Dist{D} - -A wrapper for distributions that tracks aggregation statistics. - -This type wraps a distribution `D` and maintains a count of how many observations -have been aggregated into it. This is essential for the network histogram algorithm -which needs to efficiently update distributions as nodes move between groups. - -# Fields -- `dist::D`: The underlying distribution -- `counts::Int`: Number of observations aggregated into this distribution (must be ≥ 0) - -# Type Parameters -- `D`: The type of the underlying distribution (e.g., Bernoulli, Categorical, etc.) 
- -# Constructors -```julia -# With explicit count -Dist(distribution, counts::Int) - -# Single observation (count = 1) -Dist(distribution) -``` - -# Examples -```julia -# Wrap a Bernoulli distribution -d = Dist(Bernoulli(0.5)) - -# Create a zero distribution -d_zero = zero(d) - -# Add observations -d_updated = add_to(d, Bernoulli(0.7)) - -# Remove observations -d_reduced = remove_from(d_updated, Bernoulli(0.7)) -``` - -See also: [`add_to`](@ref), [`remove_from`](@ref), [`zero`](@ref) -""" -struct Dist{D} - dist::D - counts::Int - function Dist(d, counts::Int) - if counts < 0 - throw(ArgumentError("Counts ($counts) cannot be negative")) - end - new{typeof(d)}(d, counts) - end -end - -function Base.show(io::IO, d::Dist) - print(io, "$(d.dist)") -end - -""" - Dist(d) - -Create a Dist with a single observation (count = 1). -""" -Dist(d) = Dist(d, 1) - -""" - zero(d::Dist) - -Create a zero-initialized distribution with 0 counts. -""" -zero(d::Dist) = Dist(zero(d.dist), 0) - -Base.broadcastable(x::Dist) = Ref(x) - -""" - add_to(avgdist::Dist{D}, dist::D) where {D} - -Add a new observation to an aggregated distribution. - -Updates the distribution parameters using weighted averaging based on the count. -The new observation has weight 1/(counts+1) and the existing distribution has -weight counts/(counts+1). 
- -# Arguments -- `avgdist::Dist{D}`: The current aggregated distribution -- `dist::D`: The new distribution to add - -# Returns -- `Dist{D}`: Updated distribution with incremented count - -# Example -```julia -d = Dist(Bernoulli(0.5), 2) # 2 observations with mean 0.5 -d_new = add_to(d, Bernoulli(0.8)) # Add observation with value 0.8 -# Result: Dist with 3 observations and mean (2*0.5 + 1*0.8)/3 ≈ 0.6 -``` -""" -function add_to(avgdist::Dist{D}, dist::D) where {D} - inner_dist = agg_params( - avgdist.dist, dist, avgdist.counts / (avgdist.counts + 1), - 1 / (avgdist.counts + 1)) - return Dist(inner_dist, avgdist.counts + 1) -end - -""" - remove_from(avgdist::Dist{D}, dist::D) where {D} - -Remove an observation from an aggregated distribution. - -Updates the distribution parameters by removing the contribution of `dist` from -the aggregate, using appropriate weight adjustments. - -# Arguments -- `avgdist::Dist{D}`: The current aggregated distribution -- `dist::D`: The distribution to remove - -# Returns -- `Dist{D}`: Updated distribution with decremented count - -# Note -Throws an error if attempting to remove from a distribution with 0 counts. -""" -function remove_from(avgdist::Dist{D}, dist::D) where {D} - if avgdist.counts <= 0 - error("Cannot remove from a distribution with 0 counts") - end - return Dist( - agg_params( - avgdist.dist, dist, avgdist.counts / max(1, (avgdist.counts - 1)), - -1 / max(1, (avgdist.counts - 1))), - avgdist.counts - 1) -end - -""" - add_to(avgdist::Dist{D}, dist::Dist{D}) where {D} - -Add two Dist objects together, properly accounting for their counts. 
- -# Arguments -- `avgdist::Dist{D}`: First distribution -- `dist::Dist{D}`: Second distribution to add - -# Returns -- `Dist{D}`: Combined distribution with summed counts -""" -function add_to(avgdist::Dist{D}, dist::Dist{D}) where {D} - Dist( - agg_params( - avgdist.dist, dist.dist, avgdist.counts / - (avgdist.counts + dist.counts), - dist.counts / (avgdist.counts + dist.counts)), - avgdist.counts + dist.counts) -end - -""" - remove_from(avgdist::Dist, dist::Dist) - -Remove one Dist from another, properly accounting for their counts. -""" -function remove_from(avgdist::Dist, dist::Dist) - Dist( - agg_params( - avgdist.dist, dist.dist, - avgdist.counts / max(1, (avgdist.counts - dist.counts)), - -dist.counts / max(1, (avgdist.counts - dist.counts))), - avgdist.counts - dist.counts) -end - -""" - _fast_compressed_obs(d, x, zero_inflated) - -Compress observations for efficient storage and computation. - -By default, returns `x` unchanged. Distributions can override this to implement -custom compression strategies. -""" -_fast_compressed_obs(d, x, zero_inflated) = x - -# Delegate common operations to the underlying distribution -for f in [:logpdf, :sample, :distance, :eltype, :params, :_fast_compressed_obs] - @eval $f(d::Dist, args...) = $f(d.dist, args...) -end - -""" - fit(d::Dist, x) - -Fit the underlying distribution to observation(s) `x`, preserving the count. -""" -fit(d::Dist, x) = Dist(fit(d.dist, x), d.counts) - -""" - loglikelihood(d::Dist, x) - -Compute the log-likelihood of observation(s) `x` under distribution `d`. - -# Returns -- `Float64`: Sum of log-probabilities, or 0.0 if x is empty -""" -loglikelihood(d::Dist, x) = isempty(x) ? 0.0 : sum(logpdf(d, y) for y in x) - -""" - unwrap(d::Dist) - -Extract the underlying distribution from a Dist wrapper. 
-""" -unwrap(d::Dist) = d.dist - -Base.promote_rule(::Type{Dist{D}}, ::Type{D}) where {D} = D -Base.convert(::Type{D}, d::Dist{D}) where {D} = d.dist - -""" - Bernoulli{T <: Real} - -A simple Bernoulli distribution for binary (0/1) edges. - -# Fields -- `p::T`: Success probability (probability of edge = 1) - -# Example -```julia -b = Bernoulli(0.3) # 30% chance of edge -edge = sample(b) # Returns true or false -ll = logpdf(b, true) # Log probability of observing an edge -``` - -# Interface Requirements -For a distribution to work with NetworkHistogram, it must implement: -- `zero(d)`: Return a zero-initialized distribution -- `agg_params(d1, d2, w1, w2)`: Aggregate two distributions with weights -- `fit(d, x)`: Fit distribution to observation(s) -- `distance(d1, d2)`: Distance metric between distributions -- `logpdf(d, x)`: Log probability density/mass function -- `params(d)`: Return tuple of parameters -- `eltype(d)`: Return element type -- `sample(d)`: Generate a random sample -""" -struct Bernoulli{T <: Real} - p::T - function Bernoulli(p::T) where {T <: Real} - if isnan(p) || isinf(p) - throw(ArgumentError("Bernoulli parameter p=$p must be finite")) - end - if !(0 <= p <= 1) - throw(ArgumentError("Bernoulli parameter p=$p must be in [0, 1]")) - end - new{T}(p) - end -end - -zero(d::Bernoulli) = Bernoulli(zero(d.p)) -zero(::Type{Bernoulli{T}}) where {T} = Bernoulli(zero(T)) -function agg_params(d1::Bernoulli, d2::Bernoulli, w1, w2) - p = w1 * d1.p + w2 * d2.p - # Clamp to [0, 1] to handle floating-point arithmetic errors - p = clamp(p, 0.0, 1.0) - Bernoulli(p) -end -fit(::Bernoulli, x) = Bernoulli(mean(x)) -distance(d1::Bernoulli, d2::Bernoulli) = abs(d1.p - d2.p) -logpdf(d::Bernoulli, x) = log(d.p * x + (1 - d.p) * (1 - x)) -params(d::Bernoulli) = (d.p,) -eltype(d::Bernoulli) = Bool -sample(d::Bernoulli) = rand() <= d.p diff --git a/src/distributions/include.jl b/src/distributions/include.jl deleted file mode 100644 index 2ba3979..0000000 --- 
a/src/distributions/include.jl +++ /dev/null @@ -1,3 +0,0 @@ -include("distributions_type.jl") -include("zero_inflated.jl") -include("cat.jl") diff --git a/src/distributions/zero_inflated.jl b/src/distributions/zero_inflated.jl deleted file mode 100644 index bbdaf90..0000000 --- a/src/distributions/zero_inflated.jl +++ /dev/null @@ -1,99 +0,0 @@ - -## TODO: define proper distribution that is zero inflated? - -# struct ZIDist{S, F} <: UniVariateDistribution{S} -# dist::UnivariateDistribution{S} -# proba_zero::F -# end - -struct ZeroInflated{D, F} - dist::D - proba_zero::F -end - -struct SampleZI{F} - value::F - iszero::Bool -end - -function ZeroInflated(dist) - return ZeroInflated(dist, 0.0) -end - -function logpdf(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} - if x.iszero - return log(zi.proba_zero) - else - return log(1 - zi.proba_zero) + logpdf(zi.dist, x.value) - end -end - -# function logpdf(zi::ZeroInflated{D, F}, x) where {D, F} -# if iszero(x) -# return log(zi.proba_zero) -# else -# return log(1 - zi.proba_zero) + logpdf(zi.dist, x.value) -# end -# end - -function agg_params( - zi1::ZeroInflated{D, F}, zi2::ZeroInflated{D, F}, w1, w2) where {D, F} - new_proba_zero = w1 * zi1.proba_zero + w2 * zi2.proba_zero - return ZeroInflated( - agg_params(zi1.dist, zi2.dist, w1, w2), - new_proba_zero) -end - -zero(zi::ZeroInflated) = ZeroInflated(zero(zi.dist), 0.0) - -eltype(zi::ZeroInflated{D, F}) where {D, F} = SampleZI{eltype(D)} -params(zi::ZeroInflated{D, F}) where {D, F} = (params(zi.dist)..., zi.proba_zero) - -function fit(zi::ZeroInflated{D, F}, x::SampleZI) where {D, F} - if x.iszero - return ZeroInflated(zero(zi.dist), 1.0) - else - return ZeroInflated(fit(zi.dist, x.value), 0.0) - end -end - -function _fast_compressed_obs(zi::ZeroInflated, x, zero_inflated; filter = iszero) - return SampleZI(_fast_compressed_obs(zi.dist, x, zero_inflated), filter(x)) -end - -function unwrap(d::Dist{ZeroInflated{B, D}}) where {B, D} - #yeah I know again... 
- return d.dist.dist -end - -function get_proportion_observed(d::Dist{ZeroInflated{B, D}}) where {B, D} - return (1 - d.dist.proba_zero) * d.counts -end - -function get_proportion_observed(d::Dist) - return d.counts -end - -function sample(zi::ZeroInflated{D, F}, args...) where {D, F} - if rand() < zi.proba_zero - return SampleZI(zero(eltype(zi.dist)), true) - else - return SampleZI(sample(zi.dist, args...), false) - end -end - -# function fit(zd::ZeroInflated, x::SampleZI) -# if x.iszero -# return ZeroInflated(zero(zd.dist), 1.0) -# else -# return ZeroInflated(fit(zd.dist, x.value), 0.0) -# end -# end - -# function fit(zd::ZeroInflated{D, F}, x) where {D, F} -# if iszero(x) -# return ZeroInflated(zero(zd.dist), 1.0) -# else -# return ZeroInflated(fit(zd.dist, x), 0.0) -# end -# end diff --git a/src/estimator/GreedyAverage.jl b/src/estimator/GreedyAverage.jl deleted file mode 100644 index e679119..0000000 --- a/src/estimator/GreedyAverage.jl +++ /dev/null @@ -1,353 +0,0 @@ -""" - GreedyAverage{C, S, NodeR, StopR} - -Greedy optimization estimator for Stochastic Block Models using sum-of-squares loss. - -This estimator uses a greedy node-swapping algorithm to minimize the loss function: - L = (1/n_edges) * Σᵢⱼ [count(i,j) - ||realized(i,j)||²/count(i,j)] - -The algorithm iteratively swaps nodes between groups to improve the block model fit. 
- -# Type Parameters -- `C`: Type for count matrices (usually symmetric array of integers) -- `S`: Type for realized value matrices (usually symmetric array of vectors) -- `NodeR <: NodeSwapRule`: Rule for selecting which nodes to swap -- `StopR <: StopRule`: Rule for determining when to stop optimization - -# Fields -- `counts::C`: Number of possible edges between each pair of groups -- `counts_swap::C`: Working copy of counts for swap evaluation -- `realized::S`: Sum of observed edge values between each pair of groups -- `realized_swap::S`: Working copy of realized values for swap evaluation -- `max_iter::Int`: Maximum number of iterations -- `node_swap_rule::NodeR`: Strategy for selecting nodes to swap -- `stop_rule::StopR`: Criterion for early stopping - -# Example -```julia -k = 5 # number of groups -counts = SymArray(k, 0) -counts_swap = SymArray(k, 0) -realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) -realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) - -estimator = GreedyAverage( - counts, counts_swap, realized, realized_swap, - max_iter=100_000, - node_swap_rule=RandomGroupSwap(), - stop_rule=PreviousBestValue(1000, Inf, :min) -) - -labels = estimate(estimator, data, initial_labels) -``` -""" -struct GreedyAverage{C, S, NodeR <: NodeSwapRule, StopR <: StopRule} <: SBMEstimator - counts::C - counts_swap::C - realized::S - realized_swap::S - max_iter::Int - node_swap_rule::NodeR - stop_rule::StopR -end - -""" - score(estimator::GreedyAverage) - -Compute the current objective value (loss) for the estimator. - -Lower values indicate better fit to a block model structure. -""" -function score(estimator::GreedyAverage) - return loss_function(estimator.realized, estimator.counts) -end - -""" - init!(estimator::GreedyAverage, data, initial_labels) - -Initialize the estimator's count and realized value matrices from data. 
- -Iterates through the upper triangle of the adjacency matrix (i < j) to avoid -double-counting edges in undirected graphs. Updates both the main and swap -workspace matrices. - -# Arguments -- `estimator::GreedyAverage`: The estimator to initialize -- `data::AbstractMatrix`: Network adjacency matrix -- `initial_labels::Vector{Int}`: Initial group assignments for nodes -""" -function init!(estimator::GreedyAverage, data, initial_labels) - # Iterate over upper triangle to avoid double-counting edges - @inbounds for j in axes(data, 2) - label_j = initial_labels[j] - for i in 1:(j - 1) # More efficient than i < j check inside loop - edge_value = data[i, j] - if !isnothing(edge_value) - label_i = initial_labels[i] - - # Update both main and swap workspaces - add_realized(estimator.realized[label_i, label_j], edge_value) - add_realized(estimator.realized_swap[label_i, label_j], edge_value) - add_counts!(estimator.counts, edge_value, label_i, label_j) - add_counts!(estimator.counts_swap, edge_value, label_i, label_j) - end - end - end -end - -""" - estimate(estimator::GreedyAverage, data, initial_labels; progress=true) - -Estimate node group assignments using greedy optimization with node swapping. - -# Algorithm -The algorithm proceeds as follows: -1. Initialize count and realized value matrices from data and initial labels -2. For each iteration: - a. Select two nodes to swap according to the swap rule - b. Tentatively swap them and update statistics - c. Accept swap if it improves the loss, otherwise revert - d. Check stopping criterion -3. 
Return final node labels - -# Arguments -- `estimator::GreedyAverage`: The estimator with configuration -- `data::AbstractMatrix`: Network adjacency matrix (n × n) -- `initial_labels::Vector{Int}`: Initial group assignments (length n) -- `progress::Bool`: Whether to show progress bar (default: true) - -# Returns -- `node_labels::Vector{Int}`: Optimized group assignments for each node -""" -function estimate(estimator::GreedyAverage, data, initial_labels; progress = false) - # Initialize counts and realized values from data - init!(estimator, data, initial_labels) - initialise_stop_rule!(estimator.stop_rule, estimator) - - # Compute initial loss - current_loss = score(estimator) - - # Start with initial labeling - node_labels = copy(initial_labels) - - # Progress tracking - pbar = ProgressUnknown( - enabled = progress, - showspeed = true, - desc = "Greedy search: " - ) - - # Update progress bar only every N iterations to reduce overhead - progress_update_interval = max(1, estimator.max_iter ÷ 5000) - - # Main optimization loop - for iter in 1:(estimator.max_iter) - # Select two nodes to potentially swap - index1, index2 = select_indices_swap(node_labels, estimator.node_swap_rule) - - group1 = node_labels[index1] - group2 = node_labels[index2] - - # Only process if nodes are in different groups - if group1 != group2 - # Update swap workspace to reflect the proposed swap - # Using @inbounds for performance - loop bounds are guaranteed safe - @inbounds for j in axes(data, 1) - # Skip the swapped nodes themselves - if j == index1 || j == index2 - continue - end - - group_j = node_labels[j] - edge_val_1 = data[j, index1] - edge_val_2 = data[j, index2] - - # Update for node1: remove from group1, add to group2 - # TODO: duplicate for each edge and only iterate over non-zeros (i.e. edge with value and nothing!) 
- if !isnothing(edge_val_1) - remove_realized(estimator.realized_swap[group1, group_j], edge_val_1) - remove_counts!(estimator.counts_swap, edge_val_1, group1, group_j) - add_realized(estimator.realized_swap[group2, group_j], edge_val_1) - add_counts!(estimator.counts_swap, edge_val_1, group2, group_j) - end - - # Update for node2: remove from group2, add to group1 - if !isnothing(edge_val_2) - remove_realized(estimator.realized_swap[group2, group_j], edge_val_2) - remove_counts!(estimator.counts_swap, edge_val_2, group2, group_j) - add_realized(estimator.realized_swap[group1, group_j], edge_val_2) - add_counts!(estimator.counts_swap, edge_val_2, group1, group_j) - end - end - - # Tentatively apply swap - node_labels[index1] = group2 - node_labels[index2] = group1 - - # Compute new loss - new_loss = loss_function(estimator.realized_swap, estimator.counts_swap) - - # Accept or reject swap - if new_loss < current_loss - # Accept: commit swap to main workspace - deepcopy!(estimator.realized, estimator.realized_swap) - copy!(estimator.counts, estimator.counts_swap) - current_loss = new_loss - else - # Reject: revert labels and workspace - node_labels[index1] = group1 - node_labels[index2] = group2 - deepcopy!(estimator.realized_swap, estimator.realized) - copy!(estimator.counts_swap, estimator.counts) - end - end - - # Update progress bar - - # Update progress bar only periodically to reduce overhead - if progress && (iter % progress_update_interval == 0 || iter == estimator.max_iter) - update!( - pbar, iter; - showvalues = [ - ("loss", current_loss), - info_to_print(estimator.stop_rule) - ]) - end - - # Check stopping criterion - if stopping_rule(current_loss, estimator.stop_rule) - break - end - end - finish!(pbar) - @info "Optimization finished. Final loss: $current_loss" - return node_labels -end - -""" - loss_function(realized, counts) - -Compute the normalized sum-of-squares loss for block model fitting. 
- -The loss measures how well a block model fits the data by computing: - L = (1/N) * Σᵢⱼ [count(i,j) - ||realized(i,j)||²/count(i,j)] - -where the sum is over the upper triangle (i ≤ j) to avoid double-counting. - -# Mathematical Interpretation -For each pair of groups (i,j): -- `count(i,j)` is the number of edges between groups i and j -- `realized(i,j)` is a vector of observed edge values -- The term `||realized(i,j)||²/count(i,j)` measures concentration of values -- Lower loss indicates better block structure (more homogeneous within blocks) - -# Arguments -- `realized`: Symmetric array of realized edge value sums between groups -- `counts`: Symmetric array of edge counts between groups - -# Returns -- Normalized loss value (lower is better) - -# !warning - This will need to be modified for other data types! -""" -@inline function loss_function(realized, counts::AbstractArray{<:Real}) - total_loss = 0.0 - total_edges = 0.0 - - @inbounds for j in axes(realized, 2) - for i in 1:j - n_edges = counts[i, j] - if n_edges > 0 - inter = n_edges - sum(abs2, realized[i, j]) / n_edges - total_loss += inter - total_edges += n_edges - end - end - end - return total_edges > 0 ? 
total_loss / total_edges : 0.0 -end - -# # this assumes that sum realized = counts -# @inline function loss_function(realized, counts) -# total_loss = 0.0 -# total_edges = 0.0 -# @inbounds for j in axes(realized, 2) -# for i in 1:j -# for m in eachindex(realized[i, j]) -# total_edges += realized[i, j][m] -# total_loss += realized[i, j][m] * -# (1 - -# _fast_div_(realized[i, j][m], counts[i, j][m])) -# end -# end -# end -# return total_loss / total_edges -# end - -@inline function _fast_div_(num::Real, denom::Real) - num == 0.0 && denom == 0.0 && return 0.0 - return num / denom -end - -# ============================================================================ -# Count manipulation helpers -# ============================================================================ - -""" - add_realized(parameter::AbstractArray, data_value::AbstractArray) - -Add array data value to parameter array (for categorical edge values). -""" -@inline function add_realized(parameter::AbstractArray, data_value::AbstractArray) - @inbounds parameter .+= data_value -end - -""" - remove_realized(parameter::AbstractArray, data_value::AbstractArray) - -Remove array data value from parameter array (for categorical edge values). -""" -@inline function remove_realized(parameter::AbstractArray, data_value::AbstractArray) - @inbounds parameter .-= data_value -end - -""" - add_realized(parameter::AbstractArray, data_value::Real) - -Increment the count for a specific category (for categorical edge values). -""" -@inline function add_realized(parameter::AbstractArray, data_value::Real) - @inbounds parameter[data_value] += 1 -end - -""" - remove_realized(parameter::AbstractArray, data_value::Real) - -Decrement the count for a specific category (for categorical edge values). 
-""" -@inline function remove_realized(parameter::AbstractArray, data_value::Real) - @inbounds parameter[data_value] -= 1 -end - -@inline function add_counts!( - counts::AbstractArray{T}, data_value::Real, group_i::Int, group_j::Int) where {T <: - Real} - @inbounds counts[group_i, group_j] += one(T) -end - -@inline function remove_counts!( - counts::AbstractArray{T}, data_value::Real, group_i::Int, group_j::Int) where {T <: - Real} - @inbounds counts[group_i, group_j] -= one(T) -end - -@inline function add_counts!( - counts::AbstractArray, data_value, group_i::Int, group_j::Int) - @inbounds counts[group_i, group_j] .+= 1#data_value -end - -@inline function remove_counts!( - counts::AbstractArray, data_value, group_i::Int, group_j::Int) - @inbounds counts[group_i, group_j] .-= 1#data_value -end diff --git a/src/estimator/GreedySuffStats.jl b/src/estimator/GreedySuffStats.jl index 2dd04cb..841d37f 100644 --- a/src/estimator/GreedySuffStats.jl +++ b/src/estimator/GreedySuffStats.jl @@ -237,6 +237,7 @@ function estimate!( # Initialize node labels node_labels = copy(node_labels_init) n = length(node_labels) + k = length(unique(node_labels)) n_edges = n * (n - 1) / 2 init!(es, data, node_labels) @@ -256,7 +257,7 @@ function estimate!( # Main optimization loop for iter in 1:(es.max_iter) # Select two nodes to potentially swap - index1, index2 = select_indices_swap(node_labels, es.node_swap_rule) + index1, index2 = select_indices_swap(node_labels, es.node_swap_rule, k) group1 = node_labels[index1] group2 = node_labels[index2] diff --git a/src/estimator/SpectralEstimator.jl b/src/estimator/SpectralEstimator.jl deleted file mode 100644 index fd46d6e..0000000 --- a/src/estimator/SpectralEstimator.jl +++ /dev/null @@ -1,413 +0,0 @@ -""" - SpectralEstimator{T} - -Spectral clustering estimator for Stochastic Block Models (SBM). - -This estimator uses spectral clustering to partition nodes into groups based on -the graph structure. 
It computes the normalized Laplacian and performs k-means -clustering on the top k eigenvectors. - -# Fields -- `k::Int`: Number of groups/communities to detect -- `eig_size::Int`: Number of eigenvectors to use (default: k) -- `adjacency_type::Symbol`: Type of adjacency matrix to use (`:binary`, `:weighted`) -- `laplacian_type::Symbol`: Type of Laplacian (`:normalized`, `:unnormalized`) -- `max_kmeans_iter::Int`: Maximum iterations for k-means clustering -- `balanced::Bool`: If true, forces balanced community sizes (default: false) - -# Example -```julia -# Binary adjacency matrix with balanced communities -A = [0 1 1 0; 1 0 1 0; 1 1 0 1; 0 0 1 0] -estimator = SpectralEstimator(2, balanced=true) -labels = estimate(estimator, A) -``` -""" -struct SpectralEstimator{T <: Real} <: SBMEstimator - k::Int - eig_size::Int - adjacency_type::Symbol - laplacian_type::Symbol - max_kmeans_iter::Int - balanced::Bool - - function SpectralEstimator(k::Int; - eig_size::Int = k, - adjacency_type::Symbol = :binary, - laplacian_type::Symbol = :normalized, - max_kmeans_iter::Int = 100, - balanced::Bool = false) - @argcheck k>0 "Number of groups k must be positive" - @argcheck adjacency_type in [:binary, :weighted] "adjacency_type must be :binary or :weighted" - @argcheck laplacian_type in [:normalized, :unnormalized] "laplacian_type must be :normalized or :unnormalized" - @argcheck max_kmeans_iter>0 "max_kmeans_iter must be positive" - new{Float64}(k, eig_size, adjacency_type, laplacian_type, max_kmeans_iter, balanced) - end -end - -""" - estimate(estimator::SpectralEstimator, data; progress = true) - -Perform spectral clustering on the network data. - -# Arguments -- `estimator::SpectralEstimator`: The spectral estimator configuration -- `data`: The adjacency matrix or network data -- `progress::Bool`: Whether to show progress information (for compatibility with other estimators) - -# Returns -- `labels::Vector{Int}`: Node group assignments (1 to k) - -# Algorithm -1. 
Construct adjacency matrix from data -2. Compute the Laplacian matrix (normalized or unnormalized) -3. Compute eigenvectors corresponding to smallest eigenvalues -4. Perform k-means clustering on the eigenvectors -5. Return cluster assignments -""" -function estimate(estimator::SpectralEstimator, data; progress = true) - progress && @info "Starting spectral clustering with k=$(estimator.k)" - - # Convert data to adjacency matrix - A = construct_adjacency(data, estimator.adjacency_type) - n = size(A, 1) - - @argcheck n>=estimator.k "Number of nodes ($n) must be >= number of groups ($(estimator.k))" - - # Compute Laplacian - L = compute_laplacian(A, estimator.laplacian_type) - - # Compute eigenvectors - progress && @info "Computing eigenvectors..." - eigvals, eigvecs = compute_spectral_embedding(L, estimator.eig_size) - - # Normalize rows for normalized spectral clustering - if estimator.laplacian_type == :normalized - eigvecs = normalize_rows(eigvecs) - end - - # Perform k-means clustering - progress && @info "Performing k-means clustering..." - if estimator.balanced - labels = balanced_kmeans_clustering(eigvecs, estimator.k, estimator.max_kmeans_iter) - else - labels = kmeans_clustering(eigvecs, estimator.k, estimator.max_kmeans_iter) - end - - progress && @info "Spectral clustering complete" - return labels -end - -""" - estimate(estimator::SpectralEstimator, data, initial_labels; progress = true) - -Perform spectral clustering on the network data. The initial_labels are ignored -as spectral clustering doesn't use an initialization. - -This method signature is provided for compatibility with other estimators. -""" -function estimate(estimator::SpectralEstimator, data, initial_labels; progress = true) - return estimate(estimator, data; progress = progress) -end - -""" - construct_adjacency(data, adjacency_type::Symbol) - -Construct an adjacency matrix from the data. 
- -# Arguments -- `data`: Network data (can be a matrix with various edge types) -- `adjacency_type`: Either `:binary` or `:weighted` - -# Returns -- Symmetric adjacency matrix -""" -function construct_adjacency(data::AbstractMatrix, adjacency_type::Symbol) - n = size(data, 1) - A = zeros(Float64, n, n) - - if adjacency_type == :binary - # Binary adjacency: edge exists if data is not nothing/zero - for i in 1:n - for j in (i + 1):n - if !isnothing(data[i, j]) && data[i, j] != 0 - A[i, j] = 1.0 - A[j, i] = 1.0 - end - end - end - elseif adjacency_type == :weighted - # Weighted adjacency: use the actual values - for i in 1:n - for j in (i + 1):n - if !isnothing(data[i, j]) - if data[i, j] isa AbstractArray - # For categorical data, use sum or count - weight = sum(data[i, j]) - else - weight = float(data[i, j]) - end - A[i, j] = weight - A[j, i] = weight - end - end - end - end - - return A -end - -""" - compute_laplacian(A::AbstractMatrix, laplacian_type::Symbol) - -Compute the graph Laplacian matrix. - -# Arguments -- `A`: Adjacency matrix -- `laplacian_type`: Either `:normalized` or `:unnormalized` - -# Returns -- Laplacian matrix -""" -function compute_laplacian(A::AbstractMatrix, laplacian_type::Symbol) - n = size(A, 1) - d = vec(sum(A, dims = 2)) # Degree vector - - if laplacian_type == :unnormalized - # L = D - A - D = Diagonal(d) - return D - A - elseif laplacian_type == :normalized - # L = I - D^{-1/2} A D^{-1/2} - # Handle zero degrees - d_inv_sqrt = zeros(n) - for i in 1:n - d_inv_sqrt[i] = d[i] > 0 ? 1.0 / sqrt(d[i]) : 0.0 - end - D_inv_sqrt = Diagonal(d_inv_sqrt) - return I - D_inv_sqrt * A * D_inv_sqrt - end -end - -""" - compute_spectral_embedding(L::AbstractMatrix, k::Int) - -Compute the spectral embedding by finding eigenvectors corresponding to -the k smallest eigenvalues of the Laplacian. 
- -# Arguments -- `L`: Laplacian matrix -- `k`: Number of eigenvectors to compute - -# Returns -- `eigvals`: The k smallest eigenvalues -- `eigvecs`: Matrix where each row is a node and columns are eigenvector components -""" -function compute_spectral_embedding(L::AbstractMatrix, k::Int) - # Compute smallest k eigenvalues and eigenvectors - # Use eigen for small matrices, could use iterative methods for large ones - n = size(L, 1) - - if n <= 1000 - # For small matrices, compute all eigenvalues - F = eigen(Symmetric(L)) - idx = sortperm(F.values)[1:k] - return F.values[idx], F.vectors[:, idx] - else - # For larger matrices, use iterative solver (if available) - # For now, still use full eigen but this could be optimized - F = eigen(Symmetric(L)) - idx = sortperm(F.values)[1:k] - return F.values[idx], F.vectors[:, idx] - end -end - -""" - normalize_rows(X::AbstractMatrix) - -Normalize each row of the matrix to unit length. - -# Arguments -- `X`: Matrix to normalize - -# Returns -- Matrix with normalized rows -""" -function normalize_rows(X::AbstractMatrix) - n, k = size(X) - X_norm = similar(X) - - for i in 1:n - row_norm = norm(X[i, :]) - if row_norm > 0 - X_norm[i, :] = X[i, :] / row_norm - else - X_norm[i, :] = X[i, :] - end - end - - return X_norm -end - -""" - kmeans_clustering(X::AbstractMatrix, k::Int, max_iter::Int) - -Perform k-means clustering on the rows of X. 
- -# Arguments -- `X`: Data matrix (n × d), where n is number of points, d is dimensionality -- `k`: Number of clusters -- `max_iter`: Maximum number of iterations - -# Returns -- `labels::Vector{Int}`: Cluster assignments (1 to k) -""" -function kmeans_clustering(X::AbstractMatrix, k::Int, max_iter::Int) - n, d = size(X) - - # Initialize centers by randomly selecting k rows - center_indices = randperm(n)[1:k] - centers = X[center_indices, :] - labels = zeros(Int, n) - - converged = false - for iter in 1:max_iter - # Assignment step - old_labels = copy(labels) - for i in 1:n - min_dist = Inf - best_cluster = 1 - for j in 1:k - dist = sum(abs2, X[i, :] - centers[j, :]) - if dist < min_dist - min_dist = dist - best_cluster = j - end - end - labels[i] = best_cluster - end - - # Check convergence - if labels == old_labels - converged = true - break - end - - # Update step - for j in 1:k - cluster_points = findall(labels .== j) - if !isempty(cluster_points) - centers[j, :] = vec(mean(X[cluster_points, :], dims = 1)) - end - end - end - - return labels -end - -""" - balanced_kmeans_clustering(X::AbstractMatrix, k::Int, max_iter::Int) - -Perform balanced k-means clustering on the rows of X, ensuring approximately equal-sized clusters. - -This uses a greedy assignment approach where each cluster is filled to its target size -by assigning the closest points to each cluster's center, respecting size constraints. 
- -# Arguments -- `X`: Data matrix (n × d), where n is number of points, d is dimensionality -- `k`: Number of clusters -- `max_iter`: Maximum number of iterations - -# Returns -- `labels::Vector{Int}`: Cluster assignments (1 to k) with balanced sizes -""" -function balanced_kmeans_clustering(X::AbstractMatrix, k::Int, max_iter::Int) - n, d = size(X) - target_size = n ÷ k - remainder = n % k - - # Initialize centers by randomly selecting k rows - center_indices = randperm(n)[1:k] - centers = X[center_indices, :] - labels = zeros(Int, n) - - for iter in 1:max_iter - old_labels = copy(labels) - - # Compute all distances - distances = zeros(n, k) - for i in 1:n - for j in 1:k - distances[i, j] = sum(abs2, X[i, :] - centers[j, :]) - end - end - - # Balanced assignment using greedy approach - labels = balanced_assignment(distances, k, target_size, remainder) - - # Check convergence - if labels == old_labels - break - end - - # Update centers - for j in 1:k - cluster_points = findall(labels .== j) - if !isempty(cluster_points) - centers[j, :] = vec(mean(X[cluster_points, :], dims = 1)) - end - end - end - - return labels -end - -""" - balanced_assignment(distances::Matrix, k::Int, target_size::Int, remainder::Int) - -Assign points to clusters in a balanced way using a greedy approach. - -Each cluster gets exactly `target_size` or `target_size + 1` points (depending on remainder). 
- -# Arguments -- `distances`: Matrix of distances from each point to each cluster center (n × k) -- `k`: Number of clusters -- `target_size`: Base number of points per cluster -- `remainder`: Number of clusters that get one extra point - -# Returns -- `labels::Vector{Int}`: Balanced cluster assignments -""" -function balanced_assignment(distances::Matrix, k::Int, target_size::Int, remainder::Int) - n = size(distances, 1) - labels = zeros(Int, n) - cluster_sizes = zeros(Int, k) - max_sizes = fill(target_size, k) - max_sizes[1:remainder] .+= 1 - - # Create a list of (distance, point_idx, cluster_idx) tuples - assignments = [] - for i in 1:n - for j in 1:k - push!(assignments, (distances[i, j], i, j)) - end - end - - # Sort by distance (greedy: assign closest points first) - sort!(assignments, by = x -> x[1]) - - # Assign points greedily while respecting size constraints - assigned = falses(n) - for (dist, i, j) in assignments - if !assigned[i] && cluster_sizes[j] < max_sizes[j] - labels[i] = j - cluster_sizes[j] += 1 - assigned[i] = true - end - if all(assigned) - break - end - end - - return labels -end diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl index e8d8b04..f6fe72d 100644 --- a/src/estimator/abstractEstimator.jl +++ b/src/estimator/abstractEstimator.jl @@ -16,5 +16,4 @@ abstract type Result end # model::M # end -include("GreedyAverage.jl") include("GreedySuffStats.jl") diff --git a/src/optimization/config_rules/InitRule.jl b/src/optimization/config_rules/InitRule.jl deleted file mode 100644 index 99c1615..0000000 --- a/src/optimization/config_rules/InitRule.jl +++ /dev/null @@ -1,61 +0,0 @@ -abstract type StartingAssignment end -struct OrderedStart <: StartingAssignment end -struct RandomStart <: StartingAssignment end -struct ClusteredStart <: StartingAssignment end - -struct FromAssignment{A} <: StartingAssignment - assignment::A -end - -struct FromNodeLabels{L} <: StartingAssignment - node_labels::L -end - -struct 
InitRule{S <: StartingAssignment, I} - starting_assignment_rule::S - assignment_rule::I -end - -# check that this is necessary! -function make_assignment(g, h, init_rule::InitRule{S, Nothing}) where {S} - return Assignment(initialize_node_labels( - g, h, init_rule.starting_assignment_rule)...) -end - -""" - initialize_node_labels(g, h, starting_assignment_rule::StartingAssignment) - -initialize node labels based on the `starting_assignment_rule`, and return a vector of -node labels. - -# Implemented rules -- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. -- `RandomStart()`: Randomly assign nodes to groups. -- `ClusteredStart()`: Assign nodes to groups based on a kmedoids algorithm. -""" -initialize_node_labels - -function initialize_node_labels(g, h, ::OrderedStart) - group_size = GroupSize(number_nodes(g), h) - node_labels = StatsBase.inverse_rle(1:length(group_size), group_size) - return node_labels -end - -function initialize_node_labels(g, h, ::RandomStart) - group_size, node_labels = initialize_node_labels(g, h, OrderedStart()) - Random.shuffle!(node_labels) - return node_labels -end - -function initialise_node_labels(g, h, init_rule::FromAssignment{A}) where {A <: Assignment} - return initialise_node_labels(g, h, FromNodeLabels(init_rule.assignment.node_labels)) -end - -function initialise_node_labels(g, h, init_rule::FromNodeLabels{L}) where {L} - @assert number_nodes(g) == length(init_rule.node_labels) - return deepcopy(init_rule.node_labels) -end - -function number_nodes(g::AbstractMatrix) - return size(g, 1) -end diff --git a/src/optimization/config_rules/accept_rule.jl b/src/optimization/config_rules/accept_rule.jl deleted file mode 100644 index f531dc5..0000000 --- a/src/optimization/config_rules/accept_rule.jl +++ /dev/null @@ -1,23 +0,0 @@ -abstract type AcceptRule end -struct Strict <: AcceptRule end - -""" - accept_reject_update!(a::Assignment, swap::Swap, g, accept_rule::AcceptRule) - - -Perform the swap and 
accept it if it improves the likelihood of the assignment. `a` will -be updated in place if the swap is accepted. - -# Implemented rules -- `Strict()`: Accept the proposal if it has a higher likelihood than the current assignment. -""" -accept_reject_update! - -function accept_reject_update!(a::Assignment, swap::Swap, ::Strict) - current_score = loglikelihood(a) - apply_swap!(a, swap) - if loglikelihood(a) <= current_score - revert_swap!(a, swap) - end - return nothing -end diff --git a/src/optimization/config_rules/bandwidth_selection_rule.jl b/src/optimization/config_rules/bandwidth_selection_rule.jl deleted file mode 100644 index d922a96..0000000 --- a/src/optimization/config_rules/bandwidth_selection_rule.jl +++ /dev/null @@ -1,40 +0,0 @@ -abstract type KSelectionRule end -struct OracleK <: KSelectionRule - K::Int -end - -struct OracleH <: KSelectionRule - H::Int -end - -""" - select_number_node_per_block(g::Observations, rule::KSelectionRule) - -How to select the number of blocks `K` for the BlockModel model. - -# Implemented rules -- `OracleK(K::Int)`: Use the oracle number of blocks `K`. -- `OracleH(H::Int)`: Use the oracle number of nodes per block `H`. - -!!! info - - The number of blocks `K` should be at most `n/2` where `n` is the number of nodes in - the graph. 
-""" -select_number_node_per_block - -function select_number_node_per_block(g, rule::OracleH) - if rule.H > number_nodes(g) ÷ 2 - throw(ArgumentError("The number of nodes per block $(rule.H) is too large for the \ - number of nodes $(number_nodes(g)), it should be at most $(number_nodes(g)÷2)")) - end - if rule.H <= 1 - throw(ArgumentError("The number of nodes per block $(rule.H) is too small, it should \ - be at least 2")) - end - return rule.H -end - -function select_number_node_per_block(g, rule::OracleK) - nodes_per_block = number_nodes(g) ÷ rule.K - return select_number_node_per_block(g, OracleH(nodes_per_block)) -end diff --git a/src/optimization/config_rules/include.jl b/src/optimization/config_rules/include.jl deleted file mode 100644 index 8c5b6bf..0000000 --- a/src/optimization/config_rules/include.jl +++ /dev/null @@ -1,5 +0,0 @@ -include("swap_rule.jl") -include("accept_rule.jl") -include("InitRule.jl") -include("stop_rule.jl") -include("bandwidth_selection_rule.jl") diff --git a/src/optimization/greedy.jl b/src/optimization/greedy.jl deleted file mode 100644 index 1cabc85..0000000 --- a/src/optimization/greedy.jl +++ /dev/null @@ -1,131 +0,0 @@ -include("swap_workspace.jl") -include("swap_categorical.jl") -include("config_rules/include.jl") - -""" - GreedyParams - -Configuration parameters for the greedy optimization algorithm. 
- -# Fields -- `max_iter::Int`: Maximum number of iterations (default: 100,000) -- `swap_rule::NodeSwapRule`: Rule for selecting which nodes to swap -- `accept_rule::AcceptRule`: Rule for accepting/rejecting proposed swaps -- `stop_rule::StopRule`: Rule for determining when to stop optimization -- `progress_bar::Bool`: Whether to display a progress bar (default: true) - -# Examples -```julia -# Use default parameters -params = GreedyParams() - -# Custom parameters with stricter stopping -params = GreedyParams( - 1_000_000, # max iterations - RandomNodeSwap(), # random node selection - Strict(), # only accept improvements - PreviousBestValue(5000), # stop after 5000 iterations without improvement - true # show progress bar -) -``` - -See also: [`NodeSwapRule`](@ref), [`AcceptRule`](@ref), [`StopRule`](@ref) -""" -mutable struct GreedyParams - max_iter::Int - swap_rule::NodeSwapRule - accept_rule::AcceptRule - stop_rule::StopRule - progress_bar::Bool -end - -""" - GreedyParams() - -Create default greedy optimization parameters. - -Defaults: -- max_iter: 100,000 -- swap_rule: RandomNodeSwap() -- accept_rule: Strict() -- stop_rule: PreviousBestValue(10,000) -- progress_bar: true -""" -function GreedyParams() - GreedyParams( - 100_000, RandomNodeSwap(), Strict(), PreviousBestValue(10_000), true) -end - -""" - greedy_optimize(g, initial_labels, params::GreedyParams) - -Run greedy optimization to find a good network histogram (block model partition). - -# Arguments -- `g`: Tuple of (EdgeList, Dist) containing the network data and distribution type -- `initial_labels`: Initial group assignment for nodes -- `params::GreedyParams`: Optimization parameters - -# Returns -- `Assignment`: Optimized assignment of nodes to groups - -# Algorithm -The algorithm iteratively: -1. Proposes moving a node to a different group (based on swap_rule) -2. Evaluates the change in log-likelihood -3. Accepts or rejects the move (based on accept_rule) -4. 
Continues until stopping criterion met (based on stop_rule) -""" -function greedy_optimize(g, initial_labels, params::GreedyParams) - @debug "making assignment" - a = Assignment(initial_labels, g...) - @debug "assignment made, starting greedy search" - greedy_improve!(a; params = params) - return a -end - -""" - greedy_improve!(a::Assignment; params = GreedyParams()) - -Improve an existing assignment through greedy local search. - -Modifies the assignment in-place by iteratively proposing and accepting beneficial -node reassignments. - -# Arguments -- `a::Assignment`: The assignment to improve (modified in-place) -- `params::GreedyParams`: Optimization parameters - -# Note -This function modifies `a` in-place and updates its log-likelihood. -""" -function greedy_improve!(a::Assignment; params = GreedyParams()) - # allocate memory for swap - swap = make_swap(a, (1, 2)) - - # display progress bar - p = ProgressUnknown(enabled = params.progress_bar, - showspeed = true, desc = "Greedy search: ") - - for i in 1:(params.max_iter) - local_search!(a, swap, params) - next!(p; - showvalues = [ - ("ll: ", loglikelihood(a)), info_to_print(params.stop_rule)]) - if stopping_rule(a, params.stop_rule) - if i < 10 - @warn "Greedy search stopped early after $(i) iterations" - end - finish!(p) - break - end - end -end - -# Internal function for a single local search step -function local_search!(a::Assignment, swap, params::GreedyParams) - # select two nodes to swap and update data in the swap object - make_swap!(swap, a, select_indices_swap(a, params.swap_rule)) - # apply swap, test if local improvement and update assignment if needed - accept_reject_update!(a, swap, params.accept_rule) -end diff --git a/src/optimization/swap_categorical.jl b/src/optimization/swap_categorical.jl deleted file mode 100644 index 8f4d89c..0000000 --- a/src/optimization/swap_categorical.jl +++ /dev/null @@ -1,186 +0,0 @@ -mutable struct WorkspaceDiscreteSwap{C <: SymArray, R <: SymArray, - R2 <: SymArray, 
L <: SymArray} - log_likelihood_per_group::L - counts::C - realized::R - estimated::R2 -end - -function Assignment( - node_labels, edge_list::EdgeList{E}, - dist::Dist{Cat{M, T}}) where {E, M, T} - n_groups = length(unique(node_labels)) - n_nodes = length(node_labels) - dists = fit(dist, edge_list) - realized = SymArray(n_groups, zeros(Float64, num_categories(unwrap(dist)))) - estimated = SymArray(n_groups, zeros(Float64, num_categories(unwrap(dist)))) - counts = SymArray(n_groups, 0) - - for u in 1:n_nodes - g1 = node_labels[u] - for (v, e) in iterate_neighbors(edge_list, u) - g2 = node_labels[v] - if v < u - counts[g1, g2] += 1 - realized[g1, g2][e] += 1 - else - break - end - end - end - - for g2 in 1:n_groups, g1 in g2:n_groups - _fast_normalization!( - estimated[g1, g2], realized[g1, g2], counts[g1, g2]) - end - - θ = SymArray(n_groups, zero(dist)) - log_likelihood_per_group = SymArray(n_groups, 0.0) - for g2 in 1:n_groups - for g1 in g2:n_groups - θ[g1, g2] = Dist(Cat(SVector{M}(estimated[g1, g2]))) - log_likelihood_per_group[g1, g2] = logpdf_cat( - estimated[g1, g2], realized[g1, g2]) - end - end - - # Pre-allocate workspace with copies of current state - w = WorkspaceDiscreteSwap( - SymArray(n_groups, 0.0), - SymArray(n_groups, 0), - SymArray(n_groups, zeros(Float64, M)), - SymArray(n_groups, zeros(Float64, M)) - ) - - # Create assignment first - assignment = Assignment( - node_labels, edge_list, dists, θ, log_likelihood_per_group, w) - - # Now copy the actual workspace data into w - for g2 in 1:n_groups, g1 in g2:n_groups - w.log_likelihood_per_group[g1, g2] = log_likelihood_per_group[g1, g2] - w.counts[g1, g2] = counts[g1, g2] - copy!(w.realized[g1, g2], realized[g1, g2]) - copy!(w.estimated[g1, g2], estimated[g1, g2]) - end - - return assignment -end - -function make_workspace(a::Assignment{E, Dist{D}, - F, W}) where {E, F, D <: Cat, W} - # Pre-allocate workspace instead of deepcopy - k = number_groups(a) - m = num_categories(unwrap(a.θ[1, 1])) - - 
log_ll = SymArray(k, 0.0) - counts = SymArray(k, 0) - realized = SymArray(k, zeros(Float64, m)) - estimated = SymArray(k, zeros(Float64, m)) - - return WorkspaceDiscreteSwap(log_ll, counts, realized, estimated) -end - -function copy_categorical_workspace!( - dest::WorkspaceDiscreteSwap, src_assignment::Assignment) - # In-place copy without allocation - copy!(dest.log_likelihood_per_group, src_assignment.log_likelihood) - - # Copy counts (scalars) - copy!(dest.counts, src_assignment.additional_workspace.counts) - - # Copy vector-valued SymArrays element by element - # Use sparse matrix iteration instead of .d dictionary - k = size(dest.realized, 1) - deepcopy!(dest.realized, src_assignment.additional_workspace.realized) - deepcopy!(dest.estimated, src_assignment.additional_workspace.estimated) - - # @inbounds for j in 1:k, i in 1:j - # copy!(dest.realized[i, j], src_ws.realized[i, j]) - # end - - # @inbounds for j in 1:k, i in 1:j - # copy!(dest.estimated[i, j], src_ws.estimated[i, j]) - # end -end - -function make_swap_workspace!(ws::WorkspaceDiscreteSwap, a::Assignment) - # Use in-place copy instead of deepcopy - copy_categorical_workspace!(ws, a) -end - -function revert_swap_workspace!(a::Assignment, ws::WorkspaceDiscreteSwap) - # Use in-place copy instead of deepcopy - copy!(a.log_likelihood, ws.log_likelihood_per_group) - - copy!(a.additional_workspace.log_likelihood_per_group, ws.log_likelihood_per_group) - copy!(a.additional_workspace.counts, ws.counts) - - # Copy vector-valued SymArrays element by element - # Use sparse matrix iteration instead of .d dictionary - k = size(ws.realized, 1) - deepcopy!(a.additional_workspace.realized, ws.realized) - deepcopy!(a.additional_workspace.estimated, ws.estimated) - # @inbounds for j in 1:k, i in 1:j - # copy!(as.realized[i, j], ws.realized[i, j]) - # end - - # @inbounds for j in 1:k, i in 1:j - # copy!(as.estimated[i, j], ws.estimated[i, j]) - # end -end - -function apply_swap!(as::Assignment, 
s::Swap{<:WorkspaceDiscreteSwap}) - u, v = s.u, s.v - n_groups = number_groups(as) - gu = as.node_labels[u] - gv = as.node_labels[v] - for (node, e) in iterate_neighbors(as.edges, u) - if node == v - continue - end - g_inter = as.node_labels[node] - as.additional_workspace.counts[gu, g_inter] -= 1 - as.additional_workspace.realized[gu, g_inter][e] -= 1 - as.additional_workspace.counts[gv, g_inter] += 1 - as.additional_workspace.realized[gv, g_inter][e] += 1 - end - for (node, e) in iterate_neighbors(as.edges, v) - if node == u - continue - end - g_inter = as.node_labels[node] - as.additional_workspace.counts[gv, g_inter] -= 1 - as.additional_workspace.realized[gv, g_inter][e] -= 1 - as.additional_workspace.counts[gu, g_inter] += 1 - as.additional_workspace.realized[gu, g_inter][e] += 1 - end - @inbounds for index in eachindex(as.additional_workspace.estimated) - _fast_normalization!(as.additional_workspace.estimated[index], - as.additional_workspace.realized[index], as.additional_workspace.counts[index]) - end - swap_node_labels!(as, u, v) - m = size(as.additional_workspace.estimated[1, 1], 1) - for g2 in 1:n_groups - for g1 in g2:n_groups - as.θ[g1, g2] = Dist(Cat(SVector{m}(as.additional_workspace.estimated[g1, g2]))) - # set_params!(as.additional_workspace.θ[g1, g2], - # as.additional_workspace.estimated[g1, g2]) - as.additional_workspace.log_likelihood_per_group[g1, g2] = logpdf_cat( - as.additional_workspace.estimated[g1, g2], as.additional_workspace.realized[ - g1, g2]) - end - end - copy!(as.log_likelihood, as.additional_workspace.log_likelihood_per_group) - # as.log_likelihood = deepcopy(as.additional_workspace.log_likelihood_per_group) -end - -function _fast_normalization!(p::AbstractVector, r::AbstractVector, c::Real) - if c > 0 - @inbounds for m in eachindex(p) - p[m] = r[m] / c - end - else - fill!(p, 0.0) - end -end diff --git a/src/optimization/swap_workspace.jl b/src/optimization/swap_workspace.jl deleted file mode 100644 index aa94c73..0000000 --- 
a/src/optimization/swap_workspace.jl +++ /dev/null @@ -1,99 +0,0 @@ -# Reasonable default capacity for affected groups in a swap -const MAX_AFFECTED_GROUPS = 16 - -mutable struct WorkspaceSwap{D, F, G} - θ::SymArray{D} - log_likelihood_per_group::SymArray{F} - groups_buffer::G # Pre-allocated buffer for affected group pairs -end - -function make_workspace(a::Assignment) - # Pre-allocate workspace with same structure - k = number_groups(a) - θ_copy = SymArray(k, zero(a.θ[1, 1])) - ll_copy = SymArray(k, 0.0) - groups_buffer = Set{Tuple{Int, Int}}() - sizehint!(groups_buffer, MAX_AFFECTED_GROUPS) - return WorkspaceSwap(θ_copy, ll_copy, groups_buffer) -end - -mutable struct Swap{W} - u::Int - v::Int - workspace::W -end - -function make_swap_workspace!(ws, a::Assignment) - # Use in-place copy instead of deepcopy - copy!(ws.θ, a.θ) - copy!(ws.log_likelihood_per_group, a.log_likelihood) -end - -function revert_swap_workspace!(a::Assignment, ws) - # Use in-place copy instead of deepcopy - copy!(a.θ, ws.θ) - copy!(a.log_likelihood, ws.log_likelihood_per_group) -end - -function make_swap(a::Assignment, id) - ws = make_workspace(a) - make_swap_workspace!(ws, a) # Actually copy the current state - return Swap(id[1], id[2], ws) -end - -function make_swap!(swap::Swap, a::Assignment, id) - swap.u, swap.v = id - make_swap_workspace!(swap.workspace, a) -end - -function revert_swap!(assignment::Assignment, swap::Swap) - # swap labels back to original - swap_node_labels!(assignment, swap.u, swap.v) - # restore saved θ and log likelihoods - revert_swap_workspace!(assignment, swap.workspace) -end - -function swap_node_labels!(a::Assignment, i, j) - a.node_labels[i], a.node_labels[j] = a.node_labels[j], a.node_labels[i] -end - -function apply_swap!(a::Assignment, s::Swap) - u, v = s.u, s.v - gu = a.node_labels[u] - gv = a.node_labels[v] - - # Reuse pre-allocated buffer instead of allocating new Set each time - groups_concerned = s.workspace.groups_buffer - empty!(groups_concerned) - 
push!(groups_concerned, minmax(gu, gv)) - - @inbounds for (node, d) in iterate_neighbors(a.dists, u) - if node == v - continue - end - g1 = a.node_labels[node] - a.θ[gv, g1] = add_to(a.θ[gv, g1], d) - a.θ[gu, g1] = remove_from(a.θ[gu, g1], d) - push!(groups_concerned, minmax(gu, g1)) - push!(groups_concerned, minmax(gv, g1)) - end - - @inbounds for (index, (node, d)) in enumerate(iterate_neighbors(a.dists, v)) - if node == u - continue - end - g2 = a.node_labels[node] - a.θ[gu, g2] = add_to(a.θ[gu, g2], d) - a.θ[gv, g2] = remove_from(a.θ[gv, g2], d) - push!(groups_concerned, minmax(gv, g2)) - push!(groups_concerned, minmax(gu, g2)) - end - - swap_node_labels!(a, u, v) - @inbounds for (g1, g2) in groups_concerned - a.log_likelihood[g1, g2] = 0.0 - for e in get_edges_in_groups(a.node_labels, a.edges, g1, g2) - a.log_likelihood[g1, g2] += logpdf(a.θ[g1, g2], e) - end - end -end diff --git a/src/preprocessor/abstractConvertor.jl b/src/preprocessor/abstractConvertor.jl index 32f040c..3d2f3e2 100644 --- a/src/preprocessor/abstractConvertor.jl +++ b/src/preprocessor/abstractConvertor.jl @@ -13,105 +13,5 @@ function to_distribution(c::AbstractConvertor, ps; kwargs...) @error "to be implemented" end -### ======================================================================================= -### Categorical Convertor -### ======================================================================================= - -struct CategoricalConvertor{T} <: AbstractConvertor - m::Int # number of categories - map::Dict{T, Int} -end - -function CategoricalConvertor(data::AbstractArray{T}) where {T} - categories = sort(unique(data)) - m = length(categories) - map = Dict{T, Int}(categories[i] => i for i in 1:m) - return CategoricalConvertor{T}(m, map) -end - -function num_bins(c::CategoricalConvertor) - return c.m -end - -function (c::CategoricalConvertor)(obs::T) where {T} - return c.map[obs] -end - -function to_distribution( - c::CategoricalConvertor{T}, ps::AbstractVector{T2}; kwargs...) 
where {T, T2} - @argcheck length(ps)==c.m "Length of probabilities must match number of categories" - support = sort(collect(keys(c.map))) - probabilities = SVector{c.m, T2}(ps[c.map[s]] for s in support) - return DiscreteNonParametric(support, probabilities) -end - -### ======================================================================================= -### [0,1] Continuous Convertor -### ======================================================================================= - -abstract type UnitIntervalConvertorType <: AbstractConvertor end - -struct UnitIntervalConvertor{B <: AbstractVector} <: UnitIntervalConvertorType - bins::B -end - -function UnitIntervalConvertor(n::Int) - zero_interval = Interval{:closed, :closed}(0.0, 0.0) - edges = range(0.0, stop = 1.0, length = n + 1) - bins = [Interval{:closed, :closed}(edges[i], edges[i + 1]) for i in 1:n] - bins = vcat(zero_interval, bins) - return UnitIntervalConvertor{typeof(bins)}(bins) -end - -function num_bins(c::UnitIntervalConvertor) - return length(c.bins) -end - -function (c::UnitIntervalConvertor)(x::Real) - return findfirst(b -> x ∈ b, c.bins) -end - -function to_distribution( - c::UnitIntervalConvertor, ps::AbstractVector{T}; kwargs...) 
where {T} - @argcheck length(ps)==length(c.bins) "Length of probabilities must match number of bins" - return HistDistribution(c.bins, SVector{length(ps), T}(ps)) -end - -# struct RegularUnitIntervalConvertor{N} <: UnitIntervalConvertorType -# num_bins::Int -# end - -### ======================================================================================= -### Continuous Convertor -### ======================================================================================= -# struct ContinuousConvertor{B, N, V <: AbstractVector{B}} <: AbstractConvertor -# zero_index::Int -# bins::V -# end - -# function num_bins(c::ContinuousConvertor{B, N}) where {B, N} -# return N -# end - -# ## assume no singleton bins -# function ContinuousConvertor(bins::AbstractVector{B}) where {B <: -# Union{Interval, BareInterval}} -# bins = sort(bins, lt = lt = strictprecedes) -# N = length(bins) + 1 -# zero_index = 1 -# ContinuousConvertor{B, N, typeof(bins)}(zero_index, bins) -# end - -# # assume bins are sorted and correctly cover the whole support -# function (c::ContinuousConvertor{<:Union{Interval, BareInterval}})(x) -# iszero(x) && return c.zero_index -# x >= sup(c.bins[end]) && return length(c.bins) + 1 -# x <= inf(c.bins[1]) && return c.zero_index + 1 -# return findfirst(b -> in_interval(x, b), c.bins) + 1 -# end - -# function ContinuousConvertor(l, u, num_bins::Int) -# edges = collect(range(l, stop = u, length = num_bins + 1)) -# bins = [bareinterval(edges[i], edges[i + 1]) for i in 1:num_bins] -# ContinuousConvertor(bins) -# end +include("categorical.jl") +include("continuous.jl") diff --git a/src/preprocessor/categorical.jl b/src/preprocessor/categorical.jl index 4030aad..efb9998 100644 --- a/src/preprocessor/categorical.jl +++ b/src/preprocessor/categorical.jl @@ -1,67 +1,31 @@ +### ======================================================================================= +### Categorical Convertor +### 
======================================================================================= -# ============================================================================ -# Data preparation utilities -# ============================================================================ - -""" - prepare_data_cat(A::AbstractMatrix{<:Real}, k; m=length(unique(A)), has_zero=zero(eltype(A)) in A) - -Prepare categorical network data for GreedyAverage. - -Creates the necessary data structures (count matrices and realized value tensors) -for estimating a categorical Stochastic Block Model with k groups. - -# Arguments -- `A::AbstractMatrix{<:Real}`: Adjacency matrix with categorical edge values -- `k::Int`: Number of groups to partition nodes into -- `m::Int`: Number of edge categories (default: inferred from unique values in A) -- `has_zero::Bool`: Whether the data contains zero values (default: auto-detected) - -# Returns -A tuple containing: -- `data`: Preprocessed adjacency matrix (shifted if zero-indexed) -- `counts`: Symmetric k×k array for edge counts (initialized to 0) -- `counts_swap`: Workspace copy of counts for swap evaluation -- `realized`: Symmetric k×k array of m-dimensional count vectors (initialized to 0) -- `realized_swap`: Workspace copy of realized for swap evaluation - -# Example -```julia -# Network with 3 edge types (0, 1, 2) for no edge, layer 1, layer 2 -A = rand(0:2, 100, 100) -A = (A + A') .÷ 2 # Make symmetric - -data, counts, counts_swap, realized, realized_swap = prepare_data_cat(A, k=5) -``` - -# Notes -- If data contains zeros, they are shifted to 1-indexing for categorical representation -- The realized arrays use StaticArrays.MVector for performance -- The symmetric array structure avoids redundant storage -""" -function prepare_data_cat( - A::AbstractMatrix{<:Real}, - k::Int; - m::Int = length(unique(A)), - has_zero::Bool = zero(eltype(A)) in A -) - @debug "Preparing data for categorical SBM with $m categories and $k groups." 
+struct CategoricalConvertor{T} <: AbstractConvertor + m::Int # number of categories + map::Dict{T, Int} +end - # Adjust data if zero-indexed (shift to 1-indexing for Julia) - if has_zero - @debug "Data contains zero values, using 1-based indexing." - data = A .+ 1 - else - data = A - end +function CategoricalConvertor(data::AbstractArray{T}) where {T} + categories = sort(unique(data)) + m = length(categories) + map = Dict{T, Int}(categories[i] => i for i in 1:m) + return CategoricalConvertor{T}(m, map) +end - # Initialize count matrices - counts = SymArray(k, 0) - counts_swap = SymArray(k, 0) +function num_bins(c::CategoricalConvertor) + return c.m +end - # Initialize realized value tensors (k×k matrices of m-dimensional vectors) - realized = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) - realized_swap = SymArray(zero(SizedMatrix{k, k, MVector{m, Int}})) +function (c::CategoricalConvertor)(obs::T) where {T} + return c.map[obs] +end - return data, counts, counts_swap, realized, realized_swap +function to_distribution( + c::CategoricalConvertor{T}, ps::AbstractVector{T2}; kwargs...) 
where {T, T2} + @argcheck length(ps)==c.m "Length of probabilities must match number of categories" + support = sort(collect(keys(c.map))) + probabilities = SVector{c.m, T2}(ps[c.map[s]] for s in support) + return DiscreteNonParametric(support, probabilities) end diff --git a/src/preprocessor/continuous.jl b/src/preprocessor/continuous.jl index e69de29..4aceb22 100644 --- a/src/preprocessor/continuous.jl +++ b/src/preprocessor/continuous.jl @@ -0,0 +1,71 @@ + +### ======================================================================================= +### [0,1] Continuous Convertor +### ======================================================================================= + +abstract type UnitIntervalConvertorType <: AbstractConvertor end + +struct UnitIntervalConvertor{B <: AbstractVector} <: UnitIntervalConvertorType + bins::B +end + +function UnitIntervalConvertor(n::Int) + zero_interval = Interval{:closed, :closed}(0.0, 0.0) + edges = range(0.0, stop = 1.0, length = n + 1) + bins = [Interval{:closed, :closed}(edges[i], edges[i + 1]) for i in 1:n] + bins = vcat(zero_interval, bins) + return UnitIntervalConvertor{typeof(bins)}(bins) +end + +function num_bins(c::UnitIntervalConvertor) + return length(c.bins) +end + +function (c::UnitIntervalConvertor)(x::Real) + return findfirst(b -> x ∈ b, c.bins) +end + +function to_distribution( + c::UnitIntervalConvertor, ps::AbstractVector{T}; kwargs...) 
where {T} + @argcheck length(ps)==length(c.bins) "Length of probabilities must match number of bins" + return HistDistribution(c.bins, SVector{length(ps), T}(ps)) +end + +# struct RegularUnitIntervalConvertor{N} <: UnitIntervalConvertorType +# num_bins::Int +# end + +### ======================================================================================= +### Continuous Convertor +### ======================================================================================= +# struct ContinuousConvertor{B, N, V <: AbstractVector{B}} <: AbstractConvertor +# zero_index::Int +# bins::V +# end + +# function num_bins(c::ContinuousConvertor{B, N}) where {B, N} +# return N +# end + +# ## assume no singleton bins +# function ContinuousConvertor(bins::AbstractVector{B}) where {B <: +# Union{Interval, BareInterval}} +# bins = sort(bins, lt = lt = strictprecedes) +# N = length(bins) + 1 +# zero_index = 1 +# ContinuousConvertor{B, N, typeof(bins)}(zero_index, bins) +# end + +# # assume bins are sorted and correctly cover the whole support +# function (c::ContinuousConvertor{<:Union{Interval, BareInterval}})(x) +# iszero(x) && return c.zero_index +# x >= sup(c.bins[end]) && return length(c.bins) + 1 +# x <= inf(c.bins[1]) && return c.zero_index + 1 +# return findfirst(b -> in_interval(x, b), c.bins) + 1 +# end + +# function ContinuousConvertor(l, u, num_bins::Int) +# edges = collect(range(l, stop = u, length = num_bins + 1)) +# bins = [bareinterval(edges[i], edges[i + 1]) for i in 1:num_bins] +# ContinuousConvertor(bins) +# end diff --git a/src/utils/SymArray.jl b/src/utils/SymArray.jl index 9377ee1..b0dfcc2 100644 --- a/src/utils/SymArray.jl +++ b/src/utils/SymArray.jl @@ -96,8 +96,6 @@ function make_sym_init(k, d) return a end -@deprecate SymArray(k::Int, d::F) where {F} make_sym_init(k, d) - """ SymArray(d::AbstractMatrix{F}) diff --git a/src/utils/config_rules/include.jl b/src/utils/config_rules/include.jl new file mode 100644 index 0000000..e4c4476 --- /dev/null +++ 
b/src/utils/config_rules/include.jl @@ -0,0 +1,8 @@ +include("swap_rule.jl") +include("stop_rule.jl") + +struct GreedyParams{N <: NodeSwapRule, S <: StopRule} + max_iter::Int + node_swap_rule::N + stop_rule::S +end diff --git a/src/optimization/config_rules/stop_rule.jl b/src/utils/config_rules/stop_rule.jl similarity index 93% rename from src/optimization/config_rules/stop_rule.jl rename to src/utils/config_rules/stop_rule.jl index dbaab9a..85c94b2 100644 --- a/src/optimization/config_rules/stop_rule.jl +++ b/src/utils/config_rules/stop_rule.jl @@ -4,13 +4,6 @@ function info_to_print(::StopRule) return nothing end -function initialise_stop_rule!(stop_rule::StopRule, a, g) -end - -function score(a::Assignment) - return loglikelihood(a) -end - mutable struct PreviousBestValue{T, S} <: StopRule k::Int previous_best_value::T diff --git a/src/optimization/config_rules/swap_rule.jl b/src/utils/config_rules/swap_rule.jl similarity index 77% rename from src/optimization/config_rules/swap_rule.jl rename to src/utils/config_rules/swap_rule.jl index 2e4cb0f..cc22198 100644 --- a/src/optimization/config_rules/swap_rule.jl +++ b/src/utils/config_rules/swap_rule.jl @@ -25,11 +25,3 @@ function select_indices_swap(node_labels::AbstractVector{Int}, ::RandomGroupSwap index2 = rand(findall(x -> x == groups[2], node_labels)) return index1, index2 end - -function select_indices_swap(a::Assignment, rule::NodeSwapRule) - select_indices_swap(a.node_labels, rule) -end - -function select_indices_swap(assignment::Assignment, rule::RandomGroupSwap) - return select_indices_swap(assignment.node_labels, rule, number_groups(assignment)) -end diff --git a/src/utils/include.jl b/src/utils/include.jl index 94fb077..01682f0 100644 --- a/src/utils/include.jl +++ b/src/utils/include.jl @@ -1,4 +1,5 @@ include("SymArray.jl") +include("config_rules/include.jl") function ordered_start_labels(n::Int, k::Int) labels = Vector{Int}(undef, n) diff --git a/test/runtests.jl b/test/runtests.jl index 
0767e79..b2c0263 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,9 +4,4 @@ using NetworkHistogram @testset "Tests" begin include("test_symarray.jl") - include("test_data_format.jl") - include("test_distributions_type.jl") - include("test_swap_workspace.jl") - include("test_cat_case.jl") - include("test_get_edges_in_groups.jl") end diff --git a/test/test_cat_case.jl b/test/test_cat_case.jl deleted file mode 100644 index deca73d..0000000 --- a/test/test_cat_case.jl +++ /dev/null @@ -1,63 +0,0 @@ -using Test -using NetworkHistogram -using StatsBase -using Random -using Distributions -using StaticArrays - -@testset "Swap workspace likelihood update (Categorical)" begin - Random.seed!(42) - n = 10 - k = 2 - m = 3 - ps = SVector{m}(fill(1 / m, m)) - d_mine = NetworkHistogram.Cat(ps) - - θ = [NetworkHistogram.Cat(SVector{3}([0.7, 0.2, 0.1])) NetworkHistogram.Cat(SVector{3}([0.1, 0.3, 0.6])); - NetworkHistogram.Cat(SVector{3}([0.1, 0.3, 0.6])) NetworkHistogram.Cat(SVector{3}([0.3, 0.4, 0.3]))] - sbm = DecoratedSBM(θ, [0.5, 0.5]) - - labels = StatsBase.inverse_rle(1:k, fill(n ÷ k, k)) - latents = vcat(repeat([0.2], n ÷ 2), repeat([0.8], n ÷ 2)) - A = sample_graph(sbm, latents) - - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d_mine)) - - for ind in eachindex(assignment.additional_workspace.counts) - @test assignment.additional_workspace.counts[ind] == - sum(assignment.additional_workspace.realized[ind]) - end - - ll_original = NetworkHistogram.loglikelihood(assignment) - - # Swap two nodes from different groups - indices = (1, n) - swap = NetworkHistogram.make_swap(assignment, indices) - true_swapped = deepcopy(labels) - true_swapped[1] = labels[n] - true_swapped[n] = labels[1] - NetworkHistogram.apply_swap!(assignment, swap) - - nodes_label_swapped = deepcopy(assignment.node_labels) - new_a = NetworkHistogram.Assignment( - nodes_label_swapped, edgelist, 
NetworkHistogram.Dist(d_mine)) - ll_new_a = NetworkHistogram.loglikelihood(new_a) - ll_after_swap = NetworkHistogram.loglikelihood(assignment) - ws_new = new_a.additional_workspace - ws_old = assignment.additional_workspace - @test ws_new.counts == ws_old.counts - @test ws_new.realized == ws_old.realized - @test ws_new.estimated == ws_old.estimated - - @test new_a.node_labels == assignment.node_labels - @test new_a.node_labels == true_swapped - @test new_a.log_likelihood == assignment.log_likelihood - @test isapprox(ll_after_swap, ll_new_a; atol = 1e-10) - - # Revert the swap - NetworkHistogram.revert_swap!(assignment, swap) - ll_after_revert = NetworkHistogram.loglikelihood(assignment) - @test isapprox(ll_after_revert, ll_original; atol = 1e-10) -end diff --git a/test/test_data_format.jl b/test/test_data_format.jl deleted file mode 100644 index fd18a4c..0000000 --- a/test/test_data_format.jl +++ /dev/null @@ -1,21 +0,0 @@ -@testset "Edge list tests" begin - using Random - Random.seed!(1234) - A = Symmetric(sprand(20, 20, 0.5)) - edgelist = EdgeList(A) - - for j in 1:20 - nv_j, val_j = neighbors(edgelist, j) - for i in 1:20 - if i != j - @test i in nv_j - @test A[i, j] == val_j[findfirst(x -> x == i, nv_j)] - else - @test i ∉ nv_j - end - end - end - - @test NetworkHistogram.edge_type(edgelist) == eltype(A) - @test nodes(edgelist) == size(A, 1) -end diff --git a/test/test_distributions_type.jl b/test/test_distributions_type.jl deleted file mode 100644 index b8c53e9..0000000 --- a/test/test_distributions_type.jl +++ /dev/null @@ -1,12 +0,0 @@ -@testset "Distribution tests" begin - import NetworkHistogram as NH - d1 = NH.Bernoulli(0.5) - d2 = NH.Bernoulli(0.7) - my_d = NH.Dist(d1) - d_avg = NH.add_to(my_d, d2) - @test d_avg.counts == 2 - @test d_avg.dist.p == 0.6 - d_removed = NH.remove_from(d_avg, d2) - @test d_removed.counts == 1 - @test d_removed.dist == d1 -end diff --git a/test/test_get_edges_in_groups.jl b/test/test_get_edges_in_groups.jl deleted file mode 
100644 index 4df3a22..0000000 --- a/test/test_get_edges_in_groups.jl +++ /dev/null @@ -1,33 +0,0 @@ -using Test -using NetworkHistogram - -@testset "get_edges_in_groups behavior" begin - # Simple 4-node undirected graph - # 1-2, 1-3, 2-4, 3-4 - A = [0 1 1 0; - 1 0 0 1; - 1 0 0 1; - 0 1 1 0] - edgelist = NetworkHistogram.EdgeList(A) - node_labels = [1, 1, 2, 2] # nodes 1,2 in group 1; 3,4 in group 2 - - # Test within-group edges (group 1) - edges_1_1 = NetworkHistogram.get_edges_in_groups(node_labels, edgelist, 1, 1) - @test length(edges_1_1) == 1 # Only edge (1,2) - @test edges_1_1[1] == 1 # A[1,2] == 1 - - # Test within-group edges (group 2) - edges_2_2 = NetworkHistogram.get_edges_in_groups(node_labels, edgelist, 2, 2) - @test length(edges_2_2) == 1 # Only edge (3,4) - @test edges_2_2[1] == 1 # A[3,4] == 1 - - # Test between-group edges (1,2) - edges_1_2 = NetworkHistogram.get_edges_in_groups(node_labels, edgelist, 1, 2) - # Edges: (1,3), (2,4) - @test length(edges_1_2) == 4 - @test sort(edges_1_2) == [0, 0, 1, 1] # Both edges exist - - # Test symmetry: get_edges_in_groups(2,1) == get_edges_in_groups(1,2) - edges_2_1 = NetworkHistogram.get_edges_in_groups(node_labels, edgelist, 2, 1) - @test sort(edges_2_1) == sort(edges_1_2) -end diff --git a/test/test_swap_workspace.jl b/test/test_swap_workspace.jl deleted file mode 100644 index 367d0e9..0000000 --- a/test/test_swap_workspace.jl +++ /dev/null @@ -1,69 +0,0 @@ -using Test -using NetworkHistogram -using StatsBase -using Random - -function manual_loglikelihood(A, node_labels, θ) - n = size(A, 1) - k = size(θ, 1) - ll = 0.0 - for j in 1:n - for i in 1:n - if i != j - g1 = node_labels[i] - g2 = node_labels[j] - ll += NetworkHistogram.logpdf(θ[g1, g2], A[i, j]) - end - end - end - return ll / 2 -end - -function slow_swap(a::NetworkHistogram.Assignment, s::NetworkHistogram.Swap) - labels = deepcopy(a.node_labels) - labels[s.u], labels[s.v] = labels[s.v], labels[s.u] - return NetworkHistogram.Assignment(labels, 
a.edges, a.θ[1, 1]) -end - -@testset "Swap workspace likelihood update (Bernoulli)" begin - Random.seed!(42) - n = 6 - k = 2 - #p1, p2 = 0.8, 0.3 - d = NetworkHistogram.Bernoulli(0.5) - # Create a block model with two groups - # sbm = NetworkHistogram.BlockModel(k, d) - # sbm[1, 1] = NetworkHistogram.Bernoulli(p1) - # sbm[2, 2] = NetworkHistogram.Bernoulli(p2) - # sbm[1, 2] = NetworkHistogram.Bernoulli(0.1) - - sbm = SBM([0.8 0.3; 0.3 0.8], [0.5, 0.5]) - - labels = StatsBase.inverse_rle(1:k, fill(n ÷ k, k)) - latents = [0.1, 0.1, 0.1, 0.9, 0.9, 0.9] - A = sample_graph(sbm, latents) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) - - ll_original = NetworkHistogram.loglikelihood(assignment) - ll_manual = manual_loglikelihood(A, assignment.node_labels, assignment.θ) - @test isapprox(ll_original, ll_manual; atol = 1e-10) - - # Swap two nodes from different groups - indices = (1, n) - swap = NetworkHistogram.make_swap(assignment, indices) - slow_swapped = slow_swap(assignment, swap) - NetworkHistogram.apply_swap!(assignment, swap) - ll_after_swap = NetworkHistogram.loglikelihood(assignment) - ll_slow_swap = NetworkHistogram.loglikelihood(slow_swapped) - ll_manual_after_swap = manual_loglikelihood(A, assignment.node_labels, assignment.θ) - @test isapprox(ll_after_swap, ll_manual_after_swap; atol = 1e-10) - @test isapprox(ll_after_swap, ll_slow_swap; atol = 1e-10) - - # Revert the swap - NetworkHistogram.revert_swap!(assignment, swap) - ll_after_revert = NetworkHistogram.loglikelihood(assignment) - ll_manual_after_revert = manual_loglikelihood(A, assignment.node_labels, assignment.θ) - @test isapprox(ll_after_revert, ll_manual_after_revert; atol = 1e-10) - @test isapprox(ll_after_revert, ll_original; atol = 1e-10) -end diff --git a/test/test_symarray.jl b/test/test_symarray.jl index 2d4e5be..724387a 100644 --- a/test/test_symarray.jl +++ b/test/test_symarray.jl @@ -7,7 +7,8 @@ using 
StaticArrays @testset "SymArray Array Interface" begin @testset "Construction and basic properties" begin # Test construction with scalar - a = make_sym_init(3, 1.0) + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 1.0) @test a isa AbstractArray{Float64, 2} @test size(a) == (3, 3) @test length(a) == 9 @@ -15,17 +16,18 @@ using StaticArrays @test eltype(a) == Float64 # Test construction with zeros - b = make_sym_init(5, 0.0) + b = SymArray{Float64}(undef, 5, 5) + fill!(b, 0.0) @test size(b) == (5, 5) @test all(b[i, j] == 0.0 for i in 1:5 for j in 1:5) # Test dimension validation - @test_throws ArgumentError make_sym_init(0, 1.0) - @test_throws ArgumentError make_sym_init(-1, 1.0) + @test_throws ArgumentError SymArray{Float64}(undef, 3, 4) end @testset "Indexing - getindex and setindex!" begin - a = make_sym_init(4, 0.0) + a = SymArray{Float64}(undef, 4, 4) + fill!(a, 0.0) # Test setindex! in upper triangle a[1, 2] = 5.0 @@ -48,7 +50,8 @@ using StaticArrays end @testset "Symmetry property" begin - a = make_sym_init(5, 0.0) + a = SymArray{Float64}(undef, 5, 5) + fill!(a, 0.0) # Set values and verify symmetry for i in 1:5 @@ -93,7 +96,8 @@ using StaticArrays end @testset "similar function" begin - a = make_sym_init(3, 5.0) + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 5.0) # Test similar without type b = similar(a) @@ -117,7 +121,8 @@ using StaticArrays end @testset "copy! and deepcopy!" begin - a = make_sym_init(3, 0.0) + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 0.0) a[1, 1] = 1.0 a[1, 2] = 2.0 a[2, 3] = 5.0 @@ -132,7 +137,8 @@ using StaticArrays @test b[3, 2] == 5.0 # Test dimension mismatch - d = make_sym_init(4, 0.0) + d = SymArray{Float64}(undef, 4, 4) + fill!(d, 0.0) @test_throws DimensionMismatch copy!(d, a) # Test deepcopy! 
@@ -162,7 +168,8 @@ using StaticArrays end @testset "Array operations" begin - a = make_sym_init(3, 2.0) + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 2.0) # Test iteration count = 0 @@ -180,7 +187,8 @@ using StaticArrays @test any(x -> x == 2.0, a) # Test maximum/minimum - b = make_sym_init(3, 0.0) + b = SymArray{Float64}(undef, 3, 3) + fill!(b, 0.0) b[1, 1] = 5.0 b[2, 3] = -3.0 @test maximum(b) == 5.0 @@ -188,8 +196,10 @@ using StaticArrays end @testset "Mathematical operations" begin - a = make_sym_init(3, 2.0) - b = make_sym_init(3, 3.0) + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 2.0) + b = SymArray{Float64}(undef, 3, 3) + fill!(b, 3.0) # Element-wise operations (using broadcasting) c = a .+ b @@ -211,13 +221,15 @@ using StaticArrays @test all(f[i, j] == 1.5 for i in 1:3, j in 1:3) # Test unary operations - g = make_sym_init(3, -2.0) + g = SymArray{Float64}(undef, 3, 3) + fill!(g, -2.0) h = abs.(g) @test h isa SymArray @test all(h[i, j] == 2.0 for i in 1:3, j in 1:3) # Test with mixed values - m = make_sym_init(3, 0.0) + m = SymArray{Float64}(undef, 3, 3) + fill!(m, 0.0) m[1, 1] = 1.0 m[1, 2] = 2.0 m[2, 2] = 3.0 @@ -234,7 +246,8 @@ using StaticArrays @test n[3, 3] == 16.0 # Test operations between two SymArrays with different values - p = make_sym_init(3, 0.0) + p = SymArray{Float64}(undef, 3, 3) + fill!(p, 0.0) p[1, 1] = 10.0 p[2, 2] = 20.0 p[3, 3] = 30.0 @@ -249,12 +262,14 @@ using StaticArrays end @testset "Special case: sum_tri_with_diag" begin - a = make_sym_init(3, 1.0) + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 1.0) # Only upper triangle is stored: 6 elements # [1,1], [1,2], [1,3], [2,2], [2,3], [3,3] @test sum_tri_with_diag(a) == 6.0 - b = make_sym_init(4, 2.0) + b = SymArray{Float64}(undef, 4, 4) + fill!(b, 2.0) # Upper triangle has 10 elements for 4x4 @test sum_tri_with_diag(b) == 20.0 @@ -266,20 +281,24 @@ using StaticArrays @testset "Type stability" begin # Float64 - a = make_sym_init(3, 1.0) + a = SymArray{Float64}(undef, 3, 3) + 
fill!(a, 1.0) @test typeof(a[1, 1]) == Float64 # Int - b = make_sym_init(3, 1) + b = SymArray{Int}(undef, 3, 3) + fill!(b, 1) @test typeof(b[1, 1]) == Int # Float32 - c = make_sym_init(3, 1.0f0) + c = SymArray{Float32}(undef, 3, 3) + fill!(c, 1.0f0) @test typeof(c[1, 1]) == Float32 end @testset "Sparse matrix properties" begin - a = make_sym_init(10, 0.0) + a = SymArray{Float64}(undef, 10, 10) + fill!(a, 0.0) # Initially all elements are stored (including zeros) # Set only a few elements to non-zero a[1, 5] = 3.0 @@ -297,14 +316,16 @@ using StaticArrays @testset "Edge cases" begin # 1x1 matrix - a = make_sym_init(1, 5.0) + a = SymArray{Float64}(undef, 1, 1) + fill!(a, 5.0) @test size(a) == (1, 1) @test a[1, 1] == 5.0 a[1, 1] = 10.0 @test a[1, 1] == 10.0 # Large diagonal - b = make_sym_init(100, 0.0) + b = SymArray{Float64}(undef, 100, 100) + fill!(b, 0.0) for i in 1:100 b[i, i] = Float64(i) end @@ -332,7 +353,8 @@ using StaticArrays end @testset "Broadcasting with regular arrays" begin - a = make_sym_init(3, 2.0) + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 2.0) M = [1.0 2.0 3.0; 4.0 5.0 6.0; 7.0 8.0 9.0] # SymArray + Matrix should return Matrix (follows Matrix type) @@ -363,7 +385,8 @@ using StaticArrays @testset "SymArray broadcast with Matrix returns Matrix" begin # Create a SymArray and a regular Matrix - a = make_sym_init(3, 2.0) + a = SymArray{Float64}(undef, 3, 3) + fill!(a, 2.0) M = [1.0 2.0 3.0; 4.0 5.0 6.0; 7.0 8.0 9.0] # SymArray + Matrix should return Matrix @@ -389,7 +412,8 @@ using StaticArrays @test all(result3[i, j] ≈ 7.0 for i in 1:3, j in 1:3) # SymArray + SymArray should return SymArray - b = make_sym_init(3, 3.0) + b = SymArray{Float64}(undef, 3, 3) + fill!(b, 3.0) result4 = a .+ b @test result4 isa SymArray @test all(result4[i, j] ≈ 5.0 for i in 1:3, j in 1:3) @@ -399,7 +423,8 @@ using StaticArrays @test result5 isa SymArray @test all(result5[i, j] ≈ 6.0 for i in 1:3, j in 1:3) - a_ones = make_sym_init(3, 1.0) + a_ones = 
SymArray{Float64}(undef, 3, 3) + fill!(a_ones, 1.0) result_sum_two_matrices = a_ones .+ M .+ M @test result_sum_two_matrices isa Matrix{Float64} @test all(result_sum_two_matrices[i, j] ≈ 1 + 2 * M[i, j] for i in 1:3, j in 1:3) From 854400aaee80908938ab897560378453695447bd Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 18:50:18 +0100 Subject: [PATCH 217/266] change structure --- docs/examples/custom_suffstats.jl | 91 +++++++++++ src/{estimator => }/GreedySuffStats.jl | 169 +-------------------- src/NetworkHistogram.jl | 8 +- src/{utils => }/SymArray.jl | 0 src/api.jl | 38 ----- src/{utils => }/config_rules/include.jl | 0 src/{utils => }/config_rules/stop_rule.jl | 0 src/{utils => }/config_rules/swap_rule.jl | 0 src/estimator/abstractEstimator.jl | 19 --- src/pseudo_suff_stats/abstract_suffstat.jl | 13 ++ src/pseudo_suff_stats/bernoulli.jl | 43 ++++++ src/pseudo_suff_stats/categorical.jl | 41 +++++ src/pseudo_suff_stats/generic.jl | 65 ++++++++ src/utils/include.jl | 15 -- src/utils/utils_node_labels.jl | 45 ++++++ 15 files changed, 308 insertions(+), 239 deletions(-) create mode 100644 docs/examples/custom_suffstats.jl rename src/{estimator => }/GreedySuffStats.jl (52%) rename src/{utils => }/SymArray.jl (100%) rename src/{utils => }/config_rules/include.jl (100%) rename src/{utils => }/config_rules/stop_rule.jl (100%) rename src/{utils => }/config_rules/swap_rule.jl (100%) delete mode 100644 src/estimator/abstractEstimator.jl create mode 100644 src/pseudo_suff_stats/abstract_suffstat.jl create mode 100644 src/pseudo_suff_stats/bernoulli.jl create mode 100644 src/pseudo_suff_stats/categorical.jl create mode 100644 src/pseudo_suff_stats/generic.jl delete mode 100644 src/utils/include.jl create mode 100644 src/utils/utils_node_labels.jl diff --git a/docs/examples/custom_suffstats.jl b/docs/examples/custom_suffstats.jl new file mode 100644 index 0000000..d3322cd --- /dev/null +++ b/docs/examples/custom_suffstats.jl @@ -0,0 +1,91 @@ +import 
NetworkHistogram: SuffStats, add_sample, remove_sample, make_k_block, score +using StaticArrays +using Accessors + +struct MyCustomSuffStats{M, T} <: SuffStats + h::SVector{M, T} +end + +function MyCustomSuffStats(num_categories::Int) + h = SVector{num_categories, Int}(zeros(Int, num_categories)) + return MyCustomSuffStats{num_categories, Int}(h) +end + +@inline function add_sample(ss::MyCustomSuffStats, sample::Int) + ss = @set ss.h[sample] += 1 + return ss +end + +@inline function remove_sample(ss::MyCustomSuffStats, sample::Int) + ss = @set ss.h[sample] -= 1 + return ss +end + +function make_k_block(k, ::Val{:custom}; num_categories, kwargs...) + k_block = SymArray{MyCustomSuffStats{num_categories, Int}}(undef, k, k) + fill!(k_block, MyCustomSuffStats(num_categories)) + return k_block +end + +@inline function score(ss::MyCustomSuffStats; kwargs...) + n = sum(ss.h) + return n - sum(abs2, ss.h) / max(n, 1) +end + +## + +using Distributions +using NetworkHistogram +using Random +using StatsBase + +function W_multiplex(x, y) + ps = zeros(4) + ps[2] = sqrt(abs(x - y)) / 2 # layer 1 only + ps[3] = abs(sin(2π * x) * sin(2π * y)) / 2 # layer 2 only + ps[4] = min(x, y) / 2 # both layers + ps[1] = 1 - sum(ps[2:4]) # no edge + return DiscreteNonParametric(0:3, SVector{4}(ps)) +end + +m = 4 +graphon = DecoratedGraphon(W_multiplex) + +n = 2000 +true_latents = range(0, 1; length = n) +A = sample_graph(graphon, true_latents); + +k = 20 +oracle_labels = ordered_start_labels(n, k); +initial_labels = shuffle(oracle_labels); + +max_iter = 1_000_000 +stalled_iters = 5_000 + +data = A .+ 1; # shift to 1,2,3,4 for categorical +es_new = NetworkHistogram.GreedySuffStats(data, initial_labels, num_categories = m, + type_suff_stats = :custom, + max_iter = max_iter, + swap_rule = NetworkHistogram.RandomGroupSwap(), + stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), + progress = true, + dist = Categorical(m) +); +node_labels_es_new = NetworkHistogram.estimate!( + 
es_new, data, initial_labels; dist = Categorical(m), + iter_progress = 10_000) + +function params(ss::Union{NetworkHistogram.CategoricalSuffStats, MyCustomSuffStats}) + ss.h ./ sum(ss.h) +end +parameters = Matrix{SVector{m, Float64}}(undef, k, k) +@inbounds for j in 1:k, i in 1:k + parameters[i, j] = SVector{m, Float64}( + params(es_new.block_ss[i, j])...) +end +model_es_new = NetworkHistogram.DecoratedSBM( + DiscreteNonParametric.(Ref(0:(m - 1)), parameters), counts(node_labels_es_new) ./ + length(node_labels_es_new)); + +res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); +NetworkHistogram.align_res_true_latents!(res_new, oracle_labels); diff --git a/src/estimator/GreedySuffStats.jl b/src/GreedySuffStats.jl similarity index 52% rename from src/estimator/GreedySuffStats.jl rename to src/GreedySuffStats.jl index 841d37f..17dd67b 100644 --- a/src/estimator/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -1,171 +1,12 @@ -abstract type SuffStats end +abstract type SBMEstimator end -function add_sample end -function remove_sample end +abstract type Result end -add_sample(suffstats::SuffStats, sample, i, j) = add_sample(suffstats, sample) -remove_sample(suffstats::SuffStats, sample, i, j) = remove_sample(suffstats, sample) -function make_k_block end -function score end - -### ======================================================================================== - -struct CategoricalSuffStats{M, T} <: SuffStats - h::SVector{M, T} - n::Int -end - -function CategoricalSuffStats(num_categories::Int) - h = SVector{num_categories, Int}(zeros(Int, num_categories)) - return CategoricalSuffStats{num_categories, Int}(h, 0) -end - -@inline function add_sample(ss::CategoricalSuffStats, sample::Int) - ss = @set ss.h[sample] += 1 - ss = @set ss.n += 1 - return ss -end - -@inline function add_sample(ss::CategoricalSuffStats, ::Nothing) - @reset ss.n += 1 - return ss -end - -@inline function remove_sample(ss::CategoricalSuffStats, sample::Int) - ss = @set 
ss.h[sample] -= 1 - ss = @set ss.n -= 1 - return ss -end - -@inline function remove_sample(ss::CategoricalSuffStats, ::Nothing) - @reset ss.n -= 1 - return ss -end - -function make_k_block(k, ::Val{:categorical}; num_categories, kwargs...) - k_block = SymArray{CategoricalSuffStats{num_categories, Int}}(undef, k, k) - fill!(k_block, CategoricalSuffStats(num_categories)) - return k_block -end - -@inline function score(ss::CategoricalSuffStats; kwargs...) - return ss.n - sum(abs2, ss.h) / max(ss.n, 1) -end - -### ======================================================================================== - -struct BernoulliSuffStats{T} <: SuffStats - h::T - n::T -end - -function BernoulliSuffStats() - return BernoulliSuffStats{Int}(0, 0) -end - -@inline function add_sample(ss::BernoulliSuffStats, sample::Bool) - sample && (@reset ss.h += 1) - @reset ss.n += 1 - return ss -end - -@inline function add_sample(ss::BernoulliSuffStats, ::Nothing) - @reset ss.n += 1 - return ss -end - -@inline function remove_sample(ss::BernoulliSuffStats, sample::Bool) - sample && (@reset ss.h -= 1) - @reset ss.n -= 1 - return ss -end - -@inline function remove_sample(ss::BernoulliSuffStats, ::Nothing) - @reset ss.n -= 1 - return ss +struct NethistResult{L, M} <: Result + labels::L + model::M end -function make_k_block(k, ::Val{:binary}; kwargs...) - k_block = SymArray{BernoulliSuffStats{Int}}(undef, k, k) - fill!(k_block, BernoulliSuffStats()) - return k_block -end - -@inline function score(ss::BernoulliSuffStats; kwargs...) 
- n = max(ss.n, 1) - p = ss.h / n - return n * (xlogx(1 - p) + xlogx(p)) -end - -### ======================================================================================== - -abstract type GenericSuffStatsType <: SuffStats end - -struct GenericSuffStats{T} <: GenericSuffStatsType - samples::Vector{T} -end - -function GenericSuffStats(::AbstractArray{T}) where {T} - return GenericSuffStats{T}(Vector{T}()) -end - -function get_samples(ss::GenericSuffStats) - return ss.samples -end - -function add_sample(ss::GenericSuffStats, sample) - append!(ss.samples, sample) - return ss -end - -function remove_sample(ss::GenericSuffStats, sample) - index = findfirst(==(sample), ss.samples) - if index !== nothing - deleteat!(ss.samples, index) - end - return ss -end - -function make_k_block(k, generic; data::AbstractArray, kwargs...) - @warn "Using GenericSuffStats may lead to high memory usage for large datasets. - Consider using more specialized sufficient statistics types when possible." - k_block = SymArray{GenericSuffStats{eltype(data)}}(undef, k, k) - for j in 1:k, i in 1:k - k_block[i, j] = GenericSuffStats(data) - end - return k_block -end - -# use indices rather than pushing and deleting samples for better performance ? -# struct GenericSuffStatsIndex{T} <: GenericSuffStatsType -# indices::Vector{Tuple{Int, Int}} -# data::T -# end - -# function get_samples(ss::GenericSuffStatsIndex) -# return [ss.data[i, j] for (i, j) in ss.indices] -# end - -# function GenericSuffStatsIndex{T}(data::T) where {T} -# return GenericSuffStatsIndex{T}(Vector{Tuple{Int, Int}}(), data) -# end - -# function add_sample(ss::GenericSuffStatsIndex, sample, i, j) -# push!(ss.indices, (i, j)) -# return ss -# end - -function score(ss::GenericSuffStatsType; dist::D, kwargs...) 
where {D} - if dist === nothing - @error("No distribution provided for scoring GenericSuffStats") - end - samples = get_samples(ss) - d = fit(D, samples) - return -sum(logpdf.(d, samples)) -end - -### ======================================================================================== - struct GreedySuffStats{M, NodeR <: NodeSwapRule, StopR <: StopRule} <: SBMEstimator block_ss::M block_ss_swap::M diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 7d9d408..2ee34a4 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -18,13 +18,15 @@ using Reexport import Graphons: _extract_param, convert_to_params -include("utils/include.jl") - +include("SymArray.jl") @reexport using .FastSymArray include("distributions/hist_dist.jl") include("preprocessor/abstractConvertor.jl") -include("estimator/abstractEstimator.jl") +include("config_rules/include.jl") +include("pseudo_suff_stats/abstract_suffstat.jl") +include("GreedySuffStats.jl") +include("utils/utils_node_labels.jl") include("api.jl") export GreedyParams, nethist, nethist_discrete_edges, ordered_start_labels, RandomGroupSwap, diff --git a/src/utils/SymArray.jl b/src/SymArray.jl similarity index 100% rename from src/utils/SymArray.jl rename to src/SymArray.jl diff --git a/src/api.jl b/src/api.jl index 44a9f9b..b0f0cf5 100644 --- a/src/api.jl +++ b/src/api.jl @@ -49,41 +49,3 @@ function nethist_continuous_edges(A_cont, initial_node_labels, params::GreedyPar end # functions for postprocessing - -struct NethistResult{S} - node_labels::Vector{Int} - model::S -end - -function node_labels_to_latents(node_labels::AbstractVector{Int}, sbm) - return map(label -> _label_to_latent(label, sbm), node_labels) -end - -function _label_to_latent(label::Int, sbm) - return sbm.cumsize[label] - eps() -end - -function align_res_true_latents!(res, latents) - perm = order_groups(res.node_labels, latents) - permute!(res.model, perm) - res.node_labels .= map(x -> findfirst(==(x), perm), res.node_labels) -end - 
-function permute!(sbm, perm) - permuted_theta = copy(sbm.θ) - sbm.θ .= permuted_theta[perm, perm] - sbm.size .= sbm.size[perm] - sbm.cumsize .= cumsum(sbm.size) -end - -function order_groups(node_labels, latents::AbstractVector) - n = length(node_labels) - k = length(unique(node_labels)) - sort_perm = sortperm(latents) - sorted_group_labels = node_labels[sort_perm] - dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] - counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) - for group in 1:k) - return sort( - 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) -end diff --git a/src/utils/config_rules/include.jl b/src/config_rules/include.jl similarity index 100% rename from src/utils/config_rules/include.jl rename to src/config_rules/include.jl diff --git a/src/utils/config_rules/stop_rule.jl b/src/config_rules/stop_rule.jl similarity index 100% rename from src/utils/config_rules/stop_rule.jl rename to src/config_rules/stop_rule.jl diff --git a/src/utils/config_rules/swap_rule.jl b/src/config_rules/swap_rule.jl similarity index 100% rename from src/utils/config_rules/swap_rule.jl rename to src/config_rules/swap_rule.jl diff --git a/src/estimator/abstractEstimator.jl b/src/estimator/abstractEstimator.jl deleted file mode 100644 index f6fe72d..0000000 --- a/src/estimator/abstractEstimator.jl +++ /dev/null @@ -1,19 +0,0 @@ -""" - SBMEstimator - -Abstract base type for all Stochastic Block Model (SBM) estimators. 
- -All concrete estimator types should implement: -- `estimate(estimator, data, initial_labels; progress=true)`: Main estimation function -- `score(estimator)`: Return current objective value (if applicable) -""" -abstract type SBMEstimator end - -abstract type Result end - -# struct NethistResult{L, M} <: Result -# labels::L -# model::M -# end - -include("GreedySuffStats.jl") diff --git a/src/pseudo_suff_stats/abstract_suffstat.jl b/src/pseudo_suff_stats/abstract_suffstat.jl new file mode 100644 index 0000000..af90250 --- /dev/null +++ b/src/pseudo_suff_stats/abstract_suffstat.jl @@ -0,0 +1,13 @@ +abstract type SuffStats end + +function add_sample end +function remove_sample end + +add_sample(suffstats::SuffStats, sample, i, j) = add_sample(suffstats, sample) +remove_sample(suffstats::SuffStats, sample, i, j) = remove_sample(suffstats, sample) +function make_k_block end +function score end + +include("categorical.jl") +include("bernoulli.jl") +include("generic.jl") diff --git a/src/pseudo_suff_stats/bernoulli.jl b/src/pseudo_suff_stats/bernoulli.jl new file mode 100644 index 0000000..581ae4f --- /dev/null +++ b/src/pseudo_suff_stats/bernoulli.jl @@ -0,0 +1,43 @@ + +struct BernoulliSuffStats{T} <: SuffStats + h::T + n::T +end + +function BernoulliSuffStats() + return BernoulliSuffStats{Int}(0, 0) +end + +@inline function add_sample(ss::BernoulliSuffStats, sample::Bool) + sample && (@reset ss.h += 1) + @reset ss.n += 1 + return ss +end + +@inline function add_sample(ss::BernoulliSuffStats, ::Nothing) + @reset ss.n += 1 + return ss +end + +@inline function remove_sample(ss::BernoulliSuffStats, sample::Bool) + sample && (@reset ss.h -= 1) + @reset ss.n -= 1 + return ss +end + +@inline function remove_sample(ss::BernoulliSuffStats, ::Nothing) + @reset ss.n -= 1 + return ss +end + +function make_k_block(k, ::Val{:binary}; kwargs...) 
+ k_block = SymArray{BernoulliSuffStats{Int}}(undef, k, k) + fill!(k_block, BernoulliSuffStats()) + return k_block +end + +@inline function score(ss::BernoulliSuffStats; kwargs...) + n = max(ss.n, 1) + p = ss.h / n + return n * (xlogx(1 - p) + xlogx(p)) +end diff --git a/src/pseudo_suff_stats/categorical.jl b/src/pseudo_suff_stats/categorical.jl new file mode 100644 index 0000000..ab4d95c --- /dev/null +++ b/src/pseudo_suff_stats/categorical.jl @@ -0,0 +1,41 @@ +struct CategoricalSuffStats{M, T} <: SuffStats + h::SVector{M, T} + n::Int +end + +function CategoricalSuffStats(num_categories::Int) + h = SVector{num_categories, Int}(zeros(Int, num_categories)) + return CategoricalSuffStats{num_categories, Int}(h, 0) +end + +@inline function add_sample(ss::CategoricalSuffStats, sample::Int) + ss = @set ss.h[sample] += 1 + ss = @set ss.n += 1 + return ss +end + +@inline function add_sample(ss::CategoricalSuffStats, ::Nothing) + @reset ss.n += 1 + return ss +end + +@inline function remove_sample(ss::CategoricalSuffStats, sample::Int) + ss = @set ss.h[sample] -= 1 + ss = @set ss.n -= 1 + return ss +end + +@inline function remove_sample(ss::CategoricalSuffStats, ::Nothing) + @reset ss.n -= 1 + return ss +end + +function make_k_block(k, ::Val{:categorical}; num_categories, kwargs...) + k_block = SymArray{CategoricalSuffStats{num_categories, Int}}(undef, k, k) + fill!(k_block, CategoricalSuffStats(num_categories)) + return k_block +end + +@inline function score(ss::CategoricalSuffStats; kwargs...) 
+ return ss.n - sum(abs2, ss.h) / max(ss.n, 1) +end diff --git a/src/pseudo_suff_stats/generic.jl b/src/pseudo_suff_stats/generic.jl new file mode 100644 index 0000000..e0b6b76 --- /dev/null +++ b/src/pseudo_suff_stats/generic.jl @@ -0,0 +1,65 @@ + +abstract type GenericSuffStatsType <: SuffStats end + +struct GenericSuffStats{T} <: GenericSuffStatsType + samples::Vector{T} +end + +function GenericSuffStats(::AbstractArray{T}) where {T} + return GenericSuffStats{T}(Vector{T}()) +end + +function get_samples(ss::GenericSuffStats) + return ss.samples +end + +function add_sample(ss::GenericSuffStats, sample) + append!(ss.samples, sample) + return ss +end + +function remove_sample(ss::GenericSuffStats, sample) + index = findfirst(==(sample), ss.samples) + if index !== nothing + deleteat!(ss.samples, index) + end + return ss +end + +function make_k_block(k, generic; data::AbstractArray, kwargs...) + @warn "Using GenericSuffStats may lead to high memory usage for large datasets. + Consider using more specialized sufficient statistics types when possible." + k_block = SymArray{GenericSuffStats{eltype(data)}}(undef, k, k) + for j in 1:k, i in 1:k + k_block[i, j] = GenericSuffStats(data) + end + return k_block +end + +# use indices rather than pushing and deleting samples for better performance ? +# struct GenericSuffStatsIndex{T} <: GenericSuffStatsType +# indices::Vector{Tuple{Int, Int}} +# data::T +# end + +# function get_samples(ss::GenericSuffStatsIndex) +# return [ss.data[i, j] for (i, j) in ss.indices] +# end + +# function GenericSuffStatsIndex{T}(data::T) where {T} +# return GenericSuffStatsIndex{T}(Vector{Tuple{Int, Int}}(), data) +# end + +# function add_sample(ss::GenericSuffStatsIndex, sample, i, j) +# push!(ss.indices, (i, j)) +# return ss +# end + +function score(ss::GenericSuffStatsType; dist::D, kwargs...) 
where {D} + if dist === nothing + @error("No distribution provided for scoring GenericSuffStats") + end + samples = get_samples(ss) + d = fit(D, samples) + return -sum(logpdf.(d, samples)) +end diff --git a/src/utils/include.jl b/src/utils/include.jl deleted file mode 100644 index 01682f0..0000000 --- a/src/utils/include.jl +++ /dev/null @@ -1,15 +0,0 @@ -include("SymArray.jl") -include("config_rules/include.jl") - -function ordered_start_labels(n::Int, k::Int) - labels = Vector{Int}(undef, n) - base_size = n ÷ k - remainder = n % k - for group in 1:k - fill!(view(labels, ((group - 1) * base_size + 1):(group * base_size)), group) - end - if remainder > 0 - fill!(view(labels, (k * base_size + 1):(k * base_size + remainder)), k) - end - return labels -end diff --git a/src/utils/utils_node_labels.jl b/src/utils/utils_node_labels.jl new file mode 100644 index 0000000..c9351fa --- /dev/null +++ b/src/utils/utils_node_labels.jl @@ -0,0 +1,45 @@ +function ordered_start_labels(n::Int, k::Int) + labels = Vector{Int}(undef, n) + base_size = n ÷ k + remainder = n % k + for group in 1:k + fill!(view(labels, ((group - 1) * base_size + 1):(group * base_size)), group) + end + if remainder > 0 + fill!(view(labels, (k * base_size + 1):(k * base_size + remainder)), k) + end + return labels +end + +function node_labels_to_latents(node_labels::AbstractVector{Int}, sbm) + return map(label -> _label_to_latent(label, sbm), node_labels) +end + +function _label_to_latent(label::Int, sbm) + return sbm.cumsize[label] - eps() +end + +function align_res_true_latents!(res::NethistResult, latents) + perm = order_groups(res.labels, latents) + permute!(res.model, perm) + res.labels .= map(x -> findfirst(==(x), perm), res.labels) +end + +function permute!(sbm, perm) + permuted_theta = copy(sbm.θ) + sbm.θ .= permuted_theta[perm, perm] + sbm.size .= sbm.size[perm] + sbm.cumsize .= cumsum(sbm.size) +end + +function order_groups(node_labels, latents::AbstractVector) + n = length(node_labels) + k = 
length(unique(node_labels)) + sort_perm = sortperm(latents) + sorted_group_labels = node_labels[sort_perm] + dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] + counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) + for group in 1:k) + return sort( + 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) +end From 418f6487b11499cbd728f74f362bef40448a7ee5 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 19:24:50 +0100 Subject: [PATCH 218/266] remove kwargs expension in score --- Project.toml | 2 ++ docs/examples/custom_suffstats.jl | 5 ++--- src/GreedySuffStats.jl | 20 ++++++++--------- src/pseudo_suff_stats/abstract_suffstat.jl | 5 +++-- src/pseudo_suff_stats/bernoulli.jl | 2 +- src/pseudo_suff_stats/categorical.jl | 2 +- src/pseudo_suff_stats/generic.jl | 26 +++++++++------------- 7 files changed, 30 insertions(+), 32 deletions(-) diff --git a/Project.toml b/Project.toml index 7645576..ee404b6 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ authors = ["Charles Dufour", "Jake Grainger"] [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" @@ -30,6 +31,7 @@ LightMCExt = "LightMC" [compat] Accessors = "0.1.42" ArgCheck = "2.5.0" +BenchmarkTools = "1.6.3" Clustering = "0.15.8" IntervalSets = "0.7.11" LinearAlgebra = "1.12.0" diff --git a/docs/examples/custom_suffstats.jl b/docs/examples/custom_suffstats.jl index d3322cd..3af7601 100644 --- a/docs/examples/custom_suffstats.jl +++ b/docs/examples/custom_suffstats.jl @@ -27,7 +27,7 @@ function make_k_block(k, ::Val{:custom}; num_categories, kwargs...) return k_block end -@inline function score(ss::MyCustomSuffStats; kwargs...) 
+@inline function score(ss::MyCustomSuffStats) n = sum(ss.h) return n - sum(abs2, ss.h) / max(n, 1) end @@ -72,8 +72,7 @@ es_new = NetworkHistogram.GreedySuffStats(data, initial_labels, num_categories = dist = Categorical(m) ); node_labels_es_new = NetworkHistogram.estimate!( - es_new, data, initial_labels; dist = Categorical(m), - iter_progress = 10_000) + es_new, data, initial_labels; iter_progress = 10_000) function params(ss::Union{NetworkHistogram.CategoricalSuffStats, MyCustomSuffStats}) ss.h ./ sum(ss.h) diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl index 17dd67b..316c6cb 100644 --- a/src/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -30,20 +30,19 @@ function init!(es::GreedySuffStats, data, node_labels) end # TODO: allow for non-symmetric data -@inline function score(matrix_ss::SymArray, data, node_labels; dist = nothing, norm = 1.0) +@inline function score(matrix_ss::SymArray; norm = 1.0) total_loss = 0.0 for m in matrix_ss.uppertrian.nzval - total_loss += score(m; dist = dist, data = data, node_labels = node_labels) + total_loss += score(m) end return total_loss / norm end -@inline function score(matrix_ss, data, node_labels; dist = nothing, norm = 1.0) +@inline function score(matrix_ss; norm = 1.0) total_loss = 0.0 @inbounds for j in axes(matrix_ss, 2) for i in 1:j - inter = score( - matrix_ss[i, j]; dist = dist, data = data, node_labels = node_labels) + inter = score(matrix_ss[i, j]) total_loss += inter end end @@ -53,13 +52,15 @@ end function GreedySuffStats( data, node_labels; type_suff_stats = :categorical, max_iter = 10000, node_swap_rule = RandomGroupSwap(), stop_rule = PreviousBestValue(5_000, Inf, :min), + dist = nothing, kwargs...) # derive user input k = length(unique(node_labels)) # allocate sufficient statistics blocks - block_ss = make_k_block(k, Val(type_suff_stats); data = data, kwargs...) - block_ss_swap = make_k_block(k, Val(type_suff_stats); data = data, kwargs...) 
+ block_ss = make_k_block(k, Val(type_suff_stats); data = data, dist = dist, kwargs...) + block_ss_swap = make_k_block( + k, Val(type_suff_stats); data = data, dist = dist, kwargs...) # create estimator return GreedySuffStats{typeof(block_ss), typeof(node_swap_rule), typeof(stop_rule)}( @@ -72,7 +73,6 @@ function estimate!( data, node_labels_init; progress = true, - dist = nothing, iter_progress = 5000 ) # Initialize node labels @@ -93,7 +93,7 @@ function estimate!( progress_update_interval = max(1, es.max_iter ÷ iter_progress) # Initial log-likelihood - current_loss = score(es.block_ss, data, node_labels, dist = dist, norm = n_edges) + current_loss = score(es.block_ss, norm = n_edges) es.stop_rule.previous_best_value = current_loss # Main optimization loop for iter in 1:(es.max_iter) @@ -124,7 +124,7 @@ function estimate!( # tentative swap @inbounds node_labels[index1], node_labels[index2] = group2, group1 - new_loss = score(es.block_ss_swap, data, node_labels, dist = dist, norm = n_edges) + new_loss = score(es.block_ss_swap, norm = n_edges) if new_loss < current_loss # apply swap diff --git a/src/pseudo_suff_stats/abstract_suffstat.jl b/src/pseudo_suff_stats/abstract_suffstat.jl index af90250..a2adf0f 100644 --- a/src/pseudo_suff_stats/abstract_suffstat.jl +++ b/src/pseudo_suff_stats/abstract_suffstat.jl @@ -2,11 +2,12 @@ abstract type SuffStats end function add_sample end function remove_sample end +function make_k_block end +function score end +# some suffstat may need the edge index (i,j) to update properly add_sample(suffstats::SuffStats, sample, i, j) = add_sample(suffstats, sample) remove_sample(suffstats::SuffStats, sample, i, j) = remove_sample(suffstats, sample) -function make_k_block end -function score end include("categorical.jl") include("bernoulli.jl") diff --git a/src/pseudo_suff_stats/bernoulli.jl b/src/pseudo_suff_stats/bernoulli.jl index 581ae4f..b7e727b 100644 --- a/src/pseudo_suff_stats/bernoulli.jl +++ b/src/pseudo_suff_stats/bernoulli.jl @@ 
-36,7 +36,7 @@ function make_k_block(k, ::Val{:binary}; kwargs...) return k_block end -@inline function score(ss::BernoulliSuffStats; kwargs...) +@inline function score(ss::BernoulliSuffStats) n = max(ss.n, 1) p = ss.h / n return n * (xlogx(1 - p) + xlogx(p)) diff --git a/src/pseudo_suff_stats/categorical.jl b/src/pseudo_suff_stats/categorical.jl index ab4d95c..54fee93 100644 --- a/src/pseudo_suff_stats/categorical.jl +++ b/src/pseudo_suff_stats/categorical.jl @@ -36,6 +36,6 @@ function make_k_block(k, ::Val{:categorical}; num_categories, kwargs...) return k_block end -@inline function score(ss::CategoricalSuffStats; kwargs...) +@inline function score(ss::CategoricalSuffStats, dist) return ss.n - sum(abs2, ss.h) / max(ss.n, 1) end diff --git a/src/pseudo_suff_stats/generic.jl b/src/pseudo_suff_stats/generic.jl index e0b6b76..d4c2445 100644 --- a/src/pseudo_suff_stats/generic.jl +++ b/src/pseudo_suff_stats/generic.jl @@ -1,12 +1,10 @@ - -abstract type GenericSuffStatsType <: SuffStats end - -struct GenericSuffStats{T} <: GenericSuffStatsType +struct GenericSuffStats{T, D} <: SuffStats samples::Vector{T} + dist::D end -function GenericSuffStats(::AbstractArray{T}) where {T} - return GenericSuffStats{T}(Vector{T}()) +function GenericSuffStats(::AbstractArray{T}, dist::D) where {T, D} + return GenericSuffStats{T, D}(Vector{T}(), dist) end function get_samples(ss::GenericSuffStats) @@ -26,12 +24,12 @@ function remove_sample(ss::GenericSuffStats, sample) return ss end -function make_k_block(k, generic; data::AbstractArray, kwargs...) - @warn "Using GenericSuffStats may lead to high memory usage for large datasets. +function make_k_block(k, generic; data::AbstractArray, dist::D, kwargs...) where {D} + @warn "Using GenericSuffStats may be very slow even for small graphs. Consider using more specialized sufficient statistics types when possible." 
- k_block = SymArray{GenericSuffStats{eltype(data)}}(undef, k, k) + k_block = SymArray{GenericSuffStats{eltype(data), D}}(undef, k, k) for j in 1:k, i in 1:k - k_block[i, j] = GenericSuffStats(data) + k_block[i, j] = GenericSuffStats(data, dist) end return k_block end @@ -55,11 +53,9 @@ end # return ss # end -function score(ss::GenericSuffStatsType; dist::D, kwargs...) where {D} - if dist === nothing - @error("No distribution provided for scoring GenericSuffStats") - end +function score(ss::GenericSuffStats) + isnothing(ss.dist) && @error("No distribution provided for scoring GenericSuffStats") samples = get_samples(ss) - d = fit(D, samples) + d = fit(typeof(ss.dist), samples) return -sum(logpdf.(d, samples)) end From a576654000d11baefe578c1bf03606ac07ff5608 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 19:27:19 +0100 Subject: [PATCH 219/266] correct typo --- src/GreedySuffStats.jl | 4 ++-- src/pseudo_suff_stats/categorical.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl index 316c6cb..aa4d741 100644 --- a/src/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -30,7 +30,7 @@ function init!(es::GreedySuffStats, data, node_labels) end # TODO: allow for non-symmetric data -@inline function score(matrix_ss::SymArray; norm = 1.0) +@inline function score(matrix_ss::SymArray{<:SuffStats}; norm = 1.0) total_loss = 0.0 for m in matrix_ss.uppertrian.nzval total_loss += score(m) @@ -38,7 +38,7 @@ end return total_loss / norm end -@inline function score(matrix_ss; norm = 1.0) +@inline function score(matrix_ss::AbstractMatrix{<:SuffStats}; norm = 1.0) total_loss = 0.0 @inbounds for j in axes(matrix_ss, 2) for i in 1:j diff --git a/src/pseudo_suff_stats/categorical.jl b/src/pseudo_suff_stats/categorical.jl index 54fee93..bf7bded 100644 --- a/src/pseudo_suff_stats/categorical.jl +++ b/src/pseudo_suff_stats/categorical.jl @@ -36,6 +36,6 @@ function make_k_block(k, ::Val{:categorical}; 
num_categories, kwargs...) return k_block end -@inline function score(ss::CategoricalSuffStats, dist) +@inline function score(ss::CategoricalSuffStats) return ss.n - sum(abs2, ss.h) / max(ss.n, 1) end From 570f08a5a5a70b34329caecce1d1202e3a32a4c9 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 19:57:08 +0100 Subject: [PATCH 220/266] tidy and clean --- docs/examples/custom_suffstats.jl | 35 ++++--- src/GreedySuffStats.jl | 2 +- src/SymArray.jl | 37 +------ src/api.jl | 106 +++++++++++---------- src/preprocessor/abstractConvertor.jl | 2 +- src/pseudo_suff_stats/abstract_suffstat.jl | 1 + src/pseudo_suff_stats/bernoulli.jl | 4 + src/pseudo_suff_stats/categorical.jl | 5 + src/pseudo_suff_stats/generic.jl | 10 +- test/test_symarray.jl | 21 +--- 10 files changed, 94 insertions(+), 129 deletions(-) diff --git a/docs/examples/custom_suffstats.jl b/docs/examples/custom_suffstats.jl index 3af7601..ff1cc25 100644 --- a/docs/examples/custom_suffstats.jl +++ b/docs/examples/custom_suffstats.jl @@ -1,4 +1,5 @@ -import NetworkHistogram: SuffStats, add_sample, remove_sample, make_k_block, score +import NetworkHistogram: SuffStats, add_sample, remove_sample, make_k_block, score, + to_params, CategoricalConvertor, num_bins, to_distribution using StaticArrays using Accessors @@ -32,6 +33,10 @@ end return n - sum(abs2, ss.h) / max(n, 1) end +function to_params(ss::MyCustomSuffStats) + n = max(sum(ss.h), 1) + return ss.h ./ n +end ## using Distributions @@ -48,7 +53,8 @@ function W_multiplex(x, y) return DiscreteNonParametric(0:3, SVector{4}(ps)) end -m = 4 +convertor = CategoricalConvertor(4, Dict(0 => 1, 1 => 2, 2 => 3, 3 => 4)) + graphon = DecoratedGraphon(W_multiplex) n = 2000 @@ -59,32 +65,23 @@ k = 20 oracle_labels = ordered_start_labels(n, k); initial_labels = shuffle(oracle_labels); -max_iter = 1_000_000 +max_iter = 500 stalled_iters = 5_000 -data = A .+ 1; # shift to 1,2,3,4 for categorical -es_new = NetworkHistogram.GreedySuffStats(data, initial_labels, 
num_categories = m, +data = convertor.(A) +es_new = NetworkHistogram.GreedySuffStats( + data, initial_labels, num_categories = num_bins(convertor), type_suff_stats = :custom, max_iter = max_iter, swap_rule = NetworkHistogram.RandomGroupSwap(), stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), - progress = true, - dist = Categorical(m) + progress = true ); -node_labels_es_new = NetworkHistogram.estimate!( +node_labels_es_new, parameters = NetworkHistogram.estimate!( es_new, data, initial_labels; iter_progress = 10_000) -function params(ss::Union{NetworkHistogram.CategoricalSuffStats, MyCustomSuffStats}) - ss.h ./ sum(ss.h) -end -parameters = Matrix{SVector{m, Float64}}(undef, k, k) -@inbounds for j in 1:k, i in 1:k - parameters[i, j] = SVector{m, Float64}( - params(es_new.block_ss[i, j])...) -end -model_es_new = NetworkHistogram.DecoratedSBM( - DiscreteNonParametric.(Ref(0:(m - 1)), parameters), counts(node_labels_es_new) ./ - length(node_labels_es_new)); +model_es_new = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), + counts(node_labels_es_new) ./ length(node_labels_es_new)); res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); NetworkHistogram.align_res_true_latents!(res_new, oracle_labels); diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl index aa4d741..7866700 100644 --- a/src/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -154,5 +154,5 @@ function estimate!( finish!(pbar) @info "Optimization finished. Final loss: $current_loss" - return node_labels + return node_labels, to_params.(es.block_ss) end diff --git a/src/SymArray.jl b/src/SymArray.jl index b0dfcc2..df16c88 100644 --- a/src/SymArray.jl +++ b/src/SymArray.jl @@ -12,7 +12,7 @@ import Base: eltype, convert, size, getindex, setindex!, copy!, similar, IndexStyle, axes, length, iterate, copyto!, fill! 
import SparseArrays: getcolptr, nonzeros, FixedSparseCSC -export SymArray, eltype, deepcopy!, sum_tri_with_diag, make_sym_init +export SymArray, eltype, deepcopy! """ SymArray{F} <: AbstractSparseMatrix{F, 2} @@ -39,8 +39,6 @@ sym[2, 1] # Returns 5.0 A = [1 2 3; 2 4 5; 3 5 6] sym = SymArray(A) ``` - -See also: [`sum_tri_with_diag`](@ref) """ mutable struct SymArray{F} <: AbstractSparseMatrix{F, Int} uppertrian::SparseMatrixCSC{F, Int} @@ -82,20 +80,6 @@ function make_csc_format(k::Int, ::Type{F}) where {F} return k, k, colptr, rowval, nzval end -function make_sym_init(k, d::Real) - a = SymArray{typeof(d)}(undef, k, k) - fill!(a, d) - return a -end - -function make_sym_init(k, d) - a = SymArray{typeof(d)}(undef, k, k) - for j in 1:k, i in 1:j - a[i, j] = deepcopy(d) - end - return a -end - """ SymArray(d::AbstractMatrix{F}) @@ -147,21 +131,6 @@ function copy!(dest::SymArray{F}, src::SymArray{F}) where {F} return nothing end -""" - sum_tri_with_diag(a::SymArray) - -Efficiently sum all elements in the symmetric matrix (counting each off-diagonal once). - -# Returns -- Sum of all unique elements in the symmetric matrix - -# Note -This is more efficient than `sum(a)` because it only sums stored elements. 
-""" -function sum_tri_with_diag(a::SymArray) - return sum(a.uppertrian) -end - function convert(::Type{SymArray{F}}, a::AbstractMatrix{F}) where {F} k, n = size(a) @assert k==n "Input matrix must be square, got size $(size(a))" @@ -254,7 +223,6 @@ end # This maintains the symmetric structure during broadcast operations function Base.copyto!(dest::SymArray, bc::Broadcast.Broadcasted{SymArrayStyle}) axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc)) - _copyto_nzval!(dest, bc) return dest # # Try to use optimized nzval path for simple operations @@ -279,9 +247,8 @@ function _copyto_nzval!( bc_nzval = _replace_with_nzval(bc) # Broadcast directly on the nzval array - dest_nzval = nonzeros(dest.uppertrian) + dest_nzval = dest.uppertrian.nzval copyto!(dest_nzval, bc_nzval) - return dest end diff --git a/src/api.jl b/src/api.jl index b0f0cf5..383251c 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,51 +1,55 @@ -function nethist_binary_edges(A, initial_node_labels, params::GreedyParams) - k = length(unique(initial_node_labels)) - data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) - es = GreedyAverage( - counts_main, counts_swap, realized, realized_swap, - params.max_iter, params.swap_rule, params.stop_rule) - node_labels = estimate(es, data, initial_node_labels, progress = params.progress_bar) - sizes = counts(node_labels) ./ length(node_labels) - - θ = Matrix{Float64}(undef, k, k) - @inbounds for j in 1:k, i in 1:k - θ[i, j] = es.realized[i, j][2] / max(1, es.counts[i, j]) - end - model = SBM(θ, sizes) - - return NethistResult(node_labels, model) -end - -function nethist_discrete_edges( - A, initial_node_labels, params::GreedyParams, m = length(unique(A))) - k = length(unique(initial_node_labels)) - data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k, m = m) - es = GreedyAverage( - counts_main, counts_swap, realized, realized_swap, - params.max_iter, params.swap_rule, params.stop_rule) - node_labels = 
estimate(es, data, initial_node_labels, progress = params.progress_bar) - sizes = counts(node_labels) ./ length(node_labels) - parameters = Matrix{SVector{m, Float64}}(undef, k, k) - @inbounds for j in 1:k, i in 1:k - parameters[i, j] = [es.realized[i, j][c] / max(es.counts[i, j], 1) for c in 1:m] - end - s = zero(eltype(A)) in A ? collect(0:(m - 1)) : 1:m - model = DecoratedSBM(DiscreteNonParametric.(Ref(s), parameters), sizes) - return NethistResult(node_labels, model), es -end - -function nethist_continuous_edges(A_cont, initial_node_labels, params::GreedyParams; - num_bins_::Int = 10, lower_bound = quantile(A_cont[:], 0.01), upper_bound = quantile( - A_cont[:], 0.99)) - convertor = ContinuousConvertor(lower_bound, upper_bound, num_bins_) - A = convertor.(A_cont) - @info "Discretized continuous edge values into $(num_bins(convertor)) bins" - res_cat = nethist_discrete_edges( - A, initial_node_labels, params, num_bins(convertor)) - parameters = NetworkHistogram.HistDistribution.( - Graphons._extract_param.(res_cat.model.θ), convertor) - model = DecoratedSBM(parameters, res_cat.model.size) - return NethistResult(res_cat.node_labels, model), res_cat, A -end - -# functions for postprocessing + + + + + + +# function nethist_binary_edges(A, initial_node_labels, params::GreedyParams) +# k = length(unique(initial_node_labels)) +# data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) +# es = GreedyAverage( +# counts_main, counts_swap, realized, realized_swap, +# params.max_iter, params.swap_rule, params.stop_rule) +# node_labels = estimate(es, data, initial_node_labels, progress = params.progress_bar) +# sizes = counts(node_labels) ./ length(node_labels) + +# θ = Matrix{Float64}(undef, k, k) +# @inbounds for j in 1:k, i in 1:k +# θ[i, j] = es.realized[i, j][2] / max(1, es.counts[i, j]) +# end +# model = SBM(θ, sizes) + +# return NethistResult(node_labels, model) +# end + +# function nethist_discrete_edges( +# A, initial_node_labels, 
params::GreedyParams, m = length(unique(A))) +# k = length(unique(initial_node_labels)) +# data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k, m = m) +# es = GreedyAverage( +# counts_main, counts_swap, realized, realized_swap, +# params.max_iter, params.swap_rule, params.stop_rule) +# node_labels = estimate(es, data, initial_node_labels, progress = params.progress_bar) +# sizes = counts(node_labels) ./ length(node_labels) +# parameters = Matrix{SVector{m, Float64}}(undef, k, k) +# @inbounds for j in 1:k, i in 1:k +# parameters[i, j] = [es.realized[i, j][c] / max(es.counts[i, j], 1) for c in 1:m] +# end +# s = zero(eltype(A)) in A ? collect(0:(m - 1)) : 1:m +# model = DecoratedSBM(DiscreteNonParametric.(Ref(s), parameters), sizes) +# return NethistResult(node_labels, model), es +# end + +# function nethist_continuous_edges(A_cont, initial_node_labels, params::GreedyParams; +# num_bins_::Int = 10, lower_bound = quantile(A_cont[:], 0.01), upper_bound = quantile( +# A_cont[:], 0.99)) +# convertor = ContinuousConvertor(lower_bound, upper_bound, num_bins_) +# A = convertor.(A_cont) +# @info "Discretized continuous edge values into $(num_bins(convertor)) bins" +# res_cat = nethist_discrete_edges( +# A, initial_node_labels, params, num_bins(convertor)) +# parameters = NetworkHistogram.HistDistribution.( +# Graphons._extract_param.(res_cat.model.θ), convertor) +# model = DecoratedSBM(parameters, res_cat.model.size) +# return NethistResult(res_cat.node_labels, model), res_cat, A +# end diff --git a/src/preprocessor/abstractConvertor.jl b/src/preprocessor/abstractConvertor.jl index 3d2f3e2..bc6e09e 100644 --- a/src/preprocessor/abstractConvertor.jl +++ b/src/preprocessor/abstractConvertor.jl @@ -1,6 +1,6 @@ abstract type AbstractConvertor end -Base.broadcastable(o::T) where {T <: AbstractConvertor} = Ref(o) +Base.broadcastable(o::AbstractConvertor) = Ref(o) """ Convert data from its original form to a processed form suitable for SBM estimation. 
diff --git a/src/pseudo_suff_stats/abstract_suffstat.jl b/src/pseudo_suff_stats/abstract_suffstat.jl index a2adf0f..099e409 100644 --- a/src/pseudo_suff_stats/abstract_suffstat.jl +++ b/src/pseudo_suff_stats/abstract_suffstat.jl @@ -4,6 +4,7 @@ function add_sample end function remove_sample end function make_k_block end function score end +function to_params end # some suffstat may need the edge index (i,j) to update properly add_sample(suffstats::SuffStats, sample, i, j) = add_sample(suffstats, sample) diff --git a/src/pseudo_suff_stats/bernoulli.jl b/src/pseudo_suff_stats/bernoulli.jl index b7e727b..f63f35b 100644 --- a/src/pseudo_suff_stats/bernoulli.jl +++ b/src/pseudo_suff_stats/bernoulli.jl @@ -41,3 +41,7 @@ end p = ss.h / n return n * (xlogx(1 - p) + xlogx(p)) end + +function to_params(ss::BernoulliSuffStats) + return ss.h / max(ss.n, 1) +end diff --git a/src/pseudo_suff_stats/categorical.jl b/src/pseudo_suff_stats/categorical.jl index bf7bded..05fde85 100644 --- a/src/pseudo_suff_stats/categorical.jl +++ b/src/pseudo_suff_stats/categorical.jl @@ -39,3 +39,8 @@ end @inline function score(ss::CategoricalSuffStats) return ss.n - sum(abs2, ss.h) / max(ss.n, 1) end + +function to_params(ss::CategoricalSuffStats) + n = max(ss.n, 1) + return ss.h ./ n +end diff --git a/src/pseudo_suff_stats/generic.jl b/src/pseudo_suff_stats/generic.jl index d4c2445..e52947b 100644 --- a/src/pseudo_suff_stats/generic.jl +++ b/src/pseudo_suff_stats/generic.jl @@ -53,9 +53,13 @@ end # return ss # end -function score(ss::GenericSuffStats) - isnothing(ss.dist) && @error("No distribution provided for scoring GenericSuffStats") +function score(ss::GenericSuffStats{T, D}) where {T, D} samples = get_samples(ss) - d = fit(typeof(ss.dist), samples) + d = fit(D, samples) return -sum(logpdf.(d, samples)) end + +function to_params(ss::GenericSuffStats) + d = fit(typeof(ss.dist), get_samples(ss)) + return params(d) +end diff --git a/test/test_symarray.jl b/test/test_symarray.jl index 
724387a..5f9d40a 100644 --- a/test/test_symarray.jl +++ b/test/test_symarray.jl @@ -261,24 +261,6 @@ using StaticArrays @test q[2, 1] == 2.0 end - @testset "Special case: sum_tri_with_diag" begin - a = SymArray{Float64}(undef, 3, 3) - fill!(a, 1.0) - # Only upper triangle is stored: 6 elements - # [1,1], [1,2], [1,3], [2,2], [2,3], [3,3] - @test sum_tri_with_diag(a) == 6.0 - - b = SymArray{Float64}(undef, 4, 4) - fill!(b, 2.0) - # Upper triangle has 10 elements for 4x4 - @test sum_tri_with_diag(b) == 20.0 - - # Verify it's different from full sum (which counts off-diag twice) - # Full sum would be 2*n*(n-1)/2 + n for value v - # = v*(n^2-n+n) = v*n^2 - # While sum_tri_with_diag gives v*n*(n+1)/2 - end - @testset "Type stability" begin # Float64 a = SymArray{Float64}(undef, 3, 3) @@ -378,7 +360,8 @@ using StaticArrays @test result3 isa SymArray # SymArray + SymArray should return SymArray - b = make_sym_init(3, 3.0) + b = SymArray{Float64}(undef, 3, 3) + fill!(b, 3.0) result4 = a .+ b @test result4 isa SymArray end From 5684140d9fc38c8142f13e5b0ff5948efa6930f0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 20:43:40 +0100 Subject: [PATCH 221/266] fix broadcasting --- docs/examples/custom_suffstats.jl | 2 +- src/SymArray.jl | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/examples/custom_suffstats.jl b/docs/examples/custom_suffstats.jl index ff1cc25..1e603c1 100644 --- a/docs/examples/custom_suffstats.jl +++ b/docs/examples/custom_suffstats.jl @@ -65,7 +65,7 @@ k = 20 oracle_labels = ordered_start_labels(n, k); initial_labels = shuffle(oracle_labels); -max_iter = 500 +max_iter = 1_000_000 stalled_iters = 5_000 data = convertor.(A) diff --git a/src/SymArray.jl b/src/SymArray.jl index df16c88..f886556 100644 --- a/src/SymArray.jl +++ b/src/SymArray.jl @@ -44,12 +44,20 @@ mutable struct SymArray{F} <: AbstractSparseMatrix{F, Int} uppertrian::SparseMatrixCSC{F, Int} end +function check_size(a) + if 
length(a.uppertrian.nzval) == 0 + println("Warning: SymArray has zero stored elements.") + end +end + SymArray(::Type{F}, dims::Int...) where {F} = SymArray(F, dims) function SymArray(::Type{F}, dims::NTuple{2, Int}) where {F} if dims[1] != dims[2] throw(ArgumentError("SymArray must be square, got dims=$(dims)")) end - SymArray{F}(SparseMatrixCSC{F, Int}(make_csc_format(dims[1], F)...)) + a = SymArray{F}(SparseMatrixCSC{F, Int}(make_csc_format(dims[1], F)...)) + check_size(a) + return a end SymArray{F}(::UndefInitializer, dims::Int...) where {F} = SymArray{F}(undef, dims) @@ -186,12 +194,7 @@ Base.BroadcastStyle(::SymArrayStyle, ::SymArrayStyle) = SymArrayStyle() # Custom similar for broadcasted SymArrays function Base.similar( bc::Broadcast.Broadcasted{Broadcast.ArrayStyle{SymArray}}, ::Type{ElType}) where {ElType} - A = find_symarray(bc) - if A == nothing - return SymArray(similar(SparseMatrixCSC{ElType, Int}, axes(bc)...)) - else - return SymArray(similar(A.uppertrian, ElType)) - end + return SymArray{ElType}(undef, size(bc)...) 
end # Helper function to find a SymArray in the broadcast tree From a3ce9294e17127915d09fff9f1a4f75f9bcca372 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 20:52:15 +0100 Subject: [PATCH 222/266] weighted seems to work --- docs/literate/tutorials/weighted_network.jl | 41 +++++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index 3fdb435..084f250 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -22,35 +22,44 @@ let fig end -n = 5000 -k = 15 A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(0.9), n, n)); oracle_latents = ordered_start_labels(n, k); starting_labels = copy(oracle_latents); -# shuffle!(starting_labels); p_shuffle = 1 - 1.5 / k @info "Shuffling $(p_shuffle*100)% labels for starting point" indices_to_shuffle = sample(1:n, floor(Int, n * p_shuffle), replace = false); starting_labels[indices_to_shuffle] .= shuffle(starting_labels[indices_to_shuffle]); @assert starting_labels != oracle_latents -res, res_cat, A_cat = NetworkHistogram.nethist_continuous_edges(A, - starting_labels, GreedyParams( - 1_000_000, - RandomGroupSwap(), - Strict(), - PreviousBestValue(10_000, Inf, :min), - true # progress bar - ); - num_bins_ = 10, lower_bound = eps(), upper_bound = 1); +convertor = NetworkHistogram.UnitIntervalConvertor(10) -latents = range(0, 1; length = n); +max_iter = 1_000_000 +stalled_iters = 5_000 -ssm_test = SSM(res.model, k) +data = convertor.(A) +es_new = NetworkHistogram.GreedySuffStats( + data, initial_labels, num_categories = num_bins(convertor), + type_suff_stats = :categorical, + max_iter = max_iter, + swap_rule = NetworkHistogram.RandomGroupSwap(), + stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), + progress = true +); +node_labels_es_new, parameters = NetworkHistogram.estimate!( + es_new, data, initial_labels; iter_progress 
= 10_000) + +model_es_new = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), + counts(node_labels_es_new) ./ length(node_labels_es_new)); + +res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); +NetworkHistogram.align_res_true_latents!(res_new, oracle_labels); + +## +ssm_test = SSM(res_new.model, k) shape_range = 1:min(30, k * (k + 1) ÷ 2 - 1) ssm_estimated, criterion_values = Graphons.estimate_ssm( - res_cat.model, A_cat, latents, shape_range) + res_new.model, A_cat, latents, shape_range) Mke.lines(shape_range, criterion_values) @@ -60,4 +69,4 @@ kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smooth # Let's extract the optimal number of shapes using the Kneedle algorithm: k_knee = knees(kr)[1] -ssm_knee = SSM(res.model, k_knee) +ssm_knee = SSM(res_new.model, k_knee) From 8921f78e9036b19b33c94cbf0af04658a613c023 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Mon, 27 Oct 2025 21:14:22 +0100 Subject: [PATCH 223/266] add rudimentary api --- docs/literate/tutorials/weighted_network.jl | 55 ++++++++++++--------- src/NetworkHistogram.jl | 2 +- src/api.jl | 52 ++++++++++++++++++- 3 files changed, 84 insertions(+), 25 deletions(-) diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index 084f250..9899f46 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -22,6 +22,8 @@ let fig end +n = 4000 +k = 4 A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(0.9), n, n)); oracle_latents = ordered_start_labels(n, k); starting_labels = copy(oracle_latents); @@ -31,42 +33,51 @@ indices_to_shuffle = sample(1:n, floor(Int, n * p_shuffle), replace = false); starting_labels[indices_to_shuffle] .= shuffle(starting_labels[indices_to_shuffle]); @assert starting_labels != oracle_latents -convertor = NetworkHistogram.UnitIntervalConvertor(10) - max_iter = 1_000_000 stalled_iters = 5_000 -data = convertor.(A) 
-es_new = NetworkHistogram.GreedySuffStats( - data, initial_labels, num_categories = num_bins(convertor), - type_suff_stats = :categorical, +res_new = NetworkHistogram.nethist_continuous( + A, k, + starting_labels; + num_bins_ = 10, max_iter = max_iter, - swap_rule = NetworkHistogram.RandomGroupSwap(), - stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), - progress = true + stalled_iters = stalled_iters, + progress_bar = true ); -node_labels_es_new, parameters = NetworkHistogram.estimate!( - es_new, data, initial_labels; iter_progress = 10_000) -model_es_new = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), - counts(node_labels_es_new) ./ length(node_labels_es_new)); +# convertor = NetworkHistogram.UnitIntervalConvertor(10) + +# data = convertor.(A) +# es_new = NetworkHistogram.GreedySuffStats( +# data, initial_labels, num_categories = num_bins(convertor), +# type_suff_stats = :categorical, +# max_iter = max_iter, +# swap_rule = NetworkHistogram.RandomGroupSwap(), +# stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), +# progress = true +# ); +# node_labels_es_new, parameters = NetworkHistogram.estimate!( +# es_new, data, initial_labels; iter_progress = 10_000) + +# model_es_new = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), +# counts(node_labels_es_new) ./ length(node_labels_es_new)); -res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); -NetworkHistogram.align_res_true_latents!(res_new, oracle_labels); +# res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); +NetworkHistogram.align_res_true_latents!(res_new, oracle_latents); ## ssm_test = SSM(res_new.model, k) -shape_range = 1:min(30, k * (k + 1) ÷ 2 - 1) +shape_range = 1:min(5, k * (k + 1) ÷ 2 - 1) ssm_estimated, criterion_values = Graphons.estimate_ssm( - res_new.model, A_cat, latents, shape_range) + res_new.model, A, res_new.labels, shape_range) Mke.lines(shape_range, 
criterion_values) ## -using Kneedle -kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) -# Let's extract the optimal number of shapes using the Kneedle algorithm: +# using Kneedle +# kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +# # Let's extract the optimal number of shapes using the Kneedle algorithm: -k_knee = knees(kr)[1] -ssm_knee = SSM(res_new.model, k_knee) +# k_knee = knees(kr)[1] +# ssm_knee = SSM(res_new.model, k_knee) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 2ee34a4..ba477ff 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -8,7 +8,7 @@ import Base: convert, eltype, zero using Distributions using LinearAlgebra using ArgCheck -import Random: randperm, AbstractRNG, rand +import Random: randperm, AbstractRNG, rand, shuffle import Distributions: logpdf, pdf using IntervalSets diff --git a/src/api.jl b/src/api.jl index 383251c..3952fe9 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,8 +1,56 @@ +function nethist_categorical( + A, k, + labels_start = shuffle(ordered_start_labels(size(A, 1), k)); + max_iter = 1_000_000, + stalled_iters = 5_000, + progress_bar = true) + _nethist( + A, labels_start, + CategoricalConvertor(A), + :categorical; + max_iter = max_iter, + stalled_iters = stalled_iters, + progress_bar = progress_bar + ) +end +function nethist_continuous( + A, k, + labels_start = shuffle(ordered_start_labels(size(A, 1), k)); + num_bins_::Int = 10, + max_iter = 1_000_000, + stalled_iters = 5_000, + progress_bar = true) + _nethist( + A, labels_start, + UnitIntervalConvertor(num_bins_), + :categorical; + max_iter = max_iter, + stalled_iters = stalled_iters, + progress_bar = progress_bar + ) +end - - +function _nethist(A, labels_start, convertor, type_suff_stats; max_iter = max_iter, + stalled_iters = 10_000, + progress_bar = true) + data = convertor.(A) + @info "Using $(num_bins(convertor)) discrete categories for edge values" + es = 
NetworkHistogram.GreedySuffStats( + data, labels_start, num_categories = num_bins(convertor), + type_suff_stats = type_suff_stats, + max_iter = max_iter, + swap_rule = NetworkHistogram.RandomGroupSwap(), + stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), + progress = progress_bar + ) + node_labels, parameters = NetworkHistogram.estimate!( + es, data, labels_start; iter_progress = 10_000) + model = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), + counts(node_labels) ./ length(node_labels)) + return NetworkHistogram.NethistResult(node_labels, model) +end # function nethist_binary_edges(A, initial_node_labels, params::GreedyParams) # k = length(unique(initial_node_labels)) From 16a41e5a0bbc058d23086df28e95897180602bab Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 09:21:43 +0100 Subject: [PATCH 224/266] remove unnecessary @inline --- src/pseudo_suff_stats/bernoulli.jl | 10 +++++----- src/pseudo_suff_stats/categorical.jl | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/pseudo_suff_stats/bernoulli.jl b/src/pseudo_suff_stats/bernoulli.jl index f63f35b..eceba35 100644 --- a/src/pseudo_suff_stats/bernoulli.jl +++ b/src/pseudo_suff_stats/bernoulli.jl @@ -8,24 +8,24 @@ function BernoulliSuffStats() return BernoulliSuffStats{Int}(0, 0) end -@inline function add_sample(ss::BernoulliSuffStats, sample::Bool) +function add_sample(ss::BernoulliSuffStats, sample::Bool) sample && (@reset ss.h += 1) @reset ss.n += 1 return ss end -@inline function add_sample(ss::BernoulliSuffStats, ::Nothing) +function add_sample(ss::BernoulliSuffStats, ::Nothing) @reset ss.n += 1 return ss end -@inline function remove_sample(ss::BernoulliSuffStats, sample::Bool) +function remove_sample(ss::BernoulliSuffStats, sample::Bool) sample && (@reset ss.h -= 1) @reset ss.n -= 1 return ss end -@inline function remove_sample(ss::BernoulliSuffStats, ::Nothing) +function remove_sample(ss::BernoulliSuffStats, 
::Nothing) @reset ss.n -= 1 return ss end @@ -36,7 +36,7 @@ function make_k_block(k, ::Val{:binary}; kwargs...) return k_block end -@inline function score(ss::BernoulliSuffStats) +function score(ss::BernoulliSuffStats) n = max(ss.n, 1) p = ss.h / n return n * (xlogx(1 - p) + xlogx(p)) diff --git a/src/pseudo_suff_stats/categorical.jl b/src/pseudo_suff_stats/categorical.jl index 05fde85..f4ad54c 100644 --- a/src/pseudo_suff_stats/categorical.jl +++ b/src/pseudo_suff_stats/categorical.jl @@ -8,24 +8,24 @@ function CategoricalSuffStats(num_categories::Int) return CategoricalSuffStats{num_categories, Int}(h, 0) end -@inline function add_sample(ss::CategoricalSuffStats, sample::Int) +function add_sample(ss::CategoricalSuffStats, sample::Int) ss = @set ss.h[sample] += 1 ss = @set ss.n += 1 return ss end -@inline function add_sample(ss::CategoricalSuffStats, ::Nothing) +function add_sample(ss::CategoricalSuffStats, ::Nothing) @reset ss.n += 1 return ss end -@inline function remove_sample(ss::CategoricalSuffStats, sample::Int) +function remove_sample(ss::CategoricalSuffStats, sample::Int) ss = @set ss.h[sample] -= 1 ss = @set ss.n -= 1 return ss end -@inline function remove_sample(ss::CategoricalSuffStats, ::Nothing) +function remove_sample(ss::CategoricalSuffStats, ::Nothing) @reset ss.n -= 1 return ss end @@ -36,7 +36,7 @@ function make_k_block(k, ::Val{:categorical}; num_categories, kwargs...) 
return k_block end -@inline function score(ss::CategoricalSuffStats) +function score(ss::CategoricalSuffStats) return ss.n - sum(abs2, ss.h) / max(ss.n, 1) end From f4aec5c79faad5394641558d0615ec9da7cbcbfb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 09:21:50 +0100 Subject: [PATCH 225/266] clean parameters --- src/GreedySuffStats.jl | 4 ++-- src/config_rules/include.jl | 17 +++++++++++++---- src/config_rules/stop_rule.jl | 11 ++++------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl index 7866700..b0403a8 100644 --- a/src/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -94,7 +94,7 @@ function estimate!( # Initial log-likelihood current_loss = score(es.block_ss, norm = n_edges) - es.stop_rule.previous_best_value = current_loss + reset!(es.stop_rule, current_loss) # Main optimization loop for iter in 1:(es.max_iter) # Select two nodes to potentially swap @@ -126,7 +126,7 @@ function estimate!( @inbounds node_labels[index1], node_labels[index2] = group2, group1 new_loss = score(es.block_ss_swap, norm = n_edges) - if new_loss < current_loss + if compare_to_best(new_loss, current_loss, es.stop_rule) # apply swap copy!(es.block_ss, es.block_ss_swap) current_loss = new_loss diff --git a/src/config_rules/include.jl b/src/config_rules/include.jl index e4c4476..67fc118 100644 --- a/src/config_rules/include.jl +++ b/src/config_rules/include.jl @@ -1,8 +1,17 @@ include("swap_rule.jl") include("stop_rule.jl") -struct GreedyParams{N <: NodeSwapRule, S <: StopRule} - max_iter::Int - node_swap_rule::N - stop_rule::S +abstract type ParamsType end + +@kwdef struct GreedyParams{N <: NodeSwapRule, S <: StopRule} <: ParamsType + max_iter::Int = 1_000_000 + stalled_iters::Int = 5_000 + node_swap_rule::N = RandomGroupSwap() + stop_rule::S = PreviousBestValue(stalled_iters, Inf, :min) +end + +function reset!(params::GreedyParams) + params.stop_rule = PreviousBestValue( + params.stalled_iters, Inf, :min) 
+ return params end diff --git a/src/config_rules/stop_rule.jl b/src/config_rules/stop_rule.jl index 85c94b2..033f8bb 100644 --- a/src/config_rules/stop_rule.jl +++ b/src/config_rules/stop_rule.jl @@ -5,7 +5,7 @@ function info_to_print(::StopRule) end mutable struct PreviousBestValue{T, S} <: StopRule - k::Int + const k::Int previous_best_value::T iterations_since_best::Int end @@ -18,9 +18,9 @@ end const PreviousMaxValue{T} = PreviousBestValue{T, Val(:max)} const PreviousMinValue{T} = PreviousBestValue{T, Val(:min)} -function initialise_stop_rule!(stop_rule::PreviousBestValue, a) - score_value = score(a) +function reset!(stop_rule::PreviousBestValue{T}, score_value::T) where {T} stop_rule.previous_best_value = score_value + stop_rule.iterations_since_best = 0 end function compare_to_best(current, past, ::PreviousMaxValue) @@ -44,16 +44,13 @@ stopping_rule function stopping_rule(loss::T, stop_rule::PreviousBestValue{T}) where {T <: Real} if compare_to_best(loss, stop_rule.previous_best_value, stop_rule) - stop_rule.previous_best_value = loss - stop_rule.iterations_since_best = 0 + reset!(stop_rule, loss) else stop_rule.iterations_since_best += 1 end return stop_rule.iterations_since_best >= stop_rule.k end -stopping_rule(a, stop_rule::StopRule) = stopping_rule(score(a), stop_rule) - function info_to_print(stop_rule::PreviousBestValue) ("stalled iter: ", stop_rule.iterations_since_best) end From 23263234686357891e00412a01454c15387589e5 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 12:46:39 +0100 Subject: [PATCH 226/266] add parameters struct --- docs/examples/custom_suffstats.jl | 2 +- docs/literate/tutorials/weighted_network.jl | 5 +- src/GreedySuffStats.jl | 7 +- src/api.jl | 91 ++++----------------- src/config_rules/include.jl | 6 +- src/config_rules/stop_rule.jl | 3 + 6 files changed, 30 insertions(+), 84 deletions(-) diff --git a/docs/examples/custom_suffstats.jl b/docs/examples/custom_suffstats.jl index 1e603c1..00432e4 100644 --- 
a/docs/examples/custom_suffstats.jl +++ b/docs/examples/custom_suffstats.jl @@ -71,7 +71,7 @@ stalled_iters = 5_000 data = convertor.(A) es_new = NetworkHistogram.GreedySuffStats( data, initial_labels, num_categories = num_bins(convertor), - type_suff_stats = :custom, + type_suff_stats = Val(:custom), max_iter = max_iter, swap_rule = NetworkHistogram.RandomGroupSwap(), stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index 9899f46..a640c67 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -39,10 +39,7 @@ stalled_iters = 5_000 res_new = NetworkHistogram.nethist_continuous( A, k, starting_labels; - num_bins_ = 10, - max_iter = max_iter, - stalled_iters = stalled_iters, - progress_bar = true + num_bins_ = 10 ); # convertor = NetworkHistogram.UnitIntervalConvertor(10) diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl index b0403a8..2bb8b8d 100644 --- a/src/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -50,7 +50,7 @@ end end function GreedySuffStats( - data, node_labels; type_suff_stats = :categorical, max_iter = 10000, + data, node_labels; type_suff_stats = Val(:categorical), max_iter = 10000, node_swap_rule = RandomGroupSwap(), stop_rule = PreviousBestValue(5_000, Inf, :min), dist = nothing, kwargs...) @@ -58,9 +58,8 @@ function GreedySuffStats( k = length(unique(node_labels)) # allocate sufficient statistics blocks - block_ss = make_k_block(k, Val(type_suff_stats); data = data, dist = dist, kwargs...) - block_ss_swap = make_k_block( - k, Val(type_suff_stats); data = data, dist = dist, kwargs...) + block_ss = make_k_block(k, type_suff_stats; data = data, dist = dist, kwargs...) + block_ss_swap = make_k_block(k, type_suff_stats; data = data, dist = dist, kwargs...) 
# create estimator return GreedySuffStats{typeof(block_ss), typeof(node_swap_rule), typeof(stop_rule)}( diff --git a/src/api.jl b/src/api.jl index 3952fe9..b945c14 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,17 +1,12 @@ - function nethist_categorical( A, k, labels_start = shuffle(ordered_start_labels(size(A, 1), k)); - max_iter = 1_000_000, - stalled_iters = 5_000, - progress_bar = true) + params::GreedyParams = GreedyParams()) _nethist( A, labels_start, CategoricalConvertor(A), - :categorical; - max_iter = max_iter, - stalled_iters = stalled_iters, - progress_bar = progress_bar + Val(:categorical), + params ) end @@ -19,85 +14,35 @@ function nethist_continuous( A, k, labels_start = shuffle(ordered_start_labels(size(A, 1), k)); num_bins_::Int = 10, - max_iter = 1_000_000, - stalled_iters = 5_000, - progress_bar = true) + params::GreedyParams = GreedyParams()) _nethist( A, labels_start, UnitIntervalConvertor(num_bins_), - :categorical; - max_iter = max_iter, - stalled_iters = stalled_iters, - progress_bar = progress_bar + Val(:categorical), + params ) end -function _nethist(A, labels_start, convertor, type_suff_stats; max_iter = max_iter, - stalled_iters = 10_000, - progress_bar = true) +function _nethist( + A, labels_start, convertor, type_suff_stats, + params::GreedyParams = GreedyParams(); kwargs...) + if !params.warm_start + reset!(params) + end data = convertor.(A) @info "Using $(num_bins(convertor)) discrete categories for edge values" es = NetworkHistogram.GreedySuffStats( data, labels_start, num_categories = num_bins(convertor), type_suff_stats = type_suff_stats, - max_iter = max_iter, - swap_rule = NetworkHistogram.RandomGroupSwap(), - stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), - progress = progress_bar + max_iter = params.max_iter, + swap_rule = params.node_swap_rule, + stop_rule = params.stop_rule, + progress = params.display_progress; + kwargs... 
) node_labels, parameters = NetworkHistogram.estimate!( - es, data, labels_start; iter_progress = 10_000) + es, data, labels_start; iter_progress = params.progress_freq) model = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), counts(node_labels) ./ length(node_labels)) return NetworkHistogram.NethistResult(node_labels, model) end - -# function nethist_binary_edges(A, initial_node_labels, params::GreedyParams) -# k = length(unique(initial_node_labels)) -# data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k) -# es = GreedyAverage( -# counts_main, counts_swap, realized, realized_swap, -# params.max_iter, params.swap_rule, params.stop_rule) -# node_labels = estimate(es, data, initial_node_labels, progress = params.progress_bar) -# sizes = counts(node_labels) ./ length(node_labels) - -# θ = Matrix{Float64}(undef, k, k) -# @inbounds for j in 1:k, i in 1:k -# θ[i, j] = es.realized[i, j][2] / max(1, es.counts[i, j]) -# end -# model = SBM(θ, sizes) - -# return NethistResult(node_labels, model) -# end - -# function nethist_discrete_edges( -# A, initial_node_labels, params::GreedyParams, m = length(unique(A))) -# k = length(unique(initial_node_labels)) -# data, counts_main, counts_swap, realized, realized_swap = prepare_data_cat(A, k, m = m) -# es = GreedyAverage( -# counts_main, counts_swap, realized, realized_swap, -# params.max_iter, params.swap_rule, params.stop_rule) -# node_labels = estimate(es, data, initial_node_labels, progress = params.progress_bar) -# sizes = counts(node_labels) ./ length(node_labels) -# parameters = Matrix{SVector{m, Float64}}(undef, k, k) -# @inbounds for j in 1:k, i in 1:k -# parameters[i, j] = [es.realized[i, j][c] / max(es.counts[i, j], 1) for c in 1:m] -# end -# s = zero(eltype(A)) in A ? 
collect(0:(m - 1)) : 1:m -# model = DecoratedSBM(DiscreteNonParametric.(Ref(s), parameters), sizes) -# return NethistResult(node_labels, model), es -# end - -# function nethist_continuous_edges(A_cont, initial_node_labels, params::GreedyParams; -# num_bins_::Int = 10, lower_bound = quantile(A_cont[:], 0.01), upper_bound = quantile( -# A_cont[:], 0.99)) -# convertor = ContinuousConvertor(lower_bound, upper_bound, num_bins_) -# A = convertor.(A_cont) -# @info "Discretized continuous edge values into $(num_bins(convertor)) bins" -# res_cat = nethist_discrete_edges( -# A, initial_node_labels, params, num_bins(convertor)) -# parameters = NetworkHistogram.HistDistribution.( -# Graphons._extract_param.(res_cat.model.θ), convertor) -# model = DecoratedSBM(parameters, res_cat.model.size) -# return NethistResult(res_cat.node_labels, model), res_cat, A -# end diff --git a/src/config_rules/include.jl b/src/config_rules/include.jl index 67fc118..d4a03db 100644 --- a/src/config_rules/include.jl +++ b/src/config_rules/include.jl @@ -8,10 +8,12 @@ abstract type ParamsType end stalled_iters::Int = 5_000 node_swap_rule::N = RandomGroupSwap() stop_rule::S = PreviousBestValue(stalled_iters, Inf, :min) + display_progress::Bool = true + progress_freq::Int = 10_000 + warm_start::Bool = false end function reset!(params::GreedyParams) - params.stop_rule = PreviousBestValue( - params.stalled_iters, Inf, :min) + reset!(params.stop_rule) return params end diff --git a/src/config_rules/stop_rule.jl b/src/config_rules/stop_rule.jl index 033f8bb..b1c501c 100644 --- a/src/config_rules/stop_rule.jl +++ b/src/config_rules/stop_rule.jl @@ -23,6 +23,9 @@ function reset!(stop_rule::PreviousBestValue{T}, score_value::T) where {T} stop_rule.iterations_since_best = 0 end +reset!(stop_rule::PreviousMaxValue) = reset!(stop_rule, -Inf) +reset!(stop_rule::PreviousMinValue) = reset!(stop_rule, Inf) + function compare_to_best(current, past, ::PreviousMaxValue) return current > past end From 
904e34a125d05b4087b0a0df8658e5ae3dafe6bf Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 12:47:07 +0100 Subject: [PATCH 227/266] add MC extension with pseudo_factory to get convertor of extensions --- ext/LightMCExt.jl | 84 +++++++++++++-------------- src/preprocessor/abstractConvertor.jl | 12 ++++ src/preprocessor/categorical.jl | 5 ++ src/pseudo_suff_stats/categorical.jl | 27 ++++----- 4 files changed, 66 insertions(+), 62 deletions(-) diff --git a/ext/LightMCExt.jl b/ext/LightMCExt.jl index dea5384..c6affce 100644 --- a/ext/LightMCExt.jl +++ b/ext/LightMCExt.jl @@ -1,66 +1,60 @@ module LightMCExt +using StaticArrays +using Accessors using NetworkHistogram -using LightMC +import NetworkHistogram: SuffStats, add_sample, remove_sample, make_k_block, score, + to_params, AbstractConvertor, to_distribution, get_convertor using LightMC: DiscreteMarkovChain, SampleChain, transition_matrix, ConvertBinaryMC -logpdf(d::DiscreteMarkovChain, x) = LightMC.logpdf(d, x) -sample(x::DiscreteMarkovChain, args...) = LightMC.sample(x, args...) -params(d) = LightMC.params(d) +# need to define a convertor that only look at the possible transitions and not all of them +struct McConvertor <: AbstractConvertor end -function agg_params(d1::DiscreteMarkovChain, d2::DiscreteMarkovChain, w1, w2) - s1 = Int(sign(w1)) - s2 = Int(sign(w2)) - return DiscreteMarkovChain(s1 .* d1.transitions .+ s2 .* d2.transitions, - s1 .* d1.normalization .+ s2 .* d2.normalization) +get_convertor(::Val{:mc}; kwargs...) = McConvertor() + +function (c::McConvertor)(chain::SampleChain) + return SVector([SVector(c...) for c in eachcol(chain.transitions)]...) end -function distance(d1::DiscreteMarkovChain, d2::DiscreteMarkovChain) - mean(x -> x^2, transition_matrix(d1) - transition_matrix(d2)) +function to_distribution(::McConvertor, transition_matrix; kwargs...) 
+ return DiscreteMarkovChain(transition_matrix, sum(transition_matrix; dims = 2)) end -function distance(d1::SampleChain, d2::SampleChain) - mean(x -> x^2, transition_matrix(d1) - transition_matrix(d2)) +struct McSuffStats{M, T} <: SuffStats + h::SVector{M, T} end -params(d::DiscreteMarkovChain) = (d.transitions, d.normalization) -function _fast_compressed_obs(d::DiscreteMarkovChain, x::SampleChain, zeroinflated) - return x +function McSuffStats(num_states::Int) + inter = @SVector zeros(SVector{num_states, Int}, num_states) + return McSuffStats(inter) end -function from_adjs_to_decorated(adjs::AbstractArray{T, 3}, converter::ConvertBinaryMC, - threshold = 0.0) where {T <: Union{Missing, Real}} - sample_chain = MC.periodic_chain(adjs[1, 4, :], converter) - graph = Matrix{Union{typeof(sample_chain), Missing}}( - undef, size(adjs, 1), size(adjs, 2)) - counts_t = sum(adjs, dims = 3) - for j in axes(adjs, 2) - for i in axes(adjs, 1) - if i == j || counts_t[i, j] <= threshold * size(adjs, 3) - graph[i, j] = missing - else - graph[i, j] = LightMC.periodic_chain(adjs[i, j, :], converter) - end - end +function add_sample(ss::McSuffStats, sample) + @inbounds for (i, s) in enumerate(sample) + ss = @set ss.h[i] = ss.h[i] + s end - return graph + return ss end -function from_adjs_to_decorated(adjs::AbstractArray{T, 2}, converter::ConvertBinaryMC, - threshold = 0.0) where {T <: Union{Missing, AbstractArray}} - sample_chain = LightMC.periodic_chain(adjs[1, 4], converter) - graph = Matrix{Union{typeof(sample_chain), Missing}}( - undef, size(adjs, 1), size(adjs, 2)) - for j in axes(adjs, 2) - for i in axes(adjs, 1) - if i == j || sum(adjs[i, j]) <= threshold * length(adjs[i, j]) - graph[i, j] = missing - else - graph[i, j] = LightMC.periodic_chain(adjs[i, j], converter) - end - end +function remove_sample(ss::McSuffStats, sample) + @inbounds for (i, s) in enumerate(sample) + ss = @set ss.h[i] = ss.h[i] - s end - return graph + return ss +end + +function _score(counts::SVector) + n 
= sum(counts) + norm_ = max(n, 1) + return (n - sum(abs2, counts) / norm_) / norm_ +end + +function score(ss::McSuffStats) + return sum(_score, ss.h) +end + +function to_params(ss::McSuffStats) + return reduce(hcat, ss.h) end end diff --git a/src/preprocessor/abstractConvertor.jl b/src/preprocessor/abstractConvertor.jl index bc6e09e..fed5fda 100644 --- a/src/preprocessor/abstractConvertor.jl +++ b/src/preprocessor/abstractConvertor.jl @@ -13,5 +13,17 @@ function to_distribution(c::AbstractConvertor, ps; kwargs...) @error "to be implemented" end +get_convertor(s::String, ; kwargs...) = get_convertor(Symbol(s); kwargs...) +get_convertor(s::Symbol; kwargs...) = get_convertor(Val(s); kwargs...) +get_convertor(::T; kwargs...) where {T} = @error "No convertor found for type $T" + include("categorical.jl") include("continuous.jl") + +function get_convertor(::Val{:categorical}; kwargs...) + return CategoricalConvertor(kwargs[:num_categories]) +end + +function get_convertor(::Val{:continuous}; kwargs...) 
+ return UnitIntervalConvertor(kwargs[:num_bins]) +end diff --git a/src/preprocessor/categorical.jl b/src/preprocessor/categorical.jl index efb9998..7cf3fdd 100644 --- a/src/preprocessor/categorical.jl +++ b/src/preprocessor/categorical.jl @@ -14,6 +14,11 @@ function CategoricalConvertor(data::AbstractArray{T}) where {T} return CategoricalConvertor{T}(m, map) end +function CategoricalConvertor(num_categories::Int) + map = Dict{Int, Int}(i => i for i in 1:num_categories) + return CategoricalConvertor{Int}(num_categories, map) +end + function num_bins(c::CategoricalConvertor) return c.m end diff --git a/src/pseudo_suff_stats/categorical.jl b/src/pseudo_suff_stats/categorical.jl index f4ad54c..3ae65d4 100644 --- a/src/pseudo_suff_stats/categorical.jl +++ b/src/pseudo_suff_stats/categorical.jl @@ -1,32 +1,19 @@ struct CategoricalSuffStats{M, T} <: SuffStats h::SVector{M, T} - n::Int end function CategoricalSuffStats(num_categories::Int) h = SVector{num_categories, Int}(zeros(Int, num_categories)) - return CategoricalSuffStats{num_categories, Int}(h, 0) + return CategoricalSuffStats{num_categories, Int}(h) end function add_sample(ss::CategoricalSuffStats, sample::Int) ss = @set ss.h[sample] += 1 - ss = @set ss.n += 1 - return ss -end - -function add_sample(ss::CategoricalSuffStats, ::Nothing) - @reset ss.n += 1 return ss end function remove_sample(ss::CategoricalSuffStats, sample::Int) ss = @set ss.h[sample] -= 1 - ss = @set ss.n -= 1 - return ss -end - -function remove_sample(ss::CategoricalSuffStats, ::Nothing) - @reset ss.n -= 1 return ss end @@ -37,10 +24,16 @@ function make_k_block(k, ::Val{:categorical}; num_categories, kwargs...) 
end function score(ss::CategoricalSuffStats) - return ss.n - sum(abs2, ss.h) / max(ss.n, 1) + n = sum(ss.h) + return n - sum(abs2, ss.h) / max(n, 1) end function to_params(ss::CategoricalSuffStats) - n = max(ss.n, 1) - return ss.h ./ n + return custom_normalize(ss.h) +end + +function custom_normalize(ps::SVector{M, T}) where {M, T} + n = sum(ps) + n == 0 && return zero(SVector{M, T}) + return ps / n end From 9e4c75adb803a3392b56bb70102d0901e76a264a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 12:58:39 +0100 Subject: [PATCH 228/266] add note --- ext/LightMCExt.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ext/LightMCExt.jl b/ext/LightMCExt.jl index c6affce..34a80a0 100644 --- a/ext/LightMCExt.jl +++ b/ext/LightMCExt.jl @@ -24,6 +24,8 @@ struct McSuffStats{M, T} <: SuffStats h::SVector{M, T} end +# this will also need to be modified to take into account the structure of the markov chain +# as above (e.g. only count the transitions that are possible) function McSuffStats(num_states::Int) inter = @SVector zeros(SVector{num_states, Int}, num_states) return McSuffStats(inter) From dd2aa256983da317c3e035ddb3edac8107bda748 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 12:59:26 +0100 Subject: [PATCH 229/266] remove deprecated --- benchmark/Project.toml | 9 - benchmark/README.md | 153 --------- benchmark/benchmark_optimization.jl | 396 ---------------------- benchmark/benchmark_results/baseline.json | 83 ----- benchmark/profile_optimization.jl | 295 ---------------- benchmark/run_benchmarks.jl | 204 ----------- benchmark/visualize_benchmarks.jl | 263 -------------- 7 files changed, 1403 deletions(-) delete mode 100644 benchmark/Project.toml delete mode 100644 benchmark/README.md delete mode 100644 benchmark/benchmark_optimization.jl delete mode 100644 benchmark/benchmark_results/baseline.json delete mode 100644 benchmark/profile_optimization.jl delete mode 100644 benchmark/run_benchmarks.jl delete mode 100644 
benchmark/visualize_benchmarks.jl diff --git a/benchmark/Project.toml b/benchmark/Project.toml deleted file mode 100644 index 691e4d7..0000000 --- a/benchmark/Project.toml +++ /dev/null @@ -1,9 +0,0 @@ -[deps] -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" -Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" -LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" -NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" -PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/benchmark/README.md b/benchmark/README.md deleted file mode 100644 index 4e21258..0000000 --- a/benchmark/README.md +++ /dev/null @@ -1,153 +0,0 @@ -# NetworkHistogram Benchmarks - -This directory contains benchmarking and profiling tools for -NetworkHistogram.jl performance analysis. - -## Quick Start - -```bash -# Run all benchmarks (saves to benchmark_results/ with timestamp) -julia --project=. benchmark/benchmark_optimization.jl - -# Profile to find bottlenecks -julia --project=. benchmark/profile_optimization.jl swap -``` - -## Files - -| File | Purpose | -| --------------------------- | ----------------------------------------------------- | -| `benchmark_optimization.jl` | Main benchmarking script - runs all performance tests | -| `profile_optimization.jl` | Profile code to identify bottlenecks | -| `visualize_benchmarks.jl` | Compare benchmark results over time | -| `run_benchmarks.jl` | Convenience wrapper with baseline management | -| `benchmark_results/` | Stored benchmark results (JSON, auto-created) | - -## Usage Examples - -### Running Benchmarks - -```bash -# Basic usage -julia --project=. benchmark/benchmark_optimization.jl - -# Save to specific file -julia --project=. benchmark/benchmark_optimization.jl my_results.json - -# Compare with baseline (auto-detects baseline.json) -julia --project=. 
benchmark/benchmark_optimization.jl -``` - -### Profiling - -```bash -# Profile swap operations -julia --project=. benchmark/profile_optimization.jl swap - -# Profile full optimization -julia --project=. benchmark/profile_optimization.jl optimize - -# Profile individual components -julia --project=. benchmark/profile_optimization.jl components -``` - -### Baseline Management - -```bash -# Set current run as baseline -cp benchmark/benchmark_results/benchmark_2025-10-15T22-51-21.json \ - benchmark/benchmark_results/baseline.json -``` - -## What Gets Benchmarked - -### Single Swap Operations - -Tests the core swap operation (apply + revert): - -- **Bernoulli networks**: Binary edges (0/1) - - Small: n=50, k=2 - - Medium: n=200, k=3 - - Large: n=500, k=5 -- **Categorical networks**: Multi-valued edges - - Small: n=50, k=2, m=3 - - Medium: n=200, k=3, m=4 - - Large: n=500, k=5, m=5 - -### Full Optimization - -End-to-end optimization performance: - -- Bernoulli: n=100, 1,000 iterations -- Categorical: n=100, 1,000 iterations - -### Component Benchmarks - -Individual function performance: - -- Assignment creation -- EdgeList creation -- Log-likelihood computation -- Edge extraction between groups - -## Current Performance (October 2025) - -**After Phase 1 Optimizations:** - -| Operation | Time | vs Baseline | -| ---------------------------- | ------- | ----------- | -| Bernoulli swap (n=500) | 2.56 ms | 6.1x faster | -| Bernoulli swap (n=200) | 0.54 ms | 3.2x faster | -| Categorical swap (n=200) | 0.04 ms | 1.4x faster | -| Bernoulli optimize (n=100) | 92 ms | - | -| Categorical optimize (n=100) | 15 ms | - | - -## Optimization Workflow - -1. **Establish baseline**: Run benchmarks before changes -2. **Profile**: Use `profile_optimization.jl` to find hotspots -3. **Optimize**: Edit source code (usually `src/optimization/`) -4. **Benchmark**: Run benchmarks again -5. **Verify**: Run tests to ensure correctness -6. 
**Repeat**: Continue until satisfied - -## Key Hotspots - -Focus optimization efforts on: - -1. **`apply_swap!`** - Called millions of times (biggest impact) -2. **`get_edges_in_groups`** - Called during likelihood updates -3. **Edge iteration** - Used throughout, cumulative effect - -See `dev/CODE_REVIEW_2025-10-15.md` for detailed analysis. - -## Output Format - -``` ---- Single Swap Operations (Bernoulli) --- -Benchmarking Bernoulli swap (n=200, k=3)... - Median: 0.538 ms - -====================================================================== -Performance Comparison vs Baseline -====================================================================== -✓ FASTER bernoulli_swap_n200_k3: 3.23x (223.0%) - Current: 0.54 ms | Baseline: 1.74 ms -``` - -- **✓ FASTER**: >5% improvement -- **✗ SLOWER**: >5% regression -- **≈ SIMILAR**: Within ±5% - -## Tips - -- **Close other apps** for consistent results -- **Run multiple times** to warm up JIT compiler (BenchmarkTools handles this) -- **Check allocations** with `@btime ... samples=1 evals=1` -- **Profile first** before optimizing -- **Test after** every optimization - -## Documentation - -- Full details: See `PERFORMANCE.md` (root directory) -- Code review: See `dev/CODE_REVIEW_2025-10-15.md` diff --git a/benchmark/benchmark_optimization.jl b/benchmark/benchmark_optimization.jl deleted file mode 100644 index 89feebc..0000000 --- a/benchmark/benchmark_optimization.jl +++ /dev/null @@ -1,396 +0,0 @@ -""" -Standalone benchmarking script for NetworkHistogram optimization. - -This script runs performance benchmarks and saves results to track improvements -over time. Results are saved in JSON format with timestamps. - -Usage: - julia --project=. 
benchmark/benchmark_optimization.jl [output_file] - -If output_file is not provided, results are saved to: - benchmark/benchmark_results/benchmark_YYYY-MM-DD_HH-MM-SS.json -""" - -using Random -using StatsBase -using StaticArrays -using BenchmarkTools -using JSON3 -using Dates -using PrettyTables -using NetworkHistogram -using LoggingExtras - -# Create output directory if it doesn't exist -const BENCHMARK_DIR = joinpath(@__DIR__, "benchmark_results") -mkpath(BENCHMARK_DIR) - -# Helper functions to create test networks -function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed = 42) - Random.seed!(seed) - d = NetworkHistogram.Bernoulli(0.5) - sbm = NetworkHistogram.BlockModel(n_groups, d) - - for g1 in 1:n_groups - for g2 in g1:n_groups - p = 0.1 + 0.7 * rand() - sbm[g1, g2] = NetworkHistogram.Bernoulli(p) - end - end - - base_size = n_nodes ÷ n_groups - remainder = n_nodes % n_groups - sizes = fill(base_size, n_groups) - sizes[1:remainder] .+= 1 - labels = StatsBase.inverse_rle(1:n_groups, sizes) - A = NetworkHistogram.sample(sbm, labels) - return A, labels, d -end - -function create_test_sbm_categorical( - n_groups::Int, n_nodes::Int, n_categories::Int; seed = 42) - Random.seed!(seed) - ps = SVector{n_categories}(fill(1 / n_categories, n_categories)) - d = NetworkHistogram.Cat(ps) - sbm = NetworkHistogram.BlockModel(n_groups, d) - - for g1 in 1:n_groups - for g2 in g1:n_groups - probs = rand(n_categories) - probs ./= sum(probs) - sbm[g1, g2] = NetworkHistogram.Cat(SVector{n_categories}(probs)) - end - end - - labels = StatsBase.inverse_rle(1:n_groups, fill(n_nodes ÷ n_groups, n_groups)) - # Ensure we have exactly n_nodes by padding with last group if needed - while length(labels) < n_nodes - push!(labels, n_groups) - end - A = NetworkHistogram.sample(sbm, labels) - return A, labels, d -end - -function benchmark_single_swap( - network_type, n_nodes, n_groups, n_categories = nothing; samples = 100) - if network_type == :bernoulli - A, labels, d = 
create_test_sbm_bernoulli(n_groups, n_nodes) - else - A, labels, d = create_test_sbm_categorical(n_groups, n_nodes, n_categories) - end - - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) - swap = NetworkHistogram.make_swap(assignment, (1, n_nodes)) - - b = @benchmark begin - NetworkHistogram.apply_swap!($assignment, $swap) - NetworkHistogram.revert_swap!($assignment, $swap) - end setup=(NetworkHistogram.make_swap_workspace!($swap.workspace, $assignment)) samples=samples #evals=1 - - return Dict( - "median_ms" => median(b.times) / 1e6, - "mean_ms" => mean(b.times) / 1e6, - "min_ms" => minimum(b.times) / 1e6, - "max_ms" => maximum(b.times) / 1e6, - "std_ms" => std(b.times) / 1e6 - ) -end - -function benchmark_full_optimization( - network_type, n_nodes, n_groups, n_categories = nothing, - max_iter = 1000; samples = 10) - if network_type == :bernoulli - A, labels, d = create_test_sbm_bernoulli(n_groups, n_nodes) - else - A, labels, d = create_test_sbm_categorical(n_groups, n_nodes, n_categories) - end - - initial_labels = rand(1:n_groups, n_nodes) - - b = @benchmark begin - # Create fresh params for each benchmark iteration - params = NetworkHistogram.GreedyParams( - $max_iter, - NetworkHistogram.RandomNodeSwap(), - NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue($max_iter), - false - ) - NetworkHistogram.nethist($A, $d, $initial_labels, params) - end #samples=samples evals=1 - - return Dict( - "median_ms" => median(b.times) / 1e6, - "mean_ms" => mean(b.times) / 1e6, - "min_ms" => minimum(b.times) / 1e6, - "max_ms" => maximum(b.times) / 1e6, - "std_ms" => std(b.times) / 1e6 - ) -end - -function benchmark_component(component_name, setup_fn, benchmark_fn; samples = 1000) - setup_data = setup_fn() - - b = @benchmark $benchmark_fn($setup_data...) 
samples=samples - - return Dict( - "median_us" => median(b.times) / 1e3, - "mean_us" => mean(b.times) / 1e3, - "min_us" => minimum(b.times) / 1e3, - "max_us" => maximum(b.times) / 1e3, - "std_us" => std(b.times) / 1e3 - ) -end - -function run_all_benchmarks() - println("="^70) - println("NetworkHistogram Performance Benchmarks") - println("Started at: $(Dates.format(now(), "yyyy-mm-dd HH:MM:SS"))") - println("="^70) - - results = Dict( - "timestamp" => Dates.format(now(), "yyyy-mm-dd HH:MM:SS"), - "julia_version" => string(VERSION), - "benchmarks" => Dict() - ) - - # Single swap benchmarks - Bernoulli - println("\n--- Single Swap Operations (Bernoulli) ---") - for (n, k, s) in [(50, 2, 100), (200, 3, 100), (500, 5, 50)] - println("Benchmarking Bernoulli swap (n=$n, k=$k)...") - results["benchmarks"]["bernoulli_swap_n$(n)_k$(k)"] = benchmark_single_swap( - :bernoulli, n, k; samples = s) - r = results["benchmarks"]["bernoulli_swap_n$(n)_k$(k)"] - println(" Median: $(round(r["median_ms"], digits=3)) ms") - end - - # Single swap benchmarks - Categorical - println("\n--- Single Swap Operations (Categorical) ---") - for (n, k, m, s) in [(50, 2, 3, 100), (200, 3, 4, 100), (500, 5, 5, 50)] - println("Benchmarking Categorical swap (n=$n, k=$k, m=$m)...") - results["benchmarks"]["categorical_swap_n$(n)_k$(k)_m$(m)"] = benchmark_single_swap( - :categorical, n, k, m; samples = s) - r = results["benchmarks"]["categorical_swap_n$(n)_k$(k)_m$(m)"] - println(" Median: $(round(r["median_ms"], digits=3)) ms") - end - - # Full optimization benchmarks - println("\n--- Full Optimization Workflow ---") - println("Benchmarking Bernoulli optimization (n=100, 1k iters)...") - results["benchmarks"]["bernoulli_optimize_n100_1k"] = benchmark_full_optimization( - :bernoulli, 100, 3, nothing, 1000; samples = 10) - r = results["benchmarks"]["bernoulli_optimize_n100_1k"] - println(" Median: $(round(r["median_ms"], digits=1)) ms") - - println("Benchmarking Categorical optimization (n=100, 1k 
iters)...") - results["benchmarks"]["categorical_optimize_n100_1k"] = benchmark_full_optimization( - :categorical, 100, 3, 3, 1000; samples = 10) - r = results["benchmarks"]["categorical_optimize_n100_1k"] - println(" Median: $(round(r["median_ms"], digits=1)) ms") - - # Component benchmarks - println("\n--- Component Benchmarks ---") - - println("Benchmarking Assignment creation (n=200)...") - results["benchmarks"]["assignment_creation_n200"] = benchmark_component( - "assignment_creation", - () -> begin - A, labels, d = create_test_sbm_bernoulli(3, 200) - edgelist = NetworkHistogram.EdgeList(A) - (labels, edgelist, NetworkHistogram.Dist(d)) - end, - (labels, edgelist, dist) -> NetworkHistogram.Assignment(labels, edgelist, dist); - samples = 100 - ) - r = results["benchmarks"]["assignment_creation_n200"] - println(" Median: $(round(r["median_us"], digits=1)) μs") - - println("Benchmarking EdgeList creation (n=200)...") - results["benchmarks"]["edgelist_creation_n200"] = benchmark_component( - "edgelist_creation", - () -> begin - A, _, _ = create_test_sbm_bernoulli(3, 200) - (A,) - end, - (A,) -> NetworkHistogram.EdgeList(A); - samples = 100 - ) - r = results["benchmarks"]["edgelist_creation_n200"] - println(" Median: $(round(r["median_us"], digits=1)) μs") - - println("Benchmarking Loglikelihood computation (n=200)...") - results["benchmarks"]["loglikelihood_n200"] = benchmark_component( - "loglikelihood", - () -> begin - A, labels, d = create_test_sbm_bernoulli(3, 200) - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment( - labels, edgelist, NetworkHistogram.Dist(d)) - (assignment,) - end, - (assignment,) -> NetworkHistogram.loglikelihood(assignment); - samples = 1000 - ) - r = results["benchmarks"]["loglikelihood_n200"] - println(" Median: $(round(r["median_us"], digits=2)) μs") - - return results -end - -function save_results(results, output_file = nothing) - if isnothing(output_file) - timestamp = Dates.format(now(), 
"yyyy-mm-ddTHH-MM-SS") - output_file = joinpath(BENCHMARK_DIR, "benchmark_$timestamp.json") - end - - open(output_file, "w") do io - JSON3.pretty(io, results) - end - - println("\n" * "="^70) - println("Results saved to: $output_file") - println("="^70) - - return output_file -end - -function compare_with_baseline(results, baseline_file) - if !isfile(baseline_file) - println("\nBaseline file not found: $baseline_file") - return - end - - baseline = JSON3.read(read(baseline_file, String)) - - println("\n" * "="^80) - println("Performance Comparison vs Baseline") - println("Baseline: $(baseline["timestamp"])") - println("="^80 * "\n") - - # Prepare data for table - table_data = [] - - for (key, value) in sort(collect(results["benchmarks"]), by = x -> string(x[1])) - if haskey(baseline["benchmarks"], key) - baseline_val = baseline["benchmarks"][key] - - # Determine which unit to use (ms or us) - # Check both string and symbol keys for JSON3 compatibility - if haskey(value, "median_ms") - current_median = value["median_ms"] - # Try string key first, then symbol key - if haskey(baseline_val, "median_ms") - baseline_median = baseline_val["median_ms"] - elseif haskey(baseline_val, :median_ms) - baseline_median = baseline_val[:median_ms] - else - baseline_median = get( - baseline_val, "median_us", get(baseline_val, :median_us, 0)) / 1000 - end - unit = "ms" - else - current_median = value["median_us"] - # Try string key first, then symbol key - if haskey(baseline_val, "median_us") - baseline_median = baseline_val["median_us"] - elseif haskey(baseline_val, :median_us) - baseline_median = baseline_val[:median_us] - else - baseline_median = get( - baseline_val, "median_ms", get(baseline_val, :median_ms, 0)) * 1000 - end - unit = "μs" - end - - speedup = baseline_median / current_median - change_pct = (speedup - 1) * 100 - - push!(table_data, - ( - string(key), - baseline_median, - current_median, - unit, - speedup, - change_pct - )) - end - end - - if isempty(table_data) - 
println("No comparable benchmarks found.") - return - end - - # Create table with headers - headers = ["Benchmark", "Baseline", "Current", "Unit", "Speedup", "Change (%)"] - - # Extract data into columns - benchmark_names = [row[1] for row in table_data] - baseline_vals = [round(row[2], digits = 3) for row in table_data] - current_vals = [round(row[3], digits = 3) for row in table_data] - units = [row[4] for row in table_data] - speedups = [round(row[5], digits = 3) for row in table_data] - changes = [round(row[6], digits = 2) for row in table_data] - - # Create highlighters for improvements (green) and regressions (red) - # These highlight entire rows based on speedup value - hl_improvement = TextHighlighter( - (data, i, j) -> data[i, 5] > 1.05, # Speedup column 5, >5% improvement - crayon"green" - ) - - hl_regression = TextHighlighter( - (data, i, j) -> data[i, 5] < 0.95, # Speedup column 5, >5% regression - crayon"red" - ) - - # Print the table - pretty_table( - hcat(benchmark_names, baseline_vals, current_vals, units, speedups, changes); - column_labels = headers, - highlighters = [hl_improvement, hl_regression], - alignment = [:l, :r, :r, :c, :r, :r], - table_format = TextTableFormat(borders = text_table_borders__unicode_rounded) - ) - - # Print summary statistics - all_speedups = [row[5] for row in table_data] - n_improved = count(s -> s > 1.05, all_speedups) - n_regressed = count(s -> s < 0.95, all_speedups) - n_similar = length(all_speedups) - n_improved - n_regressed - geomean_speedup = exp(sum(log.(all_speedups)) / length(all_speedups)) - - println("\nSummary:") - println(" Geometric mean speedup: $(round(geomean_speedup, digits=3))x") - println(" Benchmarks improved: $n_improved") - println(" Benchmarks regressed: $n_regressed") - println(" Benchmarks similar: $n_similar") -end - -# Main execution -function main() - output_file = length(ARGS) >= 1 ? ARGS[1] : nothing - baseline_file = length(ARGS) >= 2 ? 
ARGS[2] : joinpath(BENCHMARK_DIR, "baseline.json") - - results = run_all_benchmarks() - saved_file = save_results(results, output_file) - - if isfile(baseline_file) - compare_with_baseline(results, baseline_file) - else - println("\nNo baseline found. To set this as baseline, run:") - println(" cp $saved_file $baseline_file") - end -end - -if abspath(PROGRAM_FILE) == @__FILE__ - # Filter logs from NetworkHistogram module - logger_filter = EarlyFilteredLogger(global_logger()) do args - return !(args._module === NetworkHistogram) - end - with_logger(logger_filter) do - main() - end -end diff --git a/benchmark/benchmark_results/baseline.json b/benchmark/benchmark_results/baseline.json deleted file mode 100644 index 7a6aade..0000000 --- a/benchmark/benchmark_results/baseline.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "julia_version": "1.12.0", - "timestamp": "2025-10-17 10:48:29", - "benchmarks": { - "bernoulli_swap_n500_k5": { - "max_ms": 2.400625, - "min_ms": 2.256625, - "mean_ms": 2.31981594, - "median_ms": 2.3136875, - "std_ms": 0.025263586533731647 - }, - "assignment_creation_n200": { - "max_us": 1476.625, - "min_us": 803.875, - "median_us": 1040.75, - "mean_us": 1023.70714, - "std_us": 102.75691494911177 - }, - "categorical_swap_n50_k2_m3": { - "max_ms": 0.0126875, - "min_ms": 0.007677, - "mean_ms": 0.007845412500000001, - "median_ms": 0.0077395, - "std_ms": 0.0005249944761313536 - }, - "edgelist_creation_n200": { - "max_us": 3618.958, - "min_us": 200.166, - "median_us": 258.3545, - "mean_us": 295.56669, - "std_us": 337.15490001630405 - }, - "loglikelihood_n200": { - "max_us": 0.017709, - "min_us": 0.00525, - "median_us": 0.005417, - "mean_us": 0.005481977999999998, - "std_us": 0.0005442266588216458 - }, - "bernoulli_optimize_n100_1k": { - "max_ms": 113.915708, - "min_ms": 105.567625, - "mean_ms": 109.5839403478261, - "median_ms": 109.3843125, - "std_ms": 1.665303524274982 - }, - "categorical_swap_n200_k3_m4": { - "max_ms": 0.0485, - "min_ms": 0.021917, - 
"mean_ms": 0.02255671, - "median_ms": 0.022125, - "std_ms": 0.0026733226036775534 - }, - "bernoulli_swap_n50_k2": { - "max_ms": 0.043833, - "min_ms": 0.031958, - "mean_ms": 0.032637479999999996, - "median_ms": 0.032416, - "std_ms": 0.0015867087750969608 - }, - "categorical_swap_n500_k5_m5": { - "max_ms": 0.080292, - "min_ms": 0.06, - "mean_ms": 0.06298337999999999, - "median_ms": 0.06225, - "std_ms": 0.003947748006400714 - }, - "categorical_optimize_n100_1k": { - "max_ms": 34.224917, - "min_ms": 16.19525, - "mean_ms": 17.52069984965035, - "median_ms": 16.7704585, - "std_ms": 1.8944621490608184 - }, - "bernoulli_swap_n200_k3": { - "max_ms": 0.596417, - "min_ms": 0.438375, - "mean_ms": 0.45312872, - "median_ms": 0.443208, - "std_ms": 0.025498219537002414 - } - } -} \ No newline at end of file diff --git a/benchmark/profile_optimization.jl b/benchmark/profile_optimization.jl deleted file mode 100644 index 24083de..0000000 --- a/benchmark/profile_optimization.jl +++ /dev/null @@ -1,295 +0,0 @@ -""" -Profiling helper for NetworkHistogram optimization. - -This script helps identify performance bottlenecks using Julia's Profile module. - -Usage: - julia --project=. 
benchmark/profile_optimization.jl [scenario] - -Scenarios: - swap - Profile single swap operations - optimize - Profile full optimization run - components - Profile individual components - -The script generates profiling data and can display it as: -- Text output (default) -- FlameGraph (requires ProfileView.jl or PProf.jl) -""" - -using Profile -using Random -using StatsBase -using StaticArrays -using NetworkHistogram - -# Helper functions to create test networks -function create_test_sbm_bernoulli(n_groups::Int, n_nodes::Int; seed = 42) - Random.seed!(seed) - d = NetworkHistogram.Bernoulli(0.5) - sbm = NetworkHistogram.BlockModel(n_groups, d) - - for g1 in 1:n_groups - for g2 in g1:n_groups - p = 0.1 + 0.7 * rand() - sbm[g1, g2] = NetworkHistogram.Bernoulli(p) - end - end - - base_size = n_nodes ÷ n_groups - remainder = n_nodes % n_groups - sizes = fill(base_size, n_groups) - sizes[1:remainder] .+= 1 - labels = StatsBase.inverse_rle(1:n_groups, sizes) - A = NetworkHistogram.sample(sbm, labels) - return A, labels, d -end - -function create_test_sbm_categorical( - n_groups::Int, n_nodes::Int, n_categories::Int; seed = 42) - Random.seed!(seed) - ps = SVector{n_categories}(fill(1 / n_categories, n_categories)) - d = NetworkHistogram.Cat(ps) - sbm = NetworkHistogram.BlockModel(n_groups, d) - - for g1 in 1:n_groups - for g2 in g1:n_groups - probs = rand(n_categories) - probs ./= sum(probs) - sbm[g1, g2] = NetworkHistogram.Cat(SVector{n_categories}(probs)) - end - end - - base_size = n_nodes ÷ n_groups - remainder = n_nodes % n_groups - sizes = fill(base_size, n_groups) - sizes[1:remainder] .+= 1 - labels = StatsBase.inverse_rle(1:n_groups, sizes) - A = NetworkHistogram.sample(sbm, labels) - return A, labels, d -end - -function profile_swap_operations(network_type = :bernoulli, n = 200, k = 3) - println("Setting up $(network_type) network (n=$n, k=$k)...") - - if network_type == :bernoulli - A, labels, d = create_test_sbm_bernoulli(k, n) - else - A, labels, d = 
create_test_sbm_categorical(k, n, 4) - end - - edgelist = NetworkHistogram.EdgeList(A) - assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) - swap = NetworkHistogram.make_swap(assignment, (1, n)) - - # Warm up - println("Warming up...") - for i in 1:100 - NetworkHistogram.apply_swap!(assignment, swap) - NetworkHistogram.revert_swap!(assignment, swap) - end - - # Profile - println("Profiling swap operations (5000 iterations)...") - Profile.clear() - @profile begin - for i in 1:5000 - NetworkHistogram.apply_swap!(assignment, swap) - NetworkHistogram.revert_swap!(assignment, swap) - end - end - - return true -end - -function profile_full_optimization( - network_type = :bernoulli, n = 200, k = 3, max_iter = 10_000) - println("Setting up $(network_type) network (n=$n, k=$k)...") - - if network_type == :bernoulli - A, labels, d = create_test_sbm_bernoulli(k, n) - else - A, labels, d = create_test_sbm_categorical(k, n, 4) - end - - initial_labels = rand(1:k, n) - params = NetworkHistogram.GreedyParams( - max_iter, - NetworkHistogram.RandomNodeSwap(), - NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(max_iter), - false - ) - - # Warm up - println("Warming up...") - test_params = NetworkHistogram.GreedyParams( - 100, - NetworkHistogram.RandomNodeSwap(), - NetworkHistogram.Strict(), - NetworkHistogram.PreviousBestValue(50), - false - ) - NetworkHistogram.nethist(A, d, copy(initial_labels), test_params) - - # Profile - println("Profiling full optimization ($max_iter iterations)...") - Profile.clear() - @profile NetworkHistogram.nethist(A, d, initial_labels, params) - - return true -end - -function profile_components(n = 200, k = 3) - println("Setting up network (n=$n, k=$k)...") - A, labels, d = create_test_sbm_bernoulli(k, n) - edgelist = NetworkHistogram.EdgeList(A) - - # Profile Assignment creation - println("\nProfiling Assignment creation...") - Profile.clear() - @profile begin - for i in 1:1000 - 
NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) - end - end - println("Results for Assignment creation:") - Profile.print(maxdepth = 15) - - # Profile EdgeList creation - println("\nProfiling EdgeList creation...") - Profile.clear() - @profile begin - for i in 1:1000 - NetworkHistogram.EdgeList(A) - end - end - println("Results for EdgeList creation:") - Profile.print(maxdepth = 15) - - # Profile log-likelihood computation - assignment = NetworkHistogram.Assignment(labels, edgelist, NetworkHistogram.Dist(d)) - println("\nProfiling log-likelihood computation...") - Profile.clear() - @profile begin - for i in 1:10000 - NetworkHistogram.loglikelihood(assignment) - end - end - println("Results for log-likelihood computation:") - Profile.print(maxdepth = 15) - - # Profile get_edges_in_groups - println("\nProfiling get_edges_in_groups...") - Profile.clear() - @profile begin - for i in 1:10000 - NetworkHistogram.get_edges_in_groups(assignment, 1, 2) - end - end - println("Results for get_edges_in_groups:") - Profile.print(maxdepth = 15) - - return false # Don't print again at the end -end - -function print_results() - println("\n" * "="^70) - println("Profile Results") - println("="^70) - println("\nTop functions by exclusive time:") - Profile.print(maxdepth = 15) - - println("\n" * "="^70) -end - -function print_help() - println(""" - NetworkHistogram Profiling Helper - ================================== - - Usage: julia --project=. 
dev/profile_optimization.jl [scenario] [options] - - Scenarios: - swap Profile swap operations (default) - swap-bernoulli Profile Bernoulli swap operations - swap-categorical Profile Categorical swap operations - optimize Profile full optimization run - components Profile individual components - help Show this message - - Options: - --n=N Number of nodes (default: 200) - --k=K Number of groups (default: 3) - --iter=N Number of iterations for optimization (default: 10000) - - Examples: - julia dev/profile_optimization.jl swap - julia dev/profile_optimization.jl swap-categorical --n=500 - julia dev/profile_optimization.jl optimize --iter=5000 - julia dev/profile_optimization.jl components - - The profiling results will show: - - Which functions consume the most time - - Call counts for each function - - Memory allocation patterns - - Call stack visualization (with flamegraph viewer) - - Tips: - - Focus on functions with high "exclusive" time - - Look for unexpected allocations - - Check for type instabilities - - Use flamegraph for visual exploration - """) -end - -function parse_args(args) - options = Dict( - :n => 200, - :k => 3, - :iter => 10_000 - ) - - for arg in args - if startswith(arg, "--n=") - options[:n] = parse(Int, split(arg, "=")[2]) - elseif startswith(arg, "--k=") - options[:k] = parse(Int, split(arg, "=")[2]) - elseif startswith(arg, "--iter=") - options[:iter] = parse(Int, split(arg, "=")[2]) - end - end - - return options -end - -function main() - if length(ARGS) == 0 || ARGS[1] in ["help", "-h", "--help"] - print_help() - return - end - - scenario = ARGS[1] - options = parse_args(ARGS[2:end]) - - should_print = if scenario == "swap" || scenario == "swap-bernoulli" - profile_swap_operations(:bernoulli, options[:n], options[:k]) - elseif scenario == "swap-categorical" - profile_swap_operations(:categorical, options[:n], options[:k]) - elseif scenario == "optimize" - profile_full_optimization(:bernoulli, options[:n], options[:k], options[:iter]) - 
elseif scenario == "components" - profile_components(options[:n], options[:k]) - else - println("Error: Unknown scenario '$scenario'") - print_help() - return - end - - if should_print - print_results() - end -end - -if abspath(PROGRAM_FILE) == @__FILE__ - main() -end diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl deleted file mode 100644 index 3cf5956..0000000 --- a/benchmark/run_benchmarks.jl +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env julia - -""" -Quick-start script for running NetworkHistogram benchmarks. - -Usage: - ./run_benchmarks.jl [command] [options] - -Commands: - baseline - Establish a baseline benchmark - current - Run current benchmarks (compares with baseline if available) - compare - Compare two benchmark files - clean - Remove all benchmark results - help - Show this help message - -Examples: - ./run_benchmarks.jl baseline - ./run_benchmarks.jl current - ./run_benchmarks.jl compare results1.json results2.json -""" - -using Pkg -using Dates - -# Ensure we're in the right directory -cd(dirname(@__DIR__)) - -function print_help() - println(""" - NetworkHistogram Benchmark Runner - ================================== - - Usage: julia run_benchmarks.jl [command] [options] - - Commands: - baseline Create a baseline benchmark - current Run current benchmarks - compare FILE1 FILE2 Compare two benchmark files - clean Remove all benchmark results - help Show this message - - Examples: - julia run_benchmarks.jl baseline - julia run_benchmarks.jl current - julia run_benchmarks.jl compare results/v1.json results/v2.json - """) -end - -function ensure_dependencies(tries = 0) - tries == 0 && @info "Checking dependencies..." 
- # Check if benchmark dependencies are installed - try - @eval using StaticArrays - @eval using BenchmarkTools - @eval using JSON3 - @eval using PrettyTables - @eval using LoggingExtras - - @info "Dependencies OK ✓" - catch - if tries >= 2 - error("Failed to install dependencies after multiple attempts.") - elseif tries == 1 - @info "Trying to instantiate project..." - Pkg.instantiate() - ensure_dependencies(tries + 1) - else - @info "Activating benchmark project" - Pkg.activate("benchmark") - ensure_dependencies(tries + 1) - end - end -end - -function run_baseline() - baseline_file = joinpath("benchmark", "benchmark_results", "baseline.json") - - if isfile(baseline_file) - printstyled( - "Baseline already exists. Overwrite? (y/N): ", color = :light_yellow, blink = true) - response = readline() - if lowercase(strip(response)) != "y" - @info "Aborted." - return - end - end - - @info "Running baseline benchmarks... \nThis may take several minutes...\n" - - run(`julia --project=benchmark benchmark/benchmark_optimization.jl $baseline_file`) - - @info "\n✓ Baseline established at: $baseline_file" * - "\n Next steps: " * - "\n 1. Make your performance improvements" * - "\n 2. Run: julia run_benchmarks.jl current" * - "\n 3. Review the performance comparison" -end - -function run_current() - baseline_file = joinpath("benchmark", "benchmark_results", "baseline.json") - - if !isfile(baseline_file) - @warn "⚠ Warning: No baseline found! \n Consider running: julia run_benchmarks.jl baseline" - @info "\nContinuing anyway...\n" - end - - timestamp = Dates.format(Dates.now(), "yyyy-mm-ddTHH-MM-SS") - current_file = joinpath("benchmark", "benchmark_results", "current_$timestamp.json") - - @info "Running current benchmarks... 
\n This may take several minutes...\n" - - if isfile(baseline_file) - run(`julia --project=benchmark benchmark/benchmark_optimization.jl $current_file $baseline_file`) - else - run(`julia --project=benchmark benchmark/benchmark_optimization.jl $current_file`) - end - - @info "✓ Results saved to: $current_file" -end - -function compare_benchmarks(file1, file2) - if !isfile(file1) - @error "Error: File not found: $file1" - return - end - - if !isfile(file2) - @error "Error: File not found: $file2" - return - end - - @info "Comparing benchmarks... \n Baseline: $file2 \n Current: $file1\n" - - # Re-run comparison - run(`julia --project=benchmark benchmark/benchmark_optimization.jl $file1 $file2`) -end - -function clean_results() - results_dir = joinpath("benchmark", "benchmark_results") - - if !isdir(results_dir) - @info "No results directory found." - return - end - - files = filter(f -> endswith(f, ".json") && f != "baseline.json", readdir(results_dir)) - - if isempty(files) - @info "No benchmark results to clean." - return - end - - @info "Found $(length(files)) benchmark result file(s):" - for f in files - @info " - $f" - end - - printstyled("\nDelete these files? (y/N): ", color = :light_yellow, blink = true) - response = readline() - - if lowercase(strip(response)) == "y" - for f in files - rm(joinpath(results_dir, f)) - end - @info "✓ Cleaned $(length(files)) file(s)" - else - @info "Aborted." 
- end -end - -# Main execution -function main() - if length(ARGS) == 0 || ARGS[1] == "help" || ARGS[1] == "-h" || ARGS[1] == "--help" - print_help() - return - end - - ensure_dependencies() - - command = ARGS[1] - - if command == "baseline" - run_baseline() - elseif command == "current" - run_current() - elseif command == "compare" - if length(ARGS) < 3 - @error "Error: compare requires two file arguments \n Usage: julia run_benchmarks.jl compare FILE1 FILE2" - return - end - compare_benchmarks(ARGS[2], ARGS[3]) - elseif command == "clean" - clean_results() - elseif command == "test" - run_tests() - else - @info "Error: Unknown command '$command'" - print_help() - end -end - -if abspath(PROGRAM_FILE) == @__FILE__ - main() -end diff --git a/benchmark/visualize_benchmarks.jl b/benchmark/visualize_benchmarks.jl deleted file mode 100644 index 4a1a3a0..0000000 --- a/benchmark/visualize_benchmarks.jl +++ /dev/null @@ -1,263 +0,0 @@ -""" -Visualize benchmark results over time. - -This script reads multiple benchmark result files and creates a simple -comparison table or plot showing performance trends. - -Usage: - julia --project=. benchmark/visualize_benchmarks.jl [files...] - julia --project=. 
benchmark/visualize_benchmarks.jl --all # Use all files in benchmark_results/ - -Example: - julia benchmark/visualize_benchmarks.jl \\ - benchmark/benchmark_results/baseline.json \\ - benchmark/benchmark_results/current_2024-10-15.json -""" - -using JSON3 -using Dates -using Printf -using Statistics -using PrettyTables - -function load_benchmark(filepath) - if !isfile(filepath) - @warn "File not found: $filepath" - return nothing - end - - data = JSON3.read(read(filepath, String)) - return data -end - -function extract_key_metrics(benchmark_data) - metrics = Dict{String, Float64}() - - for (name, values) in benchmark_data["benchmarks"] - name_str = string(name) # Convert Symbol to String - if haskey(values, "median_ms") - metrics[name_str] = values["median_ms"] - elseif haskey(values, "median_us") - metrics[name_str] = values["median_us"] / 1000.0 # Convert to ms - end - end - - return metrics -end - -function compare_multiple(files) - if isempty(files) - println("No files provided") - return - end - - # Load all benchmarks - benchmarks = [] - for file in files - data = load_benchmark(file) - if !isnothing(data) - push!(benchmarks, - ( - file = basename(file), - timestamp = data["timestamp"], - metrics = extract_key_metrics(data) - )) - end - end - - if isempty(benchmarks) - println("No valid benchmark files found") - return - end - - # Sort by timestamp - sort!(benchmarks, by = b -> b.timestamp) - - # Get all metric names - all_metrics = Set{String}() - for b in benchmarks - union!(all_metrics, keys(b.metrics)) - end - all_metrics = sort(collect(all_metrics)) - - # Print header - println("\n" * "="^100) - println("Benchmark Comparison Across Versions") - println("="^100 * "\n") - - # Prepare data for table - table_data = [] - baseline_vals = Dict{String, Float64}() - - for metric in all_metrics - # Skip if metric has no values - values = [haskey(b.metrics, metric) ? 
b.metrics[metric] : NaN for b in benchmarks] - if all(isnan, values) - continue - end - - row = Any[metric] - baseline_val = values[1] - baseline_vals[metric] = baseline_val - - for (i, val) in enumerate(values) - if isnan(val) - push!(row, "N/A") - else - push!(row, round(val, digits = 2)) - end - end - - push!(table_data, row) - end - - if isempty(table_data) - println("No metrics to display") - return - end - - # Create headers - headers = ["Benchmark"] - for b in benchmarks - short_name = length(b.file) > 16 ? b.file[1:13] * "..." : b.file - push!(headers, short_name) - end - - # Create subheaders with timestamps - subheaders = [""] - for b in benchmarks - short_ts = length(b.timestamp) > 16 ? b.timestamp[1:16] : b.timestamp - push!(subheaders, short_ts) - end - - # Create highlighters for improvements and regressions - # We'll color entire rows based on whether the value improved or regressed vs baseline - hl_improvement = TextHighlighter( - (data, i, j) -> begin - # Check if current value (in any column after baseline) shows improvement - baseline_idx = 2 # First value column - baseline_val = data[i, baseline_idx] - - if j > 2 && baseline_val isa Number && baseline_val > 0 - current_val = data[i, j] - if current_val isa Number - speedup = baseline_val / current_val - return speedup > 1.05 # >5% improvement - end - end - return false - end, - crayon"green" - ) - - hl_regression = TextHighlighter( - (data, i, j) -> begin - # Check if current value (in any column after baseline) shows regression - baseline_idx = 2 # First value column - baseline_val = data[i, baseline_idx] - - if j > 2 && baseline_val isa Number && baseline_val > 0 - current_val = data[i, j] - if current_val isa Number - speedup = baseline_val / current_val - return speedup < 0.95 # >5% regression - end - end - return false - end, - crayon"red" - ) # Convert table_data to matrix - data_matrix = permutedims(hcat([vcat(row...) 
for row in table_data]...)) - - # Print table - pretty_table( - data_matrix; - column_labels = headers, - highlighters = [hl_improvement, hl_regression], - alignment = vcat(:l, fill(:r, length(benchmarks))), - table_format = TextTableFormat(borders = text_table_borders__unicode_rounded) - ) - - println("\nLegend: Green = >5% faster, Red = >5% slower (compared to first column)") - println("All values in milliseconds (ms)") - println() - - # Calculate aggregate statistics - if length(benchmarks) >= 2 - println("Overall Summary:") - println("-" * "="^50) - - baseline = benchmarks[1] - for i in 2:length(benchmarks) - current = benchmarks[i] - - speedups = Float64[] - for metric in all_metrics - if haskey(baseline.metrics, metric) && haskey(current.metrics, metric) - base_val = baseline.metrics[metric] - curr_val = current.metrics[metric] - if base_val > 0 && curr_val > 0 - push!(speedups, base_val / curr_val) - end - end - end - - if !isempty(speedups) - median_speedup = median(speedups) - geomean_speedup = exp(mean(log.(speedups))) - faster_count = count(s -> s > 1.05, speedups) - slower_count = count(s -> s < 0.95, speedups) - similar_count = length(speedups) - faster_count - slower_count - - println("\n$(current.file) vs $(baseline.file):") - println(" Geometric mean speedup: $(round(geomean_speedup, digits=2))x") - println(" Median speedup: $(round(median_speedup, digits=2))x") - println(" Benchmarks: $faster_count faster, $slower_count slower, $similar_count similar") - end - end - end -end - -function main() - if length(ARGS) == 0 || ARGS[1] in ["-h", "--help", "help"] - println(""" - Visualize NetworkHistogram Benchmark Results - ============================================= - - Usage: julia dev/visualize_benchmarks.jl [options] [files...] 
- - Options: - --all Compare all files in benchmark_results/ - -h, --help Show this help - - Examples: - # Compare specific files - julia dev/visualize_benchmarks.jl \\ - benchmark_results/baseline.json \\ - benchmark_results/current.json - - # Compare all available benchmarks - julia dev/visualize_benchmarks.jl --all - """) - return - end - - files = if ARGS[1] == "--all" - results_dir = joinpath("dev", "benchmark_results") - if !isdir(results_dir) - println("Error: benchmark_results directory not found") - return - end - - all_files = filter(f -> endswith(f, ".json"), readdir(results_dir)) - sort!([joinpath(results_dir, f) for f in all_files]) - else - ARGS - end - - compare_multiple(files) -end - -if abspath(PROGRAM_FILE) == @__FILE__ - main() -end From b8149cd1f6d88c33d8817cf64cbb87f70a7e604e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 15:36:23 +0100 Subject: [PATCH 230/266] add tests for score --- Project.toml | 2 + ext/LightMCExt.jl | 2 +- src/NetworkHistogram.jl | 2 +- src/pseudo_suff_stats/abstract_suffstat.jl | 2 + src/pseudo_suff_stats/bernoulli.jl | 2 +- test/runtests.jl | 2 + test/test_hist_dist.jl | 7 ++++ test/test_pseudo_suff_stats.jl | 43 ++++++++++++++++++++++ 8 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 test/test_hist_dist.jl create mode 100644 test/test_pseudo_suff_stats.jl diff --git a/Project.toml b/Project.toml index ee404b6..e4d2d3b 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -35,6 +36,7 @@ BenchmarkTools = "1.6.3" Clustering = "0.15.8" IntervalSets = "0.7.11" 
LinearAlgebra = "1.12.0" +LogExpFunctions = "0.3.29" Reexport = "1.2.2" [extras] diff --git a/ext/LightMCExt.jl b/ext/LightMCExt.jl index 34a80a0..4040d21 100644 --- a/ext/LightMCExt.jl +++ b/ext/LightMCExt.jl @@ -11,7 +11,7 @@ using LightMC: DiscreteMarkovChain, SampleChain, transition_matrix, ConvertBinar # need to define a convertor that only look at the possible transitions and not all of them struct McConvertor <: AbstractConvertor end -get_convertor(::Val{:mc}; kwargs...) = McConvertor() +get_convertor(::Val{:mc}; kwargs...) = McConvertor(kwargs...) function (c::McConvertor)(chain::SampleChain) return SVector([SVector(c...) for c in eachcol(chain.transitions)]...) diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index ba477ff..7979006 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -10,7 +10,7 @@ using LinearAlgebra using ArgCheck import Random: randperm, AbstractRNG, rand, shuffle import Distributions: logpdf, pdf - +import LogExpFunctions: xlogx using IntervalSets using Reexport diff --git a/src/pseudo_suff_stats/abstract_suffstat.jl b/src/pseudo_suff_stats/abstract_suffstat.jl index 099e409..bf1ad19 100644 --- a/src/pseudo_suff_stats/abstract_suffstat.jl +++ b/src/pseudo_suff_stats/abstract_suffstat.jl @@ -3,6 +3,8 @@ abstract type SuffStats end function add_sample end function remove_sample end function make_k_block end + +# score will be minimized function score end function to_params end diff --git a/src/pseudo_suff_stats/bernoulli.jl b/src/pseudo_suff_stats/bernoulli.jl index eceba35..6c2a8a3 100644 --- a/src/pseudo_suff_stats/bernoulli.jl +++ b/src/pseudo_suff_stats/bernoulli.jl @@ -39,7 +39,7 @@ end function score(ss::BernoulliSuffStats) n = max(ss.n, 1) p = ss.h / n - return n * (xlogx(1 - p) + xlogx(p)) + return -n * (xlogx(1 - p) + xlogx(p)) end function to_params(ss::BernoulliSuffStats) diff --git a/test/runtests.jl b/test/runtests.jl index b2c0263..bdbb9d6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl 
@@ -4,4 +4,6 @@ using NetworkHistogram @testset "Tests" begin include("test_symarray.jl") + include("test_pseudo_suff_stats.jl") + include("test_hist_dist.jl") end diff --git a/test/test_hist_dist.jl b/test/test_hist_dist.jl new file mode 100644 index 0000000..8860223 --- /dev/null +++ b/test/test_hist_dist.jl @@ -0,0 +1,7 @@ +using Test +using NetworkHistogram +using StaticArrays +using Distributions +import NetworkHistogram as NH + +@testset "Histogram-based Distribution" begin end diff --git a/test/test_pseudo_suff_stats.jl b/test/test_pseudo_suff_stats.jl new file mode 100644 index 0000000..1e24bd4 --- /dev/null +++ b/test/test_pseudo_suff_stats.jl @@ -0,0 +1,43 @@ +using Test +using NetworkHistogram +using StaticArrays +using Distributions +import NetworkHistogram as NH + +function _one_hot_vector(sample::Int, num_categories::Int) + v = zeros(Int, num_categories) + v[sample] = 1 + return v +end + +@testset "Bernoulli" begin + @testset "score" begin + ss = NH.BernoulliSuffStats() + samples = [true, false, true, true, false, true, false, false, true, true] + for s in samples + ss = NH.add_sample(ss, s) + end + d = fit_mle(Bernoulli, samples) + @test NH.score(ss) ≈ -sum(map(Base.Fix1(logpdf, d), samples)) + @test NH.to_params(ss) == d.p + end +end + +@testset "Categorical" begin + @testset "score" begin + ss = NH.CategoricalSuffStats(3) + samples = [1, 2, 1, 2, 3, 1, 2, 3, 1, 2] + s_vec = _one_hot_vector.(samples, 3) + for s in samples + ss = NH.add_sample(ss, s) + end + d = fit_mle(Categorical, samples) + p = probs(d) + loss = 0.0 + for s in s_vec + loss += sum(abs2, s - p) + end + @test NH.score(ss) ≈ loss + @test NH.to_params(ss) == p + end +end From 90c83e5e3d07d4e2078ade171a25e473116f37d3 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 15:38:02 +0100 Subject: [PATCH 231/266] rename score to loss --- docs/examples/custom_suffstats.jl | 4 ++-- ext/LightMCExt.jl | 8 ++++---- src/GreedySuffStats.jl | 12 ++++++------ src/config_rules/stop_rule.jl | 
4 ++-- src/pseudo_suff_stats/abstract_suffstat.jl | 4 ++-- src/pseudo_suff_stats/bernoulli.jl | 2 +- src/pseudo_suff_stats/categorical.jl | 2 +- src/pseudo_suff_stats/generic.jl | 2 +- test/test_pseudo_suff_stats.jl | 12 ++++++------ 9 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/examples/custom_suffstats.jl b/docs/examples/custom_suffstats.jl index 00432e4..e65c3e4 100644 --- a/docs/examples/custom_suffstats.jl +++ b/docs/examples/custom_suffstats.jl @@ -1,4 +1,4 @@ -import NetworkHistogram: SuffStats, add_sample, remove_sample, make_k_block, score, +import NetworkHistogram: SuffStats, add_sample, remove_sample, make_k_block, loss, to_params, CategoricalConvertor, num_bins, to_distribution using StaticArrays using Accessors @@ -28,7 +28,7 @@ function make_k_block(k, ::Val{:custom}; num_categories, kwargs...) return k_block end -@inline function score(ss::MyCustomSuffStats) +@inline function loss(ss::MyCustomSuffStats) n = sum(ss.h) return n - sum(abs2, ss.h) / max(n, 1) end diff --git a/ext/LightMCExt.jl b/ext/LightMCExt.jl index 4040d21..4942c5d 100644 --- a/ext/LightMCExt.jl +++ b/ext/LightMCExt.jl @@ -3,7 +3,7 @@ module LightMCExt using StaticArrays using Accessors using NetworkHistogram -import NetworkHistogram: SuffStats, add_sample, remove_sample, make_k_block, score, +import NetworkHistogram: SuffStats, add_sample, remove_sample, make_k_block, loss, to_params, AbstractConvertor, to_distribution, get_convertor using LightMC: DiscreteMarkovChain, SampleChain, transition_matrix, ConvertBinaryMC @@ -45,14 +45,14 @@ function remove_sample(ss::McSuffStats, sample) return ss end -function _score(counts::SVector) +function _loss(counts::SVector) n = sum(counts) norm_ = max(n, 1) return (n - sum(abs2, counts) / norm_) / norm_ end -function score(ss::McSuffStats) - return sum(_score, ss.h) +function loss(ss::McSuffStats) + return sum(_loss, ss.h) end function to_params(ss::McSuffStats) diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl 
index 2bb8b8d..33f49dc 100644 --- a/src/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -30,19 +30,19 @@ function init!(es::GreedySuffStats, data, node_labels) end # TODO: allow for non-symmetric data -@inline function score(matrix_ss::SymArray{<:SuffStats}; norm = 1.0) +@inline function loss(matrix_ss::SymArray{<:SuffStats}; norm = 1.0) total_loss = 0.0 for m in matrix_ss.uppertrian.nzval - total_loss += score(m) + total_loss += loss(m) end return total_loss / norm end -@inline function score(matrix_ss::AbstractMatrix{<:SuffStats}; norm = 1.0) +@inline function loss(matrix_ss::AbstractMatrix{<:SuffStats}; norm = 1.0) total_loss = 0.0 @inbounds for j in axes(matrix_ss, 2) for i in 1:j - inter = score(matrix_ss[i, j]) + inter = loss(matrix_ss[i, j]) total_loss += inter end end @@ -92,7 +92,7 @@ function estimate!( progress_update_interval = max(1, es.max_iter ÷ iter_progress) # Initial log-likelihood - current_loss = score(es.block_ss, norm = n_edges) + current_loss = loss(es.block_ss, norm = n_edges) reset!(es.stop_rule, current_loss) # Main optimization loop for iter in 1:(es.max_iter) @@ -123,7 +123,7 @@ function estimate!( # tentative swap @inbounds node_labels[index1], node_labels[index2] = group2, group1 - new_loss = score(es.block_ss_swap, norm = n_edges) + new_loss = loss(es.block_ss_swap, norm = n_edges) if compare_to_best(new_loss, current_loss, es.stop_rule) # apply swap diff --git a/src/config_rules/stop_rule.jl b/src/config_rules/stop_rule.jl index b1c501c..7d7fa13 100644 --- a/src/config_rules/stop_rule.jl +++ b/src/config_rules/stop_rule.jl @@ -18,8 +18,8 @@ end const PreviousMaxValue{T} = PreviousBestValue{T, Val(:max)} const PreviousMinValue{T} = PreviousBestValue{T, Val(:min)} -function reset!(stop_rule::PreviousBestValue{T}, score_value::T) where {T} - stop_rule.previous_best_value = score_value +function reset!(stop_rule::PreviousBestValue{T}, loss_value::T) where {T} + stop_rule.previous_best_value = loss_value stop_rule.iterations_since_best 
= 0 end diff --git a/src/pseudo_suff_stats/abstract_suffstat.jl b/src/pseudo_suff_stats/abstract_suffstat.jl index bf1ad19..883ce97 100644 --- a/src/pseudo_suff_stats/abstract_suffstat.jl +++ b/src/pseudo_suff_stats/abstract_suffstat.jl @@ -4,8 +4,8 @@ function add_sample end function remove_sample end function make_k_block end -# score will be minimized -function score end +# loss will be minimized +function loss end function to_params end # some suffstat may need the edge index (i,j) to update properly diff --git a/src/pseudo_suff_stats/bernoulli.jl b/src/pseudo_suff_stats/bernoulli.jl index 6c2a8a3..c0c0d6e 100644 --- a/src/pseudo_suff_stats/bernoulli.jl +++ b/src/pseudo_suff_stats/bernoulli.jl @@ -36,7 +36,7 @@ function make_k_block(k, ::Val{:binary}; kwargs...) return k_block end -function score(ss::BernoulliSuffStats) +function loss(ss::BernoulliSuffStats) n = max(ss.n, 1) p = ss.h / n return -n * (xlogx(1 - p) + xlogx(p)) diff --git a/src/pseudo_suff_stats/categorical.jl b/src/pseudo_suff_stats/categorical.jl index 3ae65d4..4931e3b 100644 --- a/src/pseudo_suff_stats/categorical.jl +++ b/src/pseudo_suff_stats/categorical.jl @@ -23,7 +23,7 @@ function make_k_block(k, ::Val{:categorical}; num_categories, kwargs...) 
return k_block end -function score(ss::CategoricalSuffStats) +function loss(ss::CategoricalSuffStats) n = sum(ss.h) return n - sum(abs2, ss.h) / max(n, 1) end diff --git a/src/pseudo_suff_stats/generic.jl b/src/pseudo_suff_stats/generic.jl index e52947b..2a4795a 100644 --- a/src/pseudo_suff_stats/generic.jl +++ b/src/pseudo_suff_stats/generic.jl @@ -53,7 +53,7 @@ end # return ss # end -function score(ss::GenericSuffStats{T, D}) where {T, D} +function loss(ss::GenericSuffStats{T, D}) where {T, D} samples = get_samples(ss) d = fit(D, samples) return -sum(logpdf.(d, samples)) diff --git a/test/test_pseudo_suff_stats.jl b/test/test_pseudo_suff_stats.jl index 1e24bd4..73eefe1 100644 --- a/test/test_pseudo_suff_stats.jl +++ b/test/test_pseudo_suff_stats.jl @@ -11,20 +11,20 @@ function _one_hot_vector(sample::Int, num_categories::Int) end @testset "Bernoulli" begin - @testset "score" begin + @testset "loss" begin ss = NH.BernoulliSuffStats() samples = [true, false, true, true, false, true, false, false, true, true] for s in samples ss = NH.add_sample(ss, s) end d = fit_mle(Bernoulli, samples) - @test NH.score(ss) ≈ -sum(map(Base.Fix1(logpdf, d), samples)) + @test NH.loss(ss) ≈ -sum(map(Base.Fix1(logpdf, d), samples)) @test NH.to_params(ss) == d.p end end @testset "Categorical" begin - @testset "score" begin + @testset "loss" begin ss = NH.CategoricalSuffStats(3) samples = [1, 2, 1, 2, 3, 1, 2, 3, 1, 2] s_vec = _one_hot_vector.(samples, 3) @@ -33,11 +33,11 @@ end end d = fit_mle(Categorical, samples) p = probs(d) - loss = 0.0 + loss_val = 0.0 for s in s_vec - loss += sum(abs2, s - p) + loss_val += sum(abs2, s - p) end - @test NH.score(ss) ≈ loss + @test NH.loss(ss) ≈ loss_val @test NH.to_params(ss) == p end end From 180affac73fc2f840fb8e97de3a196adbf5ae616 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 15:39:49 +0100 Subject: [PATCH 232/266] add sanity check for loss cat --- test/test_pseudo_suff_stats.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff 
--git a/test/test_pseudo_suff_stats.jl b/test/test_pseudo_suff_stats.jl index 73eefe1..a22509c 100644 --- a/test/test_pseudo_suff_stats.jl +++ b/test/test_pseudo_suff_stats.jl @@ -39,5 +39,12 @@ end end @test NH.loss(ss) ≈ loss_val @test NH.to_params(ss) == p + + samples = ones(Int, 10) + ss_unique = NH.CategoricalSuffStats(3) + for s in samples + ss_unique = NH.add_sample(ss_unique, s) + end + @assert NH.loss(ss_unique) == 0.0 end end From 6f4a9a1f7b8d5cf996499cccba0c20a71dbe19ce Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Tue, 28 Oct 2025 15:44:56 +0100 Subject: [PATCH 233/266] correct mapreduce --- src/pseudo_suff_stats/generic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pseudo_suff_stats/generic.jl b/src/pseudo_suff_stats/generic.jl index 2a4795a..ec3b673 100644 --- a/src/pseudo_suff_stats/generic.jl +++ b/src/pseudo_suff_stats/generic.jl @@ -56,7 +56,7 @@ end function loss(ss::GenericSuffStats{T, D}) where {T, D} samples = get_samples(ss) d = fit(D, samples) - return -sum(logpdf.(d, samples)) + return -mapreduce(BaseFix1(logpdf, d), +, samples) end function to_params(ss::GenericSuffStats) From c45ba5142901b80674aa750bab92a4917cfb7634 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 29 Oct 2025 08:42:03 +0100 Subject: [PATCH 234/266] add makie ext for SymArray --- Project.toml | 2 ++ ext/MakieExt.jl | 8 ++++++++ 2 files changed, 10 insertions(+) create mode 100644 ext/MakieExt.jl diff --git a/Project.toml b/Project.toml index e4d2d3b..05202d7 100644 --- a/Project.toml +++ b/Project.toml @@ -24,10 +24,12 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [weakdeps] Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" LightMC = "b58f5c6e-c887-41d6-b553-02118416cd5d" +Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" [extensions] BootstrapExt = "Bootstrap" LightMCExt = "LightMC" +MakieExt = "Makie" [compat] Accessors = "0.1.42" diff --git a/ext/MakieExt.jl b/ext/MakieExt.jl new file mode 100644 index 0000000..3a7cc28 --- /dev/null 
+++ b/ext/MakieExt.jl @@ -0,0 +1,8 @@ +module MakieExt + +using NetworkHistogram +using Makie + +Makie.convert_single_argument(A::SymArray) = Matrix(A) + +end From 247f5119b84a7041b9ed5f96030b4f365e0e0851 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 29 Oct 2025 08:42:42 +0100 Subject: [PATCH 235/266] add BinaryConvertor and oracle_estimator --- src/api.jl | 40 ++++++++++++++++++++++++--- src/preprocessor/abstractConvertor.jl | 5 ++++ src/preprocessor/binary.jl | 10 +++++++ 3 files changed, 51 insertions(+), 4 deletions(-) create mode 100644 src/preprocessor/binary.jl diff --git a/src/api.jl b/src/api.jl index b945c14..afe657f 100644 --- a/src/api.jl +++ b/src/api.jl @@ -31,7 +31,7 @@ function _nethist( end data = convertor.(A) @info "Using $(num_bins(convertor)) discrete categories for edge values" - es = NetworkHistogram.GreedySuffStats( + es = GreedySuffStats( data, labels_start, num_categories = num_bins(convertor), type_suff_stats = type_suff_stats, max_iter = params.max_iter, @@ -40,9 +40,41 @@ function _nethist( progress = params.display_progress; kwargs... 
) - node_labels, parameters = NetworkHistogram.estimate!( + node_labels, parameters = estimate!( es, data, labels_start; iter_progress = params.progress_freq) - model = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), + + return convert_to_result(node_labels, convertor, parameters) +end + +function oracle_estimator( + data, oracle_labels, convertor; type_suff_stats = Val(:categorical)) + k = length(unique(oracle_labels)) + # allocate sufficient statistics blocks + block_ss = make_k_block(k, type_suff_stats; data = data) + block_ss_swap = make_k_block(k, type_suff_stats; data = data) + es_dummy = GreedySuffStats(block_ss, block_ss_swap, RandomGroupSwap(), + PreviousBestValue(1_000, Inf, :min), 1) + init!(es_dummy, data, oracle_labels) + parameters = to_params.(es_dummy.block_ss) + return convert_to_result(oracle_labels, convertor, parameters) +end + +function convert_to_result(node_labels, convertor, parameters) + model = DecoratedSBM(to_distribution.(convertor, parameters), counts(node_labels) ./ length(node_labels)) - return NetworkHistogram.NethistResult(node_labels, model) + return NethistResult(node_labels, model) end + +function convert_to_result( + node_labels, convertor::BinaryConvertor, parameters::AbstractMatrix{<:Real}) + model = SBM(to_distribution.(convertor, parameters), + counts(node_labels) ./ length(node_labels)) + return NethistResult(node_labels, model) +end + +# function convert_to_result( +# node_labels, convertor::BinaryConvertor, parameters::SymArray{<:Real}) +# model = SBM(Matrix(to_distribution.(convertor, parameters)), +# counts(node_labels) ./ length(node_labels)) +# return NethistResult(node_labels, model) +# end diff --git a/src/preprocessor/abstractConvertor.jl b/src/preprocessor/abstractConvertor.jl index fed5fda..f035903 100644 --- a/src/preprocessor/abstractConvertor.jl +++ b/src/preprocessor/abstractConvertor.jl @@ -17,6 +17,7 @@ get_convertor(s::String, ; kwargs...) = get_convertor(Symbol(s); kwargs...) 
get_convertor(s::Symbol; kwargs...) = get_convertor(Val(s); kwargs...) get_convertor(::T; kwargs...) where {T} = @error "No convertor found for type $T" +include("binary.jl") include("categorical.jl") include("continuous.jl") @@ -27,3 +28,7 @@ end function get_convertor(::Val{:continuous}; kwargs...) return UnitIntervalConvertor(kwargs[:num_bins]) end + +function get_convertor(::Val{:binary}; kwargs...) + return BinaryConvertor() +end diff --git a/src/preprocessor/binary.jl b/src/preprocessor/binary.jl new file mode 100644 index 0000000..b9d80a1 --- /dev/null +++ b/src/preprocessor/binary.jl @@ -0,0 +1,10 @@ +struct BinaryConvertor <: AbstractConvertor end + +function (c::BinaryConvertor)(obs::T) where {T <: Union{Real, Bool}} + return obs == 1 ? true : false +end + +function to_distribution( + c::BinaryConvertor, p::T; kwargs...) where {T <: Real} + return p +end From c9575036d4e71b21e25af382d9ea7c6eda0f5de7 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 29 Oct 2025 08:44:10 +0100 Subject: [PATCH 236/266] remove overspecialisation --- src/api.jl | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/api.jl b/src/api.jl index afe657f..e509e38 100644 --- a/src/api.jl +++ b/src/api.jl @@ -65,16 +65,8 @@ function convert_to_result(node_labels, convertor, parameters) return NethistResult(node_labels, model) end -function convert_to_result( - node_labels, convertor::BinaryConvertor, parameters::AbstractMatrix{<:Real}) +function convert_to_result(node_labels, convertor::BinaryConvertor, parameters) model = SBM(to_distribution.(convertor, parameters), counts(node_labels) ./ length(node_labels)) return NethistResult(node_labels, model) end - -# function convert_to_result( -# node_labels, convertor::BinaryConvertor, parameters::SymArray{<:Real}) -# model = SBM(Matrix(to_distribution.(convertor, parameters)), -# counts(node_labels) ./ length(node_labels)) -# return NethistResult(node_labels, model) -# end From 
6d718b0f78858fb8256cd6aa6a5f54c1d4ca3d2e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 29 Oct 2025 09:02:00 +0100 Subject: [PATCH 237/266] more API conveniance --- src/GreedySuffStats.jl | 4 +++- src/api.jl | 32 +++++++++++++++++++++++++------- src/utils/utils_node_labels.jl | 5 +++++ 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl index 33f49dc..6f3a2a4 100644 --- a/src/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -17,7 +17,7 @@ end function init!(es::GreedySuffStats, data, node_labels) # Initialize the sufficient statistics for each block - @inbounds for j in axes(data, 2) + for j in axes(data, 2) gj = node_labels[j] for i in 1:(j - 1) # More efficient than i < j check inside loop edge_value = data[i, j] @@ -29,6 +29,8 @@ function init!(es::GreedySuffStats, data, node_labels) end end +loss(es::GreedySuffStats; norm = 1.0) = loss(es.block_ss; norm = norm) + # TODO: allow for non-symmetric data @inline function loss(matrix_ss::SymArray{<:SuffStats}; norm = 1.0) total_loss = 0.0 diff --git a/src/api.jl b/src/api.jl index e509e38..886f6db 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,24 +1,42 @@ function nethist_categorical( A, k, - labels_start = shuffle(ordered_start_labels(size(A, 1), k)); + labels_start = ordered_start_labels(size(A, 1), k); params::GreedyParams = GreedyParams()) + convertor = CategoricalConvertor(A) + @info "Using $(num_bins(convertor)) discrete categories for edge values" _nethist( A, labels_start, - CategoricalConvertor(A), + convertor, Val(:categorical), - params + params, + num_categories = num_bins(convertor) ) end function nethist_continuous( A, k, - labels_start = shuffle(ordered_start_labels(size(A, 1), k)); + labels_start = ordered_start_labels(size(A, 1), k); num_bins_::Int = 10, params::GreedyParams = GreedyParams()) + convertor = UnitIntervalConvertor(num_bins_) + @info "Using $(num_bins(convertor)) discrete categories for edge values" _nethist( A, 
labels_start, - UnitIntervalConvertor(num_bins_), + convertor, Val(:categorical), + params, + num_categories = num_bins(convertor) + ) +end + +function nethist_binary( + A, k, + labels_start = ordered_start_labels(size(A, 1), k); + params::GreedyParams = GreedyParams()) + _nethist( + A, labels_start, + BinaryConvertor(), + Val(:binary), params ) end @@ -30,9 +48,8 @@ function _nethist( reset!(params) end data = convertor.(A) - @info "Using $(num_bins(convertor)) discrete categories for edge values" es = GreedySuffStats( - data, labels_start, num_categories = num_bins(convertor), + data, labels_start, type_suff_stats = type_suff_stats, max_iter = params.max_iter, swap_rule = params.node_swap_rule, @@ -55,6 +72,7 @@ function oracle_estimator( es_dummy = GreedySuffStats(block_ss, block_ss_swap, RandomGroupSwap(), PreviousBestValue(1_000, Inf, :min), 1) init!(es_dummy, data, oracle_labels) + @info "Oracle estimator loss: $(loss(es_dummy, norm = get_num_obs(data)))" parameters = to_params.(es_dummy.block_ss) return convert_to_result(oracle_labels, convertor, parameters) end diff --git a/src/utils/utils_node_labels.jl b/src/utils/utils_node_labels.jl index c9351fa..3cb4947 100644 --- a/src/utils/utils_node_labels.jl +++ b/src/utils/utils_node_labels.jl @@ -43,3 +43,8 @@ function order_groups(node_labels, latents::AbstractVector) return sort( 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) end + +function get_num_obs(A::AbstractMatrix) + n = size(A, 1) + return n * (n - 1) ÷ 2 +end From 49929ad8091b0cf53919fe65dd3ac1b6fa244cea Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 29 Oct 2025 14:14:04 +0100 Subject: [PATCH 238/266] clean api --- src/GreedySuffStats.jl | 24 +++++++++++------------- src/api.jl | 37 +++++++++++++++++++++++-------------- src/preprocessor/binary.jl | 2 ++ 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl index 6f3a2a4..fd87e32 100644 --- a/src/GreedySuffStats.jl +++ 
b/src/GreedySuffStats.jl @@ -51,22 +51,20 @@ end return total_loss / norm end -function GreedySuffStats( - data, node_labels; type_suff_stats = Val(:categorical), max_iter = 10000, - node_swap_rule = RandomGroupSwap(), stop_rule = PreviousBestValue(5_000, Inf, :min), - dist = nothing, - kwargs...) - # derive user input +function make_greedy_suffstats_estimator( + data, + node_labels; + type_suff_stats = Val(:categorical), + max_iter = 10_000, + node_swap_rule = RandomGroupSwap(), + stop_rule = PreviousBestValue(5_000, Inf, :min), + kwargs... +) k = length(unique(node_labels)) - - # allocate sufficient statistics blocks - block_ss = make_k_block(k, type_suff_stats; data = data, dist = dist, kwargs...) - block_ss_swap = make_k_block(k, type_suff_stats; data = data, dist = dist, kwargs...) - - # create estimator + block_ss = make_k_block(k, type_suff_stats; data = data, kwargs...) + block_ss_swap = make_k_block(k, type_suff_stats; data = data, kwargs...) return GreedySuffStats{typeof(block_ss), typeof(node_swap_rule), typeof(stop_rule)}( block_ss, block_ss_swap, node_swap_rule, stop_rule, max_iter) - return es end function estimate!( diff --git a/src/api.jl b/src/api.jl index 886f6db..afc995b 100644 --- a/src/api.jl +++ b/src/api.jl @@ -8,7 +8,7 @@ function nethist_categorical( A, labels_start, convertor, Val(:categorical), - params, + params; num_categories = num_bins(convertor) ) end @@ -16,15 +16,15 @@ end function nethist_continuous( A, k, labels_start = ordered_start_labels(size(A, 1), k); - num_bins_::Int = 10, + bins::Int = 10, params::GreedyParams = GreedyParams()) - convertor = UnitIntervalConvertor(num_bins_) + convertor = UnitIntervalConvertor(bins) @info "Using $(num_bins(convertor)) discrete categories for edge values" _nethist( A, labels_start, convertor, Val(:categorical), - params, + params; num_categories = num_bins(convertor) ) end @@ -48,30 +48,39 @@ function _nethist( reset!(params) end data = convertor.(A) - es = GreedySuffStats( - data, 
labels_start, + es = make_greedy_suffstats_estimator( + data, + labels_start; type_suff_stats = type_suff_stats, max_iter = params.max_iter, - swap_rule = params.node_swap_rule, + node_swap_rule = params.node_swap_rule, stop_rule = params.stop_rule, - progress = params.display_progress; kwargs... ) node_labels, parameters = estimate!( - es, data, labels_start; iter_progress = params.progress_freq) + es, data, labels_start; + progress = params.display_progress, + iter_progress = params.progress_freq) return convert_to_result(node_labels, convertor, parameters) end function oracle_estimator( - data, oracle_labels, convertor; type_suff_stats = Val(:categorical)) + A, oracle_labels, convertor; type_suff_stats = Val(:categorical), kwargs...) + + # prepare data k = length(unique(oracle_labels)) - # allocate sufficient statistics blocks - block_ss = make_k_block(k, type_suff_stats; data = data) - block_ss_swap = make_k_block(k, type_suff_stats; data = data) - es_dummy = GreedySuffStats(block_ss, block_ss_swap, RandomGroupSwap(), + data = convertor.(A) + # prepare suff stats + block_ss = make_k_block( + k, type_suff_stats; data = data, num_categories = num_bins(convertor), kwargs...) + + # compute oracle suff stats + es_dummy = GreedySuffStats(block_ss, copy(block_ss), RandomGroupSwap(), PreviousBestValue(1_000, Inf, :min), 1) init!(es_dummy, data, oracle_labels) + + # retrieve parameters @info "Oracle estimator loss: $(loss(es_dummy, norm = get_num_obs(data)))" parameters = to_params.(es_dummy.block_ss) return convert_to_result(oracle_labels, convertor, parameters) diff --git a/src/preprocessor/binary.jl b/src/preprocessor/binary.jl index b9d80a1..b1f5f27 100644 --- a/src/preprocessor/binary.jl +++ b/src/preprocessor/binary.jl @@ -8,3 +8,5 @@ function to_distribution( c::BinaryConvertor, p::T; kwargs...) 
where {T <: Real} return p end + +num_bins(::BinaryConvertor) = 2 From e02e287fe2115ab67ef4aa97436b64220642e26a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 31 Oct 2025 12:33:26 +0100 Subject: [PATCH 239/266] use hungarian alg to align node labels --- Project.toml | 2 + docs/literate/tutorials/simple_graph.jl | 63 +++++--------- docs/literate/tutorials/weighted_network.jl | 66 +++++++++++---- src/NetworkHistogram.jl | 7 +- src/api.jl | 6 +- src/distributions/hist_dist.jl | 8 +- src/utils/align_partition.jl | 75 +++++++++++++++++ src/utils/utils_node_labels.jl | 18 +--- test/runtests.jl | 1 + test/test_align_partitions.jl | 91 +++++++++++++++++++++ 10 files changed, 255 insertions(+), 82 deletions(-) create mode 100644 src/utils/align_partition.jl create mode 100644 test/test_align_partitions.jl diff --git a/Project.toml b/Project.toml index 05202d7..0a487fe 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" +Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39" IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" @@ -36,6 +37,7 @@ Accessors = "0.1.42" ArgCheck = "2.5.0" BenchmarkTools = "1.6.3" Clustering = "0.15.8" +Hungarian = "0.7.0" IntervalSets = "0.7.11" LinearAlgebra = "1.12.0" LogExpFunctions = "0.3.29" diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index 0e93a2d..467517d 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -44,7 +44,7 @@ end # 1. 
**Assign latent positions:** For a graph with `n` nodes, we sample `n` independent and identically distributed random variables $u_1, u_2, \dots, u_n$ from a Uniform(0, 1) distribution. These are the latent positions of our nodes. # 2. **Generate edges:** For each pair of nodes `(i, j)` with `i < j`, we generate a random number from a Bernoulli distribution with probability $W(u_i, u_j)$. This determines whether an edge exists between them. The resulting adjacency matrix `A` will be symmetric. # Let's sample a graph with 2000 nodes from our graphon `W`. -n = 2000 +n = 3000 u_true = rand(n); # Latent positions A = sample_graph(w, u_true); @@ -76,29 +76,25 @@ end # Now, let's use `NetworkHistogram.jl` to fit a network histogram to the graph `A` we sampled earlier. We will try to recover the underlying 2-block structure. -# First, we need to represent our graph in a format that the package understands. -# We can use an `EdgeList` to store the edges of the graph. -edge_list = EdgeList(A); - -# We also need to define the model for the edges. Since our graph is unweighted, -# we can use a `Bernoulli` distribution. The `Dist` wrapper is used to -# handle aggregation of distributions. -import NetworkHistogram: Dist, Assignment, nethist -dist = NetworkHistogram.Bernoulli(0.5) # The initial probability doesn't matter much. - # We start with a random initial assignment of nodes to `k=5` groups. k = 10 oracle_labels = ordered_start_labels(n, k); initial_assignment = shuffle(oracle_labels); -# Now, we create an `Assignment` object, which holds all the information -# about the model and the current state of the node groupings. -oracle_estimator = Assignment(oracle_labels, edge_list, Dist(dist)); -sbm_oracle = NetworkHistogram.to_block_model(oracle_estimator); -Mke.heatmap(sbm_oracle, colormap = :binary, colorrange = (0, 1)) +## We can compute the "oracle" estimator, which uses the true latent positions to assign nodes to groups. This serves as a benchmark for our estimation. 
+oracle_res = NetworkHistogram.oracle_estimator( + A, oracle_labels, NetworkHistogram.BinaryConvertor(); type_suff_stats = Val(:binary)); -println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) +let + fig = Mke.Figure(size = (400, 300)) + ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, oracle_res.model, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 2], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end +## # `NetworkHistogram.jl` provides optimization algorithms to improve the initial assignment. # Let's use the `nethist` function with `GreedyParams`, which iteratively moves nodes between # groups to maximize the log-likelihood. @@ -109,34 +105,14 @@ println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) # a = nethist(A, dist, initial_assignment, params_opti, false); -res = NetworkHistogram.nethist_binary_edges(A, - initial_assignment, GreedyParams( - 1_000_000, - RandomGroupSwap(), - Strict(), - PreviousBestValue(1_000, Inf, :min), - true - )); - -a = Assignment(res.node_labels, edge_list, Dist(dist)); -println("Log-likelihood after optimization: ", loglikelihood(a)) - -# The `Assignment` object `a` now contains the optimized node groupings and -# the fitted network histogram parameters. - -# We can visualize the fitted histogram. -heatmap_params(a, ordering = false, colorrange = (0, 1)) - -# We can convert it to a block model for easier interpretation. 
- -# res = NethistResult(a); +res = NetworkHistogram.nethist_binary(A, k, initial_assignment); let fig = Mke.Figure(size = (1220, 400)) titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) - Mke.heatmap!(axes[2], sbm_oracle, + Mke.heatmap!(axes[2], oracle_res.model, colormap = :binary, colorrange = (0, 1)) Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) Mke.Colorbar(fig[1, 4], colormap = :binary, @@ -146,7 +122,7 @@ end # the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. -NetworkHistogram.align_res_true_latents!(res, oracle_estimator.node_labels); +NetworkHistogram.align_res_true_latents!(res, oracle_res.labels); # and display the true function, the oracle estimator, and the fitted model let @@ -154,7 +130,7 @@ let titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) - Mke.heatmap!(axes[2], sbm_oracle, + Mke.heatmap!(axes[2], oracle_res.model, colormap = :binary, colorrange = (0, 1)) Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) Mke.Colorbar(fig[1, 4], colormap = :binary, @@ -166,10 +142,10 @@ end using Clustering -ξ = NetworkHistogram.node_labels_to_latents(res.node_labels, res.model); +# ξ = NetworkHistogram.node_labels_to_latents(res.labels, res.model); shape_range = 1:(k * (k + 1) ÷ 2 - 1) ssm_estimated, criterion_values = Graphons.estimate_ssm( - res.model, A, ξ, shape_range) + res.model, A, res.labels, shape_range) using Kneedle kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) @@ -178,7 +154,6 @@ kr = 
kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smooth k_knee = knees(kr)[1] ssm_knee = SSM(res.model, k_knee) -Mke.heatmap(ssm_estimated, colormap = :binary, colorrange = (0, 1)) println("Number of shapes in SSM argmin: ", length(ssm_estimated.θ)) println("Number of shapes in SSM knee: ", length(ssm_knee.θ)) println("Number of shapes in SBM: ", length(res.model.θ)) diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index a640c67..0ce2775 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -6,9 +6,15 @@ using NetworkHistogram using Distributions using LinearAlgebra using Random +using ProgressMeter -graphon = DecoratedGraphon((x, y) -> Kumaraswamy( - 4 * (cos(π * (x - y)) + 1) + 1, max(x, y) * 8 + 1)) +import Distributions: pdf + +pdf_kuma(α, β, x, p = 1.0) = @. p * (α * β * x^(α - 1) .* (1 - x^α)^(β - 1)) + +graphon_params = (x, y) -> (4 * (cos(π * (x - y)) + 1) + 1, max(x, y) * 8 + 1) + +graphon = DecoratedGraphon((x, y) -> Kumaraswamy(graphon_params(x, y)...)) import CairoMakie as Mke let @@ -22,16 +28,18 @@ let fig end -n = 4000 -k = 4 -A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(0.9), n, n)); +n = 2000 +k = 10 +n_bins = 20 +p = 0.9 +A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(p), n, n)); +ξs = range(0, 1; length = n) oracle_latents = ordered_start_labels(n, k); -starting_labels = copy(oracle_latents); -p_shuffle = 1 - 1.5 / k -@info "Shuffling $(p_shuffle*100)% labels for starting point" -indices_to_shuffle = sample(1:n, floor(Int, n * p_shuffle), replace = false); -starting_labels[indices_to_shuffle] .= shuffle(starting_labels[indices_to_shuffle]); -@assert starting_labels != oracle_latents + +res_oracle = NetworkHistogram.oracle_estimator( + A, oracle_latents, NetworkHistogram.UnitIntervalConvertor(n_bins)); + +starting_labels = shuffle(oracle_latents); max_iter = 1_000_000 stalled_iters = 5_000 @@ -39,7 
+47,7 @@ stalled_iters = 5_000 res_new = NetworkHistogram.nethist_continuous( A, k, starting_labels; - num_bins_ = 10 + bins = n_bins ); # convertor = NetworkHistogram.UnitIntervalConvertor(10) @@ -60,12 +68,42 @@ res_new = NetworkHistogram.nethist_continuous( # counts(node_labels_es_new) ./ length(node_labels_es_new)); # res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); -NetworkHistogram.align_res_true_latents!(res_new, oracle_latents); +NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); +xs = range(0, 1; length = 100) + +# function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 20, p = p) +# nodes_1 = findall(res_oracle.labels .== g1) +# nodes_2 = findall(res_oracle.labels .== g2) +# edge_values = [A[x, y] for y in nodes_2 for x in nodes_1] +# Mke.vlines!(axis, edge_values, ymax = 0.025, color = :lightgray) +# # Mke.hist!(axis, edge_values; normalization = :pdf, color = :gray) +# x1 = sample(ξs[nodes_1], n_viz, replace = false) +# x2 = sample(ξs[nodes_2], n_viz, replace = false) +# for x_ in x1 +# for y_ in x2 +# Mke.lines!(axis, xs, pdf_kuma(graphon_params(x_, y_)..., xs, p), +# color = :gray, alpha = 0.1) +# end +# end +# Mke.lines!(axis, xs, map(Base.Fix1(pdf, res_oracle.model.θ[g1, g2]), xs), +# color = :blue, label = "True") +# Mke.lines!(axis, xs, map(Base.Fix1(pdf, res_new.model.θ[g1, g2]), xs), +# color = :black, linestyle = :dash, label = "Estimated") +# end +# for g in 1:k +# @showprogress for g2 in 1:g +# fig = Mke.Figure(size = (600, 400)) +# ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", +# ylabel = "Density") +# viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_new, xs, p = p) +# display(fig) +# end +# end ## ssm_test = SSM(res_new.model, k) -shape_range = 1:min(5, k * (k + 1) ÷ 2 - 1) +shape_range = 1:(k * (k + 1) ÷ 2 - 1) ssm_estimated, criterion_values = Graphons.estimate_ssm( res_new.model, A, res_new.labels, shape_range) diff --git 
a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 7979006..3339fd7 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -12,15 +12,16 @@ import Random: randperm, AbstractRNG, rand, shuffle import Distributions: logpdf, pdf import LogExpFunctions: xlogx using IntervalSets - +using Hungarian using Reexport @reexport using Graphons -import Graphons: _extract_param, convert_to_params +import Graphons: _extract_param, convert_to_params, node_labels_to_latents include("SymArray.jl") @reexport using .FastSymArray +include("utils/align_partition.jl") include("distributions/hist_dist.jl") include("preprocessor/abstractConvertor.jl") include("config_rules/include.jl") @@ -30,6 +31,6 @@ include("utils/utils_node_labels.jl") include("api.jl") export GreedyParams, nethist, nethist_discrete_edges, ordered_start_labels, RandomGroupSwap, - Strict, PreviousBestValue, nethist_binary_edges + Strict, PreviousBestValue, nethist_binary_edges, align_partitions end diff --git a/src/api.jl b/src/api.jl index afc995b..6e2d923 100644 --- a/src/api.jl +++ b/src/api.jl @@ -4,7 +4,7 @@ function nethist_categorical( params::GreedyParams = GreedyParams()) convertor = CategoricalConvertor(A) @info "Using $(num_bins(convertor)) discrete categories for edge values" - _nethist( + return _nethist( A, labels_start, convertor, Val(:categorical), @@ -20,7 +20,7 @@ function nethist_continuous( params::GreedyParams = GreedyParams()) convertor = UnitIntervalConvertor(bins) @info "Using $(num_bins(convertor)) discrete categories for edge values" - _nethist( + return _nethist( A, labels_start, convertor, Val(:categorical), @@ -33,7 +33,7 @@ function nethist_binary( A, k, labels_start = ordered_start_labels(size(A, 1), k); params::GreedyParams = GreedyParams()) - _nethist( + return _nethist( A, labels_start, BinaryConvertor(), Val(:binary), diff --git a/src/distributions/hist_dist.jl b/src/distributions/hist_dist.jl index 60cd4af..a500b37 100644 --- a/src/distributions/hist_dist.jl +++ 
b/src/distributions/hist_dist.jl @@ -19,7 +19,13 @@ function HistDistribution(bins, ps) return HistDistribution{typeof(bins), typeof(ps), typeof(cum_ps)}(bins, ps, cum_ps) end -logpdf(d::HistDistribution, x::Real) = log(pdf(d, x)) +function logpdf(d::HistDistribution, x::Real) + # potentially slow + bin_idx = findfirst(b -> x ∈ b, d.bins) + p = d.probs[bin_idx] + bin_idx == 1 && return log(p) + return log(p) - log(width(d.bins[bin_idx])) +end function pdf(d::HistDistribution, x::Real) # potentially slow diff --git a/src/utils/align_partition.jl b/src/utils/align_partition.jl new file mode 100644 index 0000000..79f6b20 --- /dev/null +++ b/src/utils/align_partition.jl @@ -0,0 +1,75 @@ + +""" + align_partitions(x::AbstractVector{<:Integer}, y::AbstractVector{<:Integer}) + +Align labels of partition `y` to match partition `x` using optimal matching. + +Returns `(y_aligned, mapping)` where: +- `y_aligned`: Vector with same length as `y`, with labels relabeled to match `x` +- `mapping`: Dictionary mapping original `y` labels to aligned `x` labels + +The alignment maximizes the overlap between partitions using the Hungarian algorithm. +Unmatched labels from `y` are assigned to unused labels from `x`. 
+ +# Arguments +- `x::AbstractVector{<:Integer}`: Reference partition labels +- `y::AbstractVector{<:Integer}`: Partition labels to align + +# Examples +```julia +x = [1, 1, 2, 2, 3] +y = [2, 2, 1, 1, 3] +y_aligned, mapping = align_partitions(x, y) +# y_aligned == [1, 1, 2, 2, 3] +# mapping == Dict(2 => 1, 1 => 2, 3 => 3) +``` +""" +function align_partitions(x::AbstractVector{<:Integer}, y::AbstractVector{<:Integer}) + @argcheck length(x)==length(y) "Partitions must have same length" + + # Get unique labels and create index mappings + xlabs = sort!(unique(x)) + ylabs = sort!(unique(y)) + Kx, Ky = length(xlabs), length(ylabs) + + # Build contingency matrix: C[i,j] = count where x==xlabs[i] and y==ylabs[j] + C = zeros(Int, Kx, Ky) + for k in eachindex(x, y) + i = searchsortedfirst(xlabs, x[k]) + j = searchsortedfirst(ylabs, y[k]) + @inbounds C[i, j] = 1 + end + + # Solve maximum weight assignment on padded square matrix + K = max(Kx, Ky) + W = zeros(Int, K, K) + @views W[1:Kx, 1:Ky] .= -C + assignment, cost = hungarian(W) + + # Build mapping from y to x labels + mapping = Dict{eltype(ylabs), eltype(xlabs)}() + used_x = Set{eltype(xlabs)}() + + # Map matched pairs within actual partition sizes + for i in 1:Kx + j = assignment[i] + if 1 ≤ j ≤ Ky + mapping[ylabs[j]] = xlabs[i] + push!(used_x, xlabs[i]) + end + end + + # Assign unmatched y labels to unused x labels + unused_x = filter(∉(used_x), xlabs) + for j in 1:Ky + if !haskey(mapping, ylabs[j]) + @argcheck !isempty(unused_x) "Insufficient x labels for alignment" + mapping[ylabs[j]] = popfirst!(unused_x) + end + end + + # Relabel y using the mapping + y_aligned = map(ylab -> mapping[ylab], y) + + return y_aligned, mapping +end diff --git a/src/utils/utils_node_labels.jl b/src/utils/utils_node_labels.jl index 3cb4947..a9f7b25 100644 --- a/src/utils/utils_node_labels.jl +++ b/src/utils/utils_node_labels.jl @@ -11,14 +11,6 @@ function ordered_start_labels(n::Int, k::Int) return labels end -function 
node_labels_to_latents(node_labels::AbstractVector{Int}, sbm) - return map(label -> _label_to_latent(label, sbm), node_labels) -end - -function _label_to_latent(label::Int, sbm) - return sbm.cumsize[label] - eps() -end - function align_res_true_latents!(res::NethistResult, latents) perm = order_groups(res.labels, latents) permute!(res.model, perm) @@ -33,15 +25,7 @@ function permute!(sbm, perm) end function order_groups(node_labels, latents::AbstractVector) - n = length(node_labels) - k = length(unique(node_labels)) - sort_perm = sortperm(latents) - sorted_group_labels = node_labels[sort_perm] - dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] - counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) - for group in 1:k) - return sort( - 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) + return align_partitions(node_labels, latents)[2] end function get_num_obs(A::AbstractMatrix) diff --git a/test/runtests.jl b/test/runtests.jl index bdbb9d6..46bff3c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,4 +6,5 @@ using NetworkHistogram include("test_symarray.jl") include("test_pseudo_suff_stats.jl") include("test_hist_dist.jl") + include("test_align_partitions.jl") end diff --git a/test/test_align_partitions.jl b/test/test_align_partitions.jl new file mode 100644 index 0000000..a97c0ca --- /dev/null +++ b/test/test_align_partitions.jl @@ -0,0 +1,91 @@ +@testset "align_partitions" begin + @testset "Basic alignment" begin + x = [1, 1, 2, 2, 3] + y = [2, 2, 1, 1, 3] + + y_aligned, mapping = align_partitions(x, y) + + @test length(y_aligned) == length(y) + @test y_aligned == [1, 1, 2, 2, 3] + @test mapping[2] == 1 + @test mapping[1] == 2 + @test mapping[3] == 3 + end + + @testset "Perfect match" begin + x = [1, 1, 2, 2, 3, 3] + y = [1, 1, 2, 2, 3, 3] + + y_aligned, mapping = align_partitions(x, y) + + @test y_aligned == x + @test mapping == Dict(1 => 1, 2 => 2, 3 => 3) + end + + @testset "Complete permutation" 
begin + x = [1, 1, 2, 2, 3, 3] + y = [3, 3, 1, 1, 2, 2] + + y_aligned, mapping = align_partitions(x, y) + + @test y_aligned == x + @test mapping == Dict(3 => 1, 1 => 2, 2 => 3) + end + + @testset "Different numbers of clusters" begin + # x has 3 clusters, y has 2 + x = [1, 1, 2, 2, 3, 3] + y = [1, 1, 1, 1, 2, 2] + + y_aligned, mapping = align_partitions(x, y) + + @test length(y_aligned) == length(y) + # The alignment should maximize overlap + # y cluster 1 should map to x cluster with most overlap (1 or 2) + # y cluster 2 should map to x cluster 3 + @test sum(y_aligned[1:4] .== x[1:4]) ≥ 2 # Good overlap for first 4 + end + + @testset "Single cluster" begin + x = [1, 1, 1, 1] + y = [1, 1, 1, 1] + + y_aligned, mapping = align_partitions(x, y) + + @test y_aligned == x + @test mapping == Dict(1 => 1) + end + + @testset "Non-contiguous labels" begin + x = [10, 10, 20, 20, 30, 30] + y = [5, 5, 15, 15, 25, 25] + + y_aligned, mapping = align_partitions(x, y) + + @test length(y_aligned) == length(y) + # Should create optimal matching + @test all(l -> l ∈ [10, 20, 30], y_aligned) + end + + @testset "Error handling" begin + x = [1, 2, 3] + y = [1, 2] + + @test_throws Exception align_partitions(x, y) + end + + @testset "Preserves partition structure" begin + x = [1, 1, 1, 2, 2, 2, 3, 3, 3] + y = [2, 2, 2, 3, 3, 3, 1, 1, 1] + + y_aligned, mapping = align_partitions(x, y) + + # Check that elements in same cluster in y stay together in y_aligned + @test length(unique(y_aligned[1:3])) == 1 + @test length(unique(y_aligned[4:6])) == 1 + @test length(unique(y_aligned[7:9])) == 1 + + # Should achieve perfect alignment after relabeling + @test y_aligned == x + end +end From 4174c333f2d77e87d2de71053806749d4dae587b Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Wed, 12 Nov 2025 16:56:23 +0100 Subject: [PATCH 240/266] start adding OT methods --- docs/literate/tutorials/simple_graph.jl | 59 +++++++++++++++- docs/literate/tutorials/weighted_network.jl | 78 +++++++++++++-------- src/api.jl 
| 5 +- src/utils/utils_node_labels.jl | 32 +++++++-- 4 files changed, 138 insertions(+), 36 deletions(-) diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index 467517d..e219dbe 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -122,7 +122,7 @@ end # the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. -NetworkHistogram.align_res_true_latents!(res, oracle_res.labels); +NetworkHistogram.align_res_true_latents!(res, oracle_res.labels, type = :greedy); # and display the true function, the oracle estimator, and the fitted model let @@ -172,3 +172,60 @@ let limits = (0, 1), label = "Edge Probability", width = 20) fig end + +## + +k_kmeans = 10; +clustering_res = kmeans(A, k_kmeans); + +res_kmeans = NetworkHistogram.oracle_estimator( + A, assignments(clustering_res), NetworkHistogram.BinaryConvertor(); + type_suff_stats = Val(:binary), + name = "k-means"); + +NetworkHistogram.align_res_true_latents!(res_kmeans, oracle_res.labels, type = :greedy); + +# and display the true function, the oracle estimator, and the fitted model +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], oracle_res.model, + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], res_kmeans.model, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end + +# ξ = NetworkHistogram.node_labels_to_latents(res.labels, res.model); +shape_range = 1:(k_kmeans * (k_kmeans + 1) ÷ 2 - 1) +ssm_estimated, criterion_values = Graphons.estimate_ssm( + 
res_kmeans.model, A, res_kmeans.labels, shape_range) + +using Kneedle +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +# Let's extract the optimal number of shapes using the Kneedle algorithm: + +k_knee = knees(kr)[1] +ssm_knee = SSM(res_kmeans.model, k_knee) + +println("Number of shapes in SSM argmin: ", length(ssm_estimated.θ)) +println("Number of shapes in SSM knee: ", length(ssm_knee.θ)) +println("Number of shapes in SBM: ", length(res_kmeans.model.θ)) + +# We greatly reduced the number of parameters from the original SBM estimate while preserving much of the structure of the estimated graphon as seen below: + +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["SBM", "SSM argmin", "SSM knee"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], res_kmeans.model, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], ssm_estimated, + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], ssm_knee, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index 0ce2775..ea55ae7 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -71,34 +71,35 @@ res_new = NetworkHistogram.nethist_continuous( NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); xs = range(0, 1; length = 100) -# function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 20, p = p) -# nodes_1 = findall(res_oracle.labels .== g1) -# nodes_2 = findall(res_oracle.labels .== g2) -# edge_values = [A[x, y] for y in nodes_2 for x in nodes_1] -# Mke.vlines!(axis, edge_values, ymax = 0.025, color = :lightgray) -# # Mke.hist!(axis, edge_values; normalization = :pdf, color = :gray) -# x1 = 
sample(ξs[nodes_1], n_viz, replace = false) -# x2 = sample(ξs[nodes_2], n_viz, replace = false) -# for x_ in x1 -# for y_ in x2 -# Mke.lines!(axis, xs, pdf_kuma(graphon_params(x_, y_)..., xs, p), -# color = :gray, alpha = 0.1) -# end -# end -# Mke.lines!(axis, xs, map(Base.Fix1(pdf, res_oracle.model.θ[g1, g2]), xs), -# color = :blue, label = "True") -# Mke.lines!(axis, xs, map(Base.Fix1(pdf, res_new.model.θ[g1, g2]), xs), -# color = :black, linestyle = :dash, label = "Estimated") -# end -# for g in 1:k -# @showprogress for g2 in 1:g -# fig = Mke.Figure(size = (600, 400)) -# ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", -# ylabel = "Density") -# viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_new, xs, p = p) -# display(fig) -# end -# end +function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 20, p = p) + nodes_1 = findall(res_oracle.labels .== g1) + nodes_2 = findall(res_oracle.labels .== g2) + edge_values = [A[x, y] for y in nodes_2 for x in nodes_1] + Mke.vlines!(axis, edge_values, ymax = 0.025, color = :lightgray) + # Mke.hist!(axis, edge_values; normalization = :pdf, color = :gray) + x1 = sample(ξs[nodes_1], n_viz, replace = false) + x2 = sample(ξs[nodes_2], n_viz, replace = false) + for x_ in x1 + for y_ in x2 + Mke.lines!(axis, xs, pdf_kuma(graphon_params(x_, y_)..., xs, p), + color = :gray, alpha = 0.1) + end + end + Mke.lines!(axis, xs, map(Base.Fix1(pdf, res_oracle.model.θ[g1, g2]), xs), + color = :blue, label = "True") + Mke.lines!(axis, xs, map(Base.Fix1(pdf, res_new.model.θ[g1, g2]), xs), + color = :black, linestyle = :dash, label = "Estimated") +end + +for g in 1:k + @showprogress for g2 in 1:g + fig = Mke.Figure(size = (600, 400)) + ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", + ylabel = "Density") + viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_new, xs, p = p, n_viz = 5) + display(fig) + end +end ## ssm_test = SSM(res_new.model, k) @@ -116,3 +117,24 @@ 
Mke.lines(shape_range, criterion_values) # k_knee = knees(kr)[1] # ssm_knee = SSM(res_new.model, k_knee) + +## + +clustering_res = kmeans(A, k) + +res_kmeans = NetworkHistogram.oracle_estimator( + A, assignments(clustering_res), NetworkHistogram.UnitIntervalConvertor(n_bins); + type_suff_stats = Val(:categorical), + name = "k-means"); + +NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels, type = :greedy); + +for g in 1:k + @showprogress for g2 in 1:g + fig = Mke.Figure(size = (600, 400)) + ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", + ylabel = "Density") + viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_kmeans, xs, p = p, n_viz = 5) + display(fig) + end +end diff --git a/src/api.jl b/src/api.jl index 6e2d923..c14d9ea 100644 --- a/src/api.jl +++ b/src/api.jl @@ -66,7 +66,8 @@ function _nethist( end function oracle_estimator( - A, oracle_labels, convertor; type_suff_stats = Val(:categorical), kwargs...) + A, oracle_labels, convertor; type_suff_stats = Val(:categorical), + name = "oracle", kwargs...) 
# prepare data k = length(unique(oracle_labels)) @@ -81,7 +82,7 @@ function oracle_estimator( init!(es_dummy, data, oracle_labels) # retrieve parameters - @info "Oracle estimator loss: $(loss(es_dummy, norm = get_num_obs(data)))" + @info "$name estimator loss: $(loss(es_dummy, norm = get_num_obs(data)))" parameters = to_params.(es_dummy.block_ss) return convert_to_result(oracle_labels, convertor, parameters) end diff --git a/src/utils/utils_node_labels.jl b/src/utils/utils_node_labels.jl index a9f7b25..d259b85 100644 --- a/src/utils/utils_node_labels.jl +++ b/src/utils/utils_node_labels.jl @@ -11,10 +11,17 @@ function ordered_start_labels(n::Int, k::Int) return labels end -function align_res_true_latents!(res::NethistResult, latents) - perm = order_groups(res.labels, latents) +function align_res_true_latents!(res::NethistResult, latents; type = :greedy) + if type ∉ (:opt, :greedy) + error("Unknown alignment type: $type. Use :opt or :greedy.") + end + if type == :opt + @warn "The :opt alignment may not work that well; consider using :greedy instead." 
+ end + new_labels, mapping = order_groups(res.labels, latents, Val(type)) + res.labels .= new_labels + perm = [key for (key, val) in sort(collect(mapping), by = last)] permute!(res.model, perm) - res.labels .= map(x -> findfirst(==(x), perm), res.labels) end function permute!(sbm, perm) @@ -24,8 +31,23 @@ function permute!(sbm, perm) sbm.cumsize .= cumsum(sbm.size) end -function order_groups(node_labels, latents::AbstractVector) - return align_partitions(node_labels, latents)[2] +function order_groups(node_labels, latents::AbstractVector, ::Val{:opt}) + return align_partitions(node_labels, latents) +end + +function order_groups(node_labels, latents::AbstractVector, ::Val{:greedy}) + n = length(node_labels) + k = length(unique(node_labels)) + sort_perm = sortperm(latents) + sorted_group_labels = node_labels[sort_perm] + dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] + counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) + for group in 1:k) + perm = sort( + 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) + new_labels = map(x -> findfirst(==(x), perm), node_labels) + mapping = Dict(perm[i] => i for i in 1:k) + return new_labels, mapping end function get_num_obs(A::AbstractMatrix) From 68ad7c0a051f8fadc8dd7d1a367b6248d8a9f252 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 10:40:50 +0100 Subject: [PATCH 241/266] add PythonCall dependency for ot functions and remove align partitions with hungarian --- CondaPkg.toml | 2 + Project.toml | 2 + .../PythonOptimalTransport.jl | 32 + .../__pycache__/fngw.cpython-314.pyc | Bin 0 -> 32832 bytes ext/PythonOptimalTransport/fngw.py | 1002 +++++++++++++++++ ext/PythonOptimalTransport/srGW.jl | 508 +++++++++ src/NetworkHistogram.jl | 3 +- src/utils/align_partition.jl | 75 -- src/utils/utils_node_labels.jl | 29 +- test/runtests.jl | 1 - test/test_align_partitions.jl | 91 -- 11 files changed, 1564 insertions(+), 181 deletions(-) create mode 100644 CondaPkg.toml 
create mode 100644 ext/PythonOptimalTransport/PythonOptimalTransport.jl create mode 100644 ext/PythonOptimalTransport/__pycache__/fngw.cpython-314.pyc create mode 100644 ext/PythonOptimalTransport/fngw.py create mode 100644 ext/PythonOptimalTransport/srGW.jl delete mode 100644 src/utils/align_partition.jl delete mode 100644 test/test_align_partitions.jl diff --git a/CondaPkg.toml b/CondaPkg.toml new file mode 100644 index 0000000..9446691 --- /dev/null +++ b/CondaPkg.toml @@ -0,0 +1,2 @@ +[deps] +pot = "" diff --git a/Project.toml b/Project.toml index 0a487fe..91a1031 100644 --- a/Project.toml +++ b/Project.toml @@ -26,11 +26,13 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0" LightMC = "b58f5c6e-c887-41d6-b553-02118416cd5d" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" +PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" [extensions] BootstrapExt = "Bootstrap" LightMCExt = "LightMC" MakieExt = "Makie" +PythonOptimalTransport = "PythonCall" [compat] Accessors = "0.1.42" diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl new file mode 100644 index 0000000..78849f4 --- /dev/null +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -0,0 +1,32 @@ +module PythonOptimalTransport +using PythonCall +using NetworkHistogram + +import NetworkHistogram: align_matrices, get_perm_alignment + +const ot = Ref{Py}() +const fngw = Ref{Py}() + +function __init__() + ot[] = pyimport("ot") + pyimport("sys").path.append(@__DIR__) + fngw[] = pyimport("fngw") +end + +jl_to_np(mat) = Py(mat).to_numpy() + +function get_perm_alignment(src, target) + plan = ot[].gromov.gromov_wasserstein( + C2 = jl_to_np(src), C1 = jl_to_np(target)) + plan = pyconvert(Matrix{Float64}, plan) + ordering = argmax(plan, dims = 1) .|> Tuple |> vec + perm = sort(ordering, by = x -> x[1]) .|> last + return perm +end + +function align_matrices(src, target) + perm = 
get_perm_alignment(src, target) + return src[perm, perm] +end + +end diff --git a/ext/PythonOptimalTransport/__pycache__/fngw.cpython-314.pyc b/ext/PythonOptimalTransport/__pycache__/fngw.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..888d28678184b24a9350ce1329cc09d05f55099b GIT binary patch literal 32832 zcmeHwd2kz7dS~Mxjs`#+yzk~A9uf(OlqJiutT9E2lx&NZK&t7P(TIWwkb*@LU<1@O zqYNjRO@XmBh9`Cz&e&zDCa$4tj?g=CjW$VboY^DZOp>AuG2$Rfq9o2bQ~SrtmM4x= zmEHY)ue*T;2uhU4No8xdOL~0$`W@f9zW2TF{oaQy#SWgs)Ac{gLtmffxWAGMdp8}jA9TbV%x^wae9#$lF~0?Vcg%gz6Z151zEYpHf}7OEN_>0;7c2GI;FkI9 zaLavVJ_q|&;VbtQGhe08i7!6*EYs-ID{20ILC zV;~fbr9H={1HtjobTDwlFZri}v7nT8OybLEEIjI$B>#mDUD`8tIyiP_RPs*;!c(Ks zm_HUosBIz`8$IP8I}@A^bm-HbN#rsb4f>_A(?~fLdODo0jD{!A21mOCqx4bvHZ$uF zAh{tpMSpCe=}>HR${&+L=aGwP;@o678cmxg&P_~6{=itd(hc;d4F3%rXS~yDqb@nF zQ@@FZM$X7Z44M!<3(;#rj4Z^c3DL0--Erh8KceBdQT0Ed41hKzU;TrwnUobG2770<`i0|;u*s5rC*KL3Hb$gUG;AP^J||6;Xd7>2ZCifCk()cc zjpXI%(>vkCh}@u1xi?co^d<QF`1ES+PV0s}k?O;HMbG#{ERI`^ zBbKRgA~j;o5#*HrjBsMr1Ih@9REyY~;C|^9xf6PIG)L-=SE!_lbz)s6ymp;>)ZV2YwWtTE60LdlC|OevquQpZ zM@gjqaI;u4UeEfd{798bPxi0KiIv;+cPY6ZB_kiv=;Dqw%<8?|tiB(;5NSXyYO?vL zKG7yNs5KqbKaLi#a`jw9h%_9pQ{!mnfKH`W{fkl>l~TZuB8{3htQU1+qjo%N%BfV! 
zX^J$S;3CZ@bdeVP--Q3IqC3-Opr)4F%0=2RKihmotssc`Oc|d!e?lC6{1&DS8t^2c(Yp9k+(#zs8?%>IgsH!N@;c<{|lG4DW^0; zrnKs88;TVL#^_^gjH&~S6Pt2JD0q`0FQit~=Y`lrV_t||bciO=EEfB0qU(fy*Vj4D zXZJZCRJd#IO5iKj&Iz>Q6PY>Jh&hTr$bZCogm1|Y#c1Cwa|3l=b!t{2weFFYb@Hqg zI||0%v`+kLvAtk??R@pq_<0I5Ra?U&T?Hs_QR!=-nQgjf{MMZK=6lB9loQ`_&-j8S zzDpe*0dNxVjph^YbeghtIKi>eyP3Wn->QcDoJYD<#VgOUIw34nB3lW8sfg-j|ve9!Z}_7LU2ZC@9yakLeoNc zRuUrqNKoSW%-$f%cV!-aP}q%lF=G$;n!X>?77&ICJ!pnvSN5S$E7fg<5FsEeKk6+-)Tv@_7b3L6Xug2H&v zADac;EcxW5S{~mU1zGo*?{rkocqk^#j!~`x+mMW{&Asna<=z=+lM6Tyn8;8r#Dde& zu(YNEkxx;Cocm#aG`falFF)y@Iu$@WW}9VvGVGW8?Fpp=sEbYxofM)GKWg13oC}5~ zPRF9We=>5~kCfRIy(}c6?iPjdu#_EEw5Rk>j|KTt!PuusHXb@33}A<;cEfJrR5(1z zI(_7H5JaM^uz*;s$ZR}ikZDGBU{(rEPq1jSkpOnAQC`k`8iQvvGkES^0L4`YnmUj- zQbrKnywNf!?){ZPjXXL&J3ZQqjzJr+=6M3UW&wd?u~2wgn2lmCsy)Lt;gS#xvG!|= zVgoJ(M`@egCZO}$&P>WLd?w>QO~!|&r_t(a$KJIFuP_)sCxoXbF9>auJ)_fMX{wE% z@}D0K5uahqqEiS*h(f>IsfYdNH7tXo`RQwvkA)}IQJ0rwC>D;OTQT!bhbIFlO$bU7 zs?ol8TZflF8c?2!=!qEDH^T`AzNQpA2J=WXr{52~QMG>Q}i;P?<|oFd~HmvtyxCp~+C}0?+$` zSSv7Cl_im1r{PZyp6nD(22RRMdS7PcMOUkQh+n6z6jnQ$k^?f;DzXJo^?r=up6Cq zdiTlW^yk$0mxnK%%&tQ1GHKJ%lN~|~-4u<4rC4U=&CQ@)&EfE+(P+nL6xsN(=x5jx z8xyo%W8pa$1bwuiDXq(LTDwAE1h80P#uhU@LDGyubvtFBDfy{Ooq|fm=ZOj;LlsW4 zRpI$JbZK+!LL@lqk6vuSyzQg;$rc}Q8ewV{BF`xrbqXg423V}#4aMreSFE%P>=3Ls zyU58Q+9fINpg5TUlr|4X8H-NaG;2fJtg!Hj4^IDW{H=d{Z|`u2G0j83I|McjuAVkW z!HS1ZOKG#hb<+lninMO{qW#OecSJ9J<;%Ofd!m>A$#5v+EJNbryIv<2_BusXbekdMkhc?`0VI8640b*EEt;ZNgHW>Oq*#TgM6Dd`6Cf% z($hr|TBXt^d6DYan$``b_5IPbz8C-d@IM0WxFXV}i%Yd|o3^M+c-nwvSt5O%L~7x*NnZbyzx~SJ-n3oIbJP0i^J(2sS{F#0 z0;5!HTGyAhP6Q!V_+w!Vir#e5dH-b4pVsxKbu)5@=%SK9MHt2+F^P1u5-DOOXtFqr zg7Cy>x~LDkmb2-iUir^c?2qn3+T0&SCrm}sIyB-i^G0a{+gk9U*tNi>a-_^RcZ*F2qiUr=Ntt zGv%KgRyo|(@#%?k-pGZti##|%QFmfd={V@5?Pb)5p+CocbXL#t6-oUyYt7ugo7VEI zZ`Er1z@X<%bNfCrX`rl)0j)8i@6 z6U+7|Zn?_mdvCGO%FT;=QkA>nhgMxxFFBugF6b74S2{j{YuRo-@GUveff^HTJM=#_o(f#veHo371^4=xQ{J^HpT z<=S_{wf~xHf4uj-)-B)bThf1b;5VmJjvZH~7u?SV65WYYN%xDvZ%kjAj_a2lJKlHJ 
zuR5!eU%Kwx{L{AfR8_~7nT5t=<7>^YG+&uXn3k(LZkBFY?7eDwTc0ZJOXxrZ#sBIw<$%ma>Mf)conz-VfpL}THsyBb~A>Y zd8qc-4&xtnl^xq{{7FykvCkR*xVcO;8vmraOtc#R)Yj_~i;ZK-K?YLEd>K-N@y^$A zw3>7H%&7{(i1tW&?E$oUNUa+P^U>!ASWW8;0j?xYaSK6}3YEtB$6$^eL1)i55M)9gTbV#!wCw*2{ zvnmYA)C?**{Lm2Rk|Jy0{bzBrpCTV-Nt#>?5)?^AWz3=a}8If zUmgvC{NC)vXkdAt_wpSUwy{vQ5z*Nx38x&KL`QpRLm(Vu0;ohMRESA}u>^zwCx`AN z-vBvB;iOHmsR%Mi8zPZd)PfL_F-e@1zDRNC@Pj!zu#D&r@CKQ#m9v(sQWfN@nQN9R zlCQGD9V9J*BrPG?O^!ch6(D3eT#)q2YhKx&ocY$y7ru0-$YACFx~wUoyKUecrI#N| zdR{AkxqLzY9n%%l!p!Tj@0`7Ic2WBNY)aU%H1k$0Rki1aeb2Ie&ri#mKA^OB3`NDJ zx&GUB&gOWoeZD=>{frmFgw1_<&$6-NlMfw;6Q!NjA6R=!_dR0C;f2Ie7%%+2d|sFV zEZhT(vle=D|3I3J!eXcv?7^|R3VtzU`K?bU8niqHl;gxievW5&=kd(34~~6NYkJhb z8V=mgIB?F<;5s}mhfC*j&?7iR%70XjsO74n!IuN2(S8HRJy?+AnsRuk!T-CcjLJia z11X9SnK@!}8VQcYNa;@B*5i}BhEEngE4l|0oRUv2)@Oph&>_aK_UvOle)QQfK%8u? z!*etIFVLxSKVrD5?#a7xm12On9_a<&mj}x2&%L1^{GGwU!49j;-5L=tHIYNZUphff zhR;c(^h%nK4ztYNcEF$356hZ>EQga$BAl@Sg|o?wjpQhaoMsH3=uhAQV{6ecHVr$= zFt&|uL}{zpNDQ!%wwjH}nUuA0ZXd~s{>#{9-pq1B>01xJa6;jP&1*Q}>qQIF>$BfE zcja8FYU_%9>r&&=z+0PDF4&yM1?`@>!=HR;Q@P%cjlK51s$8x|k_zK`yC8OaHWHwC z|6FgQ5JfH5Dd6K3gGv}sLL3nw5&sUJ|84E%A zKwkmyURM6paQzIoFBEn{x82|~sKCU#O07o!!}lq*PWIsFN1B4hIOimeq*wlKTv;o< zZbW?jzgd3$?6|j=&pG2?Z@C~eLV-hl9D&HE`fFMFI_ERfu}=I`^IAuKrQM8j63G8k zap>tl=hJ7rUT@dl(3!wRyu*)>0DJGyNiE=19kvZD1Zfjg(If)XS|DvBr?BArAblDj zhXh}?EJ%L%3P`<#gp$ZPNQ;9R4i<+_&Eil<>c!&Vf2H(>t^QqG{i>rl-WS&u)Lhcy zuvR*TXk6_6$fcE!ofa$qK`~cO>w<$x$WVwr7n%>Px&%_pS)aEqS8s;CaQ|=hztO+6 z?=2|%c0&pDqVbN|=q&!&Pz0q82j>GjGWmeAyJO&)$Xre$^XB=@3ExXkJ^xhF|LZ59 z-_m`z|F`?D>VDXNwRd^T-uUJe|2eHVU4itnIQ?*M`Th#aSTPz%eH!6`plQ7GM_|f+ zonP;7@o)0C`rGEXk>AiFs7~m1?FQ}>%&?X_%AM9AzDGn=Ks*QTB8em7kZwpn@_@)~ zH=n~&5OGFa5qHEBDT$Ou%0&A(Lr7~d=1$QixaqZQV#6Qe}UBVt%5MrkI7 zah(`hWH^Tw#5o9Xb49)utx7~R`Eryf+NCLCKRjDF=ch6#&WQau9bvKOh|;#|JN+%F z%&`~3&7aT#agNDH52(~*c~Tb92JSN}1D9elWtj5H$Or@@4~&SSeWqS1h@FCkBhX!Th|-p7lTz`9oX>|(XgA=dbcMZ3=_+I+4dozFd_ z_j#}ZD-n&p(xFYhvY{4S$f?-s6pQ!ij2z9vNNuDJ{arx75bJ!E+EL?c)xL>tUsXnw 
zAXIFj9BOl7H2^nQ=c{Ju!ouD$=oI>__g+k=FXDJVN~9(%|E9%n^2lj`}KZ$Q=5)aU4Vjuv}V+0 z1;5tblj`PmdZbA#(eRL_bzWIIRWK*3C}?{@wBq)Hb`0`jQ{H>C_Wd3!M1wZ&cC$LOa0AKFrqnxt z(jX6j4{z5KjvJRP+x=Ti9LFo*YZ)EI(AQbCQwFYdE-U~y5IG&?`JrG?=$)JhOW5E| z3EkNDWwF>$5fxUj9WH#zfNhE}9aByk@)hmpGia$CkRLl40OnyVC@_qajNDeJ>+8Nt zK`2b!$Q;nZTIH>ailB|kd9w`pzMggRixJ@)%1?oo%lY-*l|H#4hBEkK`Anc7VZC=H z?Cu%mP_&Nf6FnnmvY#+bp;NOAkg8}vl_p|nY<_0l7Fs_8g#tMPg0V5cu!8H&13Km* zd~>kFgx68^lgbV>3lmcC$9dV^4T+rB@VW5vynFzd2?V>sJPR1%*^nPo z1&3>N>lr{v`MZ|^4-ZTNjywjmwTvQ#8b<}|(m@f5feHr~4gkp}7$uN&Kt^Hag`&|} zz!}jQLcbqbc=>^C1>v+9N0{Jwl}EON0L3xt6`-L8Vi}nPu?iqk0f&RA$jpghndB#l zq;nw@J+Mv2D@FlwijH5v8JaW^0(hH9#i#}5GZtk`;#>%5OrRRs;PFRU&DR#3Q|4W; z*3ufRHNScDnR6(dH0v2yYGz!}x$HC*AHXG)u^7Or5js!+zEeK82*87O$c&@ z*Q#e27>i~KO32=JVC~F&qyKy;ID%i}up&LXdW+2T7@c)J}ts4N0up9q-(k9l6eNiLwX`P#!W6J%sZam$2PxPX9 z&tw>9;nBU`@fFZ^rxaQI;~aO>S)MqXloon`qjGjETRQF_ocr|DOJO8e=m&ofNZ3?x zdGzW7`YN0}TJ?d6`)k>Wzs#YQ#zJ(V zZ_%A{ZdtZ$VblV48=M#&m;X#iXQ}9~kn=0#{3@KZeqbBWu#A^VQ3{Q*P_(DMw?~?# zkU4Tl$Q~v%Xf!I3vLS8F%;B_dHmy4=Yc**#8sYzoOZK0<-pPr22 zWiocr{`48;6N1<$MCsa#&o|Pq#(cawGzO;U9LL$*p!nT>qp4X)vxB&tQ6k@F}y&>(Ws!{j_e&aaU}O_?^y92N-U zZE5|?wzQ$Io17jvz2xhKPk(qDQv;_B1KUOm^&V?;%z_ErDVmU;qkQ$lNIU_a1E&WL zO^gZjxAfk;ca--kS{8hZN0)Y| zD)wG8md^E49&P)mfp6KkLR2vw{7GD2*rNeLB3+nUVPh0J{N_#$>*W467rRj zuZ(=QzB;|=TdCi6wV9B`d*=>O_tnkUC1w_y<8>)(D>}5g8BO%? 
zHDkr9r}U++=erh63(?nYOGAmSl;db%c`NM~P{Lres`jvgJ9r}$!3sXx+ zuj*5^4_`fc$DrS@0_Ku0XOA-4+F zV5NB5Quor#k7nPTz3Thng&$pd^U~W#|8VFJhEfk6TG@VRrT7r#RjBT2%IKTXADw&i z+*Rp^U;fco-~8&^GdU^qsVG2KY59%PmTRRgtM)R2sRQ6zxdi1`i7Z#rzsS{^#B8{&^B=R%+%^ArvhiyAqIN}!P*#38!#9#QWrQctbazle#G zu>9wQPIKs<(V4+Q49*8sO}Uyw#9Wx7!Z~a5&A_^Fn6C$InVi?nuTY(EBB4?u!n%@} zu_bc^EuH_UG1b2j%gA}{d0)gTTA@cW`@fBbl+{S`y-{@|H9Mv5z4R-t~^~6(t)lk*ryGxAPdj?9t8PsOyL_DYSzDp~~T$HdJf435RPL!&lhV z8GeQXCQdp-&Ntz_p_hfqqDc5$P=X;fouA2RQHs{8?CkX8-RUZ=)Y+%m#&th}sBBA# zecWnE*-Ja0-tmhC-v*>!xjwUmB9)mwr?6*^U@LOttO zsOM2sNP3I1plu%8B0!j@*7_NB0sOy1;S`XA3Fbxr0#{S@pT``Ee3N zP4lLCDbe)I*@X1c#pf?BG%k#!D!Nh@?~?vU<~Pk(-M`oOw(fWPuiD-@x@1mmdlGA% z(=<0gB7y*f>PX+vQ%UD2_XOOx&60n z&bi0e&Nk8@Vze9YkP>XY*`_r?Int+!C=6cIm%XEu{erntRA{b@^l4|5DiUIT==^_| z5o=^sb?kIVg3=ZO42|FeGkMcvy|b_X_httPLU$SI#0vUmC5bF3ou$Q$F$DDjvT_q9 z|Bei7GcdLNr%JeHI+9i{qewp4bJ7lFHiZAny@v3EIUN4~58;@6&V4Wb{;7xXnvqD#(!$f_ zButN}m&bU(W{Evxq!wWHvSJ(~QPw$r84)FNo}+O{LO)cFt(@KQ>@Ow97n@SfZ7a5I z%f@Z;FmvI%`jDO)lArO83-Snno(8|sU*tFY8*;fY&Ty=`Tlug)V$E*mhwgT7fEg6c ziA(c{Jt`M(Q{Dqm{lmatW+OYIQ)7f14%@ZiS!h(a0R~LnqBD~RYElazpqJaKi#SEA z3@sgMD72B!<{$CEgsU~;0nD!mMkO0ylG3;x5JIMSC=pArbt#{YTwjV4baML zuGT`iie+Ny3DBeFnoXHCB6vU-_ySYQl;xQlEQ7h-a&`(us)bDZXql3hsv)Cj*+{%d zg;;_9D#_Me^<~YA=9oRKI9{g47}1FpfJE{>+k=XP&7hB!kt(rL^yEOSsx@YFO8_;l zM#|FdMLzqG!RHt<`icRmbOLe-bCN!fuVkx9bTKF;*m9%>V9HXYsS&H!t1ZS-naJ<@ zJ(ze!3P26Hi2FE3i~IoAT&|@?ZFfL4>$I`d@SGcg^?>!*$&BssdXkXc`SuO?qQb6vd?ep2R9(erMjpQ2z8yjHWPnyqZ7O)$V8fC%w4 z3j7&3T?XAmudLY9>lz1q7xf-T*?7 z3GMQUG|!IdQC=1c<5MT0rZ@L)V-d+_;yoh*Oth~0b}fWSMGs8?1{D?B@hcVn*-2)D zA%vf-hzL)I$&_CJ`)=6c>(JOGLm~K*2Z&nN&}7!0*?}n%6E~iep@Hxt7QA8xO>Kg_ zlGoa+q<)oxFd_jI5w>SiNN!IuV5#Os*hAKy8_LXo3WJ%<`{rf#3mU`Zae@yz3V`D_ zh|gkd>4*y6gAF_wUd;Xw$XW=N%x8C#sb2whL1P`lbb#%08T(Z1Xk~G1a_0IV8z3Pavb;fOzi zw=*HU4$^Dzg7lx>=oALvZxwoHpC)6ak9mcDXy@Q*;FB=of zHiB0eI@Ehucno}k7C_iEJv8M9MsGqG8UsWGe5sRvuzfoKg5A4zZ13vcwW|lmKQhEP z8qN^T10pvW@nSfm$B#{C7&ppRi&WJI6!jD$O!=q1FwJ-7bXWpM6b?+spwomQk3wlP 
zfM=eUz5!_<=Zu8pMv^`6d>zgn6V6DCh0jRvgCB~FiHO<5#iAiSq}Jjl`1@g(*V3aK}GItn9BG^32 zaG-4MSl!e5fw8oeL2pMP`Ai@hdop4E^U@eXE^f=3aNL-$qd0p}HEjU@O&2lNCmSy4 zK%P=JIX&cThm$sd0Es7`i%E5wUl&Rnzzox-S!NWdLy-PGCHfILtOGMWRuH|D->p_oDGpDBaTfP%LeMiQ>_5_I5ZV{SkdN zfdgTh!5Ct2GUgv(aICa`d@NlI%12qCoT5ZsBQ$}sT1-Gbtw2cuCpAv$F?#Y~Qp$PK z%ZMqF^ExRi-hlJZ(3t&_o+}X+bPF(;m-2Le?eV$Zcw^jm)9g+(zA$vdT=lNG>Xk1n z>b|$*MqT&2b=?Gaa;F^i^ZNJ52ZIcbMu44c_PPDcELRz{&XaM23{Qe_zk~A!6GxvJ zoY&p57sqvk5_sQ`<1H&cx9@#( z@lA_k)!ur;-gV911t0}LgkN29Lg9@Ixj2t1`MKHPC?Ob8#m}uqN5w}5la>Fdn6nly z7q{Ig_FgN-QRUJOvZ|IU{@iuTqwn*@@fOmh!+b*p?vi^AlBzH;(akgt++mVHnK-!+g>6qgb^0BCX)CyJ7$7mHWy^>H0)VlPfqU$@n)mXyZ# zzgJb8Jht%EN?li~${R0Ut?&F^bZN(TFRs)-v}~zawbd_FT(`9^4qkogdglR_;(anB z=S-L`KYqhrf6ZQ>oRuxX{n~@zPlN< z%F0oGMGbnStOh+&(SRN)Ye0{j%aj*WOmCPscb@&nQN0CwE;oWozqBTYy@ zm3|O$W=}dqXZG5iswGn?n2;&RtWJmvO@aL6#|=Oqi2Xa&hCM0;nXA-ghn;J`X6b__ ztE`~DtE`YMHJ=`_Gdd}?OlY?ZBd638Ph7l3>(B#zR|fWiE8U+>IesZ`UvVfvs8EZkz6 zmd&A()_>*Ma7*3~JyoBco=8nrKME~pjXJiVp>&OdM?f#;4ni{tE^VX*M&3iZ zTjFBs{E2vcMUfJbL%0rKa@tGVpK41pjVTN|(9g+qSp2nMNgCZ+b|Q97_p)ZJLN41NH6gf&N&^rJ57^+eLsWcCTr zzeUKxqF<0VbQ!8tXag0)%OX@x#_V@u)GYUh#+5Bv1`S4=Z*@mQWx%*I6ilZsu#G8c z80B51TA2}6zpVO*p~#She5zWAK>l!OeS-QjMh?%b6`>|lG@IGlD+(#E&<6|>Y_LHo z1tlE2%)|&%b!KWN)FnJt31;MP-K<~+cll=YMd&~TQ57GHAaq&&ox!vX7 zTOLwH?qFtpCr>%D!7Ha;f&H9;d{Yg5KOMv-IZD^G=!XsD{$`g?9$^>n@^7f3eAQyl z2FlOX#G%_Fr|-9X?59V^)sD?8(wD1}%iJZqdl{IAbWu^MU*3&brG4{n6mI0s?A=RU z+p>9AY?YSXXvx}3w)a~ywP{9Y`7h9>17qv+09B8*5$=;!T<6u0HD>O}&Fxj_vGeh& za?dK4!TadD9~xAZRm(q7!1zR7S$Q)gw;a;(kv2(DiT6B^s}pDYbNsNE`s=@YPeS{Y ztBF?0%IgL#tIXVz2P4ZEOT5o%oKan@GnEcQ;XH*?1DUf^tFRRE6=nS7!};~n;lgI2 zbZc-3^0j3xgQgWvU5nTq<0uZeRF>X9!jID^f@>{Ik5bDV|Cmy=%MSQI~$2K6|d z-|$TBpi;@wPyup%6u>xJ1=Jp&#%~m4fBT5%q5b&@BoOJ3$@zV9PLp#6&L9KFNpHj3 z;gD!+lhxTse?qT+N)GL{7!En_)|y0HE9nI|Y2(;rDAG~OG$pdmmmes|Dk|pcgpa009 z&uremxdXu5T?cef5hSGK*~Olv8El4kVQ>FHT$kuc?p%9bL-(SbQI}5igZLUrs-;aA+Ps zlhCl>`A*f9s^u*J45k_nzSE>CQ9QVT#d#|4a77+IPS>w+(IuNO5r%2{3s){IclW}2 
ze%qmUdQ$emRd>nlB9_L$((p=}r2aM2%ci6RGxdx6erx|5`%~`jxCyZ-L-pgg9LOS4%7R~Kld)%&Zg*$eaQ@g<~V*^ht3gX<*XHcc~y}} z*&)3|j-4FZF|!{#kxgW8q1P^QyyOu6UHUOO5poD;#dMf#!o8uO`M-$Jv}Ml}evoT2 zxL5iBucZHpj={NY(CKu4&UxPF_FUuk`~}zc7hKa{alL=RmHi#p^H0). + verbose : bool, optional + Print information along iterations. + log : bool, optional + Record log if True. + init_C : array-like, shape (N,N,d'), optional + Initialization for the barycenters' edge feature tensor. If not set + a random init is used. + init_F : array-like, shape (N,d), optional + Initialization for the barycenters' node features. If not set a + random init is used. + init_A : array-like, shape (N,N), optional + Initialization for the barycenters' structure matrix. If not set + a random init is used. + random_state : int or RandomState instance, optional + Fix the seed for reproducibility + + Returns + ------- + F : array-like, shape (`N`, `d`) + Barycenters' features + A : array-like, shape (`N`, `N`) + Barycenters' structure matrix + C : array-like, shape (`N`, `N`, `d'`) + Barycenters' edge feature tensor + log : dict + Only returned when log=True. 
It contains the keys: + + - :math:`\mathbf{T}`: list of (`N`, `ns`) transport matrices + - :math:`(\mathbf{M}_s)_s`: all distance matrices between the feature + of the barycenter and the other features + :math:`(dist(\mathbf{X}, \mathbf{Y}_s))_s` shape (`N`, `ns`) + + """ + Cs = list_to_array(*Cs) + As = list_to_array(*As) + ps = list_to_array( + *ps + ) # list to array bug when only one list has length one + Fs = list_to_array(*Fs) + if not isinstance(Cs, list): + Cs = [Cs] + if not isinstance(As, list): + As = [As] + if not isinstance(ps, list): + ps = [ps] + if not isinstance(Fs, list): + Fs = [Fs] + + p = list_to_array(p) + nx = get_backend(*Cs, *Fs, *As, *ps) + + S = len(Cs) + d = Fs[0].shape[1] # dimension on the node features + d_edge = Cs[0].shape[2] + if p is None: + p = nx.ones(N, type_as=Cs[0]) / N + + if fixed_edge_features: + if init_C is None: + raise UndefinedParameter("If C is fixed it must be initialized") + else: + C = init_C + else: + if init_C is None: + generator = check_random_state(random_state) + C = generator.randn(N, N, d_edge) + C = nx.from_numpy(C, type_as=ps[0]) + else: + C = init_C + + if fixed_structure: + if init_A is None: + raise UndefinedParameter("If A is fixed it must be initialized") + else: + A = init_A + else: + if init_A is None: + generator = check_random_state(random_state) + xalea = generator.randn(N, 2) + A = dist(xalea, xalea) + A = nx.from_numpy(A, type_as=ps[0]) + else: + A = init_A + + if fixed_node_features: + if init_F is None: + raise UndefinedParameter("If F is fixed it must be initialized") + else: + F = init_F + else: + if init_F is None: + F = nx.zeros((N, d), type_as=ps[0]) + else: + F = init_F + + T = [nx.outer(p, q) for q in ps] + + Ms = [dist(F, Fs[s]) for s in range(len(Fs))] + + cpt = 0 + err_node_feature = 1 + err_structure = 1 + err_edge_feature = 1 + + if log: + log_ = {} + log_["err_node_feature"] = [] + log_["err_edge_feature"] = [] + log_["err_structure"] = [] + log_["Ts_iter"] = [] + + while ( + 
err_node_feature > tol or err_structure > tol or err_edge_feature > tol + ) and cpt < max_iter: + Cprev = C + Aprev = A + Xprev = F + + if not fixed_node_features: + Fs_temp = [y.T for y in Fs] + F = update_node_feature_matrix(lambdas, Fs_temp, T, p).T + + Ms = [dist(F, Fs[s]) for s in range(len(Fs))] + + if not fixed_structure: + if dist_fun_A == "square_loss": + T_temp = [t.T for t in T] + A = update_structure_matrix(p, lambdas, T_temp, As) + + if not fixed_edge_features: + if dist_fun_C == "l2_norm": + T_temp = [t.T for t in T] + C = update_edge_feature_tensor(p, lambdas, T_temp, Cs) + + T = [ + fused_network_gromov_wasserstein2( + Ms[s], + C, + Cs[s], + A, + As[s], + p, + ps[s], + dist_fun_C, + dist_fun_A, + alpha, + beta, + numItermax=max_iter, + stopThr=1e-5, + verbose=verbose > 2, + log=True, + )[1]["T"] + for s in range(S) + ] + + # T is N,ns + err_node_feature = nx.norm(F - nx.reshape(Xprev, (N, d))) + err_structure = nx.norm(A - Aprev) + err_edge_feature = nx.norm(C - Cprev) + if log: + log_["err_node_feature"].append(err_node_feature) + log_["err_edge_feature"].append(err_edge_feature) + log_["err_structure"].append(err_structure) + log_["Ts_iter"].append(T) + + if verbose: + if cpt % 200 == 0: + print("{:5s}|{:12s}".format("It.", "Err") + "\n" + "-" * 19) + print("{:5d}|{:8e}|".format(cpt, err_structure)) + print("{:5d}|{:8e}|".format(cpt, err_node_feature)) + print("{:5d}|{:8e}|".format(cpt, err_edge_feature)) + print("\n") + + cpt += 1 + + if log: + log_["T"] = T # from target to Fs + log_["p"] = p + log_["Ms"] = Ms + + if log: + return F, A, C, log_ + else: + return F, A, C + + +def update_structure_matrix(p, lambdas, T, As): + r"""Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the + `S` :math:`\mathbf{T}_s` couplings. + It is calculated at each iteration + Parameters + ---------- + p : array-like, shape (N,) + Masses in the targeted barycenter. + lambdas : list of float + List of the `S` spaces' weights. 
+ T : list of S array-like of shape (ns, N) + The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration. + As : list of S array-like, shape (ns, ns) + Metric cost matrices. + Returns + ------- + A : array-like, shape (`nt`, `nt`) + Updated :math:`\mathbf{A}` matrix. + """ + p = list_to_array(p) + T = list_to_array(*T) + As = list_to_array(*As) + nx = get_backend(*As, *T, p) + + tmpsum = sum( + [ + lambdas[s] * nx.dot(nx.dot(T[s].T, As[s]), T[s]) + for s in range(len(T)) + ] + ) + ppt = nx.outer(p, p) + return tmpsum / ppt + + +def update_node_feature_matrix(lambdas, Fs, Ts, p): + r"""Updates the feature with respect to the `S` :math:`\mathbf{T}_s` + couplings. + Parameters + ---------- + p : array-like, shape (N,) + masses in the targeted barycenter + lambdas : list of float + List of the `S` spaces' weights + Ts : list of S array-like, shape (ns,N) + The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration + Fs : list of S array-like, shape (d,ns) + The features. + + Returns + ------- + F : array-like, shape (`d`, `N`) + """ + p = list_to_array(p) + Ts = list_to_array(*Ts) + Fs = list_to_array(*Fs) + if not isinstance(Ts, list): + Ts = [Ts] + if not isinstance(Fs, list): + Fs = [Fs] + nx = get_backend(*Fs, *Ts, p) + + p = 1.0 / p + tmpsum = sum( + [ + lambdas[s] * nx.dot(Fs[s], Ts[s].T) * p[None, :] + for s in range(len(Ts)) + ] + ) + return tmpsum + + +def update_edge_feature_tensor(p, lambdas, T, Cs): + r"""Updates :math:`\mathbf{C}` according to the l2 norm inner distance with + the `S` :math:`\mathbf{T}_s` couplings. + + It is calculated at each iteration + + Parameters + ---------- + p : array-like, shape (N,) + Masses in the targeted barycenter. + lambdas : list of float + List of the `S` spaces' weights. + T : list of S array-like of shape (ns,N) + The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration. 
+ Cs : list of S array-like, shape (ns,ns,d') + Edge features tensors + + Returns + ------- + C : array-like, shape (nt,nt,d') + Updated :math:`\mathbf{C}` tensor. + """ + p = list_to_array(p) + T = list_to_array(*T) + Cs = list_to_array(*Cs) + if not isinstance(T, list): + T = [T] + if not isinstance(Cs, list): + Cs = [Cs] + nx = get_backend(*Cs, *T, p) + + # Proposition 2.10 in our paper + tmpsum = sum( + [ + lambdas[s] + * nx.einsum( + "ijd,jk...->ikd", + nx.einsum("ij...,jkd->ikd", T[s].T, Cs[s]), + T[s], + ) + for s in range(len(T)) + ] + ) + ppt = nx.reshape(nx.outer(p, p), shape=(len(p), len(p), 1)) + return tmpsum / ppt + + +def fused_network_gromov_wasserstein2( + M, + C1, + C2, + A1, + A2, + p, + q, + dist_fun_C="l2_norm", + dist_fun_A="square_loss", + alpha=0.33, + beta=0.33, + armijo=False, + G0=None, + log=False, + **kwargs, +): + r""" + Computes the FNGW transport between two graphs + + See Algorithm 1 in our paper. + + Parameters + ---------- + M : array-like, shape (ns, nt) + Metric cost matrix between node features of source and target graphs + C1 : array-like, shape (ns, ns, d') + Edge feature tensor of the source graph + C2 : array-like, shape (nt, nt, d') + Edge feature tensor of the target graph + A1 : array-like, shape (ns, ns) + Structure matrix of the source graph + A2 : array-like, shape (nt, nt) + Structure matrix of the target graph + p : array-like, shape (ns,) + Distribution in the source space + q : array-like, shape (nt,) + Distribution in the target space + dist_fun_C : str, optional + Inner distance used for the edge feature tensor + dist_fun_A : str, optional + Loss function used for the structure matrix + alpha : float, optional + Trade-off parameter (0 < alpha < 1) + beta : float, optional + Trade-off parameter (0 < beta < 1) + armijo : bool, optional + If True the step of the line-search is found via an armijo research. + Else closed form is used. + If there are convergence issues use False. 
+ G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 must satisfy marginal constraints and will be used as + initial transport of the solver. + log : bool, optional + record log if True + **kwargs : dict + parameters can be directly passed to the ot.optim.cg solver + + Returns + ------- + fngw_dist : float + FNGW distance for the given parameters. + log : dict + Log dictionary return only if log==True in parameters. + """ + assert alpha + beta <= 1 + p, q = list_to_array(p, q) + p0, q0, C10, C20, A10, A20, M0 = p, q, C1, C2, A1, A2, M + if G0 is None: + nx = get_backend(p0, q0, C10, C20, A10, A20, M0) + else: + G0_ = G0 + nx = get_backend(p0, q0, C10, C20, A10, A20, M0, G0_) + + p = nx.to_numpy(p) + q = nx.to_numpy(q) + C1 = nx.to_numpy(C10) + C2 = nx.to_numpy(C20) + A1 = nx.to_numpy(A10) + A2 = nx.to_numpy(A20) + M = nx.to_numpy(M0) + + if G0 is None: + G0 = p[:, None] * q[None, :] + else: + G0 = nx.to_numpy(G0_) + # Check marginals of G0 + np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08) + np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08) + + constA, hA1, hA2 = init_matrix_A(A1, A2, p, q, dist_fun_A) + constC = init_matrix_C(C1, C2, p, q, dist_fun_C) + + def f(G): + return ngwloss(constC, C1, C2, G) + + def df(G): + return ngwgrad(constC, C1, C2, G) + + def g(G): + return gwloss(constA, hA1, hA2, G) + + def dg(G): + return gwggrad(constA, hA1, hA2, G) + + T, cg_log = cg( + p, + q, + (1 - alpha - beta) * M, + reg_f=alpha, + reg_g=beta, + f=f, + df=df, + g=g, + dg=dg, + G0=G0, + armijo=armijo, + C1=C1, + C2=C2, + A1=A1, + A2=A2, + constC=constC, + constA=constA, + log=True, + **kwargs, + ) + + fngw_dist = nx.from_numpy(cg_log["loss"][-1], type_as=C10) + T0 = nx.from_numpy(T, type_as=C10) + cg_log["fngw_dist"] = fngw_dist + cg_log["u"] = nx.from_numpy(cg_log["u"], type_as=C10) + cg_log["v"] = nx.from_numpy(cg_log["v"], type_as=C10) + cg_log["T"] = T0 + + # TODO: implement the gradient 
for p0, q0 + if dist_fun_C == "l2_norm" and dist_fun_A == "square_loss": + gC1 = 2 * C1 * (p[:, None] * p[None, :])[:, :, None] - 2 * np.einsum( + "ilt, kl->ikt", np.einsum("ij,jlt->ilt", T, C2), T + ) + gC2 = 2 * C2 * (q[:, None] * q[None, :])[:, :, None] - 2 * np.einsum( + "jkt, kl->jlt", np.einsum("ij,ikt->jkt", T, C1), T + ) + gC1 = nx.from_numpy(gC1, type_as=C10) + gC2 = nx.from_numpy(gC2, type_as=C10) + + gA1 = 2 * A1 * (p[:, None] * p[None, :]) - 2 * T.dot(A2).dot(T.T) + gA2 = 2 * A2 * (q[:, None] * q[None, :]) - 2 * T.T.dot(A1).dot(T) + gA1 = nx.from_numpy(gA1, type_as=A10) + gA2 = nx.from_numpy(gA2, type_as=A10) + + fngw_dist = nx.set_gradients( + fngw_dist, + (p0, q0, C10, C20, A10, A20, M0), + ( + cg_log["u"] + - nx.mean( + cg_log["u"] + ), # No need for p0, q0 since they will not be updated, + # keeps it right now + cg_log["v"] - nx.mean(cg_log["v"]), + alpha * gC1, + alpha * gC2, + beta * gA1, + beta * gA2, + (1 - alpha - beta) * T0, + ), + ) + if log: + return fngw_dist, cg_log + else: + return fngw_dist + + +def init_matrix_C(C1, C2, p, q, dist="l2_norm"): + r"""Computation of the sum of the first two terms of Equation (6) in our + paper. 
+ + Parameters + ---------- + C1 : array-like, shape (ns, ns, d') + Edge feature tensor of the source graph + C2 : array-like, shape (nt, nt, d') + Edge feature tensor of the target graph + T : array-like, shape (ns, nt) + Coupling between source and target spaces + p : array-like, shape (ns,) + + Returns + ------- + constC : array-like, shape (ns, nt) + + """ + C1, C2, p, q = list_to_array(C1, C2, p, q) + nx = get_backend(C1, C2, p, q) + + if dist == "l2_norm": + + def f1(a): + return nx.sum(nx.power(a, 2), axis=-1) + + def f2(b): + return nx.sum(nx.power(b, 2), axis=-1) + + else: + raise ValueError + + constC1 = nx.dot( + nx.dot(f1(C1), nx.reshape(p, (-1, 1))), nx.ones((1, len(q)), type_as=q) + ) + constC2 = nx.dot( + nx.ones((len(p), 1), type_as=p), + nx.dot(nx.reshape(q, (1, -1)), f2(C2).T), + ) + constC = constC1 + constC2 + + return constC + + +def tensor_product(constC, C1, C2, T): + r"""Implementation of the Prop. 2.5 in our paper. + + Parameters + ---------- + constC : array-like, shape (ns, nt) + the sum of the first two terms of Eq. (6) + C1 : array-like, shape (ns, ns, d') + Edge feature tensor of the source graph + C2 : array-like, shape (nt, nt, d') + Edge feature tensor of the target graph + + T : array-like, shape (ns, nt) + + Returns + ------- + tens : array-like, shape (ns, nt) + + """ + constC, C1, C2, T = list_to_array(constC, C1, C2, T) + nx = get_backend(constC, C1, C2, T) + + A = -2 * nx.einsum( + "ijd, kjd->ikd", nx.einsum("ijd,jk...->ikd", C1, T), C2 + ) # (ns, nt, d) + + A = nx.sum(A, axis=-1) # (ns, nt) + tens = constC + A + # tens -= tens.min() + return tens + + +def ngwloss(constC, C1, C2, T): + r"""Compute the third term of Eq.5 in our paper + + Parameters + ---------- + constC : array-like, shape (ns, nt) + the sum of the first two terms of Eq. 
(6) + C1 : array-like, shape (ns, ns, d') + Edge feature tensor of the source graph + C2 : array-like, shape (nt, nt, d') + Edge feature tensor of the target graph + T : array-like, shape (ns, nt) + Current value of transport matrix :math:`\mathbf{T}` + Current value of transport matrix :math:`\mathbf{T}` + + Returns + ------- + loss : float + + """ + + tens = tensor_product(constC, C1, C2, T) + + tens, T = list_to_array(tens, T) + nx = get_backend(tens, T) + + return nx.sum(tens * T) + + +def ngwgrad(constC, C1, C2, T): + r"""Compute the third term of Eq.7 in our paper + + Parameters + ---------- + constC : array-like, shape (ns, nt) + the sum of the first two terms of Eq. (6) + C1 : array-like, shape (ns, ns, d') + Edge feature tensor of the source graph + C2 : array-like, shape (nt, nt, d') + Edge feature tensor of the target graph + T : array-like, shape (ns, nt) + Current value of transport matrix :math:`\mathbf{T}` + + Returns + ------- + grad : array-like, shape (`ns`, `nt`) + + """ + return 2 * tensor_product(constC, C1, C2, T) + + +def cg( + a, + b, + M, + reg_f, + reg_g, + f, + df, + g, + dg, + G0=None, + numItermax=200, + numItermaxEmd=100000, + stopThr=1e-9, + stopThr2=1e-9, + verbose=False, + log=False, + **kwargs, +): + r""" + Solve the general regularized OT problem with conditional gradient + + The function solves the following optimization problem: + + .. math:: + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} + \rangle_F + \mathrm{reg} \cdot f(\gamma) + + s.t. 
\ \gamma \mathbf{1} &= \mathbf{a} + + \gamma^T \mathbf{1} &= \mathbf{b} + + \gamma &\geq 0 + where : + + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`f` is the regularization term (and `df` is its gradient) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights + (sum to 1) + + The algorithm used for solving the problem is conditional gradient as + discussed in :ref:`[1] ` + + + Parameters + ---------- + a : array-like, shape (ns,) + samples weights in the source domain + b : array-like, shape (nt,) + samples in the target domain + M : array-like, shape (ns, nt) + loss matrix + reg_f : float + Regularization term >0 + reg_g : float + Regularization term >0 + G0 : array-like, shape (ns,nt), optional + initial guess (default is indep joint density) + numItermax : int, optional + Max number of iterations + numItermaxEmd : int, optional + Max number of iterations for emd + stopThr : float, optional + Stop threshold on the relative variation (>0) + stopThr2 : float, optional + Stop threshold on the absolute variation (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + **kwargs : dict + Parameters for linesearch + + Returns + ------- + gamma : (ns x nt) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + + .. _references-cg: + References + ---------- + + .. [1] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, + 7(3), 1853-1882. 
+ + See Also + -------- + ot.lp.emd : Unregularized optimal ransport + ot.bregman.sinkhorn : Entropic regularized optimal transport + + """ + a, b, M, G0 = list_to_array(a, b, M, G0) + if isinstance(M, int) or isinstance(M, float): + nx = get_backend(a, b) + else: + nx = get_backend(a, b, M) + + loop = 1 + + if log: + log = {"loss": []} + + if G0 is None: + G = nx.outer(a, b) + else: + G = G0 + + def cost(G): + return nx.sum(M * G) + reg_f * f(G) + reg_g * g(G) + + f_val = cost(G) + if log: + log["loss"].append(f_val) + + it = 0 + + if verbose: + print( + "{:5s}|{:12s}|{:8s}|{:8s}".format( + "It.", "Loss", "Relative loss", "Absolute loss" + ) + + "\n" + + "-" * 48 + ) + print("{:5d}|{:8e}|{:8e}|{:8e}".format(it, f_val, 0, 0)) + + while loop: + + it += 1 + old_fval = f_val + + # problem linearization + Mi = M + reg_f * df(G) + reg_g * dg(G) + # set M positive + Mi += nx.min(Mi) + + # solve linear program + Gc, logemd = emd(a, b, Mi, numItermax=numItermaxEmd, log=True) + + deltaG = Gc - G + + # line search + alpha, fc, f_val = solve_linesearch( + cost, + G, + deltaG, + Mi, + f_val, + reg_f=reg_f, + reg_g=reg_g, + M=M, + Gc=Gc, + alpha_min=0.0, + alpha_max=1.0, + **kwargs, + ) + + G = G + alpha * deltaG + + # test convergence + if it >= numItermax: + loop = 0 + + abs_delta_fval = abs(f_val - old_fval) + relative_delta_fval = abs_delta_fval / abs(f_val) + if relative_delta_fval < stopThr or abs_delta_fval < stopThr2: + loop = 0 + + if log: + log["loss"].append(f_val) + + if verbose: + if it % 20 == 0: + print( + "{:5s}|{:12s}|{:8s}|{:8s}".format( + "It.", "Loss", "Relative loss", "Absolute loss" + ) + + "\n" + + "-" * 48 + ) + print( + "{:5d}|{:8e}|{:8e}|{:8e}".format( + it, f_val, relative_delta_fval, abs_delta_fval + ) + ) + + if log: + log.update(logemd) + return G, log + else: + return G + + +def solve_linesearch( + cost, + G, + deltaG, + Mi, + val, + armijo=True, + C1=None, + C2=None, + reg_f=None, + A1=None, + A2=None, + reg_g=None, + Gc=None, + constC=None, + 
constA=None, + M=None, + alpha_min=None, + alpha_max=None, +): + """ + Solve the linesearch in the FNGW iterations + + Parameters + ---------- + cost : method + Cost in the FNGW for the linesearch + G : array-like, shape(ns,nt) + The transport map at a given iteration of the FNGW + deltaG : array-like (ns,nt) + Difference between the optimal map found by linearization in the FW + algorithm and the value at a given iteration + Mi : array-like (ns,nt) + Cost matrix of the linearized transport problem. Corresponds to the + gradient of the cost + val : float + Value of the cost at `G` + armijo : bool, optional + If True the steps of the line-search is found via an armijo research. + Else closed form is used. + If there is convergence issues use False. + C1 : array-like (ns,ns,d'), optional + Edge feature tensor of the source graph. Only used and necessary when + armijo=False + C2 : array-like (nt,nt,d'), optional + Edge feature tensor in the target graph. Only used and necessary when + armijo=False + reg_f : float, optional + Regularization parameter. Only used and necessary when armijo=False + A1 : array-like (ns,ns), optional + Structure matrix of the source graph. Only used and necessary when + armijo=False + A2 : array-like (nt,nt), optional + Structure matrix of the target graph. Only used and necessary when + armijo=False + reg_g : float, optional + Regularization parameter. Only used and necessary when armijo=False + Gc : array-like (ns,nt) + Optimal map found by linearization in the FW algorithm. Only used and + necessary when armijo=False + constC : array-like (ns,nt) + Constant for the gromov cost. Only used and necessary when armijo=False + See :ref:`[24] `. + + M : array-like (ns,nt), optional + Cost matrix between the features. 
Only used and necessary when + armijo=False + alpha_min : float, optional + Minimum value for alpha + alpha_max : float, optional + Maximum value for alpha + + Returns + ------- + alpha : float + The optimal step size of the FW + fc : int + nb of function call. Useless here + f_val : float + The value of the cost for the next iteration + + """ + if armijo: + # TODO: Update for armijo + alpha, fc, f_val = line_search_armijo( + cost, G, deltaG, Mi, val, alpha_min=alpha_min, alpha_max=alpha_max + ) + else: + G, deltaG, C1, C2, constC, A1, A2, constA, M = list_to_array( + G, deltaG, C1, C2, constC, A1, A2, constA, M + ) + if isinstance(M, int) or isinstance(M, float): + nx = get_backend(G, deltaG, C1, C2, constC, A1, A2, constA) + else: + nx = get_backend(G, deltaG, C1, C2, constC, A1, A2, constA, M) + + dotC_1 = nx.sum( + nx.einsum( + "ijd, kjd->ikd", nx.einsum("ijd,jk...->ikd", C1, deltaG), C2 + ), + axis=-1, + ) + dotC_2 = nx.sum( + nx.einsum("ijd, kjd->ikd", nx.einsum("ijd,jk...->ikd", C1, G), C2), + axis=-1, + ) + + dotA_1 = nx.dot(nx.dot(A1, deltaG), A2.T) + dotA_2 = nx.dot(nx.dot(A1, G), A2.T) + + a = -2 * reg_f * nx.sum(dotC_1 * deltaG) - 2 * reg_g * nx.sum( + dotA_1 * deltaG + ) + + b = ( + nx.sum((M + reg_f * constC + reg_g * constA) * deltaG) + - 2 * reg_f * (nx.sum(dotC_1 * G) + nx.sum(dotC_2 * deltaG)) + - 2 * reg_g * (nx.sum(dotA_1 * G) + nx.sum(dotA_2 * deltaG)) + ) + + # c = cost(G) + # c was pased to solve_linesearch as c, which does not exist + alpha = solve_1d_linesearch_quad(a, b) + if alpha_min is not None or alpha_max is not None: + alpha = np.clip(alpha, alpha_min, alpha_max) + fc = None + f_val = cost(G + alpha * deltaG) + + return alpha, fc, f_val diff --git a/ext/PythonOptimalTransport/srGW.jl b/ext/PythonOptimalTransport/srGW.jl new file mode 100644 index 0000000..394fd9d --- /dev/null +++ b/ext/PythonOptimalTransport/srGW.jl @@ -0,0 +1,508 @@ +# Julia implementation of semi-relaxed Gromov-Wasserstein algorithms +# Converted from Python 
implementation by cvincentcuaz +# This module provides conditional gradient, mirror descent, and MM algorithms +# for semi-relaxed (fused) Gromov-Wasserstein optimal transport + +using LinearAlgebra +using Random + +# ============================================================================= +# Utility Functions +# ============================================================================= + +# Initialize transport plan for semi-relaxed GW +# Arguments: +# init_mode: "product", "random", or "random_product" +# p: source distribution (N1,) +# N1, N2: dimensions +# seed: random seed (nothing for no seeding) +# Returns: T - initial transport plan (N1, N2) +function initializer_semirelaxed_GW( + init_mode::String, p::AbstractVector{T}, N1::Int, N2::Int; + seed::Union{Int, Nothing} = 0) where {T <: Real} + if init_mode == "product" + q = ones(T, N2) / N2 + T_plan = p * q' + elseif init_mode == "random" + if !isnothing(seed) + Random.seed!(seed) + end + T_plan = rand(T, N1, N2) + # Scale to satisfy first marginal constraint + scale = p ./ sum(T_plan, dims = 2) + T_plan .*= scale + elseif init_mode == "random_product" + if !isnothing(seed) + Random.seed!(seed) + end + q = rand(T, N2) + q ./= sum(q) + T_plan = p * q' + else + error("Unknown init mode: $init_mode") + end + return T_plan +end + +# Initialize matrices for symmetric GW computation +function init_matrix_GW2(C1::AbstractMatrix{T}, C2::AbstractMatrix{T}, + p::AbstractVector{T}, q::AbstractVector{T}, + ones_p::AbstractVector{T}, ones_q::AbstractVector{T}) where {T <: Real} + f1_ = C1 .^ 2 + f2_ = C2 .^ 2 + constC1 = f1_ * (p * ones_q') + constC2 = (ones_p * q') * f2_ + constC = constC1 + constC2 + hC1 = C1 + hC2 = 2 * C2 + return constC, hC1, hC2 +end + +# Initialize matrices for asymmetric GW computation +function init_matrix_asymGW2(C1::AbstractMatrix{T}, C2::AbstractMatrix{T}, + p::AbstractVector{T}, q::AbstractVector{T}, + ones_p::AbstractVector{T}, ones_q::AbstractVector{T}) where {T <: Real} + f1_ = (C1 
.^ 2) / 2.0 + f2_ = (C2 .^ 2) / 2.0 + constC1 = f1_ * (p * ones_q') + constC2 = (ones_p * q') * f2_' + constC = constC1 + constC2 + hC1 = C1 + hC2 = C2 + return constC, hC1, hC2 +end + +# Compute tensor product for GW distance +function tensor_product(constC::AbstractMatrix{T}, hC1::AbstractMatrix{T}, + hC2::AbstractMatrix{T}, T_plan::AbstractMatrix{T}) where {T <: Real} + A = -hC1 * T_plan * hC2' + return constC + A +end + +# ============================================================================= +# Conditional Gradient Descent Algorithms +# ============================================================================= + +# Conditional gradient algorithm for semi-relaxed (fused) Gromov-Wasserstein +# Solves: min_T α * ⟨L(C₁, C₂) ⊗ T, T⟩ + ⟨M, T⟩ +function cg_semirelaxed(C1::AbstractMatrix{T}, p::AbstractVector{T}, C2::AbstractMatrix{T}; + alpha::Real = 1.0, linear_cost::Union{Nothing, AbstractMatrix{T}} = nothing, + init_mode::String = "product", T_init::Union{Nothing, AbstractMatrix{T}} = nothing, + symmetry::Bool = true, use_log::Bool = false, eps::Real = 1e-5, + max_iter::Int = 1000, seed::Int = 0, verbose::Bool = false) where {T <: Real} + N1, N2 = size(C1, 1), size(C2, 1) + + # Initialize transport plan + if isnothing(T_init) + T_plan = initializer_semirelaxed_GW(init_mode, p, N1, N2; seed = seed) + else + @assert size(T_init) == (N1, N2) + T_plan = copy(T_init) + end + + # Check symmetry + if isnothing(symmetry) + symmetry = (C1 == C1') && (C2 == C2') + end + + # Initialize + q = vec(sum(T_plan, dims = 1)) + ones_p = ones(T, N1) + ones_q = ones(T, N2) + + # Compute initial gradient + if symmetry + constC, hC1, hC2 = init_matrix_GW2(C1, C2, p, q, ones_p, ones_q) + G = 2 * tensor_product(constC, hC1, hC2, T_plan) + else + constC, hC1, hC2 = init_matrix_asymGW2(C1, C2, p, q, ones_p, ones_q) + constCt, hC1t, hC2t = init_matrix_asymGW2(C1', C2', p, q, ones_p, ones_q) + subG = tensor_product(constC, hC1, hC2, T_plan) + subGt = tensor_product(constCt, hC1t, 
hC2t, T_plan) + G = subG + subGt + end + G .*= alpha + + srgw_loss = 0.5 * sum(G .* T_plan) + add_linear_cost = !isnothing(linear_cost) + + if add_linear_cost + linear_loss = sum(linear_cost .* T_plan) + current_loss = srgw_loss + linear_loss + G .+= linear_cost + else + current_loss = srgw_loss + end + + log = use_log ? Dict("loss" => [current_loss]) : nothing + convergence_criterion = Inf + outer_count = 0 + + while convergence_criterion > eps && outer_count < max_iter + previous_loss = current_loss + + # Direction finding by solving subproblem on rows + min_vals = minimum(G, dims = 2) + X = (G .== min_vals) .* T(1.0) + row_sums = vec(sum(X, dims = 2)) + scale = p ./ row_sums + X .*= scale + + # Exact line search + qX = vec(sum(X, dims = 1)) + + if symmetry + constCX, hC1X, hC2X = init_matrix_GW2(C1, C2, p, qX, ones_p, ones_q) + GX = 2 * alpha * tensor_product(constCX, hC1X, hC2X, X) + GXX = 0.5 * sum(GX .* X) + GXT = 0.5 * sum(GX .* T_plan) + + a = srgw_loss + GXX - 2 * GXT + b = 2 * (GXT - srgw_loss) + else + constCX, hC1X, hC2X = init_matrix_asymGW2(C1, C2, p, qX, ones_p, ones_q) + constCXt, hC1Xt, hC2Xt = init_matrix_asymGW2(C1', C2', p, qX, ones_p, ones_q) + subGX = tensor_product(constCX, hC1X, hC2X, X) + subGXt = tensor_product(constCXt, hC1Xt, hC2Xt, X) + GX = alpha * (subGX + subGXt) + GXX = 0.5 * sum(GX .* X) + subGXt_dotT = sum(subGXt .* T_plan) + subGTt_dotX = sum(subGt .* X) + + a = srgw_loss + GXX - subGXt_dotT - subGTt_dotX + b = -2 * srgw_loss + subGXt_dotT + subGTt_dotX + end + + if add_linear_cost + linear_loss_X = sum(linear_cost .* X) + b += linear_loss_X - linear_loss + end + + # Compute step size + if a > 0 + gamma = min(1, max(0, -b / (2 * a))) + elseif a + b < 0 + gamma = 1 + else + gamma = 0 + end + + # Update + T_plan .= (1 - gamma) * T_plan + gamma * X + current_loss += a * gamma^2 + b * gamma + + if add_linear_cost + linear_loss = (1 - gamma) * linear_loss + gamma * linear_loss_X + srgw_loss = current_loss - linear_loss + G .= (1 - 
gamma) * G + gamma * (GX + linear_cost) + else + srgw_loss = current_loss + G .= (1 - gamma) * G + gamma * GX + end + + outer_count += 1 + use_log && push!(log["loss"], current_loss) + + convergence_criterion = abs(previous_loss - current_loss) / + (abs(previous_loss) + 1e-15) + end + + return use_log ? (T_plan, current_loss, log) : (T_plan, current_loss) +end + +# Conditional gradient for semi-relaxed Gromov-Wasserstein +# Wrapper for cg_semirelaxed with α=1 and no linear cost +function cg_semirelaxed_gromov_wasserstein(C1::AbstractMatrix{T}, p::AbstractVector{T}, + C2::AbstractMatrix{T}; kwargs...) where {T <: Real} + return cg_semirelaxed(C1, p, C2; alpha = 1.0, linear_cost = nothing, kwargs...) +end + +# Conditional gradient for semi-relaxed fused Gromov-Wasserstein +# A1, A2: Feature matrices (N1×d), (N2×d) +# alpha: Trade-off parameter (0 for pure OT, 1 for pure GW) +function cg_semirelaxed_fused_gromov_wasserstein( + C1::AbstractMatrix{T}, A1::AbstractMatrix{T}, + p::AbstractVector{T}, C2::AbstractMatrix{T}, + A2::AbstractMatrix{T}, alpha::Real; + kwargs...) where {T <: Real} + N1, N2 = size(A1, 1), size(A2, 1) + d = size(A1, 2) + + # Compute Euclidean distance matrix between features + A1_sq = sum(A1 .^ 2, dims = 2) * ones(T, 1, N2) + A2_sq = ones(T, N1, 1) * sum(A2 .^ 2, dims = 2)' + D = A1_sq + A2_sq - 2 * A1 * A2' + + return cg_semirelaxed( + C1, p, C2; alpha = alpha, linear_cost = (1 - alpha) * D, kwargs...) 
+end + +# ============================================================================= +# Mirror Descent Algorithms (Entropic Regularization) +# ============================================================================= + +# Mirror descent algorithm using KL geometry for semi-relaxed (fused) GW +# gamma_entropy: Entropy regularization parameter (must be > 0) +function md_semirelaxed(C1::AbstractMatrix{T}, p::AbstractVector{T}, C2::AbstractMatrix{T}, + gamma_entropy::Real; alpha::Real = 1.0, + linear_cost::Union{Nothing, AbstractMatrix{T}} = nothing, + init_mode::String = "product", T_init::Union{Nothing, AbstractMatrix{T}} = nothing, + symmetry::Bool = true, use_log::Bool = false, eps::Real = 1e-5, + max_iter::Int = 1000, seed::Int = 0, verbose::Bool = false) where {T <: Real} + @assert gamma_entropy>0 "gamma_entropy must be positive" + + N1, N2 = size(C1, 1), size(C2, 1) + + # Initialize transport plan + if isnothing(T_init) + T_plan = initializer_semirelaxed_GW(init_mode, p, N1, N2; seed = seed) + else + @assert size(T_init) == (N1, N2) + T_plan = copy(T_init) + end + + # Check symmetry + if isnothing(symmetry) + symmetry = (C1 == C1') && (C2 == C2') + end + + # Initialize + q = vec(sum(T_plan, dims = 1)) + ones_p = ones(T, N1) + ones_q = ones(T, N2) + + # Compute initial gradient + if symmetry + constC, hC1, hC2 = init_matrix_GW2(C1, C2, p, q, ones_p, ones_q) + G = 2 * alpha * tensor_product(constC, hC1, hC2, T_plan) + else + constC, hC1, hC2 = init_matrix_asymGW2(C1, C2, p, q, ones_p, ones_q) + constCt, hC1t, hC2t = init_matrix_asymGW2(C1', C2', p, q, ones_p, ones_q) + subG = tensor_product(constC, hC1, hC2, T_plan) + subGt = tensor_product(constCt, hC1t, hC2t, T_plan) + G = alpha * (subG + subGt) + end + + current_loss = 0.5 * sum(G .* T_plan) + add_linear_cost = !isnothing(linear_cost) + + if add_linear_cost + linear_loss = sum(linear_cost .* T_plan) + current_loss += linear_loss + G .+= linear_cost + end + + log = use_log ? 
Dict("loss" => [current_loss]) : nothing + convergence_criterion = Inf + outer_count = 0 + + while convergence_criterion > eps && outer_count < max_iter + previous_loss = current_loss + + # Compute Bregman projection + M = G - gamma_entropy * Base.log.(T_plan) + K = Base.exp.(-M / gamma_entropy) + scaling = p ./ vec(sum(K, dims = 2)) + T_plan .= (scaling .* ones(T, 1, N2)) .* K + + q = vec(sum(T_plan, dims = 1)) + + # Update gradient + if symmetry + constC, hC1, hC2 = init_matrix_GW2(C1, C2, p, q, ones_p, ones_q) + G = 2 * alpha * tensor_product(constC, hC1, hC2, T_plan) + else + constC, hC1, hC2 = init_matrix_asymGW2(C1, C2, p, q, ones_p, ones_q) + constCt, hC1t, hC2t = init_matrix_asymGW2(C1', C2', p, q, ones_p, ones_q) + subG = tensor_product(constC, hC1, hC2, T_plan) + subGt = tensor_product(constCt, hC1t, hC2t, T_plan) + G = alpha * (subG + subGt) + end + + current_loss = 0.5 * sum(G .* T_plan) + + if add_linear_cost + linear_loss = sum(linear_cost .* T_plan) + current_loss += linear_loss + G .+= linear_cost + end + + outer_count += 1 + use_log && push!(log["loss"], current_loss) + + convergence_criterion = abs(previous_loss - current_loss) / + (abs(previous_loss) + 1e-15) + end + + return use_log ? (T_plan, current_loss, log) : (T_plan, current_loss) +end + +# Mirror descent for semi-relaxed Gromov-Wasserstein with entropic regularization +function md_semirelaxed_gromov_wasserstein(C1::AbstractMatrix{T}, p::AbstractVector{T}, + C2::AbstractMatrix{T}, gamma_entropy::Real; + kwargs...) where {T <: Real} + return md_semirelaxed( + C1, p, C2, gamma_entropy; alpha = 1.0, linear_cost = nothing, kwargs...) +end + +# Mirror descent for semi-relaxed fused Gromov-Wasserstein with entropic regularization +function md_semirelaxed_fused_gromov_wasserstein( + C1::AbstractMatrix{T}, A1::AbstractMatrix{T}, + p::AbstractVector{T}, C2::AbstractMatrix{T}, + A2::AbstractMatrix{T}, gamma_entropy::Real, + alpha::Real; kwargs...) 
where {T <: Real} + N1, N2 = size(A1, 1), size(A2, 1) + d = size(A1, 2) + + # Compute Euclidean distance matrix + A1_sq = sum(A1 .^ 2, dims = 2) * ones(T, 1, N2) + A2_sq = ones(T, N1, 1) * sum(A2 .^ 2, dims = 2)' + D = A1_sq + A2_sq - 2 * A1 * A2' + + return md_semirelaxed(C1, p, C2, gamma_entropy; alpha = alpha, + linear_cost = (1 - alpha) * D, kwargs...) +end + +# ============================================================================= +# Majorization-Minimization Algorithms with Sparsity Regularization +# ============================================================================= + +# MM algorithm with ℓₚ-ℓ₁ sparsity regularization for semi-relaxed (fused) GW +# Solves: min_T α⟨L(C₁,C₂)⊗T,T⟩ + ⟨M,T⟩ + λ∑ⱼ(∑ᵢTᵢⱼ)^p +function mm_lpl1_semirelaxed( + C1::AbstractMatrix{T}, p::AbstractVector{T}, C2::AbstractMatrix{T}, + gamma_entropy::Real; alpha::Real = 1.0, + linear_cost::Union{Nothing, AbstractMatrix{T}} = nothing, + T_init::Union{Nothing, AbstractMatrix{T}} = nothing, + init_mode::String = "product", symmetry::Bool = true, + p_reg::Real = 0.5, lambda_reg::Real = 0.001, + use_log::Bool = false, use_warmstart::Bool = false, + eps_inner::Real = 1e-6, eps_outer::Real = 1e-6, + max_iter_inner::Int = 1000, max_iter_outer::Int = 50, + seed::Int = 0, verbose::Bool = false, + inner_log::Bool = false) where {T <: Real} + @assert 0=0 "gamma_entropy must be non-negative" + + N1, N2 = size(C1, 1), size(C2, 1) + + # Initialize + if isnothing(T_init) + T_plan = initializer_semirelaxed_GW(init_mode, p, N1, N2; seed = seed) + T_init_warm = use_warmstart ? 
copy(T_plan) : nothing + else + @assert size(T_init) == (N1, N2) + T_plan = copy(T_init) + T_init_warm = nothing + end + + # Inner solver selection + if gamma_entropy == 0 + inner_solver = (total_linear_cost, T_init_local) -> cg_semirelaxed( + C1, p, C2; alpha = alpha, linear_cost = total_linear_cost, + init_mode = init_mode, T_init = T_init_local, symmetry = symmetry, + use_log = inner_log, eps = eps_inner, max_iter = max_iter_inner, + seed = seed, verbose = verbose + ) + else + inner_solver = (total_linear_cost, T_init_local) -> md_semirelaxed( + C1, p, C2, gamma_entropy; alpha = alpha, linear_cost = total_linear_cost, + init_mode = init_mode, T_init = T_init_local, symmetry = symmetry, + use_log = inner_log, eps = eps_inner, max_iter = max_iter_inner, + seed = seed, verbose = verbose + ) + end + + reg_linear_cost = zeros(T, N1, N2) + total_linear_cost = isnothing(linear_cost) ? nothing : copy(linear_cost) + + best_T = copy(T_plan) + ones_p = ones(T, N1, 1) + + log = use_log ? Dict("loss" => T[], "inner_loss" => []) : nothing + best_loss = T(Inf) + current_loss = T(1e15) + convergence_criterion = Inf + outer_count = 0 + + while convergence_criterion > eps_outer && outer_count < max_iter_outer + previous_loss = current_loss + + # Solve generalized problem + result = inner_solver(total_linear_cost, use_warmstart ? 
T_init_warm : nothing) + + if inner_log + T_plan, majorization_loss, inner_log_data = result + else + T_plan, majorization_loss = result + end + + # Compute linearized reg loss + linearized_reg_loss = sum(reg_linear_cost .* T_plan) + + if use_warmstart + T_init_warm = copy(T_plan) + end + + # Update regularization + q = vec(sum(T_plan, dims = 1)) + reg_loss = lambda_reg * sum((q .+ 1e-15) .^ p_reg) + current_loss = majorization_loss - linearized_reg_loss + reg_loss + + reg_linear_cost .= lambda_reg * p_reg * ((ones_p * q') .+ 1e-15) .^ (p_reg - 1.0) + + if isnothing(linear_cost) + total_linear_cost = reg_linear_cost + else + total_linear_cost = reg_linear_cost + linear_cost + end + + if verbose + println("Outer iter $outer_count: loss = $current_loss, q = $q") + end + + outer_count += 1 + + if use_log + push!(log["loss"], current_loss) + inner_log && push!(log["inner_loss"], inner_log_data) + end + + convergence_criterion = abs(previous_loss - current_loss) / + (abs(previous_loss) + 1e-15) + + if current_loss < best_loss + best_loss = current_loss + best_T = copy(T_plan) + end + end + + return use_log ? (best_T, best_loss, log) : (best_T, best_loss) +end + +# MM algorithm with sparsity for semi-relaxed Gromov-Wasserstein +function mm_lpl1_semirelaxed_gromov_wasserstein( + C1::AbstractMatrix{T}, p::AbstractVector{T}, + C2::AbstractMatrix{T}, gamma_entropy::Real; + kwargs...) where {T <: Real} + return mm_lpl1_semirelaxed(C1, p, C2, gamma_entropy; alpha = 1.0, + linear_cost = nothing, kwargs...) +end + +# MM algorithm with sparsity for semi-relaxed fused Gromov-Wasserstein +function mm_lpl1_semirelaxed_fused_gromov_wasserstein( + C1::AbstractMatrix{T}, A1::AbstractMatrix{T}, + p::AbstractVector{T}, C2::AbstractMatrix{T}, + A2::AbstractMatrix{T}, alpha::Real, + gamma_entropy::Real; kwargs...) 
where {T <: Real} + N1, N2 = size(A1, 1), size(A2, 1) + d = size(A1, 2) + + # Compute Euclidean distance matrix + A1_sq = sum(A1 .^ 2, dims = 2) * ones(T, 1, N2) + A2_sq = ones(T, N1, 1) * sum(A2 .^ 2, dims = 2)' + D = A1_sq + A2_sq - 2 * A1 * A2' + + return mm_lpl1_semirelaxed(C1, p, C2, gamma_entropy; alpha = alpha, + linear_cost = (1 - alpha) * D, kwargs...) +end diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index 3339fd7..d489b7f 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -21,7 +21,6 @@ import Graphons: _extract_param, convert_to_params, node_labels_to_latents include("SymArray.jl") @reexport using .FastSymArray -include("utils/align_partition.jl") include("distributions/hist_dist.jl") include("preprocessor/abstractConvertor.jl") include("config_rules/include.jl") @@ -31,6 +30,6 @@ include("utils/utils_node_labels.jl") include("api.jl") export GreedyParams, nethist, nethist_discrete_edges, ordered_start_labels, RandomGroupSwap, - Strict, PreviousBestValue, nethist_binary_edges, align_partitions + Strict, PreviousBestValue, nethist_binary_edges end diff --git a/src/utils/align_partition.jl b/src/utils/align_partition.jl deleted file mode 100644 index 79f6b20..0000000 --- a/src/utils/align_partition.jl +++ /dev/null @@ -1,75 +0,0 @@ - -""" - align_partitions(x::AbstractVector{<:Integer}, y::AbstractVector{<:Integer}) - -Align labels of partition `y` to match partition `x` using optimal matching. - -Returns `(y_aligned, mapping)` where: -- `y_aligned`: Vector with same length as `y`, with labels relabeled to match `x` -- `mapping`: Dictionary mapping original `y` labels to aligned `x` labels - -The alignment maximizes the overlap between partitions using the Hungarian algorithm. -Unmatched labels from `y` are assigned to unused labels from `x`. 
- -# Arguments -- `x::AbstractVector{<:Integer}`: Reference partition labels -- `y::AbstractVector{<:Integer}`: Partition labels to align - -# Examples -```julia -x = [1, 1, 2, 2, 3] -y = [2, 2, 1, 1, 3] -y_aligned, mapping = align_partitions(x, y) -# y_aligned == [1, 1, 2, 2, 3] -# mapping == Dict(2 => 1, 1 => 2, 3 => 3) -``` -""" -function align_partitions(x::AbstractVector{<:Integer}, y::AbstractVector{<:Integer}) - @argcheck length(x)==length(y) "Partitions must have same length" - - # Get unique labels and create index mappings - xlabs = sort!(unique(x)) - ylabs = sort!(unique(y)) - Kx, Ky = length(xlabs), length(ylabs) - - # Build contingency matrix: C[i,j] = count where x==xlabs[i] and y==ylabs[j] - C = zeros(Int, Kx, Ky) - for k in eachindex(x, y) - i = searchsortedfirst(xlabs, x[k]) - j = searchsortedfirst(ylabs, y[k]) - @inbounds C[i, j] = 1 - end - - # Solve maximum weight assignment on padded square matrix - K = max(Kx, Ky) - W = zeros(Int, K, K) - @views W[1:Kx, 1:Ky] .= -C - assignment, cost = hungarian(W) - - # Build mapping from y to x labels - mapping = Dict{eltype(ylabs), eltype(xlabs)}() - used_x = Set{eltype(xlabs)}() - - # Map matched pairs within actual partition sizes - for i in 1:Kx - j = assignment[i] - if 1 ≤ j ≤ Ky - mapping[ylabs[j]] = xlabs[i] - push!(used_x, xlabs[i]) - end - end - - # Assign unmatched y labels to unused x labels - unused_x = filter(∉(used_x), xlabs) - for j in 1:Ky - if !haskey(mapping, ylabs[j]) - @argcheck !isempty(unused_x) "Insufficient x labels for alignment" - mapping[ylabs[j]] = popfirst!(unused_x) - end - end - - # Relabel y using the mapping - y_aligned = map(ylab -> mapping[ylab], y) - - return y_aligned, mapping -end diff --git a/src/utils/utils_node_labels.jl b/src/utils/utils_node_labels.jl index d259b85..e06cd11 100644 --- a/src/utils/utils_node_labels.jl +++ b/src/utils/utils_node_labels.jl @@ -11,14 +11,11 @@ function ordered_start_labels(n::Int, k::Int) return labels end -function 
align_res_true_latents!(res::NethistResult, latents; type = :greedy) - if type ∉ (:opt, :greedy) - error("Unknown alignment type: $type. Use :opt or :greedy.") +function align_res_true_latents!(res::NethistResult, latents; type = nothing) + if !isnothing(type) + @warn "type argument is deprecated, only :greedy is supported now." end - if type == :opt - @warn "The :opt alignment may not work that well; consider using :greedy instead." - end - new_labels, mapping = order_groups(res.labels, latents, Val(type)) + new_labels, mapping = order_groups(res.labels, latents) res.labels .= new_labels perm = [key for (key, val) in sort(collect(mapping), by = last)] permute!(res.model, perm) @@ -31,11 +28,7 @@ function permute!(sbm, perm) sbm.cumsize .= cumsum(sbm.size) end -function order_groups(node_labels, latents::AbstractVector, ::Val{:opt}) - return align_partitions(node_labels, latents) -end - -function order_groups(node_labels, latents::AbstractVector, ::Val{:greedy}) +function order_groups(node_labels, latents::AbstractVector) n = length(node_labels) k = length(unique(node_labels)) sort_perm = sortperm(latents) @@ -54,3 +47,15 @@ function get_num_obs(A::AbstractMatrix) n = size(A, 1) return n * (n - 1) ÷ 2 end + +""" +Align the source and target matrices using optimal transport. This function requires +the PythonCall.jl package to be loaded +""" +function align_matrices end + +""" +Get the permutation aligning source and target matrices using optimal transport. 
This function requires +the PythonCall.jl package to be loaded +""" +function get_perm_alignment end diff --git a/test/runtests.jl b/test/runtests.jl index 46bff3c..bdbb9d6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,5 +6,4 @@ using NetworkHistogram include("test_symarray.jl") include("test_pseudo_suff_stats.jl") include("test_hist_dist.jl") - include("test_align_partitions.jl") end diff --git a/test/test_align_partitions.jl b/test/test_align_partitions.jl deleted file mode 100644 index a97c0ca..0000000 --- a/test/test_align_partitions.jl +++ /dev/null @@ -1,91 +0,0 @@ -@testset "align_partitions" begin - @testset "Basic alignment" begin - x = [1, 1, 2, 2, 3] - y = [2, 2, 1, 1, 3] - - y_aligned, mapping = align_partitions(x, y) - - @test length(y_aligned) == length(y) - @test y_aligned == [1, 1, 2, 2, 3] - @test mapping[2] == 1 - @test mapping[1] == 2 - @test mapping[3] == 3 - end - - @testset "Perfect match" begin - x = [1, 1, 2, 2, 3, 3] - y = [1, 1, 2, 2, 3, 3] - - y_aligned, mapping = align_partitions(x, y) - - @test y_aligned == x - @test mapping == Dict(1 => 1, 2 => 2, 3 => 3) - end - - @testset "Complete permutation" begin - x = [1, 1, 2, 2, 3, 3] - y = [3, 3, 1, 1, 2, 2] - - y_aligned, mapping = align_partitions(x, y) - - @test y_aligned == x - @test mapping == Dict(3 => 1, 1 => 2, 2 => 3) - end - - @testset "Different numbers of clusters" begin - # x has 3 clusters, y has 2 - x = [1, 1, 2, 2, 3, 3] - y = [1, 1, 1, 1, 2, 2] - - y_aligned, mapping = align_partitions(x, y) - - @test length(y_aligned) == length(y) - # The alignment should maximize overlap - # y cluster 1 should map to x cluster with most overlap (1 or 2) - # y cluster 2 should map to x cluster 3 - @test sum(y_aligned[1:4] .== x[1:4]) ≥ 2 # Good overlap for first 4 - end - - @testset "Single cluster" begin - x = [1, 1, 1, 1] - y = [1, 1, 1, 1] - - y_aligned, mapping = align_partitions(x, y) - - @test y_aligned == x - @test mapping == Dict(1 => 1) - end - - @testset 
"Non-contiguous labels" begin - x = [10, 10, 20, 20, 30, 30] - y = [5, 5, 15, 15, 25, 25] - - y_aligned, mapping = align_partitions(x, y) - - @test length(y_aligned) == length(y) - # Should create optimal matching - @test all(l -> l ∈ [10, 20, 30], y_aligned) - end - - @testset "Error handling" begin - x = [1, 2, 3] - y = [1, 2] - - @test_throws Exception align_partitions(x, y) - end - - @testset "Preserves partition structure" begin - x = [1, 1, 1, 2, 2, 2, 3, 3, 3] - y = [2, 2, 2, 3, 3, 3, 1, 1, 1] - - y_aligned, mapping = align_partitions(x, y) - - # Check that elements in same cluster in y stay together in y_aligned - @test length(unique(y_aligned[1:3])) == 1 - @test length(unique(y_aligned[4:6])) == 1 - @test length(unique(y_aligned[7:9])) == 1 - - # Should achieve perfect alignment after relabeling - @test y_aligned == x - end -end From e6348cc268ffc2607a7adddd6d39c297e438f14c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 11:01:19 +0100 Subject: [PATCH 242/266] add loading of extension in docs --- docs/make.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 6bef294..65323b2 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -31,8 +31,20 @@ DocMeta.setdocmeta!( NetworkHistogram, :DocTestSetup, :(using NetworkHistogram); recursive = true) +# based on available extensions, include them in the documentation +modules = [ + NetworkHistogram, + Base.get_extension(NetworkHistogram, :BootstrapExt), + Base.get_extension(NetworkHistogram, :LightMCExt), + Base.get_extension(NetworkHistogram, :MakieExt), + Base.get_extension(NetworkHistogram, :PythonOptimalTransport) +] + +#TODO: safety check, should probably throw an error instead +filter!(x -> !isnothing(x), modules) + makedocs(; - modules = [NetworkHistogram], + modules = modules, authors = "Jake Grainger, Charles Dufour", #repo = "github.com/SDS-EPFL/NetworkHistogram.jl.git", sitename = "NetworkHistogram.jl", From 
bdf54fd554f28dc31725c8b17d9ae51e0d2e3b1a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 11:01:51 +0100 Subject: [PATCH 243/266] add docs to pot function --- ext/PythonOptimalTransport/PythonOptimalTransport.jl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl index 78849f4..67b26fe 100644 --- a/ext/PythonOptimalTransport/PythonOptimalTransport.jl +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -15,6 +15,12 @@ end jl_to_np(mat) = Py(mat).to_numpy() +""" +Get the permutation aligning source and target matrices using optimal transport. + +This function converts a gromov-wasserstein plan into a permutation by taking the argmax +along the rows. +""" function get_perm_alignment(src, target) plan = ot[].gromov.gromov_wasserstein( C2 = jl_to_np(src), C1 = jl_to_np(target)) @@ -24,6 +30,12 @@ function get_perm_alignment(src, target) return perm end +""" +Align the source and target matrices using optimal transport. + +# See Also +- [`get_perm_alignment`](@ref). 
+""" function align_matrices(src, target) perm = get_perm_alignment(src, target) return src[perm, perm] From 2045ea0f1a54889345be17aa684a8a567ef7807e Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 11:16:45 +0100 Subject: [PATCH 244/266] hack the docs --- docs/Project.toml | 1 + docs/make.jl | 27 +++-- docs/src/tutorials/simple_graph.md | 151 +++++++++++++++---------- docs/src/tutorials/weighted_network.md | 147 +++++++++++++++++++++--- 4 files changed, 243 insertions(+), 83 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index e053bd3..fdde702 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -4,6 +4,7 @@ Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DiscretizeDistributions = "1dbf0e27-43cd-4e03-8ecf-3f7be9d12b15" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +DocumenterInterLinks = "d12716ef-a0f6-4df4-a9f1-a5a34e75c656" Graphons = "e0c12bfd-47d7-434e-afb7-632611640ca5" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253" diff --git a/docs/make.jl b/docs/make.jl index 65323b2..81fea06 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -32,27 +32,35 @@ DocMeta.setdocmeta!( recursive = true) # based on available extensions, include them in the documentation -modules = [ +modules_all = [ NetworkHistogram, Base.get_extension(NetworkHistogram, :BootstrapExt), Base.get_extension(NetworkHistogram, :LightMCExt), Base.get_extension(NetworkHistogram, :MakieExt), Base.get_extension(NetworkHistogram, :PythonOptimalTransport) ] +println("all modules: ", modules_all) #TODO: safety check, should probably throw an error instead -filter!(x -> !isnothing(x), modules) +modules = [filter(!isnothing, modules_all)...] 
+println("build/docs with modules: ", modules) + +using DocumenterInterLinks + +links = InterLinks( + "ot" => "https://pythonot.github.io/", +) makedocs(; modules = modules, authors = "Jake Grainger, Charles Dufour", - #repo = "github.com/SDS-EPFL/NetworkHistogram.jl.git", + repo = "github.com/SDS-EPFL/NetworkHistogram.jl.git", sitename = "NetworkHistogram.jl", - #format = Documenter.HTML(; - # prettyurls = get(ENV, "CI", "false") == "true", - # canonical = "https://SDS-EPFL.github.io/NetworkHistogram.jl", - # edit_link = "main", - # assets = String[]), + format = Documenter.HTML(; + prettyurls = get(ENV, "CI", "false") == "true", + canonical = "https://SDS-EPFL.github.io/NetworkHistogram.jl", + edit_link = "main", + assets = String[]), pages = [ "Home" => "index.md", "API Reference" => "api.md", @@ -60,7 +68,8 @@ makedocs(; "Multiplex networks" => "tutorials/multiplex_network.md", "Weighted networks" => "tutorials/weighted_network.md", "Temporal networks" => "tutorials/temporal_networks.md"]], - checkdocs = :none) + checkdocs = :none, + plugins = [links]) deploydocs(; repo = "github.com/SDS-EPFL/NetworkHistogram.jl.git") diff --git a/docs/src/tutorials/simple_graph.md b/docs/src/tutorials/simple_graph.md index 2cd6515..a9d4abd 100644 --- a/docs/src/tutorials/simple_graph.md +++ b/docs/src/tutorials/simple_graph.md @@ -58,7 +58,7 @@ To generate a random graph from a graphon, we follow these steps: Let's sample a graph with 2000 nodes from our graphon `W`. ````@example simple_graph -n = 2000 +n = 3000 u_true = rand(n); # Latent positions A = sample_graph(w, u_true); nothing #hide @@ -96,23 +96,6 @@ The main challenge is to find the optimal partition of nodes. `NetworkHistogram. Now, let's use `NetworkHistogram.jl` to fit a network histogram to the graph `A` we sampled earlier. We will try to recover the underlying 2-block structure. -First, we need to represent our graph in a format that the package understands. 
-We can use an `EdgeList` to store the edges of the graph. - -````@example simple_graph -edge_list = EdgeList(A); -nothing #hide -```` - -We also need to define the model for the edges. Since our graph is unweighted, -we can use a `Bernoulli` distribution. The `Dist` wrapper is used to -handle aggregation of distributions. - -````@example simple_graph -import NetworkHistogram: Dist, Assignment, nethist -dist = NetworkHistogram.Bernoulli(0.5) # The initial probability doesn't matter much. -```` - We start with a random initial assignment of nodes to `k=5` groups. ````@example simple_graph @@ -120,18 +103,20 @@ k = 10 oracle_labels = ordered_start_labels(n, k); initial_assignment = shuffle(oracle_labels); -nothing #hide -```` -Now, we create an `Assignment` object, which holds all the information -about the model and the current state of the node groupings. +# We can compute the "oracle" estimator, which uses the true latent positions to assign nodes to groups. This serves as a benchmark for our estimation. +oracle_res = NetworkHistogram.oracle_estimator( + A, oracle_labels, NetworkHistogram.BinaryConvertor(); type_suff_stats = Val(:binary)); -````@example simple_graph -oracle_estimator = Assignment(oracle_labels, edge_list, Dist(dist)); -sbm_oracle = NetworkHistogram.to_block_model(oracle_estimator); -Mke.heatmap(sbm_oracle, colormap = :binary, colorrange = (0, 1)) - -println("Log-likelihood of oracle estimator: ", loglikelihood(oracle_estimator)) +let + fig = Mke.Figure(size = (400, 300)) + ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, oracle_res.model, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 2], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end +# ```` `NetworkHistogram.jl` provides optimization algorithms to improve the initial assignment. 
@@ -145,39 +130,14 @@ params_opti = NetworkHistogram.GreedyParams( a = nethist(A, dist, initial_assignment, params_opti, false); ````@example simple_graph -res = NetworkHistogram.nethist_binary_edges(A, - initial_assignment, GreedyParams( - 1_000_000, - RandomGroupSwap(), - Strict(), - PreviousBestValue(1_000, Inf, :min), - true - )); - -a = Assignment(res.node_labels, edge_list, Dist(dist)); -println("Log-likelihood after optimization: ", loglikelihood(a)) -```` - -The `Assignment` object `a` now contains the optimized node groupings and -the fitted network histogram parameters. +res = NetworkHistogram.nethist_binary(A, k, initial_assignment); -We can visualize the fitted histogram. - -````@example simple_graph -heatmap_params(a, ordering = false, colorrange = (0, 1)) -```` - -We can convert it to a block model for easier interpretation. - -res = NethistResult(a); - -````@example simple_graph let fig = Mke.Figure(size = (1220, 400)) titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) - Mke.heatmap!(axes[2], sbm_oracle, + Mke.heatmap!(axes[2], oracle_res.model, colormap = :binary, colorrange = (0, 1)) Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) Mke.Colorbar(fig[1, 4], colormap = :binary, @@ -189,7 +149,7 @@ end the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. 
````@example simple_graph -NetworkHistogram.align_res_true_latents!(res, oracle_estimator.node_labels); +NetworkHistogram.align_res_true_latents!(res, oracle_res.labels, type = :greedy); nothing #hide ```` @@ -201,7 +161,7 @@ let titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) - Mke.heatmap!(axes[2], sbm_oracle, + Mke.heatmap!(axes[2], oracle_res.model, colormap = :binary, colorrange = (0, 1)) Mke.heatmap!(axes[3], res.model, colormap = :binary, colorrange = (0, 1)) Mke.Colorbar(fig[1, 4], colormap = :binary, @@ -214,11 +174,14 @@ We can even fit a Stochastic Shape Model quite easily from the fitted SBM. ````@example simple_graph using Clustering +```` + +ξ = NetworkHistogram.node_labels_to_latents(res.labels, res.model); -ξ = NetworkHistogram.node_labels_to_latents(res.node_labels, res.model); +````@example simple_graph shape_range = 1:(k * (k + 1) ÷ 2 - 1) ssm_estimated, criterion_values = Graphons.estimate_ssm( - res.model, A, ξ, shape_range) + res.model, A, res.labels, shape_range) using Kneedle kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) @@ -230,7 +193,6 @@ kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smooth k_knee = knees(kr)[1] ssm_knee = SSM(res.model, k_knee) -Mke.heatmap(ssm_estimated, colormap = :binary, colorrange = (0, 1)) println("Number of shapes in SSM argmin: ", length(ssm_estimated.θ)) println("Number of shapes in SSM knee: ", length(ssm_knee.θ)) println("Number of shapes in SBM: ", length(res.model.θ)) @@ -251,6 +213,75 @@ let limits = (0, 1), label = "Edge Probability", width = 20) fig end + +# + +k_kmeans = 10; +clustering_res = kmeans(A, k_kmeans); + +res_kmeans = NetworkHistogram.oracle_estimator( + A, assignments(clustering_res), NetworkHistogram.BinaryConvertor(); + type_suff_stats = 
Val(:binary), + name = "k-means"); + +NetworkHistogram.align_res_true_latents!(res_kmeans, oracle_res.labels, type = :greedy); +nothing #hide +```` + +and display the true function, the oracle estimator, and the fitted model + +````@example simple_graph +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["True Graphon W(u,v)", "Oracle Estimator", "Fitted Network Histogram"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], w, colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], oracle_res.model, + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], res_kmeans.model, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end +```` + +ξ = NetworkHistogram.node_labels_to_latents(res.labels, res.model); + +````@example simple_graph +shape_range = 1:(k_kmeans * (k_kmeans + 1) ÷ 2 - 1) +ssm_estimated, criterion_values = Graphons.estimate_ssm( + res_kmeans.model, A, res_kmeans.labels, shape_range) + +using Kneedle +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +```` + + Let's extract the optimal number of shapes using the Kneedle algorithm: + +````@example simple_graph +k_knee = knees(kr)[1] +ssm_knee = SSM(res_kmeans.model, k_knee) + +println("Number of shapes in SSM argmin: ", length(ssm_estimated.θ)) +println("Number of shapes in SSM knee: ", length(ssm_knee.θ)) +println("Number of shapes in SBM: ", length(res_kmeans.model.θ)) +```` + +We greatly reduced the number of parameters from the original SBM estimate while preserving much of the structure of the estimated graphon as seen below: + +````@example simple_graph +let + fig = Mke.Figure(size = (1220, 400)) + titles = ["SBM", "SSM argmin", "SSM knee"] + axes = [Mke.Axis(fig[1, i], aspect = Mke.DataAspect(), title = titles[i]) for i in 1:3] + Mke.heatmap!(axes[1], res_kmeans.model, colormap = 
:binary, colorrange = (0, 1)) + Mke.heatmap!(axes[2], ssm_estimated, + colormap = :binary, colorrange = (0, 1)) + Mke.heatmap!(axes[3], ssm_knee, colormap = :binary, colorrange = (0, 1)) + Mke.Colorbar(fig[1, 4], colormap = :binary, + limits = (0, 1), label = "Edge Probability", width = 20) + fig +end ```` --- diff --git a/docs/src/tutorials/weighted_network.md b/docs/src/tutorials/weighted_network.md index a5420ab..e843438 100644 --- a/docs/src/tutorials/weighted_network.md +++ b/docs/src/tutorials/weighted_network.md @@ -5,37 +5,156 @@ EditURL = "../../literate/tutorials/weighted_network.jl" # Decorated Graphon Tutorial for Weighted Networks ````@example weighted_network +using Clustering using NetworkHistogram using Distributions -import CairoMakie as Mke using LinearAlgebra +using Random +using ProgressMeter + +import Distributions: pdf + +pdf_kuma(α, β, x, p = 1.0) = @. p * (α * β * x^(α - 1) .* (1 - x^α)^(β - 1)) -graphon = DecoratedGraphon((x, y) -> Exponential(3 * x * y + 1)) +graphon_params = (x, y) -> (4 * (cos(π * (x - y)) + 1) + 1, max(x, y) * 8 + 1) +graphon = DecoratedGraphon((x, y) -> Kumaraswamy(graphon_params(x, y)...)) + +import CairoMakie as Mke let - fig = Mke.Figure(size = (600, 300)) + fig = Mke.Figure() ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect()) - Mke.heatmap!(ax, graphon, colormap = :viridis) + hm = Mke.heatmap!(ax, graphon, colormap = :viridis) + Mke.Colorbar(fig[1, 2], hm) + ax2 = Mke.Axis(fig[1, 3], aspect = Mke.DataAspect()) + hm2 = Mke.heatmap!(ax2, graphon, k = 2, colormap = :viridis) + Mke.Colorbar(fig[1, 4], hm2) fig end -n = 500 +n = 2000 k = 10 -A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(0.7), n, n)); +n_bins = 20 +p = 0.9 +A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(p), n, n)); +ξs = range(0, 1; length = n) oracle_latents = ordered_start_labels(n, k); + +res_oracle = NetworkHistogram.oracle_estimator( + A, oracle_latents, NetworkHistogram.UnitIntervalConvertor(n_bins)); + starting_labels = 
shuffle(oracle_latents); -res = NetworkHistogram.nethist_continuous_edges(A, - starting_labels, GreedyParams( - 1_000_000, - RandomGroupSwap(), - Strict(), - PreviousBestValue(5_000, Inf, :min), - true # progress bar - )); +max_iter = 1_000_000 +stalled_iters = 5_000 + +res_new = NetworkHistogram.nethist_continuous( + A, k, + starting_labels; + bins = n_bins +); nothing #hide ```` +convertor = NetworkHistogram.UnitIntervalConvertor(10) + +data = convertor.(A) +es_new = NetworkHistogram.GreedySuffStats( + data, initial_labels, num_categories = num_bins(convertor), + type_suff_stats = :categorical, + max_iter = max_iter, + swap_rule = NetworkHistogram.RandomGroupSwap(), + stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), + progress = true +); +node_labels_es_new, parameters = NetworkHistogram.estimate!( + es_new, data, initial_labels; iter_progress = 10_000) + +model_es_new = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), + counts(node_labels_es_new) ./ length(node_labels_es_new)); + +res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); + +````@example weighted_network +NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); +xs = range(0, 1; length = 100) + +function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 20, p = p) + nodes_1 = findall(res_oracle.labels .== g1) + nodes_2 = findall(res_oracle.labels .== g2) + edge_values = [A[x, y] for y in nodes_2 for x in nodes_1] + Mke.vlines!(axis, edge_values, ymax = 0.025, color = :lightgray) +```` + +Mke.hist!(axis, edge_values; normalization = :pdf, color = :gray) + +````@example weighted_network + x1 = sample(ξs[nodes_1], n_viz, replace = false) + x2 = sample(ξs[nodes_2], n_viz, replace = false) + for x_ in x1 + for y_ in x2 + Mke.lines!(axis, xs, pdf_kuma(graphon_params(x_, y_)..., xs, p), + color = :gray, alpha = 0.1) + end + end + Mke.lines!(axis, xs, map(Base.Fix1(pdf, res_oracle.model.θ[g1, g2]), xs), + 
color = :blue, label = "True") + Mke.lines!(axis, xs, map(Base.Fix1(pdf, res_new.model.θ[g1, g2]), xs), + color = :black, linestyle = :dash, label = "Estimated") +end + +for g in 1:k + @showprogress for g2 in 1:g + fig = Mke.Figure(size = (600, 400)) + ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", + ylabel = "Density") + viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_new, xs, p = p, n_viz = 5) + display(fig) + end +end + +# +ssm_test = SSM(res_new.model, k) + +shape_range = 1:(k * (k + 1) ÷ 2 - 1) +ssm_estimated, criterion_values = Graphons.estimate_ssm( + res_new.model, A, res_new.labels, shape_range) + +Mke.lines(shape_range, criterion_values) + +# +```` + +using Kneedle +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +# Let's extract the optimal number of shapes using the Kneedle algorithm: + +k_knee = knees(kr)[1] +ssm_knee = SSM(res_new.model, k_knee) + +````@example weighted_network +# + +clustering_res = kmeans(A, k) + +res_kmeans = NetworkHistogram.oracle_estimator( + A, assignments(clustering_res), NetworkHistogram.UnitIntervalConvertor(n_bins); + type_suff_stats = Val(:categorical), + name = "k-means"); + +NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels, type = :greedy); + +for g in 1:k + @showprogress for g2 in 1:g + fig = Mke.Figure(size = (600, 400)) + ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", + ylabel = "Density") + viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_kmeans, xs, p = p, n_viz = 5) + display(fig) + end +end +```` + --- *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* From d2dfd635dba941e0d65abc1d13b27361c437c30d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 11:17:14 +0100 Subject: [PATCH 245/266] add kwargs for ot alignment --- ext/PythonOptimalTransport/PythonOptimalTransport.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) 
diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl index 67b26fe..dddab44 100644 --- a/ext/PythonOptimalTransport/PythonOptimalTransport.jl +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -20,10 +20,16 @@ Get the permutation aligning source and target matrices using optimal transport. This function converts a gromov-wasserstein plan into a permutation by taking the argmax along the rows. + +This function uses [`gromov_wasserstein`](https://pythonot.github.io/gen_modules/ot.gromov.html#ot.gromov.BAPG_gromov_wasserstein) + +# See also +- [`align_matrices`](@ref) +- [`ot.gromov.gromov_wasserstein`](@extref) """ -function get_perm_alignment(src, target) +function get_perm_alignment(src, target; kwargs...) plan = ot[].gromov.gromov_wasserstein( - C2 = jl_to_np(src), C1 = jl_to_np(target)) + C2 = jl_to_np(src), C1 = jl_to_np(target), kwargs...) plan = pyconvert(Matrix{Float64}, plan) ordering = argmax(plan, dims = 1) .|> Tuple |> vec perm = sort(ordering, by = x -> x[1]) .|> last @@ -33,7 +39,7 @@ end """ Align the source and target matrices using optimal transport. -# See Also +# See also - [`get_perm_alignment`](@ref). 
""" function align_matrices(src, target) From b1876321c6c76afd836749a05fa0c203f1f67d16 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 12:19:27 +0100 Subject: [PATCH 246/266] remove deprecated keyword arg --- docs/literate/tutorials/simple_graph.jl | 4 ++-- docs/literate/tutorials/weighted_network.jl | 2 +- docs/make.jl | 2 -- src/utils/utils_node_labels.jl | 5 +---- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index e219dbe..cb7a840 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -122,7 +122,7 @@ end # the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. -NetworkHistogram.align_res_true_latents!(res, oracle_res.labels, type = :greedy); +NetworkHistogram.align_res_true_latents!(res, oracle_res.labels); # and display the true function, the oracle estimator, and the fitted model let @@ -183,7 +183,7 @@ res_kmeans = NetworkHistogram.oracle_estimator( type_suff_stats = Val(:binary), name = "k-means"); -NetworkHistogram.align_res_true_latents!(res_kmeans, oracle_res.labels, type = :greedy); +NetworkHistogram.align_res_true_latents!(res_kmeans, oracle_res.labels); # and display the true function, the oracle estimator, and the fitted model let diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index ea55ae7..d8118da 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -127,7 +127,7 @@ res_kmeans = NetworkHistogram.oracle_estimator( type_suff_stats = Val(:categorical), name = "k-means"); -NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels, type = :greedy); +NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels); for g in 1:k @showprogress for g2 in 1:g diff 
--git a/docs/make.jl b/docs/make.jl index 81fea06..a2d9cb3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -39,11 +39,9 @@ modules_all = [ Base.get_extension(NetworkHistogram, :MakieExt), Base.get_extension(NetworkHistogram, :PythonOptimalTransport) ] -println("all modules: ", modules_all) #TODO: safety check, should probably throw an error instead modules = [filter(!isnothing, modules_all)...] -println("build/docs with modules: ", modules) using DocumenterInterLinks diff --git a/src/utils/utils_node_labels.jl b/src/utils/utils_node_labels.jl index e06cd11..b3bb892 100644 --- a/src/utils/utils_node_labels.jl +++ b/src/utils/utils_node_labels.jl @@ -11,10 +11,7 @@ function ordered_start_labels(n::Int, k::Int) return labels end -function align_res_true_latents!(res::NethistResult, latents; type = nothing) - if !isnothing(type) - @warn "type argument is deprecated, only :greedy is supported now." - end +function align_res_true_latents!(res::NethistResult, latents) new_labels, mapping = order_groups(res.labels, latents) res.labels .= new_labels perm = [key for (key, val) in sort(collect(mapping), by = last)] From af6d0105ef2aab7c1f74b27ab23bb97843efd816 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 13:15:19 +0100 Subject: [PATCH 247/266] remove @showprogress from litterate example --- docs/literate/tutorials/weighted_network.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index d8118da..3f96489 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -6,7 +6,6 @@ using NetworkHistogram using Distributions using LinearAlgebra using Random -using ProgressMeter import Distributions: pdf @@ -29,7 +28,7 @@ let end n = 2000 -k = 10 +k = 5 n_bins = 20 p = 0.9 A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(p), n, n)); @@ -92,7 +91,7 @@ function viz_one_group!(axis, g1, g2, A, ξs, 
res_oracle, res_new, xs; n_viz = 2 end for g in 1:k - @showprogress for g2 in 1:g + for g2 in 1:g fig = Mke.Figure(size = (600, 400)) ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", ylabel = "Density") @@ -130,7 +129,7 @@ res_kmeans = NetworkHistogram.oracle_estimator( NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels); for g in 1:k - @showprogress for g2 in 1:g + for g2 in 1:g fig = Mke.Figure(size = (600, 400)) ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", ylabel = "Density") From a2f40da563c0c3f5ab7c854cf66c6ec3ebf40f27 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 13:20:12 +0100 Subject: [PATCH 248/266] change default params for categorical edges and update litterate example --- docs/literate/tutorials/multiplex_network.jl | 14 +++++--------- src/api.jl | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index 989424d..7b074c0 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -38,18 +38,14 @@ k = 20 oracle_labels = ordered_start_labels(n, k); initial_labels = shuffle(oracle_labels); -res = NetworkHistogram.nethist_discrete_edges(A, - initial_labels, GreedyParams( - 1_000_000, - RandomGroupSwap(), - Strict(), - PreviousBestValue(5_000, Inf, :min), - false # progress bar - )); +oracle_res = NetworkHistogram.oracle_estimator( + A, oracle_labels, NetworkHistogram.CategoricalConvertor(A)); + +res = NetworkHistogram.nethist_categorical(A, k, initial_labels) # Visualize the fitted models for different numbers of groups after aligning with true latents -NetworkHistogram.align_res_true_latents!(res, oracle_labels); +NetworkHistogram.align_res_true_latents!(res, oracle_res.labels); let fig = Mke.Figure(size = (4 * h, h)) for m in 1:4 diff --git a/src/api.jl b/src/api.jl index c14d9ea..c62bdf9 
100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,7 +1,7 @@ function nethist_categorical( A, k, labels_start = ordered_start_labels(size(A, 1), k); - params::GreedyParams = GreedyParams()) + params::GreedyParams = GreedyParams(stalled_iter = 10_000)) convertor = CategoricalConvertor(A) @info "Using $(num_bins(convertor)) discrete categories for edge values" return _nethist( From 202956badfcaf87b50867fcbbb5c1d521f65a90c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 14:38:06 +0100 Subject: [PATCH 249/266] solve PythonCall import problem partially --- docs/Project.toml | 1 + docs/literate/tutorials/multiplex_network.jl | 35 +++++++++++++- .../PythonOptimalTransport.jl | 45 ++++++++++++++++-- .../__pycache__/fngw.cpython-314.pyc | Bin 32832 -> 32832 bytes src/api.jl | 2 +- 5 files changed, 75 insertions(+), 8 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index fdde702..0270131 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -12,5 +12,6 @@ Kneedle = "4ef9287f-f14a-4b13-b4c1-9bb5ae54399a" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" NetworkHistogram = "7806f430-7229-459c-b2e6-df35e8e4eb5d" +PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index 7b074c0..e32d8db 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -34,7 +34,7 @@ n = 1000 true_latents = range(0, 1; length = n) A = sample_graph(graphon, true_latents); -k = 20 +k = 10 oracle_labels = ordered_start_labels(n, k); initial_labels = shuffle(oracle_labels); @@ -55,10 +55,41 @@ let fig end +# We can also align the fitted model to the true one using optimal transport. 
We need to load the `PythonCall.jl` +# package for that, as we will use the `POT` Python library. + +using PythonCall +θ_oracle = probs.(oracle_res.model.θ); +θ_hat = probs.(res.model.θ); + +# path_hardcoded = "/Users/dufour/Documents/code/networks/static/NetworkHistogram/ext/PythonOptimalTransport/" +# pyimport("sys").path.append(path_hardcoded) + +# const fngw_import = pyimport("fngw") +# fngw2 = @pyconst(pyimport("fngw")).fused_network_gromov_wasserstein2 + +perm = NetworkHistogram.get_perm_alignment(θ_hat, θ_oracle); + +θ_hat_aligned = θ_hat[perm, perm]; +estimator_aligned = DecoratedSBM(DiscreteNonParametric.(Ref(0:3), θ_hat_aligned), + res.model.size[perm]); + +let + fig = Mke.Figure(size = (2 * h, h)) + for m in 1:4 + ax = Mke.Axis(fig[1, m], aspect = Mke.DataAspect()) + Mke.heatmap!(ax, estimator_aligned, k = m, colormap = :binary, colorrange = (0, 1)) + ax2 = Mke.Axis(fig[2, m], aspect = Mke.DataAspect()) + Mke.heatmap!( + ax2, oracle_res.model, k = m, colormap = :binary, colorrange = (0, 1)) + end + fig +end + # The fitted network histogram can be further processed to obtain a smoother estimate of the underlying graphon. using Clustering -shape_range = 1:30 +shape_range = 1:20 ssm_estimated, criterion_values = Graphons.estimate_ssm( res.model, A, true_latents, shape_range); diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl index dddab44..bc9655f 100644 --- a/ext/PythonOptimalTransport/PythonOptimalTransport.jl +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -10,10 +10,23 @@ const fngw = Ref{Py}() function __init__() ot[] = pyimport("ot") pyimport("sys").path.append(@__DIR__) + # TODO: find why I need to use fngw.x to access the functions later... 
fngw[] = pyimport("fngw") end -jl_to_np(mat) = Py(mat).to_numpy() +## helpers to convert Julia arrays to numpy arrays +jl_to_np(mat::AbstractArray{<:Real}) = Py(mat).to_numpy() +function jl_to_np(mat::AbstractMatrix{<:AbstractVector}) + Py(permutedims(stack(mat), (3, 2, 1))).to_numpy() +end + +# helpers for optimal transport alignment + +function plan_to_permutation(plan) + ordering = argmax(plan, dims = 1) .|> Tuple |> vec + perm = sort(ordering, by = x -> x[1]) .|> last + return perm +end """ Get the permutation aligning source and target matrices using optimal transport. @@ -27,13 +40,35 @@ This function uses [`gromov_wasserstein`](https://pythonot.github.io/gen_modules - [`align_matrices`](@ref) - [`ot.gromov.gromov_wasserstein`](@extref) """ -function get_perm_alignment(src, target; kwargs...) +function get_perm_alignment( + src::AbstractMatrix{<:Real}, target::AbstractMatrix{<:Real}; kwargs...) plan = ot[].gromov.gromov_wasserstein( C2 = jl_to_np(src), C1 = jl_to_np(target), kwargs...) + plan = pyconvert(typeof(target), plan) + return plan_to_permutation(plan) +end + +function get_perm_alignment(src::AbstractMatrix{T1}, target::AbstractMatrix{T2}; + kwargs...) where {T1 <: AbstractVector, T2 <: AbstractVector} + C1 = jl_to_np(target) + C2 = jl_to_np(src) + dist, log_ = fngw.x.fused_network_gromov_wasserstein2( + M = jl_to_np(zeros(size(target, 1), size(src, 1))), + C1 = C1, + C2 = C2, + A1 = jl_to_np(ones(size(target, 1), size(target, 1))), + A2 = jl_to_np(ones(size(src, 1), size(src, 1))), + p = jl_to_np(fill(1.0 / size(target, 1), size(target, 1))), + q = jl_to_np(fill(1.0 / size(src, 1), size(src, 1))), + alpha = 1.0, + beta = 0.0, + log = true, + kwargs... 
+ ) + @show dist + plan = log_["T"] plan = pyconvert(Matrix{Float64}, plan) - ordering = argmax(plan, dims = 1) .|> Tuple |> vec - perm = sort(ordering, by = x -> x[1]) .|> last - return perm + return plan_to_permutation(plan) end """ diff --git a/ext/PythonOptimalTransport/__pycache__/fngw.cpython-314.pyc b/ext/PythonOptimalTransport/__pycache__/fngw.cpython-314.pyc index 888d28678184b24a9350ce1329cc09d05f55099b..526c4cf4d624f91c8f1f43d00a750c106e27b803 100644 GIT binary patch delta 22 ccmX@mz;vL2iCdeGmx}=iBz46%a+@{)06n<`t^fc4 delta 22 ccmX@mz;vL2iCdeGmx}=ibl;0@k diff --git a/src/api.jl b/src/api.jl index c62bdf9..3ddb5e2 100644 --- a/src/api.jl +++ b/src/api.jl @@ -1,7 +1,7 @@ function nethist_categorical( A, k, labels_start = ordered_start_labels(size(A, 1), k); - params::GreedyParams = GreedyParams(stalled_iter = 10_000)) + params::GreedyParams = GreedyParams(stalled_iters = 50_000, max_iter = 2_000_000)) convertor = CategoricalConvertor(A) @info "Using $(num_bins(convertor)) discrete categories for edge values" return _nethist( From 6d975958318a938da49179a49abf1c3873f15cbb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 14:43:12 +0100 Subject: [PATCH 250/266] clean code --- docs/literate/tutorials/multiplex_network.jl | 13 +++++-------- .../PythonOptimalTransport.jl | 4 +--- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index e32d8db..aff358c 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -58,16 +58,11 @@ end # We can also align the fitted model to the true one using optimal transport. We need to load the `PythonCall.jl` # package for that, as we will use the `POT` Python library. 
+ENV["JULIA_CONDAPKG_VERBOSITY"] = "-1" #hide conda messages using PythonCall θ_oracle = probs.(oracle_res.model.θ); θ_hat = probs.(res.model.θ); -# path_hardcoded = "/Users/dufour/Documents/code/networks/static/NetworkHistogram/ext/PythonOptimalTransport/" -# pyimport("sys").path.append(path_hardcoded) - -# const fngw_import = pyimport("fngw") -# fngw2 = @pyconst(pyimport("fngw")).fused_network_gromov_wasserstein2 - perm = NetworkHistogram.get_perm_alignment(θ_hat, θ_oracle); θ_hat_aligned = θ_hat[perm, perm]; @@ -77,9 +72,11 @@ estimator_aligned = DecoratedSBM(DiscreteNonParametric.(Ref(0:3), θ_hat_aligned let fig = Mke.Figure(size = (2 * h, h)) for m in 1:4 - ax = Mke.Axis(fig[1, m], aspect = Mke.DataAspect()) + ax = Mke.Axis( + fig[1, m], aspect = Mke.DataAspect(), ylabel = m == 1 ? "Estimated" : "") Mke.heatmap!(ax, estimator_aligned, k = m, colormap = :binary, colorrange = (0, 1)) - ax2 = Mke.Axis(fig[2, m], aspect = Mke.DataAspect()) + ax2 = Mke.Axis( + fig[2, m], aspect = Mke.DataAspect(), ylabel = m == 1 ? "Oracle" : "") Mke.heatmap!( ax2, oracle_res.model, k = m, colormap = :binary, colorrange = (0, 1)) end diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl index bc9655f..29c115f 100644 --- a/ext/PythonOptimalTransport/PythonOptimalTransport.jl +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -65,9 +65,7 @@ function get_perm_alignment(src::AbstractMatrix{T1}, target::AbstractMatrix{T2}; log = true, kwargs... 
) - @show dist - plan = log_["T"] - plan = pyconvert(Matrix{Float64}, plan) + plan = pyconvert(Matrix{Float64}, log_["T"]) return plan_to_permutation(plan) end From 0e36b117155f66eb17a042afe1f6014d231639eb Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 14:48:11 +0100 Subject: [PATCH 251/266] clean tutorials --- docs/literate/tutorials/multiplex_network.jl | 2 +- docs/literate/tutorials/weighted_network.jl | 19 -------- docs/src/tutorials/multiplex_network.md | 49 +++++++++++++++----- docs/src/tutorials/simple_graph.md | 4 +- docs/src/tutorials/weighted_network.md | 36 ++------------ 5 files changed, 44 insertions(+), 66 deletions(-) diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index aff358c..7ff53a8 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -58,7 +58,7 @@ end # We can also align the fitted model to the true one using optimal transport. We need to load the `PythonCall.jl` # package for that, as we will use the `POT` Python library. 
-ENV["JULIA_CONDAPKG_VERBOSITY"] = "-1" #hide conda messages +ENV["JULIA_CONDAPKG_VERBOSITY"] = "-1" # hide conda messages #hide using PythonCall θ_oracle = probs.(oracle_res.model.θ); θ_hat = probs.(res.model.θ); diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index 3f96489..50b31f5 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -49,24 +49,6 @@ res_new = NetworkHistogram.nethist_continuous( bins = n_bins ); -# convertor = NetworkHistogram.UnitIntervalConvertor(10) - -# data = convertor.(A) -# es_new = NetworkHistogram.GreedySuffStats( -# data, initial_labels, num_categories = num_bins(convertor), -# type_suff_stats = :categorical, -# max_iter = max_iter, -# swap_rule = NetworkHistogram.RandomGroupSwap(), -# stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), -# progress = true -# ); -# node_labels_es_new, parameters = NetworkHistogram.estimate!( -# es_new, data, initial_labels; iter_progress = 10_000) - -# model_es_new = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), -# counts(node_labels_es_new) ./ length(node_labels_es_new)); - -# res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); xs = range(0, 1; length = 100) @@ -75,7 +57,6 @@ function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 2 nodes_2 = findall(res_oracle.labels .== g2) edge_values = [A[x, y] for y in nodes_2 for x in nodes_1] Mke.vlines!(axis, edge_values, ymax = 0.025, color = :lightgray) - # Mke.hist!(axis, edge_values; normalization = :pdf, color = :gray) x1 = sample(ξs[nodes_1], n_viz, replace = false) x2 = sample(ξs[nodes_2], n_viz, replace = false) for x_ in x1 diff --git a/docs/src/tutorials/multiplex_network.md b/docs/src/tutorials/multiplex_network.md index 7e2da12..71138fc 100644 --- 
a/docs/src/tutorials/multiplex_network.md +++ b/docs/src/tutorials/multiplex_network.md @@ -38,25 +38,20 @@ n = 1000 true_latents = range(0, 1; length = n) A = sample_graph(graphon, true_latents); -k = 20 +k = 10 oracle_labels = ordered_start_labels(n, k); initial_labels = shuffle(oracle_labels); -res = NetworkHistogram.nethist_discrete_edges(A, - initial_labels, GreedyParams( - 1_000_000, - RandomGroupSwap(), - Strict(), - PreviousBestValue(5_000, Inf, :min), - false # progress bar - )); -nothing #hide +oracle_res = NetworkHistogram.oracle_estimator( + A, oracle_labels, NetworkHistogram.CategoricalConvertor(A)); + +res = NetworkHistogram.nethist_categorical(A, k, initial_labels) ```` Visualize the fitted models for different numbers of groups after aligning with true latents ````@example multiplex_network -NetworkHistogram.align_res_true_latents!(res, oracle_labels); +NetworkHistogram.align_res_true_latents!(res, oracle_res.labels); let fig = Mke.Figure(size = (4 * h, h)) for m in 1:4 @@ -67,11 +62,41 @@ let end ```` +We can also align the fitted model to the true one using optimal transport. We need to load the `PythonCall.jl` +package for that, as we will use the `POT` Python library. + +````@example multiplex_network +ENV["JULIA_CONDAPKG_VERBOSITY"] = "-1" # hide conda messages #hide +using PythonCall +θ_oracle = probs.(oracle_res.model.θ); +θ_hat = probs.(res.model.θ); + +perm = NetworkHistogram.get_perm_alignment(θ_hat, θ_oracle); + +θ_hat_aligned = θ_hat[perm, perm]; +estimator_aligned = DecoratedSBM(DiscreteNonParametric.(Ref(0:3), θ_hat_aligned), + res.model.size[perm]); + +let + fig = Mke.Figure(size = (2 * h, h)) + for m in 1:4 + ax = Mke.Axis( + fig[1, m], aspect = Mke.DataAspect(), ylabel = m == 1 ? "Estimated" : "") + Mke.heatmap!(ax, estimator_aligned, k = m, colormap = :binary, colorrange = (0, 1)) + ax2 = Mke.Axis( + fig[2, m], aspect = Mke.DataAspect(), ylabel = m == 1 ? 
"Oracle" : "") + Mke.heatmap!( + ax2, oracle_res.model, k = m, colormap = :binary, colorrange = (0, 1)) + end + fig +end +```` + The fitted network histogram can be further processed to obtain a smoother estimate of the underlying graphon. ````@example multiplex_network using Clustering -shape_range = 1:30 +shape_range = 1:20 ssm_estimated, criterion_values = Graphons.estimate_ssm( res.model, A, true_latents, shape_range); diff --git a/docs/src/tutorials/simple_graph.md b/docs/src/tutorials/simple_graph.md index a9d4abd..2360201 100644 --- a/docs/src/tutorials/simple_graph.md +++ b/docs/src/tutorials/simple_graph.md @@ -149,7 +149,7 @@ end the block labels found by the optimization are not necessarily aligned with the true latent positions, hence the need to align them for better visualization. ````@example simple_graph -NetworkHistogram.align_res_true_latents!(res, oracle_res.labels, type = :greedy); +NetworkHistogram.align_res_true_latents!(res, oracle_res.labels); nothing #hide ```` @@ -224,7 +224,7 @@ res_kmeans = NetworkHistogram.oracle_estimator( type_suff_stats = Val(:binary), name = "k-means"); -NetworkHistogram.align_res_true_latents!(res_kmeans, oracle_res.labels, type = :greedy); +NetworkHistogram.align_res_true_latents!(res_kmeans, oracle_res.labels); nothing #hide ```` diff --git a/docs/src/tutorials/weighted_network.md b/docs/src/tutorials/weighted_network.md index e843438..7ba6246 100644 --- a/docs/src/tutorials/weighted_network.md +++ b/docs/src/tutorials/weighted_network.md @@ -10,7 +10,6 @@ using NetworkHistogram using Distributions using LinearAlgebra using Random -using ProgressMeter import Distributions: pdf @@ -33,7 +32,7 @@ let end n = 2000 -k = 10 +k = 5 n_bins = 20 p = 0.9 A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(p), n, n)); @@ -53,29 +52,7 @@ res_new = NetworkHistogram.nethist_continuous( starting_labels; bins = n_bins ); -nothing #hide -```` - -convertor = NetworkHistogram.UnitIntervalConvertor(10) - -data = convertor.(A) 
-es_new = NetworkHistogram.GreedySuffStats( - data, initial_labels, num_categories = num_bins(convertor), - type_suff_stats = :categorical, - max_iter = max_iter, - swap_rule = NetworkHistogram.RandomGroupSwap(), - stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), - progress = true -); -node_labels_es_new, parameters = NetworkHistogram.estimate!( - es_new, data, initial_labels; iter_progress = 10_000) - -model_es_new = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), - counts(node_labels_es_new) ./ length(node_labels_es_new)); - -res_new = NetworkHistogram.NethistResult(node_labels_es_new, model_es_new); -````@example weighted_network NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); xs = range(0, 1; length = 100) @@ -84,11 +61,6 @@ function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 2 nodes_2 = findall(res_oracle.labels .== g2) edge_values = [A[x, y] for y in nodes_2 for x in nodes_1] Mke.vlines!(axis, edge_values, ymax = 0.025, color = :lightgray) -```` - -Mke.hist!(axis, edge_values; normalization = :pdf, color = :gray) - -````@example weighted_network x1 = sample(ξs[nodes_1], n_viz, replace = false) x2 = sample(ξs[nodes_2], n_viz, replace = false) for x_ in x1 @@ -104,7 +76,7 @@ Mke.hist!(axis, edge_values; normalization = :pdf, color = :gray) end for g in 1:k - @showprogress for g2 in 1:g + for g2 in 1:g fig = Mke.Figure(size = (600, 400)) ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", ylabel = "Density") @@ -142,10 +114,10 @@ res_kmeans = NetworkHistogram.oracle_estimator( type_suff_stats = Val(:categorical), name = "k-means"); -NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels, type = :greedy); +NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels); for g in 1:k - @showprogress for g2 in 1:g + for g2 in 1:g fig = Mke.Figure(size = (600, 400)) ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group 
$g2", xlabel = "Edge Value", ylabel = "Density") From fd2c1ec00ee34ddfdc6f230692bd94a52ee41dd4 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 14:56:43 +0100 Subject: [PATCH 252/266] tutorial follow example from paper --- docs/literate/tutorials/multiplex_network.jl | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index 7ff53a8..df72c4e 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -19,7 +19,17 @@ function W_multiplex(x, y) return DiscreteNonParametric(0:3, SVector{4}(ps)) end -graphon = DecoratedGraphon(W_multiplex) +function W3(x, y) + ps = zeros(4) + ps[1] = 3 * x * y + ps[2] = 3 * sin(2 * π * x) * sin(2 * π * y) + ps[3] = exp(-3 * (x - 0.5)^2 - 3 * (y - 0.5)^2) + ps[4] = 2 - 3 * (x + y) + e_ps = exp.(ps) + return DiscreteNonParametric(0:3, SVector{4}(e_ps ./ sum(e_ps))) +end + +graphon = DecoratedGraphon(W3) let fig = Mke.Figure(size = (4 * h, h)) @@ -28,13 +38,14 @@ let Mke.heatmap!(ax, graphon, k = m, colormap = :binary, colorrange = (0, 1)) end fig + display(fig) #src end n = 1000 true_latents = range(0, 1; length = n) A = sample_graph(graphon, true_latents); -k = 10 +k = 14 oracle_labels = ordered_start_labels(n, k); initial_labels = shuffle(oracle_labels); @@ -53,6 +64,7 @@ let Mke.heatmap!(ax, res.model, k = m, colormap = :binary, colorrange = (0, 1)) end fig + display(fig) #src end # We can also align the fitted model to the true one using optimal transport. We need to load the `PythonCall.jl` @@ -81,12 +93,13 @@ let ax2, oracle_res.model, k = m, colormap = :binary, colorrange = (0, 1)) end fig + display(fig) #src end # The fitted network histogram can be further processed to obtain a smoother estimate of the underlying graphon. 
using Clustering -shape_range = 1:20 +shape_range = 1:30 ssm_estimated, criterion_values = Graphons.estimate_ssm( res.model, A, true_latents, shape_range); @@ -109,4 +122,5 @@ let Mke.Colorbar(fig[2, end + 1], colormap = :lipari, limits = (0, 1), width = 0.05 * h) fig + display(fig) #src end From c9367225153bb7393f248289f1bd566402f36033 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 15:01:46 +0100 Subject: [PATCH 253/266] compare argmin and knee in example --- docs/literate/tutorials/multiplex_network.jl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index df72c4e..b013b7b 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -110,16 +110,21 @@ kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smooth k_knee = knees(kr)[1] ssm = SSM(res.model, k_knee) +models_to_plot = [graphon, res.model, ssm_estimated, ssm] +model_names = ["True graphon", "Block model", + "SSM argmin k=$(length(ssm_estimated.θ))", "SSM knee k=$k_knee"] + let - fig = Mke.Figure(size = (4 * h, 3 * h)) - for (i, model) in enumerate([graphon, res.model, ssm]) + fig = Mke.Figure(size = (4 * h, length(models_to_plot) * h)) + for (i, model) in enumerate(models_to_plot) for m in 1:4 - ax = Mke.Axis(fig[i, m], aspect = Mke.DataAspect()) - Mke.hidedecorations!(ax) + ax = Mke.Axis( + fig[i, m], aspect = Mke.DataAspect(), ylabel = m == 1 ? 
model_names[i] : "") + Mke.hidedecorations!(ax, label = false) Mke.heatmap!(ax, model, k = m, colormap = :lipari, colorrange = (0, 1)) end end - Mke.Colorbar(fig[2, end + 1], colormap = :lipari, + Mke.Colorbar(fig[2:3, end + 1], colormap = :lipari, limits = (0, 1), width = 0.05 * h) fig display(fig) #src From 7edf850f7f43cc7c01ce0e57fd74819b47e632b0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 15:10:21 +0100 Subject: [PATCH 254/266] clean weighted network tutorial --- docs/literate/tutorials/weighted_network.jl | 34 ++++++------------- docs/src/tutorials/multiplex_network.md | 31 ++++++++++++----- docs/src/tutorials/weighted_network.md | 37 ++++++--------------- 3 files changed, 45 insertions(+), 57 deletions(-) diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index 50b31f5..1317764 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -27,6 +27,8 @@ let fig end +# We sample a weighted network from the graphon + n = 2000 k = 5 n_bins = 20 @@ -71,32 +73,16 @@ function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 2 color = :black, linestyle = :dash, label = "Estimated") end +fig = Mke.Figure(size = (1000, 1000)) for g in 1:k for g2 in 1:g - fig = Mke.Figure(size = (600, 400)) - ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", + ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", ylabel = "Density") viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_new, xs, p = p, n_viz = 5) - display(fig) end end - -## -ssm_test = SSM(res_new.model, k) - -shape_range = 1:(k * (k + 1) ÷ 2 - 1) -ssm_estimated, criterion_values = Graphons.estimate_ssm( - res_new.model, A, res_new.labels, shape_range) - -Mke.lines(shape_range, criterion_values) - -## -# using Kneedle -# kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) -# # Let's extract the 
optimal number of shapes using the Kneedle algorithm: - -# k_knee = knees(kr)[1] -# ssm_knee = SSM(res_new.model, k_knee) +fig +Mke.display(fig) #src ## @@ -109,12 +95,14 @@ res_kmeans = NetworkHistogram.oracle_estimator( NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels); +fig = Mke.Figure(size = (1000, 1000)) for g in 1:k for g2 in 1:g - fig = Mke.Figure(size = (600, 400)) - ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", + ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", ylabel = "Density") viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_kmeans, xs, p = p, n_viz = 5) - display(fig) end end + +fig +display(fig) #src diff --git a/docs/src/tutorials/multiplex_network.md b/docs/src/tutorials/multiplex_network.md index 71138fc..9fc4dc4 100644 --- a/docs/src/tutorials/multiplex_network.md +++ b/docs/src/tutorials/multiplex_network.md @@ -23,7 +23,17 @@ function W_multiplex(x, y) return DiscreteNonParametric(0:3, SVector{4}(ps)) end -graphon = DecoratedGraphon(W_multiplex) +function W3(x, y) + ps = zeros(4) + ps[1] = 3 * x * y + ps[2] = 3 * sin(2 * π * x) * sin(2 * π * y) + ps[3] = exp(-3 * (x - 0.5)^2 - 3 * (y - 0.5)^2) + ps[4] = 2 - 3 * (x + y) + e_ps = exp.(ps) + return DiscreteNonParametric(0:3, SVector{4}(e_ps ./ sum(e_ps))) +end + +graphon = DecoratedGraphon(W3) let fig = Mke.Figure(size = (4 * h, h)) @@ -38,7 +48,7 @@ n = 1000 true_latents = range(0, 1; length = n) A = sample_graph(graphon, true_latents); -k = 10 +k = 14 oracle_labels = ordered_start_labels(n, k); initial_labels = shuffle(oracle_labels); @@ -96,7 +106,7 @@ The fitted network histogram can be further processed to obtain a smoother estim ````@example multiplex_network using Clustering -shape_range = 1:20 +shape_range = 1:30 ssm_estimated, criterion_values = Graphons.estimate_ssm( res.model, A, true_latents, shape_range); @@ -111,16 +121,21 @@ nothing #hide k_knee = knees(kr)[1] ssm = SSM(res.model, k_knee) 
+models_to_plot = [graphon, res.model, ssm_estimated, ssm] +model_names = ["True graphon", "Block model", + "SSM argmin k=$(length(ssm_estimated.θ))", "SSM knee k=$k_knee"] + let - fig = Mke.Figure(size = (4 * h, 3 * h)) - for (i, model) in enumerate([graphon, res.model, ssm]) + fig = Mke.Figure(size = (4 * h, length(models_to_plot) * h)) + for (i, model) in enumerate(models_to_plot) for m in 1:4 - ax = Mke.Axis(fig[i, m], aspect = Mke.DataAspect()) - Mke.hidedecorations!(ax) + ax = Mke.Axis( + fig[i, m], aspect = Mke.DataAspect(), ylabel = m == 1 ? model_names[i] : "") + Mke.hidedecorations!(ax, label = false) Mke.heatmap!(ax, model, k = m, colormap = :lipari, colorrange = (0, 1)) end end - Mke.Colorbar(fig[2, end + 1], colormap = :lipari, + Mke.Colorbar(fig[2:3, end + 1], colormap = :lipari, limits = (0, 1), width = 0.05 * h) fig end diff --git a/docs/src/tutorials/weighted_network.md b/docs/src/tutorials/weighted_network.md index 7ba6246..72bed22 100644 --- a/docs/src/tutorials/weighted_network.md +++ b/docs/src/tutorials/weighted_network.md @@ -30,7 +30,11 @@ let Mke.Colorbar(fig[1, 4], hm2) fig end +```` + +We sample a weighted network from the graphon +````@example weighted_network n = 2000 k = 5 n_bins = 20 @@ -75,36 +79,16 @@ function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 2 color = :black, linestyle = :dash, label = "Estimated") end +fig = Mke.Figure(size = (1000, 1000)) for g in 1:k for g2 in 1:g - fig = Mke.Figure(size = (600, 400)) - ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", + ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", ylabel = "Density") viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_new, xs, p = p, n_viz = 5) - display(fig) end end +fig -# -ssm_test = SSM(res_new.model, k) - -shape_range = 1:(k * (k + 1) ÷ 2 - 1) -ssm_estimated, criterion_values = Graphons.estimate_ssm( - res_new.model, A, res_new.labels, shape_range) - -Mke.lines(shape_range, 
criterion_values) - -# -```` - -using Kneedle -kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) -# Let's extract the optimal number of shapes using the Kneedle algorithm: - -k_knee = knees(kr)[1] -ssm_knee = SSM(res_new.model, k_knee) - -````@example weighted_network # clustering_res = kmeans(A, k) @@ -116,15 +100,16 @@ res_kmeans = NetworkHistogram.oracle_estimator( NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels); +fig = Mke.Figure(size = (1000, 1000)) for g in 1:k for g2 in 1:g - fig = Mke.Figure(size = (600, 400)) - ax = Mke.Axis(fig[1, 1], title = "Group $g vs Group $g2", xlabel = "Edge Value", + ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", ylabel = "Density") viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_kmeans, xs, p = p, n_viz = 5) - display(fig) end end + +fig ```` --- From 88fe7b892aee2008dc1fb0ca5fefb30c22f45803 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 14 Nov 2025 18:20:29 +0100 Subject: [PATCH 255/266] add OT alignment to weighted example --- docs/literate/tutorials/weighted_network.jl | 55 ++++++++++++++----- .../PythonOptimalTransport.jl | 2 +- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index 1317764..6a6893a 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -6,22 +6,23 @@ using NetworkHistogram using Distributions using LinearAlgebra using Random +using Graphons import Distributions: pdf pdf_kuma(α, β, x, p = 1.0) = @. 
p * (α * β * x^(α - 1) .* (1 - x^α)^(β - 1)) -graphon_params = (x, y) -> (4 * (cos(π * (x - y)) + 1) + 1, max(x, y) * 8 + 1) +graphon_params = (x, y) -> (3 * abs(sin(2 * π * x) * sin(2 * π * y)) + 0.8, max(x, y) * 8) graphon = DecoratedGraphon((x, y) -> Kumaraswamy(graphon_params(x, y)...)) import CairoMakie as Mke let - fig = Mke.Figure() - ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect()) - hm = Mke.heatmap!(ax, graphon, colormap = :viridis) + fig = Mke.Figure(size = (510, 200)) + ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect(), title = "α") + hm = Mke.heatmap!(ax, graphon, k = 1, colormap = :viridis) Mke.Colorbar(fig[1, 2], hm) - ax2 = Mke.Axis(fig[1, 3], aspect = Mke.DataAspect()) + ax2 = Mke.Axis(fig[1, 3], aspect = Mke.DataAspect(), title = "β") hm2 = Mke.heatmap!(ax2, graphon, k = 2, colormap = :viridis) Mke.Colorbar(fig[1, 4], hm2) fig @@ -29,10 +30,12 @@ end # We sample a weighted network from the graphon +Random.seed!(1234); n = 2000 -k = 5 +k = 12 n_bins = 20 -p = 0.9 +p = 0.8 + A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(p), n, n)); ξs = range(0, 1; length = n) oracle_latents = ordered_start_labels(n, k); @@ -43,7 +46,7 @@ res_oracle = NetworkHistogram.oracle_estimator( starting_labels = shuffle(oracle_latents); max_iter = 1_000_000 -stalled_iters = 5_000 +stalled_iters = 10_000 res_new = NetworkHistogram.nethist_continuous( A, k, @@ -51,8 +54,30 @@ res_new = NetworkHistogram.nethist_continuous( bins = n_bins ); -NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); -xs = range(0, 1; length = 100) +ENV["JULIA_CONDAPKG_VERBOSITY"] = "-1" # hide conda messages #hide +using PythonCall + +θ_oracle = Graphons._extract_param.(res_oracle.model.θ); +θ_hat = Graphons._extract_param.(res_new.model.θ); +perm, plan = NetworkHistogram.get_perm_alignment(θ_oracle, θ_hat); + +let + fig = Mke.Figure(size = (600, 400)) + ax = Mke.Axis(fig[1, 1], title = "OT plan heatmap", + xlabel = "Fitted groups", ylabel = "Oracle groups") + hm = 
Mke.heatmap!(ax, plan, colormap = :binary) + Mke.Colorbar(fig[1, 2], hm) + fig + Mke.display(fig) #src +end + +fitted_labels = map(x -> perm[x], res_new.labels); +res_ot_aligned = NetworkHistogram.oracle_estimator( + A, fitted_labels, NetworkHistogram.UnitIntervalConvertor(n_bins), + name = "aligned with OT perm"); +# NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); + +xs = range(0, 1; length = 20) function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 20, p = p) nodes_1 = findall(res_oracle.labels .== g1) @@ -76,9 +101,10 @@ end fig = Mke.Figure(size = (1000, 1000)) for g in 1:k for g2 in 1:g - ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", - ylabel = "Density") - viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_new, xs, p = p, n_viz = 5) + ax = Mke.Axis(fig[g, g2])#, title = "Group $g vs Group $g2", xlabel = "Edge Value",ylabel = "Density") + Mke.hidedecorations!(ax) + viz_one_group!(ax, g, g2, A, ξs, res_oracle, + res_ot_aligned, xs, p = p, n_viz = 5) end end fig @@ -100,7 +126,8 @@ for g in 1:k for g2 in 1:g ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", ylabel = "Density") - viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_kmeans, xs, p = p, n_viz = 5) + viz_one_group!( + ax, g, g2, A, ξs, res_oracle, res_kmeans, xs, p = p, n_viz = 5) end end diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl index 29c115f..354c8a2 100644 --- a/ext/PythonOptimalTransport/PythonOptimalTransport.jl +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -66,7 +66,7 @@ function get_perm_alignment(src::AbstractMatrix{T1}, target::AbstractMatrix{T2}; kwargs... 
) plan = pyconvert(Matrix{Float64}, log_["T"]) - return plan_to_permutation(plan) + return plan_to_permutation(plan), plan end """ From 4b5acb4bdaef159f2f9020374444694b921a830c Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Sun, 16 Nov 2025 14:06:17 +0100 Subject: [PATCH 256/266] fix tutorials and refactor OT from python --- docs/literate/tutorials/weighted_network.jl | 43 ++---------- docs/src/tutorials/weighted_network.md | 65 ++++++++---------- .../PythonOptimalTransport.jl | 60 +---------------- ext/PythonOptimalTransport/alignment.jl | 66 +++++++++++++++++++ src/GreedySuffStats.jl | 6 ++ src/utils/utils_node_labels.jl | 18 ++--- 6 files changed, 116 insertions(+), 142 deletions(-) create mode 100644 ext/PythonOptimalTransport/alignment.jl diff --git a/docs/literate/tutorials/weighted_network.jl b/docs/literate/tutorials/weighted_network.jl index 6a6893a..4d938c1 100644 --- a/docs/literate/tutorials/weighted_network.jl +++ b/docs/literate/tutorials/weighted_network.jl @@ -32,8 +32,8 @@ end Random.seed!(1234); n = 2000 -k = 12 -n_bins = 20 +k = 15 +n_bins = 10 p = 0.8 A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(p), n, n)); @@ -59,23 +59,12 @@ using PythonCall θ_oracle = Graphons._extract_param.(res_oracle.model.θ); θ_hat = Graphons._extract_param.(res_new.model.θ); -perm, plan = NetworkHistogram.get_perm_alignment(θ_oracle, θ_hat); - -let - fig = Mke.Figure(size = (600, 400)) - ax = Mke.Axis(fig[1, 1], title = "OT plan heatmap", - xlabel = "Fitted groups", ylabel = "Oracle groups") - hm = Mke.heatmap!(ax, plan, colormap = :binary) - Mke.Colorbar(fig[1, 2], hm) - fig - Mke.display(fig) #src -end +perm = NetworkHistogram.get_perm_alignment(θ_oracle, θ_hat); fitted_labels = map(x -> perm[x], res_new.labels); res_ot_aligned = NetworkHistogram.oracle_estimator( A, fitted_labels, NetworkHistogram.UnitIntervalConvertor(n_bins), name = "aligned with OT perm"); -# NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); xs = range(0, 1; length = 
20) @@ -101,7 +90,7 @@ end fig = Mke.Figure(size = (1000, 1000)) for g in 1:k for g2 in 1:g - ax = Mke.Axis(fig[g, g2])#, title = "Group $g vs Group $g2", xlabel = "Edge Value",ylabel = "Density") + ax = Mke.Axis(fig[g, g2]) Mke.hidedecorations!(ax) viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_ot_aligned, xs, p = p, n_viz = 5) @@ -109,27 +98,3 @@ for g in 1:k end fig Mke.display(fig) #src - -## - -clustering_res = kmeans(A, k) - -res_kmeans = NetworkHistogram.oracle_estimator( - A, assignments(clustering_res), NetworkHistogram.UnitIntervalConvertor(n_bins); - type_suff_stats = Val(:categorical), - name = "k-means"); - -NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels); - -fig = Mke.Figure(size = (1000, 1000)) -for g in 1:k - for g2 in 1:g - ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", - ylabel = "Density") - viz_one_group!( - ax, g, g2, A, ξs, res_oracle, res_kmeans, xs, p = p, n_viz = 5) - end -end - -fig -display(fig) #src diff --git a/docs/src/tutorials/weighted_network.md b/docs/src/tutorials/weighted_network.md index 72bed22..1b4e24b 100644 --- a/docs/src/tutorials/weighted_network.md +++ b/docs/src/tutorials/weighted_network.md @@ -10,22 +10,23 @@ using NetworkHistogram using Distributions using LinearAlgebra using Random +using Graphons import Distributions: pdf pdf_kuma(α, β, x, p = 1.0) = @. 
p * (α * β * x^(α - 1) .* (1 - x^α)^(β - 1)) -graphon_params = (x, y) -> (4 * (cos(π * (x - y)) + 1) + 1, max(x, y) * 8 + 1) +graphon_params = (x, y) -> (3 * abs(sin(2 * π * x) * sin(2 * π * y)) + 0.8, max(x, y) * 8) graphon = DecoratedGraphon((x, y) -> Kumaraswamy(graphon_params(x, y)...)) import CairoMakie as Mke let - fig = Mke.Figure() - ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect()) - hm = Mke.heatmap!(ax, graphon, colormap = :viridis) + fig = Mke.Figure(size = (510, 200)) + ax = Mke.Axis(fig[1, 1], aspect = Mke.DataAspect(), title = "α") + hm = Mke.heatmap!(ax, graphon, k = 1, colormap = :viridis) Mke.Colorbar(fig[1, 2], hm) - ax2 = Mke.Axis(fig[1, 3], aspect = Mke.DataAspect()) + ax2 = Mke.Axis(fig[1, 3], aspect = Mke.DataAspect(), title = "β") hm2 = Mke.heatmap!(ax2, graphon, k = 2, colormap = :viridis) Mke.Colorbar(fig[1, 4], hm2) fig @@ -35,10 +36,12 @@ end We sample a weighted network from the graphon ````@example weighted_network +Random.seed!(1234); n = 2000 -k = 5 -n_bins = 20 -p = 0.9 +k = 15 +n_bins = 10 +p = 0.8 + A = sample_graph(graphon, n) .* Symmetric(rand(Bernoulli(p), n, n)); ξs = range(0, 1; length = n) oracle_latents = ordered_start_labels(n, k); @@ -49,7 +52,7 @@ res_oracle = NetworkHistogram.oracle_estimator( starting_labels = shuffle(oracle_latents); max_iter = 1_000_000 -stalled_iters = 5_000 +stalled_iters = 10_000 res_new = NetworkHistogram.nethist_continuous( A, k, @@ -57,8 +60,19 @@ res_new = NetworkHistogram.nethist_continuous( bins = n_bins ); -NetworkHistogram.align_res_true_latents!(res_new, res_oracle.labels); -xs = range(0, 1; length = 100) +ENV["JULIA_CONDAPKG_VERBOSITY"] = "-1" # hide conda messages #hide +using PythonCall + +θ_oracle = Graphons._extract_param.(res_oracle.model.θ); +θ_hat = Graphons._extract_param.(res_new.model.θ); +perm = NetworkHistogram.get_perm_alignment(θ_oracle, θ_hat); + +fitted_labels = map(x -> perm[x], res_new.labels); +res_ot_aligned = NetworkHistogram.oracle_estimator( + A, fitted_labels, 
NetworkHistogram.UnitIntervalConvertor(n_bins), + name = "aligned with OT perm"); + +xs = range(0, 1; length = 20) function viz_one_group!(axis, g1, g2, A, ξs, res_oracle, res_new, xs; n_viz = 20, p = p) nodes_1 = findall(res_oracle.labels .== g1) @@ -82,33 +96,12 @@ end fig = Mke.Figure(size = (1000, 1000)) for g in 1:k for g2 in 1:g - ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", - ylabel = "Density") - viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_new, xs, p = p, n_viz = 5) - end -end -fig - -# - -clustering_res = kmeans(A, k) - -res_kmeans = NetworkHistogram.oracle_estimator( - A, assignments(clustering_res), NetworkHistogram.UnitIntervalConvertor(n_bins); - type_suff_stats = Val(:categorical), - name = "k-means"); - -NetworkHistogram.align_res_true_latents!(res_kmeans, res_oracle.labels); - -fig = Mke.Figure(size = (1000, 1000)) -for g in 1:k - for g2 in 1:g - ax = Mke.Axis(fig[g, g2], title = "Group $g vs Group $g2", xlabel = "Edge Value", - ylabel = "Density") - viz_one_group!(ax, g, g2, A, ξs, res_oracle, res_kmeans, xs, p = p, n_viz = 5) + ax = Mke.Axis(fig[g, g2]) + Mke.hidedecorations!(ax) + viz_one_group!(ax, g, g2, A, ξs, res_oracle, + res_ot_aligned, xs, p = p, n_viz = 5) end end - fig ```` diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl index 354c8a2..e6181da 100644 --- a/ext/PythonOptimalTransport/PythonOptimalTransport.jl +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -20,64 +20,6 @@ function jl_to_np(mat::AbstractMatrix{<:AbstractVector}) Py(permutedims(stack(mat), (3, 2, 1))).to_numpy() end -# helpers for optimal transport alignment - -function plan_to_permutation(plan) - ordering = argmax(plan, dims = 1) .|> Tuple |> vec - perm = sort(ordering, by = x -> x[1]) .|> last - return perm -end - -""" -Get the permutation aligning source and target matrices using optimal transport. 
- -This function converts a gromov-wasserstein plan into a permutation by taking the argmax -along the rows. - -This function uses [`gromov_wasserstein`](https://pythonot.github.io/gen_modules/ot.gromov.html#ot.gromov.BAPG_gromov_wasserstein) - -# See also -- [`align_matrices`](@ref) -- [`ot.gromov.gromov_wasserstein`](@extref) -""" -function get_perm_alignment( - src::AbstractMatrix{<:Real}, target::AbstractMatrix{<:Real}; kwargs...) - plan = ot[].gromov.gromov_wasserstein( - C2 = jl_to_np(src), C1 = jl_to_np(target), kwargs...) - plan = pyconvert(typeof(target), plan) - return plan_to_permutation(plan) -end - -function get_perm_alignment(src::AbstractMatrix{T1}, target::AbstractMatrix{T2}; - kwargs...) where {T1 <: AbstractVector, T2 <: AbstractVector} - C1 = jl_to_np(target) - C2 = jl_to_np(src) - dist, log_ = fngw.x.fused_network_gromov_wasserstein2( - M = jl_to_np(zeros(size(target, 1), size(src, 1))), - C1 = C1, - C2 = C2, - A1 = jl_to_np(ones(size(target, 1), size(target, 1))), - A2 = jl_to_np(ones(size(src, 1), size(src, 1))), - p = jl_to_np(fill(1.0 / size(target, 1), size(target, 1))), - q = jl_to_np(fill(1.0 / size(src, 1), size(src, 1))), - alpha = 1.0, - beta = 0.0, - log = true, - kwargs... - ) - plan = pyconvert(Matrix{Float64}, log_["T"]) - return plan_to_permutation(plan), plan -end - -""" -Align the source and target matrices using optimal transport. - -# See also -- [`get_perm_alignment`](@ref). 
-""" -function align_matrices(src, target) - perm = get_perm_alignment(src, target) - return src[perm, perm] -end +include("alignment.jl") end diff --git a/ext/PythonOptimalTransport/alignment.jl b/ext/PythonOptimalTransport/alignment.jl new file mode 100644 index 0000000..06e7d42 --- /dev/null +++ b/ext/PythonOptimalTransport/alignment.jl @@ -0,0 +1,66 @@ + +# helpers for optimal transport alignment + +function plan_to_permutation(plan) + ordering = argmax(plan, dims = 1) .|> Tuple |> vec + perm = sort(ordering, by = x -> x[1]) .|> last + return perm +end + +""" +Get the permutation aligning source and target matrices using optimal transport. + +This function converts a gromov-wasserstein plan into a permutation by taking the argmax +along the rows. + +This function uses [`gromov_wasserstein`](https://pythonot.github.io/gen_modules/ot.gromov.html#ot.gromov.BAPG_gromov_wasserstein) + +# See also +- [`align_matrices`](@ref) +- [`ot.gromov.gromov_wasserstein`](@extref) +""" +function get_perm_alignment( + src::AbstractMatrix{<:Real}, + target::AbstractMatrix{<:Real}; + kwargs..., +) + plan = + ot[].gromov.gromov_wasserstein(C2 = jl_to_np(src), C1 = jl_to_np(target), kwargs...) 
+ plan = pyconvert(typeof(target), plan) + return plan_to_permutation(plan) +end + +function get_perm_alignment( + src::AbstractMatrix{T1}, + target::AbstractMatrix{T2}; + kwargs..., +) where {T1<:AbstractVector,T2<:AbstractVector} + C1 = jl_to_np(target) + C2 = jl_to_np(src) + dist, log_ = fngw.x.fused_network_gromov_wasserstein2( + M = jl_to_np(zeros(size(target, 1), size(src, 1))), + C1 = C1, + C2 = C2, + A1 = jl_to_np(ones(size(target, 1), size(target, 1))), + A2 = jl_to_np(ones(size(src, 1), size(src, 1))), + p = jl_to_np(fill(1.0 / size(target, 1), size(target, 1))), + q = jl_to_np(fill(1.0 / size(src, 1), size(src, 1))), + alpha = 1.0, + beta = 0.0, + log = true, + kwargs..., + ) + plan = pyconvert(Matrix{Float64}, log_["T"]) + return plan_to_permutation(plan) +end + +""" +Align the source and target matrices using optimal transport. + +# See also +- [`get_perm_alignment`](@ref). +""" +function align_matrices(src, target) + perm = get_perm_alignment(src, target) + return src[perm, perm] +end diff --git a/src/GreedySuffStats.jl b/src/GreedySuffStats.jl index fd87e32..d164e84 100644 --- a/src/GreedySuffStats.jl +++ b/src/GreedySuffStats.jl @@ -7,6 +7,12 @@ struct NethistResult{L, M} <: Result model::M end +function permute!(res::NethistResult, perm::AbstractVector{<:Integer}) + permute!(res.model, perm) + res.labels .= map(x -> perm[x], res.labels) + return res +end + struct GreedySuffStats{M, NodeR <: NodeSwapRule, StopR <: StopRule} <: SBMEstimator block_ss::M block_ss_swap::M diff --git a/src/utils/utils_node_labels.jl b/src/utils/utils_node_labels.jl index b3bb892..348fed2 100644 --- a/src/utils/utils_node_labels.jl +++ b/src/utils/utils_node_labels.jl @@ -2,11 +2,11 @@ function ordered_start_labels(n::Int, k::Int) labels = Vector{Int}(undef, n) base_size = n ÷ k remainder = n % k - for group in 1:k - fill!(view(labels, ((group - 1) * base_size + 1):(group * base_size)), group) + for group = 1:k + fill!(view(labels, 
((group-1)*base_size+1):(group*base_size)), group) end if remainder > 0 - fill!(view(labels, (k * base_size + 1):(k * base_size + remainder)), k) + fill!(view(labels, (k*base_size+1):(k*base_size+remainder)), k) end return labels end @@ -18,6 +18,7 @@ function align_res_true_latents!(res::NethistResult, latents) permute!(res.model, perm) end +#TODO: move to Graphons.jl see https://github.com/SDS-EPFL/Graphons.jl/pull/17 function permute!(sbm, perm) permuted_theta = copy(sbm.θ) sbm.θ .= permuted_theta[perm, perm] @@ -31,12 +32,13 @@ function order_groups(node_labels, latents::AbstractVector) sort_perm = sortperm(latents) sorted_group_labels = node_labels[sort_perm] dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] - counts = Dict(group => countmap(dummy_group_labels[sorted_group_labels .== group]) - for group in 1:k) - perm = sort( - 1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) + counts = Dict( + group => countmap(dummy_group_labels[sorted_group_labels .== group]) for + group = 1:k + ) + perm = sort(1:k, by = x -> Tuple(get(counts[x], g, 0) for g = 1:k), rev = true) new_labels = map(x -> findfirst(==(x), perm), node_labels) - mapping = Dict(perm[i] => i for i in 1:k) + mapping = Dict(perm[i] => i for i = 1:k) return new_labels, mapping end From 93e41d9b22dabce7e5c2478f891c8a6be3778bc5 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Fri, 19 Dec 2025 10:49:12 +0100 Subject: [PATCH 257/266] remove pycache --- .gitignore | 6 +++++- .../PythonOptimalTransport.jl | 4 ++++ .../__pycache__/fngw.cpython-314.pyc | Bin 32832 -> 0 bytes 3 files changed, 9 insertions(+), 1 deletion(-) delete mode 100644 ext/PythonOptimalTransport/__pycache__/fngw.cpython-314.pyc diff --git a/.gitignore b/.gitignore index d02318d..4b2ed35 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,8 @@ docs/Manifest.toml # It records a fixed state of all packages used by the project. 
As such, it should not be # committed for packages, but should be committed for applications that require a static # environment. -Manifest.toml \ No newline at end of file +Manifest.toml + + +## python +__pycache__/ \ No newline at end of file diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl index e6181da..422e4e1 100644 --- a/ext/PythonOptimalTransport/PythonOptimalTransport.jl +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -22,4 +22,8 @@ end include("alignment.jl") + + +# look at https://pythonot.github.io/auto_examples/backends/plot_optim_gromov_pytorch.html#sphx-glr-auto-examples-backends-plot-optim-gromov-pytorch-py +# to implement semi-relaxed gromov-wasserstein ? end diff --git a/ext/PythonOptimalTransport/__pycache__/fngw.cpython-314.pyc b/ext/PythonOptimalTransport/__pycache__/fngw.cpython-314.pyc deleted file mode 100644 index 526c4cf4d624f91c8f1f43d00a750c106e27b803..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 32832 zcmeHwd2n0TdGEzSEEfQ=aNjQ%agj(!q%2vMWyKUFQnD>t0;x+UQA9xmNWmfr@B-A9 zsA1Z?4p>_;oZ4f!vB%V;p3sw}^h-U_Ydf!Q-crA&?NEkTF{s+8t^3@}`$x8%y6((d ze&4zG0v8}CQI6Z`oA>TWhv%MqmhWue`ObHK=fjp_2hZWD>Z%BReVXI`j$YJ9uXygf z1J4|Hg7fJPaWNhK>JRA;8e)cn#+dP7QLLzez8Mf^ikX<-c*uOv60|2Ge+*izel|CoFRQX(Rt9@>`HNFa;hkdK{Rr*Spug<4y<~pi=PSqOhFrz!AUXp9;o;Qralz;3@a5~VTPkSbj%V;#{m&Q&bJ$oR)`qvoz?0Wjyu98OZJ_7Cv@5vkxJ39fi(J@G*zN)ypgqS`4LBM?({a2 zm!nVjpwcrU*NJhUJ`tP9t9=zHVflH+?ezD!Q;p-(tF=3=8}>x15APK{;}5YoZZ(cr zrpAfXh&4x$Q~oo;iB%6MBOp>OmLH*UnfHj*qA|B6vUHG*;BrB~!m=+)62sXJbwk}B4Tb(!$mb?Q-jmwMEq9-vCJ=GCKQO+Acio1z{i zk@~~UV##!b1`RVqE%zal4AZr9(Xf=A-&V zo7kY%bWr~|TENQHa}goZaJ){9qnQIbm0I;LN@-L|0Y8c~YTB?~)QOGS@vJGQQYoh? 
z(s+W4G@sB#TJV1p{gn8V;tt@7Ieec*T5d$U-)%foSa{~o;$S4Y~1Ya<=Q^^wilew%oJ#nqvnTaKcxa)tk@|nF7SGPEiEXTQ4xe?` zQydrQU~dBI+NG&$m)KQ^UQJ!Sk*?#-YF$U(61}2cttsX}hW99?*?s&kUD~Fc(hQl> zs7MahbK;xt8Glnwe9JxK3!3;Y zb$A58Nx(OnPrTD<%GTio$42jF`gVM)8t!u*=~n&Ozdo1Gt@e)3)0zFi`O39#+0r#R zmK4fSY!)}EIo?;QmQR(c^{%NrSqjifx=xLZ5@|l5fBe5;jApsH%##I-k7CBFZRq(@ z=4*nljQO&$&8+R~#aglYskZILW)8Ce{_TcltY{7RA881wzQ&AiOU9QC>&k>>efZi@ z;h)07&=;PH%*KL3>~v7*9~>AFPWhz^W5H>N`+`3zL?j63cnJu>8KJ$qr$Y!$3*lKw zi1;HxiRUwWgDBsXdH6wLH{!*30nh$mG$bMY^z76r=c6jvC~3uGB^c_gwvoddKy#+?bFfDKnE*qFdPU9<3WFH z7Id@Zlap$Bd~XzF-DkejQ90wGm^3>^xe9DUGPXANzE72VXP`|k;6Pv^L%9$OPDjJi znhHcdMGfUCkchfl6vo3+c39D#(my>G zk<&pCiL$~1VzDB#@svTP8P$PVDKtI7qRmDE*s(@=IrC`@p3%(UxqAT=R~=~TK;B3h zL3Hy*%b>XTR|Ylm==kjPXfHYjZNQr63GA8$1dhc*;b~zuin*xv4BLcDLNLVIuPurV zxD*_vZFZZ0&TBg}DZlWUjQcbhADW&aVECL6o}RoQv`zMmPKTwbHh#)~ zel$dUhB1pyAsitJ{c@)s_Mg|V42tHbuTef0o>WI&UXr0$ID&4)%s(BT44^b2C`qVB z``&FGUjA%QIu(uvk(LHdUeY5N5;3qWGz{i9wUd7`j6squ7n3DCQ8=+LI2M)y0(}pS z3&YZE(91L31vQqIe<|#0uyc^6eI5yipzR3xCqozI^3^WN4l49X&WMnigjYB;E=-4G zLNqAPTfZP{V^EtAsuX2Ct`O9}9zlUmLydM0M74AXem3ydr-O!P;fDD%bYv&HtSr)! z!U$QXNwiuk5JV#Yqs7N`Swp1CLXga~^17;DX4q_l(LWCl>$JRD^#I&IUe4QaE&!Y4jB{de)V{^`BF!yU#n4*~BG*fh9$+8hNd z9zHFl%?j5|8!#%;y5WoVFYn$Fz4VnY@9yr2UixRlp_n&qI3P)B9nU@m=+my?rLUxQ zk+lAB^kPXysLXmgs-z-pCDME-s!q_fk>olFLNkY=g45|D#%t5YFqF#@X&ofm!la96 z&QGWLaS7^b5FEK6k-9EjbTKG}qv@hBL}7`=ng~jyE=pI+GD1$fDN5;dh4L-aUg`%m zk+x|^WqzG*9H%9CbQ+^EES(vh043qGqvuFKlcKR;Xu2nDr1ddvriBdhZQA6IM4(Ae z7e#25N}J?Gs$*+fH<;G$wQ;#S%z3pg1%(xvqAuZS1D0ip^mP)ch0`W^{Zsz-D}Q^_b}i3M>!;7Bbwg=gAZ-ea zQn6`WU)nkmgjnH^g)u04(?#d~lRmRt(#=Ywh?Stp;xG!r z6Qk*(KI~e~ri*&zKTokgx(jJ@e-xcC6-n#Rh{Mbqr44Lr!G~sh!6$uk>oMx9t%2Eb z?1Z-N50A~#jwHHu40CI%(y`I4bQ%&G+d3$B^g)czFwQNewg%6~wjQ|h8iAAO3pp&+jQ6Gl>9QV;#J;zrh_1CO5bNg;u%d@^! 
ztL+1Wo;S_y`^cn;Vk=9WdZp!tt>Il;!)mcBJ{;HI{3XKPvy>(LDNE&C?@epzvb8;N z3Qn>I&VrsCU&`7(x9>fRExzv=^IY$$g^zb%E_%*3Z%g!EE=GXEnb2K6_S_feznFAi zJ~7w7YPS69g;jg;a|81OiQa|AS7+k`Df^~H-HN>1QDxdGY#X>7LFYZZI?us8;bydCOeBQaBTNqEdy5ha7F3(Gj=N(D^qGz%9`(+77 z%C!}VT;=iJ_s!)S%d)iYr6-?%a-nx&CRN%IKXB7knmCp`ka9KMBm28iOmtbEwARhnd@D)w7l2U_C3SmzVDiU+ncg> zU-2f7KHCzXNi-#nzPRZd-Yed@eaqJFTfA+qZ?&Rw*;u+-S~hoB_P%fCSDo&-akY6< z+_>y$xMmTwUZE*&jE`Qk)U8(6!uR=k=WP>btx9gcZmD1BT57)Dv}5_9L+|vhJoMx$ z??_lKdskiVcv04Q-`Vujmey2x+m(IEnZ(RX(HEju_QeO5%iC_cHZMN7G;sCk+q#r% z-woIPYp(tA-uGI!e6Mdw|J{M#o=!P-T$x^QKO0DNCr%~ZF9yFcePueXUv}(x-&w!v ztV(|Ax^weS+uBoA9am-+8k3E$HNVn)WhP--uIjj1x@EEVs_AWgs&Evk&RpI3c2BCbKcV}mohz?NcE9k{olPYU{(}~-ydHtgoCCGS1vAXy4ra4=+*L6+oLPZkEUwS`IW1c)vuXfHe-Mv zc-@+++#GkT*3>D9W?z3ORnvt?KeH9D+Fi@;<`sL(4f~dB_ASeud#>9bSuZfruwrk# zVQ;@?Z(pptZhsK*5>=OPzHZ<2zRk1h^dyEZUr6>o`ztqHWr^t<#iqm+T9874Nu!OPurql@k=Ss!`JP5asrpT9!Yuj zUbla4{m_~4SjDd!2X&FZ3(7ZM4DXT!p@I223wTq@m(ip$AY^lBFWN-0v z@#0TDoY8aEO`rU&wfmC~JGqMXPd+qp#XCRw(85)3{^UcRxh6Dzl+?@bm}-t~Grsds z?XexkKj|tvw%hoVp4wxdGyZ9FnP@crS#_CcHU7D+*CiGk$CQH%q>}kEqzL1kuj6Pn z=kS?R6@(G(k@VUFX!VdP;>rdlcvGK!Rg zj6o>=i$=<{ieS-rL~}^0=3F>*WRo`K5xW{Y`*&oAHioJ;%}u9v6>W#Gt{Q}c+JM<%p~ycxA<5mcKxe>YBokIC8{Rc2Qu^u9|c1^*ar z*_j$B%tZYsdIY8?ITMtogOkEJXhbNSC(YNIoeD|!W8v8dpbHaGFOO{w#P~7)7=eUZA2^RC%jueU*-j?v~Cu1vo? 
z8Up#f*^AM@@;>k7J1lHtp==|fvr`gIIXH=q_R@wxIK~7}iB6~xlLTW42mwwG-ATRy za*o1Dn_^QDWRNyQBC)6iAtYmxI4OOR;?Us-b97)C(I4OqGF>ZYEmx%~$X7GhEL9|5 zWraIPS^`O0Lb972f66LA$a1(K>6O>KvOPKTt(`A?=}wWs%KuGSQ$ly!z&T1UKbG{o zR{nDNg8nAWiHZCi&+yLUnPVRu`=ZwLsDCva zxSw(0oTI^Ycw7#b&gGy-aEO%us2owtRYij@2TG&;29A5MAjdW3@KA&Q_fZ*@hY|-; z6d^Km#O5>-9E*|CoxZKdCwUE@EPPgU4<0C7Fi3%)N8l-r+sLqGU?gM)(|R++msB3x=Bhlam&f}9MW zlSb*4G#?#inY-`1!U>z#aKhJ%7Npl_zjN-& zxm4BG75mnu#-)L`Hmh8)Igbn4J#&XY`Ov0vy&oHU?R{0bT#qCb#`ShV?D%XXK=J;$ z-bNvcTCP_}%KY>^4w75}DYJka2hLhhzVqqt&F}IgO+{wTO{aqJ^7x*wAm1|;!O@R21&wjeNgPS9{N1>+R(jot z`1*gd{QB8(Z!e#7#=qWjL1=^mhx#}Ikx%v4vhsD#XQpGF_^0Nzj{Hiy8RaC9|L5Y+ z(}B*X&v?DwuDzi%fs1&DA0Ywu-l3CPz^OWH8(0X^Ca9uG1g5n>+D1-c!S_M>G(Zjs zzHC{L{O}c!dId5_FdKeu>WfB@|L~v%_;tKT5-Ap>1A>H;okE76_&ALG?4lRl>Iut z-rwTiNFdEITXmK>}J1cofsxnjD|X(a2{?oMp(B_j0{FAtY0TaiJC{muuhE9Obp{X zF|x>T4lRgt5a8yDd@Wj)h-mWVC{eUaQ^bCFws6i*Wl)?E`*AwLV$Tt!ZPj=BTTq!} zFNB*vp#$O^laC%ysmJo9ETRqEXI2I-#bnAb<&}{U2u2Zr7b}5vt@7E$YM(=_@fC}9pHsB?Tthmadr0r| zU;|bn8hxchn|x(MEx3?VvDGOS@6#DMnuU?tNFDmSfPf*^`6{)e#@DKS6WzY5j3_~< z*g`qf=EQ0MZm`Z*&CrE~ConKLa!Oo+8aa40BhuY!aZX&inT*}RiqSaRiCyT z?WlXyx^CrOVd5;(u}(3)EaD{E`-T0G|GgWQ6{Vtt1%~5;|V=+L6k$51oHXPs+RtauQaK<$b}|6W!&p#Y7%3UKtx(t3eV2kz zn7WZUpoO){TNf2U8*N%A*|azhMd@Wt|(KtaNK?@HL+ zGs>Z89n~j#M$TkEVVXjxW*Hz=(SRyV#L(FM%(^YKeg+B!as~urV}M}=*P91)%tQF* zV225>qv|J>9cUINq~MS9vb!4+Ij`Y!;pKVx05B5>c7?~saSK7Ynk=+$6ZQc3N0{_I zLU%_F%zUHC88n@xmTxJebqipHf!iZ^BU5_=uoY3q2wi0El5a9IC|-CLFv7DTKc)%} z*XY(WfRgfeF9RMPm;@Yo3}|Z^MG7^J3f85AA`}A^4lW!3l1(s5AnAaN!psXrqqBfB zqBDekKeF)h1KSG1X)%s4!SgDQYzF~~W7I1^Lk+|-G6`Z8K%@c=2T_rk6T>pePZCMz zLMVD*n~YbC0^}4Ozko9|X(9yhHj#=^3(RLM%9zBt5YU)FHL}6ukFuJtEjXvlyI`%Q zHCStY^X4<>P&jGUGqBXmxS(^{X(~Q|ODJP8fK?-Opa6WQd~Okrc?ottHy7QjsMem-; zFwVlGd%fc;pzTg6viPSt?xwRmaW*L}^Z-ZY>{zyR+(9_^>8Y2(NUqQi{vMF9so?VH z)d$AYUH3t7V0;x6OXrc3bm4xeL(2{sv~8ZgP6^sMSr} zSL;svC1{OiTDL4Qo*Y})zNkw%H!oW@e@67v(WdlOIC-?{0~7bxvJ-!uLoJPk=tAG3 zJLTN6Y}vx71?)CBF*+{)nUKy>(O)6w*U0&GIBEUBHlSe{FO{Mc8e^eoPknEXG)p0K 
zkem|N_v)(TqcLX7jRy~NuMLW7hd0z3y!4;-2p z6X?k~Dw%cX*U0y)aH8#STF1xfv{89FX_;K}Jc>s?i}X@ja?a7%`w4j2|D5A)@7Hs7 zKpB$V*KM`S#@c0D?M+){(w(x^e{Da2she6Cdlsdoo~zx*IZilEheY z`+|PeZ2i@XHyxEpPtvzgvM{!|eaW!oOF4GU>u&*7YgqH?YDvoRNL+u*!pq>-X9)b7 z^J29;>^O%Z|_~|zPe-C(sbR@d-L8=-m7R?@GTx)+MTM{ zd(Bun*H4Xe(@~x50dwzOtWG%|nCt)8s4Lq1u}N{ZkgpS(cz4OF#Y+FIeEitus>JXL z(4|LLEcG9lOWmg17S36E+e$v3bC%q;kq;mg`)vpLiaC4nZ72C$6y_$Mhr&w8S4zGz z@|AP0^4k^UtE8|h@>P?shJ3Z;t0P}M`5MS4P>M$KHIc8Gd@UUCnw!Yi%Gul>w88ha zLvy`zX93f6moQNA_=5QA^rCO2e%sY%LKg3xJ4D@AH(!^SS!j;erL3*!(CTJ1(Zkn_ z6|0`om%5(sS}-j{U$-p{CAw0chwy3p*B-y?7dZ-mxE4?1^DE~1h22YaE6saXi=Bz4 z%i~F1vh7uV;poD-*T1k*xpk>}rDD%Y@t&(Q|GDRnANuh_tM=+w_PuuKw+<~#Egik8 zPt`tr_2?age!q_Y1*dayex}*EpPQPWO7<<5#HUilU2y{)vE+u_Dp-S+ z;%!UaOEW*3eRKAz?}rzDbm`4YZy){Rp+6c*J$Ptk`=OQMLzGvcx~nOpZ$^J~?#**o zr5}FzM_+yOt8dTbq|B$H09~c!H%eQsmA0(f%Lt|p*lV(xEmododnd5tiRgaw=y z+g3}fliOc>98Fr{`GvvY_F&|?D=0$p)N2zjPb`ct4!wRRRo}DX>H$PJX+TQ#Pbs4D z01dN0ZcpOai&e?t6?fx;Z^hMiyTa}$zGHTk@p0?NO`OHH>a0w*E<1%~i*UxSz2R%w8a9<*h0UN^r&b;5~+N{I;TN@B*A z%n`J7{-efJ|3)k$=e6g35vyp09?9(gHX2e^Bgywh)s58bl)8@_58+;sfZImcrAt%n zQuzT~^kqtg_MPg5ex6$H3O&8YPtFp$ZkfrrRY*Lsx0}EJAXID z%o_aZ7;SjhJHl6#Y`jFWX5rn=WB9U1H;1FHBg&-E4hDoOhjZFct>Gpdu4N2gVOMAP zIS!aO=?ppFg!6`87AlJ(;d4O=hSYR^CZ|OyTC1|N(~ozjtF%&QpJp4^{RpD6EhYAG zt0iSG?RtY4v? 
zM^Pc^Ey{wnd2EXSVV+v+=hOx8{~m=?Kn^CD7yXm7LEKjiy9Nf0&7 zo93lN(=%ri(n}Ygzqru2Fp{e1N?E*1`X8C!G+%Z9LEqcD-|xR_d*|qqIkoLctaVP) z+yIFTZZ!|{y!nNa#L?d@Pr4Jfg`;tE%F?Ff9haWFIDZi`=}5}C>58=2^zF0XIe+DR z%CZgUJeHA#D=u4c)X(+fF4x>)c?6N#L47jI+J8mJr(4t_MxPOQF70BegB5C=Iz*rRy};xhY4D2f_QH=&GlkSg&hf|I4%fEaZ}1tjol}J{JZD&-?BO9 z9$!1#NP~#cZoETEu=Qq})&%89pDLm-cu`;Wj#Bmu=1NhaxiZqHol&Yti20%O|9wWR zkyX{P(;*2;TL>^Tf)C8(O^@}?zW(2v9V7_dWuy};=$n-!vY>R97Bj{W)CS0`H+PNi&}bNk;bu1W4s6}NnCU~WInrQwvd4l@aWy`A$r6D=>cr))LL#+tQL zf|xv;5~P8GxnGUtSnbY9JCxZF{xA0$!Vl(f`2RnIWAZuoz4-g59>QxzA}LD?PnVN0 zJ)&M7;{lr`_KcBQfYr;2ag0P+=lEqrl*oCG#vuv)P&u}8cE_{7k{n-bN;$W!*tRVj zx5>lIh41P^dTL01#yc*^BLI3D{6>F~-|TP5<-$0_vF2{&!}^FdyO|%l+r0s1P&6kl z%_H`xT)a(r4?y(~1Am!~?1)Z{5pFna*M?`IQQ-y{Fm;R0OdhC7Er5VtZmTZh6s&; z3*{=7iKQn%kD6;XW!8w`0bSq=Of6HEXKt_z=61{3DHN#|GVP;fN?NLhjG|>D@gfys z1^TNbTX)r$H7}ZD_ORl3nHpn6CsqIw$@^>%DiSt>K2}Dm#7fbV1F@>sn9VH#)VLZc zOSc#K>_Y~hW60<$2Bgvn$R*54`aHgpttQdMpp;Gk!q#IOe2ScA~U^QBHJmj#2A><skt3#F_q+!?0HwK7~z{d`A(| zUjU|*$fg`EE$m=SHa|5?ck5J>cGba1 zOdvWw;h&oF3y;XwpTdzZ9|sTtFQMs6qsq7AWYk_b4qU~=WKfU;v(};xUy}2I0EW+_ z{lXSGD{1P>sMsap_!zDo3*+r_x(=E6s5j;njw^X(3{Q1m5?U2PU{1?_Nsdd4r4sYR z@H%l%t&Jnl4QC~ z-up7azMT|tCj)Y%5a1(fpLJw&P%rWOQ94D>yN!N|fSifs(12D2Wvin38w{2tg*a z%O}!2JEli@SuBiCorId++`ElMB%6u%j0iB%y6W4t5GEBpGyxb?RA|SqRQP8nnGJ>z zezGDWJRK%eegW*eVT-RrW0wqt;7cALYF$H-A$%`1=t0RbqLb|w##MgQ?a9!#j(lFra(AVOPcx6 zETs||WER!wzL8m&)mRcnbLRSPY&6uVuUdeF^*R=lI72^d?D+$*YS<|Zc{_z8{s`XA zgz!2@ufYq_e|n=+7=XW3=$(C)F=*08E)?rW@8g;y$pyh<&_lOf1_7 zUSa4^?_uFF@CjM~Vbk=`lph$q31Mgq5E1aDPX596?EnaN@7l4wt9#e39vuJ35aVb# zLpTqJ++@Uy;fx+XHl1PIC|fO3RUc5)Q;0C-pZ3Bu-*4Fdc(V6NWqrrOg1I zd0zSkq=B3>5|SH9_Pq0TID1SuBQX{}Bf$@TC^9A@W)ByOhV+nfGw<3sInuCc2o`wm z9j}>Fab1e)PeNhRE99Id=hw&?ASX`FJRDi`2LxRPL}w&(CTZ8$MD~P*LP{Z-GHY~n z#sby)+4-{xf3oq#;4_z!vsY#o-QSMA`jr*umc`(keq1IDe1CGqxqI2Nn{6a7R%*>E zk_km-V-c=Y3nJ3SC!lk==g!#`)V+gspEo;JYW4?~!>_yeI0sJ>z#8{tfxS#`h zO5No2kh2|5+5iG1o_sDQ)oFfRC~W{UOq*tzQJ@Y%`uCLRN93>$%=B16=n{iMQ3IqM 
zIm?1nNnsX+&-Y|4AWE;&2O!YG5r+Pg{+Qm2#z&!aOY1|iv;`)LN5|RQ;gIwv^wk6o zglPt2h{4I2e}KWU()#hSbTKF&Wr1>v5_OHx1j=eL0r|87B?X++IH||z$%9EL=SeRk zrbN!`q^NiU&i{nQ>{s+$iLjtsfWf?!r}Jx%&-KO|}0dg?Pq4W%Aj?gj2mQl5{&yDoIjX2`pn?G z?v}kct|OGt!yj^RJ}?1plW;#XFn8c*F!lw54*i#HH!RiHEY-;!*DZ~=to+=*_szvO zEsj-t>kWI?HG3C;6aW!^b;${ZH!kGjJgVduW`m=IU_=$auo@i|9~n$m{-a{fTD)A` zcB9yPtr$m@OFPJ_TB`VS*Da5}&lksANS6-t5pn%$NoCxEe1I+~dl|O(zE$>O9S)`z z%HDwS!0(p5ju$UOa>Y;4!B@if%+o8D%DA4a?^L1k(&E(HrPs^)=XoH;`fs^Q68;y< z<0gEr?Sx6SRXb?+z{KII-Ti^hQOg6{R8o1{LB3-0Imzb&66dy?d>+o_xm`lOQm&%< zb{YA~$yY(XO3qpKK^1)0KtfSmO6&lj$x)mrN}66QUa{B5b*PEGI8lAwRym+;~^Lx>y9pAmUQvcAhrDoMuzff`A*1kA+^{MNf2Uv>t$&8#c zVY>YI4SW4Hdwp_NwgmSZ4}w1>_FX=gY<%`Bum!hT>b+6A<67yC)ylf$*@drOr5-6O zNBI>s=#jD-^hiYmdZerYJ>vGDM;xA2SH}(4wrj3!tEJ`1qUDC3r9q@}dp)aOr>B#rXaIAAucop@{=Do0DU0#?^GN1s1#(bQkNZeuKk*&51Op9 zg8HtqLblX=dc@A?q|`E@-7<`vQcpZ_@fNK^5AUa#)!f%9rvl{^(7zQ9 z9r>o(o+5ulJ38bFWG%;47EWI{bmaLB=33=5ix#bff&2(Qo%e|4V$Ri2Ro|gaiHm|Z z?fW`8Ro*)%tzHX1D^`%Ak$(~$+S0U=(a3*X?@QsbLHWkoAm^&D+6IN`gD$Xei)mUm zhe}%im1n~(c|Y`2eR_H#HCg>Aw462S*n)=AH4YvDy_h=)%_O)0ogJ{}2+0_E59w}+ zi>dP`;_($lN<`N zyLZcM6tKf#mil&+3g~n=ARif$?LI~6z%o&HW7#tJ0qhah99hzjx~SI^Nl%m6CqVxe zAq$IsLEg}1s8XQ~R17bRP&pa1--%JP+#ecOwrCkN7;V1Q9SxNM^oMp*r_>LZ3CLl*L>Y9Rvo!=d#F>dP28Jg-)Snn=-XW^1n~q`X2OFi5b$2Bj2~ zaO^S@BS_VmshLoh@Sq7LXVoxb=HwRyvrYrOCkIaQ_jB(p^8-Fa=fA|oH(BR)mwRt{ zNENw*nf0AK<;VuFoO%WJa|ZHFHT3;-5S!#EUDKi;Hjw+9T|Rk)UA)V`p^EZVi#;1C zKUWinZi}40-}14a9vxRZHm^uuu1YR*m+bCkU>?#%MWuduH)fUg&A(B&kvp?@FLiCp z=3TK>T6Uu)YcJW}Z^_iA8J*?7K${MXtx4b4wnKEMqM3KBsX;b+OJ=It+#L6iy9f&Q7huQpi`7@skhd*Gq>Bn}yP? 
z!6C@kmbDC;Ry=hrVt0(AIN(xQdjAMNPNN745xKKZ(U;6|7A{{-dWBUrq(3F+56L-A&KWp^3>+uD4R41- zqODCEUtz^|0tu+_b5tNZQh z#oq7D-e}zMZsU%d?%L$hMg5ZL>b{hFV1D0w+4pyJDfeUXeb7d{RQ`N9{lLPZdHhU5 z!-D5KRadH(w*WAhYCQN(ld44V;06}wsl3A#dH6V8zrsbAY{Ensrs*$Sxv<>b3+wr9 zhu-N)*#}qMCAW)M8UstiD`}GY*Gw;)k`m0+FYf!D{cr3~xx3>g#HI|@kKb}2$Jmuv z%H0_^K`WFf%NQ&+EtbFTOjT`LvD1~H*|#3LD*fTv|8N#cl!SPhaLDonrU!U{9UVLr z9z2KpZvFH9`27getHmp&t&3&fuU{#B@aoQ$lHQbkAKe?;bUBpld2Q#*I~Q6PyIl*823BeYR*Db)!k}{;($N*AjV374BLOTfL*wM)+#QSV#om<5 zi(f~umA-2e77o4J^3cuF@|Om_IgmWC@X%r?Rk|yFVAbJ{>wk(nKu42h3+1oYuavc= zT;$K*uFA)FP{VX_JLhJxn*B0|%aJyZBWuF2qD z=>xoy{wF#H=e9wo)BPpqd7s;Jjob5AT-#r9O+Vv$|B5U7d#>ka+@_CAGX`D7ho{XA zy2=mt>zZ_)4MB1vp*y43Ref~4XfuMopsUllK73SHt*iZTcZsh1qt1N>o% Date: Thu, 28 May 2026 16:43:47 +0200 Subject: [PATCH 258/266] update example in docs and update julia version for test --- .github/workflows/CI.yml | 2 +- docs/examples/custom_suffstats.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 56a8717..02f87e5 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: version: - - '1.11' + - '1.12' # - 'nightly' os: - ubuntu-latest diff --git a/docs/examples/custom_suffstats.jl b/docs/examples/custom_suffstats.jl index e65c3e4..bdd003c 100644 --- a/docs/examples/custom_suffstats.jl +++ b/docs/examples/custom_suffstats.jl @@ -69,7 +69,7 @@ max_iter = 1_000_000 stalled_iters = 5_000 data = convertor.(A) -es_new = NetworkHistogram.GreedySuffStats( +es_new = NetworkHistogram.make_greedy_suffstats_estimator( data, initial_labels, num_categories = num_bins(convertor), type_suff_stats = Val(:custom), max_iter = max_iter, From 8ea936516be387643d664d40552aca6345baac2d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 May 2026 17:01:28 +0200 Subject: [PATCH 259/266] run formatter --- .github/workflows/Documentation.yml | 2 +- .../PythonOptimalTransport.jl | 2 -- 
ext/PythonOptimalTransport/alignment.jl | 20 +++++++++---------- src/utils/utils_node_labels.jl | 15 +++++++------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml index 4538118..999cd99 100644 --- a/.github/workflows/Documentation.yml +++ b/.github/workflows/Documentation.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest with: - version: '1.8' + version: '1.12' - name: Install dependencies run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' - name: Build and deploy diff --git a/ext/PythonOptimalTransport/PythonOptimalTransport.jl b/ext/PythonOptimalTransport/PythonOptimalTransport.jl index 422e4e1..969429b 100644 --- a/ext/PythonOptimalTransport/PythonOptimalTransport.jl +++ b/ext/PythonOptimalTransport/PythonOptimalTransport.jl @@ -22,8 +22,6 @@ end include("alignment.jl") - - # look at https://pythonot.github.io/auto_examples/backends/plot_optim_gromov_pytorch.html#sphx-glr-auto-examples-backends-plot-optim-gromov-pytorch-py # to implement semi-relaxed gromov-wasserstein ? end diff --git a/ext/PythonOptimalTransport/alignment.jl b/ext/PythonOptimalTransport/alignment.jl index 06e7d42..2a9dbc0 100644 --- a/ext/PythonOptimalTransport/alignment.jl +++ b/ext/PythonOptimalTransport/alignment.jl @@ -20,21 +20,21 @@ This function uses [`gromov_wasserstein`](https://pythonot.github.io/gen_modules - [`ot.gromov.gromov_wasserstein`](@extref) """ function get_perm_alignment( - src::AbstractMatrix{<:Real}, - target::AbstractMatrix{<:Real}; - kwargs..., + src::AbstractMatrix{<:Real}, + target::AbstractMatrix{<:Real}; + kwargs... ) - plan = - ot[].gromov.gromov_wasserstein(C2 = jl_to_np(src), C1 = jl_to_np(target), kwargs...) + plan = ot[].gromov.gromov_wasserstein( + C2 = jl_to_np(src), C1 = jl_to_np(target), kwargs...) 
plan = pyconvert(typeof(target), plan) return plan_to_permutation(plan) end function get_perm_alignment( - src::AbstractMatrix{T1}, - target::AbstractMatrix{T2}; - kwargs..., -) where {T1<:AbstractVector,T2<:AbstractVector} + src::AbstractMatrix{T1}, + target::AbstractMatrix{T2}; + kwargs... +) where {T1 <: AbstractVector, T2 <: AbstractVector} C1 = jl_to_np(target) C2 = jl_to_np(src) dist, log_ = fngw.x.fused_network_gromov_wasserstein2( @@ -48,7 +48,7 @@ function get_perm_alignment( alpha = 1.0, beta = 0.0, log = true, - kwargs..., + kwargs... ) plan = pyconvert(Matrix{Float64}, log_["T"]) return plan_to_permutation(plan) diff --git a/src/utils/utils_node_labels.jl b/src/utils/utils_node_labels.jl index 348fed2..d5b50f0 100644 --- a/src/utils/utils_node_labels.jl +++ b/src/utils/utils_node_labels.jl @@ -2,11 +2,11 @@ function ordered_start_labels(n::Int, k::Int) labels = Vector{Int}(undef, n) base_size = n ÷ k remainder = n % k - for group = 1:k - fill!(view(labels, ((group-1)*base_size+1):(group*base_size)), group) + for group in 1:k + fill!(view(labels, ((group - 1) * base_size + 1):(group * base_size)), group) end if remainder > 0 - fill!(view(labels, (k*base_size+1):(k*base_size+remainder)), k) + fill!(view(labels, (k * base_size + 1):(k * base_size + remainder)), k) end return labels end @@ -33,12 +33,13 @@ function order_groups(node_labels, latents::AbstractVector) sorted_group_labels = node_labels[sort_perm] dummy_group_labels = repeat(1:k, inner = n ÷ k + 1)[1:n] counts = Dict( - group => countmap(dummy_group_labels[sorted_group_labels .== group]) for - group = 1:k + group => countmap(dummy_group_labels[sorted_group_labels .== group]) + for + group in 1:k ) - perm = sort(1:k, by = x -> Tuple(get(counts[x], g, 0) for g = 1:k), rev = true) + perm = sort(1:k, by = x -> Tuple(get(counts[x], g, 0) for g in 1:k), rev = true) new_labels = map(x -> findfirst(==(x), perm), node_labels) - mapping = Dict(perm[i] => i for i = 1:k) + mapping = Dict(perm[i] => i for i 
in 1:k) return new_labels, mapping end From 507b46dd6113aa538a95947a736744406cd6c2d1 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 May 2026 17:33:49 +0200 Subject: [PATCH 260/266] update kneedle call --- docs/literate/tutorials/multiplex_network.jl | 2 +- docs/literate/tutorials/simple_graph.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index b013b7b..19fe710 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -104,7 +104,7 @@ ssm_estimated, criterion_values = Graphons.estimate_ssm( res.model, A, true_latents, shape_range); using Kneedle -kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing); +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, kneedle_scan_algorithm = ScanSmoothing(;S=1.0)); # Let's extract the optimal number of shapes using the Kneedle algorithm: k_knee = knees(kr)[1] diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index cb7a840..8e2fca5 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -148,7 +148,7 @@ ssm_estimated, criterion_values = Graphons.estimate_ssm( res.model, A, res.labels, shape_range) using Kneedle -kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, kneedle_scan_algorithm = ScanSmoothing(;S=1.0)) # Let's extract the optimal number of shapes using the Kneedle algorithm: k_knee = knees(kr)[1] @@ -205,7 +205,7 @@ ssm_estimated, criterion_values = Graphons.estimate_ssm( res_kmeans.model, A, res_kmeans.labels, shape_range) using Kneedle -kr = kneedle(shape_range, criterion_values, "convex_dec", 1, scan_type = :smoothing) +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, kneedle_scan_algorithm = 
ScanSmoothing(;S=1.0)) # Let's extract the optimal number of shapes using the Kneedle algorithm: k_knee = knees(kr)[1] From c339cd4b0f41af7098682fc3f605aeaeb284ee85 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 May 2026 17:38:33 +0200 Subject: [PATCH 261/266] format --- docs/examples/custom_suffstats.jl | 3 ++- docs/literate/tutorials/multiplex_network.jl | 6 ++++-- docs/literate/tutorials/simple_graph.jl | 12 ++++++++---- docs/literate/tutorials/temporal_networks.jl | 1 - ext/PythonOptimalTransport/alignment.jl | 3 ++- ext/PythonOptimalTransport/srGW.jl | 6 ++++-- src/api.jl | 3 ++- src/pseudo_suff_stats/generic.jl | 1 + test/test_symarray.jl | 7 +++++++ 9 files changed, 30 insertions(+), 12 deletions(-) diff --git a/docs/examples/custom_suffstats.jl b/docs/examples/custom_suffstats.jl index bdd003c..bf7bfb2 100644 --- a/docs/examples/custom_suffstats.jl +++ b/docs/examples/custom_suffstats.jl @@ -77,7 +77,8 @@ es_new = NetworkHistogram.make_greedy_suffstats_estimator( stop_rule = NetworkHistogram.PreviousBestValue(stalled_iters, Inf, :min), progress = true ); -node_labels_es_new, parameters = NetworkHistogram.estimate!( +node_labels_es_new, +parameters = NetworkHistogram.estimate!( es_new, data, initial_labels; iter_progress = 10_000) model_es_new = NetworkHistogram.DecoratedSBM(to_distribution.(convertor, parameters), diff --git a/docs/literate/tutorials/multiplex_network.jl b/docs/literate/tutorials/multiplex_network.jl index 19fe710..655c5a4 100644 --- a/docs/literate/tutorials/multiplex_network.jl +++ b/docs/literate/tutorials/multiplex_network.jl @@ -100,11 +100,13 @@ end using Clustering shape_range = 1:30 -ssm_estimated, criterion_values = Graphons.estimate_ssm( +ssm_estimated, +criterion_values = Graphons.estimate_ssm( res.model, A, true_latents, shape_range); using Kneedle -kr = kneedle(shape_range, criterion_values, "convex_dec", 1, kneedle_scan_algorithm = ScanSmoothing(;S=1.0)); +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, + 
kneedle_scan_algorithm = ScanSmoothing(; S = 1.0)); # Let's extract the optimal number of shapes using the Kneedle algorithm: k_knee = knees(kr)[1] diff --git a/docs/literate/tutorials/simple_graph.jl b/docs/literate/tutorials/simple_graph.jl index 8e2fca5..680d74c 100644 --- a/docs/literate/tutorials/simple_graph.jl +++ b/docs/literate/tutorials/simple_graph.jl @@ -144,11 +144,13 @@ using Clustering # ξ = NetworkHistogram.node_labels_to_latents(res.labels, res.model); shape_range = 1:(k * (k + 1) ÷ 2 - 1) -ssm_estimated, criterion_values = Graphons.estimate_ssm( +ssm_estimated, +criterion_values = Graphons.estimate_ssm( res.model, A, res.labels, shape_range) using Kneedle -kr = kneedle(shape_range, criterion_values, "convex_dec", 1, kneedle_scan_algorithm = ScanSmoothing(;S=1.0)) +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, + kneedle_scan_algorithm = ScanSmoothing(; S = 1.0)) # Let's extract the optimal number of shapes using the Kneedle algorithm: k_knee = knees(kr)[1] @@ -201,11 +203,13 @@ end # ξ = NetworkHistogram.node_labels_to_latents(res.labels, res.model); shape_range = 1:(k_kmeans * (k_kmeans + 1) ÷ 2 - 1) -ssm_estimated, criterion_values = Graphons.estimate_ssm( +ssm_estimated, +criterion_values = Graphons.estimate_ssm( res_kmeans.model, A, res_kmeans.labels, shape_range) using Kneedle -kr = kneedle(shape_range, criterion_values, "convex_dec", 1, kneedle_scan_algorithm = ScanSmoothing(;S=1.0)) +kr = kneedle(shape_range, criterion_values, "convex_dec", 1, + kneedle_scan_algorithm = ScanSmoothing(; S = 1.0)) # Let's extract the optimal number of shapes using the Kneedle algorithm: k_knee = knees(kr)[1] diff --git a/docs/literate/tutorials/temporal_networks.jl b/docs/literate/tutorials/temporal_networks.jl index 16e0166..27b9860 100644 --- a/docs/literate/tutorials/temporal_networks.jl +++ b/docs/literate/tutorials/temporal_networks.jl @@ -2,5 +2,4 @@ # Decorated Graphon Tutorial for Temporal Networks =# - # # How to use NetworkHistogram.jl 
for Temporal Networks diff --git a/ext/PythonOptimalTransport/alignment.jl b/ext/PythonOptimalTransport/alignment.jl index 2a9dbc0..0c68ebf 100644 --- a/ext/PythonOptimalTransport/alignment.jl +++ b/ext/PythonOptimalTransport/alignment.jl @@ -37,7 +37,8 @@ function get_perm_alignment( ) where {T1 <: AbstractVector, T2 <: AbstractVector} C1 = jl_to_np(target) C2 = jl_to_np(src) - dist, log_ = fngw.x.fused_network_gromov_wasserstein2( + dist, + log_ = fngw.x.fused_network_gromov_wasserstein2( M = jl_to_np(zeros(size(target, 1), size(src, 1))), C1 = C1, C2 = C2, diff --git a/ext/PythonOptimalTransport/srGW.jl b/ext/PythonOptimalTransport/srGW.jl index 394fd9d..1523329 100644 --- a/ext/PythonOptimalTransport/srGW.jl +++ b/ext/PythonOptimalTransport/srGW.jl @@ -398,14 +398,16 @@ function mm_lpl1_semirelaxed( # Inner solver selection if gamma_entropy == 0 - inner_solver = (total_linear_cost, T_init_local) -> cg_semirelaxed( + inner_solver = (total_linear_cost, + T_init_local) -> cg_semirelaxed( C1, p, C2; alpha = alpha, linear_cost = total_linear_cost, init_mode = init_mode, T_init = T_init_local, symmetry = symmetry, use_log = inner_log, eps = eps_inner, max_iter = max_iter_inner, seed = seed, verbose = verbose ) else - inner_solver = (total_linear_cost, T_init_local) -> md_semirelaxed( + inner_solver = (total_linear_cost, + T_init_local) -> md_semirelaxed( C1, p, C2, gamma_entropy; alpha = alpha, linear_cost = total_linear_cost, init_mode = init_mode, T_init = T_init_local, symmetry = symmetry, use_log = inner_log, eps = eps_inner, max_iter = max_iter_inner, diff --git a/src/api.jl b/src/api.jl index 3ddb5e2..329a645 100644 --- a/src/api.jl +++ b/src/api.jl @@ -57,7 +57,8 @@ function _nethist( stop_rule = params.stop_rule, kwargs... 
) - node_labels, parameters = estimate!( + node_labels, + parameters = estimate!( es, data, labels_start; progress = params.display_progress, iter_progress = params.progress_freq) diff --git a/src/pseudo_suff_stats/generic.jl b/src/pseudo_suff_stats/generic.jl index ec3b673..0444c6e 100644 --- a/src/pseudo_suff_stats/generic.jl +++ b/src/pseudo_suff_stats/generic.jl @@ -29,6 +29,7 @@ function make_k_block(k, generic; data::AbstractArray, dist::D, kwargs...) where Consider using more specialized sufficient statistics types when possible." k_block = SymArray{GenericSuffStats{eltype(data), D}}(undef, k, k) for j in 1:k, i in 1:k + k_block[i, j] = GenericSuffStats(data, dist) end return k_block diff --git a/test/test_symarray.jl b/test/test_symarray.jl index 5f9d40a..ce950ea 100644 --- a/test/test_symarray.jl +++ b/test/test_symarray.jl @@ -72,6 +72,7 @@ using StaticArrays @test size(a) == (3, 3) for i in 1:3, j in 1:3 + @test a[i, j] == M[i, j] end @@ -144,6 +145,7 @@ using StaticArrays # Test deepcopy! 
src = SymArray{Vector{Int}}(undef, 4, 4) for j in 1:4, i in j:4 + src[i, j] = [i, j] end @@ -151,6 +153,7 @@ using StaticArrays dest = similar(src) deepcopy!(dest, src) for j in 1:4, i in j:4 + @test dest[i, j] == src[i, j] @test !(dest[i, j] === src[i, j]) # Ensure deep copy end @@ -158,10 +161,12 @@ using StaticArrays # on assigned dest dest2 = similar(src) for j in 1:4, i in j:4 + dest2[i, j] = [-1, -1] end deepcopy!(dest2, src) for j in 1:4, i in j:4 + @test dest2[i, j] == src[i, j] @test !(dest2[i, j] === src[i, j]) # Ensure deep copy end @@ -351,6 +356,7 @@ using StaticArrays # Check values are correct for i in 1:3, j in 1:3 + @test result1[i, j] ≈ 2.0 + M[i, j] @test result2[i, j] ≈ M[i, j] + 2.0 end @@ -385,6 +391,7 @@ using StaticArrays # Check values are correct for i in 1:3, j in 1:3 + @test result1[i, j] ≈ 2.0 + M[i, j] @test result2[i, j] ≈ M[i, j] + 2.0 end From 07e667422d9cde4762e95a389e79f7033465525d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 May 2026 17:51:40 +0200 Subject: [PATCH 262/266] remove duplicated action for documentation --- .github/workflows/Documentation.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/Documentation.yml diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml deleted file mode 100644 index 999cd99..0000000 --- a/.github/workflows/Documentation.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Documentation - -on: - push: - branches: - - master - tags: '*' - pull_request: - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@latest - with: - version: '1.12' - - name: Install dependencies - run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' - - name: Build and deploy - env: - DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # If authenticating with SSH deploy key - run: julia --project=docs/ docs/make.jl From 
a6fc3784d8a1b5e9bc39e2d1f65c4d73b6542b8a Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 May 2026 17:55:47 +0200 Subject: [PATCH 263/266] add cache to docs --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 02f87e5..cb43b3e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -50,6 +50,7 @@ jobs: - uses: julia-actions/setup-julia@v1 with: version: '1' + - uses: julia-actions/cache@v1 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-docdeploy@v1 env: From a198c33ea560d825d7064ca4fcaa41b0888a2296 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 May 2026 18:05:12 +0200 Subject: [PATCH 264/266] update julia action version --- .github/workflows/CI.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index cb43b3e..ba733a4 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -27,11 +27,11 @@ jobs: # - x86 steps: - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: julia-actions/setup-julia@v3 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 + - uses: julia-actions/cache@v3 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 @@ -47,10 +47,10 @@ jobs: contents: write steps: - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: julia-actions/setup-julia@v3 with: version: '1' - - uses: julia-actions/cache@v1 + - uses: julia-actions/cache@v3 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-docdeploy@v1 env: From f901ce06ee35333be668716e8cbf0f90fd37939d Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 May 2026 18:05:31 +0200 Subject: [PATCH 265/266] update julia action version --- .github/workflows/CI.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ba733a4..b2937ef 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -26,7 +26,7 @@ jobs: - x64 # - x86 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@v3 with: version: ${{ matrix.version }} @@ -46,7 +46,7 @@ jobs: permissions: contents: write steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@v3 with: version: '1' From 99347d8bc69a14cac9be0e5ef5f5a01259718da0 Mon Sep 17 00:00:00 2001 From: dufourc1 Date: Thu, 28 May 2026 18:06:54 +0200 Subject: [PATCH 266/266] add matrix for cache --- .github/workflows/CI.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b2937ef..4c7327d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -42,7 +42,18 @@ jobs: docs: name: Documentation - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.12' + # - 'nightly' + os: + - ubuntu-latest + arch: + - x64 + # - x86 permissions: contents: write steps: