Skip to content

Commit 29e9e1b

Browse files
committed
add approximate LAD
1 parent adf8925 commit 29e9e1b

15 files changed

Lines changed: 164 additions & 51 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Upcoming Release
2+
- Add exact argument for LAD. If exact is true then the linear programming based exact solution is found. Otherwise, a GA based search is performed to yield approximate solutions.
23

34
# v0.8.19
45
- Update Satman(2013) algorithm

docs/make.jl

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,20 @@ using Documenter, LinRegOutliers
22

33

44
makedocs(
5-
format = Documenter.HTML(
6-
prettyurls = get(ENV, "CI", nothing) == "true",
7-
collapselevel = 2,
8-
# assets = ["assets/favicon.ico", "assets/extra_styles.css"],
9-
),
10-
sitename="LinRegOutliers",
11-
authors = "Mehmet Hakan Satman <mhsatman@gmail.com>, Shreesh Adiga <16567adigashreesh@gmail.com>, Guillermo Angeris <angeris@stanford.edu>, Emre Akadal <emre.akadal@istanbul.edu.tr>",
12-
pages = [
13-
"Algorithms" => "algorithms.md",
14-
"Diagnostics" => "diagnostics.md",
15-
"Types" => "types.md",
16-
"Datasets" => "datasets.md"
17-
]
18-
)
19-
20-
21-
deploydocs(
22-
repo = "github.com/jbytecode/LinRegOutliers",
5+
format = Documenter.HTML(
6+
prettyurls = get(ENV, "CI", nothing) == "true",
7+
collapselevel = 2,
8+
# assets = ["assets/favicon.ico", "assets/extra_styles.css"],
9+
),
10+
sitename = "LinRegOutliers",
11+
authors = "Mehmet Hakan Satman <mhsatman@gmail.com>, Shreesh Adiga <16567adigashreesh@gmail.com>, Guillermo Angeris <angeris@stanford.edu>, Emre Akadal <emre.akadal@istanbul.edu.tr>",
12+
pages = [
13+
"Algorithms" => "algorithms.md",
14+
"Diagnostics" => "diagnostics.md",
15+
"Types" => "types.md",
16+
"Datasets" => "datasets.md",
17+
],
2318
)
2419

2520

26-
21+
deploydocs(repo = "github.com/jbytecode/LinRegOutliers")

src/LinRegOutliers.jl

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,19 @@ export find_minimum_nonzero
2020
export @extractRegressionSetting
2121

2222

23+
# Compact genetic algorithm
24+
include("cga.jl")
25+
import .CGA: cga
26+
27+
# Hooke-Jeeves algorithm
28+
include("hj.jl")
29+
# The function hj() is not exported.
30+
31+
# Genetic Algorithm
32+
include("ga.jl")
33+
import .GA: ga, RealChromosome
34+
35+
2336
# Predefined datasets used in outlier detection literature
2437
include("data.jl")
2538
import .DataSets: phones, hbk, stackloss
@@ -129,14 +142,6 @@ import .Hadi94: hadi1994
129142
include("dataimage.jl")
130143
import .DataImage: dataimage
131144

132-
# Compact genetic algorithm
133-
include("cga.jl")
134-
import .CGA: cga
135-
136-
137-
# Genetic Algorithm
138-
include("ga.jl")
139-
import .GA: ga, RealChromosome
140145

141146
# Modified and original Satman (2012) algorithms
142147
include("gwlts.jl")

src/asm2000.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ import LinearAlgebra: det
77
import Clustering: Hclust, hclust, cutree
88

99

10-
import ..Basis: RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
10+
import ..Basis:
11+
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
1112
import ..Diagnostics: mahalanobisSquaredBetweenPairs
1213
import ..LTS: lts
1314
import ..SMR98: majona
@@ -65,7 +66,7 @@ function asm2000(X::Array{Float64,2}, y::Array{Float64,1})::Dict
6566
#stdfit = standardize(ZScoreTransform, predicteds, dims = 1)
6667
stdres = zstandardize(resids)
6768
stdfit = zstandardize(predicteds)
68-
69+
6970
pairs = hcat(stdfit, stdres)
7071

7172
pairs = hcat(resids, predicteds)

src/dataimage.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@ julia> Plots.plot(di)
4242
Marchette, David J., and Jeffrey L. Solka. "Using data images for outlier detection."
4343
Computational Statistics & Data Analysis 43.4 (2003): 541-552.
4444
"""
45-
function dataimage(dataMatrix::Array{Float64,2}; distance = :mahalanobis)::Array{RGB{Float64}, 2}
45+
function dataimage(
46+
dataMatrix::Array{Float64,2};
47+
distance = :mahalanobis,
48+
)::Array{RGB{Float64},2}
4649
d = nothing
4750
if distance == :mahalanobis
4851
d = mahalanobisSquaredBetweenPairs(dataMatrix)
@@ -55,7 +58,7 @@ function dataimage(dataMatrix::Array{Float64,2}; distance = :mahalanobis)::Array
5558
end
5659
colours = 1.0 .- d / maximum(d)
5760
n, _ = size(d)
58-
colormatrix = Array{RGB{Float64}, 2}(undef, n, n)
61+
colormatrix = Array{RGB{Float64},2}(undef, n, n)
5962
for i = 1:n
6063
for j = 1:n
6164
@inbounds colormatrix[i, j] = RGB(colours[i, j])

src/diagnostics.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ end
5555
function mahalanobisSquaredBetweenPairs(pairs::Matrix; covmatrix = nothing)
5656
n, _ = size(pairs)
5757
newmat = zeros(Float64, n, n)
58-
if isnothing(covmatrix)
58+
if isnothing(covmatrix)
5959
covmatrix = cov(pairs)
6060
end
6161
try
@@ -194,7 +194,7 @@ end
194194

195195
function dffit(X::Array{Float64,2}, y::Array{Float64,1}, i::Int)::Float64
196196
n, _ = size(X)
197-
indices = [j for j in 1:n if i != j]
197+
indices = [j for j = 1:n if i != j]
198198
olsfull = ols(X, y)
199199
Xsub = X[indices, :]
200200
ysub = y[indices]
@@ -423,7 +423,7 @@ end
423423

424424
function jacknifedS(X::Array{Float64,2}, y::Array{Float64,1}, k::Int)::Float64
425425
n, p = size(X)
426-
indices = [i for i in 1:n if i != k]
426+
indices = [i for i = 1:n if i != k]
427427
Xsub = X[indices, :]
428428
ysub = y[indices]
429429
olsreg = ols(Xsub, ysub)

src/hj.jl

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
module HookeJeeves
2+
3+
export hj
4+
5+
function mutate(par, p, d)
6+
newpar = copy(par)
7+
newpar[p] += d
8+
return newpar
9+
end
10+
11+
function hj(
12+
f::FType,
13+
par::Vector{Float64};
14+
maxiter = 1000,
15+
startstep = 5.0,
16+
endstep = 0.0001,
17+
) where {FType<:Function}
18+
p = length(par)
19+
currentstep = startstep
20+
iter::Int64 = 0
21+
while iter < maxiter
22+
fold = f(par)
23+
fnow = fold
24+
for currentp = 1:p
25+
mutateleft = mutate(par, currentp, -currentstep)
26+
fleft = f(mutateleft)
27+
mutateright = mutate(par, currentp, currentstep)
28+
fright = f(mutateright)
29+
if fleft < fold
30+
par = mutateleft
31+
fnow = fleft
32+
elseif fright < fold
33+
par = mutateright
34+
fnow = fright
35+
end
36+
end
37+
if fold <= fnow
38+
currentstep /= 2
39+
end
40+
if currentstep < endstep
41+
break
42+
end
43+
iter += 1
44+
end
45+
46+
return Dict("par" => par, "iter" => iter, "step" => currentstep)
47+
end
48+
49+
end # end of module

src/lad.jl

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,18 @@ using GLPK
88
import ..Basis:
99
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, applyColumns
1010

11+
import ..HookeJeeves: hj
12+
import ..GA: ga
13+
1114
"""
1215
13-
lad(setting)
16+
lad(setting; exact = true)
1417
1518
Perform Least Absolute Deviations regression for a given regression setting.
1619
1720
# Arguments
1821
- `setting::RegressionSetting`: RegressionSetting object with a formula and dataset.
22+
- `exact::Bool`: If true, use exact LAD regression. If false, estimate LAD regression parameters using GA. Default is true.
1923
2024
# Description
2125
The LAD estimator searches for regression the parameters estimates that minimize the sum of absolute residuals.
@@ -51,23 +55,52 @@ Dict{Any,Any} with 2 entries:
5155
```
5256
5357
"""
54-
function lad(setting::RegressionSetting)
58+
function lad(setting::RegressionSetting; exact::Bool = true)
5559
X, y = @extractRegressionSetting setting
56-
return lad(X, y)
60+
return lad(X, y, exact = exact)
5761
end
5862

5963

6064
"""
6165
62-
lad(X, y)
66+
lad(X, y, exact = true)
6367
6468
Perform Least Absolute Deviations regression for a given regression setting.
6569
6670
# Arguments
6771
- `X::Array{Float64, 2}`: Design matrix of the linear model.
6872
- `y::Array{Float64, 1}`: Response vector of the linear model.
73+
- `exact::Bool`: If true, use exact LAD regression. If false, estimate LAD regression parameters using GA. Default is true.
6974
"""
70-
function lad(X::Array{Float64,2}, y::Array{Float64,1})
75+
function lad(X::Array{Float64,2}, y::Array{Float64,1}; exact::Bool = true)
76+
if exact
77+
return lad_exact(X, y)
78+
else
79+
return lad_approx(X, y)
80+
end
81+
end
82+
83+
function lad_approx(X::Array{Float64,2}, y::Array{Float64,1})
84+
n, p = size(X)
85+
86+
mins = ones(Float64, p) * 10^6 * (-1.0)
87+
maxs = ones(Float64, p) * 10^6
88+
popsize = 100
89+
90+
function fcost(par)
91+
return sum(abs.(y .- X * par))
92+
end
93+
94+
garesult = ga(popsize, p, fcost, mins, maxs, 0.90, 0.05, 1, p * 1000)
95+
best = garesult[1]
96+
hookejeevesresult =
97+
hj(fcost, best.genes, maxiter = 109000, startstep = 10.0, endstep = 0.000001)
98+
betas = hookejeevesresult["par"]
99+
result = Dict("betas" => betas, "residuals" => y .- X * betas)
100+
return result
101+
end
102+
103+
function lad_exact(X::Array{Float64,2}, y::Array{Float64,1})
71104
n, p = size(X)
72105

73106
m = JuMP.Model(GLPK.Optimizer)

src/py95.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ function jacknifedS(
148148
omittedIndices::Array{Int,1},
149149
)::Float64
150150
n, p = size(X)
151-
indices = [i for i in 1:n if !(i in omittedIndices)]
151+
indices = [i for i = 1:n if !(i in omittedIndices)]
152152
olsreg = ols(X[indices, :], y[indices])
153153
e = residuals(olsreg)
154154
s = sqrt(sum(e .^ 2.0) / (n - p - length(omittedIndices)))

src/satman2013.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ function satman2013(X::Array{Float64,2}, y::Array{Float64,1})
116116

117117
result = Dict()
118118
result["outliers"] = outlierset
119-
result["betas"] = betas
119+
result["betas"] = betas
120120
result["residuals"] = resids
121121

122122
return result

0 commit comments

Comments
 (0)