From 7b5c01efeb570fcfad54ec73a8be1df886562b4d Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:13:04 +0100 Subject: [PATCH 1/4] Update write to respect xml:space --- src/XML.jl | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/XML.jl b/src/XML.jl index 9027ade..2bac234 100644 --- a/src/XML.jl +++ b/src/XML.jl @@ -357,7 +357,7 @@ write(x; kw...) = (io = IOBuffer(); write(io, x; kw...); String(take!(io))) write(filename::AbstractString, x; kw...) = open(io -> write(io, x; kw...), filename, "w") -function write(io::IO, x; indentsize::Int=2, depth::Int=1) +function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1) indent = ' ' ^ indentsize nodetype = XML.nodetype(x) tag = XML.tag(x) @@ -365,48 +365,61 @@ function write(io::IO, x; indentsize::Int=2, depth::Int=1) children = XML.children(x) padding = indent ^ max(0, depth - 1) - print(io, padding) + !ctx[end] && print(io, padding) + if nodetype === Text print(io, value) + elseif nodetype === Element + push!(ctx, ctx[end]) + update_ctx!(ctx, x) print(io, '<', tag) _print_attrs(io, x) print(io, isempty(children) ? '/' : "", '>') if !isempty(children) if length(children) == 1 && XML.nodetype(only(children)) === Text - write(io, only(children); indentsize=0) + write(io, only(children), ctx; indentsize=0) print(io, "</", tag, '>') else - println(io) + !ctx[end] && println(io) foreach(children) do child - write(io, child; indentsize, depth = depth + 1) - println(io) + write(io, child, ctx; indentsize, depth=depth + 1) + !ctx[end] && println(io) end - print(io, padding, "</", tag, '>') + print(io, !ctx[end] ? 
padding : "", "</", tag, '>') end end + pop!(ctx) + elseif nodetype === DTD print(io, "<!DOCTYPE ", value, '>') + elseif nodetype === Declaration print(io, "<?xml") _print_attrs(io, x) print(io, "?>") + elseif nodetype === ProcessingInstruction print(io, "<?", tag) _print_attrs(io, x) print(io, "?>") + elseif nodetype === Comment print(io, "<!--", value, "-->") + elseif nodetype === CData print(io, "<![CDATA[", value, "]]>") + elseif nodetype === Document foreach(children) do child - write(io, child; indentsize) - println(io) + write(io, child, ctx; indentsize) + !ctx[end] && println(io) end + else error("Unreachable case reached during XML.write") end -end end + +end # module XML From 5c4d3d3235b1dbc52e4460f4aaf6420a87f757d6 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:14:20 +0100 Subject: [PATCH 2/4] Correct isspace test --- src/raw.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/raw.jl b/src/raw.jl index 8b77bba..a7fccaa 100644 --- a/src/raw.jl +++ b/src/raw.jl @@ -329,7 +329,11 @@ function parent(o::Raw) end #-----------------------------------------------------------------------------# next Raw -isspace(x::UInt8) = Base.isspace(Char(x)) +# isspace(x::UInt8) = Base.isspace(Char(x)) + +# XML whitespace per XML 1.0/1.1 production S: +# S ::= (#x20 | #x9 | #xD | #xA)+ +@inline xml_isspace(b::UInt8)::Bool = (b == 0x20) | (b == 0x09) | (b == 0x0A) | (b == 0x0D) """ next(node) --> typeof(node) or Nothing From 0062e2a817e62e6ec316df80bccc8e5d410281ca Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:23:55 +0100 Subject: [PATCH 3/4] Fix isspace --- src/raw.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/raw.jl b/src/raw.jl index a7fccaa..1ed9a4e 100644 --- a/src/raw.jl +++ b/src/raw.jl @@ -357,7 +357,7 @@ function next_xml_space(o::Raw) has_xml_space = o.has_xml_space ctx = copy(o.ctx) last_type = type - k = findnext(!isspace, data, i) + k = findnext(!xml_isspace, data, i) if isnothing(k) return nothing end @@ -373,11 +373,11 @@ 
function next_xml_space(o::Raw) if c !== '<' || ctx[end] && c === '<' && b === ' ' && last_type === RawElementOpen && d === '/' type = RawText j = findnext(==(UInt8('<')), data, i) - 1 - j = ctx[end] ? j : findprev(!isspace, data, j) # preserving whitespace if needed + j = ctx[end] ? j : findprev(!xml_isspace, data, j) # preserving whitespace if needed if last_type === RawElementClose || last_type === RawElementSelfClosed|| last_type === RawDocument # Maybe drop pure-whitespace inter-element text nodes? # (e.g. whitespace between a closing and an opening tag which would otherwise make an orphan text node) - #if all(isspace, @view data[i:j]) && depth > 1 + #if all(xml_isspace, @view data[i:j]) && depth > 1 # return next(Raw(type, depth, j, 0, data, ctx, has_xml_space)) #end end @@ -518,7 +518,7 @@ function prev_no_xml_space(o::Raw) # same as v0.3.5 ctx = has_xml_space ? copy(o.ctx) : [false] type === RawDocument && return nothing j = o.pos - 1 - j = findprev(!isspace, data, j) + j = findprev(!xml_isspace, data, j) if isnothing(j) return Raw(data, has_xml_space, ctx) # RawDocument end @@ -527,7 +527,7 @@ function prev_no_xml_space(o::Raw) # same as v0.3.5 if c !== '>' # text type = RawText i = findprev(==(UInt8('>')), data, j) + 1 - i = findnext(!isspace, data, i) # "lstrip" + i = findnext(!xml_isspace, data, i) # "lstrip" elseif c === '>' c2 = Char(o.data[j-1]) if c2 === '-' From be180408dc1050aa4497a323e33359c65c5e9bfa Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Tue, 2 Sep 2025 18:03:42 +0100 Subject: [PATCH 4/4] more isspace --- src/raw.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/raw.jl b/src/raw.jl index 1ed9a4e..c6837de 100644 --- a/src/raw.jl +++ b/src/raw.jl @@ -433,7 +433,7 @@ function next_no_xml_space(o::Raw) # same as v0.3.5 type = o.type has_xml_space = o.has_xml_space ctx = [false] - i = findnext(!isspace, data, i) + i = findnext(!xml_isspace, data, i) if isnothing(i) return 
nothing end @@ -445,7 +445,7 @@ function next_no_xml_space(o::Raw) # same as v0.3.5 if c !== '<' type = RawText j = findnext(==(UInt8('<')), data, i) - 1 - j = findprev(!isspace, data, j) # "rstrip" + j = findprev(!xml_isspace, data, j) # "rstrip" elseif c === '<' c2 = Char(o.data[i+1]) if c2 === '!'