Skip to content

Commit a98066c

Browse files
authored
Fix parsing xpath that has string literal containing parentheses and brackets (#318)
String literal in XPath 1.0 doesn't have escapes. `begin end while cond` is not a normal modifier while, but a do-while loop. (I didn't know it). REXML uses this syntax in several files. Removes preprocessing that removes spaces with gsub which may wrongly modifies string literal. Also fixes error parsing `//a[1] [2]`. All test except exact-error-message test and the one I added in this PR passed with: ```ruby def parse path path = path.gsub('(', '( ').gsub(/[\)\[\]]/, ' \0 ') # Insert space around paren/bracket ``` REF: https://www.w3.org/TR/xpath-10/#exprlex > For readability, whitespace may be used in expressions even though not explicitly allowed by the grammar: ExprWhitespace may be freely added within patterns before or after any ExprToken.
1 parent 2732516 commit a98066c

3 files changed

Lines changed: 54 additions & 14 deletions

File tree

lib/rexml/parsers/xpathparser.rb

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,6 @@ def namespaces=( namespaces )
1919
end
2020

2121
def parse path
22-
path = path.dup
23-
path.gsub!(/([\(\[])\s+/, '\1') # Strip ignorable spaces
24-
path.gsub!( /\s+([\]\)])/, '\1')
2522
parsed = []
2623
rest = OrExpr(path, parsed)
2724
if rest
@@ -359,17 +356,15 @@ def NodeTest path, parsed
359356
path = $'
360357
parsed << type.tr('-', '_').intern
361358
when PI
362-
path = $'
359+
path = $'.lstrip
363360
literal = nil
364-
if path =~ /^\s*\)/
365-
path = $'
366-
else
361+
unless path.start_with?(')')
367362
path =~ LITERAL
368363
literal = $1
369-
path = $'
364+
path = $'.lstrip
370365
raise ParseException.new("Missing ')' after processing instruction") if path[0] != ?)
371-
path = path[1..-1]
372366
end
367+
path = path[1..-1]
373368
parsed << :processing_instruction
374369
parsed << (literal || '')
375370
when LOCAL_NAME_WILDCARD
@@ -400,6 +395,7 @@ def Predicate path, parsed
400395
while path[0] == ?[
401396
path, expr = get_group(path)
402397
predicates << expr[1..-2] if expr
398+
path = path.lstrip
403399
end
404400
predicates.each{ |pred|
405401
preds = []
@@ -678,12 +674,20 @@ def get_group string
678674
depth = 0
679675
st = string[0,1]
680676
en = (st == "(" ? ")" : "]")
677+
quote = nil
681678
begin
682-
case string[ind,1]
683-
when st
684-
depth += 1
685-
when en
686-
depth -= 1
679+
if quote
680+
# ignore () [] inside quotes
681+
quote = nil if string[ind] == quote
682+
else
683+
case string[ind]
684+
when st
685+
depth += 1
686+
when en
687+
depth -= 1
688+
when '"', "'"
689+
quote = string[ind]
690+
end
687691
end
688692
ind += 1
689693
end while depth > 0 and ind < string.length

test/parser/test_xpath.rb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,5 +111,23 @@ def test_filter_string_double_quote
111111
abbreviate("a/b[attribute::name='double \" quote']/c"))
112112
end
113113
end
114+
115+
def test_spaces_between_tokens
116+
# REXML doesn't support space between function-name and opening paren.
117+
# Spaces after `(` and spaces around `)`, `[`, `]` are tested here.
118+
parser = REXML::Parsers::XPathParser.new
119+
assert_equal(
120+
parser.parse('//a/b[c][d]/e'),
121+
parser.parse(' // a / b [ c ] [ d ] / e '),
122+
)
123+
assert_equal(
124+
parser.parse('/a/b[string-length("1")<(2+3)]/c'),
125+
parser.parse(' / a / b [ string-length( "1" ) < ( 2 + 3 ) ] / c '),
126+
)
127+
assert_equal(
128+
parser.parse('//processing-instruction("a")'),
129+
parser.parse('//processing-instruction( "a" )'),
130+
)
131+
end
114132
end
115133
end

test/xpath/test_base.rb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,24 @@ def test_spaces
830830
match.call('/ a / child:: c [( @id )] /'))
831831
end
832832

833+
def test_space_paren_brace_inside_xpath_string
834+
doc = Document.new(<<~XML)
835+
<a>
836+
<b id=" [ ' 1 ) "/>
837+
<b id=' ( " 2 ] '/>
838+
</a>
839+
XML
840+
841+
assert_equal(
842+
[" [ ' 1 ) "],
843+
REXML::XPath.match(doc, "/a/b[@id=\" [ ' 1 ) \"]").map { |e| e.attributes['id'] }
844+
)
845+
assert_equal(
846+
[' ( " 2 ] '],
847+
REXML::XPath.match(doc, "/a/b[@id=' ( \" 2 ] ']").map { |e| e.attributes['id'] }
848+
)
849+
end
850+
833851
def test_text_nodes
834852
# source = "<root>
835853
#<child/>

0 commit comments

Comments
 (0)