Sort nodeset on demand (#330)

tompng · web-flow · commit ceaa8294b30b · 2026-06-17T22:27:12.000+09:00
Delay sorting; only sort when it is needed. In most cases, sorting the nodeset is not needed. Sorting is only required for: - The final result - Creating nodesets (each nodeset should be axis-ordered) from a single nodeset - Ideally, this can be skipped if the following predicate is not position-dependent - A nodeset passed to a function (the first node in document order is used) ### Number of sort operations | XPath | master(before #315) | master(after #315) | this PR | | --- | --- | --- | --- | | `/a/b/c/d/e` | 3 | 4 | 1 | | `(a/b/c/d)[position()>1]/e/f/g` | 5 | 7 | 2 | | `number(/a/b/c/d/e)` | 3 | 4 | 1 | | `count(/a/b/c/d/e)` | 3 | 4 | 0 | | `//a//b//c//d//e` | 8 | 9 | 1 | | `/a[1]/b[1]/c[1]/d[1]/e` | 0 | 1 | 1 | #315 removed one `nodesets.size == 1` optimization path. This pull request reduces the resulting performance regression. To eliminate more sort calls, we would need to track nodeset ordering, e.g. by introducing `Nodeset = Struct.new(:nodes, :order)`, but IMO that shouldn't be done now. If `sort` is optimized, one extra sort won't be a problem. Optimizing `step` will be harder and the code may become complicated. ### Note This pull request adds a little complexity and the risk of forgetting to sort the nodeset in some code path. The effect may seem drastic in some cases for now, but that is only because `sort` is currently `O(n^2)` in the worst case. We can improve `sort` performance, so keeping the sort strategy simple remains an option.
diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb
@@ -85,9 +85,9 @@ def get_namespace( node_set = nil )
       if node_set == nil
         yield @context[:node] if @context[:node].respond_to?(:namespace)
       else
-        if node_set.respond_to? :each
+        if node_set.kind_of? Array
           result = []
-          node_set.each do |node|
+          XPathParser.sort(node_set).each do |node|
             result << yield(node) if node.respond_to?(:namespace)
           end
           result
@@ -149,7 +149,7 @@ def string( object=@context[:node] )
       else
         case object
         when Array
-          string(object[0])
+          string(XPathParser.sort(object).first)
         when Float
           if object.nan?
             "NaN"
diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb
@@ -156,7 +156,7 @@ def match(path_stack, node)
       result = expr(path_stack, nodeset)
       case result
       when Array # nodeset
-        result.uniq
+        XPathParser.sort(result)
       else
         [result]
       end
@@ -323,7 +323,7 @@ def expr( path_stack, nodeset, context=nil )
             # If result is a nodeset, apply following predicates
             path_stack.unshift(:node)
             nodeset = step(path_stack) do
-              [:iterate_nodesets, [result]]
+              [:iterate_nodesets, [XPathParser.sort(result)]]
             end
           else
             return result
@@ -571,6 +571,7 @@ def split_positional_predicates(predicates)
     end
 
     # Performs an axis scanning step.
+    # Returns an unordered non-duplicated nodeset of matching nodes.
     # The caller provides a scanner method and its argument, which determines the axis to scan and the nodes to scan from:
     #   step(path_stack) { [scanner_method, scanner_argument] }
     # Scanner methods are called with `(scanner_argument, tester_block, selector)`
@@ -621,7 +622,7 @@ def step(path_stack, any_type: :element)
             nodes << node
           end
         end
-        new_nodeset = sort(nodes.to_a)
+        new_nodeset = nodes.to_a
       ensure
         leave(:step, path_stack, new_nodeset) if @debug
       end
@@ -761,7 +762,7 @@ def leave(tag, *args)
     # in and out of function calls.  If I knew what the index of the nodes was,
     # I wouldn't have to do this.  Maybe add a document IDX for each node?
     # Problems with mutable documents.  Or, rewrite everything.
-    def sort(array_of_nodes)
+    def self.sort(array_of_nodes)
       return array_of_nodes if array_of_nodes.size <= 1
 
       new_arry = []
diff --git a/test/test_jaxen.rb b/test/test_jaxen.rb
@@ -87,6 +87,13 @@ def process_value_of(context, variables, namespaces, value_of)
       xpath = value_of.attributes["select"]
       matched = XPath.match(context, xpath, namespaces, variables, strict: true)
 
+      # XPath.match can be a nodeset or a primitive value wrapped in an array.
+      # We need to unwrap primitive value because Functions doesn't accept array which is not a nodeset.
+      unless matched.all? { |node| node.is_a?(REXML::Node) }
+        assert_equal(1, matched.size, 'Primitive value should be a single value')
+        matched = matched.first
+      end
+
       message = user_message(context, xpath, matched)
       assert_equal(expected || "",
                    REXML::Functions.string(matched),
diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb
@@ -1505,5 +1505,21 @@ def test_descendant_axis_position_predicate_per_context_node
       result = XPath.match(xmldoc, "//a/descendant::b[1]")
       assert_equal(["b1", "b2"], result.map { |e| e.attributes["id"] })
     end
+
+    def test_reverse_axis_document_order_sort
+      doc = Document.new("<a><b>1</b><c>2</c><d>3</d><e/></a>")
+      assert_equal(["b", "c", "d"], XPath.match(doc, "//e/preceding-sibling::*").map(&:name))
+      assert_equal(["d"], XPath.match(doc, "//e/preceding-sibling::*[1]").map(&:name))
+      assert_equal(["b"], XPath.match(doc, "(//e/preceding-sibling::*)[1]").map(&:name))
+    end
+
+    def test_reverse_axis_function_argument_sort
+      doc = Document.new("<a><b>1</b><c>2</c><d>3</d><e/></a>")
+      assert_equal(["e"], XPath.match(doc, "//e[name(preceding-sibling::*)='b']").map(&:name))
+      assert_equal(["e"], XPath.match(doc, "//e[string(preceding-sibling::*)='1']").map(&:name))
+      assert_equal(["e"], XPath.match(doc, "//e[number(preceding-sibling::*)=1]").map(&:name))
+      assert_equal(["e"], XPath.match(doc, "//e[10 + preceding-sibling::* = 11]").map(&:name))
+      assert_equal(["e"], XPath.match(doc, "//e[preceding-sibling::* = '1']").map(&:name))
+    end
   end
 end