From c1e76fe257dbe521649ff941dde2ed016a1b518f Mon Sep 17 00:00:00 2001 From: nmburgan <13688219+nmburgan@users.noreply.github.com> Date: Fri, 27 Mar 2026 00:53:19 +0000 Subject: [PATCH 1/8] Add Choria transport (phases 1 and 2 of 5) Implements phases 1 and 2 of the Choria transport, enabling OpenBolt to run tasks, commands, and scripts on nodes via Choria's NATS pub/sub messaging as an alternative to SSH and WinRM. Phase 1 (bolt_tasks agent): Downloads task files to targets from an OpenVox/Puppet Server and executes them using the bolt_tasks Choria agent. Phase 2 (shell agent): Executes commands, scripts, and tasks through the Choria shell agent. This allows running tasks not available on an OpenVox/Puppet server. Everything is implemented as asynchronously as possible, aligning with Choria's model, and is built to run at scale across many thousands of nodes at once. See docs in a later commit for details on the phases of this project as well as user-facing and developer documentation. --- .rubocop.yml | 12 + lib/bolt/bolt_option_parser.rb | 7 +- lib/bolt/config/options.rb | 8 + lib/bolt/config/transport/choria.rb | 73 +++ lib/bolt/config/transport/options.rb | 97 +++ lib/bolt/executor.rb | 2 + lib/bolt/transport/choria.rb | 218 +++++++ lib/bolt/transport/choria/agent_discovery.rb | 137 +++++ lib/bolt/transport/choria/bolt_tasks.rb | 248 ++++++++ lib/bolt/transport/choria/client.rb | 276 +++++++++ lib/bolt/transport/choria/command_builders.rb | 199 +++++++ lib/bolt/transport/choria/helpers.rb | 197 ++++++ lib/bolt/transport/choria/shell.rb | 560 ++++++++++++++++++ lib/mcollective/agent/README.md | 18 + lib/mcollective/agent/shell.ddl | 154 +++++ openbolt.gemspec | 2 + 16 files changed, 2207 insertions(+), 1 deletion(-) create mode 100644 lib/bolt/config/transport/choria.rb create mode 100644 lib/bolt/transport/choria.rb create mode 100644 lib/bolt/transport/choria/agent_discovery.rb create mode 100644 lib/bolt/transport/choria/bolt_tasks.rb create mode 100644 
lib/bolt/transport/choria/client.rb create mode 100644 lib/bolt/transport/choria/command_builders.rb create mode 100644 lib/bolt/transport/choria/helpers.rb create mode 100644 lib/bolt/transport/choria/shell.rb create mode 100644 lib/mcollective/agent/README.md create mode 100644 lib/mcollective/agent/shell.ddl diff --git a/.rubocop.yml b/.rubocop.yml index 7800c800c..eb1825188 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -22,4 +22,16 @@ Layout/LineLength: Max: 150 Style/FetchEnvVar: + Enabled: false + +Naming/BlockForwarding: + Enabled: false + +Naming/PredicatePrefix: + Enabled: false + +Lint/NoReturnInBeginEndBlocks: + Enabled: false + +Style/MultilineTernaryOperator: Enabled: false \ No newline at end of file diff --git a/lib/bolt/bolt_option_parser.rb b/lib/bolt/bolt_option_parser.rb index 7c99336a6..6865a3975 100644 --- a/lib/bolt/bolt_option_parser.rb +++ b/lib/bolt/bolt_option_parser.rb @@ -168,7 +168,7 @@ def get_help_text(subcommand, action = nil) when 'task' case action when 'run' - { flags: ACTION_OPTS + %w[params tmpdir noop], + { flags: ACTION_OPTS + %w[params tmpdir noop choria-agent], banner: TASK_RUN_HELP } when 'show' { flags: OPTIONS[:global] + OPTIONS[:global_config_setters] + %w[filter format], @@ -1095,6 +1095,11 @@ def initialize(options) define('--tmpdir DIR', 'The directory to upload and execute temporary files on the target.') do |tmpdir| @options[:tmpdir] = tmpdir end + define('--choria-agent AGENT', %w[bolt_tasks shell], + "Which Choria agent to use for task execution (bolt_tasks, shell).", + "Defaults to 'bolt_tasks'. 
Set to 'shell' for tasks not on the Puppet Server.") do |agent| + @options[:'choria-agent'] = agent + end separator "\n#{self.class.colorize(:cyan, 'Module options')}" define('--[no-]resolve', diff --git a/lib/bolt/config/options.rb b/lib/bolt/config/options.rb index 323a358d0..fa19cb38a 100644 --- a/lib/bolt/config/options.rb +++ b/lib/bolt/config/options.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require_relative '../../bolt/config/transport/choria' require_relative '../../bolt/config/transport/docker' require_relative '../../bolt/config/transport/jail' require_relative '../../bolt/config/transport/local' @@ -15,6 +16,7 @@ module Options # Transport config classes. Used to load default transport config which # gets passed along to the inventory. TRANSPORT_CONFIG = { + 'choria' => Bolt::Config::Transport::Choria, 'docker' => Bolt::Config::Transport::Docker, 'jail' => Bolt::Config::Transport::Jail, 'local' => Bolt::Config::Transport::Local, @@ -551,6 +553,12 @@ module Options _example: "winrm", _default: "ssh" }, + "choria" => { + description: "A map of configuration options for the choria transport.", + type: Hash, + _plugin: true, + _example: { "config-file" => "/etc/choria/client.conf" } + }, "docker" => { description: "A map of configuration options for the docker transport.", type: Hash, diff --git a/lib/bolt/config/transport/choria.rb b/lib/bolt/config/transport/choria.rb new file mode 100644 index 000000000..3ebab3ac0 --- /dev/null +++ b/lib/bolt/config/transport/choria.rb @@ -0,0 +1,73 @@ +# frozen_string_literal: true + +require_relative '../../../bolt/error' +require_relative '../../../bolt/config/transport/base' + +module Bolt + class Config + module Transport + class Choria < Base + OPTIONS = %w[ + choria-agent + cleanup + collective + command-timeout + config-file + host + interpreters + nats-connection-timeout + nats-servers + puppet-environment + rpc-timeout + ssl-ca + ssl-cert + ssl-key + task-timeout + tmpdir + ].sort.freeze + + DEFAULTS = { 
+ 'cleanup' => true, + 'command-timeout' => 60, + 'nats-connection-timeout' => 30, + 'puppet-environment' => 'production', + 'rpc-timeout' => 30, + 'task-timeout' => 300, + 'tmpdir' => '/tmp' + }.freeze + + VALID_AGENTS = %w[bolt_tasks shell].freeze + + private def validate + super + + if @config['choria-agent'] && !VALID_AGENTS.include?(@config['choria-agent']) + raise Bolt::ValidationError, + "choria-agent must be one of #{VALID_AGENTS.join(', ')}, got '#{@config['choria-agent']}'" + end + + if @config['tmpdir'] && !absolute_path?(@config['tmpdir']) + raise Bolt::ValidationError, + "Choria tmpdir must be an absolute path, got '#{@config['tmpdir']}'" + end + + ssl_keys = %w[ssl-ca ssl-cert ssl-key] + provided_ssl = ssl_keys.select { |k| @config[k] } + if provided_ssl.any? && provided_ssl.length < ssl_keys.length + missing = ssl_keys - provided_ssl + raise Bolt::ValidationError, + "When overriding Choria SSL settings, all three options must be provided " \ + "(ssl-ca, ssl-cert, ssl-key). Missing: #{missing.join(', ')}" + end + + @config['interpreters'] = normalize_interpreters(@config['interpreters']) if @config['interpreters'] + end + + # Accept both POSIX absolute paths (/tmp) and Windows absolute paths (C:\temp). + def absolute_path?(path) + path.start_with?('/') || path.match?(Bolt::Transport::Choria::WINDOWS_PATH_REGEX) + end + end + end + end +end diff --git a/lib/bolt/config/transport/options.rb b/lib/bolt/config/transport/options.rb index f480a2cce..272998ff8 100644 --- a/lib/bolt/config/transport/options.rb +++ b/lib/bolt/config/transport/options.rb @@ -51,6 +51,56 @@ module Options _default: true, _example: false }, + "choria-agent" => { + type: String, + description: "Which Choria agent to use for task execution. Defaults to 'bolt_tasks' " \ + "(downloads task files from a Puppet Server). 
Set to 'shell' for tasks " \ + "not available on the Puppet Server.", + _plugin: true, + _example: "shell" + }, + "collective" => { + type: String, + description: "The Choria collective to target. Overrides the main_collective from the Choria " \ + "client configuration file.", + _plugin: true, + _example: "production" + }, + "command-timeout" => { + type: Integer, + description: "How long to wait in seconds for commands and scripts to complete when using the " \ + "Choria transport.", + minimum: 1, + _plugin: true, + _default: 60, + _example: 120 + }, + "config-file" => { + type: String, + description: "The path to the Choria or MCollective client configuration file.", + _plugin: true, + _example: "/etc/choria/client.conf" + }, + "nats-connection-timeout" => { + type: Integer, + description: "How long to wait in seconds for the initial TCP connection to the NATS broker. " \ + "If the connection cannot be made within this time, the operation fails.", + minimum: 1, + _plugin: true, + _default: 30, + _example: 60 + }, + "rpc-timeout" => { + type: Integer, + description: "How long to wait in seconds for nodes to respond to an RPC request. " \ + "Used for lightweight operations like agent discovery, shell.start, and " \ + "shell.list polling. Distinct from command-timeout and task-timeout which " \ + "govern the overall duration of commands and tasks.", + minimum: 1, + _plugin: true, + _default: 30, + _example: 60 + }, "connect-timeout" => { type: Integer, description: "How long to wait in seconds when establishing connections. Set this value higher if you " \ @@ -225,6 +275,16 @@ module Options _plugin: true, _example: %w[defaults hmac-md5] }, + "nats-servers" => { + type: [String, Array], + description: "One or more NATS server addresses for the Choria transport. Overrides the middleware " \ + "hosts from the Choria client configuration file. 
Can be a single string or an array.", + items: { + type: String + }, + _plugin: true, + _example: ["nats://broker1:4222", "nats://broker2:4222"] + }, "native-ssh" => { type: [TrueClass, FalseClass], description: "This enables the native SSH transport, which shells out to SSH instead of using the " \ @@ -267,6 +327,14 @@ module Options _plugin: true, _example: "jump.example.com" }, + "puppet-environment" => { + type: String, + description: "The Puppet environment to use when constructing task file URIs for the Choria " \ + "bolt_tasks agent.", + _plugin: true, + _default: "production", + _example: "staging" + }, "read-timeout" => { type: Integer, description: "How long to wait in seconds when making requests to the Orchestrator.", @@ -343,6 +411,27 @@ module Options _plugin: true, _example: 445 }, + "ssl-ca" => { + type: String, + description: "The path to the CA certificate for Choria TLS connections. Overrides the CA " \ + "from the Choria client configuration file.", + _plugin: true, + _example: "/etc/choria/ssl/ca.pem" + }, + "ssl-cert" => { + type: String, + description: "The path to the client certificate for Choria TLS connections. Overrides the " \ + "certificate from the Choria client configuration file.", + _plugin: true, + _example: "/etc/choria/ssl/client.pem" + }, + "ssl-key" => { + type: String, + description: "The path to the client private key for Choria TLS connections. Overrides the " \ + "key from the Choria client configuration file.", + _plugin: true, + _example: "/etc/choria/ssl/client-key.pem" + }, "ssh-command" => { type: [Array, String], description: "The command and options to use when SSHing. 
This option is used when you need support for " \ @@ -393,6 +482,14 @@ module Options _default: "production", _example: "development" }, + "task-timeout" => { + type: Integer, + description: "How long to wait in seconds for tasks to complete when using the Choria transport.", + minimum: 1, + _plugin: true, + _default: 300, + _example: 300 + }, "tmpdir" => { type: String, description: "The directory to upload and execute temporary files on the target.", diff --git a/lib/bolt/executor.rb b/lib/bolt/executor.rb index 0700617bd..6e9fc9c79 100644 --- a/lib/bolt/executor.rb +++ b/lib/bolt/executor.rb @@ -13,6 +13,7 @@ require_relative '../bolt/result' require_relative '../bolt/result_set' # Load transports +require_relative '../bolt/transport/choria' require_relative '../bolt/transport/docker' require_relative '../bolt/transport/jail' require_relative '../bolt/transport/local' @@ -24,6 +25,7 @@ module Bolt TRANSPORTS = { + choria: Bolt::Transport::Choria, docker: Bolt::Transport::Docker, jail: Bolt::Transport::Jail, local: Bolt::Transport::Local, diff --git a/lib/bolt/transport/choria.rb b/lib/bolt/transport/choria.rb new file mode 100644 index 000000000..1e2389b22 --- /dev/null +++ b/lib/bolt/transport/choria.rb @@ -0,0 +1,218 @@ +# frozen_string_literal: true + +require 'base64' +require 'concurrent/map' +require 'digest/sha2' +require 'json' +require 'securerandom' +require 'shellwords' +require_relative '../../bolt/transport/base' + +module Bolt + module Transport + # Choria transport for OpenBolt. Communicates with nodes via Choria's NATS + # pub/sub messaging infrastructure using the choria-mcorpc-support gem as + # the client library. Extends Transport::Base directly (not Simple) because + # Choria's pub/sub model doesn't fit the persistent connection/shell + # abstraction that Simple assumes. 
+ # + # Available capabilities depend on which agents are installed on the + # target node: + # + # bolt_tasks agent only: Only run_task works, via the bolt_tasks agent + # which downloads task files from an OpenVox/Puppet Server and executes + # them via task_wrapper. All other operations fail with an actionable + # error directing the user to install the shell agent. + # + # shell agent installed (>= 1.2.0): run_command, run_script, and + # run_task work. run_task uses the bolt_tasks agent by default. + # To run local tasks via the shell agent, set choria-agent to 'shell' + # in project config or specify --choria-agent shell. + # + # Upload, download, and plans are not yet supported. + class Choria < Base + def initialize + super + @config_mutex = Mutex.new + @config_error = nil + @client_configured = false + # Serializes RPC calls across batch threads. See the comment on + # rpc_request in helpers.rb for why this is necessary. + @rpc_mutex = Mutex.new + # Multiple batch threads write to this map concurrently when we + # have more than one collective. + @agent_cache = Concurrent::Map.new + @default_collective = nil + end + + # Advertise both shell and powershell so tasks with either requirement + # can be selected. The per-target selection happens in + # select_implementation below, which picks the right feature set based + # on the target's detected OS. + def provided_features + %w[shell powershell] + end + + # Override to select task implementation based on the target's OS. + # Other transports rely on inventory features to pick the right + # implementation, but Choria discovers the OS at runtime via the + # os.family fact. We pass only the detected platform's feature so + # task.select_implementation picks the correct .ps1 or .sh file. 
+ # + # @param target [Bolt::Target] Target whose OS determines the implementation + # @param task [Bolt::Task] Task with platform-specific implementations + # @return [Hash] Selected implementation hash with 'path', 'name', 'input_method', 'files' keys + def select_implementation(target, task) + features = windows_target?(target) ? ['powershell'] : ['shell'] + impl = task.select_implementation(target, features) + impl['input_method'] ||= default_input_method(impl['path']) + impl + end + + # Group targets by collective so each batch uses a single RPC client + # scope. MCollective RPC calls are published to a collective-specific + # NATS subject, so targets in different collectives must be in separate + # batches. Most deployments have one collective, yielding one batch. + # Bolt runs each batch in its own thread and @rpc_mutex serializes + # the RPC calls across threads to prevent response misrouting. + # + # @param targets [Array<Bolt::Target>] All targets for this operation + # @return [Array<Array<Bolt::Target>>] Targets grouped by collective + def batches(targets) + # Populates @default_collective from the Choria config so targets + # without an explicit collective are grouped correctly. + configure_client(targets.first) + targets.group_by { |target| collective_for(target) }.values + end + + # Override batch_task to handle multiple targets in one thread using the RPC. + # Implementation grouping (mixed-platform support) is handled internally + # by run_task_via_bolt_tasks and run_task_via_shell. 
+ # + # @param targets [Array] Targets in a single collective batch + # @param task [Bolt::Task] Task to execute + # @param arguments [Hash] Task parameter names to values + # @param options [Hash] Execution options (unused currently, passed through from Base) + # @param position [Array] Positional info for result tracking + # @param callback [Proc] Called with :node_start and :node_result events + # @return [Array] Results for all targets (successes and failures) + def batch_task(targets, task, arguments, _options = {}, position = [], &callback) + chosen_agent = targets.first.options['choria-agent'] || 'bolt_tasks' + result_opts = { action: 'task', name: task.name, position: position } + + # The results var here is the error results for incapable targets, to which we'll add in + # the successful results from the capable targets as we go. + capable, results = prepare_targets(targets, chosen_agent, result_opts, &callback) + + logger.debug { "Task #{task.name} routing: agent: #{chosen_agent}, #{capable.size} capable / #{targets.size - capable.size} incapable" } + + unless capable.empty? + capable.each { |target| callback&.call(type: :node_start, target: target) } + arguments = unwrap_sensitive_args(arguments) + + results += case chosen_agent + when 'bolt_tasks' + run_task_via_bolt_tasks(capable, task, arguments, result_opts, &callback) + when 'shell' + run_task_via_shell(capable, task, arguments, result_opts, &callback) + else + raise Bolt::Error.new( + "Unsupported choria-agent '#{chosen_agent}'", + 'bolt/choria-unsupported-agent' + ) + end + end + + results + end + + # Override batch_task_with for per-target arguments. Only called + # from the run_task_with Puppet plan function (no CLI or Ruby API + # path uses this). Discovery is batched upfront, but execution is + # sequential per-target because MCollective RPC calls send the + # same arguments to all targets. 
A future optimization could batch + # the download/infra-setup/polling steps while keeping only the + # start step per-target. + # + # THIS IS NOT YET READY FOR PRODUCTION. The API is stable, but we don't + # yet have full plan support and this runs the task sequentially across + # targets, which is very inefficient. It had to be implemented now, though, + # in order to prevent the assert_batch_size_one from the Base interface + # from blowing things up. + # + # @param targets [Array] Targets in a single collective batch + # @param task [Bolt::Task] Task to execute + # @param target_mapping [Hash{Bolt::Target => Hash}] Per-target argument hashes + # @param options [Hash] Execution options (passed through from Base) + # @param position [Array] Positional info for result tracking + # @param callback [Proc] Called with :node_start and :node_result events + # @return [Array] Results for all targets + def batch_task_with(targets, task, target_mapping, options = {}, position = [], &callback) + # Pre-warm the agent cache so individual batch_task calls are cache hits + configure_client(targets.first) + discover_agents(targets) + + results = [] + targets.each do |target| + results += batch_task([target], task, target_mapping[target], options, position, &callback) + end + results + end + + # Override batch_connected? to check all targets in one RPC call. Only + # used for wait_until_available in plans. 
+ # + # @param targets [Array] Targets to check connectivity for + # @return [Boolean] True if all targets responded to ping + def batch_connected?(targets) + logger.debug { "Checking connectivity for #{target_count(targets)}" } + first_target = targets.first + configure_client(first_target) + + response = rpc_request('rpcutil', targets, 'rpcutil.ping') do |client| + client.ping + end + response[:responded].length == targets.length + rescue StandardError => e + raise if e.is_a?(Bolt::Error) + + logger.warn { "Batch connectivity check failed: #{e.class}: #{e.message}" } + false + end + + def upload(_target, _source, _destination, _options = {}, _position = []) + raise Bolt::Error.new( + 'The Choria transport does not yet support upload.', + 'bolt/choria-unsupported-operation' + ) + end + + def download(_target, _source, _destination, _options = {}, _position = []) + raise Bolt::Error.new( + 'The Choria transport does not yet support download.', + 'bolt/choria-unsupported-operation' + ) + end + + # Returns the Choria node identity for a target. Uses the transport + # 'host' config if set, falling back to target.host (which Bolt + # derives from the URI or target name). + def choria_identity(target) + target.options['host'] || target.host + end + + # Returns the collective for a target, used by batches() to group + # targets. Falls back to the default collective from the loaded config. 
+ def collective_for(target) + target.options['collective'] || @default_collective + end + end + end +end + +require_relative 'choria/agent_discovery' +require_relative 'choria/bolt_tasks' +require_relative 'choria/client' +require_relative 'choria/command_builders' +require_relative 'choria/helpers' +require_relative 'choria/shell' diff --git a/lib/bolt/transport/choria/agent_discovery.rb b/lib/bolt/transport/choria/agent_discovery.rb new file mode 100644 index 000000000..61f4de8e7 --- /dev/null +++ b/lib/bolt/transport/choria/agent_discovery.rb @@ -0,0 +1,137 @@ +# frozen_string_literal: true + +module Bolt + module Transport + class Choria + SHELL_MIN_VERSION = '1.2.0' + + AGENT_MIN_VERSIONS = { + 'shell' => SHELL_MIN_VERSION + }.freeze + + # Discover agents and detect OS on targets via two batched RPC calls + # (agent_inventory for agents+versions, get_fact for os.family). + # Populates @agent_cache with { agents: [...], os: 'redhat' | 'windows' | ... }. + # + # @param targets [Array] Targets to discover agents on + def discover_agents(targets) + uncached = targets.reject { |target| @agent_cache.key?(choria_identity(target)) } + return if uncached.empty? + + logger.debug { "Discovering agents on #{target_count(uncached)}" } + discover_agent_list(uncached) + discover_os_family(uncached) + + uncached.each do |target| + identity = choria_identity(target) + logger.warn { "No response from #{identity} during agent discovery" } unless @agent_cache.key?(identity) + end + end + + def has_agent?(target, agent_name) + @agent_cache[choria_identity(target)]&.dig(:agents)&.include?(agent_name) || false + end + + def windows_target?(target) + @agent_cache[choria_identity(target)]&.dig(:os) == 'windows' + end + + # Discover available agents on targets via rpcutil.agent_inventory + # and populate @agent_cache with agent lists. 
+ # + # @param targets [Array] Targets to query for agent inventory + def discover_agent_list(targets) + response = rpc_request('rpcutil', targets, 'rpcutil.agent_inventory') do |client| + client.agent_inventory + end + response[:errors].each { |target, output| logger.debug { "agent_inventory failed for #{target.safe_name}: #{output[:error]}" } } + + response[:responded].each do |target, data| + sender = choria_identity(target) + agents = filter_agents(sender, data[:agents]) + unless agents + logger.warn { "Unexpected agent_inventory response from #{sender}. This target will be treated as unreachable." } + next + end + @agent_cache[sender] = { agents: agents } + logger.debug { "Discovered agents on #{sender}: #{agents.join(', ')}" } + end + rescue StandardError => e + raise if e.is_a?(Bolt::Error) + + logger.warn { "Agent discovery failed: #{e.class}: #{e.message}" } + end + + # Detect the OS family on targets via rpcutil.get_fact and update + # @agent_cache entries with the :os key. + # + # @param targets [Array] Targets to detect OS on + def discover_os_family(targets) + # Only fetch OS for targets that responded to agent_inventory + responded = targets.select { |target| @agent_cache.key?(choria_identity(target)) } + return if responded.empty? + + response = rpc_request('rpcutil', responded, 'rpcutil.get_fact') do |client| + client.get_fact(fact: 'os.family') + end + response[:errors].each { |target, output| + logger.warn { + "OS detection failed for #{target.safe_name}: #{output[:error]}. Defaulting to POSIX command syntax." + } + } + + response[:responded].each do |target, data| + sender = choria_identity(target) + os_family = data[:value].to_s.downcase + if os_family.empty? + logger.warn { "os.family fact is empty on #{sender}. Defaulting to POSIX command syntax." 
} + next + end + @agent_cache[sender][:os] = os_family + logger.debug { "Detected OS on #{sender}: #{os_family}" } + end + rescue StandardError => e + raise if e.is_a?(Bolt::Error) + + logger.warn { "OS detection failed: #{e.class}: #{e.message}. Defaulting to POSIX command syntax." } + end + + # Filter out agents that don't meet minimum version requirements. + # + # @param sender [String] Choria node identity (for logging) + # @param agent_list [Array, nil] Agent entries from agent_inventory, each with + # 'agent' (name) and 'version' string keys + # @return [Array, nil] Agent names that meet version requirements, or nil + # if agent_list is not an Array + def filter_agents(sender, agent_list) + return nil unless agent_list.is_a?(Array) + + agent_list.filter_map do |entry| + name = entry['agent'] + next unless name + + version = entry['version'] + min_version = AGENT_MIN_VERSIONS[name] + if min_version && !meets_min_version?(version, min_version) + logger.warn { + "The '#{name}' agent on #{sender} is version #{version || 'unknown'}, " \ + "but #{min_version} or later is required. It will be treated as unavailable." + } + next + end + + name + end + end + + def meets_min_version?(version, min_version) + return false unless version + + Gem::Version.new(version) >= Gem::Version.new(min_version) + rescue ArgumentError => e + logger.warn { "Could not parse version '#{version}': #{e.message}. Treating agent as unavailable." } + false + end + end + end +end diff --git a/lib/bolt/transport/choria/bolt_tasks.rb b/lib/bolt/transport/choria/bolt_tasks.rb new file mode 100644 index 000000000..9aef95147 --- /dev/null +++ b/lib/bolt/transport/choria/bolt_tasks.rb @@ -0,0 +1,248 @@ +# frozen_string_literal: true + +module Bolt + module Transport + class Choria + # Run a task via the bolt_tasks agent. Groups targets by implementation + # to support mixed-platform batches. Starts all groups before polling any + # of them so tasks execute concurrently on nodes across implementations. 
+ # + # @param targets [Array] Targets that have the bolt_tasks agent + # @param task [Bolt::Task] Task to execute + # @param arguments [Hash] Task parameter names to values + # @param result_opts [Hash] Options passed through to emit_results (:action, :name, :position) + # @param callback [Proc] Called with :node_start and :node_result events + # @return [Array] Results for all targets + def run_task_via_bolt_tasks(targets, task, arguments, result_opts, &callback) + logger.debug { "Running task #{task.name} via bolt_tasks agent on #{target_count(targets)}" } + results = [] + + # Start all implementation groups. Each gets its own download + + # run_no_wait sequence. Tasks begin executing on nodes as soon as + # run_no_wait returns. + started_groups = [] + targets.group_by { |target| select_implementation(target, task) }.each do |implementation, impl_targets| + start_result = download_and_start_task(impl_targets, task, implementation, + arguments, result_opts, &callback) + results += start_result[:failed_results] + started_groups << start_result if start_result[:task_id] + end + + # Poll each group. Tasks are already running concurrently on nodes, + # so wall time is dominated by the longest task, not the sum. + # Each group has a different task_id, so they must be polled separately. + started_groups.each do |group| + output_by_target = poll_task_status(group[:targets], group[:task_id], task) + results += emit_results(output_by_target, **result_opts, &callback) + end + + results + end + + # Download task files from the server and start execution for one + # implementation group via bolt_tasks.download and bolt_tasks.run_no_wait. 
+ # + # @param targets [Array] Targets sharing the same implementation + # @param task [Bolt::Task] Task being executed + # @param implementation [Hash] Task implementation with 'path', 'name', 'input_method', 'files' keys + # @param arguments [Hash] Task parameter names to values + # @param result_opts [Hash] Options passed through to emit_results (:action, :name, :position) + # @param callback [Proc] Called with :node_start and :node_result events + # @return [Hash] with keys: + # - :failed_results [Array] Error results from setup phase + # - :targets [Array] Targets that successfully started + # - :task_id [String, nil] Shared task ID for polling, nil if nothing started + def download_and_start_task(targets, task, implementation, arguments, result_opts, &callback) + environment = targets.first.options['puppet-environment'] + input_method = implementation['input_method'] + impl_files = [{ 'name' => File.basename(implementation['name']), 'path' => implementation['path'] }] + + (implementation['files'] || []) + file_specs_json = impl_files.map { |file| task_file_spec(file, task.module_name, environment) }.to_json + + # The failed_results reference will get updated and if we ever end up without + # any targets left to act on, we can return it immediately. + failed_results = [] + none_started_result = { failed_results: failed_results, targets: [], task_id: nil } + + # Download task files + logger.debug { "Downloading task #{task.name} files via bolt_tasks to #{target_count(targets)}" } + response = rpc_request('bolt_tasks', targets, 'bolt_tasks.download') do |client| + client.download(task: task.name, files: file_specs_json, environment: environment) + end + # The bolt_tasks agent uses reply.fail! with statuscode 1 for download + # failures, which rpc_request routes to :responded since statuscode 0-1 + # means the action completed. 
Check rpc_statuscodes to catch these and + # report the download failure clearly instead of letting run_no_wait + # fail with a confusing "task not available" error. + dl_errors = response[:errors] + response[:rpc_statuscodes].each do |target, code| + next if code.zero? || dl_errors.key?(target) + + dl_errors[target] = error_output( + "bolt_tasks.download on #{target.safe_name} failed to download task files", + 'bolt/choria-download-failed' + ) + end + # Must use concat rather than += to preserve reference to failed_results for early return + failed_results.concat(emit_results(dl_errors, **result_opts, &callback)) + remaining = response[:responded].keys - dl_errors.keys + return none_started_result if remaining.empty? + + # Start task execution + logger.debug { "Starting task #{task.name} on #{target_count(remaining)}" } + response = rpc_request('bolt_tasks', remaining, 'bolt_tasks.run_no_wait') do |client| + client.run_no_wait(task: task.name, input_method: input_method, + files: file_specs_json, input: arguments.to_json) + end + failed_results.concat(emit_results(response[:errors], **result_opts, &callback)) + return none_started_result if response[:responded].empty? 
+ + # Extract the shared task_id (all targets get the same one from + # the single run_no_wait call that fanned out to all of them) + task_id = response[:responded].values.first&.dig(:task_id) + unless task_id + no_id_errors = response[:responded].each_with_object({}) do |(target, _), errors| + errors[target] = error_output( + "bolt_tasks.run_no_wait on #{target.safe_name} succeeded but returned no task_id", + 'bolt/choria-missing-task-id' + ) + end + failed_results.concat(emit_results(no_id_errors, **result_opts, &callback)) + return none_started_result + end + + logger.debug { "Started task #{task.name} on #{target_count(response[:responded])}, task_id: #{task_id}" } + { failed_results: failed_results, targets: response[:responded].keys, task_id: task_id } + end + + # Poll bolt_tasks.task_status until all targets complete or timeout. + # + # @param targets [Array] Targets that were started successfully + # @param task_id [String] Shared task ID from run_no_wait + # @param task [Bolt::Task] Task being polled (used for timeout and error messages) + # @return [Hash{Bolt::Target => Hash}] Output hash for every target (success and error) + def poll_task_status(targets, task_id, task) + timeout = targets.first.options['task-timeout'] + + poll_result = poll_with_retries(targets, timeout, 'bolt_tasks.task_status') do |remaining| + response = rpc_request('bolt_tasks', remaining, 'bolt_tasks.task_status') do |client| + client.task_status(task_id: task_id) + end + next { rpc_failed: true, done: {} } if response[:rpc_failed] + + done = response[:errors].dup + + response[:responded].each do |target, data| + if data.nil? 
+ done[target] = error_output( + "bolt_tasks.task_status on #{target.safe_name} returned success but no data", + 'bolt/choria-missing-data' + ) + next + end + next unless data[:completed] + + done[target] = extract_task_output(data, target) + end + + { rpc_failed: false, done: done } + end + + remaining_errors = poll_result[:remaining].each_with_object({}) do |target, errors| + errors[target] = + if poll_result[:rpc_persistent_failure] + error_output("RPC requests to poll task status on #{target.safe_name} failed persistently", + 'bolt/choria-poll-failed') + else + error_output("Task #{task.name} timed out after #{timeout} seconds on #{target.safe_name}", + 'bolt/choria-task-timeout') + end + end + + poll_result[:completed].merge(remaining_errors) + end + + # Extract stdout, stderr, and exitcode from a bolt_tasks task_status response. + # + # @param data [Hash] Task_status response data with :stdout, :stderr, :exitcode keys + # @param target [Bolt::Target] Target for logging and stdout unwrapping context + # @return [Hash] Output hash from output() or error_output() + def extract_task_output(data, target) + exitcode = exitcode_from(data, target, 'task') + output(stdout: unwrap_bolt_tasks_stdout(data[:stdout]), + stderr: data[:stderr], exitcode: exitcode) + end + + # Build a file spec hash for the bolt_tasks download action. Computes + # the Puppet Server file_content URI based on the file's module-relative path. 
#
# @param file [Hash] With 'name' (module-relative path) and 'path' (local absolute path)
# @param module_name [String] Task's module name (used for simple task files)
# @param environment [String] Puppet environment name for the URI params
# @return [Hash] File spec with 'filename', 'sha256', 'size_bytes', and 'uri' keys
def task_file_spec(file, module_name, environment)
  relative_name = file['name']
  validate_file_name!(relative_name)
  local_path = file['path']

  # A name with at least three segments is "<module>/<subdir>/<rest>" and the
  # subdir selects the file_content mount. Anything shorter is a plain task
  # file that lives under the task's own module.
  segments = relative_name.split('/', 3)
  uri_path =
    if segments.length == 3
      owner, kind, remainder = segments
      if kind == 'files'
        "/puppet/v3/file_content/modules/#{owner}/#{remainder}"
      elsif kind == 'lib'
        "/puppet/v3/file_content/plugins/#{owner}/#{remainder}"
      else
        "/puppet/v3/file_content/tasks/#{owner}/#{remainder}"
      end
    else
      "/puppet/v3/file_content/tasks/#{module_name}/#{relative_name}"
    end

  checksum = Digest::SHA256.file(local_path).hexdigest
  byte_size = File.size(local_path)

  {
    'filename' => relative_name,
    'sha256' => checksum,
    'size_bytes' => byte_size,
    'uri' => {
      'path' => uri_path,
      'params' => { 'environment' => environment }
    }
  }
end

# Fix double-encoding in the bolt_tasks agent's wrapper error path.
#
# Normally, create_task_stdout returns a Hash and reply_task_status
# calls .to_json on it, producing a single JSON string like:
#   '{"_output":"hello world"}'
#
# But for wrapper errors, create_task_stdout returns an already
# JSON-encoded String. reply_task_status still calls .to_json on
# it, encoding it a second time. The result is a JSON string whose
# value is itself a JSON string:
#   '"{\\"_error\\":{\\"kind\\":\\"choria.tasks/wrapper-error\\",...}}"'
#
# We parse one layer of JSON. In the normal case, that produces a
# Hash and we return the original string. In the double-encoded
# case, it produces a String (the inner JSON), which we return so
# Result.for_task can parse it.
+ # + # @param agent_stdout [String, nil] JSON-encoded stdout from the bolt_tasks agent + # @return [String, nil] JSON string suitable for Result.for_task + def unwrap_bolt_tasks_stdout(agent_stdout) + return agent_stdout unless agent_stdout.is_a?(String) + + parsed = begin + JSON.parse(agent_stdout) + rescue JSON::ParserError + return agent_stdout + end + + # Normal case: parsed is a Hash, return the original JSON string. + # Double-encoded case: parsed is a String (the inner JSON), return it. + parsed.is_a?(String) ? parsed : agent_stdout + end + end + end +end diff --git a/lib/bolt/transport/choria/client.rb b/lib/bolt/transport/choria/client.rb new file mode 100644 index 000000000..c5b72725d --- /dev/null +++ b/lib/bolt/transport/choria/client.rb @@ -0,0 +1,276 @@ +# frozen_string_literal: true + +module Bolt + module Transport + class Choria + # Number of consecutive RPC poll failures before giving up and marking + # all remaining targets as failed. Used by both polling loops + # (poll_task_status and wait_for_shell_results). + RPC_FAILURE_RETRIES = 3 + + # One-time setup of the local MCollective client connection to the + # NATS broker. MCollective::Config.loadconfig must only be called + # once since it loads plugins via PluginManager.loadclass, and a + # second call raises "Plugin already loaded". + # + # The @client_configured flag is checked twice: once before taking + # the mutex (fast path to avoid lock overhead on every call after + # setup) and once inside (handles the race where two batch threads + # both see false simultaneously and try to configure concurrently). + # + # This function is idempotent, so it should be called before any + # operation that needs the client connection to ensure it is configured + # correctly. 
#
# @param target [Bolt::Target] Any target in the batch (used to read transport options)
# @raise [Bolt::Error] 'bolt/choria-config-not-found' when no readable config file
#   is available; 'bolt/choria-config-failed' when loadconfig raises or when one of
#   the ssl-ca / ssl-cert / ssl-key override files is not readable
def configure_client(target)
  # Fast path: skip the mutex entirely once setup has succeeded.
  return if @client_configured

  @config_mutex.synchronize do
    # Re-check under the lock: another thread may have finished setup
    # while this one was waiting for the mutex.
    return if @client_configured
    # If a previous attempt failed after partially initializing
    # MCollective (e.g., plugins loaded but NATS connector failed),
    # retrying loadconfig would hit "Plugin already loaded" errors.
    # Re-raise the original error so the caller gets a clear message.
    raise @config_error if @config_error

    # We do the require here because this is a pretty meaty library, and
    # no need to load it when OpenBolt starts up if the user isn't using
    # the Choria transport.
    require 'mcollective'

    opts = target.options
    config = MCollective::Config.instance

    # Only load the config file if MCollective has not been configured yet.
    unless config.configured
      # An explicit 'config-file' option wins; otherwise fall back to
      # MCollective's own per-user lookup.
      config_file = opts['config-file'] || MCollective::Util.config_file_for_user

      unless File.readable?(config_file)
        msg = if opts['config-file']
                "Choria config file not found or not readable: #{config_file}"
              else
                "Could not find a readable Choria client config file. " \
                  "Searched: #{MCollective::Util.config_paths_for_user.join(', ')}. " \
                  "Set the 'config-file' option in the Choria transport configuration."
              end
        # Not memoized in @config_error: loadconfig has not run yet, so a
        # later call can still attempt configuration.
        raise Bolt::Error.new(msg, 'bolt/choria-config-not-found')
      end

      begin
        config.loadconfig(config_file)
      rescue StandardError => e
        # Remember the failure so later calls re-raise it instead of
        # calling loadconfig a second time.
        @config_error = Bolt::Error.new(
          "Choria client configuration failed: #{e.class}: #{e.message}",
          'bolt/choria-config-failed'
        )
        raise @config_error
      end
      logger.debug { "Loaded Choria client config from #{config_file}" }
    end

    # Optional override of the NATS brokers; accepts a single value or a list.
    if opts['nats-servers']
      servers = [opts['nats-servers']].flatten
      config.pluginconf['choria.middleware_hosts'] = servers.join(',')
      logger.debug { "NATS servers overridden: #{servers.join(', ')}" }
    end

    # The TLS override is applied only when all three options are present.
    if opts['ssl-ca'] && opts['ssl-cert'] && opts['ssl-key']
      # Report the first option whose file cannot be read.
      unreadable = %w[ssl-ca ssl-cert ssl-key].find { |key| !File.readable?(opts[key]) }
      if unreadable
        raise Bolt::Error.new(
          "File for #{unreadable} is not readable: #{opts[unreadable]}",
          'bolt/choria-config-failed'
        )
      end

      config.pluginconf['security.provider'] = 'file'
      config.pluginconf['security.file.ca'] = opts['ssl-ca']
      config.pluginconf['security.file.certificate'] = opts['ssl-cert']
      config.pluginconf['security.file.key'] = opts['ssl-key']
      logger.debug { "Using file-based TLS security provider with given SSL override(s)" }
    end

    # Cache the configured main collective, then mark setup as complete so
    # subsequent calls take the fast path.
    @default_collective = config.main_collective
    @client_configured = true
  end
end

# Create an MCollective::RPC::Client for one or more targets.
# Accepts a single target or an array. Uses MCollective's direct
# addressing mode (client.discover(nodes:)) to skip broadcast
# discovery and send requests directly to the specified nodes.
#
# Note that when the client is created, if the shell agent isn't already
# installed on the OpenBolt controller node, then the shell DDL that we
# bundle with OpenBolt at lib/mcollective/agent/shell.ddl
# automatically gets loaded since it's on the $LOAD_PATH and in the
# right place for MCollective's plugin loading. The bolt_tasks
# DDL is already included in the choria-mcorpc-support gem.
#
# @param agent_name [String] MCollective agent name (e.g. 'shell', 'bolt_tasks')
# @param targets [Bolt::Target, Array<Bolt::Target>] One or more targets to address
# @param timeout [Numeric] RPC call timeout in seconds
# @return [MCollective::RPC::Client] Configured client with direct addressing enabled
def create_rpc_client(agent_name, targets, timeout)
  target_list = [targets].flatten
  lead_target = target_list.first

  # Start from MCollective's defaults and layer the per-call settings on top.
  rpc_options = MCollective::Util.default_options
  rpc_options[:timeout] = timeout
  rpc_options[:verbose] = false
  rpc_options[:connection_timeout] = lead_target.options['nats-connection-timeout']

  # Only set a collective when one is resolved for the first target.
  collective = collective_for(lead_target)
  rpc_options[:collective] = collective if collective

  MCollective::RPC::Client.new(agent_name, options: rpc_options).tap do |rpc|
    rpc.progress = false
    # Address the nodes directly by identity, de-duplicated.
    rpc.discover(nodes: target_list.map { |target| choria_identity(target) }.uniq)
  end
end

# Make a batched RPC call and split results into responded and errors.
# Yields the RPC client so the caller specifies which action to invoke.
#
# Results are split based on MCollective RPC statuscodes:
#   - statuscode 0: action completed successfully (:responded)
#   - statuscode 1 (RPCAborted): action completed but reported a
#     problem (:responded). The data is preserved rather than
#     discarded because some agents (notably bolt_tasks) use
#     statuscode 1 for application-level failures where the
#     response data is still valid and meaningful (e.g., a task
#     that ran but exited non-zero). Callers must handle this
#     case and not assume :responded means success.
#   - statuscode 2-5: RPC infrastructure error (:errors)
#   - no response: target didn't reply (:errors)
#   - exception: total RPC failure (rpc_failed: true)
#
# Serialized by @rpc_mutex because MCollective's NATS connector is a
# singleton with a shared receive queue. Concurrent RPC calls cause
# reply channel collisions, cross-thread message confusion, and subscription
# conflicts.
# See choria-transport-dev.md for the full explanation.
#
# @param agent [String] MCollective agent name (e.g. 'shell', 'bolt_tasks', 'rpcutil')
# @param targets [Bolt::Target, Array<Bolt::Target>] One or more targets to address
# @param context [String] Human-readable label for logging (e.g. 'shell.start')
# @yield [MCollective::RPC::Client] The configured RPC client to invoke an action on
# @return [Hash] with keys:
#   - :responded [Hash] Targets where the action completed (statuscode 0-1),
#     mapped to their response data
#   - :errors [Hash] Targets with RPC errors or no response, mapped to error output hashes
#   - :rpc_failed [Boolean] True when the entire RPC call failed
#   - :rpc_statuscodes [Hash] Per-target MCollective RPC statuscodes.
#     Includes all targets that responded (both :responded and :errors).
#     Not populated when rpc_failed is true (no individual responses).
def rpc_request(agent, targets, context)
  targets = [targets].flatten

  # Client creation and the action call both happen while holding the mutex.
  replies = @rpc_mutex.synchronize do
    client = create_rpc_client(agent, targets, targets.first.options['rpc-timeout'])
    yield(client)
  end
  replies_by_identity = index_results_by_sender(replies, targets, context)

  outcome = { responded: {}, errors: {}, rpc_failed: false, rpc_statuscodes: {} }
  targets.each do |target|
    reply = replies_by_identity[choria_identity(target)]
    if reply.nil?
      outcome[:errors][target] = error_output(
        "No response from #{target.safe_name} for #{context}",
        'bolt/choria-no-response'
      )
      next
    end

    code = reply[:statuscode]
    if code > 1
      # Codes above 1 are treated as RPC-level errors.
      outcome[:rpc_statuscodes][target] = code
      outcome[:errors][target] = error_output(
        "#{context} on #{target.safe_name} returned RPC error: " \
        "#{reply[:statusmsg]} (code #{code})",
        'bolt/choria-rpc-error'
      )
    else
      # Codes 0 and 1 both keep the response data; code 1 is only logged.
      outcome[:rpc_statuscodes][target] = code
      if code == 1
        logger.warn { "#{context} on #{target.safe_name} had RPC status code #{code}: #{reply[:statusmsg]}" }
      end
      outcome[:responded][target] = reply[:data]
    end
  end
  outcome
rescue Bolt::Error
  # Bolt errors (e.g. from client configuration) propagate unchanged.
  raise
rescue StandardError => e
  # Anything else marks the whole call as failed for every target.
  logger.warn { "#{context} RPC call failed: #{e.class}: #{e.message}" }
  failures = targets.to_h do |target|
    [target, error_output("#{context} failed on #{target.safe_name}: #{e.message}",
                          'bolt/choria-rpc-failed')]
  end
  { responded: {}, errors: failures, rpc_failed: true, rpc_statuscodes: {} }
end

# Configure the client, discover agents, partition targets by agent
# availability, and emit errors for incapable targets.
#
# @param targets [Array] Targets to prepare
# @param agent_name [String] Required agent name (e.g.
#   'shell', 'bolt_tasks')
# @param result_opts [Hash] Options passed through to emit_results (:action, :name, :position)
# @param callback [Proc] Called with :node_start and :node_result events
# @return [Array] Two-element array:
#   - [Array] Targets that have the required agent
#   - [Array] Error results for targets that lack the agent
def prepare_targets(targets, agent_name, result_opts, &callback)
  configure_client(targets.first)
  discover_agents(targets)

  capable, incapable = targets.partition { |target| has_agent?(target, agent_name) }

  # A missing cache entry means discovery got no answer from the node;
  # otherwise the node answered but does not offer the agent.
  agent_errors = incapable.to_h do |target|
    reason =
      if @agent_cache[choria_identity(target)].nil?
        "No agent information available for #{target.safe_name} (node did not respond to discovery)"
      else
        "The '#{agent_name}' agent is not available on #{target.safe_name}."
      end
    [target, error_output(reason, 'bolt/choria-agent-not-available')]
  end

  # These targets never start, so node_start is fired together with the result.
  [capable, emit_results(agent_errors, fire_node_start: true, **result_opts, &callback)]
end

# Index RPC results by sender, keeping only the first response per
# sender and only from the set of expected identities. Logs and discards
# responses from unexpected senders and duplicates.
+ # + # @param results [Array] Raw MCollective RPC result hashes with :sender keys + # @param targets [Array] Expected targets (used to build the allowed sender set) + # @param context [String] Human-readable label for log messages + # @return [Hash{String => Hash}] Sender identity to first valid RPC result hash + def index_results_by_sender(results, targets, context) + expected = targets.to_set { |target| choria_identity(target) } + by_sender = {} + results.each do |result| + sender = result[:sender] + unless sender + logger.warn { "Discarding #{context} response with nil sender" } + next + end + unless expected.include?(sender) + logger.warn { "Discarding #{context} response from unexpected sender '#{sender}'" } + next + end + if by_sender.key?(sender) + if result[:data] == by_sender[sender][:data] + logger.debug { "Ignoring duplicate #{context} response from #{sender}" } + else + logger.warn { "Ignoring duplicate #{context} response from #{sender} with different data" } + end + next + end + by_sender[sender] = result + end + by_sender + end + end + end +end diff --git a/lib/bolt/transport/choria/command_builders.rb b/lib/bolt/transport/choria/command_builders.rb new file mode 100644 index 000000000..2a55093dd --- /dev/null +++ b/lib/bolt/transport/choria/command_builders.rb @@ -0,0 +1,199 @@ +# frozen_string_literal: true + +module Bolt + module Transport + class Choria + # Platform-aware command builders. These generate the right shell + # commands based on whether the target is Windows (PowerShell) or + # POSIX (sh). OS is detected during agent discovery via the + # os.family fact. + + # Build a mkdir command for one or more directories. 
#
# @param target [Bolt::Target] Used for platform detection
# @param paths [Array<String>] Absolute directory paths to create
# @return [String] Shell command
def make_dir_command(target, *paths)
  unless windows_target?(target)
    posix_paths = paths.map { |dir| Shellwords.shellescape(dir) }
    return "mkdir -m 700 -p #{posix_paths.join(' ')}"
  end

  quoted_paths = paths.map { |dir| "'#{ps_escape(dir)}'" }
  "New-Item -ItemType Directory -Force -Path #{quoted_paths.join(', ')}"
end

# Build the command that marks a file as executable for its owner.
# Windows targets need no such step, so nil is returned for them.
#
# @param target [Bolt::Target] Used for platform detection
# @param path [String] Absolute path to the file
# @return [String, nil] Shell command or nil
def make_executable_command(target, path)
  return nil if windows_target?(target)

  "chmod u+x #{Shellwords.shellescape(path)}"
end

# Build the command that recursively deletes a directory.
#
# @param target [Bolt::Target] Used for platform detection
# @param path [String] Absolute path to the directory
# @return [String] Shell command
def cleanup_dir_command(target, path)
  if windows_target?(target)
    "Remove-Item -Recurse -Force -Path '#{ps_escape(path)}'"
  else
    "rm -rf #{Shellwords.shellescape(path)}"
  end
end

# Build the command that decodes base64 content and writes it to a file.
# POSIX targets need the base64 CLI to be available.
#
# @param target [Bolt::Target] Used for platform detection
# @param content_b64 [String] Base64-encoded file content
# @param dest [String] Absolute destination path on the remote node
# @return [String] Shell command
def upload_file_command(target, content_b64, dest)
  unless windows_target?(target)
    return "printf '%s' #{Shellwords.shellescape(content_b64)} | base64 -d > #{Shellwords.shellescape(dest)}"
  end

  "[IO.File]::WriteAllBytes('#{ps_escape(dest)}', " \
    "[Convert]::FromBase64String('#{content_b64}'))"
end

# Prepend environment variables to a command string.
# Returns the command unchanged if env_vars is nil or empty.
#
# @param target [Bolt::Target] Used for platform detection
# @param command [String] The command to prepend env vars to
# @param env_vars [Hash{String => String}, nil] Variable names to values
# @param context [String] Description for error messages (e.g., 'task argument')
# @return [String] Command with env vars prepended
def prepend_env_vars(target, command, env_vars, context)
  return command unless env_vars&.any?

  # Names are interpolated unescaped below, so every one is validated first.
  env_vars.each_key { |name| validate_env_key!(name, context) }

  unless windows_target?(target)
    assignments = env_vars.map { |name, value| "#{name}=#{Shellwords.shellescape(value)}" }
    return "/usr/bin/env #{assignments.join(' ')} #{command}"
  end

  assignments = env_vars.map { |name, value| "$env:#{name} = '#{ps_escape(value)}'" }
  "#{assignments.join('; ')}; & #{command}"
end

# Build a command that feeds data to another command on its stdin.
#
# @param target [Bolt::Target] Used for platform detection
# @param data [String] Data to pipe (typically JSON task arguments)
# @param command [String] The command to receive stdin
# @return [String] Shell command with stdin piping
def stdin_pipe_command(target, data, command)
  return "printf '%s' #{Shellwords.shellescape(data)} | #{command}" unless windows_target?(target)

  # Use a here-string (@'...'@) to avoid escaping issues with
  # large JSON payloads. Content between @' and '@ is literal.
  "@'\n#{data}\n'@ | & #{command}"
end

# Quote a single argument for the target platform's shell.
#
# @param target [Bolt::Target] Used for platform detection
# @param str [String] The string to escape
# @return [String] Escaped string (single-quoted on Windows, sh-escaped on POSIX)
def escape_arg(target, str)
  if windows_target?(target)
    "'#{ps_escape(str)}'"
  else
    Shellwords.shellescape(str)
  end
end

# Join path segments using the target platform's separator.
# Normalizes embedded forward slashes to backslashes on Windows.
#
# @param target [Bolt::Target] Used for platform detection
# @param parts [Array<String>] Path segments to join
# @return [String] Joined path
def join_path(target, *parts)
  sep = windows_target?(target) ? '\\' : '/'
  parts = parts.map { |part| part.tr('/', sep) } if sep != '/'
  parts.join(sep)
end

# Wrap a PowerShell script for execution via shell agent. Uses
# -EncodedCommand with Base64-encoded UTF-16LE (the encoding
# Microsoft requires for -EncodedCommand) to avoid all quoting
# issues with cmd.exe and PowerShell metacharacters.
#
# @param script [String] PowerShell script to encode and wrap
# @return [String] powershell.exe command with -EncodedCommand
def powershell_cmd(script)
  "powershell.exe -NoProfile -NonInteractive -EncodedCommand #{Base64.strict_encode64(script.encode('UTF-16LE'))}"
end

# Escape a value for use inside a PowerShell single-quoted string.
#
# PowerShell accepts the typographic quotes U+2018, U+2019, U+201A and
# U+201B as single-quote delimiters in addition to the ASCII apostrophe.
# All five are doubled here; otherwise a value containing one of them
# could terminate the quoted string early and have the remainder parsed
# as code.
#
# Strings that are not valid UTF-8 (e.g. binary data) cannot be matched
# against the Unicode pattern, so only the ASCII apostrophe is doubled
# for them.
#
# @param str [String] String to escape
# @return [String] String with every single-quote character doubled
def ps_escape(str)
  return str.gsub("'", "''") unless str.encoding == Encoding::UTF_8 && str.valid_encoding?

  str.gsub(/['\u2018\u2019\u201A\u201B]/) { |quote| quote * 2 }
end

# Build the full command string for task execution via the shell agent,
# handling interpreter selection, environment variable injection, and
# stdin piping.
#
# @param target [Bolt::Target] Target (used for platform detection)
# @param remote_task_path [String] Absolute path to the task executable on the remote node
# @param arguments [Hash] Task parameter names to values
# @param input_method [String] How to pass arguments: 'stdin', 'environment', or 'both'
# @param interpreter_options [Hash{String => String}] File extension to interpreter path mapping
# @return [String] The fully constructed shell command
def build_task_command(target, remote_task_path, arguments, input_method, interpreter_options)
  interpreter = select_interpreter(remote_task_path, interpreter_options)

  # The escaped task path, prefixed by the escaped interpreter words if any.
  cmd = escape_arg(target, remote_task_path)
  if interpreter
    prefix = Array(interpreter).map { |part| escape_arg(target, part) }.join(' ')
    cmd = "#{prefix} #{cmd}"
  end

  needs_env = Bolt::Task::ENVIRONMENT_METHODS.include?(input_method)
  needs_stdin = Bolt::Task::STDIN_METHODS.include?(input_method)

  if needs_env && needs_stdin && windows_target?(target)
    # On Windows, piping stdin into a multi-statement command
    # requires a script block. Pipeline data doesn't automatically
    # flow through a script block to inner commands, so we
    # explicitly forward $input via a pipe.
    env_params = envify_params(arguments)
    env_params.each_key { |key| validate_env_key!(key, 'task argument') }
    set_stmts = env_params.map { |key, val| "$env:#{key} = '#{ps_escape(val)}'" }
    cmd = stdin_pipe_command(target, arguments.to_json,
                             "{ #{set_stmts.join('; ')}; $input | & #{cmd} }")
  else
    # Env vars are applied first so the stdin pipe wraps the complete command.
    cmd = prepend_env_vars(target, cmd, envify_params(arguments), 'task argument') if needs_env
    cmd = stdin_pipe_command(target, arguments.to_json, cmd) if needs_stdin
  end

  cmd
end

# Convert task arguments to PT_-prefixed environment variable hash.
# Duplicated from Bolt::Shell#envify_params. We don't use Bolt::Shell
# classes because they interleave command building with connection-based
# execution (IO pipes, sudo prompts). With the Choria transport, we just
# need to build the command and send it via RPC so all the shell agents
# on the targets can execute it themselves.
+ # + # @param params [Hash{String => Object}] Task parameter names to values + # @return [Hash{String => String}] Environment variables with PT_ prefix + def envify_params(params) + params.each_with_object({}) do |(key, val), env| + val = val.to_json unless val.is_a?(String) + env["PT_#{key}"] = val + end + end + end + end +end diff --git a/lib/bolt/transport/choria/helpers.rb b/lib/bolt/transport/choria/helpers.rb new file mode 100644 index 000000000..95d02c06c --- /dev/null +++ b/lib/bolt/transport/choria/helpers.rb @@ -0,0 +1,197 @@ +# frozen_string_literal: true + +module Bolt + module Transport + class Choria + # Polling interval between rounds, used by poll_task_status + # and wait_for_shell_results. Each round makes one batched RPC call + # regardless of target count, so a 1-second interval balances + # responsiveness against broker load. + POLL_INTERVAL_SECONDS = 1 + + # Matches Windows absolute paths like C:\temp or D:/foo. + # Used by validate_file_name! and Config::Transport::Choria#absolute_path?. + WINDOWS_PATH_REGEX = %r{\A[A-Za-z]:[\\/]} + + def target_count(targets) + count = targets.is_a?(Hash) ? targets.size : targets.length + "#{count} #{count == 1 ? 'target' : 'targets'}" + end + + # Shared polling loop for bolt_tasks and shell polling. Handles sleep + # timing, round counting, RPC failure retry, and deadline enforcement. 
#
# The block receives the remaining targets each round and returns:
#   { done: {target => output_hash}, rpc_failed: bool }
#
# @param targets [Array, Hash] Initial targets to poll (duped internally)
# @param timeout [Numeric] Maximum seconds before exiting
# @param context [String] Label for log messages
# @return [Hash] with keys:
#   - :completed [Hash{Target => Hash}] All finished target outputs
#   - :remaining [Array, Hash] Targets still pending when the loop exited
#   - :rpc_persistent_failure [Boolean] True if loop exited due to persistent RPC failures
def poll_with_retries(targets, timeout, context)
  give_up_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout
  pending = targets.dup
  finished = {}
  consecutive_failures = 0
  round_number = 0

  until pending.empty?
    # Every round, including the first, waits one interval before polling.
    sleep(POLL_INTERVAL_SECONDS)
    round_number += 1
    logger.debug { "Poll round #{round_number}: #{target_count(pending)} still pending" }

    outcome = yield(pending)

    if outcome[:rpc_failed]
      # A failed round yields no results; it only counts toward the retry limit.
      consecutive_failures += 1
      logger.warn { "#{context} poll failed (attempt #{consecutive_failures}/#{RPC_FAILURE_RETRIES})" }
      break if consecutive_failures >= RPC_FAILURE_RETRIES
    else
      # Any successful round resets the failure streak.
      consecutive_failures = 0

      outcome[:done].each do |target, output|
        finished[target] = output
        pending.delete(target)
      end

      # The deadline is checked after recording results, so targets that
      # finished in the final round are still reported as completed.
      break if Process.clock_gettime(Process::CLOCK_MONOTONIC) >= give_up_at
    end
  end

  {
    completed: finished,
    remaining: pending,
    rpc_persistent_failure: consecutive_failures >= RPC_FAILURE_RETRIES
  }
end

# Build a Bolt::Result from an output hash. Handles both success and
# error cases based on the presence of the :error key.
#
# @param target [Bolt::Target] The target this result belongs to
# @param data [Hash] Output hash with keys :stdout, :stderr, :exitcode, and
#   optionally :error and :error_kind for failures
# @param action [String] One of 'task', 'command', or 'script'
# @param name [String] Task/command/script name for result metadata
# @param position [Array] Positional info for result tracking
# @return [Bolt::Result] The constructed result
# @raise [Bolt::Error] When there is no :error and action is not a known action
def build_result(target, data, action:, name:, position:)
  # An :error entry wins over everything else, whatever the action is.
  if data[:error]
    failure = Bolt::Error.new(data[:error], data[:error_kind])
    return Bolt::Result.from_exception(target, failure, action: action, position: position)
  end

  case action
  when 'task'
    Bolt::Result.for_task(target, data[:stdout], data[:stderr], data[:exitcode], name, position)
  when 'command', 'script'
    # for_command takes string keys and 'exit_code' rather than :exitcode.
    command_output = {
      'stdout' => data[:stdout],
      'stderr' => data[:stderr],
      'exit_code' => data[:exitcode]
    }
    Bolt::Result.for_command(target, command_output, action, name, position)
  else
    raise Bolt::Error.new(
      "Unknown action '#{action}' in build_result",
      'bolt/choria-unknown-action'
    )
  end
end

# Convert a hash of { target => output } into Results, fire callbacks,
# and return the Results array. When fire_node_start is true, fires a
# :node_start callback before each :node_result.
+ # + # @param target_outputs [Hash{Bolt::Target => Hash}] Map of targets to output hashes + # @param action [String] One of 'task', 'command', or 'script' + # @param name [String] Task/command/script name for result metadata + # @param position [Array] Positional info for result tracking + # @param fire_node_start [Boolean] Whether to emit :node_start before each result + # @param callback [Proc] Called with :node_start and :node_result events + # @return [Array] Results for all targets in the hash + def emit_results(target_outputs, action:, name:, position:, fire_node_start: false, &callback) + target_outputs.map do |target, data| + callback&.call(type: :node_start, target: target) if fire_node_start + result = build_result(target, data, action: action, name: name, position: position) + callback&.call(type: :node_result, result: result) + result + end + end + + # Build an output hash from command/task output. + def output(stdout: nil, stderr: nil, exitcode: nil) + { stdout: stdout || '', stderr: stderr || '', exitcode: exitcode || 0 } + end + + # Build an error output hash. When actual output is available (e.g. + # a command ran but failed), pass it through so the user sees it. + def error_output(message, kind, stdout: nil, stderr: nil, exitcode: 1) + output(stdout: stdout, stderr: stderr, exitcode: exitcode) + .merge(error: message, error_kind: kind) + end + + # Extract exit code from RPC response data, defaulting to 1 with a + # warning if the agent returned nil. + # + # @param data [Hash] RPC response data containing :exitcode + # @param target [Bolt::Target] Target for logging context + # @param context [String] Human-readable label for the log message + # @return [Integer] The exit code from the data, or 1 if nil + def exitcode_from(data, target, context) + exitcode = data[:exitcode] || data['exitcode'] + if exitcode.nil? + logger.warn { + "Agent on #{target.safe_name} returned no exit code for #{context}. " \ + "Defaulting to exit code 1. 
This usually indicates an agent-level error." + } + exitcode = 1 + end + exitcode + end + + # Validate that a file name does not contain path traversal sequences + # or absolute paths. Checks both POSIX and Windows conventions. + # Raises Bolt::Error on violations. + # + # @param name [String] Task file name to validate + def validate_file_name!(name) + if name.include?("\0") + raise Bolt::Error.new( + "Invalid null byte in task file name: #{name.inspect}", + 'bolt/invalid-task-filename' + ) + end + + if name.start_with?('/') || name.match?(WINDOWS_PATH_REGEX) + raise Bolt::Error.new( + "Absolute path not allowed in task file name: '#{name}'", + 'bolt/invalid-task-filename' + ) + end + + if name.split(%r{[/\\]}).include?('..') + raise Bolt::Error.new( + "Path traversal detected in task file name: '#{name}'", + 'bolt/path-traversal' + ) + end + end + + # Validate an environment variable key is safe for shell interpolation. + # + # @param key [String] Environment variable name to validate + # @param context [String] Description for error messages + def validate_env_key!(key, context) + safe_pattern = /\A[A-Za-z_][A-Za-z0-9_]*\z/ + return if safe_pattern.match?(key) + + raise Bolt::Error.new( + "Unsafe environment variable name '#{key}' in #{context}. " \ + "Names must match #{safe_pattern.source}", + 'bolt/invalid-env-var-name' + ) + end + end + end +end diff --git a/lib/bolt/transport/choria/shell.rb b/lib/bolt/transport/choria/shell.rb new file mode 100644 index 000000000..6e8d737b2 --- /dev/null +++ b/lib/bolt/transport/choria/shell.rb @@ -0,0 +1,560 @@ +# frozen_string_literal: true + +module Bolt + module Transport + class Choria + # Terminal shell job statuses that indicate the process has finished. + SHELL_DONE_STATUSES = %w[stopped failed].freeze + + # Run a command on targets via the shell agent. Assumes all targets in + # the batch are the same platform (POSIX or Windows). 
# Mixed-platform
# batches use the first capable target's platform for command syntax.
#
# @param targets [Array<Bolt::Target>] Targets in a single collective batch
# @param command [String] Shell command to execute
# @param options [Hash] Execution options - supports :env_vars for environment variables
# @param position [Array] Positional info for result tracking
# @param callback [Proc] Called with :node_start and :node_result events
# @return [Array<Bolt::Result>] Results for all targets
def batch_command(targets, command, options = {}, position = [], &callback)
  # Results are named after the command as given, before env vars are prepended.
  result_opts = { action: 'command', name: command, position: position }
  shell_targets, results = prepare_targets(targets, 'shell', result_opts, &callback)
  return results if shell_targets.empty?

  logger.debug { "Running command via shell agent on #{target_count(shell_targets)}" }

  lead_target = shell_targets.first
  timeout = lead_target.options['command-timeout']
  full_command = prepend_env_vars(lead_target, command, options[:env_vars], 'run_command env_vars')

  shell_targets.each { |target| callback&.call(type: :node_start, target: target) }

  # Start failures are reported first, then the outcome of every started job.
  pending, start_failures = shell_start(shell_targets, full_command)
  started_failed = emit_results(start_failures, **result_opts, &callback)
  finished = emit_results(wait_for_shell_results(pending, timeout), **result_opts, &callback)

  results + started_failed + finished
end

# Run a script on targets via the shell agent. Assumes all targets in
# the batch are the same platform (POSIX or Windows). Mixed-platform
# batches use the first capable target's platform for infrastructure
# commands (mkdir, upload, chmod, cleanup).
# Run a script on targets via the shell agent. The script is read locally,
# uploaded into a fresh per-batch tmpdir, made executable, started
# asynchronously and polled until it finishes or times out. All
# infrastructure commands (mkdir, upload, chmod, cleanup) are built for the
# first capable target's platform, so the batch is assumed to be
# single-platform.
#
# Targets that fail any setup step are reported immediately and dropped
# from the remaining steps. The tmpdir is cleaned up in an ensure block on
# every capable target, including those that failed part-way.
#
# @param targets [Array<Bolt::Target>] Targets in a single collective batch
# @param script [String] Local path to the script file
# @param arguments [Array] Command-line arguments to pass to the script
# @param options [Hash] Execution options; supports :script_interpreter
# @param position [Array] Positional info for result tracking
# @param callback [Proc] Called with :node_start and :node_result events
# @return [Array] Results for all targets
def batch_script(targets, script, arguments, options = {}, position = [], &callback)
  result_opts = { action: 'script', name: script, position: position }
  shell_targets, results = prepare_targets(targets, 'shell', result_opts, &callback)
  return results if shell_targets.empty?

  logger.debug { "Running script via shell agent on #{target_count(shell_targets)}" }

  first_target = shell_targets.first
  arguments = unwrap_sensitive_args(arguments)
  timeout = first_target.options['command-timeout']
  tmpdir = generate_tmpdir_path(first_target)

  # Read the script before announcing node_start: a missing or unreadable
  # local file raises here, before anything has been done remotely.
  script_content = File.binread(script)

  shell_targets.each { |target| callback&.call(type: :node_start, target: target) }

  begin
    remote_path = join_path(first_target, tmpdir, File.basename(script))
    # Shrinks after every step; shell_targets stays intact for cleanup.
    active_targets = shell_targets.dup

    # Create a temp directory with restricted permissions
    failures = shell_run(active_targets,
                         make_dir_command(first_target, tmpdir),
                         description: 'mkdir tmpdir')
    results += emit_results(failures, **result_opts, &callback)
    active_targets -= failures.keys

    # Upload the script file
    if active_targets.any?
      failures = upload_file_content(active_targets, script_content, remote_path)
      results += emit_results(failures, **result_opts, &callback)
      active_targets -= failures.keys
    end

    # Make the script executable (no-op on Windows)
    chmod_cmd = make_executable_command(first_target, remote_path)
    if active_targets.any? && chmod_cmd
      failures = shell_run(active_targets, chmod_cmd, description: 'chmod script')
      results += emit_results(failures, **result_opts, &callback)
      active_targets -= failures.keys
    end

    # Execute the script asynchronously and poll for completion
    if active_targets.any?
      interpreter = select_interpreter(script, first_target.options['interpreters'])
      cmd_parts = []
      # The interpreter prefix is only used when the caller opted in via
      # options[:script_interpreter] AND an interpreter was selected.
      cmd_parts += Array(interpreter).map { |part| escape_arg(first_target, part) } if interpreter && options[:script_interpreter]
      cmd_parts << escape_arg(first_target, remote_path)
      cmd_parts += arguments.map { |arg| escape_arg(first_target, arg) }

      pending, start_failures = shell_start(active_targets, cmd_parts.join(' '))
      results += emit_results(start_failures, **result_opts, &callback)
      results += emit_results(wait_for_shell_results(pending, timeout), **result_opts, &callback)
    end
  ensure
    # Runs on success, failure and exception alike.
    cleanup_tmpdir(shell_targets, tmpdir)
  end

  results
end

# Generate a unique remote tmpdir path for batch operations. The leaf is
# always "bolt-choria-<uuid>"; cleanup_tmpdir relies on that prefix.
#
# @param target [Bolt::Target] Target whose platform and tmpdir config determine the base path
# @return [String] Absolute path to a unique temporary directory
def generate_tmpdir_path(target)
  base = target.options['tmpdir']
  # A '/tmp' base is swapped for the Windows temp directory on Windows
  # targets; presumably '/tmp' is the configured default (TODO confirm).
  base = 'C:\Windows\Temp' if base == '/tmp' && windows_target?(target)
  join_path(target, base, "bolt-choria-#{SecureRandom.uuid}")
end
# Clean up a remote tmpdir on targets, logging per-target failures.
# Used in ensure blocks after batch_script and batch_task_shell, so it
# never raises for remote failures: they are logged as warnings instead.
#
# As a safety net the directory is only removed when its final path
# segment starts with 'bolt-choria-'. The segment is found by splitting on
# both '/' and '\' because File.basename only recognises '\' as a
# separator when the controller itself runs on Windows; on a POSIX
# controller a backslash-separated Windows tmpdir would otherwise never
# pass the check and never be removed.
#
# @param targets [Array<Bolt::Target>] Targets to clean up on
# @param tmpdir [String] Absolute path to the temporary directory to remove
# @return [void]
def cleanup_tmpdir(targets, tmpdir)
  lead = targets.first
  # Nothing to do for an empty batch; raising here would mask the
  # exception that triggered the caller's ensure block.
  return unless lead
  return unless lead.options.fetch('cleanup', true)

  leaf = tmpdir.to_s.split(%r{[/\\]}).last.to_s
  unless leaf.start_with?('bolt-choria-')
    logger.warn { "Refusing to delete unexpected tmpdir path: #{tmpdir}" }
    return
  end

  begin
    failures = shell_run(targets, cleanup_dir_command(lead, tmpdir),
                         description: 'cleanup tmpdir')
    failures.each do |target, failure|
      logger.warn { "Cleanup failed on #{target.safe_name}. Task data may remain in #{tmpdir}. #{failure[:error]}" }
    end
  rescue StandardError => e
    logger.warn { "Cleanup of #{tmpdir} failed on all targets: #{e.message}" }
  end
end
# Upload task files and start execution for one implementation group.
#
# Local files are read (and extra file names validated) before anything is
# done remotely. The remote steps then run in order: mkdir tmpdir, mkdir
# module layout + upload extra files (only when the implementation has
# extra files), upload the executable, chmod, start. Targets that fail a
# step are reported through emit_results and dropped from later steps.
#
# The caller only learns the tmpdir path from the return value, so it
# cannot clean up if this method raises. Any StandardError raised once the
# remote steps have begun therefore triggers cleanup_tmpdir here before
# the exception is re-raised.
#
# @param targets [Array<Bolt::Target>] Targets sharing the same implementation
# @param task [Bolt::Task] Task being executed
# @param implementation [Hash] Task implementation with 'path', 'name', 'input_method', 'files' keys
# @param arguments [Hash] Task parameter names to values (not mutated)
# @param result_opts [Hash] Options passed through to emit_results (:action, :name, :position)
# @param callback [Proc] Called with :node_start and :node_result events
# @return [Hash] with keys:
#   - :failed_results [Array] Error results from setup phase
#   - :pending [Hash] Targets mapped to { handle: uuid } for polling
#   - :tmpdir [String] Remote tmpdir path for cleanup
def upload_and_start_task(targets, task, implementation, arguments, result_opts, &callback)
  # Work on a copy: '_installdir' may be added below.
  arguments = arguments.dup
  executable = implementation['path']
  input_method = implementation['input_method']
  extra_files = implementation['files']
  first_target = targets.first
  tmpdir = generate_tmpdir_path(first_target)

  executable_content = File.binread(executable)
  extra_file_contents = {}
  extra_files.each do |file|
    validate_file_name!(file['name'])
    extra_file_contents[file['name']] = File.binread(file['path'])
  end

  failed_results = []
  active_targets = targets.dup
  task_dir = tmpdir
  pending = {}

  begin
    # Create the tmpdir
    failures = shell_run(active_targets,
                         make_dir_command(first_target, tmpdir),
                         description: 'mkdir tmpdir')
    failed_results += emit_results(failures, **result_opts, &callback)
    active_targets -= failures.keys

    # Tasks with extra files get a module-layout directory tree in
    # tmpdir, and _installdir is set so the task can find them.
    # Simple tasks go directly in tmpdir with no _installdir.
    if active_targets.any? && extra_files.any?
      arguments['_installdir'] = tmpdir
      task_dir = join_path(first_target, tmpdir, task.tasks_dir)

      # Create subdirectories for the task and its dependencies
      extra_dirs = extra_files.map { |file| join_path(first_target, tmpdir, File.dirname(file['name'])) }.uniq
      all_dirs = [task_dir] + extra_dirs
      failures = shell_run(active_targets,
                           make_dir_command(first_target, *all_dirs),
                           description: 'mkdir task dirs')
      failed_results += emit_results(failures, **result_opts, &callback)
      active_targets -= failures.keys

      # Upload each dependency file to its module-relative path
      extra_files.each do |file|
        break if active_targets.empty?

        failures = upload_file_content(active_targets, extra_file_contents[file['name']],
                                       join_path(first_target, tmpdir, file['name']))
        failed_results += emit_results(failures, **result_opts, &callback)
        active_targets -= failures.keys
      end
    end

    # Upload the main task executable. remote_task_path stays nil when no
    # target survived the earlier steps, which disables every later step.
    remote_task_path = join_path(first_target, task_dir, File.basename(executable)) if active_targets.any?
    if remote_task_path
      failures = upload_file_content(active_targets, executable_content, remote_task_path)
      failed_results += emit_results(failures, **result_opts, &callback)
      active_targets -= failures.keys
    end

    # Make the task executable (no-op on Windows)
    chmod_cmd = make_executable_command(first_target, remote_task_path) if remote_task_path
    if active_targets.any? && chmod_cmd
      failures = shell_run(active_targets, chmod_cmd, description: 'chmod task')
      failed_results += emit_results(failures, **result_opts, &callback)
      active_targets -= failures.keys
    end

    # Start the task asynchronously
    if active_targets.any? && remote_task_path
      full_cmd = build_task_command(first_target, remote_task_path, arguments, input_method,
                                    first_target.options['interpreters'])
      pending, start_failures = shell_start(active_targets, full_cmd)
      failed_results += emit_results(start_failures, **result_opts, &callback)
    end
  rescue StandardError
    # The caller never receives :tmpdir on this path, so remove it here.
    cleanup_tmpdir(targets, tmpdir)
    raise
  end

  { failed_results: failed_results, pending: pending, tmpdir: tmpdir }
end
# Upload file content to the same path on multiple targets via base64.
# The whole file travels in one RPC message, so its size is bounded by the
# NATS max message size (default 1MB, configurable via
# plugin.choria.network.client_max_payload in the Choria broker config);
# with ~33% base64 overhead that is roughly 750KB of raw content by
# default. A chunked file-transfer agent is planned to lift this limit.
#
# @param targets [Array<Bolt::Target>] Targets to upload to
# @param content [String] Raw file content (binary-safe)
# @param destination [String] Absolute path on the remote node
# @return [Hash{Bolt::Target => Hash}] Failures only; empty hash means all succeeded
def upload_file_content(targets, content, destination)
  logger.debug { "Uploading #{content.bytesize} bytes to #{destination} on #{target_count(targets)}" }
  # 'm0' is strict base64: no line breaks, padded output.
  payload = [content].pack('m0')
  shell_run(targets,
            upload_file_command(targets.first, payload, destination),
            description: "upload #{destination}")
end
# Wait for async shell handles to complete, fetch their output via
# shell_statuses, and kill timed-out processes.
#
# Each polling round asks shell_list which handles are finished. Entries
# that already carry an :error are final; the rest have their output
# fetched in one shell_statuses call. Whatever is still pending when
# polling stops is reported either as a persistent polling failure or as
# a timeout (after attempting to kill the remote process).
#
# @param pending [Hash{Bolt::Target => Hash}] Targets to poll, each mapped to { handle: uuid_string }
# @param timeout [Numeric] Maximum seconds to wait before killing remaining processes
# @return [Hash{Bolt::Target => Hash}] Output hash for every target (success and error)
def wait_for_shell_results(pending, timeout)
  return {} if pending.empty?

  outcome = poll_with_retries(pending, timeout, 'shell.list') do |remaining|
    finished, rpc_failed = shell_list(remaining)

    if rpc_failed
      { rpc_failed: true, done: {} }
    else
      errored, fetchable = finished.partition { |_node, state| state[:error] }.map(&:to_h)
      done = errored

      unless fetchable.empty?
        logger.debug { "Fetching output from #{target_count(fetchable)}" }
        fetched = shell_statuses(fetchable)
        fetchable.each_key do |node|
          done[node] = fetched[node] || error_output(
            "Command completed on #{node.safe_name} but output could not be fetched",
            'bolt/choria-result-processing-error'
          )
        end
      end

      { rpc_failed: false, done: done }
    end
  end

  leftover = outcome[:remaining]
  stragglers =
    if leftover.empty?
      {}
    elsif outcome[:rpc_persistent_failure]
      leftover.keys.to_h do |node|
        [node, error_output(
          "RPC requests to poll shell status on #{node.safe_name} failed persistently",
          'bolt/choria-poll-failed'
        )]
      end
    else
      # Polling ended with jobs still running: treat as a timeout.
      kill_timed_out_processes(leftover)
      leftover.keys.to_h do |node|
        [node, error_output(
          "Command timed out after #{timeout} seconds on #{node.safe_name}",
          'bolt/choria-command-timeout'
        )]
      end
    end

  outcome[:completed].merge(stragglers)
end
# Fetch stdout/stderr/exitcode from completed targets via the
# shell.statuses RPC action. Requires shell agent >= 1.2.0.
#
# Every handle in the batch is sent to every target; each target's own
# handle is then looked up in its response. Bolt::Error is re-raised
# untouched. Any other StandardError is logged and converted into an
# error result for each target that has no result yet, so a partial
# failure still returns whatever was collected.
#
# @param targets [Hash{Bolt::Target => Hash}] Completed targets mapped to { handle: uuid_string }
# @return [Hash{Bolt::Target => Hash}] Output hash for each target
# @raise [Bolt::Error] when raised by the underlying calls
def shell_statuses(targets)
  # Must be assigned before anything that can raise: the rescue clause
  # below fills in this hash and would otherwise find it nil.
  results = {}
  handles = targets.transform_values { |data| data[:handle] }
  logger.debug { "Fetching shell.statuses for #{target_count(targets.keys)}" }

  response = rpc_request('shell', targets.keys, 'shell.statuses') do |client|
    client.statuses(handles: handles.values)
  end

  response[:errors].each do |target, fail_output|
    results[target] = fail_output
  end

  response[:responded].each do |target, data|
    statuses = data&.dig(:statuses)
    handle = handles[target]

    unless statuses
      results[target] = error_output(
        "shell.statuses on #{target.safe_name} returned no data",
        'bolt/choria-missing-data'
      )
      next
    end

    status_data = statuses[handle]
    unless status_data
      results[target] = error_output(
        "shell.statuses on #{target.safe_name} did not include handle #{handle}",
        'bolt/choria-missing-data'
      )
      next
    end

    status = status_data['status']&.to_s
    stdout = status_data['stdout']
    stderr = status_data['stderr']
    error_msg = status_data['error']

    if status == 'error'
      results[target] = error_output(
        "Handle #{handle} not found on #{target.safe_name}: #{error_msg}",
        'bolt/choria-handle-not-found'
      )
    elsif status == 'failed'
      # Reported with a fixed exit code of 1 rather than one read from the response.
      results[target] = error_output(
        "Process failed on #{target.safe_name}: #{stderr}",
        'bolt/choria-process-failed',
        stdout: stdout, stderr: stderr, exitcode: 1
      )
    else
      exitcode = exitcode_from(status_data, target, 'shell.statuses')
      results[target] = output(stdout: stdout, stderr: stderr, exitcode: exitcode)
    end
  end

  results
rescue Bolt::Error
  raise
rescue StandardError => e
  logger.warn { "shell.statuses RPC call failed: #{e.class}: #{e.message}" }
  targets.each_key do |target|
    # Keep results gathered before the failure.
    results[target] ||= error_output(
      "Fetching output from #{target.safe_name} failed: #{e.class}: #{e.message}",
      'bolt/choria-result-processing-error'
    )
  end
  results
end
# Kill processes on timed-out targets. Runs one shell.kill RPC call per
# target because every target has its own handle. Best-effort: a failed
# kill is logged as a warning and the remaining targets are still tried.
# A future batched kill action (like shell.statuses) would remove the
# need for the per-target loop.
#
# @param targets [Hash{Bolt::Target => Hash}] Timed-out targets mapped to { handle: uuid_string }
# @return [Hash{Bolt::Target => Hash}] The targets hash that was passed in
def kill_timed_out_processes(targets)
  logger.debug { "Killing timed-out processes on #{target_count(targets)}" }
  targets.each do |node, job|
    rpc_request('shell', node, 'shell.kill') { |client| client.kill(handle: job[:handle]) }
  rescue StandardError => e
    logger.warn { "Failed to kill process on #{node.safe_name}: #{e.message}" }
  end
end
diff --git a/lib/mcollective/agent/shell.ddl b/lib/mcollective/agent/shell.ddl new file mode 100644 index 000000000..1e0294cd5 --- /dev/null +++ b/lib/mcollective/agent/shell.ddl @@ -0,0 +1,154 @@ +metadata :name => "shell", + :description => "Run commands with the local shell", + :author => "Puppet Labs", + :license => "Apache-2.0", + :version => "1.2.0", + :url => "https://github.com/choria-plugins/shell-agent", + :timeout => 180 + +action "run", :description => "Run a command" do + display :always + + input :command, + :prompt => "Command", + :description => "Command to run", + :type => :string, + :validation => '.*', + :maxlength => 10 * 1024, + :optional => false + + input :user, + :prompt => "User", + :description => "User to run command as", + :type => :string, + :validation => '.*', + :maxlength => 1024, + :optional => true + + input :timeout, + :prompt => "Timeout", + :description => "Timeout to wait for the command to complete", + :type => :float, + :optional => true + # TODO(richardc): validate positive. 
May need another validator class + + output :stdout, + :description => "stdout from the command", + :display_as => "stdout" + + output :stderr, + :description => "stderr from the command", + :display_as => "stderr" + + output :success, + :description => "did the process exit successfully", + :display_as => "success" + + output :exitcode, + :description => "exit code of the command", + :display_as => "exitcode" +end + +action "start", :description => "Spawn a command" do + display :always + + input :command, + :prompt => "Command", + :description => "Command to run", + :type => :string, + :validation => '.*', + :maxlength => 10 * 1024, + :optional => false + + input :user, + :prompt => "User", + :description => "User to run command as", + :type => :string, + :validation => '.*', + :maxlength => 1024, + :optional => true + + output :handle, + :description => "identifier to a running command", + :display_as => "handle" +end + +action "status", :description => "Get status of managed command" do + display :always + + input :handle, + :prompt => "Handle", + :description => "Handle of the command", + :type => :string, + :validation => '^[0-9a-z\-]*$', + :maxlength => 36, + :optional => false + + input :stdout_offset, + :prompt => "stdout_offset", + :description => "stdout_offset", + :type => :integer, + :optional => true + + input :stderr_offset, + :prompt => "stderr_offset", + :description => "stderr_offset", + :type => :integer, + :optional => true + + # Running, Exited + output :status, + :description => "status of the command", + :display_as => "status" + + # Stdout to this point - resets internal state + output :stdout, + :description => "stdout of the command", + :display_as => "stdout" + + # Stderr to this point - resets internal state + output :stderr, + :description => "stderr of the command", + :display_as => "stderr" + + # Only meaningful if status == Exited + output :exitcode, + :description => "exitcode of the command", + :display_as => "exitcode" + +end + 
+action "list", :description => "Get a list of all running commands" do + display :always + + output :jobs, + :description => "state of managed jobs", + :display_as => "jobs" + +end + +action "statuses", :description => "Get status and output of multiple managed commands" do + display :always + + input :handles, + :prompt => "Handles", + :description => "Array of command handles to query", + :type => :array, + :optional => false + + output :statuses, + :description => "status and output keyed by handle", + :display_as => "statuses" +end + +action "kill", :description => "Kill a command by handle" do + display :always + + input :handle, + :prompt => "Handle", + :description => "Handle of the command", + :type => :string, + :validation => '^[0-9a-z\-]*$', + :maxlength => 36, + :optional => false +end diff --git a/openbolt.gemspec b/openbolt.gemspec index 7ebe3eb26..41a2a73c2 100644 --- a/openbolt.gemspec +++ b/openbolt.gemspec @@ -18,6 +18,7 @@ Gem::Specification.new do |spec| spec.files = Dir['exe/*'] + Dir['lib/**/*.rb'] + Dir['lib/**/*.json'] + + Dir['lib/**/*.ddl'] + Dir['libexec/*'] + Dir['bolt-modules/*/lib/**/*.rb'] + Dir['bolt-modules/*/types/**/*.pp'] + @@ -46,6 +47,7 @@ Gem::Specification.new do |spec| spec.add_dependency "addressable", '~> 2.5' spec.add_dependency "aws-sdk-ec2", '~> 1' spec.add_dependency "CFPropertyList", ">= 2.2" + spec.add_dependency "choria-mcorpc-support", "~> 2.26" spec.add_dependency "concurrent-ruby", "~> 1.0" spec.add_dependency "ffi", ">= 1.9.25", "< 2.0.0" spec.add_dependency "hiera-eyaml", ">= 3.0.0", "< 6.0.0" From 3eb3150a54e946f426fde5b9ea174da1e44f4a1a Mon Sep 17 00:00:00 2001 From: nmburgan <13688219+nmburgan@users.noreply.github.com> Date: Fri, 27 Mar 2026 00:53:37 +0000 Subject: [PATCH 2/8] Add tests for Choria transport phases 1 and 2 Attempts to minimize stubbing (although we still need a fair bit) and use the choria-mcorpc-support gem as much as possible. 
--- spec/lib/bolt_spec/choria.rb | 327 ++++++++ spec/unit/config/transport/choria_spec.rb | 109 +++ .../transport/choria/agent_discovery_spec.rb | 242 ++++++ spec/unit/transport/choria/bolt_tasks_spec.rb | 477 +++++++++++ spec/unit/transport/choria/client_spec.rb | 399 +++++++++ .../transport/choria/command_builders_spec.rb | 229 ++++++ spec/unit/transport/choria/helpers_spec.rb | 345 ++++++++ spec/unit/transport/choria/shell_spec.rb | 759 ++++++++++++++++++ spec/unit/transport/choria_spec.rb | 357 ++++++++ 9 files changed, 3244 insertions(+) create mode 100644 spec/lib/bolt_spec/choria.rb create mode 100644 spec/unit/config/transport/choria_spec.rb create mode 100644 spec/unit/transport/choria/agent_discovery_spec.rb create mode 100644 spec/unit/transport/choria/bolt_tasks_spec.rb create mode 100644 spec/unit/transport/choria/client_spec.rb create mode 100644 spec/unit/transport/choria/command_builders_spec.rb create mode 100644 spec/unit/transport/choria/helpers_spec.rb create mode 100644 spec/unit/transport/choria/shell_spec.rb create mode 100644 spec/unit/transport/choria_spec.rb diff --git a/spec/lib/bolt_spec/choria.rb b/spec/lib/bolt_spec/choria.rb new file mode 100644 index 000000000..edc4a96fd --- /dev/null +++ b/spec/lib/bolt_spec/choria.rb @@ -0,0 +1,327 @@ +# frozen_string_literal: true + +require 'bolt/inventory' +require 'bolt/transport/choria' +require 'mcollective' +require 'tempfile' + +module BoltSpec + # Shared helper methods for Choria transport specs. + module Choria + # Write a minimal Choria config file to a Tempfile. Returns the Tempfile + # object (call .path to get the path). + # + # Default config suppresses log output. Pass overrides as key-value pairs + # matching Choria config file syntax. 
# Write a minimal Choria config file to a Tempfile and return the Tempfile
# (call .path on it to get the location). The file is flushed but left
# open; callers are responsible for closing it.
#
# Defaults silence log output (console logger at 'error' level). Overrides
# are written as "key = value" lines and replace defaults with the same key.
#
#   write_choria_config(main_collective: 'production')
#
# @param overrides [Hash] Config keys and values to add or replace
# @return [Tempfile] Open tempfile containing the rendered config
def write_choria_config(**overrides)
  settings = { logger_type: 'console', loglevel: 'error' }.merge(overrides)

  Tempfile.new(['choria-test', '.conf']).tap do |conf|
    settings.each_pair { |name, setting| conf.puts("#{name} = #{setting}") }
    conf.flush
  end
end
agent : [agent, '1.2.0'] + { 'agent' => name, 'name' => name, 'version' => version } + end + + @stub_inventory_results ||= [] + @stub_fact_results ||= [] + + # Replace existing entries for these targets (supports re-stubbing) + new_senders = targets.map { |target| transport.choria_identity(target) } + @stub_inventory_results.reject! { |result| new_senders.include?(result[:sender]) } + @stub_fact_results.reject! { |result| new_senders.include?(result[:sender]) } + + targets.each do |target| + @stub_inventory_results << make_rpc_result(sender: target, data: { agents: agent_data }) + @stub_fact_results << make_rpc_result(sender: target, data: { value: os_family }) + end + + allow(mock_rpc_client).to receive_messages(agent_inventory: @stub_inventory_results, get_fact: @stub_fact_results) + end + + # --- bolt_tasks result builders --- + + def make_download_result(sender, downloads: 1) + make_rpc_result(sender: sender, data: { downloads: downloads }) + end + + def make_task_run_result(sender, task_id: 'test-task-id') + make_rpc_result(sender: sender, data: { task_id: task_id }) + end + + def make_task_status_result(sender, stdout: '{"result":"ok"}', stderr: '', exitcode: 0, completed: true) + make_rpc_result(sender: sender, data: { + completed: completed, exitcode: exitcode, stdout: stdout, stderr: stderr + }) + end + + # --- shell agent result builders --- + + def make_shell_run_result(sender, stdout: '', stderr: '', exitcode: 0) + make_rpc_result(sender: sender, data: { stdout: stdout, stderr: stderr, exitcode: exitcode }) + end + + def make_shell_start_result(sender, handle: 'test-handle-uuid') + make_rpc_result(sender: sender, data: { handle: handle }) + end + + def make_shell_list_result(sender, handle, status: 'stopped') + make_rpc_result(sender: sender, data: { + jobs: { handle => { 'id' => handle, 'status' => status } } + }) + end + + def make_shell_statuses_result(sender, handle, stdout: '', stderr: '', exitcode: 0, status: 'stopped') + make_rpc_result(sender: 
sender, data: { + statuses: { handle => { 'status' => status, 'stdout' => stdout, 'stderr' => stderr, 'exitcode' => exitcode } } + }) + end + + # --- Shell agent stub helpers --- + # Accept a hash of { target => options }. + # + # stub_shell_start(target => { handle: 'h1' }) + # stub_shell_start(target => { handle: 'h1' }, target2 => { handle: 'h2' }) + # + # For single-target convenience with defaults, pass keyword args: + # stub_shell_start(stdout: 'ok') + # stub_shell_start # uses target with all defaults + + def stub_shell_run(targets = nil, **kwargs) + results = normalize_shell_targets(targets, kwargs).map do |sender, opts| + make_shell_run_result(sender, stdout: '', stderr: '', exitcode: 0, **opts) + end + allow(mock_rpc_client).to receive(:run).and_return(results) + end + + def stub_shell_start(targets = nil, **kwargs) + results = normalize_shell_targets(targets, kwargs).map do |sender, opts| + make_shell_start_result(sender, handle: 'test-handle-uuid', **opts) + end + allow(mock_rpc_client).to receive(:start).and_return(results) + end + + def stub_shell_list(targets = nil, **kwargs) + results = normalize_shell_targets(targets, kwargs).map do |sender, opts| + handle = opts.delete(:handle) || 'test-handle-uuid' + make_shell_list_result(sender, handle, status: 'stopped', **opts) + end + allow(mock_rpc_client).to receive(:list).and_return(results) + end + + def stub_shell_status(targets = nil, **kwargs) + results = normalize_shell_targets(targets, kwargs).map do |sender, opts| + handle = opts.delete(:handle) || 'test-handle-uuid' + make_shell_statuses_result(sender, handle, + stdout: '', stderr: '', exitcode: 0, status: 'stopped', **opts) + end + allow(mock_rpc_client).to receive(:statuses).and_return(results) + end + + def stub_shell_kill + allow(mock_rpc_client).to receive(:kill) + end + + private + + # Normalize arguments into a hash of { target => options }. + # If a target-keyed hash is given, use it directly. 
+ # If only keyword args are given, wrap as { target => kwargs }. + def normalize_shell_targets(targets, kwargs) + if targets.is_a?(Hash) + targets + else + { target => kwargs } + end + end + end +end + +# Base setup for any Choria transport spec. Provides transport, inventory, +# targets, and the mock RPC client. Resets MCollective singleton state +# between tests so each test starts with a clean config. +RSpec.shared_context 'choria transport' do + include BoltSpec::Choria + + let(:transport) { Bolt::Transport::Choria.new } + let(:inventory) { Bolt::Inventory.empty } + let(:target) { inventory.get_target('choria://node1.example.com') } + let(:target2) { inventory.get_target('choria://node2.example.com') } + + # Use a plain double rather than instance_double because the real + # MCollective::RPC::Client dispatches agent actions via method_missing, + # so methods like :agent_inventory, :ping, :run, etc. are not actually + # defined on the class and instance_double would reject them. + let(:mock_rpc_client) do + mock_options = { filter: { 'identity' => [] } } + double('MCollective::RPC::Client').tap do |client| + allow(client).to receive(:identity_filter) + allow(client).to receive(:discover) { |**flags| mock_options[:filter]['identity'] = flags[:nodes] || [] } + allow(client).to receive(:progress=) + allow(client).to receive(:options).and_return(mock_options) + end + end + + before(:each) do + # Reset MCollective singleton state so each test starts clean. + @choria_config_file = write_choria_config + mc_config = MCollective::Config.instance + mc_config.set_config_defaults(@choria_config_file.path) + mc_config.instance_variable_set(:@configured, false) + MCollective::PluginManager.clear + + # Point targets at the temp config so configure_client uses it + # instead of auto-detecting from the filesystem. 
+ inventory.set_config(target, 'transport', 'choria') + inventory.set_config(target, %w[choria config-file], @choria_config_file.path) + inventory.set_config(target2, 'transport', 'choria') + inventory.set_config(target2, %w[choria config-file], @choria_config_file.path) + + # Stub the RPC client constructor. This is the only MCollective + # stub we need -- it prevents the real client from connecting to + # NATS via TCP during construction. + allow(MCollective::RPC::Client).to receive(:new).and_return(mock_rpc_client) + + # Stub sleep so polling loops don't actually wait. + allow(transport).to receive(:sleep) + + # Default OS detection stub. Tests that need a different OS family + # (e.g. Windows) can override via stub_agents with os_family: param. + allow(mock_rpc_client).to receive(:get_fact).and_return([ + make_rpc_result(sender: target, data: { value: 'RedHat' }), + make_rpc_result(sender: target2, data: { value: 'RedHat' }) + ]) + end + + after(:each) do + @choria_config_file&.close! + end +end + +# Configures the client for multi-target tests. +RSpec.shared_context 'choria multi-target' do + before(:each) do + transport.configure_client(target) + end +end + +# Task object and metadata for task execution tests. +RSpec.shared_context 'choria task' do + let(:task_name) { 'mymod::mytask' } + let(:task_executable) { '/path/to/mymod/tasks/mytask.sh' } + let(:task_content) { "#!/bin/bash\necho '{\"result\": \"ok\"}'" } + let(:task) do + Bolt::Task.new( + task_name, + { 'input_method' => 'both' }, + [{ 'name' => 'mytask.sh', 'path' => task_executable }] + ) + end + + # Task with only a Linux (shell) implementation. + let(:linux_only_task) do + Bolt::Task.new( + 'mymod::linuxtask', + { + 'input_method' => 'both', + 'implementations' => [ + { 'name' => 'linuxtask.sh', 'requirements' => ['shell'] } + ] + }, + [{ 'name' => 'linuxtask.sh', 'path' => '/path/to/linuxtask.sh' }] + ) + end + + # Task with only a Windows (PowerShell) implementation. 
+ let(:windows_only_task) do + Bolt::Task.new( + 'mymod::wintask', + { + 'input_method' => 'both', + 'implementations' => [ + { 'name' => 'wintask.ps1', 'requirements' => ['powershell'] } + ] + }, + [{ 'name' => 'wintask.ps1', 'path' => '/path/to/wintask.ps1' }] + ) + end + + # Task with implementations for both platforms. + let(:cross_platform_task) do + Bolt::Task.new( + 'mymod::crosstask', + { + 'input_method' => 'both', + 'implementations' => [ + { 'name' => 'crosstask.ps1', 'requirements' => ['powershell'] }, + { 'name' => 'crosstask.sh', 'requirements' => ['shell'] } + ] + }, + [ + { 'name' => 'crosstask.ps1', 'path' => '/path/to/crosstask.ps1' }, + { 'name' => 'crosstask.sh', 'path' => '/path/to/crosstask.sh' } + ] + ) + end +end + +# File system stubs for task executables. Stubs SHA256, File.size, +# File.binread, File.basename, and SecureRandom.uuid so both +# bolt_tasks (download manifest) and shell (file upload) paths work. +RSpec.shared_context 'choria task file stubs' do + before(:each) do + mock_digest = instance_double(Digest::SHA256, hexdigest: Digest::SHA256.hexdigest(task_content)) + allow(Digest::SHA256).to receive(:file).and_call_original + allow(Digest::SHA256).to receive(:file).with(task_executable).and_return(mock_digest) + allow(File).to receive(:size).and_call_original + allow(File).to receive(:size).with(task_executable).and_return(task_content.bytesize) + allow(File).to receive(:binread).and_call_original + allow(File).to receive(:binread).with(task_executable).and_return(task_content) + allow(File).to receive(:basename).and_call_original + allow(SecureRandom).to receive(:uuid).and_return('test-uuid') + end +end + +# File system stubs for script execution tests. Expects script_path and +# script_content to be defined via let in the including context. 
+RSpec.shared_context 'choria script file stubs' do + before(:each) do + allow(File).to receive(:binread).and_call_original + allow(File).to receive(:binread).with(script_path).and_return(script_content) + allow(File).to receive(:basename).and_call_original + allow(File).to receive(:basename).with(script_path).and_return(File.basename(script_path)) + allow(SecureRandom).to receive(:uuid).and_return('test-uuid') + end +end diff --git a/spec/unit/config/transport/choria_spec.rb b/spec/unit/config/transport/choria_spec.rb new file mode 100644 index 000000000..d05c1e539 --- /dev/null +++ b/spec/unit/config/transport/choria_spec.rb @@ -0,0 +1,109 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'bolt/config/transport/choria' +require 'shared_examples/transport_config' + +describe Bolt::Config::Transport::Choria do + let(:transport) { Bolt::Config::Transport::Choria } + let(:data) { { 'host' => 'node1.example.com' } } + let(:merge_data) { { 'tmpdir' => '/var/tmp' } } + + include_examples 'transport config' + include_examples 'filters options' + + context 'using plugins' do + let(:plugin_data) { { 'host' => { '_plugin' => 'foo' } } } + let(:resolved_data) { { 'host' => 'foo' } } + + include_examples 'plugins' + end + + context 'validating' do + include_examples 'interpreters' + + %w[choria-agent config-file collective host puppet-environment ssl-ca ssl-cert ssl-key tmpdir].each do |opt| + it "#{opt} rejects non-string value" do + data[opt] = 100 + expect { transport.new(data) }.to raise_error(Bolt::ValidationError) + end + end + + %w[command-timeout nats-connection-timeout rpc-timeout task-timeout].each do |opt| + it "#{opt} rejects non-integer value" do + data[opt] = 'not_an_integer' + expect { transport.new(data) }.to raise_error(Bolt::ValidationError) + end + end + + it 'nats-servers accepts a string' do + data['nats-servers'] = 'nats://broker:4222' + expect { transport.new(data) }.not_to raise_error + end + + it 'nats-servers accepts an array' do + 
data['nats-servers'] = ['nats://broker1:4222', 'nats://broker2:4222'] + expect { transport.new(data) }.not_to raise_error + end + + it 'nats-servers errors with wrong type' do + data['nats-servers'] = 12345 + expect { transport.new(data) }.to raise_error(Bolt::ValidationError) + end + + it 'cleanup errors with wrong type' do + data['cleanup'] = 'true' + expect { transport.new(data) }.to raise_error(Bolt::ValidationError) + end + + it 'choria-agent rejects invalid values' do + data['choria-agent'] = 'not-an-agent' + expect { transport.new(data) }.to raise_error(Bolt::ValidationError, /choria-agent must be one of/) + end + + %w[bolt_tasks shell].each do |agent| + it "choria-agent accepts '#{agent}'" do + data['choria-agent'] = agent + expect { transport.new(data) }.not_to raise_error + end + end + + it 'rejects partial SSL overrides (only ssl-ca)' do + data['ssl-ca'] = '/path/to/ca.pem' + expect { transport.new(data) }.to raise_error(Bolt::ValidationError, /ssl-cert, ssl-key/) + end + + it 'rejects partial SSL overrides (missing ssl-key)' do + data['ssl-ca'] = '/path/to/ca.pem' + data['ssl-cert'] = '/path/to/cert.pem' + expect { transport.new(data) }.to raise_error(Bolt::ValidationError, /ssl-key/) + end + + it 'accepts complete SSL overrides' do + data['ssl-ca'] = '/path/to/ca.pem' + data['ssl-cert'] = '/path/to/cert.pem' + data['ssl-key'] = '/path/to/key.pem' + expect { transport.new(data) }.not_to raise_error + end + + it 'tmpdir rejects relative paths' do + data['tmpdir'] = 'relative/path' + expect { transport.new(data) }.to raise_error(Bolt::ValidationError, /absolute path/) + end + + it 'tmpdir accepts absolute paths' do + data['tmpdir'] = '/var/tmp/bolt' + expect { transport.new(data) }.not_to raise_error + end + + it 'tmpdir accepts Windows absolute paths with C: drive' do + data['tmpdir'] = 'C:\temp' + expect { transport.new(data) }.not_to raise_error + end + + it 'tmpdir rejects relative backslash paths' do + data['tmpdir'] = 'relative\path' + expect { 
transport.new(data) }.to raise_error(Bolt::ValidationError, /absolute path/) + end + end +end diff --git a/spec/unit/transport/choria/agent_discovery_spec.rb b/spec/unit/transport/choria/agent_discovery_spec.rb new file mode 100644 index 000000000..ddef9bc80 --- /dev/null +++ b/spec/unit/transport/choria/agent_discovery_spec.rb @@ -0,0 +1,242 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'bolt_spec/choria' + +describe Bolt::Transport::Choria do + include_context 'choria transport' + + describe '#discover_agents' do + before(:each) do + transport.configure_client(target) + end + + it 'discovers agents on multiple targets in one RPC call' do + r1 = make_rpc_result(sender: target, data: { + agents: [{ 'agent' => 'rpcutil' }, { 'agent' => 'bolt_tasks' }] + }) + r2 = make_rpc_result(sender: target2, data: { + agents: [{ 'agent' => 'rpcutil' }, { 'agent' => 'shell', 'version' => '1.2.0' }] + }) + + f1 = make_rpc_result(sender: target, data: { value: 'RedHat' }) + f2 = make_rpc_result(sender: target2, data: { value: 'RedHat' }) + allow(mock_rpc_client).to receive_messages(agent_inventory: [r1, r2], get_fact: [f1, f2]) + + transport.discover_agents([target, target2]) + expect(transport.has_agent?(target, 'bolt_tasks')).to be true + expect(transport.has_agent?(target2, 'shell')).to be true + end + + it 'excludes agents below the required minimum version' do + result = make_rpc_result(sender: target, data: { + agents: [{ 'agent' => 'rpcutil' }, + { 'agent' => 'shell', 'version' => '1.1.0' }] + }) + + fact_result = make_rpc_result(sender: target, data: { value: 'RedHat' }) + allow(mock_rpc_client).to receive_messages(agent_inventory: [result], get_fact: [fact_result]) + + transport.discover_agents([target]) + expect(transport.has_agent?(target, 'rpcutil')).to be true + expect(transport.has_agent?(target, 'shell')).to be false + end + + it 'treats agents with unparseable version strings as unavailable' do + result = make_rpc_result(sender: target, data: { 
+ agents: [{ 'agent' => 'rpcutil' }, + { 'agent' => 'shell', 'version' => 'not-a-version' }] + }) + + fact_result = make_rpc_result(sender: target, data: { value: 'RedHat' }) + allow(mock_rpc_client).to receive_messages(agent_inventory: [result], get_fact: [fact_result]) + + transport.discover_agents([target]) + expect(transport.has_agent?(target, 'shell')).to be false + end + + it 'does not cache non-responding targets' do + r1 = make_rpc_result(sender: target, data: { + agents: [{ 'agent' => 'rpcutil' }] + }) + + fact_result = make_rpc_result(sender: target, data: { value: 'RedHat' }) + allow(mock_rpc_client).to receive_messages(agent_inventory: [r1], get_fact: [fact_result]) + + transport.discover_agents([target, target2]) + expect(transport.has_agent?(target, 'rpcutil')).to be true + expect(transport.has_agent?(target2, 'rpcutil')).to be false + end + + it 'uses cache for already-discovered targets' do + stub_agents([target, target2], %w[rpcutil]) + + transport.discover_agents([target, target2]) + + expect(mock_rpc_client).not_to receive(:agent_inventory) + transport.discover_agents([target, target2]) + end + + it 'discards responses from unexpected senders' do + legit = make_rpc_result(sender: target, data: { + agents: [{ 'agent' => 'rpcutil', 'version' => '1.0.0' }, + { 'agent' => 'shell', 'version' => '1.2.0' }] + }) + rogue = make_rpc_result(sender: 'evil.example.com', data: { + agents: [{ 'agent' => 'rpcutil' }, { 'agent' => 'bolt_tasks' }] + }) + + fact_result = make_rpc_result(sender: target, data: { value: 'RedHat' }) + allow(mock_rpc_client).to receive_messages(agent_inventory: [legit, rogue], get_fact: [fact_result]) + + transport.discover_agents([target]) + expect(transport.has_agent?(target, 'shell')).to be true + end + + it 'treats target as unreachable when agent_inventory returns non-Array agents' do + result = make_rpc_result(sender: target, data: { agents: nil }) + allow(mock_rpc_client).to receive(:agent_inventory).and_return([result]) + + 
transport.discover_agents([target]) + expect(transport.has_agent?(target, 'rpcutil')).to be false + expect(transport.has_agent?(target, 'shell')).to be false + end + + it 'treats target as unreachable when agents is a string instead of Array' do + result = make_rpc_result(sender: target, data: { agents: 'corrupted' }) + allow(mock_rpc_client).to receive(:agent_inventory).and_return([result]) + + transport.discover_agents([target]) + expect(transport.has_agent?(target, 'rpcutil')).to be false + end + + describe 'error handling' do + it 'returns nil for all targets when agent_inventory raises' do + allow(mock_rpc_client).to receive(:agent_inventory).and_raise(StandardError, 'NATS timeout') + + transport.discover_agents([target, target2]) + expect(transport.has_agent?(target, 'rpcutil')).to be false + expect(transport.has_agent?(target2, 'rpcutil')).to be false + end + + it 're-raises Bolt::Error instead of swallowing it' do + allow(mock_rpc_client).to receive(:agent_inventory).and_raise( + Bolt::Error.new('Config problem', 'bolt/choria-config-failed') + ) + expect { transport.discover_agents([target, target2]) }.to raise_error( + Bolt::Error, /Config problem/ + ) + end + end + end + + describe '#has_agent?' do + before(:each) do + transport.configure_client(target) + end + + it 'returns true when the agent is in the cache' do + stub_agents(target, ['shell']) + transport.discover_agents([target]) + expect(transport.has_agent?(target, 'shell')).to be true + end + + it 'returns false when the agent is not in the cache' do + stub_agents(target, ['rpcutil']) + transport.discover_agents([target]) + expect(transport.has_agent?(target, 'shell')).to be false + end + + it 'returns false when the target was not discovered' do + expect(transport.has_agent?(target, 'shell')).to be false + end + end + + describe '#windows_target?' 
do + before(:each) do + transport.configure_client(target) + end + + it 'returns true when os.family is windows' do + stub_agents(target, %w[rpcutil shell], os_family: 'windows') + transport.discover_agents([target]) + expect(transport.windows_target?(target)).to be true + end + + it 'returns false when os.family is RedHat' do + stub_agents(target, %w[rpcutil shell], os_family: 'RedHat') + transport.discover_agents([target]) + expect(transport.windows_target?(target)).to be false + end + + it 'returns false when os.family is nil' do + stub_agents(target, %w[rpcutil shell], os_family: nil) + transport.discover_agents([target]) + expect(transport.windows_target?(target)).to be false + end + end + + describe '#discover_os_family' do + before(:each) do + transport.configure_client(target) + end + + it 'detects non-Windows OS family' do + stub_agents(target, %w[rpcutil shell], os_family: 'RedHat') + transport.discover_agents([target]) + + expect(transport.windows_target?(target)).to be false + end + + it 'detects Windows OS family' do + stub_agents(target, %w[rpcutil shell], os_family: 'windows') + transport.discover_agents([target]) + + expect(transport.windows_target?(target)).to be true + end + + it 'defaults to POSIX when OS detection fails' do + result = make_rpc_result(sender: target, data: { + agents: [{ 'agent' => 'rpcutil', 'version' => '1.0.0' }, + { 'agent' => 'shell', 'version' => '1.2.0' }] + }) + allow(mock_rpc_client).to receive(:agent_inventory).and_return([result]) + + allow(mock_rpc_client).to receive(:get_fact).and_raise(StandardError, 'NATS timeout') + + transport.discover_agents([target]) + + expect(transport.windows_target?(target)).to be false + end + + it 'defaults to POSIX when os.family fact is an empty string' do + result = make_rpc_result(sender: target, data: { + agents: [{ 'agent' => 'rpcutil', 'version' => '1.0.0' }, + { 'agent' => 'shell', 'version' => '1.2.0' }] + }) + + fact_result = make_rpc_result(sender: target, data: { value: '' }) 
+ allow(mock_rpc_client).to receive_messages(agent_inventory: [result], get_fact: [fact_result]) + + transport.discover_agents([target]) + + expect(transport.windows_target?(target)).to be false + end + + it 're-raises Bolt::Error from OS detection instead of swallowing it' do + result = make_rpc_result(sender: target, data: { + agents: [{ 'agent' => 'rpcutil', 'version' => '1.0.0' }, + { 'agent' => 'shell', 'version' => '1.2.0' }] + }) + allow(mock_rpc_client).to receive(:agent_inventory).and_return([result]) + + allow(mock_rpc_client).to receive(:get_fact).and_raise( + Bolt::Error.new('Config problem', 'bolt/choria-config-failed') + ) + + expect { transport.discover_agents([target]) }.to raise_error( + Bolt::Error, /Config problem/ + ) + end + end +end diff --git a/spec/unit/transport/choria/bolt_tasks_spec.rb b/spec/unit/transport/choria/bolt_tasks_spec.rb new file mode 100644 index 000000000..4bf8c5b20 --- /dev/null +++ b/spec/unit/transport/choria/bolt_tasks_spec.rb @@ -0,0 +1,477 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'bolt_spec/choria' +require 'bolt_spec/sensitive' + +describe 'Bolt::Transport::Choria bolt_tasks' do + include_context 'choria transport' + include_context 'choria task' + include BoltSpec::Sensitive + + describe '#unwrap_bolt_tasks_stdout' do + before(:each) do + transport.configure_client(target) + end + + it 'passes through JSON hash stdout unchanged' do + raw = '{"_output":"hello world"}' + expect(transport.unwrap_bolt_tasks_stdout(raw)).to eq(raw) + end + + it 'passes through non-JSON content unchanged' do + expect(transport.unwrap_bolt_tasks_stdout('plain text')).to eq('plain text') + end + + it 'unwraps double-encoded wrapper error' do + inner = '{"_error":{"kind":"choria.tasks/wrapper-error","msg":"wrapper failed"}}' + double_encoded = inner.to_json + expect(transport.unwrap_bolt_tasks_stdout(double_encoded)).to eq(inner) + end + + it 'returns nil unchanged' do + 
expect(transport.unwrap_bolt_tasks_stdout(nil)).to be_nil + end + + it 'returns empty string unchanged' do + expect(transport.unwrap_bolt_tasks_stdout('')).to eq('') + end + + it 'returns integer unchanged' do + expect(transport.unwrap_bolt_tasks_stdout(42)).to eq(42) + end + end + + describe '#task_file_spec' do + let(:file_content) { '#!/bin/bash' } + let(:file_path) { '/path/to/file' } + let(:expected_sha256) { Digest::SHA256.hexdigest(file_content) } + let(:expected_size) { file_content.bytesize } + + before(:each) do + transport.configure_client(target) + mock_digest = instance_double(Digest::SHA256, hexdigest: expected_sha256) + allow(Digest::SHA256).to receive(:file).and_call_original + allow(Digest::SHA256).to receive(:file).with(file_path).and_return(mock_digest) + allow(File).to receive(:size).and_call_original + allow(File).to receive(:size).with(file_path).and_return(expected_size) + end + + def expect_file_spec(spec, filename:, uri_path:, environment: 'production') + expect(spec['filename']).to eq(filename) + expect(spec['uri']['path']).to eq(uri_path) + expect(spec['uri']['params']).to eq({ 'environment' => environment }) + expect(spec['sha256']).to eq(expected_sha256) + expect(spec['size_bytes']).to eq(expected_size) + end + + it 'builds a file spec for a simple task file' do + spec = transport.task_file_spec( + { 'name' => 'mytask.sh', 'path' => file_path }, + 'mymod', 'production' + ) + + expect_file_spec(spec, + filename: 'mytask.sh', + uri_path: '/puppet/v3/file_content/tasks/mymod/mytask.sh') + end + + it 'uses the modules mount for files/ directory dependencies' do + spec = transport.task_file_spec( + { 'name' => 'ruby_task_support/files/task_support.rb', 'path' => file_path }, + 'mymod', 'production' + ) + + expect_file_spec(spec, + filename: 'ruby_task_support/files/task_support.rb', + uri_path: '/puppet/v3/file_content/modules/ruby_task_support/task_support.rb') + end + + it 'uses the plugins mount for lib/ directory dependencies' do + spec 
= transport.task_file_spec( + { 'name' => 'mymod/lib/puppet/util/support.rb', 'path' => file_path }, + 'mymod', 'production' + ) + + expect_file_spec(spec, + filename: 'mymod/lib/puppet/util/support.rb', + uri_path: '/puppet/v3/file_content/plugins/mymod/puppet/util/support.rb') + end + + it 'uses the tasks mount for other subdirectories' do + spec = transport.task_file_spec( + { 'name' => 'mymod/tasks/thing.sh', 'path' => file_path }, + 'mymod', 'production' + ) + + expect_file_spec(spec, + filename: 'mymod/tasks/thing.sh', + uri_path: '/puppet/v3/file_content/tasks/mymod/thing.sh') + end + + it 'uses a custom environment in the URI params' do + spec = transport.task_file_spec( + { 'name' => 'mytask.sh', 'path' => file_path }, + 'mymod', 'staging' + ) + + expect_file_spec(spec, + filename: 'mytask.sh', + uri_path: '/puppet/v3/file_content/tasks/mymod/mytask.sh', + environment: 'staging') + end + end + + describe '#download_and_start_task' do + include_context 'choria task file stubs' + + before(:each) do + stub_agents([target, target2], %w[rpcutil bolt_tasks]) + allow(mock_rpc_client).to receive_messages( + download: [make_download_result(target), make_download_result(target2)], + run_no_wait: [make_task_run_result(target), make_task_run_result(target2)], + task_status: [make_task_status_result(target, stdout: '{"result":"success"}'), + make_task_status_result(target2, stdout: '{"result":"success"}')] + ) + end + + it 'sends the correct download arguments' do + expect(mock_rpc_client).to receive(:download).with(hash_including( + task: task_name, + environment: 'production' + )) + + transport.batch_task([target], task, { 'param1' => 'value1' }) + end + + it 'sends the correct run_no_wait arguments' do + expect(mock_rpc_client).to receive(:run_no_wait).with(hash_including( + task: task_name, + input_method: 'both' + )) + + transport.batch_task([target], task, { 'param1' => 'value1' }) + end + + it 'unwraps Sensitive values in task arguments' do + 
expect(mock_rpc_client).to receive(:run_no_wait).with( + hash_including(input: include('"s3cret"')) + ).and_return([make_task_run_result(target)]) + + transport.batch_task([target], task, { 'password' => make_sensitive('s3cret') }) + end + + it 'uses configured puppet-environment for file URIs' do + inventory.set_config(target, %w[choria puppet-environment], 'staging') + + expect(mock_rpc_client).to receive(:download).with(hash_including( + environment: 'staging' + )).and_return([make_download_result(target)]) + + transport.batch_task([target], task, {}) + end + + it 'builds correct file spec URIs in the download request' do + expect(mock_rpc_client).to receive(:download) do |args| + files = JSON.parse(args[:files]) + expect(files.length).to eq(1) + expect(files.first['filename']).to eq('mytask.sh') + expect(files.first['uri']['path']).to eq('/puppet/v3/file_content/tasks/mymod/mytask.sh') + expect(files.first).to have_key('sha256') + expect(files.first).to have_key('size_bytes') + [make_download_result(target)] + end + + transport.batch_task([target], task, {}) + end + + it 'uses bare filename for primary executable even when name has slashes' do + slashed_task = Bolt::Task.new( + task_name, + { 'input_method' => 'both' }, + [{ 'name' => 'mymod/tasks/mytask.sh', 'path' => task_executable }] + ) + + expect(mock_rpc_client).to receive(:download) do |args| + files = JSON.parse(args[:files]) + expect(files.first['filename']).to eq('mytask.sh') + expect(files.first['uri']['path']).to eq('/puppet/v3/file_content/tasks/mymod/mytask.sh') + [make_download_result(target)] + end + + transport.batch_task([target], slashed_task, {}) + end + + it 'includes dependency files with correct mounts in the download request' do + dep_path1 = '/path/to/ruby_task_support/files/task_support.rb' + dep_path2 = '/path/to/mymod/lib/puppet/util/support.rb' + multi_file_task = Bolt::Task.new( + task_name, + { 'input_method' => 'both', + 'files' => ['ruby_task_support/files/task_support.rb', 
'mymod/lib/puppet/util/support.rb'] }, + [{ 'name' => 'mytask.sh', 'path' => task_executable }, + { 'name' => 'ruby_task_support/files/task_support.rb', 'path' => dep_path1 }, + { 'name' => 'mymod/lib/puppet/util/support.rb', 'path' => dep_path2 }] + ) + + mock_digest = instance_double(Digest::SHA256, hexdigest: 'abc123') + allow(Digest::SHA256).to receive(:file).with(dep_path1).and_return(mock_digest) + allow(Digest::SHA256).to receive(:file).with(dep_path2).and_return(mock_digest) + allow(File).to receive(:size).with(dep_path1).and_return(8) + allow(File).to receive(:size).with(dep_path2).and_return(8) + + expect(mock_rpc_client).to receive(:download) do |args| + files = JSON.parse(args[:files]) + expect(files.length).to eq(3) + expect(files[0]['uri']['path']).to eq('/puppet/v3/file_content/tasks/mymod/mytask.sh') + expect(files[1]['uri']['path']).to eq('/puppet/v3/file_content/modules/ruby_task_support/task_support.rb') + expect(files[2]['uri']['path']).to eq('/puppet/v3/file_content/plugins/mymod/puppet/util/support.rb') + [make_download_result(target)] + end + + transport.batch_task([target], multi_file_task, {}) + end + + describe 'error handling' do + it 'returns error when download fails with non-zero statuscode' do + failed_dl = make_rpc_result(sender: target, statuscode: 5, statusmsg: 'Download error') + allow(mock_rpc_client).to receive(:download).and_return([failed_dl]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/bolt_tasks\.download on .+ returned RPC error: Download error/) + end + + it 'catches download failure reported via statuscode 1 (reply.fail!)' do + # The bolt_tasks agent uses reply.fail! for download failures, which + # sets statuscode 1. rpc_request routes statuscode 1 to :responded, + # so download_and_start_task has special logic to check rpc_statuscodes + # and move statuscode-1 responses to the error bucket. 
+ dl_result = make_rpc_result( + sender: target, statuscode: 1, + statusmsg: 'Could not download task files from puppet server', + data: { downloads: 0 } + ) + allow(mock_rpc_client).to receive(:download).and_return([dl_result]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/failed to download task files/) + end + + it 'returns error when download returns no response' do + allow(mock_rpc_client).to receive(:download).and_return([]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/No response from .+ for bolt_tasks\.download/) + end + + it 'returns error when run_no_wait returns no response' do + allow(mock_rpc_client).to receive(:run_no_wait).and_return([]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/No response from .+ for bolt_tasks\.run_no_wait/) + end + + it 'returns error when run_no_wait returns non-zero statuscode' do + failed_run = make_rpc_result(sender: target, statuscode: 5, statusmsg: 'Agent rejected task') + allow(mock_rpc_client).to receive(:run_no_wait).and_return([failed_run]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/bolt_tasks\.run_no_wait on .+ returned RPC error: Agent rejected task/) + end + + it 'returns error when run_no_wait returns success but no task_id' do + nil_id_result = make_rpc_result(sender: target, data: { task_id: nil }) + allow(mock_rpc_client).to receive(:run_no_wait).and_return([nil_id_result]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/succeeded but returned no task_id/) + end + + it 'returns error results for all targets when download raises' do + stub_agents([target, 
target2], %w[rpcutil bolt_tasks]) + allow(mock_rpc_client).to receive(:download).and_raise(StandardError, 'connection reset') + + results = transport.batch_task([target, target2], task, {}) + expect(results.length).to eq(2) + results.each do |result| + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/bolt_tasks\.download failed on .+: connection reset/) + end + end + + it 'returns error results for all targets when run_no_wait raises' do + stub_agents([target, target2], %w[rpcutil bolt_tasks]) + allow(mock_rpc_client).to receive(:download).and_return([ + make_download_result(target), make_download_result(target2) + ]) + allow(mock_rpc_client).to receive(:run_no_wait).and_raise(StandardError, 'broker disconnected') + + results = transport.batch_task([target, target2], task, {}) + expect(results.length).to eq(2) + results.each do |result| + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/bolt_tasks\.run_no_wait failed on .+: broker disconnected/) + end + end + + it 'handles partial download failure across targets' do + stub_agents([target, target2], %w[rpcutil bolt_tasks]) + dl2 = make_rpc_result(sender: target2, statuscode: 5, statusmsg: 'Puppet server unreachable') + allow(mock_rpc_client).to receive_messages( + download: [make_download_result(target), dl2], + run_no_wait: [make_task_run_result(target)], + task_status: [make_task_status_result(target)] + ) + + results = transport.batch_task([target, target2], task, {}) + expect(results.length).to eq(2) + + ok_results = results.select(&:ok?) + error_results = results.reject(&:ok?) 
+ expect(ok_results.length).to eq(1) + expect(ok_results.first.target).to eq(target) + expect(error_results.length).to eq(1) + expect(error_results.first.target).to eq(target2) + expect(error_results.first.error_hash['msg']).to match(/bolt_tasks\.download on .+ returned RPC error: Puppet server unreachable/) + end + end + end + + describe '#poll_task_status' do + include_context 'choria task file stubs' + + before(:each) do + stub_agents(target, %w[rpcutil bolt_tasks]) + allow(mock_rpc_client).to receive_messages( + download: [make_download_result(target)], + run_no_wait: [make_task_run_result(target)] + ) + end + + it 'returns task output on successful completion' do + allow(mock_rpc_client).to receive(:task_status).and_return([ + make_task_status_result(target, stdout: '{"result":"success"}') + ]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be true + expect(result.value).to eq('result' => 'success') + end + + it 'handles JSON hash stdout' do + allow(mock_rpc_client).to receive(:task_status).and_return([ + make_task_status_result(target, stdout: '{"msg":"hello"}') + ]) + + result = transport.batch_task([target], task, {}).first + expect(result.value).to eq('msg' => 'hello') + end + + it 'handles double-encoded string stdout from wrapper errors' do + inner_json = '{"_error":{"msg":"wrapper failed","kind":"choria/wrapper_failed","details":{}}}' + allow(mock_rpc_client).to receive(:task_status).and_return([ + make_task_status_result(target, stdout: inner_json.to_json) + ]) + + result = transport.batch_task([target], task, {}).first + expect(result.value).to include('_error' => a_hash_including('msg' => 'wrapper failed', + 'kind' => 'choria/wrapper_failed')) + end + + it 'handles plain text stdout wrapped in _output' do + allow(mock_rpc_client).to receive(:task_status).and_return([ + make_task_status_result(target, stdout: '{"_output":"hello world"}') + ]) + + result = transport.batch_task([target], task, {}).first + 
expect(result.value).to eq('_output' => 'hello world') + end + + it 'handles empty stdout' do + allow(mock_rpc_client).to receive(:task_status).and_return([ + make_task_status_result(target, stdout: '') + ]) + + result = transport.batch_task([target], task, {}).first + expect(result.value).to eq('_output' => '') + end + + it 'preserves task output when agent returns statuscode 1 for failed tasks' do + failed_status = make_rpc_result( + sender: target, + statuscode: 1, + statusmsg: 'choria.tasks/task-error: The task errored with a code 1', + data: { + completed: true, exitcode: 1, + stdout: '{"_output":"task failed","_error":{"kind":"choria.tasks/task-error",' \ + '"msg":"The task errored with a code 1","details":{"exitcode":1}}}', + stderr: 'something went wrong' + } + ) + allow(mock_rpc_client).to receive(:task_status).and_return([failed_status]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.value).to include('_error' => a_hash_including('kind' => 'choria.tasks/task-error')) + end + + it 'defaults nil exitcode to 1' do + allow(mock_rpc_client).to receive(:task_status).and_return([ + make_task_status_result(target, exitcode: nil, stdout: '{"_output":"task ran"}') + ]) + + result = transport.batch_task([target], task, {}).first + expect(result.value['_error']['details']['exit_code']).to eq(1) + end + + it 'returns error on task timeout' do + never_done = make_task_status_result(target, completed: false, exitcode: nil, stdout: '', stderr: '') + allow(mock_rpc_client).to receive(:task_status).and_return([never_done]) + inventory.set_config(target, %w[choria task-timeout], 1) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/timed out/) + end + + it 'returns error when task_status returns non-zero statuscode' do + status_result = make_rpc_result(sender: target, statuscode: 4, statusmsg: 'Authorization denied for 
bolt_tasks') + allow(mock_rpc_client).to receive(:task_status).and_return([status_result]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['kind']).to eq('bolt/choria-rpc-error') + expect(result.error_hash['msg']).to match(/Authorization denied/) + end + + it 'fails all targets after 3 consecutive poll RPC failures' do + allow(mock_rpc_client).to receive(:task_status) + .and_raise(StandardError, 'NATS connection lost') + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['kind']).to eq('bolt/choria-poll-failed') + expect(result.error_hash['msg']).to match(/failed persistently/) + end + + it 'recovers after transient poll failures and completes successfully' do + call_count = 0 + allow(mock_rpc_client).to receive(:task_status) do + call_count += 1 + raise StandardError, 'transient NATS error' if call_count <= 2 + + [make_task_status_result(target)] + end + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be true + expect(result.value).to eq('result' => 'ok') + end + end +end diff --git a/spec/unit/transport/choria/client_spec.rb b/spec/unit/transport/choria/client_spec.rb new file mode 100644 index 000000000..83f4f2d69 --- /dev/null +++ b/spec/unit/transport/choria/client_spec.rb @@ -0,0 +1,399 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'bolt_spec/choria' +require 'tempfile' + +describe Bolt::Transport::Choria do + include_context 'choria transport' + + describe '#configure_client' do + it 'loads config on first call' do + mc_config = MCollective::Config.instance + expect(mc_config).to receive(:loadconfig).with(@choria_config_file.path).and_call_original + transport.configure_client(target) + end + + it 'only loads config once across multiple calls' do + mc_config = MCollective::Config.instance + expect(mc_config).to receive(:loadconfig).once.and_call_original + + 
transport.configure_client(target) + transport.configure_client(target) + end + + it 'uses an explicit config-file when provided and readable' do + custom_config = write_choria_config(main_collective: 'custom') + inventory.set_config(target, %w[choria config-file], custom_config.path) + + mc_config = MCollective::Config.instance + expect(mc_config).to receive(:loadconfig).with(custom_config.path).and_call_original + transport.configure_client(target) + end + + it 'raises when an explicit config file is not readable' do + inventory.set_config(target, %w[choria config-file], '/nonexistent/client.conf') + allow(File).to receive(:readable?).with('/nonexistent/client.conf').and_return(false) + + expect { transport.configure_client(target) }.to raise_error( + Bolt::Error, /Choria config file not found or not readable/ + ) + end + + it 'falls back to the next auto-detected config path when the first is not readable' do + inventory.set_config(target, %w[choria config-file], nil) + # Stub File.readable? at the I/O boundary to control which + # auto-detected paths appear readable. + auto_paths = MCollective::Util.config_paths_for_user + allow(File).to receive(:readable?).and_call_original + auto_paths.each { |path| allow(File).to receive(:readable?).with(path).and_return(false) } + # Make the second path "readable" and point loadconfig at our temp file. 
+ second_path = auto_paths[1] + allow(File).to receive(:readable?).with(second_path).and_return(true) + allow(File).to receive(:exist?).and_call_original + allow(File).to receive(:exist?).with(second_path).and_return(true) + allow(File).to receive(:readlines).and_call_original + allow(File).to receive(:readlines).with(second_path).and_return( + File.readlines(@choria_config_file.path) + ) + + mc_config = MCollective::Config.instance + expect(mc_config).to receive(:loadconfig).with(second_path).and_call_original + transport.configure_client(target) + end + + it 'raises when no auto-detected config file is readable' do + inventory.set_config(target, %w[choria config-file], nil) + allow(File).to receive(:readable?).and_call_original + MCollective::Util.config_paths_for_user.each do |path| + allow(File).to receive(:readable?).with(path).and_return(false) + end + + expect { transport.configure_client(target) }.to raise_error( + Bolt::Error, /Could not find a readable Choria client config file/ + ) + end + + it 'applies NATS server overrides to pluginconf' do + inventory.set_config(target, %w[choria nats-servers], %w[nats://broker1:4222 nats://broker2:4222]) + + transport.configure_client(target) + + mc_config = MCollective::Config.instance + expect(mc_config.pluginconf['choria.middleware_hosts']).to eq('nats://broker1:4222,nats://broker2:4222') + end + + it 'applies TLS overrides to pluginconf' do + ca = Tempfile.new('ca.pem') + cert = Tempfile.new('cert.pem') + key = Tempfile.new('key.pem') + begin + inventory.set_config(target, %w[choria ssl-ca], ca.path) + inventory.set_config(target, %w[choria ssl-cert], cert.path) + inventory.set_config(target, %w[choria ssl-key], key.path) + + transport.configure_client(target) + + mc_config = MCollective::Config.instance + expect(mc_config.pluginconf['security.provider']).to eq('file') + expect(mc_config.pluginconf['security.file.ca']).to eq(ca.path) + expect(mc_config.pluginconf['security.file.certificate']).to eq(cert.path) + 
expect(mc_config.pluginconf['security.file.key']).to eq(key.path) + ensure + [ca, cert, key].each(&:close!) + end + end + + it 'raises when SSL file is not readable' do + inventory.set_config(target, %w[choria ssl-ca], '/nonexistent/ca.pem') + inventory.set_config(target, %w[choria ssl-cert], '/nonexistent/cert.pem') + inventory.set_config(target, %w[choria ssl-key], '/nonexistent/key.pem') + expect { transport.configure_client(target) }.to raise_error( + Bolt::Error, /ssl-ca.*not readable/ + ) + end + + it 'remembers loadconfig failure and re-raises on subsequent calls' do + mc_config = MCollective::Config.instance + allow(mc_config).to receive(:loadconfig).and_raise(RuntimeError, 'NATS connection refused') + + expect { transport.configure_client(target) }.to raise_error( + Bolt::Error, /Choria client configuration failed.*NATS connection refused/ + ) + + # Second call should re-raise the same error without calling loadconfig again + expect(mc_config).not_to receive(:loadconfig) + expect { transport.configure_client(target) }.to raise_error( + Bolt::Error, /Choria client configuration failed.*NATS connection refused/ + ) + end + end + + describe '#create_rpc_client' do + it 'discovers with all target identities' do + transport.configure_client(target) + expect(mock_rpc_client).to receive(:discover).with(nodes: %w[node1.example.com node2.example.com]) + transport.create_rpc_client('rpcutil', [target, target2], 10) + end + + it 'uses choria host config as identity when set' do + inventory.set_config(target, %w[choria host], 'node1.fqdn.example.com') + expect(mock_rpc_client).to receive(:discover).with(nodes: ['node1.fqdn.example.com']) + transport.create_rpc_client('shell', [target], 60) + end + + it 'disables progress output' do + expect(mock_rpc_client).to receive(:progress=).with(false) + transport.create_rpc_client('shell', [target], 60) + end + + it 'sets collective from the first target' do + transport.configure_client(target) + inventory.set_config(target, 
%w[choria collective], 'production') + expect(MCollective::RPC::Client).to receive(:new) do |_agent, opts| + expect(opts[:options][:collective]).to eq('production') + mock_rpc_client + end + transport.create_rpc_client('rpcutil', [target, target2], 10) + end + + it 'leaves collective as nil when not configured (falls back to main_collective)' do + expect(MCollective::RPC::Client).to receive(:new) do |_agent, opts| + expect(opts[:options][:collective]).to be_nil + mock_rpc_client + end + transport.create_rpc_client('shell', [target], 60) + end + + it 'passes nats-connection-timeout to RPC client options' do + transport.configure_client(target) + inventory.set_config(target, %w[choria nats-connection-timeout], 45) + + expect(MCollective::RPC::Client).to receive(:new) do |_agent, opts| + expect(opts[:options][:connection_timeout]).to eq(45) + mock_rpc_client + end + + transport.create_rpc_client('shell', [target], 10) + end + + it 'passes rpc-timeout as the RPC call timeout' do + transport.configure_client(target) + inventory.set_config(target, %w[choria rpc-timeout], 120) + + allow(mock_rpc_client).to receive(:ping).and_return([make_rpc_result(sender: target)]) + + expect(MCollective::RPC::Client).to receive(:new) do |_agent, opts| + expect(opts[:options][:timeout]).to eq(120) + mock_rpc_client + end + + transport.rpc_request('rpcutil', [target], 'test') { |client| client.ping } + end + + describe 'default_collective' do + it 'uses default_collective when target has no explicit collective' do + production_config = write_choria_config(main_collective: 'production') + inventory.set_config(target, %w[choria config-file], production_config.path) + + transport.configure_client(target) + + expect(MCollective::RPC::Client).to receive(:new) do |_agent, opts| + expect(opts[:options][:collective]).to eq('production') + mock_rpc_client + end + transport.create_rpc_client('rpcutil', [target], 10) + end + end + end + + describe '#rpc_request' do + before(:each) do + 
transport.configure_client(target) + end + + it 'routes statuscode 0 to :responded' do + result = make_rpc_result(sender: target, statuscode: 0, data: { value: 'ok' }) + allow(mock_rpc_client).to receive(:ping).and_return([result]) + + response = transport.rpc_request('rpcutil', [target], 'test') { |client| client.ping } + expect(response[:responded]).to have_key(target) + expect(response[:responded][target]).to eq(value: 'ok') + expect(response[:errors]).to be_empty + expect(response[:rpc_failed]).to be false + expect(response[:rpc_statuscodes][target]).to eq(0) + end + + it 'routes statuscode 1 to :responded and preserves data' do + result = make_rpc_result( + sender: target, statuscode: 1, + statusmsg: 'Task failed with exit code 1', + data: { exitcode: 1, stdout: '{"_error":{"msg":"failed"}}' } + ) + allow(mock_rpc_client).to receive(:ping).and_return([result]) + + response = transport.rpc_request('rpcutil', [target], 'test') { |client| client.ping } + expect(response[:responded]).to have_key(target) + expect(response[:responded][target][:exitcode]).to eq(1) + expect(response[:errors]).to be_empty + expect(response[:rpc_statuscodes][target]).to eq(1) + end + + it 'routes statuscode 2+ to :errors' do + result = make_rpc_result(sender: target, statuscode: 3, statusmsg: 'Missing data') + allow(mock_rpc_client).to receive(:ping).and_return([result]) + + response = transport.rpc_request('rpcutil', [target], 'test') { |client| client.ping } + expect(response[:responded]).to be_empty + expect(response[:errors]).to have_key(target) + expect(response[:errors][target][:error]).to match(/Missing data.*code 3/) + expect(response[:rpc_statuscodes][target]).to eq(3) + end + + it 'reports no-response targets as errors' do + allow(mock_rpc_client).to receive(:ping).and_return([]) + + response = transport.rpc_request('rpcutil', [target], 'test') { |client| client.ping } + expect(response[:responded]).to be_empty + expect(response[:errors]).to have_key(target) + 
expect(response[:errors][target][:error]).to match(/No response/) + end + + it 'returns rpc_failed: true when the RPC call raises a StandardError' do + allow(mock_rpc_client).to receive(:ping).and_raise(StandardError, 'NATS timeout') + + response = transport.rpc_request('rpcutil', [target], 'test') { |client| client.ping } + expect(response[:rpc_failed]).to be true + expect(response[:responded]).to be_empty + expect(response[:errors]).to have_key(target) + expect(response[:errors][target][:error]).to match(/NATS timeout/) + end + + it 're-raises Bolt::Error instead of returning rpc_failed' do + allow(mock_rpc_client).to receive(:ping).and_raise( + Bolt::Error.new('Config problem', 'bolt/choria-config-failed') + ) + + expect { + transport.rpc_request('rpcutil', [target], 'test') { |client| client.ping } + }.to raise_error(Bolt::Error, 'Config problem') + end + + it 'handles mixed statuscodes across targets' do + ok_result = make_rpc_result(sender: target, statuscode: 0, data: { value: 'ok' }) + err_result = make_rpc_result(sender: target2, statuscode: 4, statusmsg: 'Authorization denied') + allow(mock_rpc_client).to receive(:ping).and_return([ok_result, err_result]) + + response = transport.rpc_request('rpcutil', [target, target2], 'test') { |client| client.ping } + expect(response[:responded]).to have_key(target) + expect(response[:errors]).to have_key(target2) + expect(response[:rpc_statuscodes][target]).to eq(0) + expect(response[:rpc_statuscodes][target2]).to eq(4) + end + end + + describe '#index_results_by_sender' do + it 'indexes results by sender for expected targets' do + results = [ + { sender: 'node1.example.com', data: { exitcode: 0, stdout: 'ok1' } }, + { sender: 'node2.example.com', data: { exitcode: 0, stdout: 'ok2' } } + ] + + indexed = transport.index_results_by_sender(results, [target, target2], 'test') + + expect(indexed.keys).to contain_exactly('node1.example.com', 'node2.example.com') + expect(indexed['node1.example.com'][:data][:stdout]).to 
eq('ok1') + expect(indexed['node2.example.com'][:data][:stdout]).to eq('ok2') + end + + it 'discards responses from unexpected senders' do + results = [ + { sender: 'node1.example.com', data: { exitcode: 0 } }, + { sender: 'evil.example.com', data: { exitcode: 0 } } + ] + + indexed = transport.index_results_by_sender(results, [target, target2], 'test') + + expect(indexed.keys).to contain_exactly('node1.example.com') + expect(indexed).not_to have_key('evil.example.com') + end + + it 'discards responses with nil sender' do + results = [ + { sender: 'node1.example.com', data: { exitcode: 0 } }, + { sender: nil, data: { exitcode: 0 } } + ] + + indexed = transport.index_results_by_sender(results, [target, target2], 'test') + + expect(indexed.keys).to contain_exactly('node1.example.com') + end + + it 'keeps first response and ignores duplicate with same data' do + results = [ + { sender: 'node1.example.com', data: { exitcode: 0 } }, + { sender: 'node1.example.com', data: { exitcode: 0 } } + ] + + indexed = transport.index_results_by_sender(results, [target], 'test') + + expect(indexed.size).to eq(1) + expect(indexed['node1.example.com'][:data][:exitcode]).to eq(0) + end + + it 'keeps first response and ignores duplicate with different data' do + results = [ + { sender: 'node1.example.com', data: { exitcode: 0 } }, + { sender: 'node1.example.com', data: { exitcode: 1, stderr: 'dup' } } + ] + + indexed = transport.index_results_by_sender(results, [target], 'test') + + expect(indexed.size).to eq(1) + expect(indexed['node1.example.com'][:data][:exitcode]).to eq(0) + end + + it 'returns empty hash for empty results' do + indexed = transport.index_results_by_sender([], [target], 'test') + + expect(indexed).to be_empty + end + + it 'returns empty hash when no results match expected targets' do + results = [ + { sender: 'rogue1.example.com', data: { exitcode: 0 } }, + { sender: 'rogue2.example.com', data: { exitcode: 0 } } + ] + + indexed = 
transport.index_results_by_sender(results, [target], 'test') + + expect(indexed).to be_empty + end + + it 'logs warnings for unexpected and nil senders' do + logger = transport.logger + expect(logger).to receive(:warn).twice + + results = [ + { sender: nil, data: { exitcode: 0 } }, + { sender: 'evil.example.com', data: { exitcode: 0 } } + ] + + transport.index_results_by_sender(results, [target], 'test') + end + + it 'logs debug for identical duplicate and warn for different-data duplicate' do + logger = transport.logger + expect(logger).to receive(:debug) + expect(logger).to receive(:warn) + + results = [ + { sender: 'node1.example.com', data: { exitcode: 0 } }, + { sender: 'node1.example.com', data: { exitcode: 0 } }, + { sender: 'node1.example.com', data: { exitcode: 1 } } + ] + + transport.index_results_by_sender(results, [target], 'test') + end + end +end diff --git a/spec/unit/transport/choria/command_builders_spec.rb b/spec/unit/transport/choria/command_builders_spec.rb new file mode 100644 index 000000000..589e0d55a --- /dev/null +++ b/spec/unit/transport/choria/command_builders_spec.rb @@ -0,0 +1,229 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'bolt_spec/choria' + +describe 'Bolt::Transport::Choria command builders' do + include_context 'choria transport' + + describe '#stdin_pipe_command (POSIX)' do + it 'pipes data via printf without trailing newline' do + cmd = transport.stdin_pipe_command(target, '{"key":"value"}', '/tmp/task.sh') + expect(cmd).to eq("printf '%s' \\{\\\"key\\\":\\\"value\\\"\\} | /tmp/task.sh") + end + + it 'escapes special shell characters in data' do + cmd = transport.stdin_pipe_command(target, 'data with $pecial & chars', 'mycmd') + expect(cmd).to start_with("printf '%s'") + expect(cmd).to end_with('| mycmd') + end + end + + describe '#escape_arg (POSIX)' do + it 'escapes spaces' do + result = transport.escape_arg(target, 'arg with spaces') + expect(result).to eq('arg\ with\ spaces') + end + + it 'escapes 
single quotes' do + result = transport.escape_arg(target, "it's") + expect(result).to eq("it\\'s") + end + + it 'returns empty string for empty input' do + result = transport.escape_arg(target, '') + expect(result).to eq("''") + end + end + + describe '#join_path (POSIX)' do + it 'uses forward slash as separator' do + result = transport.join_path(target, '/tmp', 'bolt-dir', 'mytask.sh') + expect(result).to eq('/tmp/bolt-dir/mytask.sh') + end + end + + describe '#make_dir_command (POSIX)' do + it 'generates a mkdir command with mode 700' do + cmd = transport.make_dir_command(target, '/tmp/bolt-choria-test') + expect(cmd).to eq('mkdir -m 700 -p /tmp/bolt-choria-test') + end + + it 'joins multiple paths into a space-separated list' do + cmd = transport.make_dir_command(target, '/tmp/bolt-dir', '/tmp/bolt-dir/lib') + expect(cmd).to eq('mkdir -m 700 -p /tmp/bolt-dir /tmp/bolt-dir/lib') + end + end + + describe '#prepend_env_vars' do + it 'returns command unchanged when env_vars is nil' do + result = transport.prepend_env_vars(target, 'echo hello', nil, 'test') + expect(result).to eq('echo hello') + end + + it 'returns command unchanged when env_vars is empty' do + result = transport.prepend_env_vars(target, 'echo hello', {}, 'test') + expect(result).to eq('echo hello') + end + + it 'prepends /usr/bin/env with escaped variables' do + result = transport.prepend_env_vars(target, 'mycommand', { 'FOO' => 'bar baz' }, 'test') + expect(result).to eq('/usr/bin/env FOO=bar\ baz mycommand') + end + + it 'handles multiple variables' do + result = transport.prepend_env_vars(target, 'cmd', { 'A' => '1', 'B' => '2' }, 'test') + expect(result).to eq('/usr/bin/env A=1 B=2 cmd') + end + end + + describe '#build_task_command' do + it 'builds a command with stdin piping when input_method is stdin' do + cmd = transport.build_task_command(target, '/tmp/mytask.sh', { 'key' => 'value' }, 'stdin', nil) + expect(cmd).to eq("printf '%s' \\{\\\"key\\\":\\\"value\\\"\\} | /tmp/mytask.sh") + end + 
+ it 'builds a command with environment variables when input_method is environment' do + cmd = transport.build_task_command(target, '/tmp/mytask.sh', { 'key' => 'value' }, 'environment', nil) + expect(cmd).to eq('/usr/bin/env PT_key=value /tmp/mytask.sh') + end + + it 'builds a command with both stdin and environment when input_method is both' do + cmd = transport.build_task_command(target, '/tmp/mytask.sh', { 'key' => 'value' }, 'both', nil) + expect(cmd).to eq("printf '%s' \\{\\\"key\\\":\\\"value\\\"\\} | /usr/bin/env PT_key=value /tmp/mytask.sh") + end + + it 'JSON-serializes non-string argument values for environment variables' do + cmd = transport.build_task_command(target, '/tmp/mytask.sh', + { 'config' => { 'nested' => true }, 'count' => 42 }, + 'environment', nil) + expect(cmd).to eq('/usr/bin/env PT_config=\{\"nested\":true\} PT_count=42 /tmp/mytask.sh') + end + + it 'uses configured interpreter for the task file extension' do + cmd = transport.build_task_command(target, '/tmp/mytask.sh', {}, 'both', { '.sh' => '/opt/bash5/bin/bash' }) + expect(cmd).to match(%r{/opt/bash5/bin/bash\s.*/mytask\.sh}) + end + + it 'uses interpreter with multiple path elements' do + cmd = transport.build_task_command(target, '/tmp/mytask.sh', {}, 'both', { '.sh' => ['/usr/bin/env', 'bash'] }) + expect(cmd).to match(%r{/usr/bin/env bash\s.*/mytask\.sh}) + end + + it 'uses no interpreter when none is configured' do + cmd = transport.build_task_command(target, '/tmp/mytask.sh', {}, 'both', nil) + expect(cmd).to eq("printf '%s' \\{\\} | /tmp/mytask.sh") + end + end + + describe 'Windows command builders' do + before(:each) do + stub_agents(target, %w[rpcutil shell], os_family: 'windows') + transport.configure_client(target) + transport.discover_agents([target]) + end + + describe '#make_dir_command' do + it 'generates a PowerShell New-Item command' do + cmd = transport.make_dir_command(target, 'C:\Windows\Temp\bolt-test') + expect(cmd).to eq("New-Item -ItemType Directory -Force 
-Path 'C:\\Windows\\Temp\\bolt-test'") + end + + it 'joins multiple paths into a comma-separated list' do + cmd = transport.make_dir_command(target, 'C:\temp\bolt-dir', 'C:\temp\bolt-dir\lib') + expect(cmd).to eq("New-Item -ItemType Directory -Force -Path 'C:\\temp\\bolt-dir', 'C:\\temp\\bolt-dir\\lib'") + end + end + + describe '#make_executable_command' do + it 'returns nil because Windows does not need chmod' do + result = transport.make_executable_command(target, 'C:\temp\mytask.ps1') + expect(result).to be_nil + end + end + + describe '#cleanup_dir_command' do + it 'generates a PowerShell Remove-Item command' do + cmd = transport.cleanup_dir_command(target, 'C:\Windows\Temp\bolt-test') + expect(cmd).to eq("Remove-Item -Recurse -Force -Path 'C:\\Windows\\Temp\\bolt-test'") + end + end + + describe '#upload_file_command' do + it 'generates a PowerShell IO.File WriteAllBytes command' do + content_b64 = Base64.strict_encode64('test content') + cmd = transport.upload_file_command(target, content_b64, 'C:\temp\myfile.txt') + expect(cmd).to eq("[IO.File]::WriteAllBytes('C:\\temp\\myfile.txt', " \ + "[Convert]::FromBase64String('#{content_b64}'))") + end + end + + describe '#prepend_env_vars' do + it 'generates $env: syntax' do + cmd = transport.prepend_env_vars(target, 'mycommand', { 'FOO' => 'bar' }, 'test') + expect(cmd).to eq("$env:FOO = 'bar'; & mycommand") + end + + it 'handles multiple environment variables' do + cmd = transport.prepend_env_vars(target, 'cmd', { 'A' => '1', 'B' => '2' }, 'test') + expect(cmd).to eq("$env:A = '1'; $env:B = '2'; & cmd") + end + end + + describe '#stdin_pipe_command' do + it 'generates a here-string pipe' do + cmd = transport.stdin_pipe_command(target, '{"key":"value"}', 'mytask.ps1') + expect(cmd).to eq("@'\n{\"key\":\"value\"}\n'@ | & mytask.ps1") + end + end + + describe '#escape_arg' do + it 'wraps the argument in single quotes' do + result = transport.escape_arg(target, 'my argument') + expect(result).to eq("'my argument'") + 
end + + it 'escapes single quotes by doubling them' do + result = transport.escape_arg(target, "it's a test") + expect(result).to eq("'it''s a test'") + end + end + + describe '#join_path' do + it 'uses backslash as separator' do + result = transport.join_path(target, 'C:\temp', 'bolt-dir', 'mytask.ps1') + expect(result).to eq('C:\temp\bolt-dir\mytask.ps1') + end + end + + describe '#powershell_cmd' do + it 'uses -EncodedCommand with Base64-encoded UTF-16LE' do + cmd = transport.powershell_cmd('Write-Host "hello"') + expect(cmd).to start_with('powershell.exe -NoProfile -NonInteractive -EncodedCommand ') + encoded_part = cmd.split('-EncodedCommand ').last + decoded = Base64.decode64(encoded_part).force_encoding('UTF-16LE').encode('UTF-8') + expect(decoded).to eq('Write-Host "hello"') + end + end + + describe '#build_task_command' do + it 'builds a command with PowerShell syntax for stdin piping' do + cmd = transport.build_task_command(target, 'C:\temp\mytask.ps1', + { 'key' => 'value' }, 'stdin', nil) + expect(cmd).to eq("@'\n{\"key\":\"value\"}\n'@ | & 'C:\\temp\\mytask.ps1'") + end + + it 'builds a command with $env: syntax for environment input_method' do + cmd = transport.build_task_command(target, 'C:\temp\mytask.ps1', + { 'key' => 'value' }, 'environment', nil) + expect(cmd).to eq("$env:PT_key = 'value'; & 'C:\\temp\\mytask.ps1'") + end + + it 'builds a command with both stdin and environment for both input_method' do + cmd = transport.build_task_command(target, 'C:\temp\mytask.ps1', + { 'key' => 'value' }, 'both', nil) + expect(cmd).to eq("@'\n{\"key\":\"value\"}\n'@ | & { $env:PT_key = 'value'; $input | & 'C:\\temp\\mytask.ps1' }") + end + end + end +end diff --git a/spec/unit/transport/choria/helpers_spec.rb b/spec/unit/transport/choria/helpers_spec.rb new file mode 100644 index 000000000..4063169ac --- /dev/null +++ b/spec/unit/transport/choria/helpers_spec.rb @@ -0,0 +1,345 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 
'bolt_spec/choria' + +describe 'Bolt::Transport::Choria helpers' do + include_context 'choria transport' + + describe '#build_result' do + it 'builds a successful command Result with stdout, stderr, and exit_code' do + output = { stdout: 'hello', stderr: 'warn', exitcode: 0 } + result = transport.build_result(target, output, action: 'command', name: 'echo hello', position: []) + + expect(result.ok?).to be true + expect(result.value['stdout']).to eq('hello') + expect(result.value['stderr']).to eq('warn') + expect(result.value['exit_code']).to eq(0) + end + + it 'builds a successful script Result with stdout, stderr, and exit_code' do + output = { stdout: 'script output', stderr: '', exitcode: 0 } + result = transport.build_result(target, output, action: 'script', name: 'myscript.sh', position: []) + + expect(result.ok?).to be true + expect(result.value['stdout']).to eq('script output') + expect(result.value['stderr']).to eq('') + expect(result.value['exit_code']).to eq(0) + end + + it 'builds a successful task Result' do + output = { stdout: '{"result": true}', stderr: '', exitcode: 0 } + result = transport.build_result(target, output, action: 'task', name: 'my_task', position: []) + + expect(result.ok?).to be true + expect(result.value).to include('result' => true) + end + + it 'builds a failed command Result when exit code is non-zero' do + output = { stdout: 'partial', stderr: 'error output', exitcode: 2 } + result = transport.build_result(target, output, action: 'command', name: 'failing_cmd', position: []) + + expect(result.ok?).to be false + expect(result.value['stdout']).to eq('partial') + expect(result.value['stderr']).to eq('error output') + expect(result.value['exit_code']).to eq(2) + expect(result.error_hash['kind']).to eq('puppetlabs.tasks/command-error') + expect(result.error_hash['msg']).to include('exit code 2') + end + + it 'builds a failed script Result when exit code is non-zero' do + output = { stdout: '', stderr: 'script failed', exitcode: 4 } + 
result = transport.build_result(target, output, action: 'script', name: 'failing_script.sh', position: []) + + expect(result.ok?).to be false + expect(result.value['stdout']).to eq('') + expect(result.value['stderr']).to eq('script failed') + expect(result.value['exit_code']).to eq(4) + expect(result.error_hash['kind']).to eq('puppetlabs.tasks/command-error') + expect(result.error_hash['msg']).to include('exit code 4') + end + + it 'builds a failed task Result when exit code is non-zero' do + output = { stdout: '', stderr: 'task error output', exitcode: 3 } + result = transport.build_result(target, output, action: 'task', name: 'my_task', position: []) + + expect(result.ok?).to be false + expect(result.error_hash['kind']).to eq('puppetlabs.tasks/task-error') + expect(result.error_hash['msg']).to include('exit code 3') + end + + it 'builds an error Result for command when output has :error key' do + output = { stdout: '', stderr: '', exitcode: 1, + error: 'something failed', error_kind: 'bolt/choria-rpc-error' } + result = transport.build_result(target, output, action: 'command', name: 'echo hi', position: []) + + expect(result.ok?).to be false + expect(result.error_hash['kind']).to eq('bolt/choria-rpc-error') + expect(result.error_hash['msg']).to eq('something failed') + end + + it 'builds an error Result for task when output has :error key' do + output = { stdout: '', stderr: '', exitcode: 1, + error: 'task failed', error_kind: 'bolt/task-error' } + result = transport.build_result(target, output, action: 'task', name: 'my_task', position: []) + + expect(result.ok?).to be false + expect(result.error_hash['kind']).to eq('bolt/task-error') + expect(result.error_hash['msg']).to eq('task failed') + end + + it 'raises Bolt::Error for unknown action' do + output = { stdout: '', stderr: '', exitcode: 0 } + + expect { + transport.build_result(target, output, action: 'unknown', name: 'x', position: []) + }.to raise_error(Bolt::Error, /Unknown action 'unknown'/) + end + + it 
'prioritizes error key over action type' do + output = { stdout: 'some output', stderr: 'some error', exitcode: 1, + error: 'override error', error_kind: 'bolt/test-error' } + result = transport.build_result(target, output, action: 'task', name: 'my_task', position: []) + + expect(result.ok?).to be false + expect(result.error_hash['kind']).to eq('bolt/test-error') + expect(result.error_hash['msg']).to eq('override error') + end + end + + describe '#error_output' do + it 'builds an error hash with default values' do + result = transport.error_output('bad thing happened', 'bolt/test-error') + + expect(result[:error]).to eq('bad thing happened') + expect(result[:error_kind]).to eq('bolt/test-error') + expect(result[:stdout]).to eq('') + expect(result[:stderr]).to eq('') + expect(result[:exitcode]).to eq(1) + end + + it 'preserves provided stdout, stderr, and exitcode' do + result = transport.error_output('failed', 'bolt/test-error', + stdout: 'partial output', stderr: 'error details', exitcode: 42) + + expect(result[:stdout]).to eq('partial output') + expect(result[:stderr]).to eq('error details') + expect(result[:exitcode]).to eq(42) + expect(result[:error]).to eq('failed') + end + end + + describe '#exitcode_from' do + it 'returns the exitcode from data when present' do + result = transport.exitcode_from({ exitcode: 42 }, target, 'test command') + expect(result).to eq(42) + end + + it 'defaults to 1 and logs a warning when exitcode is nil' do + logger = transport.logger + expect(logger).to receive(:warn) + + result = transport.exitcode_from({ exitcode: nil }, target, 'test command') + expect(result).to eq(1) + end + + it 'defaults to 1 when exitcode key is missing' do + logger = transport.logger + expect(logger).to receive(:warn) + + result = transport.exitcode_from({}, target, 'test command') + expect(result).to eq(1) + end + + it 'returns the exitcode when accessed via string key' do + result = transport.exitcode_from({ 'exitcode' => 42 }, target, 'test command') 
+ expect(result).to eq(42) + end + end + + describe '#validate_env_key!' do + it 'allows valid POSIX environment variable names' do + expect { transport.validate_env_key!('MY_VAR_123', 'test') }.not_to raise_error + expect { transport.validate_env_key!('_UNDER', 'test') }.not_to raise_error + end + + it 'rejects env key with backticks' do + expect { + transport.validate_env_key!('`whoami`', 'test') + }.to raise_error(Bolt::Error, /Unsafe environment variable name/) + end + + it 'rejects env key with newline' do + expect { + transport.validate_env_key!("FOO\nBAR=injected", 'test') + }.to raise_error(Bolt::Error, /Unsafe environment variable name/) + end + + it 'rejects env key with null byte' do + expect { + transport.validate_env_key!("FOO\x00BAR", 'test') + }.to raise_error(Bolt::Error, /Unsafe environment variable name/) + end + + it 'rejects env key with spaces' do + expect { + transport.validate_env_key!('FOO BAR', 'test') + }.to raise_error(Bolt::Error, /Unsafe environment variable name/) + end + + it 'rejects env key with equals sign' do + expect { + transport.validate_env_key!('FOO=BAR', 'test') + }.to raise_error(Bolt::Error, /Unsafe environment variable name/) + end + + it 'rejects env key starting with digit' do + expect { + transport.validate_env_key!('9VAR', 'test') + }.to raise_error(Bolt::Error, /Unsafe environment variable name/) + end + + it 'rejects empty env key' do + expect { + transport.validate_env_key!('', 'test') + }.to raise_error(Bolt::Error, /Unsafe environment variable name/) + end + + it 'rejects env key with shell metacharacters' do + expect { + transport.validate_env_key!('BAD$(evil)', 'test') + }.to raise_error(Bolt::Error, /Unsafe environment variable name/) + end + end + + describe '#validate_file_name!' 
do + it 'accepts a simple file name' do + expect { transport.validate_file_name!('mytask.sh') }.not_to raise_error + end + + it 'accepts a module-relative path without traversal' do + expect { transport.validate_file_name!('mymod/files/helper.rb') }.not_to raise_error + end + + it 'rejects file names with null bytes' do + expect { + transport.validate_file_name!("legit.sh\x00../../etc/passwd") + }.to raise_error(Bolt::Error, /Invalid null byte/) + end + + it 'rejects absolute paths' do + expect { + transport.validate_file_name!('/etc/passwd') + }.to raise_error(Bolt::Error, /Absolute path not allowed/) + end + + it 'rejects path traversal with ..' do + expect { + transport.validate_file_name!('../../etc/shadow') + }.to raise_error(Bolt::Error, /Path traversal detected/) + end + + it 'rejects .. that stays within bounds' do + expect { + transport.validate_file_name!('mymod/tasks/../tasks/file.sh') + }.to raise_error(Bolt::Error, /Path traversal detected/) + end + + it 'rejects trailing ..' do + expect { + transport.validate_file_name!('mymod/..') + }.to raise_error(Bolt::Error, /Path traversal detected/) + end + + it 'rejects bare .. as file name' do + expect { + transport.validate_file_name!('..') + }.to raise_error(Bolt::Error, /Path traversal detected/) + end + + it 'does not reject names containing .. as a substring in a segment' do + # "foo..bar" has no ".." 
path segment, so it should be allowed + expect { transport.validate_file_name!('foo..bar') }.not_to raise_error + end + + it 'rejects Windows absolute paths like C:\Windows\cmd.exe' do + expect { + transport.validate_file_name!('C:\Windows\cmd.exe') + }.to raise_error(Bolt::Error, /Absolute path not allowed/) + end + + it 'rejects backslash traversal like ..\..\..\etc\passwd' do + expect { + transport.validate_file_name!('..\..\..\etc\passwd') + }.to raise_error(Bolt::Error, /Path traversal detected/) + end + + it 'accepts valid backslash-separated paths like mymod\tasks\mytask.ps1' do + expect { transport.validate_file_name!('mymod\tasks\mytask.ps1') }.not_to raise_error + end + end + + describe '#poll_with_retries' do + it 'returns completed targets from each round' do + rounds = [ + { rpc_failed: false, done: { target => { stdout: 'ok', stderr: '', exitcode: 0 } } } + ] + result = transport.poll_with_retries([target], 30, 'test') { rounds.shift } + + expect(result[:completed][target][:stdout]).to eq('ok') + expect(result[:remaining]).to be_empty + expect(result[:rpc_persistent_failure]).to be false + end + + it 'retries on rpc_failed and gives up after RPC_FAILURE_RETRIES' do + round = { rpc_failed: true, done: {} } + result = transport.poll_with_retries([target], 30, 'test') { round } + + expect(result[:remaining]).to include(target) + expect(result[:rpc_persistent_failure]).to be true + expect(result[:completed]).to be_empty + end + + it 'resets failure counter after a successful round' do + rounds = [ + { rpc_failed: true, done: {} }, + { rpc_failed: true, done: {} }, + { rpc_failed: false, done: { target => { stdout: 'recovered', stderr: '', exitcode: 0 } } } + ] + result = transport.poll_with_retries([target], 30, 'test') { rounds.shift } + + expect(result[:completed][target][:stdout]).to eq('recovered') + expect(result[:remaining]).to be_empty + expect(result[:rpc_persistent_failure]).to be false + end + + it 'stops when deadline is exceeded' do + 
allow(Process).to receive(:clock_gettime).and_return(0, 0, 100) + result = transport.poll_with_retries([target], 5, 'test') do + { rpc_failed: false, done: {} } + end + + expect(result[:remaining]).to include(target) + expect(result[:rpc_persistent_failure]).to be false + end + + it 'works with Hash targets (shell handles)' do + pending_handles = { target => { handle: 'abc-123' } } + result = transport.poll_with_retries(pending_handles, 30, 'test') do |_remaining| + { rpc_failed: false, done: { target => { stdout: 'done', stderr: '', exitcode: 0 } } } + end + + expect(result[:completed][target][:stdout]).to eq('done') + expect(result[:remaining]).to be_empty + end + + it 'does not mutate the original targets collection' do + original = [target] + transport.poll_with_retries(original, 30, 'test') do + { rpc_failed: false, done: { target => { stdout: '', stderr: '', exitcode: 0 } } } + end + + expect(original).to eq([target]) + end + end +end diff --git a/spec/unit/transport/choria/shell_spec.rb b/spec/unit/transport/choria/shell_spec.rb new file mode 100644 index 000000000..647786324 --- /dev/null +++ b/spec/unit/transport/choria/shell_spec.rb @@ -0,0 +1,759 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'bolt_spec/choria' + +describe Bolt::Transport::Choria do + include_context 'choria transport' + include_context 'choria task' + + describe '#generate_tmpdir_path' do + it 'generates a path under the target tmpdir with a uuid suffix' do + allow(SecureRandom).to receive(:uuid).and_return('test-uuid') + path = transport.generate_tmpdir_path(target) + expect(path).to eq('/tmp/bolt-choria-test-uuid') + end + + it 'respects custom tmpdir from target options' do + inventory.set_config(target, %w[choria tmpdir], '/var/tmp') + allow(SecureRandom).to receive(:uuid).and_return('test-uuid') + path = transport.generate_tmpdir_path(target) + expect(path).to eq('/var/tmp/bolt-choria-test-uuid') + end + + context 'on Windows targets' do + before(:each) do + 
stub_agents(target, %w[rpcutil shell], os_family: 'windows') + transport.configure_client(target) + transport.discover_agents([target]) + end + + it 'uses C:\Windows\Temp when default tmpdir is /tmp on a Windows target' do + allow(SecureRandom).to receive(:uuid).and_return('test-uuid') + path = transport.generate_tmpdir_path(target) + expect(path).to eq('C:\Windows\Temp\bolt-choria-test-uuid') + end + + it 'respects custom tmpdir on Windows targets' do + inventory.set_config(target, 'choria', 'tmpdir' => 'D:\bolt\tmp') + allow(SecureRandom).to receive(:uuid).and_return('test-uuid') + path = transport.generate_tmpdir_path(target) + expect(path).to eq('D:\bolt\tmp\bolt-choria-test-uuid') + end + end + end + + describe '#batch_command' do + include_context 'choria multi-target' + + it 'executes a command and returns stdout, stderr, exit_code' do + stub_agents([target, target2], %w[rpcutil shell]) + stub_shell_start({ target => { handle: 'h1' }, target2 => { handle: 'h2' } }) + stub_shell_list({ target => { handle: 'h1' }, target2 => { handle: 'h2' } }) + stub_shell_status({ target => { handle: 'h1', stdout: 'hello', stderr: 'warn' }, + target2 => { handle: 'h2', stdout: 'hello', stderr: 'warn' } }) + + events = [] + callback = proc { |event| events << event } + + results = transport.batch_command([target, target2], 'echo hello', {}, [], &callback) + expect(results.length).to eq(2) + results.each { |result| expect(result.value).to eq('stdout' => 'hello', 'stderr' => 'warn', 'exit_code' => 0) } + + started_targets = events.select { |event| event[:type] == :node_start }.map { |event| event[:target] } + finished_targets = events.select { |event| event[:type] == :node_result }.map { |event| event[:result].target } + expect(started_targets).to contain_exactly(target, target2) + expect(finished_targets).to contain_exactly(target, target2) + end + + it 'returns non-zero exit codes without raising' do + stub_agents(target, %w[rpcutil shell]) + stub_shell_start + 
stub_shell_list + stub_shell_status(stdout: '', stderr: 'fail', exitcode: 42) + stub_shell_kill + + result = transport.batch_command([target], 'exit 42').first + expect(result.value['exit_code']).to eq(42) + end + + it 'returns error when shell agent is not available' do + stub_agents(target, %w[rpcutil bolt_tasks]) + + result = transport.batch_command([target], 'echo hello').first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/shell.*agent.*not available/) + end + + it 'fires error for targets without shell agent' do + stub_agents(target, %w[rpcutil shell]) + stub_agents(target2, %w[rpcutil]) + + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_list({ target => { handle: 'h1' } }) + stub_shell_status({ target => { handle: 'h1', stdout: 'hello' } }) + + results = transport.batch_command([target, target2], 'echo hello') + expect(results.length).to eq(2) + + ok_results = results.select(&:ok?) + error_results = results.reject(&:ok?) + expect(ok_results.length).to eq(1) + expect(ok_results.first.target).to eq(target) + expect(error_results.length).to eq(1) + expect(error_results.first.target).to eq(target2) + expect(error_results.first.error_hash['msg']).to match(/shell.*agent.*not available/) + end + + it 'returns timeout error when command exceeds command-timeout' do + stub_agents(target, %w[rpcutil shell]) + stub_shell_start + stub_shell_kill + + # shell.list always reports the process as still running + list_data = { + jobs: { 'test-handle-uuid' => { 'id' => 'test-handle-uuid', 'status' => 'running' } } + } + allow(mock_rpc_client).to receive(:list).and_return( + [make_rpc_result(sender: target, data: list_data)] + ) + + # Force immediate timeout via monotonic clock + allow(Process).to receive(:clock_gettime).and_return(0, 0, 100) + inventory.set_config(target, %w[choria command-timeout], 1) + + results = transport.batch_command([target], 'sleep 999', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be 
false + expect(results.first.error_hash['kind']).to eq('bolt/choria-command-timeout') + expect(results.first.error_hash['msg']).to match(/timed out/) + end + + describe 'Windows targets' do + it 'wraps commands in PowerShell encoding for Windows targets' do + stub_agents(target, %w[rpcutil shell], os_family: 'windows') + + captured_cmd = nil + allow(mock_rpc_client).to receive(:start) do |args| + captured_cmd = args[:command] + [make_rpc_result(sender: target, data: { handle: 'test-handle-uuid' })] + end + stub_shell_list + stub_shell_status(stdout: 'ok') + + transport.batch_command([target], 'hostname', {}) + expect(captured_cmd).to start_with('powershell.exe -NoProfile -NonInteractive -EncodedCommand ') + end + end + + describe 'env_vars' do + before(:each) do + stub_agents(target, %w[rpcutil shell]) + stub_shell_start + stub_shell_list + stub_shell_status(stdout: 'ok') + stub_shell_kill + end + + it 'passes env_vars to the command via /usr/bin/env' do + expect(mock_rpc_client).to receive(:start) do |args| + expect(args[:command]).to eq('/usr/bin/env MY_VAR=hello mycommand') + [make_rpc_result(sender: target, data: { handle: 'test-handle-uuid' })] + end + + transport.batch_command([target], 'mycommand', { env_vars: { 'MY_VAR' => 'hello' } }) + end + end + + describe 'error messages' do + it 'includes the actual error from shell.start in the error message' do + stub_agents(target, %w[rpcutil shell]) + + start_result = make_rpc_result(sender: target, statuscode: 5, + statusmsg: 'Unknown action start for agent shell') + allow(mock_rpc_client).to receive(:start).and_return([start_result]) + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['msg']).to include('Unknown action start for agent shell') + end + + it 'returns error results for all targets when start raises' do + stub_agents(target, %w[rpcutil shell]) + + allow(mock_rpc_client).to 
receive(:start).and_raise(StandardError, 'NATS broker down') + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['kind']).to eq('bolt/choria-rpc-failed') + expect(results.first.error_hash['msg']).to match(/NATS broker down/) + end + end + end + + describe '#batch_script' do + let(:script_path) { '/tmp/test_script.sh' } + let(:script_content) { "#!/bin/bash\necho hello" } + + include_context 'choria multi-target' + include_context 'choria script file stubs' + + it 'uploads script, makes executable, and runs it on multiple targets' do + stub_agents([target, target2], %w[rpcutil shell]) + stub_shell_run({ target => {}, target2 => {} }) + stub_shell_start({ target => { handle: 'h1' }, target2 => { handle: 'h2' } }) + stub_shell_list({ target => { handle: 'h1' }, target2 => { handle: 'h2' } }) + stub_shell_status({ target => { handle: 'h1', stdout: 'hello' }, + target2 => { handle: 'h2', stdout: 'hello' } }) + + events = [] + callback = proc { |event| events << event } + + results = transport.batch_script([target, target2], script_path, [], {}, [], &callback) + expect(results.length).to eq(2) + results.each { |result| expect(result.value).to eq('stdout' => 'hello', 'stderr' => '', 'exit_code' => 0) } + + starts = events.select { |event| event[:type] == :node_start } + finishes = events.select { |event| event[:type] == :node_result } + expect(starts.length).to eq(2) + expect(finishes.length).to eq(2) + end + + describe 'command building' do + before(:each) do + stub_agents(target, %w[rpcutil shell]) + stub_shell_run + stub_shell_start + stub_shell_list + stub_shell_status(stdout: 'hello') + stub_shell_kill + end + + it 'escapes script arguments' do + captured_cmd = nil + allow(mock_rpc_client).to receive(:start) do |args| + captured_cmd = args[:command] + [make_rpc_result(sender: target, data: { handle: 'test-handle-uuid' })] + end + + 
transport.batch_script([target], script_path, ['arg with spaces'], {}, []) + expect(captured_cmd).to include("arg\\ with\\ spaces") + end + + it 'prepends interpreter when script_interpreter option is set' do + inventory.set_config(target, %w[choria interpreters .sh], '/usr/local/bin/bash') + + captured_cmd = nil + allow(mock_rpc_client).to receive(:start) do |args| + captured_cmd = args[:command] + [make_rpc_result(sender: target, data: { handle: 'test-handle-uuid' })] + end + + transport.batch_script([target], script_path, [], { script_interpreter: true }, []) + expect(captured_cmd).to match(%r{/usr/local/bin/bash\s.*/test_script\.sh}) + end + + it 'does not prepend interpreter when script_interpreter option is not set' do + inventory.set_config(target, %w[choria interpreters .sh], '/usr/local/bin/bash') + + captured_cmd = nil + allow(mock_rpc_client).to receive(:start) do |args| + captured_cmd = args[:command] + [make_rpc_result(sender: target, data: { handle: 'test-handle-uuid' })] + end + + transport.batch_script([target], script_path, [], {}, []) + expect(captured_cmd).not_to include('/usr/local/bin/bash') + end + end + + describe 'infrastructure step failures' do + before(:each) do + stub_agents([target, target2], %w[rpcutil shell]) + end + + it 'excludes target that fails mkdir and continues with remaining' do + run_calls = 0 + allow(mock_rpc_client).to receive(:run) do + run_calls += 1 + if run_calls == 1 + [ + make_rpc_result(sender: target, data: { stdout: '', stderr: '', exitcode: 0 }), + make_rpc_result(sender: target2, data: { stdout: '', stderr: 'permission denied', exitcode: 1 }) + ] + else + [make_rpc_result(sender: target, data: { stdout: '', stderr: '', exitcode: 0 })] + end + end + + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_list({ target => { handle: 'h1' } }) + stub_shell_status({ target => { handle: 'h1', stdout: 'hello' } }) + + events = [] + results = transport.batch_script([target, target2], script_path, [], {}, [], 
&proc { |event| events << event }) + + expect(results.length).to eq(2) + + ok_results = results.select(&:ok?) + error_results = results.reject(&:ok?) + expect(ok_results.length).to eq(1) + expect(ok_results.first.target).to eq(target) + expect(error_results.length).to eq(1) + expect(error_results.first.target).to eq(target2) + expect(error_results.first.error_hash['kind']).to eq('bolt/choria-operation-failed') + expect(error_results.first.error_hash['msg']).to match(/permission denied/) + + started_targets = events.select { |event| event[:type] == :node_start }.map { |event| event[:target] } + finished_targets = events.select { |event| event[:type] == :node_result }.map { |event| event[:result].target } + expect(started_targets).to contain_exactly(target, target2) + expect(finished_targets).to contain_exactly(target, target2) + end + + it 'returns all errors when all targets fail infrastructure setup' do + stub_shell_run({ target => { stderr: 'no space', exitcode: 1 }, + target2 => { stderr: 'no space', exitcode: 1 } }) + + results = transport.batch_script([target, target2], script_path, [], {}) + expect(results.length).to eq(2) + results.each do |result| + expect(result.ok?).to be false + expect(result.error_hash['kind']).to eq('bolt/choria-operation-failed') + end + end + end + end + + describe '#run_task_via_shell' do + before(:each) do + inventory.set_config(target, %w[choria choria-agent], 'shell') + stub_agents(target, %w[rpcutil shell]) + stub_shell_run + stub_shell_start + stub_shell_list + stub_shell_status(stdout: '{"result":"ok"}') + stub_shell_kill + + allow(File).to receive(:binread).and_call_original + allow(File).to receive(:binread).with(task_executable).and_return(task_content) + allow(File).to receive(:basename).and_call_original + allow(SecureRandom).to receive(:uuid).and_return('test-uuid') + end + + it 'uploads task file, makes executable, and runs it' do + result = transport.batch_task([target], task, {}).first + expect(result.value).to 
eq('result' => 'ok') + end + + # Command-building details (stdin/environment/both input methods, + # JSON serialization, interpreter selection) are covered by the + # pure function tests for #build_task_command in command_builders_spec. + + it 'does not mutate the original arguments hash' do + original_args = { 'key' => 'value' } + original_args_dup = original_args.dup + + extra_task = Bolt::Task.new( + task_name, + { 'input_method' => 'both', 'files' => ['mymod/lib/helper.rb'] }, + [ + { 'name' => 'mytask.sh', 'path' => task_executable }, + { 'name' => 'mymod/lib/helper.rb', 'path' => '/path/to/mymod/lib/helper.rb' } + ] + ) + allow(File).to receive(:binread).with('/path/to/mymod/lib/helper.rb').and_return('# helper') + + transport.batch_task([target], extra_task, original_args) + + expect(original_args).to eq(original_args_dup) + end + + it 'handles tasks with extra files' do + extra_task = Bolt::Task.new( + task_name, + { 'input_method' => 'both', 'files' => ['mymod/lib/helper.rb'] }, + [ + { 'name' => 'mytask.sh', 'path' => task_executable }, + { 'name' => 'mymod/lib/helper.rb', 'path' => '/path/to/mymod/lib/helper.rb' } + ] + ) + allow(File).to receive(:binread).with('/path/to/mymod/lib/helper.rb').and_return('# helper') + + result = transport.batch_task([target], extra_task, {}).first + expect(result.ok?).to be true + expect(result.value).to eq('result' => 'ok') + end + end + + describe '#shell_run' do + before(:each) do + transport.configure_client(target) + end + + it 'returns empty hash when all targets succeed' do + stub_shell_run({ target => {}, target2 => {} }) + + failures = transport.shell_run([target, target2], 'echo ok', + description: 'test') + expect(failures).to be_empty + end + + it 'returns failures for non-responding targets' do + stub_shell_run({ target => {} }) + + failures = transport.shell_run([target, target2], 'echo ok', + description: 'test') + expect(failures.keys).to eq([target2]) + expect(failures[target2][:error]).to match(/No 
response/) + end + + it 'returns failures for non-zero exit codes' do + stub_shell_run({ target => {}, + target2 => { exitcode: 1, stderr: 'Permission denied' } }) + + failures = transport.shell_run([target, target2], 'mkdir /foo', + description: 'mkdir') + expect(failures.keys).to eq([target2]) + expect(failures[target2][:error]).to match(/exit code 1/) + end + + it 'returns all targets as failed when RPC call raises' do + allow(mock_rpc_client).to receive(:run).and_raise(StandardError, 'NATS timeout') + + failures = transport.shell_run([target, target2], 'echo ok', + description: 'test') + expect(failures.keys).to contain_exactly(target, target2) + expect(failures[target][:error]).to match(/NATS timeout/) + end + + it 'returns failures for non-zero RPC statuscodes' do + allow(mock_rpc_client).to receive(:run).and_return([ + make_shell_run_result(target), + make_rpc_result(sender: target2, statuscode: 5, + statusmsg: 'Authorization denied', data: {}) + ]) + + failures = transport.shell_run([target, target2], 'echo ok', + description: 'test') + expect(failures.keys).to eq([target2]) + expect(failures[target2][:error]).to match(/test on .+ returned RPC error: Authorization denied/) + end + end + + describe '#shell_start' do + before(:each) do + transport.configure_client(target) + end + + it 'returns error output hashes for all targets when RPC call raises' do + allow(mock_rpc_client).to receive(:start).and_raise(StandardError, 'NATS connection lost') + + pending_map, errors = transport.shell_start([target, target2], 'echo hi') + expect(pending_map).to be_empty + expect(errors.length).to eq(2) + errors.each_value do |output| + expect(output[:error_kind]).to eq('bolt/choria-rpc-failed') + expect(output[:error]).to match(/NATS connection lost/) + end + end + + it 'handles nil data from shell.start gracefully' do + stub_agents(target, %w[rpcutil shell]) + + start_result = make_rpc_result(sender: target, statuscode: 0, data: nil) + allow(mock_rpc_client).to 
receive(:start).and_return([start_result]) + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['msg']).to match(/no handle/) + end + end + + describe '#upload_file_content' do + before(:each) do + transport.configure_client(target) + end + + it 'base64-encodes content and runs via shell_run' do + content = "binary\x00content\nwith\tnewlines" + + allow(mock_rpc_client).to receive(:run) do |args| + expect(args[:command]).to match(%r{printf '%s' .+ \| base64 -d > /remote/path}) + [make_rpc_result(sender: target, data: { stdout: '', stderr: '', exitcode: 0 })] + end + + failures = transport.upload_file_content([target], content, '/remote/path') + expect(failures).to be_empty + end + + it 'returns failures from underlying shell_run' do + allow(mock_rpc_client).to receive(:run).and_return([ + make_rpc_result(sender: target, + data: { stdout: '', stderr: 'disk full', exitcode: 1 }) + ]) + + failures = transport.upload_file_content([target], 'data', '/remote/path') + expect(failures).to have_key(target) + expect(failures[target][:error]).to match(/disk full/) + end + end + + describe '#cleanup_tmpdir' do + before(:each) do + stub_agents(target, %w[rpcutil shell]) + end + + it 'skips cleanup when cleanup option is false' do + inventory.set_config(target, %w[choria cleanup], false) + stub_shell_start + stub_shell_list + stub_shell_status(stdout: 'ok') + allow(File).to receive(:binread).and_return('#!/bin/bash\necho hi') + + run_commands = [] + allow(mock_rpc_client).to receive(:run) do |args| + run_commands << args[:command] + [make_rpc_result(sender: target, data: { exitcode: 0 })] + end + + transport.batch_script([target], task_executable, [], {}) + expect(run_commands).not_to include(match(/rm -rf/)) + end + + it 'does not mask task results when cleanup fails' do + stub_shell_start + stub_shell_list + stub_shell_status(stdout: 'task output') + 
allow(File).to receive(:binread).and_return("#!/bin/bash\necho hi") + + allow(mock_rpc_client).to receive(:run) do |args| + if args[:command]&.include?('rm -rf') + raise StandardError, 'NATS timeout during cleanup' + end + + [make_rpc_result(sender: target, data: { exitcode: 0 })] + end + + results = transport.batch_script([target], task_executable, [], {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be true + end + + it 'refuses to delete paths that do not start with bolt-choria-' do + transport.configure_client(target) + + expect(mock_rpc_client).not_to receive(:run) + transport.cleanup_tmpdir([target], '/tmp/some-other-dir') + end + end + + describe '#shell_statuses' do + before(:each) do + stub_agents(target, %w[rpcutil shell]) + end + + it 'returns error result when status is failed' do + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_list({ target => { handle: 'h1', status: 'failed' } }) + stub_shell_status({ target => { handle: 'h1', status: 'failed', stderr: 'exec format error', exitcode: nil } }) + stub_shell_kill + + results = transport.batch_command([target], 'bad_command', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['kind']).to eq('bolt/choria-process-failed') + end + + it 'returns error result when status is error (handle not found)' do + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_list({ target => { handle: 'h1', status: 'stopped' } }) + stub_shell_kill + + allow(mock_rpc_client).to receive(:statuses).and_return([ + make_rpc_result(sender: target, data: { + statuses: { 'h1' => { 'status' => 'error' } } + }) + ]) + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['kind']).to eq('bolt/choria-handle-not-found') + end + + it 'returns error result when statuses data is nil' do + stub_shell_start({ target => { handle: 
'h1' } }) + stub_shell_list({ target => { handle: 'h1' } }) + stub_shell_kill + + allow(mock_rpc_client).to receive(:statuses).and_return([ + make_rpc_result(sender: target, data: { statuses: nil }) + ]) + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['kind']).to eq('bolt/choria-missing-data') + end + + it 'returns error result when specific handle is missing from statuses' do + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_list({ target => { handle: 'h1' } }) + stub_shell_kill + + statuses_data = { + statuses: { 'wrong-handle' => { 'status' => 'stopped', 'stdout' => 'ok' } } + } + allow(mock_rpc_client).to receive(:statuses).and_return( + [make_rpc_result(sender: target, data: statuses_data)] + ) + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['kind']).to eq('bolt/choria-missing-data') + expect(results.first.error_hash['msg']).to match(/did not include handle/) + end + + it 'returns error results for all targets when statuses raises' do + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_list({ target => { handle: 'h1' } }) + + allow(mock_rpc_client).to receive(:statuses).and_raise(StandardError, 'NATS timeout') + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + # The exception is caught by rpc_request's rescue (before shell_statuses' + # rescue), which returns it as a rpc-failed error for the target. 
+ expect(results.first.error_hash['kind']).to eq('bolt/choria-rpc-failed') + expect(results.first.error_hash['msg']).to match(/NATS timeout/) + end + end + + describe '#shell_list' do + before(:each) do + stub_agents(target, %w[rpcutil shell]) + end + + it 'returns error when handle is not found in shell.list response' do + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_kill + + allow(mock_rpc_client).to receive(:list).and_return([ + make_rpc_result(sender: target, + data: { jobs: { 'other-handle' => { 'status' => 'stopped' } } }) + ]) + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['kind']).to eq('bolt/choria-handle-not-found') + end + + it 'returns error when shell.list responds with nil data' do + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_kill + + allow(mock_rpc_client).to receive(:list).and_return([ + make_rpc_result(sender: target, data: nil) + ]) + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + end + end + + describe '#wait_for_shell_results' do + describe 'persistent poll failure' do + before(:each) do + stub_agents(target, %w[rpcutil shell]) + end + + it 'fails all targets after 3 consecutive poll RPC failures' do + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_kill + + allow(mock_rpc_client).to receive(:list) + .and_raise(StandardError, 'NATS connection lost') + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['kind']).to eq('bolt/choria-poll-failed') + expect(results.first.error_hash['msg']).to match(/failed persistently/) + end + end + + describe 'RPC error statuscode during polling' do + before(:each) do + stub_agents(target, %w[rpcutil shell]) + end + + it 'returns error 
result immediately when shell.list returns non-zero statuscode' do + stub_shell_start({ target => { handle: 'h1' } }) + stub_shell_kill + + list_result = make_rpc_result(sender: target, statuscode: 4, statusmsg: 'Authorization denied') + allow(mock_rpc_client).to receive(:list).and_return([list_result]) + + results = transport.batch_command([target], 'hostname', {}) + expect(results.length).to eq(1) + expect(results.first.ok?).to be false + expect(results.first.error_hash['kind']).to eq('bolt/choria-rpc-error') + expect(results.first.error_hash['msg']).to match(/Authorization denied/) + expect(results.first.error_hash['msg']).to match(/code 4/) + end + end + + describe 'partial target failures during polling' do + before(:each) do + transport.configure_client(target) + end + + it 'completes unaffected targets when one target does not respond to statuses' do + stub_agents([target, target2], %w[rpcutil shell]) + + stub_shell_start({ target => { handle: 'h1' }, target2 => { handle: 'h2' } }) + stub_shell_list({ target => { handle: 'h1' }, target2 => { handle: 'h2' } }) + # Only target2 responds to shell.statuses. target does not respond, + # so rpc_request will put it in errors as a no-response error. + stub_shell_status({ target2 => { handle: 'h2', stdout: 'hello' } }) + stub_shell_kill + + results = transport.batch_command([target, target2], 'hostname', {}) + expect(results.length).to eq(2) + + ok_results = results.select(&:ok?) + error_results = results.reject(&:ok?) 
+ expect(ok_results.length).to eq(1) + expect(ok_results.first.target).to eq(target2) + expect(error_results.length).to eq(1) + expect(error_results.first.target).to eq(target) + expect(error_results.first.error_hash['msg']).to match(/No response/) + end + end + end + + describe '#kill_timed_out_processes' do + before(:each) { transport.configure_client(target) } + + it 'still returns timeout error when kill raises' do + pending_map = { target => { handle: 'h1' } } + + # shell_list never finds completion, so the loop times out + allow(transport).to receive(:shell_list).and_return([{}, false]) + allow(mock_rpc_client).to receive(:kill).and_raise(StandardError, 'NATS timeout on kill') + + inventory.set_config(target, %w[choria command-timeout], 1) + outputs = transport.wait_for_shell_results(pending_map, 1) + + expect(outputs.length).to eq(1) + expect(outputs[target][:error_kind]).to eq('bolt/choria-command-timeout') + end + end +end diff --git a/spec/unit/transport/choria_spec.rb b/spec/unit/transport/choria_spec.rb new file mode 100644 index 000000000..5e1bd37cd --- /dev/null +++ b/spec/unit/transport/choria_spec.rb @@ -0,0 +1,357 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'bolt_spec/choria' +require 'bolt_spec/sensitive' + +describe Bolt::Transport::Choria do + include_context 'choria transport' + include_context 'choria task' + include BoltSpec::Sensitive + + describe '#provided_features' do + it 'includes shell and powershell' do + expect(transport.provided_features).to eq(%w[shell powershell]) + end + end + + describe '#select_implementation' do + before(:each) do + transport.configure_client(target) + end + + it 'selects the shell implementation for a Linux target' do + stub_agents(target, %w[rpcutil shell], os_family: 'RedHat') + transport.discover_agents([target]) + + impl = transport.select_implementation(target, cross_platform_task) + expect(impl['name']).to eq('crosstask.sh') + end + + it 'selects the powershell implementation for 
a Windows target' do + stub_agents(target, %w[rpcutil shell], os_family: 'windows') + transport.discover_agents([target]) + + impl = transport.select_implementation(target, cross_platform_task) + expect(impl['name']).to eq('crosstask.ps1') + end + + it 'raises when a Linux target runs a Windows-only task' do + stub_agents(target, %w[rpcutil shell], os_family: 'RedHat') + transport.discover_agents([target]) + + expect { + transport.select_implementation(target, windows_only_task) + }.to raise_error(Bolt::Error, /No suitable implementation/) + end + + it 'raises when a Windows target runs a Linux-only task' do + stub_agents(target, %w[rpcutil shell], os_family: 'windows') + transport.discover_agents([target]) + + expect { + transport.select_implementation(target, linux_only_task) + }.to raise_error(Bolt::Error, /No suitable implementation/) + end + end + + describe '#batch_connected?' do + include_context 'choria multi-target' + + it 'returns true when all targets respond to ping' do + r1 = make_rpc_result(sender: target) + r2 = make_rpc_result(sender: target2) + allow(mock_rpc_client).to receive(:ping).and_return([r1, r2]) + + expect(transport.batch_connected?([target, target2])).to be true + end + + it 'returns false when some targets do not respond' do + r1 = make_rpc_result(sender: target) + allow(mock_rpc_client).to receive(:ping).and_return([r1]) + + expect(transport.batch_connected?([target, target2])).to be false + end + + it 'returns false on error' do + allow(mock_rpc_client).to receive(:ping).and_raise(StandardError, 'NATS timeout') + expect(transport.batch_connected?([target, target2])).to be false + end + + it 're-raises Bolt::Error instead of returning false' do + allow(mock_rpc_client).to receive(:ping).and_raise( + Bolt::Error.new('Config problem', 'bolt/choria-config-failed') + ) + expect { transport.batch_connected?([target, target2]) }.to raise_error( + Bolt::Error, 'Config problem' + ) + end + + it 'ignores responses from unexpected senders' do + 
r1 = make_rpc_result(sender: target) + r2 = make_rpc_result(sender: target2) + rogue = make_rpc_result(sender: 'rogue.example.com') + allow(mock_rpc_client).to receive(:ping).and_return([r1, r2, rogue]) + + expect(transport.batch_connected?([target, target2])).to be true + end + + it 'returns false when expected target is missing despite extra senders' do + r1 = make_rpc_result(sender: target) + rogue = make_rpc_result(sender: 'rogue.example.com') + allow(mock_rpc_client).to receive(:ping).and_return([r1, rogue]) + + expect(transport.batch_connected?([target, target2])).to be false + end + + it 'does not disconnect the shared NATS connection' do + allow(mock_rpc_client).to receive(:ping).and_return([]) + expect(mock_rpc_client).not_to receive(:disconnect) + transport.batch_connected?([target]) + end + end + + describe '#upload' do + it 'raises an unsupported operation error' do + expect { transport.upload(target, '/src', '/dst') }.to raise_error( + Bolt::Error, /does not yet support upload/ + ) + end + end + + describe '#download' do + it 'raises an unsupported operation error' do + expect { transport.download(target, '/src', '/dst') }.to raise_error( + Bolt::Error, /does not yet support download/ + ) + end + end + + describe '#batches' do + include_context 'choria multi-target' + + it 'groups all targets into one batch when they share the same collective' do + batches = transport.batches([target, target2]) + expect(batches.length).to eq(1) + expect(batches.first).to contain_exactly(target, target2) + end + + it 'groups targets into separate batches by collective' do + inventory.set_config(target, %w[choria collective], 'production') + inventory.set_config(target2, %w[choria collective], 'staging') + + batches = transport.batches([target, target2]) + expect(batches.length).to eq(2) + collectives = batches.map { |batch| batch.first.options['collective'] }.sort + expect(collectives).to eq(%w[production staging]) + end + + it 'uses default collective for targets 
without explicit collective' do + inventory.set_config(target, %w[choria collective], 'production') + + batches = transport.batches([target, target2]) + expect(batches.length).to eq(2) + collectives = batches.map { |batch| batch.first.options['collective'] } + expect(collectives).to contain_exactly(nil, 'production') + end + end + + describe '#batch_task' do + include_context 'choria task file stubs' + + describe 'single target' do + context 'default agent routing' do + it 'defaults to bolt_tasks when bolt_tasks agent is present' do + stub_agents(target, %w[rpcutil bolt_tasks shell]) + + expect(transport).to receive(:run_task_via_bolt_tasks).and_return( + [Bolt::Result.for_task(target, '{"result":"ok"}', '', 0, task_name, [])] + ) + expect(transport).not_to receive(:run_task_via_shell) + + transport.batch_task([target], task, {}) + end + + it 'returns error when bolt_tasks not available' do + stub_agents(target, %w[rpcutil shell]) + + expect(transport).not_to receive(:run_task_via_bolt_tasks) + expect(transport).not_to receive(:run_task_via_shell) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/bolt_tasks.*not available/) + end + + it 'returns error when neither agent is available' do + stub_agents(target, %w[rpcutil]) + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/bolt_tasks.*not available/) + end + end + + context 'with forced choria-agent' do + it 'uses only bolt_tasks when forced' do + stub_agents(target, %w[rpcutil bolt_tasks shell]) + inventory.set_config(target, %w[choria choria-agent], 'bolt_tasks') + + expect(transport).to receive(:run_task_via_bolt_tasks).and_return( + [Bolt::Result.for_task(target, '{}', '', 0, task_name, [])] + ) + expect(transport).not_to receive(:run_task_via_shell) + + transport.batch_task([target], task, {}) + end + + it 'uses only shell when forced' do + 
stub_agents(target, %w[rpcutil bolt_tasks shell]) + inventory.set_config(target, %w[choria choria-agent], 'shell') + + expect(transport).not_to receive(:run_task_via_bolt_tasks) + expect(transport).to receive(:run_task_via_shell).and_return( + [Bolt::Result.for_task(target, '{}', '', 0, task_name, [])] + ) + + transport.batch_task([target], task, {}) + end + + it 'returns error when forced agent is not available on target' do + stub_agents(target, %w[rpcutil bolt_tasks]) + inventory.set_config(target, %w[choria choria-agent], 'shell') + + result = transport.batch_task([target], task, {}).first + expect(result.ok?).to be false + expect(result.error_hash['msg']).to match(/shell.*not available/) + end + + it 'raises for invalid forced agent value' do + stub_agents(target, %w[rpcutil bolt_tasks shell invalid_agent]) + inventory.set_config(target, %w[choria choria-agent], 'invalid_agent') + + expect { + transport.batch_task([target], task, {}) + }.to raise_error(Bolt::ValidationError, /choria-agent must be/) + end + end + end + + describe 'multi-target' do + include_context 'choria multi-target' + + it 'downloads and runs task on multiple targets via bolt_tasks' do + stub_agents([target, target2], %w[rpcutil bolt_tasks]) + + allow(mock_rpc_client).to receive_messages( + download: [make_download_result(target), make_download_result(target2)], + run_no_wait: [make_task_run_result(target), make_task_run_result(target2)], + task_status: [make_task_status_result(target), make_task_status_result(target2)] + ) + + expect(transport).to receive(:run_task_via_bolt_tasks).and_call_original + expect(transport).not_to receive(:run_task_via_shell) + + events = [] + callback = proc { |event| events << event } + + results = transport.batch_task([target, target2], task, {}, {}, [], &callback) + expect(results.length).to eq(2) + results.each { |result| expect(result.value).to eq('result' => 'ok') } + + started_targets = events.select { |event| event[:type] == :node_start }.map { |event| 
event[:target] } + finished_targets = events.select { |event| event[:type] == :node_result }.map { |event| event[:result].target } + expect(started_targets).to contain_exactly(target, target2) + expect(finished_targets).to contain_exactly(target, target2) + end + + it 'handles partial failure: one target has no agents' do + # Only node1 responds to discovery and node2 is unreachable + stub_agents(target, %w[rpcutil bolt_tasks]) + + allow(mock_rpc_client).to receive_messages( + download: [make_download_result(target)], + run_no_wait: [make_task_run_result(target)], + task_status: [make_task_status_result(target)] + ) + + results = transport.batch_task([target, target2], task, {}) + expect(results.length).to eq(2) + + ok_results = results.select(&:ok?) + error_results = results.reject(&:ok?) + expect(ok_results.length).to eq(1) + expect(ok_results.first.target).to eq(target) + expect(error_results.length).to eq(1) + expect(error_results.first.target).to eq(target2) + expect(error_results.first.error_hash['msg']).to match(/No agent information.*did not respond to discovery/) + end + + it 'uses shell agent for all targets when choria-agent is shell' do + inventory.set_config(target, %w[choria choria-agent], 'shell') + inventory.set_config(target2, %w[choria choria-agent], 'shell') + stub_agents([target, target2], %w[rpcutil shell]) + + allow(mock_rpc_client).to receive_messages( + run: [make_shell_run_result(target), make_shell_run_result(target2)], + start: [make_shell_start_result(target, handle: 'h1'), make_shell_start_result(target2, handle: 'h2')], + list: [make_shell_list_result(target, 'h1'), make_shell_list_result(target2, 'h2')], + statuses: [ + make_shell_statuses_result(target, 'h1', stdout: '{"result":"ok"}'), + make_shell_statuses_result(target2, 'h2', stdout: '{"result":"ok"}') + ] + ) + + expect(transport).to receive(:run_task_via_shell).and_call_original + expect(transport).not_to receive(:run_task_via_bolt_tasks) + + events = [] + callback = proc { 
|event| events << event } + results = transport.batch_task([target, target2], task, {}, {}, [], &callback) + + expect(results.length).to eq(2) + results.each { |result| expect(result.value).to eq('result' => 'ok') } + + started_targets = events.select { |event| event[:type] == :node_start }.map { |event| event[:target] } + finished_targets = events.select { |event| event[:type] == :node_result }.map { |event| event[:result].target } + expect(started_targets).to contain_exactly(target, target2) + expect(finished_targets).to contain_exactly(target, target2) + end + end + end + + describe '#batch_task_with' do + include_context 'choria multi-target' + include_context 'choria task file stubs' + + it 'runs task with per-target arguments, batching discovery' do + stub_agents([target, target2], %w[rpcutil bolt_tasks]) + + # batch_task_with runs each target sequentially, so each call + # returns the next target's result. + allow(mock_rpc_client).to receive(:download) + .and_return([make_download_result(target)], [make_download_result(target2)]) + allow(mock_rpc_client).to receive(:run_no_wait) + .and_return([make_task_run_result(target)], [make_task_run_result(target2)]) + allow(mock_rpc_client).to receive(:task_status) + .and_return([make_task_status_result(target)], [make_task_status_result(target2)]) + + target_mapping = { + target => { 'param' => 'value1' }, + target2 => { 'param' => 'value2' } + } + + events = [] + callback = proc { |event| events << event } + + results = transport.batch_task_with([target, target2], task, target_mapping, {}, [], &callback) + expect(results.length).to eq(2) + results.each { |result| expect(result.value).to eq('result' => 'ok') } + + started_targets = events.select { |event| event[:type] == :node_start }.map { |event| event[:target] } + finished_targets = events.select { |event| event[:type] == :node_result }.map { |event| event[:result].target } + expect(started_targets).to contain_exactly(target, target2) + expect(finished_targets).to 
contain_exactly(target, target2) + end + end +end From d76edbbe3f634eb779803fdccca0ecfe4f0f7283 Mon Sep 17 00:00:00 2001 From: nmburgan <13688219+nmburgan@users.noreply.github.com> Date: Fri, 27 Mar 2026 00:53:50 +0000 Subject: [PATCH 3/8] Add Choria transport documentation - choria-transport.md: User guide covering configuration, usage, and examples - choria-transport-dev.md: Developer guide for architecture, data flow, and patterns - choria-transport-plan.md: Project plan with phased roadmap and progress tracking - choria-transport-testing.md: Test environment setup for manual verification --- docs/choria-transport-dev.md | 489 ++++++++++++++++++++ docs/choria-transport-plan.md | 395 +++++++++++++++++ docs/choria-transport-testing.md | 734 +++++++++++++++++++++++++++++++ docs/choria-transport.md | 378 ++++++++++++++++ 4 files changed, 1996 insertions(+) create mode 100644 docs/choria-transport-dev.md create mode 100644 docs/choria-transport-plan.md create mode 100644 docs/choria-transport-testing.md create mode 100644 docs/choria-transport.md diff --git a/docs/choria-transport-dev.md b/docs/choria-transport-dev.md new file mode 100644 index 000000000..5b7af7df7 --- /dev/null +++ b/docs/choria-transport-dev.md @@ -0,0 +1,489 @@ +# Choria Transport: Developer Guide + +This document orients developers to the Choria transport codebase. It covers +architecture, design patterns, threading, and data flow. For function-level +detail, see the code comments in each file. + +For user-facing documentation, see [choria-transport.md](choria-transport.md). +For the project roadmap, see [choria-transport-plan.md](choria-transport-plan.md). +For test environment setup, see [choria-transport-testing.md](choria-transport-testing.md). + +## Architecture Overview + +### Why Base, Not Simple + +`Bolt::Transport::Choria` extends `Transport::Base` directly, not `Simple`. 
+The `Simple` base class assumes persistent connections and a shell abstraction +(open connection, run command, close connection). Choria doesn't work that +way. It's fire-and-forget messaging with no persistent connections. Every RPC +call creates a fresh client, publishes a request, waits for replies, and is +done. + +### How Choria RPC Works + +``` +OpenBolt Controller NATS Broker Target Node + | | | + |-- RPC Request (JSON) --> | | + | (via MCollective::RPC::Client) |-- message --> | + | | [Choria Server] + | | [Agent receives] + | | [Agent executes] + | |<-- reply -- | + |<-- RPC Response (JSON) -- | | +``` + +Key points: +- **No persistent connections.** Each RPC call is a request/reply over NATS + pub/sub. +- **Multi-target by default.** One RPC call addresses all targets in a batch. + NATS pub/sub delivers it in parallel. +- **DDLs are mandatory.** `MCollective::RPC::Client.new(agent)` loads the + agent's DDL at construction time. DDLs for `rpcutil` and `bolt_tasks` ship + with the `choria-mcorpc-support` gem. The shell DDL is bundled with OpenBolt + at `lib/mcollective/agent/shell.ddl`. +- **MCollective::Config is a singleton.** `loadconfig` must only be called + once. We guard this with `@config_mutex` and check + `MCollective::Config.instance.configured`. + +The Ruby library that implements this RPC client is the `choria-mcorpc-support` +gem, which provides `MCollective::RPC::Client`. Despite the MCollective name +(historical legacy), this is the current Choria client library. + +### Key Agents + +- **rpcutil**: Built into every Choria node. Provides `ping` and + `agent_inventory` (what agents are installed). +- **bolt_tasks**: Ships with Puppet-enabled Choria setups. Downloads task files + from an OpenVox/Puppet Server and executes them. Can only run tasks, not arbitrary + commands. +- **shell**: A separate plugin + ([choria-plugins/shell-agent](https://github.com/choria-plugins/shell-agent)). 
+ Provides synchronous (`run`) and asynchronous (`start`/`list`/`statuses`/`kill`) + command execution. Version 1.2.0+ required for the batched `statuses` action. + +## File Layout + +| File | Responsibility | +|------|---------------| +| `lib/bolt/transport/choria.rb` | Main transport class. Entry point for OpenBolt's executor. Batching, task routing, connectivity checks. | +| `lib/bolt/transport/choria/client.rb` | MCollective client configuration, RPC client creation, `rpc_request` pipeline. One-time setup, NATS/TLS overrides. | +| `lib/bolt/transport/choria/agent_discovery.rb` | Agent detection and OS discovery with per-target caching. | +| `lib/bolt/transport/choria/helpers.rb` | Shared utilities: `prepare_targets`, result builders, polling, security validators. | +| `lib/bolt/transport/choria/shell.rb` | Shell agent execution: commands, scripts, shell-path tasks. Batched polling architecture. | +| `lib/bolt/transport/choria/bolt_tasks.rb` | bolt_tasks agent execution: download, async start, status polling, stdout unwrapping. | +| `lib/bolt/transport/choria/command_builders.rb` | Platform-aware command generation for POSIX and Windows (PowerShell). | +| `lib/bolt/config/transport/choria.rb` | Config class: option declarations, defaults, validation. | +| `lib/bolt/config/transport/options.rb` | Shared option schema definitions (Choria entries added alongside other transports). | +| `lib/mcollective/agent/shell.ddl` | Bundled shell agent DDL for client-side validation. | + +See code comments in each file for function-level detail. + +## Key Abstractions + +### Agent Discovery and Caching + +On first contact with a target, two RPC calls discover what's available: + +1. `rpcutil.agent_inventory` returns the list of installed agents with versions +2. `rpcutil.get_fact(fact: 'os.family')` returns the OS family for platform dispatch + +Results are cached in `@agent_cache`, a `Concurrent::Map` keyed by Choria +identity. 
Each entry stores `{ agents: [...], os: 'redhat'|'windows'|... }`. +The cache lives for the transport instance's lifetime. Non-responding targets +are not cached (intentional, to allow retry on transient failures). + +Agent versions are checked against `AGENT_MIN_VERSIONS` (e.g., shell >= 1.2.0). +Agents below the minimum are excluded from the cache and logged as warnings. + +### RPC Request Pipeline + +All RPC calls flow through `rpc_request` in `client.rb`. This method: + +1. Creates an `MCollective::RPC::Client` via `create_rpc_client` +2. Yields it to the caller's block +3. Splits results into successes and failures via `index_results_by_sender` +4. Handles error classification (see deep dive below) + +The method is serialized by `@rpc_mutex` (see Threading Model). + +### Batching Model + +`batches(targets)` groups targets by their Choria collective. Each group runs +in its own OpenBolt batch thread with a single RPC client scope. + +The `prepare_targets` helper combines the common setup pattern used by every +batch method: + +1. `configure_client(target)` for one-time MCollective setup +2. `discover_agents(targets)` for agent detection +3. Partition targets into capable and incapable based on agent availability +4. Emit error results for incapable targets immediately +5. Return `[capable_targets, error_results]` + +**Same-path trick for tmpdir:** All targets in a batch get the same tmpdir +path (e.g., `/tmp/bolt-choria-abc`). Since they're different machines, this +is fine, and it means every infrastructure command (mkdir, chmod, upload) can +be batched with identical arguments. + +## Threading Model + +OpenBolt runs batches in parallel threads sharing the same transport instance. The +transport handles this with: + +- **`@config_mutex`**: Protects `configure_client` from concurrent access. + Uses double-checked locking: the `@client_configured` flag is checked before + and inside the lock for fast-path efficiency. 
+ +- **`@rpc_mutex`**: Serializes all RPC calls. This is necessary because + MCollective's NATS connector is a PluginManager singleton with a single + receive queue, and `MCollective::Client` uses a non-atomic + `@@request_sequence` class variable for reply-to NATS subjects. Concurrent + RPC calls cause response misrouting: threads can get duplicate sequence + numbers (reply subject collision) and pop each other's messages from the + shared queue (message loss). This was confirmed to break with just 2 + concurrent clients. The mutex ensures only one RPC call is in flight at a + time while allowing non-RPC work (file I/O, result processing, cache + lookups) to run in parallel across batch threads. + +- **`@agent_cache`**: A `Concurrent::Map` (thread-safe without GIL + dependency). Multiple batch threads write to it concurrently when targets + span multiple collectives. + +- **Per-target collective read**: `create_rpc_client` reads the collective + from target options, not from shared transport state. + +## Data Flow + +### run_command on N targets + +``` +batch_command(N targets, "hostname") + prepare_targets(targets) # configure_client + 2 RPC calls: agent_inventory + get_fact + shell_start(capable, cmd) # 1 RPC call: shell.start -> N handles + wait_for_shell_results(pending, 60s) + [loop every 1 second]: + shell_list(remaining) # 1 RPC call: shell.list -> which are done? + shell_statuses(completed) # 1 RPC call: shell.statuses -> output for batch + kill_timed_out_processes(...) # sequential shell.kill calls (only on timeout) +``` + +Best-case total: 2 discovery + 1 start + 1 list + 1 statuses = **5 RPC calls**. + +### run_task via bolt_tasks on N targets + +``` +batch_task(N targets, task, args) + prepare_targets(targets) # configure_client + 2 RPC calls + run_task_via_bolt_tasks(capable, ...) 
+ bolt_tasks.download(file_specs) # 1 RPC call (nodes download from OpenVox/Puppet Server) + bolt_tasks.run_no_wait(task, args) # 1 RPC call -> 1 shared task_id + poll_task_status(targets, task_id) + [loop every 1 second]: + bolt_tasks.task_status(id) # 1 RPC call per round +``` + +Best-case total: 2 discovery + 1 download + 1 run + 1 status = **5 RPC calls**. +The bolt_tasks path is inherently batched because task_status uses a shared task_id. + +### run_task via shell on N targets + +``` +batch_task(N targets, task, args) + prepare_targets(targets) # configure_client + 2 RPC calls + run_task_via_shell(capable, ...) + shell_run: mkdir # 1 RPC call + upload_file_content: task # 1 RPC call per file + shell_run: chmod # 1 RPC call + shell_start(capable, cmd) # 1 RPC call + wait_for_shell_results(pending, 300s) + [loop every 1 second]: + shell_list # 1 RPC call per round + shell_statuses # 1 RPC call per round with completions + cleanup_tmpdir # 1 RPC call +``` + +Best-case total (single file task): 2 discovery + 1 mkdir + 1 upload + 1 chmod +\+ 1 start + 1 list + 1 statuses + 1 cleanup = **9 RPC calls**. + +## Platform Support + +OS is detected during agent discovery via the `os.family` fact. +`command_builders.rb` contains all platform-aware logic: + +- **POSIX**: `/usr/bin/env` for env vars, `Shellwords.shellescape` for + escaping, `printf '%s'` for stdin piping, `base64 -d` for file uploads, + `mkdir -m 700` for temp dirs +- **Windows**: PowerShell `$env:` for env vars, single-quote doubling for + escaping, here-strings for stdin, `[Convert]::FromBase64String` for uploads, + `New-Item` for temp dirs, `powershell.exe -EncodedCommand` for complex scripts + +`select_implementation` in `choria.rb` picks `.ps1` task files for Windows +targets and `.sh` for POSIX, supporting mixed-platform batches. + +**Known gap (Phase 5):** `batch_script` does not handle `options[:pwsh_params]` +(PowerShell named parameter splatting for `.ps1` scripts). 
This option is only +reachable from the `run_script` plan function and YAML plan `script` steps, +which are not yet supported. When plan support is added, `batch_script` will +need a branch that builds a PowerShell splatting command (similar to +`Bolt::Shell::Powershell::Snippets.ps_task`) instead of passing positional +arguments. + +## Key Function Deep Dives + +### bolt_tasks stdout encoding chain (`unwrap_bolt_tasks_stdout`) + +The bolt_tasks agent has a multi-layer encoding chain that requires careful +unwrapping: + +1. **Task runs**, produces raw stdout +2. **`create_task_stdout`** (in `tasks_support.rb`) wraps it: + - If valid JSON hash: returns the hash object + - If valid JSON but not hash: wraps in `{"_output": raw_string}` + - If not valid JSON: wraps in `{"_output": raw_string}` + - If wrapper error: returns `{"_error": {...}}.to_json` (a JSON **string**) +3. **`reply_task_status`** calls `.to_json` on that result: + - Normal case: hash.to_json = proper JSON string + - Wrapper error case: string.to_json = **double-encoded** JSON string +4. **OpenBolt receives** `result[:data][:stdout]` as a JSON string + +The `unwrap_bolt_tasks_stdout` method handles this: +- Parses the outer JSON layer +- If the result has only `_output` and/or `_error` keys (the wrapper's + signature), extracts `_output` as the real stdout +- If `_output` is itself a JSON string (double-encoding from error case), + unwraps one more layer +- Passes the unwrapped stdout to `Bolt::Result.for_task`, which does its own + JSON parsing + +Edge cases handled: +- Non-JSON stdout: returned as-is +- Task that legitimately returns `_output` keys alongside other keys: detected + by checking for extra keys beyond the wrapper's signature +- Zero exit code with `_error`: the task itself reported the error, don't unwrap + +### wait_for_shell_results / shell_list / shell_statuses + +The shell polling loop is the heart of asynchronous command execution: + +**`wait_for_shell_results`** is the outer loop. 
It: +1. Sleeps `POLL_INTERVAL` (1 second) +2. Calls `shell_list` for one round of polling +3. Calls `shell_statuses` for any targets that completed this round +4. Tracks consecutive RPC failures (3 in a row = fail all remaining targets) +5. On timeout, calls `kill_timed_out_processes` to clean up remote processes + +**`shell_list`** does a single round: +1. Sends one batched `shell.list` RPC call to all remaining targets +2. For each target, matches its handle against the returned job list +3. Returns `[done_hash, rpc_failed_boolean]` where done contains completed + targets (status in `SHELL_DONE_STATUSES`: `stopped` or `failed`) + +**`shell_statuses`** retrieves results for completed targets: +1. Sends one batched `shell.statuses` RPC call with all completed handles +2. Returns `{ target => output_hash }` with stdout, stderr, exitcode + +Output is fetched immediately when targets complete, not deferred to after the +loop. This means completed targets have their results retrieved promptly. + +### rpc_request + +The central RPC helper through which every MCollective call flows. Key +behaviors: + +**Serialization**: Acquires `@rpc_mutex` before any RPC call. See Threading +Model for why this is necessary. + +**Error classification**: Splits results into three categories: +- **Success** (statuscode 0 or 1): The agent responded. Statuscode 1 means + "application error but action completed" (the task ran but had issues). +- **Agent error** (statuscode >= 2): The agent itself had a problem. Returned + as failures with `bolt/choria-agent-error`. +- **No response**: Target didn't reply. Returned as failures with + `bolt/choria-no-response`. + +**Absorption of StandardError**: If the RPC call raises a `StandardError` +(but not a `Bolt::Error`, which is re-raised), the error is absorbed and all +targets are returned as failures with `bolt/choria-rpc-failed`. This prevents +a single NATS hiccup from crashing the entire batch. 
+ +### prepare_targets + +The common setup pattern used by every batch method. Combines four steps into +one call: + +1. `configure_client(target)` for one-time MCollective setup +2. `discover_agents(targets)` for agent detection (cached after first call) +3. Partition targets by agent availability using `has_agent?` +4. Emit error results (via callback) for targets missing the required agent + +Returns `[capable_targets, error_results]`. The caller proceeds with only +capable targets while error results are already emitted to the user. + +### run_task_via_shell input_method handling + +The shell path for task execution must handle three input methods: + +- **`environment`**: Arguments become `PT_`-prefixed environment variables + (via `envify_params`). Injected with `/usr/bin/env` on POSIX, `$env:` on + Windows. Non-string values are JSON-serialized. +- **`stdin`**: Arguments are JSON-serialized and piped via `printf '%s'` on + POSIX, PowerShell here-strings on Windows. +- **`both`**: Both mechanisms are used simultaneously. + +**Why `printf '%s'` instead of `echo`**: `echo` interprets backslash escape +sequences on some platforms (e.g., `\n` becomes a newline). `printf '%s'` +passes the string through literally. This matters when task arguments contain +backslashes. + +## Architectural Decisions + +### Async execution to avoid DDL timeouts + +Both agents use asynchronous execution patterns (`bolt_tasks.run_no_wait`, +`shell.start`) instead of synchronous calls. The bolt_tasks DDL has a +hardcoded 60-second timeout on `run_and_wait`. The shell agent's `run` action +has a 180-second DDL timeout. By starting asynchronously and polling, we avoid +these limits entirely. + +Synchronous `shell.run` is used for infrastructure operations (mkdir, chmod, +upload) that complete in sub-second times, where the DDL timeout is not a +concern. + +### Fixed poll interval + +Polling uses a fixed 1-second interval (`POLL_INTERVAL`). 
Exponential backoff +was considered but adds complexity without clear benefit. Each poll round is a +single RPC call regardless of target count, so the broker load is constant. +One second provides reasonable responsiveness without excessive polling. + +### Deterministic agent selection + +Agent selection for `run_task` is explicit via the `choria-agent` config +option (default `bolt_tasks`). There is no automatic fallback between agents. +If the selected agent is not available on a target, that target gets a clear +error. This is simpler and more predictable than a try-and-fallback approach. + +### Batched shell polling + +Shell polling uses `shell.list` + `shell.statuses` instead of per-handle +`shell.status` calls. This reduces RPC overhead from O(N) calls per round to O(1) +calls per round, making it feasible at scale. This is why version 1.2.0 +of the shell agent is required, since this is the first version to include +`shell.statuses`. + +### Kill on timeout + +When `wait_for_shell_results` times out, it kills background processes via +`shell.kill` to prevent orphans on target nodes. This does NOT apply to +bolt_tasks (which has no kill mechanism). The bolt_tasks agent eventually +times out its own subprocess based on the DDL timeout. + +### Consecutive failure tracking + +Both polling loops (`wait_for_shell_results` and `poll_task_status`) track +consecutive RPC failures. Three failures in a row triggers a fail-all for +remaining targets with `bolt/choria-poll-failed`. This prevents infinite +retry loops when the NATS broker goes down mid-operation. + +## Scalability + +This code was written to ensure scalability when running across thousands +and thousands of nodes, potentially split across a handful of collectives. + +### Batch-only architecture + +All operations use batch methods. Single-target `run_task`, `run_command`, +`run_script`, and `connected?` delegate to their batch counterparts. This +ensures the same code path handles 1 target and 10,000 targets.
+ +### O(N) identity filter setup + +MCollective's `identity_filter` method uses array set union (`|`) per call, +making N calls O(N^2). For 10,000 targets, this is millions of operations. +The transport sets the identity filter array directly in O(N) and +pre-populates `@discovered_agents` with the same list to bypass broadcast +discovery. + +### Batched polling + +Shell polling uses one `shell.list` + one `shell.statuses` call per poll +round, regardless of target count. bolt_tasks polling uses one `task_status` +call per round with a shared task_id. + +### OpenBolt concurrency vs. Choria concurrency + +OpenBolt's `--concurrency` setting controls parallel target processing in +SSH/WinRM transports. This does not apply to Choria. The Choria transport +handles its own concurrency: `batches()` groups targets by collective +(typically 1 group), and each batch uses MCollective's native multi-node RPC. + +### Known scaling limitations + +- **Shell agent at extreme scale (5,000+ targets):** Even with batched + polling, 5,000 shell targets create significant polling overhead. The + bolt_tasks path is preferred for large deployments. +- **Agent discovery for offline nodes:** Non-responding targets are not cached + (to allow retry), so each operation re-queries offline nodes. At 10,000 + targets with many offline, this adds discovery overhead. +- **Per-target options in batches:** Batch methods use `targets.first.options` + for shared settings (timeout, tmpdir, cleanup). Per-target option differences + within a batch are silently ignored. This is inherent to the batch execution + model. +- **Base64 upload size:** The `upload_file_content` method sends files + in a single NATS message. After base64 encoding (~33% expansion) and RPC + overhead, the max raw file size is roughly 700-750KB with the default 1MB + message limit. This will be addressed in Phase 4 with chunked file transfer. 
+ +## Testing + +### Test file layout + +| File | Coverage | +|------|---------| +| `spec/unit/transport/choria_spec.rb` | Main transport: batching, task routing, connectivity | +| `spec/unit/transport/choria/client_spec.rb` | Client config, RPC client creation | +| `spec/unit/transport/choria/agent_discovery_spec.rb` | Agent detection, caching, version checks | +| `spec/unit/transport/choria/helpers_spec.rb` | RPC pipeline, prepare_targets, security validators | +| `spec/unit/transport/choria/shell_spec.rb` | Shell execution, polling, uploads, cleanup | +| `spec/unit/transport/choria/bolt_tasks_spec.rb` | bolt_tasks execution, polling, stdout unwrapping | +| `spec/unit/transport/choria/command_builders_spec.rb` | POSIX/Windows command generation | +| `spec/unit/config/transport/choria_spec.rb` | Config options, defaults, validation | +| `spec/lib/bolt_spec/choria.rb` | Shared test helpers, config file writer, stub helpers | + +### Mocking pattern + +Tests use the real `choria-mcorpc-support` gem for `MCollective::Config`, +`MCollective::Util`, and `MCollective::RPC::Result`. The only MCollective +stub is `RPC::Client.new` (to avoid NATS TCP connections). A fresh Tempfile +config is written per test via `write_choria_config`. The standard pattern: + +```ruby +# In bolt_spec/choria.rb shared context: +let(:mock_rpc_client) { double('MCollective::RPC::Client') } + +before(:each) do + @choria_config_file = write_choria_config + MCollective::Config.instance.set_config_defaults(@choria_config_file.path) + allow(MCollective::RPC::Client).to receive(:new).and_return(mock_rpc_client) +end +``` + +A plain `double` is used (not `instance_double`) because the real RPC client +dispatches agent actions via `method_missing`. 
+ +### Running tests + +```bash +bundle exec rspec spec/unit/transport/choria/ +bundle exec rspec spec/unit/config/transport/choria_spec.rb +``` + +For the full suite: + +```bash +bundle exec rspec spec/unit/transport/choria/ spec/unit/config/transport/choria_spec.rb +``` + +For manual/integration testing, see +[choria-transport-testing.md](choria-transport-testing.md). diff --git a/docs/choria-transport-plan.md b/docs/choria-transport-plan.md new file mode 100644 index 000000000..6b09a3528 --- /dev/null +++ b/docs/choria-transport-plan.md @@ -0,0 +1,395 @@ +# Choria Transport: Project Plan + +This document describes the project plan and roadmap for the Choria transport +in OpenBolt. It covers the phased implementation approach, current progress, +and future work. + +For user-facing documentation, see [choria-transport.md](choria-transport.md). +For developer documentation, see [choria-transport-dev.md](choria-transport-dev.md). +For test environment setup, see [choria-transport-testing.md](choria-transport-testing.md). + +--- + +## Overview + +The Choria transport lets OpenBolt communicate with nodes via Choria's NATS +pub/sub messaging infrastructure instead of SSH/WinRM. It uses the +`choria-mcorpc-support` gem as the client library, sending RPC requests to +agents running on target nodes. 
+ +The transport is implemented in phases, each adding capabilities based on +which Choria agents are available on the remote nodes: + +| Phase | Agents Required | Capabilities Added | +|-------|----------------|-------------------| +| Phase 1 | bolt_tasks (ships with Choria+Puppet) | `run_task` (OpenVox/Puppet Server tasks only) | +| Phase 2 | shell >= 1.2.0 (separate install) | `run_command`, `run_script`, `run_task` (local tasks) | +| Phase 3 | bolt_tasks | [foreman_openbolt](https://github.com/overlookinfra/foreman_openbolt) and [smart_proxy_openbolt](https://github.com/overlookinfra/smart_proxy_openbolt) Choria transport support (bolt_tasks only) | +| Phase 4 | file-transfer (new, to be written) | `upload`, `download` (any size, chunked) | +| Phase 5 | (all above) | Full plan support including apply blocks | + +--- + +## Architecture + +### How Choria RPC Works + +``` +OpenBolt Controller NATS Broker Target Node + | | | + |-- RPC Request (JSON) --> | | + | (via MCollective::RPC::Client) |-- message --> | + | | [Choria Server] + | | [Agent receives] + | | [Agent executes] + | |<-- reply -- | + |<-- RPC Response (JSON) -- | | +``` + +Key points: +- **No persistent connections.** Each RPC call is a request/reply over NATS + pub/sub. +- **Multi-target by default.** One RPC call addresses all targets in a batch. + NATS pub/sub delivers it in parallel. +- **DDLs are mandatory.** The MCollective RPC client loads an agent's DDL at + construction time. DDLs for `rpcutil` and `bolt_tasks` ship with the gem. + The shell DDL is bundled with OpenBolt. + +### Transport Design + +The transport extends `Transport::Base` directly (not `Simple`), because +Choria's pub/sub model doesn't fit the persistent connection/shell abstraction +that `Simple` assumes. Each operation: + +1. Configures the MCollective client (one-time setup) +2. Discovers agents on target nodes (cached after first contact) +3. Partitions targets by agent availability, emitting errors for incapable ones +4. 
Performs the operation on capable targets +5. Returns per-target results + +### Agent Discovery + +On first contact with a target, two RPC calls discover what's available: + +1. `rpcutil.agent_inventory` returns the list of installed agents with versions +2. `rpcutil.get_fact(os.family)` returns the OS family for platform dispatch + +Results are cached per target for the transport instance's lifetime. + +### Thread Safety + +OpenBolt runs batches in parallel threads sharing the same transport instance: + +- Client configuration is protected by a mutex (one-time setup) +- All RPC calls are serialized to prevent concurrent MCollective usage +- Agent cache uses a thread-safe concurrent map + +For detailed threading analysis, see +[choria-transport-dev.md](choria-transport-dev.md#threading-model). + +--- + +## Phase 1: bolt_tasks Agent Support + +**Status: Complete** + +Phase 1 delivers task execution via the bolt_tasks agent, which downloads +task files from an OpenVox/Puppet Server and executes them on target nodes. + +What shipped: +- `run_task` via bolt_tasks agent with async execution and polling +- `run_command`, `run_script` return clear per-target errors when the shell + agent is not available (rather than crashing) +- `upload`, `download` return "not yet supported" errors +- Connectivity checking via `rpcutil.ping` +- Agent detection with per-target caching +- Client configuration with NATS, TLS, and collective overrides +- Config class with validation for all transport options +- Transport and config registration in OpenBolt's executor and config systems + +--- + +## Phase 2: Shell Agent Support + +**Status: Complete** + +Phase 2 adds command and script execution via the shell agent, plus an +alternative task execution path that uploads task files directly instead of +downloading from an OpenVox/Puppet Server. 
+ +What shipped: +- `run_command` with async execution, timeout, and process kill on timeout +- `run_script` with remote tmpdir creation, script upload via base64, and + cleanup +- `run_task` via shell agent with support for all input methods (environment, + stdin, both) +- Deterministic agent selection via `choria-agent` config and `--choria-agent` + CLI flag (no automatic fallback between agents) +- Batched shell polling via `shell.list` + `shell.statuses` for scalability +- Platform-aware command builders for POSIX and Windows (PowerShell) +- Interpreter support via the `interpreters` config option + +### Shell Agent Actions Used + +| Action | Usage | Response | +|--------|-------|----------| +| `run` | Synchronous execution (infrastructure ops) | `stdout`, `stderr`, `exitcode` | +| `start` | Start async command | `handle` (process identifier) | +| `list` | List all managed processes on a node | Array of `{ id, ... }` | +| `statuses` | Batch status of multiple handles | Per-handle `stdout`, `stderr`, `exitcode`, `status` | +| `kill` | Kill background process | (acknowledgement) | + +--- + +## Phase 3: Foreman OpenBolt Support + +**Status: Not started** + +### Goal + +Update [foreman_openbolt](https://github.com/overlookinfra/foreman_openbolt) and +[smart_proxy_openbolt](https://github.com/overlookinfra/smart_proxy_openbolt) +to support the Choria transport. This integration will only support the +bolt_tasks agent path (task execution via OpenVox/Puppet Server file downloads), not +the shell agent. Eventually, when plan support is introduced to these components, +and Phase 5 of this project is complete, the Foreman integration will have full +OpenBolt support. 
+ +### Scope + +- Foreman OpenBolt will be able to run tasks on Choria-managed nodes via + bolt_tasks +- No `run_command`, `run_script`, or shell agent support +- Configuration will be limited to the bolt_tasks-compatible options + +--- + +## Phase 4: File Transfer Agent + +**Status: Not started** + +### Goal + +Implement `upload` and `download` support via a new `file-transfer` Choria +agent that efficiently sends and receives large files, chunked to stay under +the NATS message size limit. + +### Background: NATS and Choria Constraints + +NATS itself is binary-safe -- payloads are opaque byte arrays and the server +never inspects them. However, the Choria RPC protocol serializes all messages +as nested JSON. The DDL type system supports string, integer, float, number, +boolean, array, and hash -- no binary type. This means binary file data cannot +be sent as-is through Choria RPC action inputs; it must be encoded (typically +base64) to survive JSON serialization. + +NATS has a configurable max message size (default 1MB, max 64MB). The Choria +broker inherits this as `plugin.choria.network.client_max_payload`. + +NATS JetStream (called "Choria Streams" in Choria) is available but not +enabled by default -- it requires `plugin.choria.network.stream.store` to be +set. When enabled, Choria uses JetStream for event streams, KV store, and +leader elections. JetStream also provides an Object Store feature for chunked +binary storage, though Choria does not use it today. + +### Approach Options + +The final approach will be chosen when this phase begins. Three options are +documented here with their tradeoffs. + +#### Option A: Compressed + Base64 via Choria RPC + +The simplest approach. File data is optionally gzip-compressed, then base64-encoded, and +sent as string action inputs through standard Choria RPC calls. The agent +decompresses and writes to disk. 
+ +This works entirely within the existing Choria RPC framework using a Ruby +agent, which aligns with the OpenBolt community's expertise. + +**Pros:** +- Works with any Choria deployment (no JetStream, no special config) +- Ruby agent, consistent with existing agent ecosystem +- Simple protocol, uses standard RPC request/reply + +**Cons:** +- Base64 encoding adds ~33% size overhead (on the compressed data) +- Each chunk is a full RPC round-trip with JSON serialization overhead + (four nested JSON layers plus base64 at the transport level) +- Chunk size limited by NATS max message size minus RPC overhead + +**Chunk size calculation:** +``` +max_chunk_bytes = (message_size_limit - overhead_estimate) * 3 / 4 +``` + +With the default 1MB message limit: +- RPC overhead (headers, JSON keys, etc.): ~4,096 bytes +- Available for base64 payload: 1,044,480 bytes +- Raw data per chunk (before compression): 1,044,480 * 3 / 4 = 783,360 + bytes (~765 KB) +- Conservative default: 512 KB chunks (leaves generous headroom) + +Compression reduces actual wire size significantly for compressible files +(text, configs, scripts, catalogs). For already-compressed files (zip, tar.gz, +images, binaries), compression is skipped to avoid wasting CPU. + +#### Option B: Hybrid RPC + Direct NATS Binary Channel + +Uses Choria RPC for coordination (setup, teardown, status) and a separate +direct NATS connection for the binary data transfer. This avoids JSON +serialization and base64 encoding on the data path entirely. + +The file-transfer agent would need to be written in Go, since Go agents have +clean access to the raw NATS connection via `Instance.Connector().Nats()`. +The Ruby side (OpenBolt client) would open a second `NATS::IO::Client` +instance for the data channel, since the existing `NatsWrapper` feeds all +subscriptions into a single shared receive queue that would conflict with +RPC traffic. 
+ +**Pros:** +- Zero encoding overhead on the data path (raw compressed bytes over NATS) +- Efficient use of the full NATS message size for data +- Binary-safe without any serialization workarounds + +**Cons:** +- Requires a Go agent (different language from the rest of the ecosystem) +- More complex protocol (RPC for control plane, raw NATS for data plane) +- OpenBolt needs a second NATS connection with its own TLS configuration +- Must coordinate subject naming and cleanup between the two channels + +#### Option C: JetStream Object Store + +Both sides use the NATS JetStream Object Store for chunked binary transfer. +Object Store handles chunking (default 128KB, configurable), integrity +verification (SHA-256), and reassembly natively. RPC calls coordinate the +transfer (initiate, confirm completion, clean up). + +Like Option B, the agent would need to be Go since Ruby's NATS client does +not expose JetStream Object Store APIs. + +**Pros:** +- Native chunking with built-in SHA-256 integrity checking +- No file size limit (constrained only by disk space) +- Handles all chunk management, retries, and verification internally +- Well-supported by NATS -- this is the recommended approach for large + payloads in NATS documentation + +**Cons:** +- Requires JetStream enabled on the Choria broker (not the default) +- Requires a Go agent +- Adds a deployment prerequisite that other phases do not have +- Object Store is not used anywhere else in Choria today + +### Common Design (All Approaches) + +Regardless of approach, the file-transfer agent needs these filesystem +operations: + +| Action | Description | Inputs | Outputs | +|--------|-------------|--------|---------| +| `mkdir` | Create directory | `path`, `mode` | `created` | +| `stat` | Get file/dir metadata | `path` | `exists`, `size`, `type`, `mode`, `mtime` | +| `delete` | Remove file/directory | `path`, `recursive` | `deleted` | + +The chunk transfer actions will vary by approach but the transfer protocol +follows 
the same pattern: + +**Upload (OpenBolt to remote node):** +1. `stat` the destination to check if it exists (optional, for overwrite + semantics) +2. `mkdir` parent directories if needed +3. Read local file, compress (if beneficial), and send in chunks +4. Agent writes chunks to a temp file, renames on completion +5. Return result with bytes transferred + +**Download (remote node to OpenBolt):** +1. `stat` the remote file to get size +2. Request chunks until the full file is received +3. Decompress (if compressed) and write each chunk to local file +4. Return result with bytes transferred + +**Directory transfers:** +- For upload: walk the local directory tree, transfer each file +- For download: use `stat` to detect directory, then walk remote tree + +### Config Changes + +New options: +- `chunk-size` (Integer, default: 524288 = 512KB) - Size of file transfer + chunks in bytes + +### Testing Strategy + +1. Unit tests mocking the file-transfer agent responses +2. Chunk boundary conditions (exactly 1 chunk, multiple chunks, empty file) +3. Compression behavior (compressible files, already-compressed files, empty + files) +4. Directory traversal (nested dirs, empty dirs, symlinks) +5. Resume/retry on chunk failure +6. 
Integration tests against a real Choria cluster with the agent + +--- + +## Phase 5: Full Plan Support + +**Status: Not started** + +### Goal + +All OpenBolt plan features work with the Choria transport, including: +- Plan execution with multiple steps +- Apply blocks (Puppet code application) +- Plan functions (run_command, run_script, run_task, upload_file, download_file) +- Error handling (catch_errors, run_plan) +- Parallel execution within plans +- Variables, iterators, conditionals + +### What Already Works + +With Phases 1+2+4, the following plan functions should work automatically +because they delegate to the transport's public methods: +- `run_command()` via transport.run_command +- `run_script()` via transport.run_script +- `run_task()` via transport.run_task +- `upload_file()` via transport.upload +- `download_file()` via transport.download + +### What Needs Work: Apply Blocks + +Apply blocks compile a Puppet catalog on the controller and apply it on the +target. This requires: + +1. **Puppet library on the target.** The `puppet_library` plugin hook + installs Puppet on the target if needed. +2. **Catalog application.** The compiled catalog is sent to the target and + applied via `libexec/apply_catalog.rb`. + +The apply mechanism works by: +1. Compiling the catalog locally (controller-side, OpenBolt handles this) +2. Uploading the catalog, plugins, and the apply helper script to the target +3. Running the apply helper script on the target + +With Phases 2+4, steps 2 and 3 should work: +- Phase 4's `upload` sends the catalog and plugin files +- Phase 2's `run_command`/`run_script` executes the apply helper + +### Investigation Needed + +- **Apply prep:** How does `Bolt::ApplyPrep` work? Does it use specific + transport methods, or does it go through the standard + run_task/upload/run_command path? +- **Plugin sync:** How are Puppet plugins synced to the target? Is this a + separate transport call or part of the apply mechanism? 
+- **Hiera data:** How is Hiera data sent to the target for apply? +- **puppet_library hook:** How does this interact with the Choria transport? + The default bootstrap hook installs Puppet via a task, which should work + with Phase 2. But we are also probably assuming all nodes have OpenVox/Puppet installed already. + +### Testing Strategy + +1. Simple apply block (single resource) +2. Apply with Hiera data +3. Apply with custom modules +4. Plan with mixed steps (run_command + run_task + apply) +5. Error handling in plans (catch_errors around Choria operations) +6. Parallel execution within plans diff --git a/docs/choria-transport-testing.md b/docs/choria-transport-testing.md new file mode 100644 index 000000000..365d45755 --- /dev/null +++ b/docs/choria-transport-testing.md @@ -0,0 +1,734 @@ +# Choria Transport: Test Environment Setup + +A guide for setting up a Choria test environment alongside an existing OpenVox +Puppet installation, and verifying all Choria transport functionality in OpenBolt. + +For configuration reference, see [choria-transport.md](choria-transport.md). +For the project roadmap, see [choria-transport-plan.md](choria-transport-plan.md). +For developer documentation, see [choria-transport-dev.md](choria-transport-dev.md). + +## Prerequisites + +- A primary server (also the OpenBolt controller) with OpenBolt installed +- Two or more remote nodes running Choria server +- The remote nodes start with the default agent set: `choria_util`, `discovery`, + `filemgr`, `package`, `puppet`, `rpcutil`, `scout`, `service` +- Neither `bolt_tasks` nor `shell` agents are installed initially +- A working OpenVox Puppet installation (for TLS certificates) + +## Installing development changes on the primary + +The packaged OpenBolt lives at `/opt/puppetlabs/bolt/`. The gem's lib directory +is at: + +``` +/opt/puppetlabs/bolt/lib/ruby/gems/<ruby-version>/gems/openbolt-<openbolt-version>/lib +``` + +To test changes without rebuilding the package, just +overwrite the lib directory.
If it gets messed up, reinstall the package. + +### Copy the lib directory to the primary + +```bash +DEV=/path/to/openbolt # your local checkout +PRIMARY=user@primary.example.com +BOLT_GEM=/opt/puppetlabs/bolt/lib/ruby/gems/<ruby-version>/gems/openbolt-<openbolt-version>/lib + +rsync -av $DEV/lib/ $PRIMARY:/tmp/openbolt-lib/ +ssh $PRIMARY "sudo rsync -av /tmp/openbolt-lib/ $BOLT_GEM/" +``` + +### Install the choria-mcorpc-support gem + +The Choria transport depends on `choria-mcorpc-support ~> 2.26`. This gem is +included in OpenBolt 5.4.0. But when testing against an older version, install it into the packaged +OpenBolt's gem environment: + +```bash +sudo /opt/puppetlabs/bolt/bin/gem install choria-mcorpc-support --version '~> 2.26' --no-document +``` + +Verify it's loadable: + +```bash +/opt/puppetlabs/bolt/bin/ruby -e "require 'mcollective'; puts 'choria-mcorpc-support loaded OK'" +``` + +### Verify the transport loads + +```bash +/opt/puppetlabs/bolt/bin/ruby -e " + require 'bolt/transport/choria' + require 'bolt/config/transport/choria' + puts 'Choria transport loaded OK' + puts 'Config options: ' + Bolt::Config::Transport::Choria::OPTIONS.join(', ') +" +``` + +## Choria client configuration + +The OpenBolt controller needs a Choria client config to connect to the NATS broker. +The MCollective client library refuses to run as root, so OpenBolt must be run as a +regular user. + +MCollective looks for client config files in this order (first readable wins): + +1. `~/.choriarc` +2. `~/.mcollective` +3. `/etc/choria/client.conf` +4. `/etc/puppetlabs/mcollective/client.cfg` + +### Generate a client certificate + +The user running OpenBolt needs a certificate signed by the Puppet CA. For non-root +users, MCollective resolves the certname as `<username>.mcollective` by default.
+Generate a matching certificate on the primary server: + +```bash +sudo puppetserver ca generate --certname <username>.mcollective +``` + +Copy the cert, key, and CA to the user's home directory: + +```bash +mkdir -p ~/.puppetlabs/etc/puppet/ssl/certs ~/.puppetlabs/etc/puppet/ssl/private_keys +sudo cp /etc/puppetlabs/puppet/ssl/certs/<username>.mcollective.pem \ + ~/.puppetlabs/etc/puppet/ssl/certs/ +sudo cp /etc/puppetlabs/puppet/ssl/private_keys/<username>.mcollective.pem \ + ~/.puppetlabs/etc/puppet/ssl/private_keys/ +sudo cp /etc/puppetlabs/puppet/ssl/certs/ca.pem \ + ~/.puppetlabs/etc/puppet/ssl/certs/ +sudo chown -R $(whoami) ~/.puppetlabs +chmod 600 ~/.puppetlabs/etc/puppet/ssl/private_keys/*.pem +``` + +### Set up `~/.choriarc` + +Create `~/.choriarc` with the NATS broker address and cert paths. Replace +`primary.example.com` with your primary server's FQDN and `<username>` with +your OS username throughout: + +```ini +collectives = mcollective +main_collective = mcollective +connector = nats +identity = <username>.mcollective +libdir = /opt/puppetlabs/mcollective/plugins +logger_type = console +loglevel = warn +securityprovider = choria +plugin.choria.middleware_hosts = nats://primary.example.com:4222 +plugin.security.provider = file +plugin.security.file.certificate = ~/.puppetlabs/etc/puppet/ssl/certs/<username>.mcollective.pem +plugin.security.file.key = ~/.puppetlabs/etc/puppet/ssl/private_keys/<username>.mcollective.pem +plugin.security.file.ca = ~/.puppetlabs/etc/puppet/ssl/certs/ca.pem +``` + +### Verify Choria connectivity + +```bash +choria ping +choria rpc rpcutil agent_inventory +``` + +### Running OpenBolt + +The packaged OpenBolt is at `/opt/puppetlabs/bolt/bin/bolt`, wrapped by +`/opt/puppetlabs/bin/bolt`. +Use config files and OpenBolt inventory config rather than environment variables +for MCollective settings.
+ +## Test inventory setup + +Create a test project directory: + +```bash +mkdir -p ~/choria-test && cd ~/choria-test +``` + +```yaml +# bolt-project.yaml +--- +name: choria_test +modulepath: + - /etc/puppetlabs/code/environments/production/modules # Environment modules (tasks for bolt_tasks to download) + - /etc/puppetlabs/code/modules # Base modules shared across environments + - /opt/puppetlabs/puppet/modules # Puppet's vendored core modules (service, facts, etc.) + - modules # OpenBolt Puppetfile-installed deps (ruby_task_helper, etc.) +``` + +OpenBolt needs local access to task metadata to know which files to tell the +bolt_tasks agent to download. The server-side module paths are listed first +so that OpenBolt reads the same module versions that the bolt_tasks agent will +actually download from the server. The local `modules` directory comes last +as a fallback for Puppetfile-installed dependencies. When using +`--choria-agent shell`, OpenBolt uploads files directly, so local modules should +take precedence instead — put `modules` first or omit the server paths. + +Task helper dependencies like `puppetlabs-ruby_task_helper` must also be +installed on the server. Without them, bolt_tasks will fail with 404 errors +when downloading task files. + +OpenBolt also auto-injects its own internal paths (visible in `--log-level debug` +output): `bolt-modules` is prepended, and `.modules` plus the gem's built-in +modules directory are appended. These don't need to be specified manually. + +```yaml +# inventory.yaml +--- +config: + transport: choria +targets: + - name: agent1 + config: + choria: + host: nodeA.example.com + - name: agent2 + config: + choria: + host: nodeB.example.com +``` + +Transport and transport config go under `config:` in `inventory.yaml` (not in +`bolt-project.yaml`). The `name` is a short alias for use in OpenBolt commands, +and `host` is the actual Choria identity (FQDN shown by `choria ping`). 
+ +Since `~/.choriarc` is auto-detected, no `config-file` setting is needed +in the inventory. + +### Target names must match Choria identities + +Target URIs must use the exact Choria identity (typically the FQDN shown by +`choria ping`). Mismatched names cause timeout errors. See +[choria-transport.md](choria-transport.md#target-names-must-match-choria-identities) +for details and workarounds using `name` with `host` config. + +## Setting up Choria infrastructure via Puppet + +The `choria` Puppet module manages the Choria server, broker, and MCO Ruby +compatibility layer on your nodes. The `bolt_tasks` and `shell` agents are +Ruby MCollective agents that run through Choria's MCO compatibility shim. + +### Puppetfile + +Some of the agent modules are on the Forge, but they are no longer being updated there. Add them from GitHub: + +```ruby +# Puppetfile (in the environment, e.g. production) +mod 'choria-choria', :latest +mod 'choria-mcollective', :latest +mod 'choria-mcollective_choria', :latest +mod 'mcollective_agent_shell', + git: 'https://github.com/choria-plugins/shell-agent', + ref: '1.2.0' +``` + +The shell agent requires version 1.2.0 or later (for the batched `statuses` +action). The `bolt_tasks` agent is included in a standard Choria install and +does not need a separate Puppetfile entry. + +Deploy with r10k: + +```bash +sudo /opt/puppetlabs/puppet/bin/r10k puppetfile install \ + --puppetfile /path/to/Puppetfile \ + --moduledir /etc/puppetlabs/code/environments/production/modules +``` + +### Hiera configuration + +Set up the environment-level hiera config: + +```yaml +# hiera.yaml (in the environment directory) +--- +version: 5 +defaults: + datadir: data + data_hash: yaml_data +hierarchy: + - name: "Per-node data" + path: "nodes/%{trusted.certname}.yaml" + - name: "Common data" + paths: + - "common.yaml" +``` + +Common data applied to all nodes. You don't have to use this exact configuration, (i.e. 
you might want to use SRV instead, different configs for different nodes, etc.): + +```yaml +# data/common.yaml +choria::manage_package_repo: true +choria::server: true + +choria::server_config: + plugin.choria.puppetserver_host: "primary.example.com" + plugin.choria.puppetserver_port: 8140 + plugin.choria.puppetca_host: "primary.example.com" + plugin.choria.puppetca_port: 8140 + plugin.choria.middleware_hosts: "primary.example.com:4222" + plugin.choria.use_srv: false + +# Allow all callers for testing. Restrict in production. +mcollective::site_policies: + - action: "allow" + callers: "/.*/" + actions: "*" + facts: "*" + classes: "*" + +mcollective::client: true +mcollective_choria::config: + security.certname_whitelist: "/\\.mcollective$/, /.*/" + +mcollective::client_config: + plugin.security.provider: "file" + plugin.security.file.certificate: "/etc/puppetlabs/puppet/ssl/certs/%{trusted.certname}.pem" + plugin.security.file.key: "/etc/puppetlabs/puppet/ssl/private_keys/%{trusted.certname}.pem" + plugin.security.file.ca: "/etc/puppetlabs/puppet/ssl/certs/ca.pem" + +mcollective::plugin_classes: + - mcollective_agent_bolt_tasks + - mcollective_agent_shell +``` + +Per-node data for the primary (enables the NATS broker): + +```yaml +# data/nodes/<primary-certname>.yaml +choria::broker::network_broker: true +``` + +### Site manifest + +Again, you don't need to set it up exactly this way, but this is an easy manifest for working with just a handful of nodes.
+ +```puppet +# site.pp +node "primary.example.com" { + include choria + include choria::broker +} + +node default { + include choria + file { '/root/.choria': + ensure => file, + content => "plugin.security.provider = puppet\nplugin.security.certname = ${trusted['certname']}\n", + owner => 'root', + group => 'root', + mode => '0600', + } +} +``` + +### Apply and verify + +Run Puppet on all nodes: + +```bash +ssh nodeA.example.com 'sudo puppet agent -t' +ssh nodeB.example.com 'sudo puppet agent -t' +``` + +Verify agents are loaded: + +```bash +choria rpc rpcutil agent_inventory -I nodeA.example.com +``` + +The agent list should include `bolt_tasks` and `shell`. + +**Client-side DDLs:** The `bolt_tasks` DDL comes from the +`choria-mcorpc-support` gem (included in OpenBolt 5.4.0). The `shell` DDL is +bundled with OpenBolt and preloaded automatically. No manual DDL installation +is needed on the OpenBolt controller. + +### Removing agents for testing + +To test downgrade scenarios (verifying error messages when agents are missing), +remove agents from `mcollective::plugin_classes` in Hiera and set absent: + +```yaml +mcollective_agent_shell::ensure: absent +``` + +Run Puppet on the node, then restart Choria server: + +```bash +ssh nodeA.example.com 'sudo puppet agent -t && sudo systemctl restart choria-server' +``` + +Verify removal: + +```bash +choria rpc rpcutil agent_inventory -I nodeA.example.com +``` + +OpenBolt caches agent lists per target for the transport's lifetime, so start a +fresh `bolt` command after changing agents (don't re-run within the same plan). + +## Test cases + +### Connectivity (no agents required) + +```bash +bolt inventory show --targets nodeA.example.com +``` + +This doesn't require any special agents. It should show the target config. + +### No task agents installed + +Both nodes have only the default Choria agents. Neither `bolt_tasks` nor +`shell` is installed. Every operation that needs them should fail with a +clear, per-node error. 
+ +**run_command (needs shell agent)** + +```bash +bolt command run 'whoami' --targets nodeA.example.com,nodeB.example.com +``` + +Expected: Both nodes fail with an error like: +``` +The 'shell' agent is not available on nodeA.example.com. +``` + +Verify the error names the specific target, not a generic failure. + +**run_script (needs shell agent)** + +```bash +echo '#!/bin/bash +echo "hello from $(hostname)"' > /tmp/test.sh + +bolt script run /tmp/test.sh --targets nodeA.example.com,nodeB.example.com +``` + +Expected: Same shell agent error. + +**run_task (needs bolt_tasks or shell)** + +```bash +bolt task run facts --targets nodeA.example.com,nodeB.example.com +``` + +Expected: Both nodes fail with an error like: +``` +The 'bolt_tasks' agent is not available on nodeA.example.com. +Install either the bolt_tasks or shell agent on target nodes to run tasks via Choria. +``` + +**upload/download (not yet supported)** + +```bash +bolt file upload /tmp/test.sh /tmp/test_remote.sh --targets nodeA.example.com +bolt file download /etc/hostname /tmp/downloaded/ --targets nodeA.example.com +``` + +Expected: Both fail with `The Choria transport does not yet support upload/download.` + +### bolt_tasks agent only + +Install `bolt_tasks` on **both nodes** (see "Installing agents" above). + +**run_task with an OpenVox/Puppet Server task** + +```bash +bolt task run facts --targets nodeA.example.com,nodeB.example.com +``` + +Expected: Succeeds on both nodes. The bolt_tasks agent downloads the `facts` +task from the OpenVox/Puppet Server and executes it. + +Note: This requires the OpenVox/Puppet Server to be accessible from the remote nodes +at their configured `puppet_server` (default `puppet:8140`) and the task +module to be available in the configured environment (see "Task module +requirements" above). + +If the OpenVox/Puppet Server isn't set up or the task isn't available, you'll see a +`bolt/choria-task-download-failed` error. 
This is expected and tests that the +error path works correctly. + +**run_task with parameters** + +```bash +bolt task run package action=status name=puppet \ + --targets nodeA.example.com +``` + +Expected: Returns the package status. Verifies that task parameters are +passed through correctly to the bolt_tasks agent. + +**run_command still fails (no shell agent)** + +```bash +bolt command run 'whoami' --targets nodeA.example.com,nodeB.example.com +``` + +Expected: Still fails with the shell agent error. `bolt_tasks` doesn't help +with `run_command`. + +**run_script still fails (no shell agent)** + +```bash +bolt script run /tmp/test.sh --targets nodeA.example.com +``` + +Expected: Still fails with the shell agent error. + +**Forced agent selection** + +```bash +# Force bolt_tasks (should work, same as default) +bolt task run facts --targets nodeA.example.com --choria-agent bolt_tasks + +# Force shell (should fail: not installed) +bolt task run facts --targets nodeA.example.com --choria-agent shell +``` + +Expected: First succeeds (or fails at download, not at agent detection). +Second fails with `bolt/choria-agent-not-available` for shell. + +### Shell agent on one node (mixed fleet) + +Install the shell agent on Node A only, leaving Node B with just +`bolt_tasks`. This tests mixed-fleet behavior. + +**run_command (mixed results)** + +```bash +bolt command run 'whoami' --targets nodeA.example.com,nodeB.example.com +``` + +Expected: +- Node A: Succeeds, shows username +- Node B: Fails with `bolt/choria-agent-not-available` (no shell agent) + +This is the key mixed-fleet test. Both results should appear in the output, +not a single crash. + +**run_command with exit code** + +```bash +bolt command run 'exit 42' --targets nodeA.example.com +``` + +Expected: Reports exit code 42. 
+ +**run_script (mixed results)** + +```bash +echo '#!/bin/bash +echo "hostname: $(hostname)" +echo "uptime: $(uptime)"' > /tmp/test_script.sh + +bolt script run /tmp/test_script.sh --targets nodeA.example.com,nodeB.example.com +``` + +Expected: Same split. Node A succeeds, Node B fails. + +**run_script with arguments** + +```bash +echo '#!/bin/bash +echo "Args: $@"' > /tmp/test_args.sh + +bolt script run /tmp/test_args.sh arg1 arg2 --targets nodeA.example.com +``` + +Expected: `Args: arg1 arg2` + +**run_task (both succeed via bolt_tasks)** + +```bash +bolt task run facts --targets nodeA.example.com,nodeB.example.com +``` + +Expected: Both succeed. Even though Node A has shell, bolt_tasks is the +default and both nodes have it. + +**run_task with local task (not on OpenVox/Puppet Server)** + +Test with a task that's NOT on the OpenVox/Puppet Server (a local custom task). +Without `--choria-agent shell`, this will fail because bolt_tasks can't find +the task on the OpenVox/Puppet Server: + +```bash +mkdir -p tasks +cat > tasks/hello.sh << 'TASK' +#!/bin/bash +echo "{\"message\": \"hello from $(hostname)\"}" +TASK +cat > tasks/hello.json << 'META' +{"description": "Test task", "input_method": "stdin", "parameters": {}} +META + +bolt task run choria_test::hello --targets nodeA.example.com +``` + +Expected: Fails with `bolt/choria-task-download-failed` and a message +suggesting `--choria-agent shell`. + +Now retry with `--choria-agent shell` (Node A has the shell agent): + +```bash +bolt task run choria_test::hello --targets nodeA.example.com --choria-agent shell +``` + +Expected: Succeeds via the shell agent. 
+ +**Agent selection** + +```bash +# Use shell agent (Node A has it) +bolt task run choria_test::hello --targets nodeA.example.com --choria-agent shell + +# Use shell agent (Node B doesn't have it) +bolt task run choria_test::hello --targets nodeB.example.com --choria-agent shell +``` + +Expected: +- First: Succeeds via shell agent +- Second: Fails with `bolt/choria-agent-not-available` + +### Both agents installed + +Install the shell agent on Node B too (same steps as above). Now both nodes +have both agents installed. + +**All operations succeed on both nodes** + +```bash +bolt command run 'whoami' --targets nodeA.example.com,nodeB.example.com +bolt script run /tmp/test_script.sh --targets nodeA.example.com,nodeB.example.com +bolt task run facts --targets nodeA.example.com,nodeB.example.com +bolt task run choria_test::hello --targets nodeA.example.com,nodeB.example.com --choria-agent shell +``` + +All should succeed on both nodes. + +**upload/download still unsupported** + +```bash +bolt file upload /tmp/test.sh /tmp/test_remote.sh --targets nodeA.example.com +bolt file download /etc/hostname /tmp/downloaded/ --targets nodeA.example.com +``` + +Still fails with `bolt/choria-unsupported-operation`, regardless of agents. + +**Multiple targets** + +```bash +bolt command run 'hostname -f' --targets nodeA.example.com,nodeB.example.com +``` + +Expected: Returns hostname from both nodes. Verifies multi-target fanout +works correctly. + +**Timeouts** + +Test that long-running commands are killed on timeout: + +```bash +bolt command run 'sleep 300' --targets nodeA.example.com \ + --transport-config '{"choria": {"command-timeout": 5}}' +``` + +Expected: Times out after ~5 seconds, kills the background process on the +node (check debug logs for "Killed background process"). 
+ +**Temp directory cleanup** + +```bash +# With cleanup enabled (default) +bolt script run /tmp/test_script.sh --targets nodeA.example.com --log-level debug 2>&1 | \ + grep -i "tmpdir\|bolt-choria-" + +# Verify nothing left behind +ssh nodeA.example.com 'ls /tmp/bolt-choria-* 2>/dev/null || echo "clean"' + +# With cleanup disabled +bolt script run /tmp/test_script.sh --targets nodeA.example.com \ + --transport-config '{"choria": {"cleanup": false}}' --log-level debug + +# Should still be there +ssh nodeA.example.com 'ls -la /tmp/bolt-choria-*' +# Clean up manually +ssh nodeA.example.com 'rm -rf /tmp/bolt-choria-*' +``` + +## Debug logging + +Add `--log-level debug` to any OpenBolt command to see detailed transport traces: + +- `Loaded Choria client config from ...` (config file found) +- `Discovering agents on N targets` (agent discovery start) +- `Discovered agents on <target>: ...` (per-target agent list) +- `The 'shell' agent on <target> is version X, but Y or later is required` (version check) +- `Running command via shell agent on N targets` (command start) +- `Running task via bolt_tasks agent on N targets` (task routing) +- `Task routing: agent: bolt_tasks, N capable / M incapable` (agent routing decision) +- `Started command on <target>, handle: ...` (shell agent start) +- `Poll round N: M targets still pending` (poll loop progress) +- `shell.list on <target>: handle ...
status: stopped` (per-target poll result) +- `Fetching shell.statuses for N targets` (batched output fetch) +- `Uploading N bytes to <path> on M targets` (file upload) +- `Checking connectivity for N targets` (connectivity check) +- `Timed out after Ns with M targets still pending, killing processes` (timeout) +- `Killing timed-out processes on N targets` (kill start) + +## Diagnostics + +```bash +# Check what agents are available on a node (uses choria directly, not OpenBolt) +choria rpc rpcutil agent_inventory -I nodeA.example.com + +# Check Choria connectivity +choria ping -I nodeA.example.com + +# Enable debug logging in OpenBolt +bolt command run 'hostname' --targets nodeA.example.com --log-level debug +``` + +## Troubleshooting + +**Timeouts on all operations:** +Check that target names in your inventory match the exact Choria identity +(FQDN) shown by `choria ping`. Mismatched names are the most common cause of +"no response" errors. + +**DDL-not-found errors:** +The `bolt_tasks` DDL comes from the `choria-mcorpc-support` gem. If you see +DDL errors, verify the gem is installed (see "Install the choria-mcorpc-support +gem" above). The shell DDL is bundled with OpenBolt and does not need separate +installation. + +**Task download 404 errors (`bolt/choria-task-download-failed`):** +The task module (and its dependencies like `ruby_task_helper`) must be +installed on the OpenVox/Puppet Server. + +**Agent not found after install:** +Restart `choria-server` on the target node. The Go server only loads agents at +startup. + +**MCollective refuses to run as root:** +Use a non-root user with a Puppet CA-signed certificate. See "Choria client +configuration" above. + +**Agent cache shows stale data:** +OpenBolt caches agent lists per target for the transport's lifetime. Start a fresh +`bolt` command after installing or removing agents.
+ +## Test matrix summary + +| Operation | No agents | bolt_tasks only | shell only | Both agents | +|-------------|----------------|-----------------|---------------------------|--------------------------------------| +| run_command | agent error | agent error | works | works (shell) | +| run_script | agent error | agent error | works | works (shell) | +| run_task | no-agent error | works (bt) | works (--choria-agent sh) | works (bt default, sh if configured) | +| upload | unsupported | unsupported | unsupported | unsupported | +| download | unsupported | unsupported | unsupported | unsupported | +| connected? | works | works | works | works | diff --git a/docs/choria-transport.md b/docs/choria-transport.md new file mode 100644 index 000000000..dac29e9e8 --- /dev/null +++ b/docs/choria-transport.md @@ -0,0 +1,378 @@ +# Choria Transport + +The Choria transport lets OpenBolt communicate with nodes via +[Choria's](https://choria.io/) NATS pub/sub messaging infrastructure instead +of SSH or WinRM. Rather than opening direct connections to each node, OpenBolt +publishes RPC requests to a NATS message broker, and agents running on target +nodes pick them up, execute the requested action, and reply. + +Key components: +- **NATS broker**: Message middleware that routes requests and replies +- **Choria Server**: Runs on each managed node, hosts agents +- **Agents**: Plugins that perform actions (run commands, execute tasks, etc.) + +The transport uses the `choria-mcorpc-support` Ruby gem as its client library. + +For the project roadmap, see [choria-transport-plan.md](choria-transport-plan.md). +For developer documentation, see [choria-transport-dev.md](choria-transport-dev.md). +For test environment setup, see [choria-transport-testing.md](choria-transport-testing.md). 
+ +## Prerequisites + +- A working Choria cluster with a NATS broker +- Choria Server running on each target node +- A Choria client config file on the OpenBolt controller +- At least one of the supported agents installed on target nodes: + - **bolt_tasks** (ships with Puppet-enabled Choria setups) + - **shell** (separate install, version 1.2.0 or later) + +## Configuration + +### Inventory setup + +Transport and config options go under `config:` in your inventory file: + +```yaml +# inventory.yaml +config: + transport: choria + choria: + config-file: /home/user/.choriarc +targets: + - uri: choria://node1.example.com + - uri: choria://node2.example.com +``` + +Per-target overrides: + +```yaml +targets: + - uri: choria://node1.example.com + config: + choria: + collective: production + nats-servers: + - nats://broker1:4222 + - nats://broker2:4222 +``` + +If the config file is in one of the auto-detected locations (`~/.choriarc`, +`/etc/choria/client.conf`, `/etc/puppetlabs/mcollective/client.cfg`), you +can omit the `config-file` option. + +### Target names must match Choria identities + +The transport uses the target's hostname as a Choria identity filter. This +**must match the node's Choria identity exactly**, which is typically the +FQDN shown by `choria ping`. + +If target names don't match, you'll see timeout errors. Use the full FQDN: + +```yaml +targets: + - uri: choria://node1.dc.example.com +``` + +If you want short names, use `name` with the `host` config to specify the +Choria identity separately: + +```yaml +targets: + - name: nodeA + config: + choria: + host: node1.dc.example.com +``` + +### Config option reference + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `choria-agent` | String | `bolt_tasks` | Agent for task execution: `bolt_tasks` or `shell`. Also available as `--choria-agent` CLI flag. | +| `cleanup` | Boolean | `true` | Clean up temp directories after operations. Set to `false` for debugging. 
| +| `collective` | String | (from Choria config file) | Choria collective to route messages through. Per-target. | +| `command-timeout` | Integer | `60` | Seconds to wait for commands and scripts to complete. | +| `config-file` | String | (auto-detected) | Path to a Choria/MCollective client config file. | +| `host` | String | (from URI) | Target's Choria identity (FQDN). Overrides the hostname from the URI. | +| `interpreters` | Hash | (none) | File extension to interpreter mapping (e.g., `{".rb": "/usr/bin/ruby"}`). | +| `nats-connection-timeout` | Integer | `30` | Seconds to wait for the TCP connection to the NATS broker. | +| `nats-servers` | String or Array | (from Choria config file) | NATS broker addresses. Overrides the config file. | +| `puppet-environment` | String | `production` | Puppet environment for bolt_tasks file URIs. | +| `rpc-timeout` | Integer | `30` | Seconds to wait for replies to individual RPC calls. | +| `ssl-ca` | String | (from Choria config file) | CA certificate path for TLS. | +| `ssl-cert` | String | (from Choria config file) | Client certificate path for TLS. | +| `ssl-key` | String | (from Choria config file) | Client private key path for TLS. | +| `task-timeout` | Integer | `300` | Seconds to wait for task execution to complete. | +| `tmpdir` | String | `/tmp` or `C:\Windows\Temp` | Base path for temp directories on remote nodes. Must be absolute. | + +**Timeout hierarchy:** Three levels of timeout control different things: +- `nats-connection-timeout` (30s): How long to wait for the initial TCP + connection to the NATS broker +- `rpc-timeout` (30s): How long to wait for replies to each individual RPC + call (discovery, status checks, etc.) +- `command-timeout` (60s) / `task-timeout` (300s): How long to wait for the + entire operation (the full duration of the command or task) + +**SSL options:** If you provide any of `ssl-ca`, `ssl-cert`, or `ssl-key`, +you must provide all three. 
Partial SSL configurations are rejected during +validation. + +## Operations + +### run_command + +Requires the **shell agent** (>= 1.2.0) on target nodes. + +```bash +bolt command run 'hostname -f' --targets node1.example.com,node2.example.com +``` + +Commands are started asynchronously on all targets in one RPC call, then +polled for completion. If a command exceeds `command-timeout`, the background +process is killed on the target node. + +### run_script + +Requires the **shell agent** (>= 1.2.0) on target nodes. + +```bash +bolt script run ./check_disk.sh --targets node1.example.com +``` + +The script is uploaded to a temp directory on each target, made executable, +and run. Temp directories are cleaned up afterward (unless `cleanup: false`). + +Interpreter support: + +```yaml +choria: + interpreters: + ".rb": "/usr/bin/ruby" + ".py": "/usr/bin/python3" +``` + +### run_task + +Works with either the **bolt_tasks** or **shell** agent. + +```bash +# Uses bolt_tasks by default (downloads from OpenVox/Puppet Server) +bolt task run facts --targets node1.example.com + +# Use shell agent for local tasks not on the OpenVox/Puppet Server +bolt task run my_project::check --targets node1.example.com --choria-agent shell +``` + +Agent selection is deterministic with no automatic fallback. If the selected +agent is not available on a target, that target gets a clear error result. + +### upload / download + +Not yet supported. These will be implemented in Phase 4 with a new +chunked file-transfer agent. See the +[project plan](choria-transport-plan.md#phase-4-file-transfer-agent) for +details. + +### connected? + +Uses `rpcutil.ping`, which is built into every Choria node. No special +agents needed. + +## Agent selection + +### bolt_tasks (default) + +The bolt_tasks agent ships with Puppet-enabled Choria setups. It downloads +task files from an OpenVox/Puppet Server and executes them. 
This means: + +- Only `run_task` works (not `run_command` or `run_script`) +- Tasks must be installed on the OpenVox/Puppet Server +- Target nodes must be able to reach the OpenVox/Puppet Server + +### shell (separate install) + +The [shell agent](https://github.com/choria-plugins/shell-agent) is a +separate Choria plugin. Version 1.2.0 or later is required. It must be +installed on target nodes. + +With the shell agent: +- `run_command` and `run_script` work +- `run_task` can use either agent (bolt_tasks by default, or shell with + `--choria-agent shell`) + +The shell agent DDL (required by the client library) is bundled with OpenBolt +and loaded automatically. No client-side setup is needed. + +### Agent detection + +On first contact with a target, the transport automatically discovers which +agents are installed and what OS the target is running. This happens +transparently. Agents below the required minimum version (e.g., shell < 1.2.0) +are excluded and treated as unavailable. + +If a target is missing the required agent, it gets a clear error result with +a message suggesting what to install. Other targets in the same batch are +not affected. + +### Installing the shell agent + +The Choria plugin modules are not currently published on the Puppet Forge. +Install via Puppet by referencing the GitHub repository in your Puppetfile: + +```ruby +mod 'mcollective_agent_shell', + git: 'https://github.com/choria-plugins/shell-agent', + ref: 'v1.2.0' +``` + +Deploy with r10k or Code Manager, then apply via Hiera: + +```yaml +mcollective::plugin_classes: + - mcollective_agent_shell +``` + +Restart `choria-server` on target nodes after installing. + +For detailed installation instructions (including manual file copy), see +[choria-transport-testing.md](choria-transport-testing.md#shell-agent). + +## Using bolt_tasks with an OpenVox/Puppet Server + +### How bolt_tasks works + +The bolt_tasks agent doesn't receive task files from OpenBolt directly. 
OpenBolt +sends file metadata (SHA256 hashes, OpenVox/Puppet Server URIs) and the agent +downloads the files from the OpenVox/Puppet Server itself. This means: + +1. Task modules must be installed on the OpenVox/Puppet Server (in the environment's + modulepath) +2. Task helper dependencies (like `ruby_task_helper`) must also be on the + OpenVox/Puppet Server +3. Nodes must be able to reach the OpenVox/Puppet Server at their configured + `puppet_server` address (default `puppet:8140`) + +### Modulepath configuration + +OpenBolt needs task metadata locally to build the file specs it sends to +bolt_tasks. If you're running on the primary server, the task modules already +exist on disk. Add all server-side module paths to OpenBolt's modulepath: + +```yaml +# bolt-project.yaml +name: my_project +modulepath: + - /etc/puppetlabs/code/environments/production/modules # Environment modules + - /etc/puppetlabs/code/modules # Base modules shared across environments + - /opt/puppetlabs/puppet/modules # Puppet's vendored core modules (service, facts, etc.) + - modules # OpenBolt Puppetfile-installed deps (ruby_task_helper, etc.) +``` + +Server-side paths are listed first so that OpenBolt reads the same module versions +that the bolt_tasks agent will download from the server. When using +`--choria-agent shell`, OpenBolt uploads task files directly, so local modules +should take precedence instead — put `modules` first or omit the server paths. + +OpenBolt also auto-injects its own internal paths (visible in `--log-level debug` +output): `bolt-modules` is prepended, and `.modules` plus the gem's built-in +modules directory are appended. These don't need to be specified manually. + +**Important:** Setting `modulepath` replaces the default (`modules`), so you +must include `modules` explicitly. Without it, OpenBolt loses access to its +Puppetfile-installed modules (like `ruby_task_helper`, `facts`, etc.). 
+ +Or per-invocation: + +```bash +bolt task run facts --targets node1,node2 \ + --modulepath "modules:/etc/puppetlabs/code/environments/production/modules:/etc/puppetlabs/code/modules:/opt/puppetlabs/puppet/modules" +``` + +### Installing task dependencies + +Common tasks require helper modules on the OpenVox/Puppet Server: + +```bash +# Required by most Ruby-based tasks (including 'facts') +sudo puppet module install puppetlabs-ruby_task_helper + +# Required by Python-based tasks +sudo puppet module install puppetlabs-python_task_helper +``` + +Without these, you'll see download errors like: +``` +bolt/choria-task-download-failed: ... ruby_task_helper/files/task_helper.rb: 404 +``` + +### Using the shell agent for tasks + +If a task is not available on the OpenVox/Puppet Server (e.g., it's a local project +task), set `choria-agent` to `shell` to upload and execute it directly via +the shell agent, bypassing the OpenVox/Puppet Server entirely: + +```yaml +# bolt-project.yaml +choria: + choria-agent: shell +``` + +Or per-invocation: + +```bash +bolt task run my_project::check --targets node1 --choria-agent shell +``` + +When using `--choria-agent shell`, the OpenVox/Puppet Server requirement is bypassed +entirely. OpenBolt uploads task files directly via the shell agent, so only the +local modulepath matters. + +## Limitations + +1. **Upload and download not yet supported.** These will be implemented in a + future release with a new file-transfer agent. + +2. **Shell agent not installed by default.** Without it, only `run_task` + (via bolt_tasks + OpenVox/Puppet Server) works. All other operations fail with a + clear error message. Version 1.2.0 or later is required. + +3. **bolt_tasks requires an OpenVox/Puppet Server.** The bolt_tasks agent downloads + task files from the OpenVox/Puppet Server. Tasks not served by the OpenVox/Puppet Server + will fail with an error suggesting `--choria-agent shell`. + +4. 
**No streaming output.** All output is returned on completion, not streamed + incrementally. + +5. **No run-as support.** Choria uses its own identity model based on TLS + certificates. There's no equivalent to SSH's `sudo` or `run-as`. + +6. **No TTY support.** Interactive commands are not possible through Choria's + messaging model. + +7. **Timeout behavior differs by agent.** Shell agent processes are killed on + timeout via `shell.kill`. bolt_tasks tasks continue running on the node + after OpenBolt reports a timeout (bolt_tasks has no kill mechanism). + +8. **File size limit for shell agent uploads.** When using the shell agent + (`run_script`, `run_task` with `--choria-agent shell`), files are + base64-encoded and sent as RPC messages. The maximum file size is limited + by the NATS max message size (default 1MB, roughly 750KB effective after + base64 overhead). Increase `plugin.choria.network.client_max_payload` in + the Choria broker config for larger files. The bolt_tasks agent is not + affected since it downloads files from the OpenVox/Puppet Server. + +9. **POSIX targets need `base64` CLI for shell agent uploads.** The `base64` + command (provided by coreutils on Linux, preinstalled on macOS) must be + available on POSIX target nodes. On Windows, PowerShell handles this + natively. The bolt_tasks agent is not affected. + +10. **Shell agent job state accumulates on target nodes.** The shell agent + stores job state in per-job directories under + `/var/run/mcollective-shell/`. These are not automatically cleaned up + after the process exits. Periodic manual cleanup may be necessary for + long-running infrastructure. + +11. **MCollective client library refuses to run as root.** Use a non-root + user with a Puppet CA-signed certificate. See the + [testing guide](choria-transport-testing.md#running-bolt-as-a-non-root-user) + for setup instructions. 
From 683a3ca2f39e771c167c716a9c60e3f1d7e273c8 Mon Sep 17 00:00:00 2001 From: nmburgan <13688219+nmburgan@users.noreply.github.com> Date: Fri, 27 Mar 2026 12:35:44 -0700 Subject: [PATCH 4/8] Add updated schemas for Choria options --- schemas/bolt-defaults.schema.json | 326 +++++++++++++++++++++++++++++ schemas/bolt-inventory.schema.json | 216 +++++++++++++++++++ 2 files changed, 542 insertions(+) diff --git a/schemas/bolt-defaults.schema.json b/schemas/bolt-defaults.schema.json index 89676df4e..b93bf4abb 100644 --- a/schemas/bolt-defaults.schema.json +++ b/schemas/bolt-defaults.schema.json @@ -98,6 +98,9 @@ "transport": { "$ref": "#/definitions/transport" }, + "choria": { + "$ref": "#/definitions/choria" + }, "docker": { "$ref": "#/definitions/docker" }, @@ -493,6 +496,7 @@ { "type": "string", "enum": [ + "choria", "docker", "jail", "local", @@ -508,6 +512,179 @@ } ] }, + "choria": { + "description": "A map of configuration options for the choria transport.", + "oneOf": [ + { + "type": "object", + "properties": { + "choria-agent": { + "oneOf": [ + { + "$ref": "#/transport_definitions/choria-agent" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "cleanup": { + "oneOf": [ + { + "$ref": "#/transport_definitions/cleanup" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "collective": { + "oneOf": [ + { + "$ref": "#/transport_definitions/collective" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "command-timeout": { + "oneOf": [ + { + "$ref": "#/transport_definitions/command-timeout" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "config-file": { + "oneOf": [ + { + "$ref": "#/transport_definitions/config-file" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "host": { + "oneOf": [ + { + "$ref": "#/transport_definitions/host" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "interpreters": { + "oneOf": [ + { + "$ref": "#/transport_definitions/interpreters" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + 
"nats-connection-timeout": { + "oneOf": [ + { + "$ref": "#/transport_definitions/nats-connection-timeout" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "nats-servers": { + "oneOf": [ + { + "$ref": "#/transport_definitions/nats-servers" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "puppet-environment": { + "oneOf": [ + { + "$ref": "#/transport_definitions/puppet-environment" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "rpc-timeout": { + "oneOf": [ + { + "$ref": "#/transport_definitions/rpc-timeout" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "ssl-ca": { + "oneOf": [ + { + "$ref": "#/transport_definitions/ssl-ca" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "ssl-cert": { + "oneOf": [ + { + "$ref": "#/transport_definitions/ssl-cert" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "ssl-key": { + "oneOf": [ + { + "$ref": "#/transport_definitions/ssl-key" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "task-timeout": { + "oneOf": [ + { + "$ref": "#/transport_definitions/task-timeout" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "tmpdir": { + "oneOf": [ + { + "$ref": "#/transport_definitions/tmpdir" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + } + } + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "docker": { "description": "A map of configuration options for the docker transport.", "oneOf": [ @@ -1630,6 +1807,75 @@ } ] }, + "choria-agent": { + "description": "Which Choria agent to use for task execution. Defaults to 'bolt_tasks' (downloads task files from a Puppet Server). Set to 'shell' for tasks not available on the Puppet Server.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "collective": { + "description": "The Choria collective to target. 
Overrides the main_collective from the Choria client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "command-timeout": { + "description": "How long to wait in seconds for commands and scripts to complete when using the Choria transport.", + "oneOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "config-file": { + "description": "The path to the Choria or MCollective client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "nats-connection-timeout": { + "description": "How long to wait in seconds for the initial TCP connection to the NATS broker. If the connection cannot be made within this time, the operation fails.", + "oneOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "rpc-timeout": { + "description": "How long to wait in seconds for nodes to respond to an RPC request. Used for lightweight operations like agent discovery, shell.start, and shell.list polling. Distinct from command-timeout and task-timeout which govern the overall duration of commands and tasks.", + "oneOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "connect-timeout": { "description": "How long to wait in seconds when establishing connections. Set this value higher if you frequently encounter connection timeout errors when running Bolt.", "oneOf": [ @@ -1899,6 +2145,30 @@ } ] }, + "nats-servers": { + "description": "One or more NATS server addresses for the Choria transport. Overrides the middleware hosts from the Choria client configuration file. 
Can be a single string or an array.", + "oneOf": [ + { + "type": [ + "string", + "array" + ], + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + } + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "native-ssh": { "type": "boolean", "description": "This enables the native SSH transport, which shells out to SSH instead of using the net-ssh Ruby library" @@ -1968,6 +2238,17 @@ } ] }, + "puppet-environment": { + "description": "The Puppet environment to use when constructing task file URIs for the Choria bolt_tasks agent.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "read-timeout": { "description": "How long to wait in seconds when making requests to the Orchestrator.", "oneOf": [ @@ -2085,6 +2366,39 @@ } ] }, + "ssl-ca": { + "description": "The path to the CA certificate for Choria TLS connections. Overrides the CA from the Choria client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "ssl-cert": { + "description": "The path to the client certificate for Choria TLS connections. Overrides the certificate from the Choria client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "ssl-key": { + "description": "The path to the client private key for Choria TLS connections. Overrides the key from the Choria client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "ssh-command": { "description": "The command and options to use when SSHing. This option is used when you need support for features or algorithms that are not supported by the net-ssh Ruby library. 
**This option is experimental.** You can read more about this option in [Native SSH transport](experimental_features.md#native-ssh-transport).", "oneOf": [ @@ -2164,6 +2478,18 @@ } ] }, + "task-timeout": { + "description": "How long to wait in seconds for tasks to complete when using the Choria transport.", + "oneOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "tmpdir": { "description": "The directory to upload and execute temporary files on the target.", "oneOf": [ diff --git a/schemas/bolt-inventory.schema.json b/schemas/bolt-inventory.schema.json index 2b1802a96..2437c5cc0 100644 --- a/schemas/bolt-inventory.schema.json +++ b/schemas/bolt-inventory.schema.json @@ -41,6 +41,7 @@ { "type": "string", "enum": [ + "choria", "docker", "jail", "local", @@ -56,6 +57,221 @@ } ] }, + "choria": { + "description": "A map of configuration options for the choria transport.", + "oneOf": [ + { + "type": "object", + "properties": { + "choria-agent": { + "description": "Which Choria agent to use for task execution. Defaults to 'bolt_tasks' (downloads task files from a Puppet Server). Set to 'shell' for tasks not available on the Puppet Server.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "cleanup": { + "description": "Whether to clean up temporary files created on targets. When running commands on a target, Bolt might create temporary files. After completing the command, these files are automatically deleted. This value can be set to 'false' if you wish to leave these temporary files on the target.", + "oneOf": [ + { + "type": "boolean" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "collective": { + "description": "The Choria collective to target. 
Overrides the main_collective from the Choria client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "command-timeout": { + "description": "How long to wait in seconds for commands and scripts to complete when using the Choria transport.", + "oneOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "config-file": { + "description": "The path to the Choria or MCollective client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "host": { + "description": "The target's hostname.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "interpreters": { + "description": "A map of an extension name to the absolute path of an executable, enabling you to override the shebang defined in a task executable. The extension can optionally be specified with the `.` character (`.py` and `py` both map to a task executable `task.py`) and the extension is case sensitive. When a target's name is `localhost`, Ruby tasks run with the Bolt Ruby interpreter by default.", + "oneOf": [ + { + "type": "object", + "additionalProperties": { + "type": [ + "string", + "array" + ] + }, + "propertyNames": { + "pattern": "^.?[a-zA-Z0-9]+$" + } + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "nats-connection-timeout": { + "description": "How long to wait in seconds for the initial TCP connection to the NATS broker. If the connection cannot be made within this time, the operation fails.", + "oneOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "nats-servers": { + "description": "One or more NATS server addresses for the Choria transport. Overrides the middleware hosts from the Choria client configuration file. 
Can be a single string or an array.", + "oneOf": [ + { + "type": [ + "string", + "array" + ], + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + } + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "puppet-environment": { + "description": "The Puppet environment to use when constructing task file URIs for the Choria bolt_tasks agent.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "rpc-timeout": { + "description": "How long to wait in seconds for nodes to respond to an RPC request. Used for lightweight operations like agent discovery, shell.start, and shell.list polling. Distinct from command-timeout and task-timeout which govern the overall duration of commands and tasks.", + "oneOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "ssl-ca": { + "description": "The path to the CA certificate for Choria TLS connections. Overrides the CA from the Choria client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "ssl-cert": { + "description": "The path to the client certificate for Choria TLS connections. Overrides the certificate from the Choria client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "ssl-key": { + "description": "The path to the client private key for Choria TLS connections. 
Overrides the key from the Choria client configuration file.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "task-timeout": { + "description": "How long to wait in seconds for tasks to complete when using the Choria transport.", + "oneOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, + "tmpdir": { + "description": "The directory to upload and execute temporary files on the target.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + } + } + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "docker": { "description": "A map of configuration options for the docker transport.", "oneOf": [ From 06fac8d35ba8e73e17832d8b84b785034243f72f Mon Sep 17 00:00:00 2001 From: nmburgan <13688219+nmburgan@users.noreply.github.com> Date: Fri, 27 Mar 2026 22:07:10 +0000 Subject: [PATCH 5/8] Add CLI flags for Choria transport options Add CLI flags for all Choria transport options so they can be passed on the command line. CLI flags use a choria- prefix for clarity (e.g., --choria-config-file, --choria-ssl-ca) while internal option keys remain unprefixed so inventory files stay clean (e.g., choria: { config-file: /path }). Rename choria-agent to task-agent since it only applies to task execution. The CLI flag becomes --choria-task-agent. New CLI flags: --choria-task-agent, --choria-config-file, --choria-ssl-ca, --choria-ssl-cert, --choria-ssl-key, --choria-collective, --choria-puppet-environment, --choria-rpc-timeout, --choria-task-timeout, --choria-command-timeout, --nats-servers, --nats-connection-timeout The nats-* flags are not prefixed since they are already clearly Choria-specific. Shared options (cleanup, tmpdir, host, interpreters) are unchanged. 
--- docs/choria-transport-dev.md | 2 +- docs/choria-transport-plan.md | 2 +- docs/choria-transport-testing.md | 22 ++++---- docs/choria-transport.md | 69 ++++++++++++++--------- lib/bolt/bolt_option_parser.rb | 53 ++++++++++++++++- lib/bolt/config/transport/choria.rb | 6 +- lib/bolt/config/transport/options.rb | 2 +- lib/bolt/transport/choria.rb | 8 +-- schemas/bolt-defaults.schema.json | 22 ++++---- schemas/bolt-inventory.schema.json | 22 ++++---- spec/unit/config/transport/choria_spec.rb | 12 ++-- spec/unit/transport/choria/shell_spec.rb | 2 +- spec/unit/transport/choria_spec.rb | 18 +++--- 13 files changed, 150 insertions(+), 90 deletions(-) diff --git a/docs/choria-transport-dev.md b/docs/choria-transport-dev.md index 5b7af7df7..9997a26de 100644 --- a/docs/choria-transport-dev.md +++ b/docs/choria-transport-dev.md @@ -359,7 +359,7 @@ One second provides reasonable responsiveness without excessive polling. ### Deterministic agent selection -Agent selection for `run_task` is explicit via the `choria-agent` config +Agent selection for `run_task` is explicit via the `task-agent` config option (default `bolt_tasks`). There is no automatic fallback between agents. If the selected agent is not available on a target, that target gets a clear error. This is simpler and more predictable than a try-and-fallback approach. 
diff --git a/docs/choria-transport-plan.md b/docs/choria-transport-plan.md index 6b09a3528..ed9ef2d21 100644 --- a/docs/choria-transport-plan.md +++ b/docs/choria-transport-plan.md @@ -123,7 +123,7 @@ What shipped: cleanup - `run_task` via shell agent with support for all input methods (environment, stdin, both) -- Deterministic agent selection via `choria-agent` config and `--choria-agent` +- Deterministic agent selection via `task-agent` config and `--choria-task-agent` CLI flag (no automatic fallback between agents) - Batched shell polling via `shell.list` + `shell.statuses` for scalability - Platform-aware command builders for POSIX and Windows (PowerShell) diff --git a/docs/choria-transport-testing.md b/docs/choria-transport-testing.md index 365d45755..da73d775f 100644 --- a/docs/choria-transport-testing.md +++ b/docs/choria-transport-testing.md @@ -163,7 +163,7 @@ bolt_tasks agent to download. The server-side module paths are listed first so that OpenBolt reads the same module versions that the bolt_tasks agent will actually download from the server. The local `modules` directory comes last as a fallback for Puppetfile-installed dependencies. When using -`--choria-agent shell`, OpenBolt uploads files directly, so local modules should +`--choria-task-agent shell`, OpenBolt uploads files directly, so local modules should take precedence instead — put `modules` first or omit the server paths. Task helper dependencies like `puppetlabs-ruby_task_helper` must also be @@ -483,10 +483,10 @@ Expected: Still fails with the shell agent error. 
```bash # Force bolt_tasks (should work, same as default) -bolt task run facts --targets nodeA.example.com --choria-agent bolt_tasks +bolt task run facts --targets nodeA.example.com --choria-task-agent bolt_tasks # Force shell (should fail: not installed) -bolt task run facts --targets nodeA.example.com --choria-agent shell +bolt task run facts --targets nodeA.example.com --choria-task-agent shell ``` Expected: First succeeds (or fails at download, not at agent detection). @@ -553,7 +553,7 @@ default and both nodes have it. **run_task with local task (not on OpenVox/Puppet Server)** Test with a task that's NOT on the OpenVox/Puppet Server (a local custom task). -Without `--choria-agent shell`, this will fail because bolt_tasks can't find +Without `--choria-task-agent shell`, this will fail because bolt_tasks can't find the task on the OpenVox/Puppet Server: ```bash @@ -570,12 +570,12 @@ bolt task run choria_test::hello --targets nodeA.example.com ``` Expected: Fails with `bolt/choria-task-download-failed` and a message -suggesting `--choria-agent shell`. +suggesting `--choria-task-agent shell`. -Now retry with `--choria-agent shell` (Node A has the shell agent): +Now retry with `--choria-task-agent shell` (Node A has the shell agent): ```bash -bolt task run choria_test::hello --targets nodeA.example.com --choria-agent shell +bolt task run choria_test::hello --targets nodeA.example.com --choria-task-agent shell ``` Expected: Succeeds via the shell agent. @@ -584,10 +584,10 @@ Expected: Succeeds via the shell agent. 
```bash # Use shell agent (Node A has it) -bolt task run choria_test::hello --targets nodeA.example.com --choria-agent shell +bolt task run choria_test::hello --targets nodeA.example.com --choria-task-agent shell # Use shell agent (Node B doesn't have it) -bolt task run choria_test::hello --targets nodeB.example.com --choria-agent shell +bolt task run choria_test::hello --targets nodeB.example.com --choria-task-agent shell ``` Expected: @@ -605,7 +605,7 @@ have both agents installed. bolt command run 'whoami' --targets nodeA.example.com,nodeB.example.com bolt script run /tmp/test_script.sh --targets nodeA.example.com,nodeB.example.com bolt task run facts --targets nodeA.example.com,nodeB.example.com -bolt task run choria_test::hello --targets nodeA.example.com,nodeB.example.com --choria-agent shell +bolt task run choria_test::hello --targets nodeA.example.com,nodeB.example.com --choria-task-agent shell ``` All should succeed on both nodes. @@ -728,7 +728,7 @@ OpenBolt caches agent lists per target for the transport's lifetime. Start a fre |-------------|----------------|-----------------|---------------------------|--------------------------------------| | run_command | agent error | agent error | works | works (shell) | | run_script | agent error | agent error | works | works (shell) | -| run_task | no-agent error | works (bt) | works (--choria-agent sh) | works (bt default, sh if configured) | +| run_task | no-agent error | works (bt) | works (--choria-task-agent sh) | works (bt default, sh if configured) | | upload | unsupported | unsupported | unsupported | unsupported | | download | unsupported | unsupported | unsupported | unsupported | | connected? 
| works | works | works | works | diff --git a/docs/choria-transport.md b/docs/choria-transport.md index dac29e9e8..d5475ba85 100644 --- a/docs/choria-transport.md +++ b/docs/choria-transport.md @@ -86,24 +86,37 @@ targets: ### Config option reference -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `choria-agent` | String | `bolt_tasks` | Agent for task execution: `bolt_tasks` or `shell`. Also available as `--choria-agent` CLI flag. | -| `cleanup` | Boolean | `true` | Clean up temp directories after operations. Set to `false` for debugging. | -| `collective` | String | (from Choria config file) | Choria collective to route messages through. Per-target. | -| `command-timeout` | Integer | `60` | Seconds to wait for commands and scripts to complete. | -| `config-file` | String | (auto-detected) | Path to a Choria/MCollective client config file. | -| `host` | String | (from URI) | Target's Choria identity (FQDN). Overrides the hostname from the URI. | -| `interpreters` | Hash | (none) | File extension to interpreter mapping (e.g., `{".rb": "/usr/bin/ruby"}`). | -| `nats-connection-timeout` | Integer | `30` | Seconds to wait for the TCP connection to the NATS broker. | -| `nats-servers` | String or Array | (from Choria config file) | NATS broker addresses. Overrides the config file. | -| `puppet-environment` | String | `production` | Puppet environment for bolt_tasks file URIs. | -| `rpc-timeout` | Integer | `30` | Seconds to wait for replies to individual RPC calls. | -| `ssl-ca` | String | (from Choria config file) | CA certificate path for TLS. | -| `ssl-cert` | String | (from Choria config file) | Client certificate path for TLS. | -| `ssl-key` | String | (from Choria config file) | Client private key path for TLS. | -| `task-timeout` | Integer | `300` | Seconds to wait for task execution to complete. | -| `tmpdir` | String | `/tmp` or `C:\Windows\Temp` | Base path for temp directories on remote nodes. Must be absolute. 
| +| Option | CLI Flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `task-agent` | `--choria-task-agent` | String | `bolt_tasks` | Agent for task execution: `bolt_tasks` or `shell`. | +| `cleanup` | `--cleanup` | Boolean | `true` | Clean up temp directories after operations. Set to `false` for debugging. | +| `collective` | `--choria-collective` | String | (from config file) | Choria collective to route messages through. Per-target. | +| `command-timeout` | `--choria-command-timeout` | Integer | `60` | Seconds to wait for commands and scripts to complete. | +| `config-file` | `--choria-config-file` | String | (auto-detected) | Path to a Choria/MCollective client config file. | +| `host` | | String | (from URI) | Target's Choria identity (FQDN). Overrides the hostname from the URI. | +| `interpreters` | | Hash | (none) | File extension to interpreter mapping (e.g., `{".rb": "/usr/bin/ruby"}`). | +| `nats-connection-timeout` | `--nats-connection-timeout` | Integer | `30` | Seconds to wait for the TCP connection to the NATS broker. | +| `nats-servers` | `--nats-servers` | String or Array | (from config file) | NATS broker addresses in `nats://host:port` format (comma-separated for multiple). Multiple servers provide failover if a broker is unavailable. Overrides the config file. | +| `puppet-environment` | `--choria-puppet-environment` | String | `production` | Puppet environment for bolt_tasks file URIs. | +| `rpc-timeout` | `--choria-rpc-timeout` | Integer | `30` | Seconds to wait for replies to individual RPC calls. | +| `ssl-ca` | `--choria-ssl-ca` | String | (from config file) | CA certificate path for TLS. | +| `ssl-cert` | `--choria-ssl-cert` | String | (from config file) | Client certificate path for TLS. | +| `ssl-key` | `--choria-ssl-key` | String | (from config file) | Client private key path for TLS. | +| `task-timeout` | `--choria-task-timeout` | Integer | `300` | Seconds to wait for task execution to complete. 
| +| `tmpdir` | `--tmpdir` | String | `/tmp` or `C:\Windows\Temp` | Base path for temp directories on remote nodes. Must be absolute. | + +**CLI flag precedence:** CLI flags provide default values that can be +overridden by inventory-level config (per-group or per-target). For example, +if a target has `collective: staging` in its inventory entry and +`--choria-collective production` is passed on the CLI, the inventory value +wins. For ad-hoc targets specified via `--targets` that aren't defined in an +inventory file, CLI flags take full effect. + +For options that have corresponding values in the Choria config file +(`nats-servers`, `ssl-ca`/`ssl-cert`/`ssl-key`, and `collective`), the full +precedence from lowest to highest is: Choria config file < CLI flags < +inventory. All other options use OpenBolt-level defaults and are not affected by +the Choria config file. **Timeout hierarchy:** Three levels of timeout control different things: - `nats-connection-timeout` (30s): How long to wait for the initial TCP @@ -160,7 +173,7 @@ Works with either the **bolt_tasks** or **shell** agent. bolt task run facts --targets node1.example.com # Use shell agent for local tasks not on the OpenVox/Puppet Server -bolt task run my_project::check --targets node1.example.com --choria-agent shell +bolt task run my_project::check --targets node1.example.com --choria-task-agent shell ``` Agent selection is deterministic with no automatic fallback. If the selected @@ -198,7 +211,7 @@ installed on target nodes. With the shell agent: - `run_command` and `run_script` work - `run_task` can use either agent (bolt_tasks by default, or shell with - `--choria-agent shell`) + `--choria-task-agent shell`) The shell agent DDL (required by the client library) is bundled with OpenBolt and loaded automatically. No client-side setup is needed. 
@@ -270,8 +283,8 @@ modulepath: Server-side paths are listed first so that OpenBolt reads the same module versions that the bolt_tasks agent will download from the server. When using -`--choria-agent shell`, OpenBolt uploads task files directly, so local modules -should take precedence instead — put `modules` first or omit the server paths. +`--choria-task-agent shell`, OpenBolt uploads task files directly, so local modules +should take precedence instead -- put `modules` first or omit the server paths. OpenBolt also auto-injects its own internal paths (visible in `--log-level debug` output): `bolt-modules` is prepended, and `.modules` plus the gem's built-in @@ -308,22 +321,22 @@ bolt/choria-task-download-failed: ... ruby_task_helper/files/task_helper.rb: 404 ### Using the shell agent for tasks If a task is not available on the OpenVox/Puppet Server (e.g., it's a local project -task), set `choria-agent` to `shell` to upload and execute it directly via +task), set `task-agent` to `shell` to upload and execute it directly via the shell agent, bypassing the OpenVox/Puppet Server entirely: ```yaml # bolt-project.yaml choria: - choria-agent: shell + task-agent: shell ``` Or per-invocation: ```bash -bolt task run my_project::check --targets node1 --choria-agent shell +bolt task run my_project::check --targets node1 --choria-task-agent shell ``` -When using `--choria-agent shell`, the OpenVox/Puppet Server requirement is bypassed +When using `--choria-task-agent shell`, the OpenVox/Puppet Server requirement is bypassed entirely. OpenBolt uploads task files directly via the shell agent, so only the local modulepath matters. @@ -338,7 +351,7 @@ local modulepath matters. 3. **bolt_tasks requires an OpenVox/Puppet Server.** The bolt_tasks agent downloads task files from the OpenVox/Puppet Server. Tasks not served by the OpenVox/Puppet Server - will fail with an error suggesting `--choria-agent shell`. + will fail with an error suggesting `--choria-task-agent shell`. 4. 
**No streaming output.** All output is returned on completion, not streamed incrementally. @@ -354,7 +367,7 @@ local modulepath matters. after OpenBolt reports a timeout (bolt_tasks has no kill mechanism). 8. **File size limit for shell agent uploads.** When using the shell agent - (`run_script`, `run_task` with `--choria-agent shell`), files are + (`run_script`, `run_task` with `--choria-task-agent shell`), files are base64-encoded and sent as RPC messages. The maximum file size is limited by the NATS max message size (default 1MB, roughly 750KB effective after base64 overhead). Increase `plugin.choria.network.client_max_payload` in diff --git a/lib/bolt/bolt_option_parser.rb b/lib/bolt/bolt_option_parser.rb index 6865a3975..6f0520afd 100644 --- a/lib/bolt/bolt_option_parser.rb +++ b/lib/bolt/bolt_option_parser.rb @@ -13,6 +13,9 @@ class BoltOptionParser < OptionParser run_context: %w[concurrency inventoryfile save-rerun cleanup puppetdb], global_config_setters: PROJECT_PATHS + %w[modulepath], transports: %w[transport connect-timeout tty native-ssh ssh-command copy-command], + choria: %w[config-file ssl-ca ssl-cert ssl-key collective + puppet-environment rpc-timeout task-timeout command-timeout + nats-servers nats-connection-timeout], display: %w[format color verbose trace stream], global: %w[help version log-level clear-cache] }.freeze @@ -168,7 +171,7 @@ def get_help_text(subcommand, action = nil) when 'task' case action when 'run' - { flags: ACTION_OPTS + %w[params tmpdir noop choria-agent], + { flags: ACTION_OPTS + %w[params tmpdir noop task-agent], banner: TASK_RUN_HELP } when 'show' { flags: OPTIONS[:global] + OPTIONS[:global_config_setters] + %w[filter format], @@ -1095,10 +1098,54 @@ def initialize(options) define('--tmpdir DIR', 'The directory to upload and execute temporary files on the target.') do |tmpdir| @options[:tmpdir] = tmpdir end - define('--choria-agent AGENT', %w[bolt_tasks shell], + define('--choria-task-agent AGENT', %w[bolt_tasks shell], 
"Which Choria agent to use for task execution (bolt_tasks, shell).", "Defaults to 'bolt_tasks'. Set to 'shell' for tasks not on the Puppet Server.") do |agent| - @options[:'choria-agent'] = agent + @options[:'task-agent'] = agent + end + define('--choria-config-file PATH', + 'Path to a Choria/MCollective client configuration file.') do |path| + @options[:'config-file'] = path + end + define('--choria-ssl-ca PATH', + 'CA certificate path for Choria TLS authentication.') do |path| + @options[:'ssl-ca'] = path + end + define('--choria-ssl-cert PATH', + 'Client certificate path for Choria TLS authentication.') do |path| + @options[:'ssl-cert'] = path + end + define('--choria-ssl-key PATH', + 'Client private key path for Choria TLS authentication.') do |path| + @options[:'ssl-key'] = path + end + define('--choria-collective NAME', + 'Choria collective to route messages through.') do |name| + @options[:collective] = name + end + define('--choria-puppet-environment ENV', + "Puppet environment for bolt_tasks file downloads (default: 'production').") do |env| + @options[:'puppet-environment'] = env + end + define('--choria-rpc-timeout SECONDS', Integer, + 'Seconds to wait for replies to individual Choria RPC calls (default: 30).') do |timeout| + @options[:'rpc-timeout'] = timeout + end + define('--choria-task-timeout SECONDS', Integer, + 'Seconds to wait for task execution to complete (default: 300).') do |timeout| + @options[:'task-timeout'] = timeout + end + define('--choria-command-timeout SECONDS', Integer, + 'Seconds to wait for commands and scripts to complete (default: 60).') do |timeout| + @options[:'command-timeout'] = timeout + end + define('--nats-servers SERVERS', + 'NATS broker addresses in nats://host:port format (comma-separated for multiple).') do |servers| + @options[:'nats-servers'] = servers + end + define('--nats-connection-timeout SECONDS', Integer, + 'Seconds to wait for the TCP connection to the NATS broker (default: 30).') do |timeout| + 
@options[:'nats-connection-timeout'] = timeout end separator "\n#{self.class.colorize(:cyan, 'Module options')}" diff --git a/lib/bolt/config/transport/choria.rb b/lib/bolt/config/transport/choria.rb index 3ebab3ac0..8c90011fd 100644 --- a/lib/bolt/config/transport/choria.rb +++ b/lib/bolt/config/transport/choria.rb @@ -8,7 +8,6 @@ class Config module Transport class Choria < Base OPTIONS = %w[ - choria-agent cleanup collective command-timeout @@ -22,6 +21,7 @@ class Choria < Base ssl-ca ssl-cert ssl-key + task-agent task-timeout tmpdir ].sort.freeze @@ -41,9 +41,9 @@ class Choria < Base private def validate super - if @config['choria-agent'] && !VALID_AGENTS.include?(@config['choria-agent']) + if @config['task-agent'] && !VALID_AGENTS.include?(@config['task-agent']) raise Bolt::ValidationError, - "choria-agent must be one of #{VALID_AGENTS.join(', ')}, got '#{@config['choria-agent']}'" + "task-agent must be one of #{VALID_AGENTS.join(', ')}, got '#{@config['task-agent']}'" end if @config['tmpdir'] && !absolute_path?(@config['tmpdir']) diff --git a/lib/bolt/config/transport/options.rb b/lib/bolt/config/transport/options.rb index 272998ff8..949365863 100644 --- a/lib/bolt/config/transport/options.rb +++ b/lib/bolt/config/transport/options.rb @@ -51,7 +51,7 @@ module Options _default: true, _example: false }, - "choria-agent" => { + "task-agent" => { type: String, description: "Which Choria agent to use for task execution. Defaults to 'bolt_tasks' " \ "(downloads task files from a Puppet Server). Set to 'shell' for tasks " \ diff --git a/lib/bolt/transport/choria.rb b/lib/bolt/transport/choria.rb index 1e2389b22..ec8440c44 100644 --- a/lib/bolt/transport/choria.rb +++ b/lib/bolt/transport/choria.rb @@ -26,8 +26,8 @@ module Transport # # shell agent installed (>= 1.2.0): run_command, run_script, and # run_task work. run_task uses the bolt_tasks agent by default. 
- # To run local tasks via the shell agent, set choria-agent to 'shell' - # in project config or specify --choria-agent shell. + # To run local tasks via the shell agent, set task-agent to 'shell' + # in project config or specify --choria-task-agent shell. # # Upload, download, and plans are not yet supported. class Choria < Base @@ -97,7 +97,7 @@ def batches(targets) # @param callback [Proc] Called with :node_start and :node_result events # @return [Array] Results for all targets (successes and failures) def batch_task(targets, task, arguments, _options = {}, position = [], &callback) - chosen_agent = targets.first.options['choria-agent'] || 'bolt_tasks' + chosen_agent = targets.first.options['task-agent'] || 'bolt_tasks' result_opts = { action: 'task', name: task.name, position: position } # The results var here is the error results for incapable targets, to which we'll add in @@ -117,7 +117,7 @@ def batch_task(targets, task, arguments, _options = {}, position = [], &callback run_task_via_shell(capable, task, arguments, result_opts, &callback) else raise Bolt::Error.new( - "Unsupported choria-agent '#{chosen_agent}'", + "Unsupported task-agent '#{chosen_agent}'", 'bolt/choria-unsupported-agent' ) end diff --git a/schemas/bolt-defaults.schema.json b/schemas/bolt-defaults.schema.json index b93bf4abb..9cc2b312d 100644 --- a/schemas/bolt-defaults.schema.json +++ b/schemas/bolt-defaults.schema.json @@ -518,16 +518,6 @@ { "type": "object", "properties": { - "choria-agent": { - "oneOf": [ - { - "$ref": "#/transport_definitions/choria-agent" - }, - { - "$ref": "#/definitions/_plugin" - } - ] - }, "cleanup": { "oneOf": [ { @@ -658,6 +648,16 @@ } ] }, + "task-agent": { + "oneOf": [ + { + "$ref": "#/transport_definitions/task-agent" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "task-timeout": { "oneOf": [ { @@ -1807,7 +1807,7 @@ } ] }, - "choria-agent": { + "task-agent": { "description": "Which Choria agent to use for task execution. 
Defaults to 'bolt_tasks' (downloads task files from a Puppet Server). Set to 'shell' for tasks not available on the Puppet Server.", "oneOf": [ { diff --git a/schemas/bolt-inventory.schema.json b/schemas/bolt-inventory.schema.json index 2437c5cc0..bce38b0f9 100644 --- a/schemas/bolt-inventory.schema.json +++ b/schemas/bolt-inventory.schema.json @@ -63,17 +63,6 @@ { "type": "object", "properties": { - "choria-agent": { - "description": "Which Choria agent to use for task execution. Defaults to 'bolt_tasks' (downloads task files from a Puppet Server). Set to 'shell' for tasks not available on the Puppet Server.", - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/definitions/_plugin" - } - ] - }, "cleanup": { "description": "Whether to clean up temporary files created on targets. When running commands on a target, Bolt might create temporary files. After completing the command, these files are automatically deleted. This value can be set to 'false' if you wish to leave these temporary files on the target.", "oneOf": [ @@ -242,6 +231,17 @@ } ] }, + "task-agent": { + "description": "Which Choria agent to use for task execution. Defaults to 'bolt_tasks' (downloads task files from a Puppet Server). 
Set to 'shell' for tasks not available on the Puppet Server.", + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/_plugin" + } + ] + }, "task-timeout": { "description": "How long to wait in seconds for tasks to complete when using the Choria transport.", "oneOf": [ diff --git a/spec/unit/config/transport/choria_spec.rb b/spec/unit/config/transport/choria_spec.rb index d05c1e539..24ef024d0 100644 --- a/spec/unit/config/transport/choria_spec.rb +++ b/spec/unit/config/transport/choria_spec.rb @@ -22,7 +22,7 @@ context 'validating' do include_examples 'interpreters' - %w[choria-agent config-file collective host puppet-environment ssl-ca ssl-cert ssl-key tmpdir].each do |opt| + %w[task-agent config-file collective host puppet-environment ssl-ca ssl-cert ssl-key tmpdir].each do |opt| it "#{opt} rejects non-string value" do data[opt] = 100 expect { transport.new(data) }.to raise_error(Bolt::ValidationError) @@ -56,14 +56,14 @@ expect { transport.new(data) }.to raise_error(Bolt::ValidationError) end - it 'choria-agent rejects invalid values' do - data['choria-agent'] = 'not-an-agent' - expect { transport.new(data) }.to raise_error(Bolt::ValidationError, /choria-agent must be one of/) + it 'task-agent rejects invalid values' do + data['task-agent'] = 'not-an-agent' + expect { transport.new(data) }.to raise_error(Bolt::ValidationError, /task-agent must be one of/) end %w[bolt_tasks shell].each do |agent| - it "choria-agent accepts '#{agent}'" do - data['choria-agent'] = agent + it "task-agent accepts '#{agent}'" do + data['task-agent'] = agent expect { transport.new(data) }.not_to raise_error end end diff --git a/spec/unit/transport/choria/shell_spec.rb b/spec/unit/transport/choria/shell_spec.rb index 647786324..bf95af703 100644 --- a/spec/unit/transport/choria/shell_spec.rb +++ b/spec/unit/transport/choria/shell_spec.rb @@ -328,7 +328,7 @@ describe '#run_task_via_shell' do before(:each) do - inventory.set_config(target, %w[choria choria-agent], 'shell') + 
inventory.set_config(target, %w[choria task-agent], 'shell') stub_agents(target, %w[rpcutil shell]) stub_shell_run stub_shell_start diff --git a/spec/unit/transport/choria_spec.rb b/spec/unit/transport/choria_spec.rb index 5e1bd37cd..7c330f7bd 100644 --- a/spec/unit/transport/choria_spec.rb +++ b/spec/unit/transport/choria_spec.rb @@ -192,10 +192,10 @@ end end - context 'with forced choria-agent' do + context 'with forced task-agent' do it 'uses only bolt_tasks when forced' do stub_agents(target, %w[rpcutil bolt_tasks shell]) - inventory.set_config(target, %w[choria choria-agent], 'bolt_tasks') + inventory.set_config(target, %w[choria task-agent], 'bolt_tasks') expect(transport).to receive(:run_task_via_bolt_tasks).and_return( [Bolt::Result.for_task(target, '{}', '', 0, task_name, [])] @@ -207,7 +207,7 @@ it 'uses only shell when forced' do stub_agents(target, %w[rpcutil bolt_tasks shell]) - inventory.set_config(target, %w[choria choria-agent], 'shell') + inventory.set_config(target, %w[choria task-agent], 'shell') expect(transport).not_to receive(:run_task_via_bolt_tasks) expect(transport).to receive(:run_task_via_shell).and_return( @@ -219,7 +219,7 @@ it 'returns error when forced agent is not available on target' do stub_agents(target, %w[rpcutil bolt_tasks]) - inventory.set_config(target, %w[choria choria-agent], 'shell') + inventory.set_config(target, %w[choria task-agent], 'shell') result = transport.batch_task([target], task, {}).first expect(result.ok?).to be false @@ -228,11 +228,11 @@ it 'raises for invalid forced agent value' do stub_agents(target, %w[rpcutil bolt_tasks shell invalid_agent]) - inventory.set_config(target, %w[choria choria-agent], 'invalid_agent') + inventory.set_config(target, %w[choria task-agent], 'invalid_agent') expect { transport.batch_task([target], task, {}) - }.to raise_error(Bolt::ValidationError, /choria-agent must be/) + }.to raise_error(Bolt::ValidationError, /task-agent must be/) end end end @@ -287,9 +287,9 @@ 
expect(error_results.first.error_hash['msg']).to match(/No agent information.*did not respond to discovery/) end - it 'uses shell agent for all targets when choria-agent is shell' do - inventory.set_config(target, %w[choria choria-agent], 'shell') - inventory.set_config(target2, %w[choria choria-agent], 'shell') + it 'uses shell agent for all targets when task-agent is shell' do + inventory.set_config(target, %w[choria task-agent], 'shell') + inventory.set_config(target2, %w[choria task-agent], 'shell') stub_agents([target, target2], %w[rpcutil shell]) allow(mock_rpc_client).to receive_messages( From 781974c52cd6d408c6aa20b8bac03b86873cb5dd Mon Sep 17 00:00:00 2001 From: nmburgan <13688219+nmburgan@users.noreply.github.com> Date: Fri, 27 Mar 2026 22:28:41 +0000 Subject: [PATCH 6/8] Fix help text filtering for Choria CLI flags BoltOptionParser::OPTIONS[:choria] needs CLI switch names (e.g., choria-config-file) not internal keys (config-file) so that remove_excluded_opts correctly includes them in --help output. Also fix task-agent -> choria-task-agent in the task run flags list. 
--- lib/bolt/bolt_option_parser.rb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/bolt/bolt_option_parser.rb b/lib/bolt/bolt_option_parser.rb index 6f0520afd..c18055270 100644 --- a/lib/bolt/bolt_option_parser.rb +++ b/lib/bolt/bolt_option_parser.rb @@ -13,9 +13,10 @@ class BoltOptionParser < OptionParser run_context: %w[concurrency inventoryfile save-rerun cleanup puppetdb], global_config_setters: PROJECT_PATHS + %w[modulepath], transports: %w[transport connect-timeout tty native-ssh ssh-command copy-command], - choria: %w[config-file ssl-ca ssl-cert ssl-key collective - puppet-environment rpc-timeout task-timeout command-timeout - nats-servers nats-connection-timeout], + choria: %w[choria-config-file choria-ssl-ca choria-ssl-cert choria-ssl-key + choria-collective choria-puppet-environment choria-rpc-timeout + choria-task-timeout choria-command-timeout nats-servers + nats-connection-timeout], display: %w[format color verbose trace stream], global: %w[help version log-level clear-cache] }.freeze @@ -171,7 +172,7 @@ def get_help_text(subcommand, action = nil) when 'task' case action when 'run' - { flags: ACTION_OPTS + %w[params tmpdir noop task-agent], + { flags: ACTION_OPTS + %w[params tmpdir noop choria-task-agent], banner: TASK_RUN_HELP } when 'show' { flags: OPTIONS[:global] + OPTIONS[:global_config_setters] + %w[filter format], From 411d044bca95dec04ba56affe3a13b0f01d0ce9c Mon Sep 17 00:00:00 2001 From: nmburgan <13688219+nmburgan@users.noreply.github.com> Date: Fri, 27 Mar 2026 22:49:42 +0000 Subject: [PATCH 7/8] Update Pester parameter counts for new Choria CLI flags The 11 new Choria flags added to ACTION_OPTS increase the parameter count for bolt apply, bolt command, and bolt file. 
--- pwsh_module/command.tests.ps1 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pwsh_module/command.tests.ps1 b/pwsh_module/command.tests.ps1 index 763b01154..d3ffa46ec 100644 --- a/pwsh_module/command.tests.ps1 +++ b/pwsh_module/command.tests.ps1 @@ -55,7 +55,7 @@ Describe "test bolt command syntax" { It "has correct number of parameters" { ($command.Parameters.Values | Where-Object { $_.name -notin $common - } | measure-object).Count | Should -Be 38 + } | measure-object).Count | Should -Be 49 } } @@ -73,7 +73,7 @@ Describe "test bolt command syntax" { It "has correct number of parameters" { ($command.Parameters.Values | Where-Object { $_.name -notin $common - } | measure-object).Count | Should -Be 35 + } | measure-object).Count | Should -Be 46 } } @@ -95,7 +95,7 @@ Describe "test bolt command syntax" { It "has correct number of parameters" { ($command.Parameters.Values | Where-Object { $_.name -notin $common - } | measure-object).Count | Should -Be 36 + } | measure-object).Count | Should -Be 47 } } From 54f2bae7b8da59ca29b103a53ff1b492a204affd Mon Sep 17 00:00:00 2001 From: nmburgan <13688219+nmburgan@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:09:17 +0000 Subject: [PATCH 8/8] Bump minimum shell agent version from 1.2.0 to 1.2.1 The shell agent 1.2.0 release was broken, so 1.2.1 is now the working minimum version. 
--- docs/choria-transport-dev.md | 8 ++++---- docs/choria-transport-plan.md | 2 +- docs/choria-transport-testing.md | 4 ++-- docs/choria-transport.md | 14 +++++++------- lib/bolt/transport/choria.rb | 2 +- lib/bolt/transport/choria/agent_discovery.rb | 2 +- lib/bolt/transport/choria/shell.rb | 2 +- lib/mcollective/agent/shell.ddl | 2 +- spec/lib/bolt_spec/choria.rb | 4 ++-- spec/unit/transport/choria/agent_discovery_spec.rb | 10 +++++----- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/choria-transport-dev.md b/docs/choria-transport-dev.md index 9997a26de..eb85b9740 100644 --- a/docs/choria-transport-dev.md +++ b/docs/choria-transport-dev.md @@ -60,7 +60,7 @@ gem, which provides `MCollective::RPC::Client`. Despite the MCollective name - **shell**: A separate plugin ([choria-plugins/shell-agent](https://github.com/choria-plugins/shell-agent)). Provides synchronous (`run`) and asynchronous (`start`/`list`/`statuses`/`kill`) - command execution. Version 1.2.0+ required for the batched `statuses` action. + command execution. Version 1.2.1+ required for the batched `statuses` action. ## File Layout @@ -93,7 +93,7 @@ identity. Each entry stores `{ agents: [...], os: 'redhat'|'windows'|... }`. The cache lives for the transport instance's lifetime. Non-responding targets are not cached (intentional, to allow retry on transient failures). -Agent versions are checked against `AGENT_MIN_VERSIONS` (e.g., shell >= 1.2.0). +Agent versions are checked against `AGENT_MIN_VERSIONS` (e.g., shell >= 1.2.1). Agents below the minimum are excluded from the cache and logged as warnings. ### RPC Request Pipeline @@ -368,7 +368,7 @@ error. This is simpler and more predictable than a try-and-fallback approach. Shell polling uses `shell.list` + `shell.statuses` instead of per-handle `shell.status` calls. This reduces RPC overhead from O(N) per round to O(1) -per target node, making it feasible at scale.
This is why version 1.2.0 +per target node, making it feasible at scale. This is why version 1.2.1 -of the shell agent is required, since this is the version version to include +of the shell agent is required, since this is the first working version to include `shell.statuses`. diff --git a/docs/choria-transport-plan.md b/docs/choria-transport-plan.md index ed9ef2d21..d5839ee76 100644 --- a/docs/choria-transport-plan.md +++ b/docs/choria-transport-plan.md @@ -23,7 +23,7 @@ which Choria agents are available on the remote nodes: | Phase | Agents Required | Capabilities Added | |-------|----------------|-------------------| | Phase 1 | bolt_tasks (ships with Choria+Puppet) | `run_task` (OpenVox/Puppet Server tasks only) | -| Phase 2 | shell >= 1.2.0 (separate install) | `run_command`, `run_script`, `run_task` (local tasks) | +| Phase 2 | shell >= 1.2.1 (separate install) | `run_command`, `run_script`, `run_task` (local tasks) | | Phase 3 | bolt_tasks | [foreman_openbolt](https://github.com/overlookinfra/foreman_openbolt) and [smart_proxy_openbolt](https://github.com/overlookinfra/smart_proxy_openbolt) Choria transport support (bolt_tasks only) | | Phase 4 | file-transfer (new, to be written) | `upload`, `download` (any size, chunked) | | Phase 5 | (all above) | Full plan support including apply blocks | diff --git a/docs/choria-transport-testing.md b/docs/choria-transport-testing.md index da73d775f..07b88bfcd 100644 --- a/docs/choria-transport-testing.md +++ b/docs/choria-transport-testing.md @@ -221,10 +221,10 @@ mod 'choria-mcollective', :latest mod 'choria-mcollective_choria', :latest mod 'mcollective_agent_shell', git: 'https://github.com/choria-plugins/shell-agent', - ref: '1.2.0' + ref: '1.2.1' ``` -The shell agent requires version 1.2.0 or later (for the batched `statuses` +The shell agent requires version 1.2.1 or later (for the batched `statuses` action). The `bolt_tasks` agent is included in a standard Choria install and does not need a separate Puppetfile entry.
diff --git a/docs/choria-transport.md b/docs/choria-transport.md index d5475ba85..4882f070b 100644 --- a/docs/choria-transport.md +++ b/docs/choria-transport.md @@ -24,7 +24,7 @@ For test environment setup, see [choria-transport-testing.md](choria-transport-t - A Choria client config file on the OpenBolt controller - At least one of the supported agents installed on target nodes: - **bolt_tasks** (ships with Puppet-enabled Choria setups) - - **shell** (separate install, version 1.2.0 or later) + - **shell** (separate install, version 1.2.1 or later) ## Configuration @@ -134,7 +134,7 @@ validation. ### run_command -Requires the **shell agent** (>= 1.2.0) on target nodes. +Requires the **shell agent** (>= 1.2.1) on target nodes. ```bash bolt command run 'hostname -f' --targets node1.example.com,node2.example.com @@ -146,7 +146,7 @@ process is killed on the target node. ### run_script -Requires the **shell agent** (>= 1.2.0) on target nodes. +Requires the **shell agent** (>= 1.2.1) on target nodes. ```bash bolt script run ./check_disk.sh --targets node1.example.com @@ -205,7 +205,7 @@ task files from an OpenVox/Puppet Server and executes them. This means: ### shell (separate install) The [shell agent](https://github.com/choria-plugins/shell-agent) is a -separate Choria plugin. Version 1.2.0 or later is required. It must be +separate Choria plugin. Version 1.2.1 or later is required. It must be installed on target nodes. With the shell agent: @@ -220,7 +220,7 @@ and loaded automatically. No client-side setup is needed. On first contact with a target, the transport automatically discovers which agents are installed and what OS the target is running. This happens -transparently. Agents below the required minimum version (e.g., shell < 1.2.0) +transparently. Agents below the required minimum version (e.g., shell < 1.2.1) are excluded and treated as unavailable. 
If a target is missing the required agent, it gets a clear error result with @@ -235,7 +235,7 @@ Install via Puppet by referencing the GitHub repository in your Puppetfile: ```ruby mod 'mcollective_agent_shell', git: 'https://github.com/choria-plugins/shell-agent', - ref: 'v1.2.0' + ref: 'v1.2.1' ``` Deploy with r10k or Code Manager, then apply via Hiera: @@ -347,7 +347,7 @@ local modulepath matters. 2. **Shell agent not installed by default.** Without it, only `run_task` (via bolt_tasks + OpenVox/Puppet Server) works. All other operations fail with a - clear error message. Version 1.2.0 or later is required. + clear error message. Version 1.2.1 or later is required. 3. **bolt_tasks requires an OpenVox/Puppet Server.** The bolt_tasks agent downloads task files from the OpenVox/Puppet Server. Tasks not served by the OpenVox/Puppet Server diff --git a/lib/bolt/transport/choria.rb b/lib/bolt/transport/choria.rb index ec8440c44..15b321be5 100644 --- a/lib/bolt/transport/choria.rb +++ b/lib/bolt/transport/choria.rb @@ -24,7 +24,7 @@ module Transport # them via task_wrapper. All other operations fail with an actionable # error directing the user to install the shell agent. # - # shell agent installed (>= 1.2.0): run_command, run_script, and + # shell agent installed (>= 1.2.1): run_command, run_script, and # run_task work. run_task uses the bolt_tasks agent by default. # To run local tasks via the shell agent, set task-agent to 'shell' # in project config or specify --choria-task-agent shell. 
diff --git a/lib/bolt/transport/choria/agent_discovery.rb b/lib/bolt/transport/choria/agent_discovery.rb index 61f4de8e7..66fc1c248 100644 --- a/lib/bolt/transport/choria/agent_discovery.rb +++ b/lib/bolt/transport/choria/agent_discovery.rb @@ -3,7 +3,7 @@ module Bolt module Transport class Choria - SHELL_MIN_VERSION = '1.2.0' + SHELL_MIN_VERSION = '1.2.1' AGENT_MIN_VERSIONS = { 'shell' => SHELL_MIN_VERSION diff --git a/lib/bolt/transport/choria/shell.rb b/lib/bolt/transport/choria/shell.rb index 6e8d737b2..f0b141032 100644 --- a/lib/bolt/transport/choria/shell.rb +++ b/lib/bolt/transport/choria/shell.rb @@ -466,7 +466,7 @@ def shell_list(remaining) end # Fetch stdout/stderr/exitcode from completed targets via the - # shell.statuses RPC action. Requires shell agent >= 1.2.0. + # shell.statuses RPC action. Requires shell agent >= 1.2.1. # # @param targets [Hash{Bolt::Target => Hash}] Completed targets mapped to { handle: uuid_string } # @return [Hash{Bolt::Target => Hash}] Output hash for each target diff --git a/lib/mcollective/agent/shell.ddl b/lib/mcollective/agent/shell.ddl index 1e0294cd5..ef8b4c3ce 100644 --- a/lib/mcollective/agent/shell.ddl +++ b/lib/mcollective/agent/shell.ddl @@ -2,7 +2,7 @@ metadata :name => "shell", :description => "Run commands with the local shell", :author => "Puppet Labs", :license => "Apache-2.0", - :version => "1.2.0", + :version => "1.2.1", :url => "https://github.com/choria-plugins/shell-agent", :timeout => 180 diff --git a/spec/lib/bolt_spec/choria.rb b/spec/lib/bolt_spec/choria.rb index edc4a96fd..7fe8e1d74 100644 --- a/spec/lib/bolt_spec/choria.rb +++ b/spec/lib/bolt_spec/choria.rb @@ -40,7 +40,7 @@ def make_rpc_result(sender:, statuscode: 0, statusmsg: 'OK', data: {}) # variables are cleared between `it` blocks. # # Accepts Bolt::Target objects or host strings, single or as an array. 
- # Agents can be strings (version defaults to '1.2.0') or [name, version] + # Agents can be strings (version defaults to '1.2.1') or [name, version] # pairs for version-specific scenarios. # # stub_agents(target, %w[rpcutil shell]) @@ -51,7 +51,7 @@ def stub_agents(targets, agents, os_family: 'RedHat') targets = [targets].flatten agent_data = agents.map do |agent| - name, version = agent.is_a?(Array) ? agent : [agent, '1.2.0'] + name, version = agent.is_a?(Array) ? agent : [agent, '1.2.1'] { 'agent' => name, 'name' => name, 'version' => version } end diff --git a/spec/unit/transport/choria/agent_discovery_spec.rb b/spec/unit/transport/choria/agent_discovery_spec.rb index ddef9bc80..2e1b7e501 100644 --- a/spec/unit/transport/choria/agent_discovery_spec.rb +++ b/spec/unit/transport/choria/agent_discovery_spec.rb @@ -16,7 +16,7 @@ agents: [{ 'agent' => 'rpcutil' }, { 'agent' => 'bolt_tasks' }] }) r2 = make_rpc_result(sender: target2, data: { - agents: [{ 'agent' => 'rpcutil' }, { 'agent' => 'shell', 'version' => '1.2.0' }] + agents: [{ 'agent' => 'rpcutil' }, { 'agent' => 'shell', 'version' => '1.2.1' }] }) f1 = make_rpc_result(sender: target, data: { value: 'RedHat' }) @@ -80,7 +80,7 @@ it 'discards responses from unexpected senders' do legit = make_rpc_result(sender: target, data: { agents: [{ 'agent' => 'rpcutil', 'version' => '1.0.0' }, - { 'agent' => 'shell', 'version' => '1.2.0' }] + { 'agent' => 'shell', 'version' => '1.2.1' }] }) rogue = make_rpc_result(sender: 'evil.example.com', data: { agents: [{ 'agent' => 'rpcutil' }, { 'agent' => 'bolt_tasks' }] @@ -198,7 +198,7 @@ it 'defaults to POSIX when OS detection fails' do result = make_rpc_result(sender: target, data: { agents: [{ 'agent' => 'rpcutil', 'version' => '1.0.0' }, - { 'agent' => 'shell', 'version' => '1.2.0' }] + { 'agent' => 'shell', 'version' => '1.2.1' }] }) allow(mock_rpc_client).to receive(:agent_inventory).and_return([result]) @@ -212,7 +212,7 @@ it 'defaults to POSIX when os.family fact is an 
empty string' do result = make_rpc_result(sender: target, data: { agents: [{ 'agent' => 'rpcutil', 'version' => '1.0.0' }, - { 'agent' => 'shell', 'version' => '1.2.0' }] + { 'agent' => 'shell', 'version' => '1.2.1' }] }) fact_result = make_rpc_result(sender: target, data: { value: '' }) @@ -226,7 +226,7 @@ it 're-raises Bolt::Error from OS detection instead of swallowing it' do result = make_rpc_result(sender: target, data: { agents: [{ 'agent' => 'rpcutil', 'version' => '1.0.0' }, - { 'agent' => 'shell', 'version' => '1.2.0' }] + { 'agent' => 'shell', 'version' => '1.2.1' }] }) allow(mock_rpc_client).to receive(:agent_inventory).and_return([result])