Skip to content

Commit e5f453e

Browse files
authored
CP-311893: cross host backtraces for xe --trace (#6955)
xe `--trace` has existed since ~2014, but it isn't documented in `--help` and is therefore not well known. It also only worked on a single host, which limited its usefulness in a pool. However, propagating backtraces between XAPIs in a pool is doable by using the already existing `backtrace` field in the Task object. Working cross-host backtraces appear to have been the original design goal in [doc/content/design/backtraces.md](https://github.com/xapi-project/xen-api/blob/master/doc/content/design/backtraces.md). In theory this should also work cross-language with Python SM backends; however, some plumbing is missing there, and it currently doesn't work with either SMAPIv1 or SMAPIv3. Fixing that should be the topic of another PR (by someone else).
2 parents ea2a845 + f445a2c commit e5f453e

13 files changed

Lines changed: 79 additions & 11 deletions

File tree

ocaml/libs/log/debug.ml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,10 @@ let facility = ref Syslog.Daemon
135135

136136
let set_facility f = facility := f
137137

138+
(* Formats Sys.argv.(0) and [this_host_name] as [<program> @ <host>] and
   passes the resulting string to [Backtrace.set_my_name]. *)
let set_backtrace_name this_host_name =
139+
let name = Printf.sprintf "%s @ %s" Sys.argv.(0) this_host_name in
140+
Backtrace.set_my_name name
141+
138142
let get_facility () = !facility
139143

140144
let output_log brand level priority s =

ocaml/libs/log/debug.mli

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ module type BRAND = sig val name : string end
3434
val set_facility : Syslog.facility -> unit
3535
(** Set the syslog facility that will be used by this program. *)
3636

37+
val set_backtrace_name : string -> unit
38+
(** [set_backtrace_name host] sets the name used in backtraces to ["<Sys.argv.(0)> @ <host>"]. *)
39+
3740
val disable : ?level:Syslog.level -> string -> unit
3841
(** [disable brand] Suppress all log output from the given [brand]. Specifying a
3942
[level] disables * only this log level, otherwise all levels for the given

ocaml/xapi-idl/lib/task_server.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,8 @@ functor
207207
try SMap.find id !(tasks.task_map)
208208
with _ -> raise (Interface.does_not_exist ("task", id))
209209

210+
let backtrace_of t = t.backtrace
211+
210212
let to_interface_task t =
211213
{
212214
Interface.Task.id= t.id

ocaml/xapi-idl/lib/task_server.mli

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ module Task : functor (Interface : INTERFACE) -> sig
6969

7070
val to_interface_task : task_handle -> Interface.Task.t
7171

72+
val backtrace_of : task_handle -> Backtrace.t
73+
7274
(* [add tasks dbg f] adds a new task with debug string [dbg] that will execute
7375
[f] when run *)
7476
val add :

ocaml/xapi-idl/lib/xcp_service.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ let configure_common ~options ~resources arg_parse_fn =
443443
(* Register the Logs reporter to ensure we get log messages from libraries
444444
using Logs *)
445445
Debug.init_logs () ;
446+
let () = try Unix.gethostname () |> Debug.set_backtrace_name with _ -> () in
446447
let resources = default_resources @ resources in
447448
let config_spec = common_options @ options @ to_opt resources in
448449
(* It's very confusing if there are duplicate key names *)

ocaml/xapi/create_misc.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ let read_localhost_info ~__context =
155155
try Some (Xapi_inventory.lookup k) with _ -> None
156156
in
157157
let this_host_name = Networking_info.get_hostname () in
158+
Debug.set_backtrace_name this_host_name ;
158159
let open Xapi_inventory in
159160
let open Xenops_interface.Host in
160161
{

ocaml/xapi/message_forwarding.ml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,14 @@ let do_op_on_common ~local_fn ~__context ~host ~remote_fn f =
141141
local_fn ~__context
142142
else
143143
let task_opt = set_forwarding_on_task ~__context ~host in
144-
f __context host task_opt remote_fn
144+
try f __context host task_opt remote_fn
145+
with Api_errors.Server_error (_, _) as e -> (
146+
match task_opt with
147+
| None ->
148+
raise e
149+
| Some task ->
150+
TaskHelper.reraise ~__context ~task e
151+
)
145152
with
146153
| ( Xmlrpc_client.Connection_reset
147154
| Http_client.Http_request_rejected _

ocaml/xapi/server_helpers.ml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,17 @@ let exec_with_context ~__context ~need_complete ?marshaller ?f_forward ?quiet f
8989
with
9090
| Api_errors.Server_error (a, _) as e when a = Api_errors.task_cancelled ->
9191
Backtrace.is_important e ;
92-
if need_complete then TaskHelper.cancel ~__context ;
92+
if need_complete then
93+
TaskHelper.cancel ~__context
94+
else
95+
TaskHelper.store_backtrace ~__context e ;
9396
raise e
9497
| e ->
9598
Backtrace.is_important e ;
96-
if need_complete then TaskHelper.failed ~__context e ;
99+
if need_complete then
100+
TaskHelper.failed ~__context e
101+
else
102+
TaskHelper.store_backtrace ~__context e ;
97103
raise e
98104
in
99105
let@ () =

ocaml/xapi/taskHelper.ml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,18 @@ let cancel ~__context =
262262
let@ self = operate_on_db_task ~__context in
263263
cancel_this ~__context ~self
264264

265+
(* [store_backtrace ~__context exn] writes the backtrace that [Backtrace.get]
   returns for [exn] into the backtrace field of the task selected by
   [operate_on_db_task], serialised as an S-expression string. The write only
   happens while the task status is `pending; any other status is a no-op.
   The whole body runs under [D.log_and_ignore_exn]; presumably failures are
   logged rather than propagated to the caller (confirm against
   [D.log_and_ignore_exn]). *)
let store_backtrace ~__context exn =
266+
D.log_and_ignore_exn @@ fun () ->
267+
let@ self = operate_on_db_task ~__context in
268+
let status = Db_actions.DB_Action.Task.get_status ~__context ~self in
269+
match status with
270+
| `pending ->
271+
(* store backtrace, for message-forwarding to read on the coordinator *)
272+
Db_actions.DB_Action.Task.set_backtrace ~__context ~self
273+
~value:(Sexplib.Sexp.to_string Backtrace.(sexp_of_t (get exn)))
274+
| _ ->
275+
(* task is not pending: leave the backtrace field untouched *)
276+
()
276+
265277
let failed ~__context exn =
266278
let backtrace = Printexc.get_raw_backtrace () in
267279
let@ () = finally_complete_tracing ~error:(exn, backtrace) __context in
@@ -293,6 +305,24 @@ let failed ~__context exn =
293305
"`failure"
294306
)
295307

308+
(* [reraise ~__context ~task exn] marks [exn] as important, then tries to
   combine the backtrace stored on [task] with the one currently associated
   with [exn], and finally raises [exn]. The function always ends in
   [raise exn], whether or not the combining step succeeded.

   Order of the combining step: the backtrace field of [task] is read and
   parsed from its S-expression string first; only then is the backtrace of
   [exn] removed, and the stored and removed backtraces are added back in that
   order. Because the read/parse happens before [Backtrace.remove], a failure
   there occurs before the backtrace of [exn] has been touched.
   NOTE(review): this relies on [D.log_and_ignore_exn] swallowing exceptions
   from its argument; confirm. *)
let reraise ~__context ~task exn =
309+
Backtrace.is_important exn ;
310+
let () =
311+
D.log_and_ignore_exn @@ fun () ->
312+
(* best-effort: retrieve existing backtrace and join with local *)
313+
let remote_bt =
314+
Db_actions.DB_Action.Task.get_backtrace ~__context ~self:task
315+
|> Sexplib.Sexp.of_string
316+
|> Backtrace.t_of_sexp
317+
in
318+
let local_bt = Backtrace.remove exn in
319+
(* start with remote Backtrace *)
320+
Backtrace.add exn remote_bt ;
321+
(* add back local *)
322+
Backtrace.add exn local_bt
323+
in
324+
raise exn
325+
296326
type id = Sm of string | Xenops of string * string
297327

298328
(* queue name * id *)

ocaml/xapi/taskHelper.mli

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,13 @@ val cancel_this : __context:Context.t -> self:API.ref_task -> unit
5555

5656
val cancel : __context:Context.t -> unit
5757

58+
val store_backtrace : __context:Context.t -> exn -> unit
59+
5860
val failed : __context:Context.t -> exn -> unit
5961
(** Call this when a task fails with [exn] *)
6062

63+
val reraise : __context:Context.t -> task:API.ref_task -> exn -> 'a
64+
6165
val init : unit -> unit
6266

6367
val rbac_assert_permission_fn :

0 commit comments

Comments
 (0)