Skip to content

CP-52524: Generate an alert when various host kernel taints are set #6128

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ocaml/xapi-consts/api_messages.ml
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,12 @@ let host_internal_certificate_expiring_07 =

(* Message registered under the name "FAILED_LOGIN_ATTEMPTS"; the second
   argument 3L is presumably the message priority -- confirm against
   [addMessage]. *)
let failed_login_attempts = addMessage "FAILED_LOGIN_ATTEMPTS" 3L

(** [kernel_is_broken which] registers, through [addMessage], the message whose
    name is the prefix HOST_KERNEL_ENCOUNTERED_ERROR_ followed by [which].
    The value [2L] is handed to [addMessage] as its second argument
    (presumably the message priority -- confirm against [addMessage]). *)
let kernel_is_broken which =
  let message_name = "HOST_KERNEL_ENCOUNTERED_ERROR_" ^ which in
  addMessage message_name 2L

(** [kernel_is_broken_warning which] registers, through [addMessage], the
    message whose name is the prefix HOST_KERNEL_ENCOUNTERED_WARNING_ followed
    by [which]. The value [3L] is handed to [addMessage] as its second argument
    (presumably the message priority -- confirm against [addMessage]). *)
let kernel_is_broken_warning which =
  let message_name = "HOST_KERNEL_ENCOUNTERED_WARNING_" ^ which in
  addMessage message_name 3L

(* Message registered under the name "TLS_VERIFICATION_EMERGENCY_DISABLED";
   the second argument 3L is presumably the message priority -- confirm
   against [addMessage]. *)
let tls_verification_emergency_disabled =
addMessage "TLS_VERIFICATION_EMERGENCY_DISABLED" 3L

Expand Down
27 changes: 15 additions & 12 deletions ocaml/xapi/dbsync_slave.ml
Original file line number Diff line number Diff line change
Expand Up @@ -63,21 +63,24 @@ let create_localhost ~__context info =
in
()

(* TODO cat /proc/stat for btime ? *)
let get_start_time () =
try
debug "Calculating boot time..." ;
let now = Unix.time () in
let uptime = Unixext.string_of_file "/proc/uptime" in
let uptime = String.trim uptime in
let uptime = String.split ' ' uptime in
let uptime = List.hd uptime in
let uptime = float_of_string uptime in
let boot_time = Date.of_unix_time (now -. uptime) in
debug " system booted at %s" (Date.to_rfc3339 boot_time) ;
boot_time
match
Unixext.string_of_file "/proc/stat"
|> String.trim
|> String.split '\n'
|> List.find (fun s -> String.starts_with ~prefix:"btime" s)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once found, the line could be parsed with sscanf as well.

|> String.split ' '
with
| _ :: btime :: _ ->
let boot_time = Date.of_unix_time (float_of_string btime) in
debug "%s: system booted at %s" __FUNCTION__ (Date.to_rfc3339 boot_time) ;
boot_time
| _ ->
failwith "Couldn't parse /proc/stat"
with e ->
debug "Calculating boot time failed with '%s'" (ExnHelper.string_of_exn e) ;
debug "%s: Calculating boot time failed with '%s'" __FUNCTION__
(ExnHelper.string_of_exn e) ;
Date.epoch

(* not sufficient just to fill in this data on create time [Xen caps may change if VT enabled in BIOS etc.] *)
Expand Down
75 changes: 75 additions & 0 deletions ocaml/xapi/xapi_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2923,6 +2923,81 @@ let emergency_reenable_tls_verification ~__context =
Helpers.touch_file Constants.verify_certificates_path ;
Db.Host.set_tls_verification_enabled ~__context ~self ~value:true

(** Issue an alert if /proc/sys/kernel/tainted indicates particular kernel
    errors. Each kind of alert is meant to be sent at most once per boot of
    the host.

    The value is a closure over a list of alerts that may still be issued.
    That list is computed lazily the first time the closure runs: every alert
    for which a message with the same name and a timestamp later than the
    host's recorded boot time already exists is dropped. Each call then reads
    the kernel taint mask, issues an alert for every pending entry whose taint
    bit is set, and removes the issued entries from the pending list.

    @raise Not_found (on first use) if the host's other_config has no
    "boot_time" key.
    @raise Failure if the boot time or the taint mask cannot be parsed as a
    number. *)
let alert_if_kernel_broken =
  let __context = Context.make "host_kernel_error_alert_startup_check" in
  (* Only add an alert if
     (a) an alert wasn't already issued for the currently booted kernel *)
  let possible_alerts =
    ref
      ( lazy
        ((* Check all the alerts since last reboot. Only done once, when the
            list is first forced; afterwards we track internally which alerts
            have been issued *)
         let self = Helpers.get_localhost ~__context in
         let boot_time =
           Db.Host.get_other_config ~__context ~self
           |> List.assoc "boot_time"
           |> float_of_string
           |> API.Date.of_unix_time
         in
         (* (taint bit number in /proc/sys/kernel/tainted, message to send) *)
         let all_alerts =
           [
             (* processor reported a Machine Check Exception (MCE) *)
             (4, Api_messages.kernel_is_broken "MCE")
           ; (* bad page referenced or some unexpected page flags *)
             (5, Api_messages.kernel_is_broken "BAD_PAGE")
           ; (* kernel died recently, i.e. there was an OOPS or BUG *)
             (7, Api_messages.kernel_is_broken "BUG")
           ; (* kernel issued warning *)
             (9, Api_messages.kernel_is_broken_warning "WARN")
           ; (* soft lockup occurred *)
             (14, Api_messages.kernel_is_broken_warning "SOFT_LOCKUP")
           ]
         in
         (* Fetch the existing messages once rather than once per alert *)
         let messages =
           Helpers.call_api_functions ~__context (fun rpc session_id ->
               Client.Client.Message.get_all_records ~rpc ~session_id
           )
         in
         let already_issued_for_this_boot (name, _) =
           List.exists
             (fun (_, record) ->
               record.API.message_name = name
               && API.Date.is_later ~than:boot_time
                    record.API.message_timestamp
             )
             messages
         in
         (* Keep only the alerts that have NOT been issued since boot: these
            are the ones that may still have to be sent *)
         List.filter
           (fun (_, alert_message) ->
             not (already_issued_for_this_boot alert_message)
           )
           all_alerts
        )
        )
  in
  (* and (b) if we found a problem *)
  fun ~__context ->
    match Lazy.force !possible_alerts with
    | [] ->
        (* every alert was already issued for this boot: nothing to check *)
        ()
    | pending ->
        let self = Helpers.get_localhost ~__context in
        (* The file content ends with a newline, which int_of_string rejects,
           hence the trim. Read once per call so that all bits are tested
           against the same snapshot of the mask. *)
        let tainted =
          Unixext.string_of_file "/proc/sys/kernel/tainted"
          |> String.trim
          |> int_of_string
        in
        let still_pending =
          List.filter
            (fun (alert_bit, alert_message) ->
              let is_bit_tainted = (tainted lsr alert_bit) land 1 = 1 in
              if is_bit_tainted then (
                let host = Db.Host.get_name_label ~__context ~self in
                let body =
                  Printf.sprintf "<body><host>%s</host></body>" host
                in
                Xapi_alert.add ~msg:alert_message ~cls:`Host
                  ~obj_uuid:(Db.Host.get_uuid ~__context ~self)
                  ~body ;
                false (* alert issued, remove from the list *)
              ) else
                true (* keep in the list, alert can be issued later *)
            )
            pending
        in
        possible_alerts := Lazy.from_val still_pending

let alert_if_tls_verification_was_emergency_disabled ~__context =
let tls_verification_enabled_locally =
Stunnel_client.get_verify_by_default ()
Expand Down
2 changes: 2 additions & 0 deletions ocaml/xapi/xapi_host.mli
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,8 @@ val set_numa_affinity_policy :

val emergency_disable_tls_verification : __context:Context.t -> unit

(** Reads /proc/sys/kernel/tainted and raises a host alert for the kernel
    taint conditions that the implementation monitors. *)
val alert_if_kernel_broken : __context:Context.t -> unit

val alert_if_tls_verification_was_emergency_disabled :
__context:Context.t -> unit

Expand Down
7 changes: 7 additions & 0 deletions ocaml/xapi/xapi_periodic_scheduler_init.ml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ let register ~__context =
(Xapi_periodic_scheduler.Periodic freq) freq
Xapi_pool.alert_failed_login_attempts
) ;
Xapi_periodic_scheduler.add_to_queue "broken_kernel"
(Xapi_periodic_scheduler.Periodic 600.) 600. (fun () ->
Server_helpers.exec_with_new_task
"Periodic alert if the running kernel is broken in some serious way."
(fun __context -> Xapi_host.alert_if_kernel_broken ~__context
)
) ;
Xapi_periodic_scheduler.add_to_queue
"Period alert if TLS verification emergency disabled"
(Xapi_periodic_scheduler.Periodic 600.) 600. (fun () ->
Expand Down
12 changes: 6 additions & 6 deletions ocaml/xenopsd/xc/domain.ml
Original file line number Diff line number Diff line change
Expand Up @@ -835,12 +835,12 @@ let create_channels ~xc uuid domid =
let numa_hierarchy =
let open Xenctrlext in
let open Topology in
Lazy.from_fun (fun () ->
let xcext = get_handle () in
let distances = (numainfo xcext).distances in
let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in
NUMA.make ~distances ~cpu_to_node
)
lazy
(let xcext = get_handle () in
let distances = (numainfo xcext).distances in
let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in
NUMA.make ~distances ~cpu_to_node
)

let numa_mutex = Mutex.create ()

Expand Down
2 changes: 1 addition & 1 deletion quality-gate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -e

list-hd () {
N=294
N=293
LIST_HD=$(git grep -r --count 'List.hd' -- **/*.ml | cut -d ':' -f 2 | paste -sd+ - | bc)
if [ "$LIST_HD" -eq "$N" ]; then
echo "OK counted $LIST_HD List.hd usages"
Expand Down
Loading