Skip to content

Commit f9b5e52

Browse files
authored
CA-404013: do not relock the mutex when backing up rrds (#6215)
The point of using try_lock is to not get lock while trying to hold the mutex. Releasing it and blocking to lock it defeats the purpose of using try_lock. Reorganise the sequence to read and copy all the rrds first while under the locked mutex, release it, and then archive the copies. This doesn't fix CA-404013 directly, so this can wait until the release is cut. I'm opening otherwise I'll forget to open it when things are unlocked. I've run the smoke and validation tests, as well as checking that the rrd files in the bugtools are well formed for hosts and vms (they are created suing this function)
2 parents 96f7cd1 + 77258a4 commit f9b5e52

File tree

2 files changed

+23
-36
lines changed

2 files changed

+23
-36
lines changed

ocaml/database/db_xml.ml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -161,12 +161,9 @@ module From = struct
161161
D.warn "no lifetime information about %s.%s, ignoring" tblname k ;
162162
false
163163
in
164-
if do_not_load then (
165-
D.info
166-
{|dropping column "%s.%s": it has been removed from the datamodel|}
167-
tblname k ;
164+
if do_not_load then
168165
row
169-
) else
166+
else
170167
let column_schema = Schema.Table.find k table_schema in
171168
let value =
172169
Schema.Value.unmarshal column_schema.Schema.Column.ty

ocaml/xcp-rrdd/bin/rrdd/rrdd_server.ml

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ let archive_rrd vm_uuid (remote_address : string option) : unit =
9999
master host, exclusively. Any attempt to send the rrds to pools outside
100100
the host will fail. *)
101101
let backup_rrds (remote_address : string option) () : unit =
102+
let __FUN = __FUNCTION__ in
102103
let transport =
103104
Option.map
104105
(fun address ->
@@ -119,50 +120,39 @@ let backup_rrds (remote_address : string option) () : unit =
119120
| Some address ->
120121
Printf.sprintf "host %s" address
121122
in
122-
info "%s: trying to back up RRDs to %s" __FUNCTION__ destination ;
123+
info "%s: trying to back up RRDs to %s" __FUN destination ;
123124
let total_cycles = 5 in
124125
let cycles_tried = ref 0 in
126+
let host_uuid = Inventory.lookup Inventory._installation_uuid in
125127
while !cycles_tried < total_cycles do
126128
if Mutex.try_lock mutex then (
127129
cycles_tried := total_cycles ;
128-
let vrrds =
129-
try Hashtbl.fold (fun k v acc -> (k, v.rrd) :: acc) vm_rrds []
130-
with exn -> Mutex.unlock mutex ; raise exn
131-
in
132-
Mutex.unlock mutex ;
133-
List.iter
134-
(fun (uuid, rrd) ->
135-
debug "%s: saving RRD for VM uuid=%s" __FUNCTION__ uuid ;
136-
let rrd = with_lock mutex (fun () -> Rrd.copy_rrd rrd) in
137-
archive_rrd_internal ~transport ~uuid ~rrd ()
138-
)
139-
vrrds ;
140-
Mutex.lock mutex ;
141-
let srrds =
142-
try Hashtbl.fold (fun k v acc -> (k, v.rrd) :: acc) sr_rrds []
143-
with exn -> Mutex.unlock mutex ; raise exn
130+
let rrds_copy =
131+
[
132+
Hashtbl.fold
133+
(fun k v acc -> ("VM", k, Rrd.copy_rrd v.rrd) :: acc)
134+
vm_rrds []
135+
; Hashtbl.fold
136+
(fun k v acc -> ("SR", k, Rrd.copy_rrd v.rrd) :: acc)
137+
sr_rrds []
138+
; Option.fold ~none:[]
139+
~some:(fun rrdi -> [("host", host_uuid, Rrd.copy_rrd rrdi.rrd)])
140+
!host_rrd
141+
]
142+
|> List.concat
144143
in
145144
Mutex.unlock mutex ;
145+
146146
List.iter
147-
(fun (uuid, rrd) ->
148-
debug "%s: saving RRD for SR uuid=%s" __FUNCTION__ uuid ;
149-
let rrd = with_lock mutex (fun () -> Rrd.copy_rrd rrd) in
147+
(fun (cls, uuid, rrd) ->
148+
debug "%s: saving RRD for %s uuid=%s" __FUN cls uuid ;
150149
archive_rrd_internal ~transport ~uuid ~rrd ()
151150
)
152-
srrds ;
153-
match !host_rrd with
154-
| Some rrdi ->
155-
debug "%s: saving RRD for host" __FUNCTION__ ;
156-
let rrd = with_lock mutex (fun () -> Rrd.copy_rrd rrdi.rrd) in
157-
archive_rrd_internal ~transport
158-
~uuid:(Inventory.lookup Inventory._installation_uuid)
159-
~rrd ()
160-
| None ->
161-
()
151+
rrds_copy
162152
) else (
163153
cycles_tried := 1 + !cycles_tried ;
164154
if !cycles_tried >= total_cycles then
165-
warn "%s: Could not acquire RRD lock, skipping RRD backup" __FUNCTION__
155+
warn "%s: Could not acquire RRD lock, skipping RRD backup" __FUN
166156
else
167157
Thread.delay 1.
168158
)

0 commit comments

Comments
 (0)