@@ -46,6 +46,7 @@ use sha2::{Digest, Sha256};
46
46
use slog_error_chain:: InlineErrorChain ;
47
47
use std:: future:: Future ;
48
48
use std:: io:: Write ;
49
+ use std:: num:: NonZeroU64 ;
49
50
use std:: sync:: Arc ;
50
51
use tokio:: io:: AsyncReadExt ;
51
52
use tokio:: io:: AsyncSeekExt ;
@@ -59,6 +60,10 @@ use zip::write::FullFileOptions;
59
60
// rather than "/tmp", which would keep this collected data in-memory.
60
61
const TEMPDIR : & str = "/var/tmp" ;
61
62
63
+ // The size of piece of a support bundle to transfer to the sled agent
64
+ // within a single streaming request.
65
+ const CHUNK_SIZE : NonZeroU64 = NonZeroU64 :: new ( 1024 * 1024 * 1024 ) . unwrap ( ) ;
66
+
62
67
fn authz_support_bundle_from_id ( id : SupportBundleUuid ) -> authz:: SupportBundle {
63
68
authz:: SupportBundle :: new (
64
69
authz:: FLEET ,
@@ -68,10 +73,22 @@ fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle {
68
73
}
69
74
70
75
// Specifies the data to be collected within the Support Bundle.
71
- #[ derive( Clone , Default ) ]
76
+ #[ derive( Clone ) ]
72
77
struct BundleRequest {
73
78
// If "false": Skip collecting host-specific info from each sled.
74
79
skip_sled_info : bool ,
80
+
81
+ // The size of chunks to use when transferring a bundle from Nexus
82
+ // to a sled agent.
83
+ //
84
+ // Typically, this is CHUNK_SIZE, but can be modified for testing.
85
+ transfer_chunk_size : NonZeroU64 ,
86
+ }
87
+
88
+ impl Default for BundleRequest {
89
+ fn default ( ) -> Self {
90
+ Self { skip_sled_info : false , transfer_chunk_size : CHUNK_SIZE }
91
+ }
75
92
}
76
93
77
94
// Result of asking a sled agent to clean up a bundle
@@ -390,6 +407,7 @@ impl SupportBundleCollector {
390
407
opctx : opctx. child ( std:: collections:: BTreeMap :: new ( ) ) ,
391
408
request : request. clone ( ) ,
392
409
bundle : bundle. clone ( ) ,
410
+ transfer_chunk_size : request. transfer_chunk_size ,
393
411
} ) ;
394
412
395
413
let authz_bundle = authz_support_bundle_from_id ( bundle. id . into ( ) ) ;
@@ -434,6 +452,7 @@ struct BundleCollection {
434
452
opctx : OpContext ,
435
453
request : BundleRequest ,
436
454
bundle : SupportBundle ,
455
+ transfer_chunk_size : NonZeroU64 ,
437
456
}
438
457
439
458
impl BundleCollection {
@@ -445,6 +464,20 @@ impl BundleCollection {
445
464
// as it's being collected.
446
465
let dir = tempdir_in ( TEMPDIR ) ?;
447
466
467
+ let report = self . collect_bundle_locally ( & dir) . await ?;
468
+ self . store_bundle_on_sled ( dir) . await ?;
469
+ Ok ( report)
470
+ }
471
+
472
+ // Create the support bundle, placing the contents into a user-specified
473
+ // directory.
474
+ //
475
+ // Does not attempt to convert the contents into a zipfile, nor send them
476
+ // to any durable storage.
477
+ async fn collect_bundle_locally (
478
+ self : & Arc < Self > ,
479
+ dir : & Utf8TempDir ,
480
+ ) -> anyhow:: Result < SupportBundleCollectionReport > {
448
481
let mut collection = Box :: pin ( self . collect_bundle_as_file ( & dir) ) ;
449
482
450
483
// We periodically check the state of the support bundle - if a user
@@ -456,7 +489,7 @@ impl BundleCollection {
456
489
work_duration,
457
490
) ;
458
491
459
- let report = loop {
492
+ loop {
460
493
tokio:: select! {
461
494
// Timer fired mid-collection - let's check if we should stop.
462
495
_ = yield_interval. tick( ) => {
@@ -487,15 +520,24 @@ impl BundleCollection {
487
520
"Bundle Collection completed" ;
488
521
"bundle" => %self . bundle. id
489
522
) ;
490
- break report? ;
523
+ return report;
491
524
} ,
492
525
}
493
- } ;
526
+ }
527
+ }
494
528
529
+ async fn store_bundle_on_sled (
530
+ & self ,
531
+ dir : Utf8TempDir ,
532
+ ) -> anyhow:: Result < ( ) > {
495
533
// Create the zipfile as a temporary file
496
534
let mut zipfile = tokio:: fs:: File :: from_std ( bundle_to_zipfile ( & dir) ?) ;
535
+ let total_len = zipfile. metadata ( ) . await ?. len ( ) ;
497
536
498
- // Verify the hash locally before we send it over the network
537
+ // Collect the hash locally before we send it over the network
538
+ //
539
+ // We'll use this later during finalization to confirm the bundle
540
+ // has been stored successfully.
499
541
zipfile. seek ( SeekFrom :: Start ( 0 ) ) . await ?;
500
542
let hash = sha2_hash ( & mut zipfile) . await ?;
501
543
@@ -515,27 +557,78 @@ impl BundleCollection {
515
557
)
516
558
. await ?;
517
559
518
- // Stream the zipfile to the sled where it should be kept
519
- zipfile. seek ( SeekFrom :: Start ( 0 ) ) . await ?;
520
- let file_access = hyper_staticfile:: vfs:: TokioFileAccess :: new ( zipfile) ;
521
- let file_stream =
522
- hyper_staticfile:: util:: FileBytesStream :: new ( file_access) ;
523
- let body =
524
- reqwest:: Body :: wrap ( hyper_staticfile:: Body :: Full ( file_stream) ) ;
560
+ let zpool = ZpoolUuid :: from ( self . bundle . zpool_id ) ;
561
+ let dataset = DatasetUuid :: from ( self . bundle . dataset_id ) ;
562
+ let support_bundle = SupportBundleUuid :: from ( self . bundle . id ) ;
563
+
564
+ // Tell this sled to create the bundle.
565
+ let creation_result = sled_client
566
+ . support_bundle_start_creation ( & zpool, & dataset, & support_bundle)
567
+ . await
568
+ . with_context ( || "Support bundle failed to start creation" ) ?;
569
+
570
+ if matches ! (
571
+ creation_result. state,
572
+ sled_agent_client:: types:: SupportBundleState :: Complete
573
+ ) {
574
+ // Early exit case: the bundle was already created -- we must have either
575
+ // crashed or failed between "finalizing" and "writing to the database that we
576
+ // finished".
577
+ info ! ( & self . log, "Support bundle was already collected" ; "bundle" => %self . bundle. id) ;
578
+ return Ok ( ( ) ) ;
579
+ }
580
+ info ! ( & self . log, "Support bundle creation started" ; "bundle" => %self . bundle. id) ;
581
+
582
+ let mut offset = 0 ;
583
+ while offset < total_len {
584
+ // Stream the zipfile to the sled where it should be kept
585
+ let mut file = zipfile
586
+ . try_clone ( )
587
+ . await
588
+ . with_context ( || "Failed to clone zipfile" ) ?;
589
+ file. seek ( SeekFrom :: Start ( offset) ) . await . with_context ( || {
590
+ format ! ( "Failed to seek to offset {offset} / {total_len} within zipfile" )
591
+ } ) ?;
592
+
593
+ // Only stream at most "transfer_chunk_size" bytes at once
594
+ let remaining = std:: cmp:: min (
595
+ self . transfer_chunk_size . get ( ) ,
596
+ total_len - offset,
597
+ ) ;
598
+ let limited_file = file. take ( remaining) ;
599
+ let stream = tokio_util:: io:: ReaderStream :: new ( limited_file) ;
600
+ let body = reqwest:: Body :: wrap_stream ( stream) ;
601
+
602
+ info ! (
603
+ & self . log,
604
+ "Streaming bundle chunk" ;
605
+ "bundle" => %self . bundle. id,
606
+ "offset" => offset,
607
+ "length" => remaining,
608
+ ) ;
609
+
610
+ sled_client. support_bundle_transfer (
611
+ & zpool, & dataset, & support_bundle, offset, body
612
+ ) . await . with_context ( || {
613
+ format ! ( "Failed to transfer bundle: {remaining}@{offset} of {total_len} to sled" )
614
+ } ) ?;
615
+
616
+ offset += self . transfer_chunk_size . get ( ) ;
617
+ }
525
618
526
619
sled_client
527
- . support_bundle_create (
528
- & ZpoolUuid :: from ( self . bundle . zpool_id ) ,
529
- & DatasetUuid :: from ( self . bundle . dataset_id ) ,
530
- & SupportBundleUuid :: from ( self . bundle . id ) ,
620
+ . support_bundle_finalize (
621
+ & zpool ,
622
+ & dataset ,
623
+ & support_bundle ,
531
624
& hash. to_string ( ) ,
532
- body,
533
625
)
534
- . await ?;
626
+ . await
627
+ . with_context ( || "Failed to finalize bundle" ) ?;
535
628
536
629
// Returning from this method should drop all temporary storage
537
630
// allocated locally for this support bundle.
538
- Ok ( report )
631
+ Ok ( ( ) )
539
632
}
540
633
541
634
// Perform the work of collecting the support bundle into a temporary directory
@@ -795,7 +888,7 @@ impl BackgroundTask for SupportBundleCollector {
795
888
Ok ( report) => collection_report = Some ( report) ,
796
889
Err ( err) => {
797
890
collection_err =
798
- Some ( json ! ( { "collect_error" : err. to_string( ) } ) )
891
+ Some ( json ! ( { "collect_error" : InlineErrorChain :: new ( err. as_ref ( ) ) . to_string( ) } ) )
799
892
}
800
893
} ;
801
894
@@ -1076,7 +1169,9 @@ async fn save_sp_dumps(
1076
1169
mod test {
1077
1170
use super :: * ;
1078
1171
1172
+ use crate :: app:: support_bundles:: SupportBundleQueryType ;
1079
1173
use camino_tempfile:: tempdir;
1174
+ use http_body_util:: BodyExt ;
1080
1175
use nexus_db_model:: PhysicalDisk ;
1081
1176
use nexus_db_model:: PhysicalDiskKind ;
1082
1177
use nexus_db_model:: RendezvousDebugDataset ;
@@ -1356,6 +1451,7 @@ mod test {
1356
1451
// NOTE: The support bundle querying interface isn't supported on
1357
1452
// the simulated sled agent (yet?) so we're skipping this step.
1358
1453
skip_sled_info : true ,
1454
+ ..Default :: default ( )
1359
1455
} ;
1360
1456
let report = collector
1361
1457
. collect_bundle ( & opctx, & request)
@@ -1382,6 +1478,85 @@ mod test {
1382
1478
assert ! ( report. is_none( ) ) ;
1383
1479
}
1384
1480
1481
+ #[ nexus_test( server = crate :: Server ) ]
1482
+ async fn test_collect_chunked ( cptestctx : & ControlPlaneTestContext ) {
1483
+ let nexus = & cptestctx. server . server_context ( ) . nexus ;
1484
+ let datastore = nexus. datastore ( ) ;
1485
+ let resolver = nexus. resolver ( ) ;
1486
+ let opctx = OpContext :: for_tests (
1487
+ cptestctx. logctx . log . clone ( ) ,
1488
+ datastore. clone ( ) ,
1489
+ ) ;
1490
+
1491
+ // Before we can create any bundles, we need to create the
1492
+ // space for them to be provisioned.
1493
+ let _datasets =
1494
+ TestDataset :: setup ( cptestctx, & datastore, & opctx, 1 ) . await ;
1495
+
1496
+ let bundle = datastore
1497
+ . support_bundle_create ( & opctx, "For collection testing" , nexus. id ( ) )
1498
+ . await
1499
+ . expect ( "Couldn't allocate a support bundle" ) ;
1500
+ assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1501
+
1502
+ let collector = SupportBundleCollector :: new (
1503
+ datastore. clone ( ) ,
1504
+ resolver. clone ( ) ,
1505
+ false ,
1506
+ nexus. id ( ) ,
1507
+ ) ;
1508
+
1509
+ // The bundle collection should complete successfully.
1510
+ //
1511
+ // We're going to use a really small chunk size here to force the bundle
1512
+ // to get split up.
1513
+ let request = BundleRequest {
1514
+ skip_sled_info : true ,
1515
+ transfer_chunk_size : NonZeroU64 :: new ( 16 ) . unwrap ( ) ,
1516
+ } ;
1517
+
1518
+ let report = collector
1519
+ . collect_bundle ( & opctx, & request)
1520
+ . await
1521
+ . expect ( "Collection should have succeeded under test" )
1522
+ . expect ( "Collecting the bundle should have generated a report" ) ;
1523
+ assert_eq ! ( report. bundle, bundle. id. into( ) ) ;
1524
+ assert ! ( report. listed_in_service_sleds) ;
1525
+ assert ! ( report. listed_sps) ;
1526
+ assert ! ( report. activated_in_db_ok) ;
1527
+
1528
+ let observed_bundle = datastore
1529
+ . support_bundle_get ( & opctx, bundle. id . into ( ) )
1530
+ . await
1531
+ . expect ( "Bundle should definitely be in db by this point" ) ;
1532
+ assert_eq ! ( observed_bundle. state, SupportBundleState :: Active ) ;
1533
+
1534
+ // Download a file from the bundle, to verify that it was trasnferred
1535
+ // successfully.
1536
+ let head = false ;
1537
+ let range = None ;
1538
+ let response = nexus
1539
+ . support_bundle_download (
1540
+ & opctx,
1541
+ observed_bundle. id . into ( ) ,
1542
+ SupportBundleQueryType :: Path {
1543
+ file_path : "bundle_id.txt" . to_string ( ) ,
1544
+ } ,
1545
+ head,
1546
+ range,
1547
+ )
1548
+ . await
1549
+ . unwrap ( ) ;
1550
+
1551
+ // Read the body to bytes, then convert to string
1552
+ let body_bytes =
1553
+ response. into_body ( ) . collect ( ) . await . unwrap ( ) . to_bytes ( ) ;
1554
+ let body_string = String :: from_utf8 ( body_bytes. to_vec ( ) ) . unwrap ( ) ;
1555
+
1556
+ // Verify the content matches the bundle ID
1557
+ assert_eq ! ( body_string, observed_bundle. id. to_string( ) ) ;
1558
+ }
1559
+
1385
1560
#[ nexus_test( server = crate :: Server ) ]
1386
1561
async fn test_collect_many ( cptestctx : & ControlPlaneTestContext ) {
1387
1562
let nexus = & cptestctx. server . server_context ( ) . nexus ;
@@ -1415,7 +1590,8 @@ mod test {
1415
1590
) ;
1416
1591
1417
1592
// Each time we call "collect_bundle", we collect a SINGLE bundle.
1418
- let request = BundleRequest { skip_sled_info : true } ;
1593
+ let request =
1594
+ BundleRequest { skip_sled_info : true , ..Default :: default ( ) } ;
1419
1595
let report = collector
1420
1596
. collect_bundle ( & opctx, & request)
1421
1597
. await
@@ -1553,7 +1729,8 @@ mod test {
1553
1729
false ,
1554
1730
nexus. id ( ) ,
1555
1731
) ;
1556
- let request = BundleRequest { skip_sled_info : true } ;
1732
+ let request =
1733
+ BundleRequest { skip_sled_info : true , ..Default :: default ( ) } ;
1557
1734
let report = collector
1558
1735
. collect_bundle ( & opctx, & request)
1559
1736
. await
@@ -1689,7 +1866,8 @@ mod test {
1689
1866
false ,
1690
1867
nexus. id ( ) ,
1691
1868
) ;
1692
- let request = BundleRequest { skip_sled_info : true } ;
1869
+ let request =
1870
+ BundleRequest { skip_sled_info : true , ..Default :: default ( ) } ;
1693
1871
let report = collector
1694
1872
. collect_bundle ( & opctx, & request)
1695
1873
. await
@@ -1768,7 +1946,8 @@ mod test {
1768
1946
false ,
1769
1947
nexus. id ( ) ,
1770
1948
) ;
1771
- let request = BundleRequest { skip_sled_info : true } ;
1949
+ let request =
1950
+ BundleRequest { skip_sled_info : true , ..Default :: default ( ) } ;
1772
1951
let report = collector
1773
1952
. collect_bundle ( & opctx, & request)
1774
1953
. await
0 commit comments