diff --git a/.codechecker.json b/.codechecker.json new file mode 100644 index 00000000000..6d7ef70943e --- /dev/null +++ b/.codechecker.json @@ -0,0 +1,6 @@ +{ + "analyze": [ + "--disable=misc-header-include-cycle", + "--disable=clang-diagnostic-unused-parameter" + ] +} diff --git a/.github/workflows/1.249-lcm.yml b/.github/workflows/1.249-lcm.yml index 39132476bd9..8057b255a92 100644 --- a/.github/workflows/1.249-lcm.yml +++ b/.github/workflows/1.249-lcm.yml @@ -1,5 +1,7 @@ name: Build and test (1.249-lcm, scheduled) +permissions: {} + on: schedule: # run every Monday, this refreshes the cache @@ -9,6 +11,8 @@ jobs: python-test: name: Python tests runs-on: ubuntu-20.04 + permissions: + contents: read strategy: fail-fast: false matrix: diff --git a/.github/workflows/codechecker.yml b/.github/workflows/codechecker.yml new file mode 100644 index 00000000000..da8ea12c005 --- /dev/null +++ b/.github/workflows/codechecker.yml @@ -0,0 +1,79 @@ +name: Run CodeChecker static analyzer on XAPI's C stubs +permissions: {} + +on: + push: + pull_request: + branches: + - master + - 'feature/**' + - '*-lcm' + +concurrency: # On new push, cancel old workflows from the same PR, branch or tag: + group: ${{ github.workflow }}-${{github.event_name}}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + staticanalyzer: + name: Static analyzer for OCaml C stubs + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + env: + XAPI_VERSION: "v0.0.0-${{ github.sha }}" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup XenAPI environment + uses: ./.github/workflows/setup-xapi-environment + with: + xapi_version: ${{ env.XAPI_VERSION }} + + - name: Install dune-compiledb to generate compile_commands.json + run: | + opam pin add -y ezjsonm https://github.com/mirage/ezjsonm/releases/download/v1.3.0/ezjsonm-1.3.0.tbz + opam pin add -y dune-compiledb https://github.com/edwintorok/dune-compiledb/releases/download/0.6.0/dune-compiledb-0.6.0.tbz + + - name: Trim dune cache + run: opam exec -- dune cache trim --size=2GiB + + - name: Generate compile_commands.json + run: opam exec -- make compile_commands.json + + - name: Upload compile commands json + uses: actions/upload-artifact@v4 + with: + path: ${{ github.workspace }}/compile_commands.json + + - uses: whisperity/codechecker-analysis-action@v1 + id: codechecker + with: + ctu: true + logfile: ${{ github.workspace }}/compile_commands.json + analyze-output: "codechecker_results" + + - name: Upload CodeChecker report + uses: actions/upload-artifact@v4 + with: + name: codechecker_results + path: "${{ steps.codechecker.outputs.result-html-dir }}" + + # cppcheck even for other analyzers apparently, this is + # codechecker's output + - name: convert to SARIF + shell: bash + run: report-converter "codechecker_results" --type cppcheck --output codechecker.sarif --export sarif + + - name: Upload CodeChecker SARIF report + uses: actions/upload-artifact@v4 + with: + name: codechecker_sarif + path: codechecker.sarif + + - name: Upload SARIF report + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: codechecker.sarif diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 096fe18227b..94c7c1a687e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,5 +1,7 @@ name: Generate and upload docs +permissions: {} + on: push: branches: master @@ -8,6 +10,8 @@ jobs: ocaml: name: Docs runs-on: ubuntu-22.04 + permissions: + contents: read env: 
XAPI_VERSION: "v0.0.0-${{ github.sha }}" STORAGE_DOCDIR: .gh-pages-xapi-storage diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 3c2d7148f90..1b9947ccec7 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -1,5 +1,7 @@ name: Check format +permissions: {} + on: pull_request: branches: @@ -12,6 +14,8 @@ jobs: ocaml-format: name: Ocaml files runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Checkout code diff --git a/.github/workflows/generate-and-build-sdks.yml b/.github/workflows/generate-and-build-sdks.yml index 53a9b8452cb..ca1a67a4c78 100644 --- a/.github/workflows/generate-and-build-sdks.yml +++ b/.github/workflows/generate-and-build-sdks.yml @@ -1,5 +1,7 @@ name: Generate and Build SDKs +permissions: {} + on: workflow_call: inputs: @@ -11,6 +13,9 @@ jobs: generate-sdk-sources: name: Generate SDK sources runs-on: ubuntu-22.04 + permissions: + contents: read + steps: - name: Checkout code uses: actions/checkout@v4 @@ -25,7 +30,7 @@ jobs: run: opam exec -- make sdk # sdk-ci runs some Go unit tests. - # This setting ensures that SDK date time + # This setting ensures that SDK date time # tests are run on a machine that # isn't using UTC - name: Set Timezone to Tokyo for datetime tests @@ -77,6 +82,9 @@ jobs: name: Build C SDK runs-on: ubuntu-latest needs: generate-sdk-sources + permissions: + contents: read + steps: - name: Install dependencies run: sudo apt-get install libxml2-dev @@ -103,6 +111,9 @@ jobs: name: Build Java SDK runs-on: ubuntu-latest needs: generate-sdk-sources + permissions: + contents: read + steps: - name: Install dependencies run: sudo apt-get install maven @@ -120,9 +131,9 @@ jobs: distribution: 'temurin' # Java Tests are run at compile time. - # This setting ensures that SDK date time + # This setting ensures that SDK date time # tests are run on a machine that - # isn't using UTC + # isn't using UTC - name: Set Timezone to Tokyo for datetime tests run: | sudo timedatectl set-timezone Asia/Tokyo @@ -144,6 +155,9 @@ jobs: name: Build C# SDK runs-on: windows-2022 needs: generate-sdk-sources + permissions: + contents: read + steps: - name: Strip 'v' prefix from xapi version shell: pwsh @@ -158,7 +172,7 @@ jobs: # All tests builds and pipelines should # work on other timezones. 
This setting ensures that # SDK date time tests are run on a machine that - # isn't using UTC + # isn't using UTC - name: Set Timezone to Tokyo for datetime tests shell: pwsh run: Set-TimeZone -Id "Tokyo Standard Time" @@ -192,6 +206,9 @@ jobs: # PowerShell SDK for PowerShell 5.x needs to run on windows-2019 because # windows-2022 doesn't contain .NET Framework 4.x dev tools runs-on: windows-2019 + permissions: + contents: read + steps: - name: Strip 'v' prefix from xapi version shell: pwsh @@ -265,6 +282,8 @@ jobs: dotnet: ["6", "8"] needs: build-csharp-sdk runs-on: windows-2022 + permissions: + contents: read steps: - name: Strip 'v' prefix from xapi version diff --git a/.github/workflows/hugo.yml b/.github/workflows/hugo.yml index 9b831b12ae7..6a0116389fd 100644 --- a/.github/workflows/hugo.yml +++ b/.github/workflows/hugo.yml @@ -1,5 +1,7 @@ name: Generate and upload Hugo docs +permissions: {} + on: push: branches: master @@ -8,6 +10,9 @@ jobs: ocaml: name: Docs runs-on: ubuntu-22.04 + permissions: + contents: read + steps: - name: Checkout code diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f8dcee80945..92f5101d189 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,7 @@ name: Build and test +permissions: {} + on: # When only Hugo docs change, this workflow is not required: push: @@ -20,6 +22,8 @@ jobs: ocaml-tests: name: Run OCaml tests runs-on: ubuntu-22.04 + permissions: + contents: read env: # Ensure you also update test-sdk-builds # when changing this value, to keep builds diff --git a/.github/workflows/other.yml b/.github/workflows/other.yml index 7cac6522c2c..7ec6914045d 100644 --- a/.github/workflows/other.yml +++ b/.github/workflows/other.yml @@ -1,5 +1,7 @@ name: Build and test (other) +permissions: {} + on: # When only Hugo docs change, this workflow is not required: push: @@ -20,6 +22,10 @@ jobs: python-test: name: Python tests runs-on: ubuntu-22.04 + permissions: + contents: read + pull-requests: write # allow commenting on the PR + strategy: fail-fast: false matrix: @@ -29,7 +35,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 # To check which files changed: origin/master..HEAD - - uses: LizardByte/setup-python-action@master + - uses: actions/setup-python@v5 with: python-version: ${{matrix.python-version}} @@ -56,6 +62,7 @@ jobs: files: .git/coverage${{matrix.python-version}}.xml flag-name: python${{matrix.python-version}} parallel: true + fail-on-error: false - uses: dciborow/action-pylint@0.1.0 with: @@ -89,12 +96,14 @@ jobs: - name: Finish the parallel coverage upload to Coveralls uses: coverallsapp/github-action@v2 with: + fail-on-error: false parallel-finished: true - continue-on-error: true # Do not fail CI if this step fails deprecation-test: name: Deprecation tests runs-on: ubuntu-22.04 + permissions: + contents: read steps: - name: Checkout code @@ -109,6 +118,8 @@ jobs: test-sdk-builds: name: Test SDK builds uses: ./.github/workflows/generate-and-build-sdks.yml + permissions: + contents: read with: # Ensure you also update ocaml-tests # when changing this value, to keep builds diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5c3f1cd5502..d766f4f1e4a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,5 +1,7 @@ name: Create release from tag +permissions: {} + on: push: tags: @@ -9,6 +11,8 @@ jobs: build-python: name: Build and upload Python artifacts runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Checkout 
code @@ -36,10 +40,15 @@ jobs: build-sdks: name: Build and upload SDK artifacts uses: ./.github/workflows/generate-and-build-sdks.yml + permissions: + contents: read with: xapi_version: ${{ github.ref_name }} release: + permissions: + contents: write # allow creating a release + name: "Create and package release" runs-on: ubuntu-latest needs: [build-python, build-sdks] @@ -124,6 +133,7 @@ jobs: needs: release environment: pypi permissions: + contents: read id-token: write steps: - name: Retrieve python distribution artifacts diff --git a/.github/workflows/shellcheck.yaml b/.github/workflows/shellcheck.yaml index b078eaba549..f685b35d9f4 100644 --- a/.github/workflows/shellcheck.yaml +++ b/.github/workflows/shellcheck.yaml @@ -1,5 +1,7 @@ name: ShellCheck +permissions: {} + on: pull_request: merge_group: @@ -16,8 +18,11 @@ jobs: runs-on: ubuntu-latest permissions: + actions: read + contents: read + pull-requests: write # allow commenting on the PR security-events: write - + steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/Makefile b/Makefile index 7f7386bf6b1..805ece8f28b 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ JOBS = $(shell getconf _NPROCESSORS_ONLN) PROFILE=release OPTMANDIR ?= $(OPTDIR)/man/man1/ -.PHONY: build clean test doc python format install uninstall coverage +.PHONY: build clean test doc python format install uninstall coverage analyze # if we have XAPI_VERSION set then set it in dune-project so we use that version number instead of the one obtained from git # this is typically used when we're not building from a git repo @@ -196,6 +196,17 @@ uninstall: dune uninstall $(DUNE_IU_PACKAGES3) dune uninstall $(DUNE_IU_PACKAGES4) +# An approximation, we actually depend on all dune files recursively +# Also fixup the directory paths to remove _build +# (we must refer to paths that exist in the repository for static analysis results) +compile_commands.json: Makefile dune + mkdir -p _build/ + dune rules | dune-compiledb -o _build/ + sed -e 's/"directory".*/"directory": ".",/' <_build/$@ >$@ + +analyze: compile_commands.json Makefile .codechecker.json + CodeChecker check --config .codechecker.json -l compile_commands.json + compile_flags.txt: Makefile (ocamlc -config-var ocamlc_cflags;\ ocamlc -config-var ocamlc_cppflags;\ diff --git a/README.markdown b/README.markdown index 37174144a3e..1b9243c6ded 100644 --- a/README.markdown +++ b/README.markdown @@ -32,7 +32,7 @@ To build xen-api from source, we recommend using [opam](https://opam.ocaml.org/d - Run that line, e.g.: ```bash - export OCAML_VERSION_FULL="4.14.1" + export OCAML_VERSION_FULL="4.14.2" ``` 4) Setup opam with your environment (i.e. switch). @@ -99,18 +99,26 @@ git push origin --tags Contributions ------------- -To contribute patches to xen-api, please fork the repository on -Github, and then submit a pull request. If for some reason you can't -use Github to submit a pull request, then you may send your patch for -review to the [xen-api@lists.xenproject.org mailing list](http://www.xenproject.org/help/mailing-list.html), with a link to a -public git repository for review. We much prefer Github pull requests, -however, and submitting a patch to the mailing list will take much -more time for review. +To contribute changes to xen-api, please fork the repository on +GitHub, and then submit a pull request. -Maintainers +It is required to add a `Signed-off-by:` as a +[Developers Certificate of Origin](http://developercertificate.org). 
+It certifies the patch's origin and is licensed under an +appropriate open-source licence to include it in Xapi: +https://git-scm.com/docs/git-commit#Documentation/git-commit.txt---signoff + +Discussions ----------- -Maintainers can be contacted via this mailing list: `xen-api@lists.xenproject.org` +Discussions can be started at +https://github.com/xapi-project/xen-api/discussions + +Issues +------ + +Issues can be raised at +https://github.com/xapi-project/xen-api/issues Licensing --------- diff --git a/doc/assets/css/misc.css b/doc/assets/css/misc.css index beb5a28e43a..dad61421838 100644 --- a/doc/assets/css/misc.css +++ b/doc/assets/css/misc.css @@ -47,10 +47,6 @@ } -.table-striped > tbody > tr:nth-child(odd) { - background-color: #f9f9f9; -} - .btn { display: inline-block; padding: 6px 12px; diff --git a/doc/assets/css/xenapi.css b/doc/assets/css/xenapi.css index d75b1b6d089..4ab6ff3ea16 100644 --- a/doc/assets/css/xenapi.css +++ b/doc/assets/css/xenapi.css @@ -42,6 +42,16 @@ th { text-align: left; .field, .field2 { margin: 0em 0; padding: .5em .7em .7em; + /** + * doc/layouts/partials/content.html generates tables with alternating + * field and field2 for the rows of the XenAPI Class Reference tables. + * Their background colours are hard-coded to bright colours here, but the + * colors are not adjusted for dark mode. We cannot use the theme colours + * in this case. Thus we have to hard-code the colours for now. Ergo, also + * hard-code the text colour to ensure that it has contrast in dark mode too. + * Only shades of grey are used, so the text colour is hard-coded to black. + */ + color: black; background-color: #dddddd; cursor: pointer; font-size: 15px; @@ -113,3 +123,7 @@ th { text-align: left; margin: 0; vertical-align: middle; } + +div[id$='_details'] { + cursor: default; +} diff --git a/doc/assets/js/parse.js b/doc/assets/js/parse.js new file mode 100644 index 00000000000..9460aab1bf7 --- /dev/null +++ b/doc/assets/js/parse.js @@ -0,0 +1,146 @@ + +class Type {}; + +class Builtin extends Type { + constructor(name) { + super(); + this.name = name; + } + + static ofString(s) { + const concrete = ['string', 'bool', 'int', 'float', 'void', 'datetime']; + if (!concrete.includes(s)) + return null; + + return new Builtin(s); + } +}; + +class Enum extends Type { + constructor(name) { + super(); + this.name = name; + } +}; + +class Ctor extends Type { + constructor(params, name) { + super(); + this.params = params; + this.name = name; + } +}; + +function lex(str) { + if (str.indexOf('$') >= 0) + throw new Error('Not allowed to contain $'); + + let ts = str.replaceAll('(', ' ( '); + ts = ts.replaceAll(')', ' ) '); + ts = ts.split(' '); + ts = ts.filter(x => x !== ''); + ts.push('$'); + return ts; +} + +class Lexer { + constructor(tokens) { + this.tokens = tokens; + this.pos = 0; + } + + shift() { + if (this.pos >= this.tokens.length - 1) + return '$'; + + return this.tokens[this.pos++]; + } + + peek() { + const prev = this.pos; + let t = this.shift(); + this.pos = prev; + return t; + } + + expect(ts) { + if (!Array.isArray(ts)) + ts = [ts]; + + let l = this.shift(); + for (const t of ts) + if (l == t) return; + + throw new Error(`Expected ${t}, got ${l}`); + } +}; + +function lbp(t) { + switch (t) { + case '(': + case ')': + case '->': + case '\u2192': + return 0; + case '$': + return -1; + } + + return 1; +} + +function nud(l, t) { + switch (t) { + case 'enum': + return new Enum(l.shift()); + + case '(': + let left = parseType(l, 0); + l.expect(['->', '\u2192']); + let right = 
parseType(l, 0); + l.expect(')'); + l.expect('map'); + return new Ctor([left, right], 'map'); + } + + let bty = Builtin.ofString(t); + if (bty != null) + return bty; + + const fmt = /^[a-zA-Z_]+$/; + if (fmt.test(t)) + return new Ctor([], t); + + throw new Error(`No null denotation for ${t}`); +} + +function led(l, left, t) { + const known = ['set', 'ref', 'option', 'record']; + if (!known.includes(t)) + throw new Error(`Invalid type constructor: ${t}`); + + return new Ctor([left], t); +} + +function parseType(l, rbp) { + let left = nud(l, l.shift()); + + while (lbp(l.peek()) > rbp) + left = led(l, left, l.shift()); + + return left; +} + +function parseSingleType(input) { + try { + let lexer = new Lexer(lex(input)); + let ty = parseType(lexer, 0); + if (lexer.peek() != '$') + throw new Error('Did not consume entire input'); + return ty; + } catch (e) { + } + + return null; +} + diff --git a/doc/content/design/add-qcow-tool-for-vdi-import-export.md b/doc/content/design/add-qcow-tool-for-vdi-import-export.md new file mode 100644 index 00000000000..127369e3db5 --- /dev/null +++ b/doc/content/design/add-qcow-tool-for-vdi-import-export.md @@ -0,0 +1,121 @@ +--- +title: Add qcow tool to allow VDI import/export +layout: default +design_doc: true +revision: 1 +status: proposed +--- + +# Introduction + +At XCP-ng, we are working on overcoming the 2TiB limitation for VM disks while +preserving essential features such as snapshots, copy-on-write capabilities, and +live migration. + +To achieve this, we are introducing Qcow2 support in SMAPI and the blktap driver. +With the alpha release, we can: + - Create a VDI + - Snapshot it + - Export and import it to/from XVA + - Perform full backups + +However, we currently cannot export a VDI to a Qcow2 file, nor import one. + +The purpose of this design proposal is to outline a solution for implementing VDI +import/export in Qcow2 format. + +# Design Proposal + +The import and export of VHD-based VDIs currently rely on *vhd-tool*, which is +responsible for streaming data between a VDI and a file. It supports both Raw and +VHD formats, but not Qcow2. + +There is an existing tool called [qcow-tool](https://opam.ocaml.org/packages/qcow-tool/) +originally packaged by MirageOS. It is no longer actively maintained, but it can +produce Qcow files readable by QEMU. + +Currently, *qcow-tool* does not support streaming, but we propose to add this +capability. This means replicating the approach used in *vhd-tool*, where data is +pushed to a socket. + +We have contacted the original developer, David Scott, and there are no objections +to us maintaining the tool if needed. + +Therefore, the most appropriate way to enable Qcow2 import/export in XAPI is to +add streaming support to `qcow-tool`. + +# XenAPI changes + +## The workflow + +- The export and import of VDIs are handled by the XAPI HTTP server: + - `GET /export_raw_vdi` + - `PUT /import_raw_vdi` +- The corresponding handlers are `Export_raw_vdi.handler` and + `Import_raw_vdi.handler`. +- Since the format is checked in the handler, we need to add support for `Qcow2`, + as currently only `Raw`, `Tar`, and `Vhd` are supported. +- This requires adding a new type in the `Importexport.Format` module and a new + content type: `application/x-qemu-disk`. + See [mime-types format](https://www.digipres.org/formats/mime-types/#application/x-qemu-disk). +- This allows the format to be properly decoded. Currently, all formats use a + wrapper called `Vhd_tool_wrapper`, which sets up parameters for `vhd-tool`. 
+ We need to add a new wrapper for the Qcow2 format, which will instead use + `qcow-tool`, a tool that we will package (see the section below). +- The new wrapper will be responsible for setting up parameters (source, + destination, etc.). Since it only manages Qcow2 files, we don’t need to pass + additional format information. +- The format (`qcow2`) will be specified in the URI. For example: + - `/import_raw_vdi?session_id=&task_id=&vdi=&format=qcow2` + +## Adding and modifying qcow-tool + +- We need to package [qcow-tool](https://opam.ocaml.org/packages/qcow-tool). +- This new tool will be called from `ocaml/xapi/qcow_tool_wrapper.ml`, as + described in the previous section. + +- To export a VDI to a Qcow2 file, we need to add functionality similar to + `Vhd_tool_wrapper.send`, which calls `vhd-tool stream`. + - It writes data from the source to a destination. Unlike `vhd-tool`, which + supports multiple destinations, we will only support Qcow2 files. + - Here is a typicall call to `vhd-tool stream` +```sh +/bin/vhd-tool stream \ + --source-protocol none \ + --source-format hybrid \ + --source /dev/sm/backend/ff1b27b1-3c35-972e-76ec-a56fe9f25e36/87711319-2b05-41a3-8ee0-3b63a2fc7035:/dev/VG_XenStorage-ff1b27b1-3c35-972e-76ec-a56fe9f25e36/VHD-87711319-2b05-41a3-8ee0-3b63a2fc7035 \ + --destination-protocol none \ + --destination-format vhd \ + --destination-fd 2585f988-7374-8131-5b66-77bbc239cbb2 \ + --tar-filename-prefix \ + --progress \ + --machine \ + --direct \ + --path /dev/mapper:. +``` + +- To import a VDI from a Qcow2 file, we need to implement functionality similar + to `Vhd_tool_wrapper.receive`, which calls `vhd-tool serve`. + - This is the reverse of the export process. As with export, we will only + support a single type of import: from a Qcow2 file. + - Here is a typical call to `vhd-tool serve` +```sh +/bin/vhd-tool serve \ + --source-format raw \ + --source-protocol none \ + --source-fd 3451d7ed-9078-8b01-95bf-293d3bc53e7a \ + --tar-filename-prefix \ + --destination file:///dev/sm/backend/f939be89-5b9f-c7c7-e1e8-30c419ee5de6/4868ac1d-8321-4826-b058-952d37a29b82 \ + --destination-format raw \ + --progress \ + --machine \ + --direct \ + --destination-size 180405760 \ + --prezeroed +``` + +- We don't need to propose different protocol and different format. As we will +not support different formats we just to handle data copy from socket into file +and from file to socket. Sockets and files will be managed into the +`qcow_tool_wrapper`. The `forkhelpers.ml` manages the list of file descriptors +and we will mimic what the vhd tool wrapper does to link a UUID to socket. diff --git a/doc/content/design/sm-supported-image-formats.md b/doc/content/design/sm-supported-image-formats.md new file mode 100644 index 00000000000..fd1118e885d --- /dev/null +++ b/doc/content/design/sm-supported-image-formats.md @@ -0,0 +1,76 @@ +--- +title: Add supported image formats in sm-list +layout: default +design_doc: true +revision: 2 +status: proposed +--- + +# Introduction + +At XCP-ng, we are enhancing support for QCOW2 images in SMAPI. The primary +motivation for this change is to overcome the 2TB size limitation imposed +by the VHD format. By adding support for QCOW2, a Storage Repository (SR) will +be able to host disks in VHD and/or QCOW2 formats, depending on the SR type. +In the future, additional formats—such as VHDx—could also be supported. + +We need a mechanism to expose to end users which image formats are supported +by a given SR. 
The proposal is to extend the SM API object with a new field +that clients (such as XenCenter, XenOrchestra, etc.) can use to determine the +available formats. + +# Design Proposal + +To expose the available image formats to clients (e.g., XenCenter, XenOrchestra, etc.), +we propose adding a new field called `supported-image-formats` to the Storage Manager (SM) +module. This field will be included in the output of the `SM.get_all_records` call. + +The `supported-image-formats` field will be populated by retrieving information +from the SMAPI drivers. Specifically, each driver will update its `DRIVER_INFO` +dictionary with a new key, `supported_image_formats`, which will contain a list +of strings representing the supported image formats +(for example: `["vhd", "raw", "qcow2"]`). + +The list designates the driver's preferred VDI format as its first entry. That +means that when migrating a VDI, the destination storage repository will +attempt to create a VDI in this preferred format. If the default format cannot +be used (e.g., due to size limitations), an error will be generated. + +If a driver does not provide this information (as is currently the case with existing +drivers), the default value will be an empty array. This signifies that it is the +driver that decides which format it will use. This ensures that the modification +remains compatible with both current and future drivers. + +With this new information, listing all parameters of the SM object will return: + +```bash +# xe sm-list params=all +``` + +will output something like: + +``` +uuid ( RO) : c6ae9a43-fff6-e482-42a9-8c3f8c533e36 +name-label ( RO) : Local EXT3 VHD +name-description ( RO) : SR plugin representing disks as VHD files stored on a local EXT3 filesystem, created inside an LVM volume +type ( RO) : ext +vendor ( RO) : Citrix Systems Inc +copyright ( RO) : (C) 2008 Citrix Systems Inc +required-api-version ( RO) : 1.0 +capabilities ( RO) [DEPRECATED] : SR_PROBE; SR_SUPPORTS_LOCAL_CACHING; SR_UPDATE; THIN_PROVISIONING; VDI_ACTIVATE; VDI_ATTACH; VDI_CLONE; VDI_CONFIG_CBT; VDI_CREATE; VDI_DEACTIVATE; VDI_DELETE; VDI_DETACH; VDI_GENERATE_CONFIG; VDI_MIRROR; VDI_READ_CACHING; VDI_RESET_ON_BOOT; VDI_RESIZE; VDI_SNAPSHOT; VDI_UPDATE +features (MRO) : SR_PROBE: 1; SR_SUPPORTS_LOCAL_CACHING: 1; SR_UPDATE: 1; THIN_PROVISIONING: 1; VDI_ACTIVATE: 1; VDI_ATTACH: 1; VDI_CLONE: 1; VDI_CONFIG_CBT: 1; VDI_CREATE: 1; VDI_DEACTIVATE: 1; VDI_DELETE: 1; VDI_DETACH: 1; VDI_GENERATE_CONFIG: 1; VDI_MIRROR: 1; VDI_READ_CACHING: 1; VDI_RESET_ON_BOOT: 2; VDI_RESIZE: 1; VDI_SNAPSHOT: 1; VDI_UPDATE: 1 +configuration ( RO) : device: local device path (required) (e.g. /dev/sda3) +driver-filename ( RO) : /opt/xensource/sm/EXTSR +required-cluster-stack ( RO) : +supported-image-formats ( RO) : vhd, raw, qcow2 +``` + +This change impacts the SM data model, and as such, the XAPI database version will +be incremented. + +# Impact + +- **Data Model:** A new field (`supported-image-formats`) is added to the SM records. +- **Client Awareness:** Clients like the `xe` CLI will now be able to query and display the supported image formats for a given SR. +- **Database Versioning:** The XAPI database version will be updated to reflect this change. 
+ diff --git a/doc/content/lib/_index.md b/doc/content/lib/_index.md new file mode 100644 index 00000000000..a0592427b0b --- /dev/null +++ b/doc/content/lib/_index.md @@ -0,0 +1,5 @@ +--- +title: Libraries +hidden: true +--- +{{% children description=true %}} \ No newline at end of file diff --git a/doc/content/lib/xenctrl/_index.md b/doc/content/lib/xenctrl/_index.md new file mode 100644 index 00000000000..d38c927b83f --- /dev/null +++ b/doc/content/lib/xenctrl/_index.md @@ -0,0 +1,5 @@ +--- +title: libxenctrl +description: Xen Control library for controlling the Xen hypervisor +--- +{{% children description=true %}} \ No newline at end of file diff --git a/doc/content/lib/xenctrl/xc_domain_claim_pages.md b/doc/content/lib/xenctrl/xc_domain_claim_pages.md new file mode 100644 index 00000000000..7f72f01342c --- /dev/null +++ b/doc/content/lib/xenctrl/xc_domain_claim_pages.md @@ -0,0 +1,157 @@ +--- +title: xc_domain_claim_pages() +description: Stake a claim for further memory for a domain, and release it too. +--- + +## Purpose + +The purpose of `xc_domain_claim_pages()` is to attempt to +stake a claim on an amount of memory for a given domain which guarantees that +memory allocations for the claimed amount will be successful. + +The domain can still attempt to allocate beyond the claim, but those are not +guaranteed to be successful and will fail if the domain's memory reaches it's +`max_mem` value. + +Each domain can only have one claim, and the domid is the key of the claim. +By killing the domain, the claim is also released. + +Depending on the given size argument, the remaining stack of the domain +can be set initially, updated to the given amount, or reset to no claim (0). + +## Management of claims + +- The stake is centrally managed by the Xen hypervisor using a + [Hypercall](https://wiki.xenproject.org/wiki/Hypercall). +- Claims are not reflected in the amount of free memory reported by Xen. + +## Reporting of claims + +- `xl claims` reports the outstanding claims of the domains: + > [!info] Sample output of `xl claims`: + > ```js + > Name ID Mem VCPUs State Time(s) Claimed + > Domain-0 0 2656 8 r----- 957418.2 0 + > ``` +- `xl info` reports the host-wide outstanding claims: + > [!info] Sample output from `xl info | grep outstanding`: + > ```js + > outstanding_claims : 0 + > ``` + +## Tracking of claims + +Xen only tracks: +- the outstanding claims of each domain and +- the outstanding host-wide claims. + +Claiming zero pages effectively cancels the domain's outstanding claim +and is always successful. + +> [!info] +> - Allocations for outstanding claims are expected to always be successful. +> - But this reduces the amount of outstanding claims if the domain. +> - Freeing memory of the domain increases the domain's claim again: +> - But, when a domain consumes its claim, it is reset. +> - When the claim is reset, freed memory is longer moved to the outstanding claims! +> - It would have to get a new claim on memory to have spare memory again. + +> [!warning] The domain's `max_mem` value is used to deny memory allocation +> If an allocation would cause the domain to exceed it's `max_mem` +> value, it will always fail. 
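To make the sequence concrete, below is a minimal C sketch (illustrative only, not taken from an existing toolstack) of how a libxenctrl client might stake a claim before populating a domain's memory and then cancel it again. The helper name `claim_example` is invented for this example; it assumes the usual libxenctrl entry points `xc_interface_open()`, `xc_domain_claim_pages()` and `xc_interface_close()`, and most error handling is elided. The underlying hypercall is described in the Implementation section below.

```c
/* Hedged sketch: stake a claim for a domain's memory, then release it
 * again by claiming zero pages. Error handling is abbreviated. */
#include <stdio.h>
#include <xenctrl.h>

int claim_example(uint32_t domid, unsigned long nr_pages)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    if (!xch)
        return -1;

    /* Stake the claim: returns 0 on success, non-zero if Xen
     * cannot guarantee nr_pages for this domain. */
    int rc = xc_domain_claim_pages(xch, domid, nr_pages);
    if (rc)
        fprintf(stderr, "claim of %lu pages for dom%u failed\n",
                nr_pages, domid);

    /* ... allocate and populate the domain's memory here ... */

    /* Claiming zero pages cancels the outstanding claim and
     * always succeeds. */
    xc_domain_claim_pages(xch, domid, 0);

    xc_interface_close(xch);
    return rc;
}
```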
+ + +## Implementation + +Function signature of the libXenCtrl function to call the Xen hypercall: + +```c +long xc_memory_op(libxc_handle, XENMEM_claim_pages, struct xen_memory_reservation *) +``` + +`struct xen_memory_reservation` is defined as : + +```c +struct xen_memory_reservation { + .nr_extents = nr_pages, /* number of pages to claim */ + .extent_order = 0, /* an order 0 means: 4k pages, only 0 is allowed */ + .mem_flags = 0, /* no flags, only 0 is allowed (at the moment) */ + .domid = domid /* numerical domain ID of the domain */ +}; +``` + +### Concurrency + +Xen protects the consistency of the stake of the domain +using the domain's `page_alloc_lock` and the global `heap_lock` of Xen. +Thse spin-locks prevent any "time-of-check-time-of-use" races. +As the hypercall needs to take those spin-locks, it cannot be preempted. + +### Return value + +The call returns 0 if the hypercall successfully claimed the requested amount +of memory, else it returns non-zero. + +## Current users + +### libxl and the xl CLI + +If the `struct xc_dom_image` passed by `libxl` to the +[libxenguest](https://github.com/xen-project/xen/tree/master/tools/libs/guest) +functions +[meminit_hvm()](https://github.com/xen-project/xen/blob/de0254b9/tools/libs/guest/xg_dom_x86.c#L1348-L1649) +and +[meminit_pv()](https://github.com/xen-project/xen/blob/de0254b9/tools/libs/guest/xg_dom_x86.c#L1183-L1333) +has it's `claim_enabled` field set, they, +before allocating the domain's system memory using the allocation function +[xc_populate_physmap()](https://github.com/xen-project/xen/blob/de0254b9/xen/common/memory.c#L159-L314) which calls the hypercall to allocate and populate +the domain's main system memory, will attempt to claim the to-be allocated +memory using a call to `xc_domain_claim_pages()`. +In case this fails, they do not attempt to continue and return the error code +of `xc_domain_claim_pages()`. + +Both functions also (unconditionally) reset the claim upon return. + +But, the `xl` CLI uses this functionality (unless disabled in `xl.conf`) +to make building the domains fail to prevent running out of memory inside +the `meminit_hvm` and `meminit_pv` calls. +Instead, they immediately return an error. + +This means that in case the claim fails, `xl` avoids: +- The effort of allocating the memory, thereby not blocking it for other domains. +- The effort of potentially needing to scrub the memory after the build failure. + +### xenguest + +While [xenguest](../../../xenopsd/walkthroughs/VM.build/xenguest) calls the +[libxenguest](https://github.com/xen-project/xen/tree/master/tools/libs/guest) +functions +[meminit_hvm()](https://github.com/xen-project/xen/blob/de0254b9/tools/libs/guest/xg_dom_x86.c#L1348-L1649) +and +[meminit_pv()](https://github.com/xen-project/xen/blob/de0254b9/tools/libs/guest/xg_dom_x86.c#L1183-L1333) +like `libxl` does, it does not set +[struct xc_dom_image.claim_enabled](https://github.com/xen-project/xen/blob/de0254b9/tools/include/xenguest.h#L186), +so it does not enable the first call to `xc_domain_claim_pages()` +which would claim the amount of memory that these functions will +attempt to allocate and populate for the domain. + +#### Future design ideas for improved NUMA support + +For improved support for [NUMA](../../../toolstack/features/NUMA/), `xenopsd` +may want to call an updated version of this function for the domain, so it has +a stake on the NUMA node's memory before `xenguest` will allocate for the domain +before assigning an NUMA node to a new domain. 
+ +Further, as PV drivers `unmap` and `free` memory for grant tables to Xen and +then re-allocate memory for those grant tables, `xenopsd` may want to try to +stake a very small claim for the domain on the NUMA node of the domain so that +Xen can increase this claim when the PV drivers `free` this memory and re-use +the resulting claimed amount for allocating the grant tables. This would ensure +that the grant tables are then allocated on the local NUMA node of the domain, +avoiding remote memory accesses when accessing the grant tables from inside +the domain. + +Note: In case the corresponding backend process in Dom0 is running on another +NUMA node, it would access the domain's grant tables from a remote NUMA node, +but in this would enable a future improvement for Dom0, where it could prefer to +run the corresponding backend process on the same or a neighbouring NUMA node. diff --git a/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md b/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md new file mode 100644 index 00000000000..03b28e6b213 --- /dev/null +++ b/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md @@ -0,0 +1,146 @@ +--- +title: xc_domain_node_setaffinity() +description: Set a Xen domain's NUMA node affinity for memory allocations +mermaid: + force: true +--- + +`xc_domain_node_setaffinity()` controls the NUMA node affinity of a domain, +but it only updates the Xen hypervisor domain's `d->node_affinity` mask. +This mask is read by the Xen memory allocator as the 2nd preference for the +NUMA node to allocate memory from for this domain. + +> [!info] Preferences of the Xen memory allocator: +> 1. A NUMA node passed to the allocator directly takes precedence, if present. +> 2. Then, if the allocation is for a domain, it's `node_affinity` mask is tried. +> 3. Finally, it falls back to spread the pages over all remaining NUMA nodes. + +As this call has no practical effect on the Xen scheduler, vCPU affinities +need to be set separately anyways. + +The domain's `auto_node_affinity` flag is enabled by default by Xen. This means +that when setting vCPU affinities, Xen updates the `d->node_affinity` mask +to consist of the NUMA nodes to which its vCPUs have affinity to. + +See [xc_vcpu_setaffinity()](xc_vcpu_setaffinity) for more information +on how `d->auto_node_affinity` is used to set the NUMA node affinity. + +Thus, so far, there is no obvious need to call `xc_domain_node_setaffinity()` +when building a domain. + +Setting the NUMA node affinity using this call can be used, +for example, when there might not be enough memory on the +preferred NUMA node, but there are other NUMA nodes that have +enough free memory to be used for the system memory of the domain. + +In terms of future NUMA design, it might be even more favourable to +have a strategy in `xenguest` where in such cases, the superpages +of the preferred node are used first and a fallback to neighbouring +NUMA nodes only happens to the extent necessary. + +Likely, the future allocation strategy should be passed to `xenguest` +using Xenstore like the other platform parameters for the VM. 
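To show the shape of such a call before walking through its implementation, here is a hedged C sketch that restricts a domain's node affinity to a single NUMA node. The helper name `set_node_affinity_example` is invented for this example; it assumes the standard libxenctrl helper `xc_nodemap_alloc()` and sets the node's bit in the byte-granular nodemap by hand. It is not code used by xenopsd.

```c
/* Hedged sketch: prefer memory allocations for domid from one NUMA node.
 * Remember that vCPU affinities must still be set separately. */
#include <stdlib.h>
#include <xenctrl.h>

int set_node_affinity_example(xc_interface *xch, uint32_t domid, int node)
{
    xc_nodemap_t nodemap = xc_nodemap_alloc(xch); /* zeroed, one bit per node */
    if (!nodemap)
        return -1;

    /* Set the bit for the chosen node in the byte-wide bitmap. */
    nodemap[node / 8] |= 1 << (node % 8);

    /* Updates d->node_affinity and disables d->auto_node_affinity;
     * returns 0 on success, non-zero if the mask has no online node. */
    int rc = xc_domain_node_setaffinity(xch, domid, nodemap);

    free(nodemap);
    return rc;
}
```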
+ +## Walk-through of xc_domain_node_setaffinity() + +```mermaid +classDiagram +class `xc_domain_node_setaffinity()` { + +xch: xc_interface #42; + +domid: uint32_t + +nodemap: xc_nodemap_t + 0(on success) + -EINVAL(if a node in the nodemask is not online) +} +click `xc_domain_node_setaffinity()` href " +https://github.com/xen-project/xen/blob/master/tools/libs/ctrl/xc_domain.c#L122-L158" + +`xc_domain_node_setaffinity()` --> `Xen hypercall: do_domctl()` +`xc_domain_node_setaffinity()` <-- `Xen hypercall: do_domctl()` +class `Xen hypercall: do_domctl()` { + Calls domain_set_node_affinity#40;#41; and returns its return value + Passes: domain (struct domain *, looked up using the domid) + Passes: new_affinity (modemask, converted from xc_nodemap_t) +} +click `Xen hypercall: do_domctl()` href " +https://github.com/xen-project/xen/blob/master/xen/common/domctl.c#L516-L525" + +`Xen hypercall: do_domctl()` --> `domain_set_node_affinity()` +`Xen hypercall: do_domctl()` <-- `domain_set_node_affinity()` +class `domain_set_node_affinity()` { + domain: struct domain + new_affinity: nodemask + 0(on success, the domain's node_affinity is updated) + -EINVAL(if a node in the nodemask is not online) +} +click `domain_set_node_affinity()` href " +https://github.com/xen-project/xen/blob/master/xen/common/domain.c#L943-L970" +``` + +### domain_set_node_affinity() + +This function implements the functionality of `xc_domain_node_setaffinity` +to set the NUMA affinity of a domain as described above. +If the new_affinity does not intersect the `node_online_map`, +it returns `-EINVAL`. Otherwise, the result is a success, and it returns `0`. + +When the `new_affinity` is a specific set of NUMA nodes, it updates the NUMA +`node_affinity` of the domain to these nodes and disables `d->auto_node_affinity` +for this domain. With `d->auto_node_affinity` disabled, +[xc_vcpu_setaffinity()](xc_vcpu_setaffinity) no longer updates the NUMA affinity +of this domain. + +If `new_affinity` has all bits set, it re-enables the `d->auto_node_affinity` +for this domain and calls +[domain_update_node_aff()](https://github.com/xen-project/xen/blob/e16acd80/xen/common/sched/core.c#L1809-L1876) +to re-set the domain's `node_affinity` mask to the NUMA nodes of the current +the hard and soft affinity of the domain's online vCPUs. + +### Flowchart in relation to xc_set_vcpu_affinity() + +The effect of `domain_set_node_affinity()` can be seen more clearly on this +flowchart which shows how `xc_set_vcpu_affinity()` is currently used to set +the NUMA affinity of a new domain, but also shows how `domain_set_node_affinity()` +relates to it: + +{{% include "xc_vcpu_setaffinity-xenopsd-notes.md" %}} +{{% include "xc_vcpu_setaffinity-xenopsd.md" %}} + +`xc_domain_node_setaffinity` can be used to set the domain's `node_affinity` +(which is normally set by `xc_set_vcpu_affinity`) to different NUMA nodes. + +#### No effect on the Xen scheduler + +Currently, the node affinity does not affect the Xen scheudler: +In case `d->node_affinity` would be set before vCPU creation, the initial pCPU +of the new vCPU is the first pCPU of the first NUMA node in the domain's +`node_affinity`. This is further changed when one of more `cpupools` are set up. +As this is only the initial pCPU of the vCPU, this alone does not change the +scheduling of Xen Credit scheduler as it reschedules the vCPUs to other pCPUs. 
+ +## Notes on future design improvements + +### It may be possible to call it before vCPUs are created + +When done early, before vCPU creation, some domain-related data structures +could be allocated using the domain's `d->node_affinity` NUMA node mask. + +With further changes in Xen and `xenopsd`, Xen could allocate the vCPU structs +on the affine NUMA nodes of the domain. + +For this, would be that `xenopsd` would have to call `xc_domain_node_setaffinity()` +before vCPU creation, after having decided the domain's NUMA placement, +preferably including claiming the required memory for the domain to ensure +that the domain will be populated from the same NUMA node(s). + +This call cannot influence the past: The `xenopsd` +[VM_create](../../xenopsd/walkthroughs/VM.start.md#2-create-a-xen-domain) +micro-ops calls `Xenctrl.domain_create`. It currently creates +the domain's data structures before `numa_placement` was done. + +Improving `Xenctrl.domain_create` to pass a NUMA node +for allocating the Hypervisor's data structures (e.g. vCPU) +of the domain would require changes +to the Xen hypervisor and the `xenopsd` +[xenopsd VM_create](../../xenopsd/walkthroughs/VM.start.md#2-create-a-xen-domain) +micro-op. diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md new file mode 100644 index 00000000000..48ebf1185dd --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md @@ -0,0 +1,30 @@ +--- +title: Simplified flowchart of xc_vcpu_setaffinity() +description: See lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md for an extended version +hidden: true +--- +```mermaid +flowchart TD +subgraph libxenctrl + xc_vcpu_setaffinity("xc_vcpu_setaffinity()")--hypercall-->xen +end +subgraph xen[Xen Hypervisor] +direction LR +vcpu_set_affinity("vcpu_set_affinity()
set the vCPU affinity") -->check_auto_node{"Is the domain's<br>auto_node_affinity<br>enabled?"} --"yes<br>(default)"--> auto_node_affinity("Set the<br>domain's<br>node_affinity + mask as well<br>(used for further<br>NUMA memory<br>
allocation)") + +click xc_vcpu_setaffinity +"https://github.com/xen-project/xen/blob/7cf16387/tools/libs/ctrl/xc_domain.c#L199-L250" _blank +click vcpu_set_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1353-L1393" _blank +click domain_update_node_aff +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1809-L1876" _blank +click check_auto_node +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1840-L1870" _blank +click auto_node_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1867-L1869" _blank +end +``` diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd-notes.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd-notes.md new file mode 100644 index 00000000000..a6e7a8be5be --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd-notes.md @@ -0,0 +1,13 @@ +--- +title: Notes for the flowchart on the use of setaffinity for VM.start +hidden: true +--- +In the flowchart, two code paths are set in bold: +- Show the path when `Host.numa_affinity_policy` is the default (off) in `xenopsd`. +- Show the default path of `xc_vcpu_setaffinity(XEN_VCPUAFFINITY_SOFT)` in Xen, + when the Domain's `auto_node_affinity` flag is enabled (default) to show + how it changes to the vCPU affinity update the domain's `node_affinity` + in this default case as well. + +[xenguest](../../xenopsd/walkthroughs/VM.build/xenguest/) uses the Xenstore +to read the static domain configuration that it needs reads to build the domain. diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md new file mode 100644 index 00000000000..f1fddecfbca --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md @@ -0,0 +1,176 @@ +--- +title: Flowchart of the use of xc_vcpu_setaffinity() by xenopsd +description: Shows how xenopsd uses xc_vcpu_setaffinity() to set NUMA affinity +hidden: true +--- +```mermaid +flowchart TD + +subgraph VM.create["xenopsd VM.create"] + + %% Is xe vCPU-params:mask= set? If yes, write to Xenstore: + + is_xe_vCPUparams_mask_set?{" + + Is + xe vCPU-params:mask= + set? Example: 1,2,3 + (Is used to enable vCPU
hard-affinity) + + "} --"yes"--> set_hard_affinity("Write hard-affinity to XenStore: + platform/vcpu/#domid/affinity + (xenguest will read this and other configuration data + from Xenstore)") + +end + +subgraph VM.build["xenopsd VM.build"] + + %% Labels of the decision nodes + + is_Host.numa_affinity_policy_set?{ + Is

Host.numa_affinity_policy

set?} + has_hard_affinity?{ + Is hard-affinity configured in

platform/vcpu/#domid/affinity?} + + %% Connections from VM.create: + set_hard_affinity --> is_Host.numa_affinity_policy_set? + is_xe_vCPUparams_mask_set? == "no"==> is_Host.numa_affinity_policy_set? + + %% The Subgraph itself: + + %% Check Host.numa_affinity_policy + + is_Host.numa_affinity_policy_set? + + %% If Host.numa_affinity_policy is "best_effort": + + -- Host.numa_affinity_policy is

best_effort --> + + %% If has_hard_affinity is set, skip numa_placement: + + has_hard_affinity? + --"yes"-->exec_xenguest + + %% If has_hard_affinity is not set, run numa_placement: + + has_hard_affinity? + --"no"-->numa_placement-->exec_xenguest + + %% If Host.numa_affinity_policy is off (default, for now), + %% skip NUMA placement: + + is_Host.numa_affinity_policy_set? + =="default: disabled"==> + exec_xenguest +end + +%% xenguest subgraph + +subgraph xenguest + + exec_xenguest + + ==> stub_xc_hvm_build("stub_xc_hvm_build()") + + ==> configure_vcpus("configure_vcpus()") + + %% Decision + ==> set_hard_affinity?{" + Is platform/
vcpu/#domid/affinity
+ set?"} + +end + +%% do_domctl Hypercalls + +numa_placement + --Set the NUMA placement using soft-affinity--> + XEN_VCPUAFFINITY_SOFT("xc_vcpu_setaffinity(SOFT)") + ==> do_domctl + +set_hard_affinity? + --yes--> + XEN_VCPUAFFINITY_HARD("xc_vcpu_setaffinity(HARD)") + --> do_domctl + +xc_domain_node_setaffinity("xc_domain_node_setaffinity() + and + xc_domain_node_getaffinity()") + <--> do_domctl + +%% Xen subgraph + +subgraph xen[Xen Hypervisor] + + subgraph domain_update_node_affinity["domain_update_node_affinity()"] + domain_update_node_aff("domain_update_node_aff()") + ==> check_auto_node{"Is domain's
auto_node_affinity
enabled?"} + =="yes (default)"==>set_node_affinity_from_vcpu_affinities(" + Calculate the domain's node_affinity mask from vCPU affinity + (used for further NUMA memory allocation for the domain)") + end + + do_domctl{"do_domctl()
op->cmd=?"} + ==XEN_DOMCTL_setvcpuaffinity==> + vcpu_set_affinity("vcpu_set_affinity()
set the vCPU affinity") + ==>domain_update_node_aff + do_domctl + --XEN_DOMCTL_setnodeaffinity (not used currently) + -->is_new_affinity_all_nodes? + + subgraph domain_set_node_affinity["domain_set_node_affinity()"] + + is_new_affinity_all_nodes?{new_affinity
is #34;all#34;?} + + --is #34;all#34; + + --> enable_auto_node_affinity("auto_node_affinity=1") + --> domain_update_node_aff + + is_new_affinity_all_nodes? + + --not #34;all#34; + + --> disable_auto_node_affinity("auto_node_affinity=0") + --> domain_update_node_aff + end + +%% setting and getting the struct domain's node_affinity: + +disable_auto_node_affinity + --node_affinity=new_affinity--> + domain_node_affinity + +set_node_affinity_from_vcpu_affinities + ==> domain_node_affinity@{ shape: bow-rect,label: "domain: node_affinity" } + --XEN_DOMCTL_getnodeaffinity--> do_domctl + +end +click is_Host.numa_affinity_policy_set? +"https://github.com/xapi-project/xen-api/blob/90ef043c1f3a3bc20f1c5d3ccaaf6affadc07983/ocaml/xenopsd/xc/domain.ml#L951-L962" +click numa_placement +"https://github.com/xapi-project/xen-api/blob/90ef043c/ocaml/xenopsd/xc/domain.ml#L862-L897" +click stub_xc_hvm_build +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L2329-L2436" _blank +click get_flags +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1164-L1288" _blank +click do_domctl +"https://github.com/xen-project/xen/blob/7cf163879/xen/common/domctl.c#L282-L894" _blank +click domain_set_node_affinity +"https://github.com/xen-project/xen/blob/7cf163879/xen/common/domain.c#L943-L970" _blank +click configure_vcpus +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1297-L1348" _blank +click set_hard_affinity? +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1305-L1326" _blank +click xc_vcpu_setaffinity +"https://github.com/xen-project/xen/blob/7cf16387/tools/libs/ctrl/xc_domain.c#L199-L250" _blank +click vcpu_set_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1353-L1393" _blank +click domain_update_node_aff +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1809-L1876" _blank +click check_auto_node +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1840-L1870" _blank +click set_node_affinity_from_vcpu_affinities +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1867-L1869" _blank +``` diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md new file mode 100644 index 00000000000..8586492d9cc --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md @@ -0,0 +1,92 @@ +--- +title: xc_vcpu_setaffinity() +description: Set a Xen vCPU's pCPU affinity and the domain's NUMA node affinity +mermaid: + force: true +--- +## Introduction + +In the Xen hypervisor, each vCPU has: + +- A _soft affinity_, This is the list of pCPUs where a vCPU prefers to run: + + This can be used in cases to make vCPUs prefer to run on a set on pCPUs, + for example the pCPUs of a NUMA node, but in case those are already busy, + the Credit schedule can still ignore the soft-affinity. + A typical use case for this are NUMA machines, where the soft affinity + for the vCPUs of a domain should be set equal to the pCPUs of the NUMA node where the domain's memory shall be placed. + + See the description of the [NUMA feature](../../../toolstack/features/NUMA/) + for more details. + +- A _hard affinity_, also known as pinning. + This is the list of pCPUs where a vCPU is allowed to run + + Hard affinity is currently not used for NUMA placement, but can be configured + manually for a given domain, either using `xe VCPUs-params:mask=` or the API. 
+ + For example, the vCPU’s pinning can be configured using a template with: + ```py + xe template-param-set uuid= vCPUs-params:mask=1,2,3 + ``` + + There are also host-level `guest_VCPUs_params` which are used by + `host-cpu-tune` to exclusively pin Dom0 and guests (i.e. that their + pCPUs never overlap). Note: This isn't currently supported by the + NUMA code: It could result that the NUMA placement picks a node that + has reduced capacity or unavailable due to the host mask that + `host-cpu-tune` has set. + +## Purpose + +The libxenctrl library call `xc_set_vcpu_affinity()` +controls the pCPU affinity of the given vCPU. + +[xenguest](../../../xenopsd/walkthroughs/VM.build/xenguest/#walkthrough-of-the-xenguest-build-mode) +uses it when building domains if +[xenopsd](../../xenopsd/walkthroughs/VM.build/Domain.build) +added vCPU affinity information to the XenStore platform data path +`platform/vcpu/#domid/affinity` of the domain. + +### Updating the NUMA node affinity of a domain + +Besides that, `xc_set_vcpu_affinity()` can also modify the NUMA node +affinity of the Xen domain if the vCPU: + +When Xen creates a domain, it enables the domain's `d->auto_node_affinity` +feature flag. + +When it is enabled, setting the vCPU affinity also updates the NUMA node +affinity which is used for memory allocations for the domain: + +### Simplified flowchart + +{{% include "xc_vcpu_setaffinity-simplified.md" %}} + +## Current use by xenopsd and xenguest + +When `Host.numa_affinity_policy` is set to +[best_effort](../../../toolstack/features/NUMA/#xapi-datamodel-design), +[xenopsd](../../../xenopsd/walkthroughs/VM.build) attempts NUMA node placement +when building new VMs and instructs +[xenguest](../../../xenopsd/walkthroughs/VM.build/xenguest/#walkthrough-of-the-xenguest-build-mode) +to set the vCPU affinity of the domain. + +With the domain's `auto_node_affinity` flag enabled by default in Xen, +this automatically also sets the `d->node_affinity` mask of the domain. + +This then causes the Xen memory allocator to prefer the NUMA nodes in the +`d->node_affinity` NUMA node mask when allocating memory. + +That is, (for completeness) unless Xen's allocation function +`alloc_heap_pages()` receives a specific NUMA node in its `memflags` +argument when called. + +See [xc_domain_node_setaffinity()](xc_domain_node_setaffinity) for more +information about another way to set the `node_affinity` NUMA node mask +of Xen domains and more depth on how it is used in Xen. + +### Flowchart of its current use for NUMA affinity + +{{% include "xc_vcpu_setaffinity-xenopsd-notes.md" %}} +{{% include "xc_vcpu_setaffinity-xenopsd.md" %}} diff --git a/doc/content/squeezed/architecture/index.md b/doc/content/squeezed/architecture/index.md index fb86fd69989..2f7135fe926 100644 --- a/doc/content/squeezed/architecture/index.md +++ b/doc/content/squeezed/architecture/index.md @@ -1,8 +1,9 @@ +++ -title = "Architecture" +title = "Squeezed Architecture" +linkTitle = "Architecture" +++ -Squeezed is responsible for managing the memory on a single host. Squeezed +Squeezed is the XAPI Toolstack’s host memory ballooning daemon. It "balances" memory between VMs according to a policy written to Xenstore. 
The following diagram shows the internals of Squeezed: diff --git a/doc/content/xapi/internals/certificates.md b/doc/content/xapi/internals/certificates.md new file mode 100644 index 00000000000..63a4d0b84a0 --- /dev/null +++ b/doc/content/xapi/internals/certificates.md @@ -0,0 +1,111 @@ + ++++ +title = "Certificates and PEM Files" ++++ + +Xapi uses certificates for secure communication within a pool and with +external clients. These certificates are using the PEM file format and +reside in the Dom0 file system. This documents explains the purpose of +these files. + +##  Design Documents + +* [Pool Certificates](../../design/pool-certificates.md) +* [User Certificates](../../design/user-certificates.md) + +## Paths + +Below are paths used by Xapi for certificates; additional certficates +may be installed but they are not fundamental for Xapi's operation. + +``` +/etc/xensource/xapi-ssl.pem +/etc/xensource/xapi-pool-tls.pem +/etc/stunnel/certs-pool/1c111a1f-412e-47c0-9003-60789b839bc3.pem +/etc/stunnel/certs-pool/960abfff-6017-4d97-bd56-0a8f1a43e51a.pem +/etc/stunnel/xapi-stunnel-ca-bundle.pem +/etc/stunnel/certs/ +/etc/stunnel/xapi-pool-ca-bundle.pem +``` + + +## Fundamental Certificates + +Certificates that identify a host. These certificates are comprised of +both a private and a public key. The public key may be distributed to +other hosts. + +### xapi-ssl.pem + +This certificate identifies a host for extra-pool clients. + +This is the certificate used by the API HTTPS server that clients like +XenCenter or CVAD connect to. On installation of XenServer it is auto +generated but can be updated by a user using the API. This is the most +important certificate for a user to establish an HTTPS connection to a +pool or host to be used as an API. + +* /etc/xensource/xapi-ssl.pem +* contains private and public key for this host +* `Host.get_server_certificate` API call +* referenced by /etc/stunnel/xapi.conf +* `xe host-server-certificate-install` XE command to replace the + certificate. +* See below for xapi-stunnel-ca-bundle for additional certificates that + can be added to a pool in support of a user-supplied host certificate. +* `xe reset-server-certificate` creates a new self-signed certificate. + + +### `xapi-pool-tls.pem` + +This certificate identifies a host inside a pool. It is auto generated +and used for all intra-pool HTTPS connections. It needs to be +distributed inside a pool to establish trust. The distribution of the +public part of the certificate is performed by the API and must not be +done manually. + +* /etc/xensource/xapi-pool-tls.pem +* contains private and public key for this host +* referenced by /etc/stunnel/xapi.conf +* This certificate can be re-generated using the API or XE +* `Host.refresh_server_certificate` +* `xe host-refresh-server-certificate` + +## Certificate Bundles + +Certifiacte bundles are used by stunnel. They are a collection of public +keys from hosts and certificates provided by a user. Knowing a host's +public key facilitates stunnel connecting to the host. + +Bundles by themselves are a technicality as they organise a set of +certificates in a single file but don't add new certificates. + +### `xapi-pool-ca-bundle.pem` and `certs-pool/*.pem` + +Collection of public keys from xapi-pool-tls.pem across the +pool. The public keys are collected in the certs-pool directory: each is +named after the UUID of its host and the bundle is constructed from +them. 
+
+* bundle of public keys from hosts' `xapi-pool-tls.pem`
+* constructed from PEM files in `certs-pool/`
+* `/opt/xensource/bin/update-ca-bundle.sh` generates the bundle from PEM
+  files
+
+### `xapi-stunnel-ca-bundle.pem` and `certs/*.pem`
+
+User-supplied certificates; they are not essential for the operation of
+a pool from Xapi's perspective. They make stunnel aware of certificates
+used by clients when using HTTPS for API calls.
+
+* in a plain pool installation, these are empty; PEMs supplied by a user
+  are stored here and bundled into the `xapi-stunnel-ca-bundle.pem`.
+* bundle of public keys supplied by a user
+* constructed from PEM files in `certs/`
+* `/opt/xensource/bin/update-ca-bundle.sh` generates the bundle from PEM files
+* Updated by a user using `xe pool-install-ca-certificate`
+* `Pool.install_ca_certificate`
+* `Pool.uninstall_ca_certificate`
+* `xe pool-certificate-sync` explicitly distributes these certificates in
+  the pool.
+* User-provided certificates can be used to let xapi connect to WLB.
diff --git a/doc/content/xen-api/topics/vm-lifecycle.md b/doc/content/xen-api/topics/vm-lifecycle.md
index 7390dc61e80..44727bdf3f0 100644
--- a/doc/content/xen-api/topics/vm-lifecycle.md
+++ b/doc/content/xen-api/topics/vm-lifecycle.md
@@ -2,7 +2,7 @@
title = "VM Lifecycle"
+++
-The following figure shows the states that a VM can be in and the
+The following figure shows the states that a VM can be in and the
API calls that can be used to move the VM between these states.
```mermaid
diff --git a/doc/content/xenopsd/architecture/_index.md b/doc/content/xenopsd/architecture/_index.md
index 0f4d5eccea5..8211e838684 100644
--- a/doc/content/xenopsd/architecture/_index.md
+++ b/doc/content/xenopsd/architecture/_index.md
@@ -1,5 +1,6 @@
+++
-title = "Architecture"
+title = "Xenopsd Architecture"
+linkTitle = "Architecture"
+++
Xenopsd instances run on a host and manage VMs on behalf of clients. This
diff --git a/doc/content/xenopsd/design/_index.md b/doc/content/xenopsd/design/_index.md
index a55a9b124b7..2047d068ad5 100644
--- a/doc/content/xenopsd/design/_index.md
+++ b/doc/content/xenopsd/design/_index.md
@@ -1,3 +1,6 @@
+++
title = "Design"
-+++
\ No newline at end of file
++++
+
+Design documents for `xenopsd`:
+{{% children %}}
diff --git a/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md b/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md
new file mode 100644
index 00000000000..ba4274e243a
--- /dev/null
+++ b/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md
@@ -0,0 +1,146 @@
+---
+title: Domain.build
+description:
+  "Prepare the build of a VM: Wait for scrubbing, do NUMA placement, run xenguest."
+---
+
+## Overview
+
+```mermaid
+flowchart LR
+subgraph xenopsd VM_build[
+    xenopsd thread pool with two VM_build micro#8209;ops:
+    During parallel VM_start, many threads run this in parallel!
+]
+direction LR
+build_domain_exn[
+    VM.build_domain_exn
+    from thread pool Thread #1
+] --> Domain.build
+Domain.build --> build_pre
+build_pre --> wait_xen_free_mem
+build_pre -->|if NUMA/Best_effort| numa_placement
+Domain.build --> xenguest[Invoke xenguest]
+click Domain.build "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1111-L1210" _blank
+click build_domain_exn "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2222-L2225" _blank
+click wait_xen_free_mem "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L236-L272" _blank
+click numa_placement "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L862-L897" _blank
+click build_pre "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L899-L964" _blank
+click xenguest "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1139-L1146" _blank
+
+build_domain_exn2[
+    VM.build_domain_exn
+    from thread pool Thread #2] --> Domain.build2[Domain.build]
+Domain.build2 --> build_pre2[build_pre]
+build_pre2 --> wait_xen_free_mem2[wait_xen_free_mem]
+build_pre2 -->|if NUMA/Best_effort| numa_placement2[numa_placement]
+Domain.build2 --> xenguest2[Invoke xenguest]
+click Domain.build2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1111-L1210" _blank
+click build_domain_exn2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2222-L2225" _blank
+click wait_xen_free_mem2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L236-L272" _blank
+click numa_placement2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L862-L897" _blank
+click build_pre2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L899-L964" _blank
+click xenguest2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1139-L1146" _blank
+end
+```
+
+[`VM.build_domain_exn`](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2024-L2248)
+[calls](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2222-L2225)
+[`Domain.build`](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1111-L1210)
+to call:
+- `build_pre` to prepare the build of a VM:
+  - If the `xe` config `numa_placement` is set to `Best_effort`, invoke the NUMA placement algorithm.
+- `xenguest` to invoke the [xenguest](xenguest) program to set up the domain's system memory.
+
+## build_pre: Prepare building the VM
+
+[Domain.build](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1111-L1210)
+[calls](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1137)
+[build_pre](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L899-L964)
+(which is also used for VM restore) to:
+
+1. [Call](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L902-L911)
+   [wait_xen_free_mem](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L236-L272)
+   to wait (if necessary) for the Xen memory scrubber to catch up reclaiming memory.
+   It:
+   1. calls `Xenctrl.physinfo` which returns:
+      - `hostinfo.free_pages` - the free and already scrubbed pages (available)
+      - `host.scrub_pages` - the not yet scrubbed pages (not yet available)
+   2. repeats this until a timeout as long as `free_pages` is *lower*
+      than the *required* pages
+      - unless `scrub_pages` is 0 (no scrubbing left to do)
+
+   Note: `free_pages` is system-wide memory, not memory specific to a NUMA node.
+   Because this is not NUMA-aware, in case of temporary node-specific memory shortage,
+   this check is not sufficient to prevent the VM from being spread over all NUMA nodes.
+   It is planned to resolve this issue by claiming NUMA node memory during NUMA placement.
+
+2. Call the hypercall to set the timer mode
+3. Call the hypercall to set the number of vCPUs
+4. Call the `numa_placement` function
+   as described in the [NUMA feature description](/toolstack/features/NUMA)
+   when the `xe` configuration option `numa_placement` is set to `Best_effort`
+   (except when the VM has a hard CPU affinity).
+
+   ```ml
+   match !Xenops_server.numa_placement with
+   | Any ->
+       ()
+   | Best_effort ->
+       log_reraise (Printf.sprintf "NUMA placement") (fun () ->
+           if has_hard_affinity then
+             D.debug "VM has hard affinity set, skipping NUMA optimization"
+           else
+             numa_placement domid ~vcpus
+               ~memory:(Int64.mul memory.xen_max_mib 1048576L)
+       )
+   ```
+
+## NUMA placement
+
+`build_pre` passes the `domid`, the number of `vCPUs` and `xen_max_mib` to the
+[numa_placement](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L862-L897)
+function to run the algorithm to find the best NUMA placement.
+
+When it returns a NUMA node to use, it calls the Xen hypercalls
+to set the vCPU affinity to this NUMA node:
+
+```ml
+  let vm = NUMARequest.make ~memory ~vcpus in
+  let nodea =
+    match !numa_resources with
+    | None ->
+        Array.of_list nodes
+    | Some a ->
+        Array.map2 NUMAResource.min_memory (Array.of_list nodes) a
+  in
+  numa_resources := Some nodea ;
+  Softaffinity.plan ~vm host nodea
+```
+
+Because Xen's `auto_node_affinity` feature is enabled by default for new domains,
+setting the vCPU affinity also causes the Xen hypervisor to align the
+NUMA node affinity used for memory allocations with
+the vCPU affinity of the domain.
+
+Summary: This passes the information to the hypervisor that memory
+allocation for this domain should preferably be done from this NUMA node.
+
+## Invoke the xenguest program
+
+With the preparation in `build_pre` completed, `Domain.build`
+[calls](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1127-L1155)
+the `xenguest` function to invoke the [xenguest](xenguest) program to build the domain.
+
+## Notes on future design improvements
+
+The Xen domain feature flag
+[domain->auto_node_affinity](https://wiki.xenproject.org/wiki/NUMA_node_affinity_in_the_Xen_hypervisor)
+can be disabled by calling
+[xc_domain_node_setaffinity()](../../references/xc_domain_node_setaffinity.md)
+to set a specific NUMA node affinity in special cases:
+
+This can be used, for example, when there might not be enough memory on the preferred
+NUMA node, and there are other NUMA nodes (in the same CPU package) to use
+([reference](../../../lib/xenctrl/xc_domain_node_setaffinity.md)).
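+
+## Example: waiting for scrubbed memory (sketch)
+
+The `build_pre` step described above polls the hypervisor via `wait_xen_free_mem`
+until enough scrubbed memory is available. The following OCaml sketch only
+illustrates that idea; it is not the code from `domain.ml`. It assumes that the
+`Xenctrl` bindings used by xenopsd expose `with_intf`, `physinfo` and the
+`free_pages`/`scrub_pages` fields with the shapes used below.
+
+```ml
+(* Illustrative sketch: poll Xen's physinfo until enough scrubbed pages are
+   free; give up when nothing is left to scrub or after max_retries polls. *)
+let enough_free_memory ~required_pages ~max_retries =
+  Xenctrl.with_intf (fun xc ->
+      let rec loop retries =
+        let info = Xenctrl.physinfo xc in
+        let free = Int64.of_nativeint info.Xenctrl.free_pages in
+        let scrubbing = Int64.of_nativeint info.Xenctrl.scrub_pages in
+        if free >= required_pages then
+          true (* enough scrubbed memory is already available *)
+        else if scrubbing = 0L || retries <= 0 then
+          false (* nothing left to scrub, or we timed out waiting *)
+        else (
+          Unix.sleepf 0.25 ;
+          loop (retries - 1)
+        )
+      in
+      loop max_retries
+  )
+```
+
+As in `wait_xen_free_mem`, this check is system-wide: it does not tell us
+whether the free memory is on the NUMA node that the placement algorithm
+is about to choose.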
diff --git a/doc/content/xenopsd/walkthroughs/VM.build/VM_build-chart.md b/doc/content/xenopsd/walkthroughs/VM.build/VM_build-chart.md new file mode 100644 index 00000000000..eec1f05fc0e --- /dev/null +++ b/doc/content/xenopsd/walkthroughs/VM.build/VM_build-chart.md @@ -0,0 +1,27 @@ +--- +hidden: true +title: VM_build micro-op flowchart +description: For inclusion in _index.md and VM_build.md +weight: 10 +--- + +```mermaid +flowchart +subgraph xenopsd VM_build[xenopsd: VM_build micro#8209;op] +direction LR +VM_build --> VM.build +VM.build --> VM.build_domain +VM.build_domain --> VM.build_domain_exn +VM.build_domain_exn --> Domain.build +click VM_build " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/lib/xenops_server.ml#L2255-L2271" _blank +click VM.build " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/xc/xenops_server_xen.ml#L2290-L2291" _blank +click VM.build_domain " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/xc/xenops_server_xen.ml#L2250-L2288" _blank +click VM.build_domain_exn " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/xc/xenops_server_xen.ml#L2024-L2248" _blank +click Domain.build " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/xc/domain.ml#L1111-L1210" _blank +end +``` diff --git a/doc/content/xenopsd/walkthroughs/VM.build/VM_build.md b/doc/content/xenopsd/walkthroughs/VM.build/VM_build.md new file mode 100644 index 00000000000..f83cccf5353 --- /dev/null +++ b/doc/content/xenopsd/walkthroughs/VM.build/VM_build.md @@ -0,0 +1,46 @@ +--- +title: VM_build micro-op +linkTitle: VM_build μ-op +description: Overview of the VM_build μ-op (runs after the VM_create μ-op created the domain). +weight: 10 +mermaid: + force: true +--- + +## Overview + +On Xen, `Xenctrl.domain_create` creates an empty domain and +returns the domain ID (`domid`) of the new domain to `xenopsd`. + +In the `build` phase, the `xenguest` program is called to create +the system memory layout of the domain, set vCPU affinity and a +lot more. + +The [VM_build](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L2255-L2271) +micro-op collects the VM build parameters and calls +[VM.build](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2290-L2291), +which calls +[VM.build_domain](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2250-L2288), +which calls +[VM.build_domain_exn](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2024-L2248) +which calls [Domain.build](Domain.build): + +{{% include "VM_build-chart.md" %}} + +The function +[VM.build_domain_exn](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2024) +must: + +1. Run pygrub (or eliloader) to extract the kernel and initrd, if necessary +2. [Call](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2222-L2225) + [Domain.build](Domain.build) to + - optionally run NUMA placement and + - invoke [xenguest](VM.build/xenguest) to set up the domain memory. + + See the walk-through of the [Domain.build](Domain.build) function + for more details on this phase. +3. Apply the `cpuid` configuration +4. 
Store the current domain configuration on disk -- it's important to know
+   the difference between the configuration you started with and the configuration
+   you would use after a reboot because some properties (such as maximum memory
+   and vCPUs) are fixed on create.
diff --git a/doc/content/xenopsd/walkthroughs/VM.build/_index.md b/doc/content/xenopsd/walkthroughs/VM.build/_index.md
new file mode 100644
index 00000000000..63770bf6bdc
--- /dev/null
+++ b/doc/content/xenopsd/walkthroughs/VM.build/_index.md
@@ -0,0 +1,12 @@
+---
+title: Building a VM
+description: After VM_create, VM_build builds the core of the domain (vCPUs, memory)
+weight: 20
+mermaid:
+  force: true
+---
+{{% include "VM_build-chart.md" %}}
+
+Walk-through documents for the `VM_build` phase:
+
+{{% children description=true %}}
diff --git a/doc/content/xenopsd/walkthroughs/VM.build/xenguest.md b/doc/content/xenopsd/walkthroughs/VM.build/xenguest.md
new file mode 100644
index 00000000000..70908d556fb
--- /dev/null
+++ b/doc/content/xenopsd/walkthroughs/VM.build/xenguest.md
@@ -0,0 +1,185 @@
+---
+title: xenguest
+description:
+  "Perform the VM build: Allocate and populate the domain's system memory."
+---
+As part of starting a new domain in VM_build, `xenopsd` calls `xenguest`.
+When multiple domain build threads run in parallel,
+multiple instances of `xenguest` also run in parallel:
+
+```mermaid
+flowchart
+subgraph xenopsd VM_build[xenopsd VM_build micro#8209;ops]
+direction LR
+xenopsd1[Domain.build - Thread #1] --> xenguest1[xenguest #1]
+xenopsd2[Domain.build - Thread #2] --> xenguest2[xenguest #2]
+xenguest1 --> libxenguest
+xenguest2 --> libxenguest2[libxenguest]
+click xenopsd1 "../Domain.build/index.html"
+click xenopsd2 "../Domain.build/index.html"
+click xenguest1 "https://github.com/xenserver/xen.pg/blob/XS-8/patches/xenguest.patch" _blank
+click xenguest2 "https://github.com/xenserver/xen.pg/blob/XS-8/patches/xenguest.patch" _blank
+click libxenguest "https://github.com/xen-project/xen/tree/master/tools/libs/guest" _blank
+click libxenguest2 "https://github.com/xen-project/xen/tree/master/tools/libs/guest" _blank
+libxenguest --> Xen[Xen Hypervisor]
+libxenguest2 --> Xen
+end
+```
+
+## About xenguest
+
+`xenguest` is called by the xenopsd [Domain.build](Domain.build) function
+to perform the build phase for new VMs, which is part of the `xenopsd`
+[VM.start operation](VM.start).
+
+[xenguest](https://github.com/xenserver/xen.pg/blob/XS-8/patches/xenguest.patch)
+was created as a separate program due to issues with
+`libxenguest`:
+
+- It wasn't threadsafe: fixed, but it still uses a per-call global struct
+- It had an incompatible licence, but is now licensed under the LGPL.
+
+Those were fixed, but we still shell out to `xenguest`, which is currently
+carried in the patch queue for the Xen hypervisor packages, but could become
+an individual package once planned changes to the Xen hypercalls are stabilised.
+
+Over time, `xenguest` has evolved to build more of the initial domain state.
+
+## Interface to xenguest
+
+```mermaid
+flowchart
+subgraph xenopsd VM_build[xenopsd VM_build micro#8209;op]
+direction TB
+mode
+domid
+memmax
+Xenstore
+end
+mode[--mode hvm_build] --> xenguest
+domid --> xenguest
+memmax --> xenguest
+Xenstore[Xenstore platform data] --> xenguest
+```
+
+`xenopsd` must pass this information to `xenguest` to build a VM:
+
+- The domain type to build for (HVM, PVH or PV).
+  - It is passed using the command line option `--mode hvm_build`.
+- The `domid` of the created empty domain,
+- The amount of system memory of the domain,
+- A number of other parameters that are domain-specific.
+
+`xenopsd` uses the Xenstore to provide platform data:
+
+- the vCPU affinity
+- the vCPU credit2 weight/cap parameters
+- whether the NX bit is exposed
+- whether the viridian CPUID leaf is exposed
+- whether the system has PAE or not
+- whether the system has ACPI or not
+- whether the system has nested HVM or not
+- whether the system has an HPET or not
+
+When called to build a domain, `xenguest` reads those and builds the VM accordingly.
+
+## Walkthrough of the xenguest build mode
+
+```mermaid
+flowchart
+subgraph xenguest[xenguest #8209;#8209;mode hvm_build domid]
+direction LR
+stub_xc_hvm_build[stub_xc_hvm_build#40;#41;] --> get_flags[
+    get_flags#40;#41; <#8209; Xenstore platform data
+]
+stub_xc_hvm_build --> configure_vcpus[
+    configure_vcpus#40;#41; #8209;> Xen hypercall
+]
+stub_xc_hvm_build --> setup_mem[
+    setup_mem#40;#41; #8209;> Xen hypercalls to set up domain memory
+]
+end
+```
+
+Based on the given domain type, the `xenguest` program calls dedicated
+functions for the build process.
+
+These are:
+
+- `stub_xc_hvm_build()` for HVM,
+- `stub_xc_pvh_build()` for PVH, and
+- `stub_xc_pv_build()` for PV domains.
+
+These domain build functions call:
+
+1. `get_flags()` to get the platform data from the Xenstore
+2. `configure_vcpus()` which uses the platform data from the Xenstore to configure vCPU affinity and the credit scheduler parameters vCPU weight and vCPU cap (max % pCPU time for throttling)
+3. The `setup_mem` function for the given VM type.
+
+## The function hvm_build_setup_mem()
+
+For HVM domains, `hvm_build_setup_mem()` is responsible for deriving the memory
+layout of the new domain, allocating the required memory and populating it for the
+new domain. It must:
+
+1. Derive the `e820` memory layout of the system memory of the domain
+   including memory holes depending on PCI passthrough and vGPU flags.
+2. Load the BIOS/UEFI firmware images
+3. Store the final MMIO hole parameters in the Xenstore
+4. Call the `libxenguest` function `xc_dom_boot_mem_init()` (see below)
+5. Call `construct_cpuid_policy()` to apply the CPUID `featureset` policy
+
+## The function xc_dom_boot_mem_init()
+
+```mermaid
+flowchart LR
+subgraph xenguest
+hvm_build_setup_mem[hvm_build_setup_mem#40;#41;]
+end
+subgraph libxenguest
+hvm_build_setup_mem --> xc_dom_boot_mem_init[xc_dom_boot_mem_init#40;#41;]
+xc_dom_boot_mem_init -->|vmemranges| meminit_hvm[meminit_hvm#40;#41;]
+click xc_dom_boot_mem_init "https://github.com/xen-project/xen/blob/39c45c/tools/libs/guest/xg_dom_boot.c#L110-L126" _blank
+click meminit_hvm "https://github.com/xen-project/xen/blob/39c45c/tools/libs/guest/xg_dom_x86.c#L1348-L1648" _blank
+end
+```
+
+`hvm_build_setup_mem()` calls
+[xc_dom_boot_mem_init()](https://github.com/xen-project/xen/blob/39c45c/tools/libs/guest/xg_dom_boot.c#L110-L126)
+to allocate and populate the domain's system memory.
+
+`xc_dom_boot_mem_init()` calls
+[meminit_hvm()](https://github.com/xen-project/xen/blob/39c45c/tools/libs/guest/xg_dom_x86.c#L1348-L1648)
+to loop over the `vmemranges` of the domain for mapping the system RAM
+of the guest from the Xen hypervisor heap. Its goals are:
+
+- Attempt to allocate 1GB superpages when possible
+- Fall back to 2MB pages when 1GB allocation fails
+- Fall back to 4k pages when both fail
+
+It uses the hypercall
+[XENMEM_populate_physmap](https://github.com/xen-project/xen/blob/39c45c/xen/common/memory.c#L1408-L1477)
+to perform memory allocation and to map the allocated memory
+to the system RAM ranges of the domain.
+
+https://github.com/xen-project/xen/blob/39c45c/xen/common/memory.c#L1022-L1071
+
+`XENMEM_populate_physmap`:
+
+1. Uses
+   [construct_memop_from_reservation](https://github.com/xen-project/xen/blob/39c45c/xen/common/memory.c#L1022-L1071)
+   to convert the arguments for allocating a page from
+   [struct xen_memory_reservation](https://github.com/xen-project/xen/blob/master/xen/include/public/memory.h#L46-L80)
+   to `struct memop_args`.
+2. Sets flags and calls functions according to the arguments
+3. Allocates the requested page at the most suitable place
+   - depending on passed flags, allocate on a specific NUMA node
+   - else, if the domain has node affinity, on the affine nodes
+   - also in the most suitable memory zone within the NUMA node
+4. Falls back to less desirable places if this fails
+   - or fails for "exact" allocation requests
+5. When no pages of the requested size are free,
+   it splits larger superpages into pages of the requested size.
+
+For more details on the VM build step involving `xenguest` and the Xen side, see:
+https://wiki.xenproject.org/wiki/Walkthrough:_VM_build_using_xenguest
diff --git a/doc/content/xenopsd/walkthroughs/VM.migrate.md b/doc/content/xenopsd/walkthroughs/VM.migrate.md
index 080ebdb8edc..8982c4690da 100644
--- a/doc/content/xenopsd/walkthroughs/VM.migrate.md
+++ b/doc/content/xenopsd/walkthroughs/VM.migrate.md
@@ -1,37 +1,44 @@
---
title: 'Walkthrough: Migrating a VM'
+linktitle: 'Migrating a VM'
+description: Walkthrough of migrating a VM from one host to another.
+weight: 50
+mermaid:
+  force: true
---
+At the end of this walkthrough, a sequence diagram of the overall process is included.
-A XenAPI client wishes to migrate a VM from one host to another within
-the same pool.
+## Invocation
-The client will issue a command to migrate the VM and it will be dispatched
+The command to migrate the VM is dispatched
by the autogenerated `dispatch_call` function from **xapi/server.ml**.
For more information about the generated functions you can have a look to [XAPI IDL model](https://github.com/xapi-project/xen-api/tree/master/ocaml/idl/ocaml_backend). -The command will trigger the operation +The command triggers the operation [VM_migrate](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/lib/xenops_server.ml#L2572) -that has low level operations performed by the backend. These atomics operations -that we will describe in the documentation are: - -- VM.restore -- VM.rename -- VBD.set_active -- VBD.plug -- VIF.set_active -- VGPU.set_active -- VM.create_device_model -- PCI.plug -- VM.set_domain_action_request - -The command have serveral parameters such as: should it be ran asynchronously, -should it be forwared to another host, how arguments should be marshalled and -so on. A new thread is created by [xapi/server_helpers.ml](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xapi/server_helpers.ml#L55) -to handle the command asynchronously. At this point the helper also check if +that uses many low level atomics operations. These are: + +- [VM.restore](#VM-restore) +- [VM.rename](#VM-rename) +- [VBD.set_active](#restoring-devices) +- [VBD.plug](#restoring-devices) +- [VIF.set_active](#restoring-devices) +- [VGPU.set_active](#restoring-devices) +- [VM.create_device_model](#creating-the-device-model) +- [PCI.plug](#pci-plug) + +The migrate command has several parameters such as: + +- Should it be started asynchronously, +- Should it be forwarded to another host, +- How arguments should be marshalled, and so on. + +A new thread is created by [xapi/server_helpers.ml](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xapi/server_helpers.ml#L55) +to handle the command asynchronously. The helper thread checks if the command should be passed to the [message forwarding](https://github.com/xapi-project/xen-api/blob/master/ocaml/xapi/message_forwarding.ml) -layer in order to be executed on another host (the destination) or locally if -we are already at the right place. +layer in order to be executed on another host (the destination) or locally (if +it is already at the destination host). It will finally reach [xapi/api_server.ml](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xapi/api_server.ml#L242) that will take the action of posted a command to the message broker [message switch](https://github.com/xapi-project/xen-api/tree/master/ocaml/message-switch). @@ -40,77 +47,77 @@ XAPI daemons. In the case of the migration this message sends by **XAPI** will b consumed by the [xenopsd](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd) daemon that will do the job of migrating the VM. -# The migration of the VM +## Overview The migration is an asynchronous task and a thread is created to handle this task. -The tasks's reference is returned to the client, which can then check +The task reference is returned to the client, which can then check its status until completion. -As we see in the introduction the [xenopsd](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd) -daemon will pop the operation +As shown in the introduction, [xenopsd](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd) +fetches the [VM_migrate](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/lib/xenops_server.ml#L2572) -from the message broker. 
+operation from the message broker.
-Only one backend is know available that interacts with libxc, libxenguest
-and xenstore. It is the [xc backend](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd/xc).
+All tasks specific to [libxenctrl](../../lib/xenctrl),
+[xenguest](VM.build/xenguest) and [Xenstore](https://wiki.xenproject.org/wiki/XenStore)
+are handled by the xenopsd
+[xc backend](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd/xc).
The entities that need to be migrated are: *VDI*, *VIF*, *VGPU* and *PCI*
components.
-During the migration process the destination domain will be built with the same
-uuid than the original VM but the last part of the UUID will be
+During the migration process, the destination domain will be built with the same
+UUID as the original VM, except that the last part of the UUID will be
`XXXXXXXX-XXXX-XXXX-XXXX-000000000001`. The original domain will be removed
using `XXXXXXXX-XXXX-XXXX-XXXX-000000000000`.
-There are some points called *hooks* at which `xenopsd` can execute some script.
-Before starting a migration a command is send to the original domain to execute
-a pre migrate script if it exists.
+## Preparing VM migration
-Before starting the migration a command is sent to Qemu using the Qemu Machine Protocol (QMP)
+At specific places, `xenopsd` can execute *hooks* to run scripts.
+In case a pre-migrate script is in place, a command to run this script
+is sent to the original domain.
+
+Likewise, a command is sent to Qemu using the Qemu Machine Protocol (QMP)
to check that the domain can be suspended (see
[xenopsd/xc/device_common.ml](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/device_common.ml)).
-After checking with Qemu that the VM is suspendable we can start the migration.
+After checking with Qemu that the VM can be suspended, the migration can begin.
## Importing metadata
As for *hooks*, commands to source domain are sent using [stunnel](https://github.com/xapi-project/xen-api/tree/master/ocaml/libs/stunnel) a daemon which is used
as a wrapper to manage SSL encryption communication between two hosts on the same
-pool. To import metada an XML RPC command is sent to the original domain.
+pool. To import the metadata, an XML RPC command is sent to the original domain.
-Once imported it will give us a reference id and will allow to build the new domain
+Once imported, it will give us a reference id and will allow building the new domain
on the destination using the temporary VM uuid `XXXXXXXX-XXXX-XXXX-XXXX-000000000001`
where `XXX...` is the reference id of the original VM.
-## Setting memory
+## Memory setup
-One of the first thing to do is to setup the memory. The backend will check that there
-is no ballooning operation in progress. At this point the migration can fail if a
-ballooning operation is in progress and takes too much time.
+One of the first steps is the setup of the VM's memory: the backend checks that there
+is no ballooning operation in progress. If one is in progress, the migration could fail.
-Once memory checked the daemon will get the state of the VM (running, halted, ...) and
-information about the VM are retrieve by the backend like the maximum memory the domain
-can consume but also information about quotas for example.
-Information are retrieve by the backend from xenstore.
+Once memory has been checked, the daemon will get the state of the VM (running, halted, ...) and
+the backend retrieves the domain's platform data (memory, vCPUs, etc.) from the Xenstore.
Once this is complete, we can restore VIF and create the domain. -The synchronisation of the memory is the first point of synchronisation and everythin +The synchronisation of the memory is the first point of synchronisation and everything is ready for VM migration. -## VM Migration +## Destination VM setup After receiving memory we can set up the destination domain. If we have a vGPU we need to kick -off its migration process. We will need to wait the acknowledge that indicates that the entry -for the GPU has been well initialized. before starting the main VM migration. +off its migration process. We will need to wait for the acknowledgement that the +GPU entry has been successfully initialized before starting the main VM migration. -Their is a mechanism of handshake for synchronizing between the source and the -destination. Using the handshake protocol the receiver inform the sender of the -request that everything has been setup and ready to save/restore. +The receiver informs the sender using a handshake protocol +that everything is set up and ready for save/restore. -### VM restore +## Destination VM restore VM restore is a low level atomic operation [VM.restore](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L2684). This operation is represented by a function call to [backend](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/domain.ml#L1540). It uses **Xenguest**, a low-level utility from XAPI toolstack, to interact with the Xen hypervisor -and libxc for sending a request of migration to the **emu-manager**. +and `libxc` for sending a migration request to the **emu-manager**. After sending the request results coming from **emu-manager** are collected by the main thread. It blocks until results are received. @@ -120,16 +127,14 @@ transitions for the devices and handling the message passing for the VM as it's moved between hosts. This includes making sure that the state of the VM's virtual devices, like disks or network interfaces, is correctly moved over. -### VM renaming +## Destination VM rename -Once all operations are done we can rename the VM on the target from its temporary -name to its real UUID. This operation is another low level atomic one +Once all operations are done, `xenopsd` renames the target VM from its temporary +name to its real UUID. This operation is a low-level atomic [VM.rename](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L1667) -that will take care of updating the xenstore on the destination. - -The next step is the restauration of devices and unpause the domain. +which takes care of updating the Xenstore on the destination host. -### Restoring remaining devices +## Restoring devices Restoring devices starts by activating VBD using the low level atomic operation [VBD.set_active](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L3674). It is an update of Xenstore. VBDs that are read-write must @@ -140,39 +145,51 @@ is called. VDI are attached and activate. 
Next devices are VIFs that are set as active
[VIF.set_active](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L4296)
and plug [VIF.plug](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L4394). If there are VGPUs we will set them as active now using the atomic
[VGPU.set_active](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L3490).
-We are almost done. The next step is to create the device model
-
-#### create device model
+### Creating the device model
-Create device model is done by using the atomic operation [VM.create_device_model](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L2375). This
-will configure **qemu-dm** and started. This allow to manage PCI devices.
+[create_device_model](https://github.com/xapi-project/xen-api/blob/ec3b62ee/ocaml/xenopsd/xc/xenops_server_xen.ml#L2293-L2349)
+configures **qemu-dm** and starts it. This allows managing PCI devices.
-#### PCI plug
+### PCI plug
[PCI.plug](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L3399)
-is executed by the backend. It plugs a PCI device and advertise it to QEMU if this option is set. It is
-the case for NVIDIA SR-IOV vGPUS.
+is executed by the backend. It plugs a PCI device and advertises it to QEMU if this option is set. This is
+the case for NVIDIA SR-IOV vGPUs.
+
+## Unpause
+
+The libxenctrl call
+[xc_domain_unpause()](https://github.com/xen-project/xen/blob/414dde3/tools/libs/ctrl/xc_domain.c#L76)
+unpauses the domain, and it starts running.
+
+## Cleanup
+
+1. [VM_set_domain_action_request](https://github.com/xapi-project/xen-api/blob/ec3b62ee/ocaml/xenopsd/lib/xenops_server.ml#L3004)
+   marks the domain as alive: in case `xenopsd` restarts, it no longer reboots the VM.
+   See the chapter on [marking domains as alive](VM.start#11-mark-the-domain-as-alive)
+   for more information.
+
+2. If a post-migrate script is in place, it is executed by the
+   [Xenops_hooks.VM_post_migrate](https://github.com/xapi-project/xen-api/blob/ec3b62ee/ocaml/xenopsd/lib/xenops_server.ml#L3005-L3009)
+   hook.
+
+3. The final step is a handshake to seal the success of the migration,
+and the old VM can now be cleaned up.
-At this point devices have been restored. The new domain is considered survivable. We can
-unpause the domain and performs last actions
+Once [synchronisation point 4](https://github.com/xapi-project/xen-api/blob/ec3b62ee/ocaml/xenopsd/lib/xenops_server.ml#L3014)
+has been reached, the migration is complete.
-### Unpause and done
+## Live migration flowchart
-Unpause is done by managing the state of the domain using bindings to [xenctrl](https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=tools/libs/ctrl/xc_domain.c;h=f2d9d14b4d9f24553fa766c5dcb289f88d684bb0;hb=HEAD#l76).
-Once hypervisor has unpaused the domain some actions can be requested using [VM.set_domain_action_request](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L3172).
-It is a path in xenstore. By default no action is done but a reboot can be for example
-initiated.
+This flowchart gives a visual representation of the VM migration workflow: -Previously we spoke about some points called *hooks* at which `xenopsd` can execute some script. There -is also a hook to run a post migrate script. After the execution of the script if there is one -the migration is almost done. The last step is a handskake to seal the success of the migration -and the old VM can now be cleaned. +{{% include live-migration %}} -# Links +## References -Some links are old but even if many changes occured they are relevant for a global understanding -of the XAPI toolstack. +These pages might help for a better understanding of the XAPI toolstack: -- [XAPI architecture](https://xapi-project.github.io/xapi/architecture.html) -- [XAPI dispatcher](https://wiki.xenproject.org/wiki/XAPI_Dispatch) -- [Xenopsd architecture](https://xapi-project.github.io/xenopsd/architecture.html) +- See the [XAPI architecture](../../xapi/_index) for the overall architecture of Xapi +- See the [XAPI dispatcher](https://wiki.xenproject.org/wiki/XAPI_Dispatch) for service dispatch and message forwarding +- See the [Xenopsd architecture](../architecture/_index) for the overall architecture of Xenopsd +- See the [How Xen suspend and resume works](https://mirage.io/docs/xen-suspend) for very similar operations in more detail. diff --git a/doc/content/xenopsd/walkthroughs/VM.start.md b/doc/content/xenopsd/walkthroughs/VM.start.md index 7e24b6d66ba..b043a5d9bf0 100644 --- a/doc/content/xenopsd/walkthroughs/VM.start.md +++ b/doc/content/xenopsd/walkthroughs/VM.start.md @@ -1,5 +1,8 @@ --- title: 'Walkthrough: Starting a VM' +linktitle: 'Starting a VM' +description: Complete walkthrough of starting a VM, from receiving the request to unpause. +weight: 10 --- A Xenopsd client wishes to start a VM. They must first tell Xenopsd the VM @@ -30,7 +33,7 @@ users: - the XenAPI has many clients which are updated on long release cycles. The main property needed is backwards compatibility, so that new release of xapi - remain compatible with these older clients. Quite often we will chose to + remain compatible with these older clients. Quite often, we will choose to "grandfather in" some poorly designed interface simply because we wish to avoid imposing churn on 3rd parties. - the Xenopsd API clients are all open-source and are part of the xapi-project. @@ -89,7 +92,7 @@ exist for: From here we shall assume the use of the "Xen via libxc, libxenguest and xenstore" (a.k.a. "Xenopsd classic") backend. -The backend [VM.add](https://github.com/xapi-project/xenopsd/blob/2a476c132c0b5732f9b224316b851a1b4d57520b/xc/xenops_server_xen.ml#L719) +The backend [VM.add](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L1603-L1659) function checks whether the VM we have to manage already exists -- and if it does then it ensures the Xenstore configuration is intact. This Xenstore configuration is important because at any time a client can query the state of a VM with @@ -132,17 +135,15 @@ When the Task has completed successfully, then calls to *.stat will show: - a valid start time - valid "targets" for memory and vCPU -Note: before a Task completes, calls to *.stat will show partial updates e.g. -the power state may be Paused but none of the disks may have become plugged. +Note: before a Task completes, calls to *.stat will show partial updates. E.g. +the power state may be paused, but no disk may have been plugged. 
UI clients must choose whether they are happy displaying this in-between state
or whether they wish to hide it and pretend the whole operation has happened
-transactionally. If a particular client wishes to perform side-effects in
-response to Xenopsd state changes -- for example to clean up an external resource
-when a VIF becomes unplugged -- then it must be very careful to avoid responding
-to these in-between states. Generally it is safest to passively report these
-values without driving things directly from them. Think of them as status lights
-on the front panel of a PC: fine to look at but it's not a good idea to wire
-them up to actuators which actually do things.
+transactionally. In particular, when a client wishes to perform side-effects in
+response to `xenopsd` state changes (for example, to clean up an external resource
+when a VIF becomes unplugged), it must be very careful to avoid responding
+to these in-between states. Generally, it is safest to passively report these
+values without driving things directly from them.
Note: the Xenopsd implementation guarantees that, if it is restarted at any point
during the start operation, on restart the VM state shall be "fixed" by either
@@ -163,7 +164,7 @@ via the function
It is the responsibility of the client to call
[TASK.destroy](https://github.com/xapi-project/xcp-idl/blob/2e5c3dd79c63e3711227892271a6bece98eb0fa1/xen/xenops_interface.ml#L406)
-when the Task is nolonger needed. Xenopsd won't destroy the task because it contains
+when the Task is no longer needed. Xenopsd won't destroy the task because it contains
the success/failure result of the operation which is needed by the client.
What happens when a Xenopsd receives a VM.start request?
@@ -196,24 +197,43 @@ takes care of:
Once a thread from the worker pool becomes free, it will execute the "do it now"
function. In the example above this is `perform op t` where `op` is
`VM_start vm` and `t` is the Task. The function
-[perform](https://github.com/xapi-project/xenopsd/blob/524d57b3c70/lib/xenops_server.ml#L1198)
+[perform_exn](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L2533)
has fragments like this:
```ocaml
-  | VM_start id ->
-     debug "VM.start %s" id;
-     perform_atomics (atomics_of_operation op) t;
-     VM_DB.signal id
+  | VM_start (id, force) -> (
+      debug "VM.start %s (force=%b)" id force ;
+      let power = (B.VM.get_state (VM_DB.read_exn id)).Vm.power_state in
+      match power with
+      | Running ->
+          info "VM %s is already running" id
+      | _ ->
+          perform_atomics (atomics_of_operation op) t ;
+          VM_DB.signal id
   "^^^^^^^^^^^^^^^^^^^^--------
+  )
```
Each "operation" (e.g. `VM_start vm`) is decomposed into "micro-ops" by the function
-[atomics_of_operation](https://github.com/xapi-project/xenopsd/blob/524d57b3c70/lib/xenops_server.ml#L739)
+[atomics_of_operation](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L1583)
where the micro-ops are small building-block actions common to the higher-level
operations. Each operation corresponds to a list of "micro-ops", where there is
no if/then/else. Some of the "micro-ops" may be a no-op depending on the VM
configuration (for example a PV domain may not need a qemu).
In the case of
-`VM_start vm` this decomposes into the sequence:
+[`VM_start vm`](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L1584)
+the `Xenopsd` server starts by calling the [functions that
+decompose](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L1612-L1714)
+ it into the `VM_hook_script`, `VM_create` and `VM_build` micro-ops:
+```ml
+  dequarantine_ops vgpus
+  ; [
+      VM_hook_script
+        (id, Xenops_hooks.VM_pre_start, Xenops_hooks.reason__none)
+    ; VM_create (id, None, None, no_sharept)
+    ; VM_build (id, force)
+    ]
+```
+This is the complete sequence of micro-ops:
## 1. run the "VM_pre_start" scripts
@@ -225,8 +245,8 @@ module and looks for scripts in the hardcoded path `/etc/xapi.d`.
## 2. create a Xen domain
The `VM_create` micro-op calls the `VM.create` function in the backend.
-In the classic Xenopsd backend the
-[VM.create_exn](https://github.com/xapi-project/xenopsd/blob/b33bab13080cea91e2fd59d5088622cd68152339/xc/xenops_server_xen.ml#L633)
+In the classic Xenopsd backend, the
+[VM.create_exn](https://github.com/xapi-project/xen-api/blob/bae7526faeb2a02a2fe5b71410083983f4695963/ocaml/xenopsd/xc/xenops_server_xen.ml#L1421-L1586)
function must
1. check if we're creating a domain for a fresh VM or resuming an existing one:
@@ -237,7 +257,13 @@ function must
   because domain create often fails in low-memory conditions. This means the
   "reservation" is associated with our "session" with squeezed; if Xenopsd crashes
   and restarts the reservation will be freed automatically.
-3. create the Domain via the libxc hypercall
+3. create the Domain via the libxc hypercall `Xenctrl.domain_create`
+4. [call](
+   https://github.com/xapi-project/xen-api/blob/bae7526faeb2a02a2fe5b71410083983f4695963/ocaml/xenopsd/xc/xenops_server_xen.ml#L1547)
+   [generate_create_info()](
+   https://github.com/xapi-project/xen-api/blob/bae7526faeb2a02a2fe5b71410083983f4695963/ocaml/xenopsd/xc/xenops_server_xen.ml#L1302-L1419)
+   for storing the platform data (vCPUs, etc.) in the domain's Xenstore tree.
+   `xenguest` then uses this in the `build` phase (see below) to build the domain.
4. "transfer" the squeezed reservation to the domain such that squeezed will
   free the memory if the domain is destroyed later
5. compute and set an initial balloon target depending on the amount of memory
@@ -253,38 +279,10 @@ function must
## 3. build the domain
-On a Xen system a domain is created empty, and memory is actually allocated
-from the host in the "build" phase via functions in *libxenguest*. The
-[VM.build_domain_exn](https://github.com/xapi-project/xenopsd/blob/b33bab13080cea91e2fd59d5088622cd68152339/xc/xenops_server_xen.ml#L994)
-function must
-
-1. run pygrub (or eliloader) to extract the kernel and initrd, if necessary
-2. invoke the *xenguest* binary to interact with libxenguest.
-3. apply the `cpuid` configuration
-4. store the current domain configuration on disk -- it's important to know
-   the difference between the configuration you started with and the configuration
-   you would use after a reboot because some properties (such as maximum memory
-   and vCPUs) as fixed on create.
-
-The xenguest binary was originally
-a separate binary for two reasons: (i) the libxenguest functions weren't
-threadsafe since they used lots of global variables; and (ii) the libxenguest
-functions used to have a different, incompatible license, which prevent us
-linking. Both these problems have been resolved but we still shell out to
-the xenguest binary.
- -The xenguest binary has also evolved to configure more of the initial domain -state. It also [reads Xenstore](https://github.com/xapi-project/ocaml-xen-lowlevel-libs/blob/master/xenguest-4.4/xenguest_stubs.c#L42) -and configures - -- the vCPU affinity -- the vCPU credit2 weight/cap parameters -- whether the NX bit is exposed -- whether the viridian CPUID leaf is exposed -- whether the system has PAE or not -- whether the system has ACPI or not -- whether the system has nested HVM or not -- whether the system has an HPET or not +The `build` phase waits, if necessary, for the Xen memory scrubber to catch +up reclaiming memory, runs NUMA placement, sets vCPU affinity and invokes +the `xenguest` to build the system memory layout of the domain. +See the [walk-through of the VM_build μ-op](VM.build) for details. ## 4. mark each VBD as "active" @@ -304,7 +302,7 @@ calls bracket plug/unplug. If the "active" flag was set before the unplug attempt then as soon as the frontend/backend connection is removed clients would see the VBD as completely dissociated from the VM -- this would be misleading because Xenopsd will not have had time to use the storage API to release locks -on the disks. By doing all the cleanup before setting "active" to false, clients +on the disks. By cleaning up before setting "active" to false, clients can be assured that the disks are now free to be reassigned. ## 5. handle non-persistent disks @@ -370,7 +368,7 @@ to be the order the nodes were created so this means that (i) xenstored must continue to store directories as ordered lists rather than maps (which would be more efficient); and (ii) Xenopsd must make sure to plug the vifs in the same order. Note that relying on ethX device numbering has always been a -bad idea but is still common. I bet if you change this lots of tests will +bad idea but is still common. I bet if you change this, many tests will suddenly start to fail! The function diff --git a/doc/content/xenopsd/walkthroughs/_index.md b/doc/content/xenopsd/walkthroughs/_index.md index d54568dcbbf..6fe3f551f29 100644 --- a/doc/content/xenopsd/walkthroughs/_index.md +++ b/doc/content/xenopsd/walkthroughs/_index.md @@ -6,8 +6,10 @@ linkTitle = "Walk-throughs" Let's trace through interesting operations to see how the whole system works. -- [Starting a VM](VM.start.md) -- [Migrating a VM](VM.migrate.md) +{{% children depth=2 description=true %}} + +Inspiration for other walk-throughs: + - Shutting down a VM and waiting for it to happen - A VM wants to reboot itself - A disk is hotplugged diff --git a/doc/content/xenopsd/walkthroughs/live-migration.md b/doc/content/xenopsd/walkthroughs/live-migration.md index f0af797f85e..b93a4afbaa8 100644 --- a/doc/content/xenopsd/walkthroughs/live-migration.md +++ b/doc/content/xenopsd/walkthroughs/live-migration.md @@ -1,9 +1,13 @@ +++ title = "Live Migration Sequence Diagram" linkTitle = "Live Migration" +description = "Sequence diagram of the process of Live Migration." +# Note: This page is included by VM.migrate.md to provide a complete overview +# of the most important parts of live migration. Do not add text as that would +# break the mermaid diagram inclusion. +++ -{{}} +```mermaid sequenceDiagram autonumber participant tx as sender @@ -43,5 +47,4 @@ deactivate rx1 tx->>tx: VM_shutdown
VM_remove deactivate tx - -{{< /mermaid >}} +``` diff --git a/doc/hugo.toml b/doc/hugo.toml index 7b2dff698b4..a35112db945 100644 --- a/doc/hugo.toml +++ b/doc/hugo.toml @@ -29,6 +29,7 @@ home = [ "HTML", "RSS", "PRINT"] section = [ "HTML", "RSS", "PRINT"] [params] +editURL = 'https://github.com/xapi-project/xen-api/edit/master/doc/content/${FilePath}' # Enable the theme variant selector, default to auto: themeVariant = [ "auto", @@ -45,5 +46,31 @@ themeVariant = [ ] # auto switches between "red" and "zen-dark" depending on the browser/OS dark mode: themeVariantAuto = ["red", "zen-dark"] +# Consistency: Use the font of the Hugo Relearn theme also for Mermaid diagrams: +# securityLevel=loose is the default of Relearn, it allows HTML links in diagrams: +mermaidInitialize = '{ "fontFamily": "Roboto Flex", "securityLevel": "loose" }' alwaysopen = false collapsibleMenu = true + + [params.imageEffects] + + # + # Enable a soft shadow around the images that make the images appear to + # stand out ever so slightly like paper on a desk, giving them a smooth look: + # + shadow = true + + # + # The CSS-based photographer's lightbox makes the image border flash + # on mouse-over and darkens the rest of the page when clicking on images. + # + # It is better to disable it as it serves no proper function for the + # toolstack docs and causes a border around the image to appear/disappear + # in a flash when entering/leaving the image. Disabling it turns the sudden + # appearance and disappearance of the flashy border off. + # + # Initially, this was based on the Featherlight jQuery plugin, which would + # have enlarged the images, but the CSS-only solution appears inadequate + # for a proper lightbox as it does not zoom the image: + # + lightbox = false diff --git a/doc/layouts/partials/content.html b/doc/layouts/partials/content.html index ebba286db1e..007446b478c 100644 --- a/doc/layouts/partials/content.html +++ b/doc/layouts/partials/content.html @@ -8,6 +8,39 @@ {{ $c := .Page.Params.class }} {{ with index (where $.Site.Data.xenapi "name" $c) 0 }} + + {{ $style := resources.Get "css/xenapi.css" }} +{{ $parser := resources.Get "js/parse.js" }} + {{ with .lifecycle }}

@@ -64,11 +114,11 @@

Enums

{{ range $i, $x := .enums }}
-
{{ $x.name }}
+
{{ $x.name }}