
Commit b9cd04a

docs: iter 4 (#532)
1 parent 030c4e4 commit b9cd04a

File tree: 5 files changed (+47 −35 lines)


README.md

Lines changed: 17 additions & 15 deletions
@@ -8,7 +8,7 @@
 
 TPI is a [Terraform](https://terraform.io) plugin built with machine learning in mind. This CLI tool offers full lifecycle management of computing resources (including GPUs and respawning spot instances) from several cloud vendors (AWS, Azure, GCP, K8s)... without needing to be a cloud expert.
 
-- **Lower cost with spot recovery**: transparent auto-recovery from interrupted low-cost spot/preemptible instances
+- **Lower cost with spot recovery**: transparent data checkpoint/restore & auto-respawning of low-cost spot/preemptible instances
 - **No cloud vendor lock-in**: switch between clouds with just one line thanks to unified abstraction
 - **No waste**: auto-cleanup unused resources (terminate compute instances upon task completion/failure & remove storage upon download of results), pay only for what you use
 - **Developer-first experience**: one-command data sync & code execution with no external server, making the cloud feel like a laptop
@@ -39,10 +39,12 @@ There are a several reasons to use TPI instead of other related solutions (custo
 TPI is a CLI tool, not a running service. It requires no additional orchestrating machine (control plane/head nodes) to schedule/recover/terminate instances. Instead, TPI runs (spot) instances via cloud-native scaling groups[^scalers], taking care of recovery and termination automatically on the cloud provider's side. This design reduces management overhead & infrastructure costs. You can close your laptop while cloud tasks are running -- auto-recovery happens even if you are offline.
 2. **Unified tool for data science and software development teams**:
 TPI provides consistent tooling for both data scientists and DevOps engineers, improving cross-team collaboration. This simplifies compute management to a single config file, and reduces time to deliver ML models into production.
+3. **Reproducible, codified environments**:
+Store hardware requirements in a single configuration file alongside the rest of your ML pipeline code.
 
 [^scalers]: [AWS Auto Scaling Groups](https://docs.aws.amazon.com/autoscaling/ec2/userguide/what-is-amazon-ec2-auto-scaling.html), [Azure VM Scale Sets](https://azure.microsoft.com/en-us/services/virtual-machine-scale-sets), [GCP managed instance groups](https://cloud.google.com/compute/docs/instance-groups#managed_instance_groups), and [Kubernetes Jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job).
 
-<img width=24px src="https://static.iterative.ai/logo/cml.svg"/> TPI is used to power [CML runners](https://cml.dev/doc/self-hosted-runners), bringing cloud providers to existing CI/CD workflows.
+<img width=24px src="https://static.iterative.ai/logo/cml.svg"/> TPI is used to power [CML](https://cml.dev), bringing cloud providers to existing GitHub, GitLab & Bitbucket CI/CD workflows ([repository](https://github.com/iterative/cml)).
 
 ## Usage
 
@@ -74,12 +76,12 @@ provider "iterative" {}
 resource "iterative_task" "example" {
   cloud     = "aws" # or any of: gcp, az, k8s
   machine   = "m" # medium. Or any of: l, xl, m+k80, xl+v100, ...
-  spot      = 0 # auto-price. Or -1 to disable, or >0 to set a hourly USD limit
+  spot      = 0 # auto-price. Default -1 to disable, or >0 for hourly USD limit
   disk_size = 30 # GB
 
   storage {
-    workdir = "."
-    output  = "results"
+    workdir = "." # default blank (don't upload)
+    output  = "results" # default blank (don't download). Relative to workdir
   }
   script = <<-END
     #!/bin/bash
@@ -126,7 +128,7 @@ TF_LOG_PROVIDER=INFO terraform refresh
 TF_LOG_PROVIDER=INFO terraform show
 ```
 
-### Stop Task
+### End Task
 
 ```
 TF_LOG_PROVIDER=INFO terraform destroy
@@ -149,16 +151,16 @@ direction LR
 B[("Cloud Storage (low cost)")]
 C{{"Cloud instance scaler (zero cost)"}}
 D[["Cloud (spot) Instance"]]
-A ---> |create cloud storage| B
-A --> |create cloud instance scaler| C
-A ==> |upload script & workdir| B
-A -.-> |"offline (lunch break)"| A
-C -.-> |"(re)provision instance"| D
-D ==> |run script| D
-B <-.-> |persistent workdir cache| D
-D ==> |script end,\nshutdown instance| B
+A ---> |2. create cloud storage| B
+A --> |1. create cloud instance scaler| C
+A ==> |3. upload script & workdir| B
+A -.-> |"4. offline (lunch break)"| A
+C -.-> |"5. (re)provision instance"| D
+D ==> |7. run script| D
+B <-.-> |6. persistent workdir cache| D
+D ==> |8. script end,\nshutdown instance| B
 D -.-> |outage| C
-B ==> |download output| A
+B ==> |9. download output| A
 end
 style you fill:#FFFFFF00,stroke:#13ADC7
 style tpi fill:#FFFFFF00,stroke:#FFFFFF00,stroke-width:0px

docs/guides/generic-machine-types.md

Lines changed: 11 additions & 5 deletions
@@ -7,7 +7,7 @@ subcategory: Development
 
 The table below is a more detailed version of the common choices summarised in [Task Machine Types](https://registry.terraform.io/providers/iterative/iterative/latest/docs/resources/task#machine-type).
 
-| Type | [`aws`] | [`az`] | [`gcp`] | [`k8s`] |
+| Type | [aws] | [az] | [gcp] | [k8s] |
 | :-------- | :------------ | :--------------------- | :---------------------------------------------- | :--------------------------------------------------- |
 | `s` | `t2.micro` | `Standard_B1s` | `g1-small` | `cpu: 1`<br>`memory: 1G` |
 | `m` | `m5.2xlarge` | `Standard_F8s_v2` | `e2-custom-8-32768` | `cpu: 8`<br>`memory: 32G` |
@@ -21,7 +21,13 @@ The table below is a more detailed version of the common choices summarised in [
 | `l+v100` | `p3.8xlarge` | `Standard_NC12s_v3` | `custom-32-262144-ext`<br>4 `nvidia-tesla-v100` | `cpu: 32`<br>`memory: 256G`<br>4 `nvidia-tesla-v100` |
 | `xl+v100` | `p3.16xlarge` | `Standard_NC24s_v3` | `custom-64-524288-ext`<br>8 `nvidia-tesla-v100` | `cpu: 64`<br>`memory: 512G`<br>8 `nvidia-tesla-v100` |
 
-[`aws`]: https://aws.amazon.com/ec2/instance-explorer
-[`az`]: https://azure.microsoft.com/en-us/pricing/vm-selector
-[`gcp`]: https://cloud.google.com/compute/docs/machine-types
-[`k8s`]: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers
+[aws]: https://aws.amazon.com/ec2/instance-explorer
+[az]: https://azure.microsoft.com/en-us/pricing/vm-selector
+[gcp]: https://cloud.google.com/compute/docs/machine-types
+[k8s]: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers
+
+## Pricing
+
+- aws: [on-demand](https://aws.amazon.com/ec2/pricing), [spot](https://aws.amazon.com/ec2/spot/pricing)
+- [az](https://azure.microsoft.com/en-us/pricing/calculator)
+- [gcp](https://cloud.google.com/products/calculator)
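
For context, a generic type from the table above is consumed via the task's `machine` argument; the provider resolves it to the cloud-specific instance per the table. A minimal sketch (resource name and script are illustrative, not part of this commit):

```hcl
resource "iterative_task" "example" {
  cloud   = "aws" # here the generic type below resolves per the aws column
  machine = "m"   # 8 CPUs & 32 GB RAM; m5.2xlarge on aws, Standard_F8s_v2 on az
  script  = <<-END
    #!/bin/bash
    echo "same generic type, any cloud"
  END
}
```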

docs/guides/getting-started.md

Lines changed: 5 additions & 4 deletions
@@ -37,12 +37,12 @@ provider "iterative" {}
 resource "iterative_task" "example" {
   cloud     = "aws" # or any of: gcp, az, k8s
   machine   = "m" # medium. Or any of: l, xl, m+k80, xl+v100, ...
-  spot      = 0 # auto-price. Or -1 to disable, or >0 to set a hourly USD limit
+  spot      = 0 # auto-price. Default -1 to disable, or >0 for hourly USD limit
   disk_size = 30 # GB
 
   storage {
-    workdir = "."
-    output  = "results"
+    workdir = "." # default blank (don't upload)
+    output  = "results" # default blank (don't download). Relative to workdir
   }
   script = <<-END
     #!/bin/bash
@@ -96,6 +96,7 @@ This command will:
 1. Create all the required cloud resources (provisioning a `machine` with `disk_size` storage).
 2. Upload the working directory (`workdir`) to the cloud.
 3. Launch the task `script`.
+4. Terminate the `machine` on `script` completion/error.
 
 With spot/preemptible instances (`spot >= 0`), auto-recovery logic and persistent (`disk_size`) storage will be used to relaunch interrupted tasks.
 
@@ -117,7 +118,7 @@ These commands will:
 1. Query the task status from the cloud.
 2. Display the task status.
 
-## Stop Task
+## End Task
 
 ```console
 $ TF_LOG_PROVIDER=INFO terraform destroy

docs/index.md

Lines changed: 4 additions & 2 deletions
@@ -7,7 +7,7 @@
 
 TPI is a [Terraform](https://terraform.io) plugin built with machine learning in mind. This CLI tool offers full lifecycle management of computing resources (including GPUs and respawning spot instances) from several cloud vendors (AWS, Azure, GCP, K8s)... without needing to be a cloud expert.
 
-- **Lower cost with spot recovery**: transparent auto-recovery from interrupted low-cost spot/preemptible instances
+- **Lower cost with spot recovery**: transparent data checkpoint/restore & auto-respawning of low-cost spot/preemptible instances
 - **No cloud vendor lock-in**: switch between clouds with just one line thanks to unified abstraction
 - **No waste**: auto-cleanup unused resources (terminate compute instances upon task completion/failure & remove storage upon download of results), pay only for what you use
 - **Developer-first experience**: one-command data sync & code execution with no external server, making the cloud feel like a laptop
@@ -37,8 +37,10 @@ There are a several reasons to use TPI instead of other related solutions (custo
 TPI is a CLI tool, not a running service. It requires no additional orchestrating machine (control plane/head nodes) to schedule/recover/terminate instances. Instead, TPI runs (spot) instances via cloud-native scaling groups ([AWS Auto Scaling Groups](https://docs.aws.amazon.com/autoscaling/ec2/userguide/what-is-amazon-ec2-auto-scaling.html), [Azure VM Scale Sets](https://azure.microsoft.com/en-us/services/virtual-machine-scale-sets), [GCP managed instance groups](https://cloud.google.com/compute/docs/instance-groups#managed_instance_groups), and [Kubernetes Jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job)), taking care of recovery and termination automatically on the cloud provider's side. This design reduces management overhead & infrastructure costs. You can close your laptop while cloud tasks are running -- auto-recovery happens even if you are offline.
 2. **Unified tool for data science and software development teams**:
 TPI provides consistent tooling for both data scientists and DevOps engineers, improving cross-team collaboration. This simplifies compute management to a single config file, and reduces time to deliver ML models into production.
+3. **Reproducible, codified environments**:
+Store hardware requirements in a single configuration file alongside the rest of your ML pipeline code.
 
-<img width=24px src="https://static.iterative.ai/logo/cml.svg"/> TPI is used to power [CML runners](https://cml.dev/doc/self-hosted-runners), bringing cloud providers to existing CI/CD workflows.
+<img width=24px src="https://static.iterative.ai/logo/cml.svg"/> TPI is used to power [CML](https://cml.dev), bringing cloud providers to existing GitHub, GitLab & Bitbucket CI/CD workflows ([repository](https://github.com/iterative/cml)).
 
 ## Links

docs/resources/task.md

Lines changed: 10 additions & 9 deletions
@@ -11,19 +11,19 @@ This resource will:
 
 ```hcl
 resource "iterative_task" "example" {
-  cloud       = "aws"
+  cloud       = "aws" # or any of: gcp, az, k8s
   machine     = "m" # medium. Or any of: l, xl, m+k80, xl+v100, ...
-  image       = "ubuntu"
-  region      = "us-east"
+  image       = "ubuntu" # or "nvidia", ...
+  region      = "us-west" # or "us-east", "eu-west", ...
   disk_size   = 30 # GB
-  spot        = 0 # auto-price. Or -1 to disable, or >0 to set a hourly USD limit
+  spot        = 0 # auto-price. Default -1 to disable, or >0 for hourly USD limit
   parallelism = 1
-  timeout     = 60*60 # max 1h before forced termination
+  timeout     = 24*60*60 # max 24h before forced termination
 
   environment = { GREETING = "Hello, world!" }
   storage {
-    workdir = "."
-    output  = "results"
+    workdir = "." # default blank (don't upload)
+    output  = "results" # default blank (don't download). Relative to workdir
   }
   script = <<-END
     #!/bin/bash
@@ -105,7 +105,7 @@ The above would allow:
 $ terraform output --raw logs
 ```
 
-Finally, JSON output can be parsed using `terraform output --json` and `jq` like this:
+Finally, JSON output can be parsed using `terraform show --json` and `jq` like this:
 
 ```console
 $ terraform show --json | jq --raw-output '
@@ -169,6 +169,7 @@ In addition to generic types, it's possible to specify any machine type supporte
 The Iterative Provider offers some common machine images which are roughly the same for all supported clouds.
 
 - `ubuntu` - Official [Ubuntu LTS](https://wiki.ubuntu.com/LTS) image (currently 20.04).
+- `nvidia` - Official [NVIDIA NGC](https://docs.nvidia.com/ngc/ngc-deploy-public-cloud)-based images, typically needing `disk_size = 32` GB or more.
 
 ### Cloud-specific
 
@@ -231,8 +232,8 @@ See https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findima
 
 The Iterative Provider offers some common cloud regions which are roughly the same for all supported clouds.
 
-- `us-east` - United States of America, East.
 - `us-west` - United States of America, West.
+- `us-east` - United States of America, East.
 - `eu-north` - Europe, North.
 - `eu-west` - Europe, West.
 
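For context, the `nvidia` image added above pairs with the `disk_size` argument. A sketch with illustrative values (resource name and script are not part of this commit):

```hcl
resource "iterative_task" "example" {
  cloud     = "aws"
  machine   = "m+k80"  # a GPU machine type
  image     = "nvidia" # NGC-based image, per the note above
  disk_size = 32       # GB; NGC images typically need 32 GB or more
  script    = <<-END
    #!/bin/bash
    nvidia-smi
  END
}
```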