# Based on https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/aws/example-full.yaml

# A unique identifier for the head node and workers of this cluster.
cluster_name: cubed-ray-cluster

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 1

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the workload requires adding more nodes, the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed * currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
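# Illustrative example (an assumption added for clarity, not part of the upstream
# template): with upscaling_speed: 1.0 and 4 nodes currently running, the
# autoscaler may add up to 1.0 * 4 = 4 nodes in a single scaling step, still
# bounded by max_workers.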

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
    # image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    image: rayproject/ray:latest-py312-cpu # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options: # Extra options to pass into "docker run"
        - --ulimit nofile=65536:65536

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: eu-west-1
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes will be launched in the first listed availability zone and will
    # be tried in the subsequent availability zones if launching fails.
    availability_zone: eu-west-1a,eu-west-1b
    # Whether to allow node reuse. If set to False, nodes will be terminated
    # instead of stopped.
    cache_stopped_nodes: False # If not present, the default is True.

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
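# Hypothetical sketch (illustrative names, not part of this config): to use an
# existing keypair you could set, for example,
# ssh_private_key: ~/.ssh/my-existing-key.pem
# and add "KeyName: my-existing-key" under each node_config below.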

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
            # The commented-out ImageId below is the default AMI for us-west-2; this
            # cluster runs in eu-west-1, so pick an AMI for that region if you set it.
            # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py
            # for default images for other regions.
            # ImageId: ami-0387d929287ab193e
            # You can provision additional disk space with a conf as follows
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 140
                      VolumeType: gp3
            # Additional options in the boto docs.
    ray.worker.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 1
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
            # The commented-out ImageId below is the default AMI for us-west-2; this
            # cluster runs in eu-west-1, so pick an AMI for that region if you set it.
            # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py
            # for default images for other regions.
            # ImageId: ami-0387d929287ab193e
            # To run workers on spot instances, uncomment InstanceMarketOptions below.
            # NOTE: If relying on spot instances, it is best to specify multiple different instance
            # types to avoid interruption when one instance type is experiencing heightened demand
            # (see the commented sketch after this node type).
            # Demand information can be found at https://aws.amazon.com/ec2/spot/instance-advisor/
            # InstanceMarketOptions:
            #     MarketType: spot
            # Additional options can be found in the boto docs, e.g.
            #     SpotOptions:
            #         MaxPrice: MAX_HOURLY_PRICE
            # Additional options in the boto docs.
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 140
                      VolumeType: gp3
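
    # Hypothetical sketch (commented out; the node type name and values below are
    # illustrative assumptions, not tested settings): a second worker node type on
    # spot instances, as the NOTE above suggests, so on-demand and spot capacity
    # can be mixed across instance types.
    # ray.worker.spot:
    #     min_workers: 0
    #     max_workers: 1
    #     resources: {}
    #     node_config:
    #         InstanceType: m5.xlarge
    #         InstanceMarketOptions:
    #             MarketType: spot
    #         BlockDeviceMappings:
    #             - DeviceName: /dev/sda1
    #               Ebs:
    #                   VolumeSize: 140
    #                   VolumeType: gp3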

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously.
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is set up.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []
    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
    # that has the "nightly" tag (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
    # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
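    # Hypothetical example (an assumption based on this cluster's name, not part
    # of the upstream template): if every node also needs Cubed and S3 filesystem
    # support, the empty list above could instead be something like:
    # setup_commands:
    #     - pip install -U cubed s3fs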

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
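
# Usage sketch (illustrative, assuming this file is saved as cubed-ray-cluster.yaml
# and AWS credentials are configured locally):
#     ray up cubed-ray-cluster.yaml      # create or update the cluster
#     ray attach cubed-ray-cluster.yaml  # SSH into the head node
#     ray down cubed-ray-cluster.yaml    # tear the cluster down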