Skip to content

Commit 6268a2d

Browse files
authored
[core][dashboard] Return status code other than 200 and 500 (#51417)
The utility function `rest_response` takes a boolean parameter, `success`, to generate an HTTP response. It returns 200 when `success` is true, and 500 otherwise. However, it's not enough. For example, if users try to kill a non-existent actor via `/api/actors/kill`, 404 should be returned. Revisiting status code of each API endpoint can be good first issues. --------- Signed-off-by: kaihsun <kaihsun@anyscale.com>
1 parent 6412808 commit 6268a2d

File tree

14 files changed

+101
-46
lines changed

14 files changed

+101
-46
lines changed

python/ray/dashboard/modules/actor/actor_head.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ async def _cleanup_actors(self):
264264
async def get_all_actors(self, req) -> aiohttp.web.Response:
265265
actors = await DataOrganizer.get_actor_infos()
266266
return dashboard_optional_utils.rest_response(
267-
success=True,
267+
status_code=dashboard_utils.HTTPStatusCode.OK,
268268
message="All actors fetched.",
269269
actors=actors,
270270
# False to avoid converting Ray resource name to google style.
@@ -279,7 +279,9 @@ async def get_actor(self, req) -> aiohttp.web.Response:
279279
actor_id = req.match_info.get("actor_id")
280280
actors = await DataOrganizer.get_actor_infos(actor_ids=[actor_id])
281281
return dashboard_optional_utils.rest_response(
282-
success=True, message="Actor details fetched.", detail=actors[actor_id]
282+
status_code=dashboard_utils.HTTPStatusCode.OK,
283+
message="Actor details fetched.",
284+
detail=actors[actor_id],
283285
)
284286

285287
async def run(self, server):

python/ray/dashboard/modules/event/event_head.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
109109

110110
async def limit_handler_(self):
111111
return dashboard_optional_utils.rest_response(
112-
success=False,
112+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
113113
error_message=(
114114
"Max number of in-progress requests="
115115
f"{self.max_num_call_} reached. "
@@ -174,12 +174,14 @@ async def get_event(self, req) -> aiohttp.web.Response:
174174
for job_id, job_events in self.events.items()
175175
}
176176
return dashboard_optional_utils.rest_response(
177-
success=True, message="All events fetched.", events=all_events
177+
status_code=dashboard_utils.HTTPStatusCode.OK,
178+
message="All events fetched.",
179+
events=all_events,
178180
)
179181

180182
job_events = self.events[job_id]
181183
return dashboard_optional_utils.rest_response(
182-
success=True,
184+
status_code=dashboard_utils.HTTPStatusCode.OK,
183185
message="Job events fetched.",
184186
job_id=job_id,
185187
events=list(job_events.values()),

python/ray/dashboard/modules/metrics/metrics_head.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ async def grafana_health(self, req) -> aiohttp.web.Response:
143143
# If disabled, we don't want to show the metrics tab at all.
144144
if self.grafana_host == GRAFANA_HOST_DISABLED_VALUE:
145145
return dashboard_optional_utils.rest_response(
146-
success=True,
146+
status_code=dashboard_utils.HTTPStatusCode.OK,
147147
message="Grafana disabled",
148148
grafana_host=GRAFANA_HOST_DISABLED_VALUE,
149149
)
@@ -156,22 +156,22 @@ async def grafana_health(self, req) -> aiohttp.web.Response:
156156
async with self.http_session.get(path) as resp:
157157
if resp.status != 200:
158158
return dashboard_optional_utils.rest_response(
159-
success=False,
159+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
160160
message="Grafana healtcheck failed",
161161
status=resp.status,
162162
)
163163
json = await resp.json()
164164
# Check if the required Grafana services are running.
165165
if json["database"] != "ok":
166166
return dashboard_optional_utils.rest_response(
167-
success=False,
167+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
168168
message="Grafana healtcheck failed. Database not ok.",
169169
status=resp.status,
170170
json=json,
171171
)
172172

173173
return dashboard_optional_utils.rest_response(
174-
success=True,
174+
status_code=dashboard_utils.HTTPStatusCode.OK,
175175
message="Grafana running",
176176
grafana_host=grafana_iframe_host,
177177
session_name=self.session_name,
@@ -185,7 +185,9 @@ async def grafana_health(self, req) -> aiohttp.web.Response:
185185
)
186186

187187
return dashboard_optional_utils.rest_response(
188-
success=False, message="Grafana healtcheck failed", exception=str(e)
188+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
189+
message="Grafana healtcheck failed",
190+
exception=str(e),
189191
)
190192

191193
@routes.get("/api/prometheus_health")
@@ -198,21 +200,23 @@ async def prometheus_health(self, req):
198200
) as resp:
199201
if resp.status != 200:
200202
return dashboard_optional_utils.rest_response(
201-
success=False,
203+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
202204
message="prometheus healthcheck failed.",
203205
status=resp.status,
204206
)
205207

206208
return dashboard_optional_utils.rest_response(
207-
success=True,
209+
status_code=dashboard_utils.HTTPStatusCode.OK,
208210
message="prometheus running",
209211
)
210212
except Exception as e:
211213
logger.debug(
212214
"Error fetching prometheus endpoint. Is prometheus running?", exc_info=e
213215
)
214216
return dashboard_optional_utils.rest_response(
215-
success=False, message="prometheus healthcheck failed.", reason=str(e)
217+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
218+
message="prometheus healthcheck failed.",
219+
reason=str(e),
216220
)
217221

218222
@staticmethod

python/ray/dashboard/modules/node/node_head.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ async def get_all_nodes(self, req) -> aiohttp.web.Response:
292292
)
293293

294294
return dashboard_optional_utils.rest_response(
295-
success=True,
295+
status_code=dashboard_utils.HTTPStatusCode.OK,
296296
message="Node summary fetched.",
297297
summary=all_node_summary,
298298
node_logical_resources=nodes_logical_resources,
@@ -303,13 +303,14 @@ async def get_all_nodes(self, req) -> aiohttp.web.Response:
303303
if node["state"] == "ALIVE":
304304
alive_hostnames.add(node["nodeManagerHostname"])
305305
return dashboard_optional_utils.rest_response(
306-
success=True,
306+
status_code=dashboard_utils.HTTPStatusCode.OK,
307307
message="Node hostname list fetched.",
308308
host_name_list=list(alive_hostnames),
309309
)
310310
else:
311311
return dashboard_optional_utils.rest_response(
312-
success=False, message=f"Unknown view {view}"
312+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
313+
message=f"Unknown view {view}",
313314
)
314315

315316
@routes.get("/nodes/{node_id}")
@@ -318,7 +319,9 @@ async def get_node(self, req) -> aiohttp.web.Response:
318319
node_id = req.match_info.get("node_id")
319320
node_info = await DataOrganizer.get_node_info(node_id)
320321
return dashboard_optional_utils.rest_response(
321-
success=True, message="Node details fetched.", detail=node_info
322+
status_code=dashboard_utils.HTTPStatusCode.OK,
323+
message="Node details fetched.",
324+
detail=node_info,
322325
)
323326

324327
@async_loop_forever(node_consts.NODE_STATS_UPDATE_INTERVAL_SECONDS)

python/ray/dashboard/modules/reporter/reporter_head.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,9 @@ def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
7878
@routes.get("/api/v0/cluster_metadata")
7979
async def get_cluster_metadata(self, req):
8080
return dashboard_optional_utils.rest_response(
81-
success=True, message="", **self.cluster_metadata
81+
status_code=dashboard_utils.HTTPStatusCode.OK,
82+
message="",
83+
**self.cluster_metadata,
8284
)
8385

8486
@routes.get("/api/cluster_status")
@@ -120,15 +122,15 @@ async def get_cluster_status(self, req):
120122

121123
if not return_formatted_output:
122124
return dashboard_optional_utils.rest_response(
123-
success=True,
125+
status_code=dashboard_utils.HTTPStatusCode.OK,
124126
message="Got cluster status.",
125127
autoscaling_status=legacy_status.decode() if legacy_status else None,
126128
autoscaling_error=error.decode() if error else None,
127129
cluster_status=formatted_status if formatted_status else None,
128130
)
129131
else:
130132
return dashboard_optional_utils.rest_response(
131-
success=True,
133+
status_code=dashboard_utils.HTTPStatusCode.OK,
132134
message="Got formatted cluster status.",
133135
cluster_status=debug_status(
134136
formatted_status_string, error, address=self.gcs_address

python/ray/dashboard/modules/snapshot/snapshot_head.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ async def kill_actor_gcs(self, req) -> aiohttp.web.Response:
8686
no_restart = req.query.get("no_restart", False) in ("true", "True")
8787
if not actor_id:
8888
return dashboard_optional_utils.rest_response(
89-
success=False, message="actor_id is required."
89+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
90+
message="actor_id is required.",
9091
)
9192

9293
status_code = await self.gcs_aio_client.kill_actor(
@@ -96,12 +97,11 @@ async def kill_actor_gcs(self, req) -> aiohttp.web.Response:
9697
timeout=SNAPSHOT_API_TIMEOUT_SECONDS,
9798
)
9899

99-
success = status_code == 200
100-
if status_code == 404:
100+
if status_code == dashboard_utils.HTTPStatusCode.NOT_FOUND:
101101
message = f"Actor with id {actor_id} not found."
102-
elif status_code == 500:
102+
elif status_code == dashboard_utils.HTTPStatusCode.INTERNAL_ERROR:
103103
message = f"Failed to kill actor with id {actor_id}."
104-
elif status_code == 200:
104+
elif status_code == dashboard_utils.HTTPStatusCode.OK:
105105
message = (
106106
f"Force killed actor with id {actor_id}"
107107
if force_kill
@@ -111,9 +111,9 @@ async def kill_actor_gcs(self, req) -> aiohttp.web.Response:
111111
else:
112112
message = f"Unknown status code: {status_code}. Please open a bug report in the Ray repository."
113113

114-
# TODO(kevin85421): The utility function needs to be refactored to handle
115-
# different status codes. Currently, it only returns 200 and 500.
116-
return dashboard_optional_utils.rest_response(success=success, message=message)
114+
return dashboard_optional_utils.rest_response(
115+
status_code=status_code, message=message
116+
)
117117

118118
@routes.get("/api/component_activities")
119119
async def get_component_activities(self, req) -> aiohttp.web.Response:

python/ray/dashboard/modules/snapshot/tests/test_actors.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,11 @@ def loop(self):
7373
actor_id = a._ray_actor_id.hex()
7474

7575
OK = 200
76-
INTERNAL_ERROR = 500
76+
NOT_FOUND = 404
7777

7878
# Kill an non-existent actor
79-
# TODO(kevin85421): It should return 404 instead of 500.
8079
resp = _kill_actor_using_dashboard_gcs(
81-
webui_url, "non-existent-actor-id", INTERNAL_ERROR
80+
webui_url, "non-existent-actor-id", NOT_FOUND
8281
)
8382
assert "not found" in resp["msg"]
8483

python/ray/dashboard/modules/tests/test_head.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,14 @@ async def dump(self, req) -> aiohttp.web.Response:
5151
if not k.startswith("_")
5252
}
5353
return dashboard_optional_utils.rest_response(
54-
success=True,
54+
status_code=dashboard_utils.HTTPStatusCode.OK,
5555
message="Fetch all data from datacenter success.",
5656
**all_data,
5757
)
5858
else:
5959
data = dict(DataSource.__dict__.get(key))
6060
return dashboard_optional_utils.rest_response(
61-
success=True,
61+
status_code=dashboard_utils.HTTPStatusCode.OK,
6262
message=f"Fetch {key} from datacenter success.",
6363
**{key: data},
6464
)
@@ -74,15 +74,21 @@ async def get_url(self, req) -> aiohttp.web.Response:
7474
async def test_aiohttp_cache(self, req) -> aiohttp.web.Response:
7575
value = req.query["value"]
7676
return dashboard_optional_utils.rest_response(
77-
success=True, message="OK", value=value, timestamp=time.time()
77+
status_code=dashboard_utils.HTTPStatusCode.OK,
78+
message="OK",
79+
value=value,
80+
timestamp=time.time(),
7881
)
7982

8083
@routes.get("/test/aiohttp_cache_lru/{sub_path}")
8184
@dashboard_optional_utils.aiohttp_cache(ttl_seconds=60, maxsize=5)
8285
async def test_aiohttp_cache_lru(self, req) -> aiohttp.web.Response:
8386
value = req.query.get("value")
8487
return dashboard_optional_utils.rest_response(
85-
success=True, message="OK", value=value, timestamp=time.time()
88+
status_code=dashboard_utils.HTTPStatusCode.OK,
89+
message="OK",
90+
value=value,
91+
timestamp=time.time(),
8692
)
8793

8894
@routes.get("/test/file")
@@ -101,7 +107,7 @@ async def block_event_loop(self, req) -> aiohttp.web.Response:
101107
time.sleep(seconds)
102108

103109
return dashboard_optional_utils.rest_response(
104-
success=True,
110+
status_code=dashboard_utils.HTTPStatusCode.OK,
105111
message=f"Blocked event loop for {seconds} seconds",
106112
timestamp=time.time(),
107113
)

python/ray/dashboard/modules/usage_stats/usage_stats_head.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
4545
@routes.get("/usage_stats_enabled")
4646
async def get_usage_stats_enabled(self, req) -> aiohttp.web.Response:
4747
return ray.dashboard.optional_utils.rest_response(
48-
success=True,
48+
status_code=dashboard_utils.HTTPStatusCode.OK,
4949
message="Fetched usage stats enabled",
5050
usage_stats_enabled=self.usage_stats_enabled,
5151
usage_stats_prompt_enabled=self.usage_stats_prompt_enabled,
@@ -54,7 +54,7 @@ async def get_usage_stats_enabled(self, req) -> aiohttp.web.Response:
5454
@routes.get("/cluster_id")
5555
async def get_cluster_id(self, req) -> aiohttp.web.Response:
5656
return ray.dashboard.optional_utils.rest_response(
57-
success=True,
57+
status_code=dashboard_utils.HTTPStatusCode.OK,
5858
message="Fetched cluster id",
5959
cluster_id=self.gcs_client.cluster_id.hex(),
6060
)

python/ray/dashboard/optional_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import ray
1919
import ray.dashboard.consts as dashboard_consts
20+
import ray.dashboard.utils as dashboard_utils
2021
from ray._private.ray_constants import RAY_INTERNAL_DASHBOARD_NAMESPACE, env_bool
2122

2223
# All third-party dependencies that are not included in the minimal Ray
@@ -83,7 +84,8 @@ def _update_cache(task):
8384
response = task.result()
8485
except Exception:
8586
response = rest_response(
86-
success=False, message=traceback.format_exc()
87+
status_code=dashboard_utils.HTTPStatusCode.INTERNAL_ERROR,
88+
message=traceback.format_exc(),
8789
)
8890
data = {
8991
"status": response.status,

0 commit comments

Comments
 (0)