Skip to content

Commit c008ab0

Browse files
authored
Fix bug where the external network is killed during run/restart (#1000)
When ToolHive starts or restarts a workload, it checks to see if a container with the same name exists. If the configuration has changed, it needs to remove the container and recreate it. This happens in a method named handleExistingContainer. Previously, the code used the RemoveWorkload method to clean up the old container. However, when the logic to create a separate external Docker network was added, the RemoveWorkload method was extended to remove the external network if no workloads were left. If no other workloads are running during a start or restart and the run/restart involves a workload with the same name as an exited container in Docker - a situation arises where ToolHive checks to see if the external network exists, then checks to see if there is an existing container, and ends up deleting the container and the external network after checking to ensure it exists. When the MCP server starts, it goes into an error state because it has been configured to use a network which no longer exists. It is not straightforward to move the external network check due to the sequence of operations needed to support the ingress/egress proxies (even for workloads where network isolation is disabled). Instead, I refactored the code to split the container deletion logic from RemoveWorkload into a private method, and I call that method directly in the handleExistingContainer method.
1 parent 8b0671c commit c008ab0

File tree

1 file changed

+80
-50
lines changed

1 file changed

+80
-50
lines changed

pkg/container/docker/client.go

Lines changed: 80 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ const (
4141
LabelValueTrue = "true"
4242
)
4343

44-
// Client implements the Runtime interface for container operations
44+
// Client implements the Runtime interface for Docker (and compatible runtimes)
4545
type Client struct {
4646
runtimeType runtime.Type
4747
socketPath string
@@ -717,51 +717,20 @@ func (c *Client) RemoveWorkload(ctx context.Context, workloadID string) error {
717717
containerName := containerResponse.Name
718718
containerName = strings.TrimPrefix(containerName, "/")
719719

720-
err = c.client.ContainerRemove(ctx, workloadID, container.RemoveOptions{
721-
Force: true,
722-
})
723-
if err != nil {
724-
// If the workload doesn't exist, that's fine - it's already removed
725-
if errdefs.IsNotFound(err) {
726-
return nil
727-
}
728-
return NewContainerError(err, workloadID, fmt.Sprintf("failed to remove workload: %v", err))
729-
}
730-
731-
// If network isolation is not enabled, then there is nothing else to do.
732-
// NOTE: This check treats all workloads created before the introduction of
733-
// this label as having network isolation enabled. This is to ensure that they
734-
// get cleaned up properly during stop/rm. There may be some spurious warnings
735-
// from the following code, but they can be ignored.
736-
if containerResponse.Config != nil && !lb.HasNetworkIsolation(containerResponse.Config.Labels) {
737-
return nil
720+
// remove the workload containers
721+
var labels map[string]string
722+
if containerResponse.Config != nil {
723+
labels = containerResponse.Config.Labels
724+
} else {
725+
labels = make(map[string]string)
738726
}
739-
740-
// remove egress, ingress, and dns containers
741-
suffixes := []string{"egress", "ingress", "dns"}
742-
743-
for _, suffix := range suffixes {
744-
containerName := fmt.Sprintf("%s-%s", containerName, suffix)
745-
containerId, err := c.findExistingContainer(ctx, containerName)
746-
if err != nil {
747-
logger.Debugf("Failed to find %s container %s: %v", suffix, containerName, err)
748-
continue
749-
}
750-
if containerId == "" {
751-
continue
752-
}
753-
754-
err = c.client.ContainerRemove(ctx, containerId, container.RemoveOptions{
755-
Force: true,
756-
})
757-
if err != nil {
758-
if errdefs.IsNotFound(err) {
759-
continue
760-
}
761-
return NewContainerError(err, containerId, fmt.Sprintf("failed to remove %s container: %v", suffix, err))
762-
}
727+
err = c.removeWorkloadContainers(ctx, containerName, workloadID, labels)
728+
if err != nil {
729+
return err // removeWorkloadContainers already wraps the error with context.
763730
}
764731

732+
// Clear up any networks associated with this workload.
733+
// This also deletes the external network if no other workloads are using it.
765734
err = c.deleteNetworks(ctx, containerName)
766735
if err != nil {
767736
logger.Warnf("Failed to delete networks for container %s: %v", containerName, err)
@@ -1358,14 +1327,17 @@ func (c *Client) handleExistingContainer(
13581327
return true, nil
13591328
}
13601329

1361-
// Configurations don't match, need to recreate the container
1362-
// Stop the workload
1363-
if err := c.StopWorkload(ctx, containerID); err != nil {
1364-
return false, err
1330+
// Configurations don't match, we need to recreate the container.
1331+
// Remove the existing container and any of the proxy containers.
1332+
// Note that we do not use the RemoveWorkload method because that
1333+
// will delete networks - which we do not want to do here.
1334+
var labels map[string]string
1335+
if info.Config != nil {
1336+
labels = info.Config.Labels
1337+
} else {
1338+
labels = make(map[string]string)
13651339
}
1366-
1367-
// Remove the workload
1368-
if err := c.RemoveWorkload(ctx, containerID); err != nil {
1340+
if err := c.removeWorkloadContainers(ctx, info.Name, containerID, labels); err != nil {
13691341
return false, err
13701342
}
13711343

@@ -1423,3 +1395,61 @@ func (c *Client) deleteNetwork(ctx context.Context, name string) error {
14231395
}
14241396
return nil
14251397
}
1398+
1399+
// removeWorkloadContainers removes the MCP server container and any proxy containers.
1400+
func (c *Client) removeWorkloadContainers(
1401+
ctx context.Context,
1402+
containerName string,
1403+
workloadID string,
1404+
workloadLabels map[string]string,
1405+
) error {
1406+
// remove the / if it starts with it
1407+
containerName = strings.TrimPrefix(containerName, "/")
1408+
1409+
err := c.client.ContainerRemove(ctx, workloadID, container.RemoveOptions{
1410+
Force: true,
1411+
})
1412+
if err != nil {
1413+
// If the workload doesn't exist, that's fine - it's already removed
1414+
if errdefs.IsNotFound(err) {
1415+
return nil
1416+
}
1417+
return NewContainerError(err, workloadID, fmt.Sprintf("failed to remove workload: %v", err))
1418+
}
1419+
1420+
// If network isolation is not enabled, then there is nothing else to do.
1421+
// NOTE: This check treats all workloads created before the introduction of
1422+
// this label as having network isolation enabled. This is to ensure that they
1423+
// get cleaned up properly during stop/rm. There may be some spurious warnings
1424+
// from the following code, but they can be ignored.
1425+
if !lb.HasNetworkIsolation(workloadLabels) {
1426+
return nil
1427+
}
1428+
1429+
// remove egress, ingress, and dns containers
1430+
suffixes := []string{"egress", "ingress", "dns"}
1431+
1432+
for _, suffix := range suffixes {
1433+
containerName := fmt.Sprintf("%s-%s", containerName, suffix)
1434+
containerId, err := c.findExistingContainer(ctx, containerName)
1435+
if err != nil {
1436+
logger.Debugf("Failed to find %s container %s: %v", suffix, containerName, err)
1437+
continue
1438+
}
1439+
if containerId == "" {
1440+
continue
1441+
}
1442+
1443+
err = c.client.ContainerRemove(ctx, containerId, container.RemoveOptions{
1444+
Force: true,
1445+
})
1446+
if err != nil {
1447+
if errdefs.IsNotFound(err) {
1448+
continue
1449+
}
1450+
return NewContainerError(err, containerId, fmt.Sprintf("failed to remove %s container: %v", suffix, err))
1451+
}
1452+
}
1453+
1454+
return nil
1455+
}

0 commit comments

Comments
 (0)