
Commit 10ff71b

Merge pull request #304 from CybercentreCanada/persistent-service-update
Persistent service update
2 parents bc110aa + 1cc8eef commit 10ff71b

10 files changed (+120, -125 lines)

assemblyline_core/ingester/ingester.py

Lines changed: 3 additions & 2 deletions
@@ -49,6 +49,7 @@
 _retry_delay = 60 * 4  # Wait 4 minutes to retry
 _max_time = 2 * 24 * 60 * 60  # Wait 2 days for responses.
 HOUR_IN_SECONDS = 60 * 60
+COMPLETE_THREADS = int(environ.get('INGESTER_COMPLETE_THREADS', 4))
 INGEST_THREADS = int(environ.get('INGESTER_INGEST_THREADS', 1))
 SUBMIT_THREADS = int(environ.get('INGESTER_SUBMIT_THREADS', 4))

@@ -147,7 +148,7 @@ def __init__(self, datastore=None, logger=None, classification=None, redis=None,
                                       datastore=datastore, config=config)

         # Cache the user groups
-        self.cache_lock = threading.RLock()  # TODO are middle man instances single threaded now?
+        self.cache_lock = threading.RLock()
         self._user_groups = {}
         self._user_groups_reset = time.time()//HOUR_IN_SECONDS
         self.cache = {}
@@ -223,10 +224,10 @@ def __init__(self, datastore=None, logger=None, classification=None, redis=None,

     def try_run(self):
         threads_to_maintain = {
-            'Complete': self.handle_complete,
             'Retries': self.handle_retries,
             'Timeouts': self.handle_timeouts
         }
+        threads_to_maintain.update({f'Complete_{n}': self.handle_complete for n in range(COMPLETE_THREADS)})
         threads_to_maintain.update({f'Ingest_{n}': self.handle_ingest for n in range(INGEST_THREADS)})
         threads_to_maintain.update({f'Submit_{n}': self.handle_submit for n in range(SUBMIT_THREADS)})
         self.maintain_threads(threads_to_maintain)
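
Note on the change: completion handling, previously a single 'Complete' thread, now fans out the same way ingest and submit already did, sized by the INGESTER_COMPLETE_THREADS environment variable (default 4). A minimal, self-contained sketch of the pattern; the handler body and the thread loop are placeholder stand-ins, not the real Ingester internals:

import threading
from os import environ

COMPLETE_THREADS = int(environ.get('INGESTER_COMPLETE_THREADS', 4))

def handle_complete():
    pass  # stand-in for Ingester.handle_complete

# Yields Complete_0 .. Complete_3 by default; in the real code,
# maintain_threads() keeps one thread alive per named entry.
threads_to_maintain = {f'Complete_{n}': handle_complete for n in range(COMPLETE_THREADS)}
for name, target in threads_to_maintain.items():
    threading.Thread(target=target, name=name, daemon=True).start()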

assemblyline_core/metrics/heartbeat_formatter.py

Lines changed: 3 additions & 0 deletions
@@ -23,6 +23,7 @@
 from assemblyline.remote.datatypes.queues.comms import CommsQueue
 from assemblyline.remote.datatypes.queues.named import NamedQueue
 from assemblyline.remote.datatypes.queues.priority import PriorityQueue
+from assemblyline_core.ingester.constants import COMPLETE_QUEUE_NAME

 STATUS_QUEUE = "status"

@@ -68,6 +69,7 @@ def __init__(self, sender, log, config=None, redis=None):
         self.ingest_scanning = Hash('m-scanning-table', self.redis_persist)
         self.ingest_unique_queue = PriorityQueue('m-unique', self.redis_persist)
         self.ingest_queue = NamedQueue(INGEST_QUEUE_NAME, self.redis_persist)
+        self.ingest_complete_queue = NamedQueue(COMPLETE_QUEUE_NAME, self.redis)
         self.alert_queue = NamedQueue(ALERT_QUEUE_NAME, self.redis_persist)

         constants = forge.get_constants(self.config)
@@ -162,6 +164,7 @@ def send_heartbeat(self, m_type, m_name, m_data, instances):
                 "critical": c_q_len,
                 "high": h_q_len,
                 "ingest": self.ingest_queue.length(),
+                "complete": self.ingest_complete_queue.length(),
                 "low": l_q_len,
                 "medium": m_q_len
             }
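
With this change the ingester heartbeat also reports the completion queue depth. A rough sketch of the added metric read; note the complete queue is built on self.redis rather than self.redis_persist, presumably because completion messages need not survive a Redis restart. The 'm-ingest' queue name here is an assumption for illustration:

from assemblyline.remote.datatypes.queues.named import NamedQueue
from assemblyline_core.ingester.constants import COMPLETE_QUEUE_NAME

def queue_depths(redis, redis_persist):
    # Mirrors the fields assembled in send_heartbeat() above.
    ingest_queue = NamedQueue('m-ingest', redis_persist)   # assumed INGEST_QUEUE_NAME
    complete_queue = NamedQueue(COMPLETE_QUEUE_NAME, redis)
    return {'ingest': ingest_queue.length(), 'complete': complete_queue.length()}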

assemblyline_core/scaler/controllers/docker_ctl.py

Lines changed: 64 additions & 21 deletions
@@ -1,25 +1,25 @@
+from __future__ import annotations
 import os
 import threading
 import time
+from collections import defaultdict
 from typing import List, Tuple, Dict
+import uuid

-from assemblyline.odm.models.service import DockerConfig
+from assemblyline.odm.models.service import DependencyConfig, DockerConfig
 from .interface import ControllerInterface, ServiceControlError

-# How to identify the update volume as a whole, in a way that the underlying container system recognizes.
-FILE_UPDATE_VOLUME = os.environ.get('FILE_UPDATE_VOLUME', None)
-
 # Where to find the update directory inside this container.
-FILE_UPDATE_DIRECTORY = os.environ.get('FILE_UPDATE_DIRECTORY', None)
 INHERITED_VARIABLES = ['HTTP_PROXY', 'HTTPS_PROXY', 'NO_PROXY', 'http_proxy', 'https_proxy', 'no_proxy']

 # Every this many seconds, check that the services can actually reach the service server.
 NETWORK_REFRESH_INTERVAL = 60 * 3
+CHANGE_KEY_NAME = 'al_change_key'


 class DockerController(ControllerInterface):
     """A controller for *non* swarm mode docker."""
-    def __init__(self, logger, prefix='', labels=None, cpu_overallocation=1, memory_overallocation=1, log_level="INFO"):
+    def __init__(self, logger, prefix='', labels: dict[str, str] = None, cpu_overallocation=1, memory_overallocation=1, log_level="INFO"):
         """
         :param logger: A logger to report status and debug information.
         :param prefix: A prefix used to distinguish containers launched by this controller.
@@ -32,9 +32,11 @@ def __init__(self, logger, prefix='', labels=None, cpu_overallocation=1, memory_
         self.log = logger
         self.log_level = log_level
         self.global_mounts: List[Tuple[str, str]] = []
+        self.core_mounts: List[Tuple[str, str]] = []
         self._prefix: str = prefix
-        self._labels = labels
+        self._labels: dict[str, str] = labels or {}
         self.prune_lock = threading.Lock()
+        self._service_limited_env: dict[str, dict[str, str]] = defaultdict(dict)

         for network in self.client.networks.list(names=['external']):
             self.external_network = network
@@ -109,8 +111,8 @@ def _flush_containers(self):

     def add_profile(self, profile, scale=0):
         """Tell the controller about a service profile it needs to manage."""
-        self._pull_image(profile)
         self._profiles[profile.name] = profile
+        self._pull_image(profile)

     def _start(self, service_name):
         """Launch a docker container in a manner suitable for Assemblyline."""
@@ -124,15 +126,13 @@ def _start(self, service_name):

         # Prepare the volumes and folders
         volumes = {row[0]: {'bind': row[1], 'mode': 'ro'} for row in self.global_mounts}
-        volumes[os.path.join(FILE_UPDATE_VOLUME, service_name)] = {'bind': '/mount/updates/', 'mode': 'ro'}
-        if not os.path.exists(os.path.join(FILE_UPDATE_DIRECTORY, service_name)):
-            os.makedirs(os.path.join(FILE_UPDATE_DIRECTORY, service_name), 0x777)

         # Define environment variables
         env = [f'{_e.name}={_e.value}' for _e in cfg.environment]
         env += ['UPDATE_PATH=/mount/updates/']
         env += [f'{name}={os.environ[name]}' for name in INHERITED_VARIABLES if name in os.environ]
         env += [f'LOG_LEVEL={self.log_level}']
+        env += [f'{_n}={_v}' for _n, _v in self._service_limited_env[service_name].items()]

         container = self.client.containers.run(
             image=cfg.image,
@@ -152,7 +152,7 @@ def _start(self, service_name):
         if cfg.allow_internet_access:
             self.external_network.connect(container)

-    def _start_container(self, name, labels, volumes, cfg: DockerConfig, network, hostname):
+    def _start_container(self, service_name, name, labels, volumes, cfg: DockerConfig, network, hostname, core_container=False):
         """Launch a docker container."""
         # Take the port strings and convert them to a dictionary
         ports = {}
@@ -174,9 +174,13 @@ def _start_container(self, name, labels, volumes, cfg: DockerConfig, network, ho
             self.log.warning(f"Not sure how to parse port string {port_string} for container {name} not using it...")

         # Put together the environment variables
-        env = [f'{_e.name}={_e.value}' for _e in cfg.environment]
+        env = []
+        if core_container:
+            env += [f'{_n}={_v}' for _n, _v in os.environ.items()
+                    if any(term in _n for term in ['ELASTIC', 'FILESTORE', 'UI_SERVER'])]
+        env += [f'{_e.name}={_e.value}' for _e in cfg.environment]
         env += [f'{name}={os.environ[name]}' for name in INHERITED_VARIABLES if name in os.environ]
-        env += [f'LOG_LEVEL={self.log_level}']
+        env += [f'LOG_LEVEL={self.log_level}', f'AL_SERVICE_NAME={service_name}']

         container = self.client.containers.run(
             image=cfg.image,
@@ -192,8 +196,9 @@ def _start_container(self, name, labels, volumes, cfg: DockerConfig, network, ho
             network=network,
             environment=env,
             detach=True,
-            ports=ports,
+            # ports=ports,
         )
+
         if cfg.allow_internet_access:
             self.external_network.connect(container, aliases=[hostname])

@@ -324,16 +329,44 @@ def get_running_container_names(self):
             out.append(container.name)
         return out

-    def start_stateful_container(self, service_name, container_name, spec, labels, mount_updates=False, change_key=''):
-        volumes = {_n: {'bind': _v.mount_path, 'mode': 'rw'} for _n, _v in spec.volumes.items()}
+    def start_stateful_container(self, service_name: str, container_name: str, spec: DependencyConfig,
+                                 labels: dict[str, str], change_key: str):
+        import docker.errors
         deployment_name = f'{service_name}-dep-{container_name}'

+        change_check = change_key + service_name + container_name + str(spec)
+
+        try:
+            old_container = self.client.containers.get(deployment_name)
+            instance_key = old_container.attrs["Config"]["Env"]['AL_INSTANCE_KEY']
+            if old_container.labels.get(CHANGE_KEY_NAME) == change_check and old_container.status == 'running':
+                self._service_limited_env[service_name][f'{container_name}_host'] = deployment_name
+                self._service_limited_env[service_name][f'{container_name}_key'] = instance_key
+                if spec.container.ports:
+                    self._service_limited_env[service_name][f'{container_name}_port'] = spec.container.ports[0]
+                return
+            else:
+                old_container.kill()
+        except docker.errors.NotFound:
+            instance_key = uuid.uuid4().hex
+
+        volumes = {_n: {'bind': _v.mount_path, 'mode': 'rw'} for _n, _v in spec.volumes.items()}
+        if spec.run_as_core:
+            volumes.update({row[0]: {'bind': row[1], 'mode': 'ro'} for row in self.core_mounts})
+
         all_labels = dict(self._labels)
-        all_labels.update({'component': service_name})
+        all_labels.update({'component': service_name, CHANGE_KEY_NAME: change_check})
         all_labels.update(labels)

-        self._start_container(name=deployment_name, labels=all_labels, volumes=volumes, hostname=container_name,
-                              cfg=spec.container, network=self._get_network(service_name).name)
+        spec.container.environment.append({'name': 'AL_INSTANCE_KEY', 'value': instance_key})
+
+        self._service_limited_env[service_name][f'{container_name}_host'] = deployment_name
+        self._service_limited_env[service_name][f'{container_name}_key'] = instance_key
+        if spec.container.ports:
+            self._service_limited_env[service_name][f'{container_name}_port'] = spec.container.ports[0]
+
+        self._start_container(service_name=service_name, name=deployment_name, labels=all_labels, volumes=volumes, hostname=container_name,
+                              cfg=spec.container, core_container=spec.run_as_core, network=self._get_network(service_name).name)

     def stop_containers(self, labels):
         label_strings = [f'{name}={value}' for name, value in labels.items()]
@@ -368,6 +401,7 @@ def _pull_image(self, service):

         This lets us override the auth_config on a per image basis.
         """
+        from docker.errors import ImageNotFound
         # Split the image string into "[registry/]image_name" and "tag"
         repository, _, tag = service.container_config.image.rpartition(':')
         if '/' in tag:
@@ -385,4 +419,13 @@ def _pull_image(self, service):
                 'password': service.container_config.registry_password
             }

-        self.client.images.pull(repository, tag, auth_config=auth_config)
+        try:
+            self.client.images.pull(repository, tag, auth_config=auth_config)
+        except ImageNotFound:
+            self.log.error(f"Couldn't pull image {repository}:{tag} check authentication settings. "
+                           "Will try to use local copy.")
+
+        try:
+            self.client.images.get(repository + ':' + tag)
+        except ImageNotFound:
+            self.log.error(f"Couldn't find local image {repository}:{tag}")
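
The heart of the Docker change is the change-key check in start_stateful_container: the desired dependency spec is fingerprinted into change_check, stamped on the container as the al_change_key label, and the container is only replaced when that fingerprint differs. A stripped-down sketch of the same idempotency pattern with docker-py; container name and image are hypothetical, and remove(force=True) substitutes for kill() so the name can be reused in a standalone example (the attrs Config.Env entry is a list of 'KEY=VALUE' strings, so this sketch regenerates the instance key rather than indexing it):

import uuid
import docker
from docker.errors import NotFound

CHANGE_KEY_NAME = 'al_change_key'

def ensure_dependency(client: docker.DockerClient, name: str, image: str, change_check: str):
    try:
        old = client.containers.get(name)
        if old.labels.get(CHANGE_KEY_NAME) == change_check and old.status == 'running':
            return old  # spec unchanged: keep the running container
        old.remove(force=True)  # stale spec: replace it
    except NotFound:
        pass
    return client.containers.run(
        image=image,
        name=name,
        labels={CHANGE_KEY_NAME: change_check},
        environment=[f'AL_INSTANCE_KEY={uuid.uuid4().hex}'],
        detach=True,
    )

# e.g. ensure_dependency(docker.from_env(), 'svc-dep-updates', 'redis:6', 'abc123')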

assemblyline_core/scaler/controllers/interface.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ def get_running_container_names(self):
     def new_events(self):
         return []

-    def start_stateful_container(self, service_name, container_name, spec, labels, change_key):
+    def start_stateful_container(self, service_name: str, container_name: str, spec, labels, change_key):
         raise NotImplementedError()

     def stop_containers(self, labels):

assemblyline_core/scaler/controllers/kubernetes_ctl.py

Lines changed: 8 additions & 13 deletions
@@ -436,12 +436,6 @@ def memory_info(self):
             return self._quota_mem_limit - self._pod_used_namespace_ram, self._quota_mem_limit
         return self._node_pool_max_ram - self._pod_used_ram, self._node_pool_max_ram

-    def _create_volumes(self, core_mounts=False):
-        volumes, mounts = [], []
-
-
-        return volumes, mounts
-
     def _create_containers(self, service_name: str, deployment_name: str, container_config, mounts,
                            core_container=False):
         cores = container_config.cpu_cores
@@ -476,9 +470,9 @@ def _create_containers(self, service_name: str, deployment_name: str, container_
         )]

     def _create_deployment(self, service_name: str, deployment_name: str, docker_config: DockerConfig,
-                           shutdown_seconds: int, scale: int, labels:dict[str,str]=None,
-                           volumes:list[V1Volume]=None, mounts:list[V1VolumeMount]=None,
-                           core_mounts:bool=False, change_key:str=''):
+                           shutdown_seconds: int, scale: int, labels: dict[str, str] = None,
+                           volumes: list[V1Volume] = None, mounts: list[V1VolumeMount] = None,
+                           core_mounts: bool = False, change_key: str = ''):
         # Build a cache key to check for changes, just trying to only patch what changed
         # will still potentially result in a lot of restarts due to different kubernetes
         # systems returning differently formatted data
@@ -490,7 +484,8 @@ def _create_deployment(self, service_name: str, deployment_name: str, docker_con
         # Check if a deployment already exists, and if it does check if it has the same change key set
         replace = None
         try:
-            replace = self.apps_api.read_namespaced_deployment(deployment_name, namespace=self.namespace, _request_timeout=API_TIMEOUT)
+            replace = self.apps_api.read_namespaced_deployment(
+                deployment_name, namespace=self.namespace, _request_timeout=API_TIMEOUT)
             if replace.metadata.annotations.get(CHANGE_KEY_NAME) == change_key:
                 if replace.spec.replicas != scale:
                     self.set_target(service_name, scale)
@@ -530,7 +525,7 @@ def _create_deployment(self, service_name: str, deployment_name: str, docker_con
         # Send it to the server
         if current_pull_secret:
             self.api.patch_namespaced_secret(pull_secret_name, namespace=self.namespace, body=new_pull_secret,
-                                            _request_timeout=API_TIMEOUT)
+                                             _request_timeout=API_TIMEOUT)
         else:
             self.api.create_namespaced_secret(namespace=self.namespace, body=new_pull_secret,
                                               _request_timeout=API_TIMEOUT)
@@ -613,7 +608,7 @@ def set_target(self, service_name: str, target: int):
                                                                 _request_timeout=API_TIMEOUT)
                 scale.spec.replicas = target
                 self.apps_api.patch_namespaced_deployment_scale(name=name, namespace=self.namespace, body=scale,
-                                                               _request_timeout=API_TIMEOUT)
+                                                                _request_timeout=API_TIMEOUT)
                 return
             except client.ApiException as error:
                 # If the error is a conflict, it means multiple attempts to scale a deployment
@@ -673,7 +668,7 @@ def new_events(self):
         return new

     def start_stateful_container(self, service_name: str, container_name: str,
-                                 spec, labels: dict[str, str], change_key:str):
+                                 spec, labels: dict[str, str], change_key: str):
         # Setup PVC
         deployment_name = self._dependency_name(service_name, container_name)
         mounts, volumes = [], []
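
The Kubernetes controller applies the same idea through a deployment annotation: _create_deployment stores the change key on the deployment and skips the replace when it matches. Roughly, the existence-and-currency check looks like this with the kubernetes client (simplified; the real method also reconciles replica counts, and API_TIMEOUT's value here is assumed):

from kubernetes import client

CHANGE_KEY_NAME = 'al_change_key'
API_TIMEOUT = 90  # assumed timeout in seconds

def deployment_is_current(apps_api: client.AppsV1Api, name: str, namespace: str, change_key: str) -> bool:
    try:
        existing = apps_api.read_namespaced_deployment(
            name, namespace=namespace, _request_timeout=API_TIMEOUT)
    except client.ApiException as error:
        if error.status == 404:
            return False  # nothing deployed yet
        raise
    annotations = existing.metadata.annotations or {}
    return annotations.get(CHANGE_KEY_NAME) == change_key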

assemblyline_core/scaler/scaler_server.py

Lines changed: 16 additions & 1 deletion
@@ -16,6 +16,7 @@
 from contextlib import contextmanager

 import elasticapm
+import yaml

 from assemblyline.remote.datatypes.queues.named import NamedQueue
 from assemblyline.remote.datatypes.queues.priority import PriorityQueue, length as pq_length
@@ -26,7 +27,7 @@
 from assemblyline.odm.messages.scaler_heartbeat import Metrics
 from assemblyline.odm.messages.scaler_status_heartbeat import Status
 from assemblyline.odm.messages.changes import ServiceChange, Operation
-from assemblyline.common.forge import get_service_queue
+from assemblyline.common.forge import get_classification, get_service_queue
 from assemblyline.common.constants import SCALER_TIMEOUT_QUEUE, SERVICE_STATE_HASH, ServiceStatus
 from assemblyline_core.scaler.controllers import KubernetesController
 from assemblyline_core.scaler.controllers.interface import ServiceControlError
@@ -60,6 +61,8 @@
 CLASSIFICATION_CONFIGMAP = os.getenv('CLASSIFICATION_CONFIGMAP', None)
 CLASSIFICATION_CONFIGMAP_KEY = os.getenv('CLASSIFICATION_CONFIGMAP_KEY', 'classification.yml')

+DOCKER_CONFIGURATION_PATH = os.getenv('DOCKER_CONFIGURATION_PATH', None)
+DOCKER_CONFIGURATION_VOLUME = os.getenv('DOCKER_CONFIGURATION_VOLUME', None)
 CONFIGURATION_CONFIGMAP = os.getenv('CONFIGURATION_CONFIGMAP', None)
 CONFIGURATION_CONFIGMAP_KEY = os.getenv('CONFIGURATION_CONFIGMAP_KEY', 'config')

@@ -276,6 +279,16 @@ def __init__(self, config=None, datastore=None, redis=None, redis_persist=None):
                 cpu_overallocation=self.config.core.scaler.cpu_overallocation,
                 memory_overallocation=self.config.core.scaler.memory_overallocation,
                 labels=labels, log_level=self.config.logging.log_level)
+
+            if DOCKER_CONFIGURATION_PATH and DOCKER_CONFIGURATION_VOLUME:
+                self.controller.core_mounts.append((DOCKER_CONFIGURATION_VOLUME, '/etc/assemblyline/'))
+
+                with open(os.path.join(DOCKER_CONFIGURATION_PATH, 'config.yml'), 'w') as handle:
+                    yaml.dump(self.config.as_primitives(), handle)
+
+                with open(os.path.join(DOCKER_CONFIGURATION_PATH, 'classification.yml'), 'w') as handle:
+                    yaml.dump(get_classification().original_definition, handle)
+
             # If we know where to find it, mount the classification into the service containers
             if CLASSIFICATION_HOST_PATH:
                 self.controller.global_mounts.append((CLASSIFICATION_HOST_PATH, '/etc/assemblyline/classification.yml'))
@@ -397,6 +410,7 @@ def prepare_container(docker_config: DockerConfig) -> DockerConfig:
                 dependency_blobs[_n] = str(dependency)

             if service.enabled and stage == ServiceStage.Off:
+                self.log.info(f'Preparing environment for {service.name}')
                 # Move to the next service stage (do this first because the container we are starting may care)
                 if service.update_config and service.update_config.wait_for_update:
                     self._service_stage_hash.set(name, ServiceStage.Update)
@@ -408,6 +422,7 @@ def prepare_container(docker_config: DockerConfig) -> DockerConfig:
             # Enable this service's dependencies before trying to launch the service containers
             self.controller.prepare_network(service.name, service.docker_config.allow_internet_access)
             for _n, dependency in dependency_config.items():
+                self.log.info(f'Launching {service.name} dependency {_n}')
                 self.controller.start_stateful_container(
                     service_name=service.name,
                     container_name=_n,
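
The new DOCKER_CONFIGURATION_PATH/DOCKER_CONFIGURATION_VOLUME pair lets scaler write the effective system config and classification definition to a volume that core_mounts then exposes read-only at /etc/assemblyline/ inside core containers. A standalone sketch of the dump step, with plain dicts standing in for config.as_primitives() and get_classification().original_definition, and hypothetical path/volume values:

import os
import yaml

DOCKER_CONFIGURATION_PATH = '/tmp/assemblyline-conf'   # hypothetical host path
DOCKER_CONFIGURATION_VOLUME = 'al_config'              # hypothetical named volume

config_primitives = {'core': {'scaler': {'cpu_overallocation': 1}}}  # stand-in
classification_definition = {'enforce': False}                       # stand-in

os.makedirs(DOCKER_CONFIGURATION_PATH, exist_ok=True)
with open(os.path.join(DOCKER_CONFIGURATION_PATH, 'config.yml'), 'w') as handle:
    yaml.dump(config_primitives, handle)
with open(os.path.join(DOCKER_CONFIGURATION_PATH, 'classification.yml'), 'w') as handle:
    yaml.dump(classification_definition, handle)

core_mounts = [(DOCKER_CONFIGURATION_VOLUME, '/etc/assemblyline/')]  # as in the diff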
