From 2322e71f956ac5dcca0fa469967b6f52483c9209 Mon Sep 17 00:00:00 2001 From: Samarth Singh <45368705+sam6134@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:21:34 +0100 Subject: [PATCH 01/25] Pass GOMEMLIMIT as env variable (#56) --- .../templates/linux/neuron-monitor-daemonset.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml index 32136484..262051b5 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml @@ -35,6 +35,8 @@ spec: fieldPath: spec.nodeName - name: PATH value: /usr/local/bin:/usr/bin:/bin:/opt/aws/neuron/bin + - name: GOMEMLIMIT + value: 160MiB ports: - name: "metrics" port: {{ .Values.neuronMonitor.service.port }} From e0e99c77f69ef388b0ffce769371f7c735a776e4 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim <884273+movence@users.noreply.github.com> Date: Tue, 2 Jul 2024 12:17:42 -0400 Subject: [PATCH 02/25] release 1.8.0 (#57) --- RELEASE_NOTES | 11 +++++++++++ charts/amazon-cloudwatch-observability/Chart.yaml | 2 +- charts/amazon-cloudwatch-observability/values.yaml | 4 ++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index da689279..e4f116db 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,3 +1,14 @@ +======================================================================= +amazon-cloudwatch-observability v1.8.0 (2024-07-02) +======================================================================== +Bug Fixes: +* Add GOMEMLIMIT environment variable for Neuron Monitor to fix OOM crash issue (#56) + +Enhancements: +* Update Windows Fluent-Bit configuration to export Kubelet and kube-proxy service logs to host log group (#45) +* Upgrade CWAgent Operator to v1.4.1 +* Upgrade CWAgent to v1.300041.0 + ======================================================================= amazon-cloudwatch-observability v1.7.0 (2024-05-23) ======================================================================== diff --git a/charts/amazon-cloudwatch-observability/Chart.yaml b/charts/amazon-cloudwatch-observability/Chart.yaml index eb268ad8..b9290b5f 100644 --- a/charts/amazon-cloudwatch-observability/Chart.yaml +++ b/charts/amazon-cloudwatch-observability/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: amazon-cloudwatch-observability -version: 1.7.0 +version: 1.8.0 appVersion: 1.0.0 description: A Helm chart for Amazon CloudWatch Observability type: application diff --git a/charts/amazon-cloudwatch-observability/values.yaml b/charts/amazon-cloudwatch-observability/values.yaml index 4b133a0d..c8ed4972 100644 --- a/charts/amazon-cloudwatch-observability/values.yaml +++ b/charts/amazon-cloudwatch-observability/values.yaml @@ -392,7 +392,7 @@ manager: name: image: repository: cloudwatch-agent-operator - tag: 1.4.0 + tag: 1.4.1 repositoryDomainMap: public: public.ecr.aws/cloudwatch-agent cn-north-1: 934860584483.dkr.ecr.cn-north-1.amazonaws.com.cn @@ -496,7 +496,7 @@ agent: name: image: repository: cloudwatch-agent - tag: 1.300040.0b650 + tag: 1.300041.0b681 repositoryDomainMap: public: public.ecr.aws/cloudwatch-agent cn-north-1: 934860584483.dkr.ecr.cn-north-1.amazonaws.com.cn From 48781f522fb78f91e769bea5f6f8d09ce7f67136 Mon Sep 17 00:00:00 2001 From: Parampreet Singh <50599809+Paramadon@users.noreply.github.com> Date: Wed, 3 Jul 2024 09:27:23 -0400 Subject: [PATCH 03/25] EKS Addon Fargate Bug Fix (#58) --- .../linux/cloudwatch-agent-daemonset.yaml | 9 ++++ .../linux/dcgm-exporter-daemonset.yaml | 12 +++-- .../templates/linux/fluent-bit-daemonset.yaml | 9 ++++ .../linux/neuron-monitor-daemonset.yaml | 20 +++++---- .../values.yaml | 44 ++++++++++--------- 5 files changed, 61 insertions(+), 33 deletions(-) diff --git a/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml index 84b26b7c..c5298d42 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml @@ -30,6 +30,15 @@ spec: nodeSelector: kubernetes.io/os: linux serviceAccount: {{ template "cloudwatch-agent.serviceAccountName" . }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ .Values.fargateLabelKey }} + operator: NotIn + values: + - fargate {{- if .Values.agent.config }} config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.config) . ) }} {{- else }} diff --git a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml index 82ce49ce..f812bb0f 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml @@ -15,10 +15,14 @@ spec: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - - key: {{ .Values.nodeLabelKey }} - operator: In - values: {{ .Values.gpuInstances | toYaml | nindent 16 }} + - matchExpressions: + - key: {{ .Values.nodeLabelKey }} + operator: In + values: {{ .Values.gpuInstances | toYaml | nindent 16 }} + - key: {{ .Values.fargateLabelKey }} + operator: NotIn + values: + - fargate resources: requests: cpu: 250m diff --git a/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml index ed8a8d9c..79d3698f 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml @@ -95,6 +95,15 @@ spec: hostPath: path: /var/log/dmesg serviceAccountName: {{ template "cloudwatch-agent.serviceAccountName" . }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ .Values.fargateLabelKey }} + operator: NotIn + values: + - fargate nodeSelector: kubernetes.io/os: linux {{- with .Values.tolerations }} diff --git a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml index 262051b5..686a6084 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml @@ -14,13 +14,17 @@ spec: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: kubernetes.io/os - operator: In - values: - - linux - - key: {{ .Values.nodeLabelKey }} - operator: In - values: {{ .Values.neuronInstances | toYaml | nindent 20 }} + - key: kubernetes.io/os + operator: In + values: + - linux + - key: {{ .Values.nodeLabelKey }} + operator: In + values: {{ .Values.neuronInstances | toYaml | nindent 20 }} + - key: {{ .Values.fargateLabelKey }} + operator: NotIn + values: + - fargate resources: limits: cpu: 500m @@ -41,7 +45,7 @@ spec: - name: "metrics" port: {{ .Values.neuronMonitor.service.port }} command: - - "/opt/bin/entrypoint.sh" + - "/opt/bin/entrypoint.sh" args: port: "{{ .Values.neuronMonitor.service.port }}" cert-file: "/etc/amazon-cloudwatch-observability-neuron-cert/server.crt" diff --git a/charts/amazon-cloudwatch-observability/values.yaml b/charts/amazon-cloudwatch-observability/values.yaml index c8ed4972..284dd190 100644 --- a/charts/amazon-cloudwatch-observability/values.yaml +++ b/charts/amazon-cloudwatch-observability/values.yaml @@ -18,6 +18,8 @@ clusterName: region: nodeLabelKey: node.kubernetes.io/instance-type +fargateLabelKey: eks.amazonaws.com/compute-type + ## NVIDIA GPU instance types gpuInstances: [ p2.xlarge, p2.8xlarge, p2.16xlarge, p3.2xlarge, p3.8xlarge, p3.16xlarge, p3dn.24xlarge, p4d.24xlarge, p4de.24xlarge, p5.48xlarge, g3s.xlarge, g3.4xlarge, g3.8xlarge, g3.16xlarge, g4dn.xlarge, g4dn.2xlarge, g4dn.4xlarge, g4dn.8xlarge, g4dn.16xlarge, g4dn.12xlarge, g4dn.metal, g4ad.xlarge, g4ad.2xlarge, g4ad.4xlarge, g4ad.8xlarge, g4ad.16xlarge, g5.xlarge, g5.2xlarge, g5.4xlarge, g5.8xlarge, g5.16xlarge, g5.12xlarge, g5.24xlarge, g5.48xlarge, g5g.xlarge, g5g.2xlarge, g5g.4xlarge, g5g.8xlarge, g5g.16xlarge, g5g.metal, ml.p5.48xlarge, ml.p4d.24xlarge, ml.p4de.24xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.p3dn.24xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.g3s.xlarge, ml.g3.4xlarge, ml.g3.8xlarge, ml.g3.16xlarge, ml.g4dn.xlarge, ml.g4dn.2xlarge, ml.g4dn.4xlarge, ml.g4dn.8xlarge, ml.g4dn.16xlarge, ml.g4dn.12xlarge, ml.g4dn.metal, ml.g4ad.xlarge, ml.g4ad.2xlarge, ml.g4ad.4xlarge, ml.g4ad.8xlarge, ml.g4ad.16xlarge, ml.g5.xlarge, ml.g5.2xlarge, ml.g5.4xlarge, ml.g5.8xlarge, ml.g5.16xlarge, ml.g5.12xlarge, ml.g5.24xlarge, ml.g5.48xlarge, ml.g5g.xlarge, ml.g5g.2xlarge, ml.g5g.4xlarge, ml.g5g.8xlarge, ml.g5g.16xlarge, ml.g5g.metal] @@ -60,14 +62,14 @@ containerLogs: Regex ^(?