diff --git a/.github/workflows/amazon-cloudwatch-observability-image-scan.yaml b/.github/workflows/amazon-cloudwatch-observability-image-scan.yaml
new file mode 100644
index 00000000..c545a3da
--- /dev/null
+++ b/.github/workflows/amazon-cloudwatch-observability-image-scan.yaml
@@ -0,0 +1,86 @@
+# Scans the container images referenced by the chart's values.yaml for
+# vulnerabilities, on a weekly schedule and on manual dispatch.
+name: Run Image Scan for Amazon CloudWatch Observability Helm Chart
+
+on:
+  schedule:
+    - cron: "0 13 * * MON"  # Every Monday at 13:00 UTC (9AM EDT / 8AM EST)
+  workflow_dispatch:
+
+permissions:
+  id-token: write  # required for OIDC-based role assumption below
+  contents: read
+
+env:
+  TERRAFORM_AWS_ASSUME_ROLE: ${{ secrets.TERRAFORM_AWS_ASSUME_ROLE }}
+  AWS_DEFAULT_REGION: us-west-2
+
+jobs:
+  ContainerImageScan:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # one failing image must not cancel the other scans
+      matrix:
+        # Each entry holds yq paths into the chart's values.yaml.
+        container_images:
+          - registry: ".manager.image.repositoryDomainMap.public"
+            repository: ".manager.image.repository"
+            tag: ".manager.image.tag"
+
+          - registry: ".manager.autoInstrumentationImage.java.repositoryDomain"
+            repository: ".manager.autoInstrumentationImage.java.repository"
+            tag: ".manager.autoInstrumentationImage.java.tag"
+
+          - registry: ".manager.autoInstrumentationImage.python.repositoryDomain"
+            repository: ".manager.autoInstrumentationImage.python.repository"
+            tag: ".manager.autoInstrumentationImage.python.tag"
+
+          - registry: ".manager.autoInstrumentationImage.dotnet.repositoryDomain"
+            repository: ".manager.autoInstrumentationImage.dotnet.repository"
+            tag: ".manager.autoInstrumentationImage.dotnet.tag"
+
+          - registry: ".agent.image.repositoryDomainMap.public"
+            repository: ".agent.image.repository"
+            tag: ".agent.image.tag"
+
+          - registry: ".dcgmExporter.image.repositoryDomainMap.public"
+            repository: ".dcgmExporter.image.repository"
+            tag: ".dcgmExporter.image.tag"
+
+          - registry: ".neuronMonitor.image.repositoryDomainMap.public"
+            repository: ".neuronMonitor.image.repository"
+            tag: ".neuronMonitor.image.tag"
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
+          aws-region: ${{ env.AWS_DEFAULT_REGION }}
+
+      # yq is pinned to a release tag so an upstream change cannot alter the scan.
+      - name: "Get image registry"
+        id: registry
+        uses: mikefarah/yq@v4.44.3
+        with:
+          cmd: yq '${{ matrix.container_images.registry }}' charts/amazon-cloudwatch-observability/values.yaml
+
+      - name: "Get image repository"
+        id: repository
+        uses: mikefarah/yq@v4.44.3
+        with:
+          cmd: yq '${{ matrix.container_images.repository }}' charts/amazon-cloudwatch-observability/values.yaml
+
+      - name: "Get image tag"
+        id: tag
+        uses: mikefarah/yq@v4.44.3
+        with:
+          cmd: yq '${{ matrix.container_images.tag }}' charts/amazon-cloudwatch-observability/values.yaml
+
+      - name: "Scan for vulnerabilities"
+        uses: crazy-max/ghaction-container-scan@v3
+        with:
+          image: ${{ steps.registry.outputs.result }}/${{ steps.repository.outputs.result }}:${{ steps.tag.outputs.result }}
+          severity_threshold: HIGH
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index da689279..64c70c9e 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,3 +1,50 @@
+=======================================================================
+amazon-cloudwatch-observability v2.0.0 (2024-08-15)
+========================================================================
+Breaking Changes:
+* Enforce default requests and limits for auto instrumentation init containers
+
+Enhancements:
+* Allow configurable requests and limits for auto instrumentation init containers (#65)
+* Restructure resources configurations for AppSignals (#80)
+* Upgrade CWAgent to v1.300044.0
+* Upgrade CWAgent Operator to v1.6.0
+* Upgrade .NET SDK to v1.2.0
+* Upgrade FluentBit for Linux to 2.32.2.20240627
+
+=======================================================================
+amazon-cloudwatch-observability v1.10.0 (2024-07-30)
+========================================================================
+New Features:
+* Adding support for .NET auto instrumentation for Application Signals (#64)
+
+Enhancements:
+* Upgrade CWAgent Operator to v1.5.0
+
+=======================================================================
+amazon-cloudwatch-observability v1.9.0 (2024-07-22)
+========================================================================
+Bug Fixes:
+* Add nodeAffinity rule to not spin up resources on Fargate instances (#58)
+* Increase the default memory limit of DCGM Exporter to 500Mi to fix OOM crashing issue (#67)
+
+Enhancements:
+* Support parameterized resources configuration (#63)
+* Upgrade Java SDK to v1.32.3
+* Upgrade Python SDK to v0.3.0
+* Upgrade CWAgent to v1.300042.1
+
+=======================================================================
+amazon-cloudwatch-observability v1.8.0 (2024-07-02)
+========================================================================
+Bug Fixes:
+* Add GOMEMLIMIT environment variable for Neuron Monitor to fix OOM crash issue (#56)
+
+Enhancements:
+* Update Windows Fluent-Bit configuration to export Kubelet and kube-proxy service logs to host log group (#45)
+* Upgrade CWAgent Operator to v1.4.1
+* Upgrade CWAgent to v1.300041.0
+
 =======================================================================
 amazon-cloudwatch-observability v1.7.0 (2024-05-23)
 ========================================================================
diff --git a/charts/amazon-cloudwatch-observability/Chart.yaml b/charts/amazon-cloudwatch-observability/Chart.yaml
index eb268ad8..e8e3e4b6 100644
--- a/charts/amazon-cloudwatch-observability/Chart.yaml
+++ b/charts/amazon-cloudwatch-observability/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: amazon-cloudwatch-observability
-version: 1.7.0
+version: 2.0.0
 appVersion: 1.0.0
 description: A Helm chart for Amazon CloudWatch Observability
 type: application
diff --git a/charts/amazon-cloudwatch-observability/templates/_helpers.tpl b/charts/amazon-cloudwatch-observability/templates/_helpers.tpl
index b258b8f5..521ad011 100644
--- a/charts/amazon-cloudwatch-observability/templates/_helpers.tpl
+++ b/charts/amazon-cloudwatch-observability/templates/_helpers.tpl
@@ -154,6 +154,14 @@ Get the current recommended auto instrumentation python image
 {{- printf "%s/%s:%s" .Values.manager.autoInstrumentationImage.python.repositoryDomain .Values.manager.autoInstrumentationImage.python.repository .Values.manager.autoInstrumentationImage.python.tag -}}
 {{- end -}}
+
+{{/*
+Get the current recommended auto instrumentation dotnet image
+*/}}
+{{- define "auto-instrumentation-dotnet.image" -}}
+{{- printf "%s/%s:%s" .Values.manager.autoInstrumentationImage.dotnet.repositoryDomain .Values.manager.autoInstrumentationImage.dotnet.repository .Values.manager.autoInstrumentationImage.dotnet.tag -}}
+{{- end -}}
+
 
 {{/*
 Get the current recommended auto instrumentation nodejs image
 */}}
@@ -240,4 +248,4 @@ Define the default service name
 */}}
 {{- define "amazon-cloudwatch-observability.webhookServiceName" -}}
 {{- default (printf "%s-webhook-service" (include "amazon-cloudwatch-observability.name" .)) .Values.manager.service.name }}
-{{- end -}}
\ No newline at end of file
+{{- end -}}
diff --git a/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml
index 84b26b7c..693cecb1 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml
@@ -30,18 +30,24 @@ spec:
   nodeSelector:
     kubernetes.io/os: linux
   serviceAccount: {{ template "cloudwatch-agent.serviceAccountName" . }}
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+        - matchExpressions:
+          - key: {{ .Values.fargateLabelKey }}
+            operator: NotIn
+            values:
+            - fargate
+  hostNetwork: true
   {{- if .Values.agent.config }}
   config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.config) . ) }}
   {{- else }}
   config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.defaultConfig) . ) }}
   {{- end }}
-  resources:
-    requests:
-      memory: "128Mi"
-      cpu: "250m"
-    limits:
-      memory: "512Mi"
-      cpu: "500m"
+  {{- with .Values.agent.resources }}
+  resources: {{- toYaml . | nindent 4 }}
+  {{- end }}
   volumeMounts:
   - mountPath: /rootfs
     name: rootfs
diff --git a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
index 82ce49ce..0eefeebf 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
@@ -15,17 +15,17 @@ spec:
     nodeAffinity:
       requiredDuringSchedulingIgnoredDuringExecution:
         nodeSelectorTerms:
-          - matchExpressions:
-              - key: {{ .Values.nodeLabelKey }}
-                operator: In
-                values: {{ .Values.gpuInstances | toYaml | nindent 16 }}
-  resources:
-    requests:
-      cpu: 250m
-      memory: 128Mi
-    limits:
-      cpu: 500m
-      memory: 250Mi
+        - matchExpressions:
+          - key: {{ .Values.nodeLabelKey }}
+            operator: In
+            values: {{ .Values.gpuInstances | toYaml | nindent 16 }}
+          - key: {{ .Values.fargateLabelKey }}
+            operator: NotIn
+            values:
+              - fargate
+  {{- with .Values.dcgmExporter.resources }}
+  resources: {{- toYaml . | nindent 4 }}
+  {{- end }}
   env:
   - name: "DCGM_EXPORTER_KUBERNETES"
     value: "true"
diff --git a/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml
index ed8a8d9c..5d1eb54c 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml
@@ -47,13 +47,9 @@ spec:
               fieldPath: metadata.name
         - name: CI_VERSION
           value: "k8s/1.3.17"
-        resources:
-          limits:
-            cpu: 500m
-            memory: 250Mi
-          requests:
-            cpu: 50m
-            memory: 25Mi
+        {{- with .Values.containerLogs.fluentBit.resources }}
+        resources: {{- toYaml . | nindent 10 }}
+        {{- end }}
         volumeMounts:
         # Please don't change below read-only permissions
         - name: fluentbitstate
@@ -95,6 +91,15 @@ spec:
         hostPath:
           path: /var/log/dmesg
       serviceAccountName: {{ template "cloudwatch-agent.serviceAccountName" . }}
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: {{ .Values.fargateLabelKey }}
+                operator: NotIn
+                values:
+                - fargate
       nodeSelector:
         kubernetes.io/os: linux
       {{- with .Values.tolerations }}
diff --git a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml
index 32136484..75e20561 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml
@@ -14,20 +14,20 @@ spec:
       requiredDuringSchedulingIgnoredDuringExecution:
         nodeSelectorTerms:
           - matchExpressions:
-            - key: kubernetes.io/os
-              operator: In
-              values:
-                - linux
-            - key: {{ .Values.nodeLabelKey }}
-              operator: In
-              values: {{ .Values.neuronInstances | toYaml | nindent 20 }}
-  resources:
-    limits:
-      cpu: 500m
-      memory: 256Mi
-    requests:
-      cpu: 256m
-      memory: 128Mi
+              - key: kubernetes.io/os
+                operator: In
+                values:
+                  - linux
+              - key: {{ .Values.nodeLabelKey }}
+                operator: In
+                values: {{ .Values.neuronInstances | toYaml | nindent 20 }}
+              - key: {{ .Values.fargateLabelKey }}
+                operator: NotIn
+                values:
+                  - fargate
+  {{- with .Values.neuronMonitor.resources }}
+  resources: {{- toYaml . | nindent 4 }}
+  {{- end }}
   env:
   - name: NODE_NAME
     valueFrom:
@@ -35,11 +35,13 @@ spec:
         fieldPath: spec.nodeName
   - name: PATH
     value: /usr/local/bin:/usr/bin:/bin:/opt/aws/neuron/bin
+  - name: GOMEMLIMIT
+    value: 160MiB
   ports:
   - name: "metrics"
     port: {{ .Values.neuronMonitor.service.port }}
   command:
-    - "/opt/bin/entrypoint.sh"
+  - "/opt/bin/entrypoint.sh"
   args:
     port: "{{ .Values.neuronMonitor.service.port }}"
     cert-file: "/etc/amazon-cloudwatch-observability-neuron-cert/server.crt"
diff --git a/charts/amazon-cloudwatch-observability/templates/operator-deployment.yaml b/charts/amazon-cloudwatch-observability/templates/operator-deployment.yaml
index e10de7fa..0409120c 100644
--- a/charts/amazon-cloudwatch-observability/templates/operator-deployment.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/operator-deployment.yaml
@@ -26,9 +26,11 @@ spec:
       containers:
       - image: {{ template "cloudwatch-agent-operator.image" . }}
         args:
+        - {{ printf "--auto-instrumentation-config=%s" (dict "java" (.Values.manager.autoInstrumentationResources.java) "python" (.Values.manager.autoInstrumentationResources.python) "dotnet" (.Values.manager.autoInstrumentationResources.dotnet) "nodejs" (.Values.manager.autoInstrumentationResources.nodejs) | toJson) | quote }}
         - {{ printf "--auto-annotation-config=%s" (.Values.manager.autoAnnotateAutoInstrumentation | toJson) | quote }}
         - "--auto-instrumentation-java-image={{ template "auto-instrumentation-java.image" . }}"
         - "--auto-instrumentation-python-image={{ template "auto-instrumentation-python.image" . }}"
+        - "--auto-instrumentation-dotnet-image={{ template "auto-instrumentation-dotnet.image" . }}"
         - "--auto-instrumentation-nodejs-image={{ template "auto-instrumentation-nodejs.image" . }}"
         - "--feature-gates=operator.autoinstrumentation.multi-instrumentation,operator.autoinstrumentation.multi-instrumentation.skip-container-validation"
         command:
@@ -38,7 +40,7 @@ spec:
         - containerPort: {{ .Values.manager.ports.containerPort }}
           name: webhook-server
           protocol: TCP
-        resources: {{ toYaml .Values.manager.resources | nindent 12 }}
+        resources: {{ toYaml .Values.manager.resources | nindent 10 }}
         volumeMounts:
         - mountPath: /tmp/k8s-webhook-server/serving-certs
           name: cert
diff --git a/charts/amazon-cloudwatch-observability/templates/windows/cloudwatch-agent-windows-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/windows/cloudwatch-agent-windows-daemonset.yaml
index f801bbba..440af39a 100644
--- a/charts/amazon-cloudwatch-observability/templates/windows/cloudwatch-agent-windows-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/windows/cloudwatch-agent-windows-daemonset.yaml
@@ -19,13 +19,9 @@ spec:
   nodeSelector:
     kubernetes.io/os: windows
   config: {{ .Values.agent.windowsDefaultConfig | toJson | quote }}
-  resources:
-    requests:
-      memory: "128Mi"
-      cpu: "250m"
-    limits:
-      memory: "512Mi"
-      cpu: "500m"
+  {{- with .Values.agent.resources }}
+  resources: {{- toYaml . | nindent 4 }}
+  {{- end }}
   env:
   - name: K8S_NODE_NAME
     valueFrom:
diff --git a/charts/amazon-cloudwatch-observability/templates/windows/fluent-bit-windows-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/windows/fluent-bit-windows-daemonset.yaml
index 83adc0a2..f475a137 100644
--- a/charts/amazon-cloudwatch-observability/templates/windows/fluent-bit-windows-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/windows/fluent-bit-windows-daemonset.yaml
@@ -53,13 +53,11 @@ spec:
               fieldPath: metadata.name
         - name: CI_VERSION
           value: "k8s/1.3.17"
-        resources:
-          limits:
-            cpu: 500m
-            memory: 600Mi
-          requests:
-            cpu: 300m
-            memory: 300Mi
+        {{- /* Windows keeps its previous, larger defaults unless containerLogs.fluentBit.resourcesWindows is set; the shared Linux defaults (250Mi limit) are far below the 600Mi this container used before. */}}
+        {{- $windowsDefaults := dict "limits" (dict "cpu" "500m" "memory" "600Mi") "requests" (dict "cpu" "300m" "memory" "300Mi") }}
+        {{- with (.Values.containerLogs.fluentBit.resourcesWindows | default $windowsDefaults) }}
+        resources: {{- toYaml . | nindent 10 }}
+        {{- end }}
         volumeMounts:
         - name: fluent-bit-config
           mountPath: fluent-bit\configuration\
diff --git a/charts/amazon-cloudwatch-observability/values.yaml b/charts/amazon-cloudwatch-observability/values.yaml
index 101f2e27..a90bdff8 100644
--- a/charts/amazon-cloudwatch-observability/values.yaml
+++ b/charts/amazon-cloudwatch-observability/values.yaml
@@ -8,9 +8,6 @@ replicaCount: 1
 ##
 nameOverride: ""
 
-## Reference one or more secrets to be used when pulling images from authenticated repositories.
-imagePullSecrets: [ ]
-
 ## Provide the ClusterName (this is a required parameter)
 clusterName:
 
@@ -18,6 +15,8 @@ clusterName:
 region:
 
 nodeLabelKey: node.kubernetes.io/instance-type
 
+fargateLabelKey: eks.amazonaws.com/compute-type
+
 ## NVIDIA GPU instance types
 gpuInstances: [ p2.xlarge, p2.8xlarge, p2.16xlarge, p3.2xlarge, p3.8xlarge, p3.16xlarge, p3dn.24xlarge, p4d.24xlarge, p4de.24xlarge, p5.48xlarge, g3s.xlarge, g3.4xlarge, g3.8xlarge, g3.16xlarge, g4dn.xlarge, g4dn.2xlarge, g4dn.4xlarge, g4dn.8xlarge, g4dn.16xlarge, g4dn.12xlarge, g4dn.metal, g4ad.xlarge, g4ad.2xlarge, g4ad.4xlarge, g4ad.8xlarge, g4ad.16xlarge, g5.xlarge, g5.2xlarge, g5.4xlarge, g5.8xlarge, g5.16xlarge, g5.12xlarge, g5.24xlarge, g5.48xlarge, g5g.xlarge, g5g.2xlarge, g5g.4xlarge, g5g.8xlarge, g5g.16xlarge, g5g.metal, ml.p5.48xlarge, ml.p4d.24xlarge, ml.p4de.24xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.p3dn.24xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.g3s.xlarge, ml.g3.4xlarge, ml.g3.8xlarge, ml.g3.16xlarge, ml.g4dn.xlarge, ml.g4dn.2xlarge, ml.g4dn.4xlarge, ml.g4dn.8xlarge, ml.g4dn.16xlarge, ml.g4dn.12xlarge, ml.g4dn.metal, ml.g4ad.xlarge, ml.g4ad.2xlarge, ml.g4ad.4xlarge, ml.g4ad.8xlarge, ml.g4ad.16xlarge, ml.g5.xlarge, ml.g5.2xlarge, ml.g5.4xlarge, ml.g5.8xlarge, ml.g5.16xlarge, ml.g5.12xlarge, ml.g5.24xlarge, ml.g5.48xlarge, ml.g5g.xlarge, ml.g5g.2xlarge, ml.g5g.4xlarge, ml.g5g.8xlarge, ml.g5g.16xlarge, ml.g5g.metal]
@@ -33,7 +32,7 @@ containerLogs:
   fluentBit:
     image:
       repository: aws-for-fluent-bit
-      tag: 2.32.0.20240304
+      tag: 2.32.2.20240627
       tagWindows: 2.31.12-windowsservercore
       repositoryDomainMap:
         public: public.ecr.aws/aws-observability
@@ -41,6 +40,13 @@ containerLogs:
         cn-northwest-1: 128054284489.dkr.ecr.cn-northwest-1.amazonaws.com.cn
         us-gov-east-1: 161423150738.dkr.ecr.us-gov-east-1.amazonaws.com
         us-gov-west-1: 161423150738.dkr.ecr.us-gov-west-1.amazonaws.com
+    resources:
+      limits:
+        cpu: 500m
+        memory: 250Mi
+      requests:
+        cpu: 50m
+        memory: 25Mi
     config:
service: | [SERVICE] @@ -60,14 +66,14 @@ containerLogs: Regex ^(?