From e4253b0f0abc998ed345204d230820ff2f7602c0 Mon Sep 17 00:00:00 2001 From: paliwalparitosh Date: Mon, 5 May 2025 14:19:31 +0530 Subject: [PATCH 1/8] apply default resource limit to discovery job --- charts/logan/templates/discovery-cronjob.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charts/logan/templates/discovery-cronjob.yaml b/charts/logan/templates/discovery-cronjob.yaml index 5b6444f..b280e47 100644 --- a/charts/logan/templates/discovery-cronjob.yaml +++ b/charts/logan/templates/discovery-cronjob.yaml @@ -36,6 +36,9 @@ spec: mountPath: {{ .Values.oci.path }} readOnly: true {{- end }} + {{- if .Values.resources }} + resources: {{- toYaml .Values.resources | nindent 14 }} + {{- end }} command: {{- /* object discovery script */}} - bundle From 94fce1c3fcc9a793d5db8551f78308a118d846fc Mon Sep 17 00:00:00 2001 From: paliwalparitosh Date: Mon, 5 May 2025 14:32:19 +0530 Subject: [PATCH 2/8] enables tcpconnect logs collection --- charts/logan/templates/_helpers.tpl | 9 + charts/logan/templates/logs-configmap.yaml | 5 + .../logan/templates/tcpconnect-daemonset.yaml | 70 +++ charts/logan/values.yaml | 14 +- charts/oci-onm/values.yaml | 2 +- .../v1.0/oraclelinux/8-slim/Dockerfile | 12 +- .../v1.0/oraclelinux/8-slim/tcpconnect.bpf.c | 270 +++++++++ .../v1.0/oraclelinux/8-slim/tcpconnect.c | 539 ++++++++++++++++++ .../v1.0/oraclelinux/8-slim/tcpconnect.h | 51 ++ 9 files changed, 967 insertions(+), 5 deletions(-) create mode 100644 charts/logan/templates/tcpconnect-daemonset.yaml create mode 100644 logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.bpf.c create mode 100644 logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.c create mode 100644 logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.h diff --git a/charts/logan/templates/_helpers.tpl b/charts/logan/templates/_helpers.tpl index 1518d17..033b9bf 100644 --- a/charts/logan/templates/_helpers.tpl +++ b/charts/logan/templates/_helpers.tpl @@ -43,6 +43,15 @@ {{- end -}} {{- end -}} 
+#ociLAClusterEntityID +{{- define "logan.ociLAClusterEntityID" -}} + {{- if .Values.ociLAClusterEntityID -}} + {{ include "common.tplvalues.render" ( dict "value" .Values.ociLAClusterEntityID "context" .) }} + {{- else -}} + {{- "UNDEFINED" -}} + {{- end -}} +{{- end -}} + #kubernetesClusterName {{- define "logan.kubernetesClusterName" -}} {{- if .Values.kubernetesClusterName -}} diff --git a/charts/logan/templates/logs-configmap.yaml b/charts/logan/templates/logs-configmap.yaml index 210ecac..f18ee07 100644 --- a/charts/logan/templates/logs-configmap.yaml +++ b/charts/logan/templates/logs-configmap.yaml @@ -3,6 +3,7 @@ {{- $kubernetesClusterName := (include "logan.kubernetesClusterName" .) }} {{- $kubernetesClusterId := (include "logan.kubernetesClusterId" .) }} +{{- $ociLAClusterEntityID := (include "logan.ociLAClusterEntityID" .) }} apiVersion: v1 kind: ConfigMap metadata: @@ -86,12 +87,16 @@ data: @type record_transformer enable_ruby true + {{- if eq $name "tcpconnect" }} + oci_la_metadata ${{"{{"}}"Kubernetes Cluster Name":"{{ $kubernetesClusterName }}", "Kubernetes Cluster ID": "{{ $kubernetesClusterId }}", "Kubernetes Cluster Entity ID": "{{ $ociLAClusterEntityID }}" {{- range $k, $v := $logDefinition.metadata }},{{ $k | quote }}: {{ $v | quote -}} {{- end }}{{"}}"}} + {{- else }} {{- if $logDefinition.metadata }} oci_la_metadata ${{"{{"}}"Kubernetes Cluster Name":"{{ $kubernetesClusterName }}", "Kubernetes Cluster ID": "{{ $kubernetesClusterId }}" {{- range $k, $v := $logDefinition.metadata }},{{ $k | quote }}: {{ $v | quote -}} {{- end }}{{"}}"}} {{- else if $.Values.fluentd.kubernetesSystem.metadata }} oci_la_metadata ${{"{{"}}"Kubernetes Cluster Name":"{{ $kubernetesClusterName }}", "Kubernetes Cluster ID": "{{ $kubernetesClusterId }}" {{- range $k, $v := $.Values.fluentd.kubernetesSystem.metadata }},{{ $k | quote }}: {{ $v | quote -}} {{- end }}{{"}}"}} {{- else }} oci_la_metadata ${{"{{"}}"Kubernetes Cluster Name":"{{ $kubernetesClusterName }}", 
"Kubernetes Cluster ID": "{{ $kubernetesClusterId }}" {{- range $k, $v := $.Values.metadata }},{{ $k | quote }}: {{ $v | quote -}} {{- end }}{{"}}"}} + {{- end -}} {{- end }} {{- if $logDefinition.ociLALogGroupID }} oci_la_log_group_id "{{ $logDefinition.ociLALogGroupID }}" diff --git a/charts/logan/templates/tcpconnect-daemonset.yaml b/charts/logan/templates/tcpconnect-daemonset.yaml new file mode 100644 index 0000000..1aa8b5f --- /dev/null +++ b/charts/logan/templates/tcpconnect-daemonset.yaml @@ -0,0 +1,70 @@ +# Copyright (c) 2025, Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. + +--- +{{- $authtype := .Values.authtype | lower }} +{{- $imagePullSecrets := .Values.image.imagePullSecrets }} +{{- $resourceNamePrefix := (include "logan.resourceNamePrefix" .) }} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ $resourceNamePrefix }}-tcpconnect + namespace: {{ include "logan.namespace" . }} + labels: + app: {{ $resourceNamePrefix }}-tcpconnect + version: v1 +spec: + selector: + matchLabels: + app: {{ $resourceNamePrefix }}-tcpconnect + version: v1 + template: + metadata: + labels: + app: {{ $resourceNamePrefix }}-tcpconnect + version: v1 + spec: + serviceAccountName: {{ include "logan.serviceAccount" . 
}} + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + {{- if $imagePullSecrets }} + imagePullSecrets: + - name: {{ .Values.image.imagePullSecrets }} + {{- end}} + containers: + - name: {{ $resourceNamePrefix }}-tcpconnect + image: {{ .Values.image.url }} + command: + - /bin/bash + - -c + - -- + args: + - /usr/bin/tcpconnect -e + - -i {{ .Values.fluentd.kubernetesSystem.logs.tcpconnect.interval }} + env: + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + resources: + requests: + cpu: 50m + memory: 50Mi + imagePullPolicy: {{ default "IfNotPresent" .Values.image.imagePullPolicy }} + securityContext: + capabilities: + add: + - CAP_BPF + privileged: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tty: true + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 \ No newline at end of file diff --git a/charts/logan/values.yaml b/charts/logan/values.yaml index db748fc..3f4968f 100644 --- a/charts/logan/values.yaml +++ b/charts/logan/values.yaml @@ -43,7 +43,7 @@ ociDomain: # -- Kubernetes Namespace for deploying monitoring resources deployed by this chart. namespace: "{{ .Values.global.namespace }}" -# -- Resoure Name Prefix: Wherever allowed, this prefix will be used with all resources used by this chart +# -- Resource Name Prefix: Wherever allowed, this prefix will be used with all resources used by this chart resourceNamePrefix: "{{ .Values.global.resourceNamePrefix }}" # -- Kubernetes ServiceAccount @@ -53,7 +53,7 @@ image: # Image pull secrets for. 
Secret must be in the namespace defined by namespace imagePullSecrets: # -- Replace this value with actual docker image url - url: container-registry.oracle.com/oci_observability_management/oci-la-fluentd-collector:1.6.0 + url: container-registry.oracle.com/oci_observability_management/oci-la-fluentd-collector:1.7.0 # -- Image pull policy imagePullPolicy: Always @@ -364,6 +364,15 @@ fluentd: # The regular expression pattern for the starting line in case of multi-line logs. multilineStartRegExp: /^\S\d{2}\d{2}\s+[^\:]+:[^\:]+:[^\.]+\.\d{0,3}/ + # Config specific to EBPF TCPCONNECT Network logs collection + tcpconnect: + # The path to the source files. + path: /var/log/containers/*-tcpconnect*.log # TODO- Check if this will render as expected, suspecting it won't + # Logging Analytics log source to use for parsing and processing the logs: TCP CONNECT Logs + ociLALogSourceName: "Kubernetes TCP Connect Logs" + # Network logs Polling frequency in seconds + interval: 30 + # Config specific to Kubernetes Audit Logs Collection kube-audit: # The path to the source files. @@ -600,6 +609,7 @@ fluentd: - '"/var/log/containers/etcd-*.log"' - '"/var/log/containers/kube-controller-manager-*.log"' - '"/var/log/containers/kube-scheduler-*.log"' + - '"/var/log/containers/*-tcpconnect-*.log"' # Worker number in case of multi process workers enabled. If not set when multi process workers enabled, then it defaults to 0. 
#worker: 1 # -- To set timezone override for genericContainerLogs (applies only to log records without explicit timezone identifier in the record itself) diff --git a/charts/oci-onm/values.yaml b/charts/oci-onm/values.yaml index 4272bf1..130a100 100644 --- a/charts/oci-onm/values.yaml +++ b/charts/oci-onm/values.yaml @@ -31,7 +31,7 @@ oci-onm-logan: kubernetesClusterID: "{{ .Values.global.kubernetesClusterID }}" kubernetesClusterName: "{{ .Values.global.kubernetesClusterName }}" image: - url: container-registry.oracle.com/oci_observability_management/oci-la-fluentd-collector:1.6.0 + url: container-registry.oracle.com/oci_observability_management/oci-la-fluentd-collector:1.7.0 # Go to OCI Logging Analytics Administration, click Service Details, and note the namespace value. ociLANamespace: # OCI Logging Analytics Default Log Group OCID diff --git a/logan/docker-images/v1.0/oraclelinux/8-slim/Dockerfile b/logan/docker-images/v1.0/oraclelinux/8-slim/Dockerfile index b8a6ce4..0883647 100644 --- a/logan/docker-images/v1.0/oraclelinux/8-slim/Dockerfile +++ b/logan/docker-images/v1.0/oraclelinux/8-slim/Dockerfile @@ -18,6 +18,7 @@ ENV GEM_HOME /fluentd/vendor/bundle/ruby/3.3 ENV FLUENTD_DISABLE_BUNDLER_INJECTION 1 COPY Gemfile* /fluentd/ +COPY tcpconnect.* /fluentd/ # Install ruby, ruby-libs along with rubygems and bundler. 
RUN microdnf -y module enable ruby:3.3 \ @@ -27,7 +28,7 @@ RUN microdnf -y module enable ruby:3.3 \ && microdnf -y install --setopt=install_weak_deps=0 --nodocs rubygems \ && gem install bundler -v 2.5.16 \ # Install development dependent packages for gems native installation - && microdnf --enablerepo ol8_codeready_builder -y install --nodocs gcc make redhat-rpm-config openssl ruby-devel gcc-c++ libtool libffi-devel bzip2 git libyaml-devel \ + && microdnf --enablerepo ol8_codeready_builder -y install --nodocs gcc make redhat-rpm-config openssl ruby-devel gcc-c++ libtool libffi-devel bzip2 git libyaml-devel which elfutils-libelf-devel clang llvm \ # Install Fluentd, it's dependencies along with other run time dependencies for OCI Logging Analytics Solution && bundle config silence_root_warning true \ && bundle config --local path /fluentd/vendor/bundle \ @@ -40,7 +41,13 @@ RUN microdnf -y module enable ruby:3.3 \ && cd /tmp && ls /tmp \ && git clone -b 5.3.0 https://github.com/jemalloc/jemalloc.git && cd jemalloc/ \ && ./autogen.sh && make && make install_bin install_include install_lib \ - && mv lib/libjemalloc.so.2 /usr/lib + && mv lib/libjemalloc.so.2 /usr/lib \ +# Install libbpf-tools from bcc + && cd /tmp && ls /tmp \ + && git clone -b v0.29.1 https://github.com/iovisor/bcc.git && cd bcc/ && git submodule update --init --recursive \ + && cd libbpf-tools/ && cp /fluentd/tcpconnect.* ./ && make \ + && mv ./tcpconnect /usr/bin/ \ + && cd /fluentd/ ## To build the final docker image @@ -72,6 +79,7 @@ RUN microdnf -y module enable ruby:3.3 \ COPY --from=builder /fluentd /fluentd COPY --from=builder /usr/bin/tini /usr/bin/tini COPY --from=builder /usr/lib/libjemalloc.so.2 /usr/lib/libjemalloc.so.2 +COPY --from=builder /usr/bin/tcpconnect /usr/bin/tcpconnect RUN mkdir -p /fluentd/etc /fluentd/plugins \ && touch /fluentd/etc/disable.conf diff --git a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.bpf.c 
b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.bpf.c new file mode 100644 index 0000000..54ed8ad --- /dev/null +++ b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.bpf.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Anton Protopopov +// +// Based on tcpconnect(8) from BCC by Brendan Gregg +// +// Copyright (c) 2025, Oracle and/or its affiliates. +// Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. +#include + +#include +#include +#include + +#include "maps.bpf.h" +#include "tcpconnect.h" + +const volatile int filter_ports[MAX_PORTS]; +const volatile int filter_ports_len = 0; +const volatile uid_t filter_uid = -1; +const volatile pid_t filter_pid = 0; +const volatile bool do_count = 0; +const volatile bool do_ec = 0; +const volatile bool source_port = 0; + +/* Define here, because there are conflicts with include files */ +#define AF_INET 2 +#define AF_INET6 10 + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, u32); + __type(value, struct sock *); +} sockets SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, struct ipv4_flow_key); + __type(value, u64); +} ipv4_count SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, struct ipv6_flow_key); + __type(value, u64); +} ipv6_count SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); +} events SEC(".maps"); + +static __always_inline bool filter_port(__u16 port) +{ + int i; + + if (filter_ports_len == 0) + return false; + + for (i = 0; i < filter_ports_len && i < MAX_PORTS; i++) { + if (port == filter_ports[i]) + return false; + } + return true; +} + +static __always_inline int +enter_tcp_connect(struct pt_regs *ctx, struct sock *sk) +{ + __u64 pid_tgid = 
bpf_get_current_pid_tgid(); + __u32 pid = pid_tgid >> 32; + __u32 tid = pid_tgid; + __u32 uid; + + if (filter_pid && pid != filter_pid) + return 0; + + uid = bpf_get_current_uid_gid(); + if (filter_uid != (uid_t) -1 && uid != filter_uid) + return 0; + + bpf_map_update_elem(&sockets, &tid, &sk, 0); + return 0; +} + +static __always_inline void count_v4(struct sock *sk, __u16 sport, __u16 dport) +{ + struct ipv4_flow_key key = {}; + static __u64 zero; + __u64 *val; + + BPF_CORE_READ_INTO(&key.saddr, sk, __sk_common.skc_rcv_saddr); + BPF_CORE_READ_INTO(&key.daddr, sk, __sk_common.skc_daddr); + key.sport = sport; + key.dport = dport; + val = bpf_map_lookup_or_try_init(&ipv4_count, &key, &zero); + if (val) + __atomic_add_fetch(val, 1, __ATOMIC_RELAXED); +} + +static __always_inline void extended_count_v4(struct sock *sk, __u16 sport, __u16 dport) +{ + struct ipv4_flow_key key = {}; + static __u64 zero; + __u64 *val; + + BPF_CORE_READ_INTO(&key.saddr, sk, __sk_common.skc_rcv_saddr); + BPF_CORE_READ_INTO(&key.daddr, sk, __sk_common.skc_daddr); + //key.sport = sport; + key.dport = dport; + bpf_get_current_comm(&key.task, sizeof(key.task)); + val = bpf_map_lookup_or_try_init(&ipv4_count, &key, &zero); + if (val) + __atomic_add_fetch(val, 1, __ATOMIC_RELAXED); +} + +static __always_inline void count_v6(struct sock *sk, __u16 sport, __u16 dport) +{ + struct ipv6_flow_key key = {}; + static const __u64 zero; + __u64 *val; + + BPF_CORE_READ_INTO(&key.saddr, sk, + __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + BPF_CORE_READ_INTO(&key.daddr, sk, + __sk_common.skc_v6_daddr.in6_u.u6_addr32); + key.sport = sport; + key.dport = dport; + + val = bpf_map_lookup_or_try_init(&ipv6_count, &key, &zero); + if (val) + __atomic_add_fetch(val, 1, __ATOMIC_RELAXED); +} + +static __always_inline void extended_count_v6(struct sock *sk, __u16 sport, __u16 dport) +{ + struct ipv6_flow_key key = {}; + static const __u64 zero; + __u64 *val; + + BPF_CORE_READ_INTO(&key.saddr, sk, + 
__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + BPF_CORE_READ_INTO(&key.daddr, sk, + __sk_common.skc_v6_daddr.in6_u.u6_addr32); + //key.sport = sport; + key.dport = dport; + bpf_get_current_comm(&key.task, sizeof(key.task)); + + val = bpf_map_lookup_or_try_init(&ipv6_count, &key, &zero); + if (val) + __atomic_add_fetch(val, 1, __ATOMIC_RELAXED); +} + +static __always_inline void +trace_v4(struct pt_regs *ctx, pid_t pid, struct sock *sk, __u16 sport, __u16 dport) +{ + struct event event = {}; + + event.af = AF_INET; + event.pid = pid; + event.uid = bpf_get_current_uid_gid(); + event.ts_us = bpf_ktime_get_ns() / 1000; + BPF_CORE_READ_INTO(&event.saddr_v4, sk, __sk_common.skc_rcv_saddr); + BPF_CORE_READ_INTO(&event.daddr_v4, sk, __sk_common.skc_daddr); + event.sport = sport; + event.dport = dport; + bpf_get_current_comm(event.task, sizeof(event.task)); + + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, + &event, sizeof(event)); +} + +static __always_inline void +trace_v6(struct pt_regs *ctx, pid_t pid, struct sock *sk, __u16 sport, __u16 dport) +{ + struct event event = {}; + + event.af = AF_INET6; + event.pid = pid; + event.uid = bpf_get_current_uid_gid(); + event.ts_us = bpf_ktime_get_ns() / 1000; + BPF_CORE_READ_INTO(&event.saddr_v6, sk, + __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + BPF_CORE_READ_INTO(&event.daddr_v6, sk, + __sk_common.skc_v6_daddr.in6_u.u6_addr32); + event.sport = sport; + event.dport = dport; + bpf_get_current_comm(event.task, sizeof(event.task)); + + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, + &event, sizeof(event)); +} + +static __always_inline int +exit_tcp_connect(struct pt_regs *ctx, int ret, int ip_ver) +{ + __u64 pid_tgid = bpf_get_current_pid_tgid(); + __u32 pid = pid_tgid >> 32; + __u32 tid = pid_tgid; + struct sock **skpp; + struct sock *sk; + __u16 sport = 0; + __u16 dport; + + skpp = bpf_map_lookup_elem(&sockets, &tid); + if (!skpp) + return 0; + + if (ret) + goto end; + + sk = *skpp; + + if (source_port) + 
BPF_CORE_READ_INTO(&sport, sk, __sk_common.skc_num); + BPF_CORE_READ_INTO(&dport, sk, __sk_common.skc_dport); + + if (filter_port(dport)) + goto end; + + if (do_ec) { + if (ip_ver == 4) + extended_count_v4(sk, sport, dport); + else + extended_count_v6(sk, sport, dport); + } else if (do_count) { + if (ip_ver == 4) + count_v4(sk, sport, dport); + else + count_v6(sk, sport, dport); + } else { + if (ip_ver == 4) + trace_v4(ctx, pid, sk, sport, dport); + else + trace_v6(ctx, pid, sk, sport, dport); + } + +end: + bpf_map_delete_elem(&sockets, &tid); + return 0; +} + +SEC("kprobe/tcp_v4_connect") +int BPF_KPROBE(tcp_v4_connect, struct sock *sk) +{ + return enter_tcp_connect(ctx, sk); +} + +SEC("kretprobe/tcp_v4_connect") +int BPF_KRETPROBE(tcp_v4_connect_ret, int ret) +{ + return exit_tcp_connect(ctx, ret, 4); +} + +SEC("kprobe/tcp_v6_connect") +int BPF_KPROBE(tcp_v6_connect, struct sock *sk) +{ + return enter_tcp_connect(ctx, sk); +} + +SEC("kretprobe/tcp_v6_connect") +int BPF_KRETPROBE(tcp_v6_connect_ret, int ret) +{ + return exit_tcp_connect(ctx, ret, 6); +} + +char LICENSE[] SEC("license") = "GPL"; \ No newline at end of file diff --git a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.c b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.c new file mode 100644 index 0000000..e4e8ed3 --- /dev/null +++ b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Anton Protopopov +// +// Based on tcpconnect(8) from BCC by Brendan Gregg +// +// Copyright (c) 2025, Oracle and/or its affiliates. +// Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. +#include +#include +#include +#include +#include +#include +#include +#include +#include "tcpconnect.h" +#include "tcpconnect.skel.h" +#include "btf_helpers.h" +#include "trace_helpers.h" +#include "map_helpers.h" + +#define warn(...) 
fprintf(stderr, __VA_ARGS__) + +static volatile sig_atomic_t exiting = 0; + +const char *argp_program_version = "tcpconnect 0.1"; +const char *argp_program_bug_address = + "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; +static const char argp_program_doc[] = + "\ntcpconnect: Count/Trace active tcp connections\n" + "\n" + "EXAMPLES:\n" + " tcpconnect # trace all TCP connect()s\n" + " tcpconnect -t # include timestamps\n" + " tcpconnect -p 181 # only trace PID 181\n" + " tcpconnect -P 80 # only trace port 80\n" + " tcpconnect -P 80,81 # only trace port 80 and 81\n" + " tcpconnect -U # include UID\n" + " tcpconnect -u 1000 # only trace UID 1000\n" + " tcpconnect -c # count connects per src, dest, port\n" + " tcpconnect -e # count the connects per command, src ip, dst ip/port and prints the output (space separated, without headers) to STDOUT periodically (default 60s)\n" + " tcpconnect -e -i 30 # count the connects per command, src ip, dst ip/port and prints the output (space separated, without headers) to STDOUT once in every 30s\n" + " tcpconnect --C mappath # only trace cgroups in the map\n" + " tcpconnect --M mappath # only trace mount namespaces in the map\n" + ; + +static int get_int(const char *arg, int *ret, int min, int max) +{ + char *end; + long val; + + errno = 0; + val = strtol(arg, &end, 10); + if (errno) { + warn("strtol: %s: %s\n", arg, strerror(errno)); + return -1; + } else if (end == arg || val < min || val > max) { + return -1; + } + if (ret) + *ret = val; + return 0; +} + +static int get_ints(const char *arg, int *size, int *ret, int min, int max) +{ + const char *argp = arg; + int max_size = *size; + int sz = 0; + char *end; + long val; + + while (sz < max_size) { + errno = 0; + val = strtol(argp, &end, 10); + if (errno) { + warn("strtol: %s: %s\n", arg, strerror(errno)); + return -1; + } else if (end == arg || val < min || val > max) { + return -1; + } + ret[sz++] = val; + if (*end == 0) + break; + argp = end + 1; + } + + *size = sz; 
+ return 0; +} + +static int get_uint(const char *arg, unsigned int *ret, + unsigned int min, unsigned int max) +{ + char *end; + long val; + + errno = 0; + val = strtoul(arg, &end, 10); + if (errno) { + warn("strtoul: %s: %s\n", arg, strerror(errno)); + return -1; + } else if (end == arg || val < min || val > max) { + return -1; + } + if (ret) + *ret = val; + return 0; +} + +static const struct argp_option opts[] = { + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { "timestamp", 't', NULL, 0, "Include timestamp on output" }, + { "count", 'c', NULL, 0, "Count connects per src ip and dst ip/port" }, + { "extended-count", 'e', NULL, 0, "Count the connects per command, src ip, dst ip/port and prints the output (space separated, without headers) to STDOUT periodically (default 60s)" }, + { "interval", 'i', "INTERVAL", 0, "Interval for extended-count in seconds, defaults to 60" }, + { "print-uid", 'U', NULL, 0, "Include UID on output" }, + { "pid", 'p', "PID", 0, "Process PID to trace" }, + { "uid", 'u', "UID", 0, "Process UID to trace" }, + { "source-port", 's', NULL, 0, "Consider source port when counting" }, + { "port", 'P', "PORTS", 0, + "Comma-separated list of destination ports to trace" }, + { "cgroupmap", 'C', "PATH", 0, "trace cgroups in this map" }, + { "mntnsmap", 'M', "PATH", 0, "trace mount namespaces in this map" }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + {}, +}; + +static struct env { + bool verbose; + bool count; + bool ec; + int interval; + bool print_timestamp; + bool print_uid; + pid_t pid; + uid_t uid; + int nports; + int ports[MAX_PORTS]; + bool source_port; +} env = { + .uid = (uid_t) -1, + .interval = (int) 60 +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + int err; + int nports; + + switch (key) { + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + case 'v': + env.verbose = true; + break; + case 'c': + env.count = true; + break; + case 'e': + env.ec = true; 
+ break; + case 'i': + err = get_int(arg, &env.interval, 1, INT_MAX); + if (err) { + warn("invalid Interval: %s\n", arg); + argp_usage(state); + } + break; + case 's': + env.source_port = true; + break; + case 't': + env.print_timestamp = true; + break; + case 'U': + env.print_uid = true; + break; + case 'p': + err = get_int(arg, &env.pid, 1, INT_MAX); + if (err) { + warn("invalid PID: %s\n", arg); + argp_usage(state); + } + break; + case 'u': + err = get_uint(arg, &env.uid, 0, (uid_t) -2); + if (err) { + warn("invalid UID: %s\n", arg); + argp_usage(state); + } + break; + case 'P': + nports = MAX_PORTS; + err = get_ints(arg, &nports, env.ports, 1, 65535); + if (err) { + warn("invalid PORT_LIST: %s\n", arg); + argp_usage(state); + } + env.nports = nports; + break; + case 'C': + warn("not implemented: --cgroupmap"); + break; + case 'M': + warn("not implemented: --mntnsmap"); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sig_int(int signo) +{ + exiting = 1; +} + +static void print_count_ipv4(int map_fd, time_t start) +{ + static struct ipv4_flow_key keys[MAX_ENTRIES]; + __u32 value_size = sizeof(__u64); + __u32 key_size = sizeof(keys[0]); + static struct ipv4_flow_key zero; + static __u64 counts[MAX_ENTRIES]; + char s[INET_ADDRSTRLEN]; + char d[INET_ADDRSTRLEN]; + __u32 i, n = MAX_ENTRIES; + //__u64 init = 0; + struct in_addr src; + struct in_addr dst; + + if (dump_hash(map_fd, keys, key_size, counts, value_size, &n, &zero)) { + warn("dump_hash: %s", strerror(errno)); + return; + } + + for (i = 0; i < n; i++) { + src.s_addr = keys[i].saddr; + dst.s_addr = keys[i].daddr; + + if (env.ec) { + printf("v1 %lu %lu %s %s %d %llu %s", + (unsigned long)start, + (unsigned long)time(NULL), + inet_ntop(AF_INET, &src, s, sizeof(s)), + 
inet_ntop(AF_INET, &dst, d, sizeof(d)), + ntohs(keys[i].dport), + counts[i], + keys[i].task); + printf("\n"); + bpf_map_delete_elem(map_fd, &keys[i]); + } else { + printf("%-25s %-25s", + inet_ntop(AF_INET, &src, s, sizeof(s)), + inet_ntop(AF_INET, &dst, d, sizeof(d))); + if (env.source_port) + printf(" %-20d", keys[i].sport); + printf(" %-20d", ntohs(keys[i].dport)); + printf(" %-10llu", counts[i]); + printf("\n"); + } + } +} + +static void print_count_ipv6(int map_fd, time_t start) +{ + static struct ipv6_flow_key keys[MAX_ENTRIES]; + __u32 value_size = sizeof(__u64); + __u32 key_size = sizeof(keys[0]); + static struct ipv6_flow_key zero; + static __u64 counts[MAX_ENTRIES]; + char s[INET6_ADDRSTRLEN]; + char d[INET6_ADDRSTRLEN]; + __u32 i, n = MAX_ENTRIES; + struct in6_addr src; + struct in6_addr dst; + + if (dump_hash(map_fd, keys, key_size, counts, value_size, &n, &zero)) { + warn("dump_hash: %s", strerror(errno)); + return; + } + + for (i = 0; i < n; i++) { + memcpy(src.s6_addr, keys[i].saddr, sizeof(src.s6_addr)); + memcpy(dst.s6_addr, keys[i].daddr, sizeof(src.s6_addr)); + + if (env.ec) { + printf("v1 %lu %lu %s %s %d %llu %s", + (unsigned long)start, + (unsigned long)time(NULL), + inet_ntop(AF_INET6, &src, s, sizeof(s)), + inet_ntop(AF_INET6, &dst, d, sizeof(d)), + ntohs(keys[i].dport), + counts[i], + keys[i].task); + printf("\n"); + bpf_map_delete_elem(map_fd, &keys[i]); + } else { + printf("%-25s %-25s", + inet_ntop(AF_INET6, &src, s, sizeof(s)), + inet_ntop(AF_INET6, &dst, d, sizeof(d))); + if (env.source_port) + printf(" %-20d", keys[i].sport); + printf(" %-20d", ntohs(keys[i].dport)); + printf(" %-10llu", counts[i]); + printf("\n"); + } + } +} + +static void print_count_header() +{ + printf("\n%-25s %-25s", "LADDR", "RADDR"); + if (env.source_port) + printf(" %-20s", "LPORT"); + printf(" %-20s", "RPORT"); + printf(" %-10s", "CONNECTS"); + printf("\n"); +} + +static void print_count(int map_fd_ipv4, int map_fd_ipv6) +{ + while (!exiting) + pause(); + + 
print_count_header(); + print_count_ipv4(map_fd_ipv4, 0); + print_count_ipv6(map_fd_ipv6, 0); +} + +static void print_extended_count(int map_fd_ipv4, int map_fd_ipv6) +{ + time_t end; + time_t start = 0; + while (!exiting) { + start = time(NULL); + end = time(NULL) + env.interval; + while (time(NULL) <= end && !exiting) { + sleep(1); + } + print_count_ipv4(map_fd_ipv4, start); + print_count_ipv6(map_fd_ipv6, start); + } + + print_count_ipv4(map_fd_ipv4, start); + print_count_ipv6(map_fd_ipv6, start); +} + +static void print_events_header() +{ + if (env.print_timestamp) + printf("%-9s", "TIME(s)"); + if (env.print_uid) + printf("%-6s", "UID"); + printf("%-6s %-12s %-2s %-16s %-16s", + "PID", "COMM", "IP", "SADDR", "DADDR"); + if (env.source_port) + printf(" %-5s", "SPORT"); + printf(" %-5s\n", "DPORT"); +} + +static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + struct event event; + char src[INET6_ADDRSTRLEN]; + char dst[INET6_ADDRSTRLEN]; + union { + struct in_addr x4; + struct in6_addr x6; + } s, d; + static __u64 start_ts; + + if (data_sz < sizeof(event)) { + printf("Error: packet too small\n"); + return; + } + /* Copy data as alignment in the perf buffer isn't guaranteed. */ + memcpy(&event, data, sizeof(event)); + + if (event.af == AF_INET) { + s.x4.s_addr = event.saddr_v4; + d.x4.s_addr = event.daddr_v4; + } else if (event.af == AF_INET6) { + memcpy(&s.x6.s6_addr, event.saddr_v6, sizeof(s.x6.s6_addr)); + memcpy(&d.x6.s6_addr, event.daddr_v6, sizeof(d.x6.s6_addr)); + } else { + warn("broken event: event.af=%d", event.af); + return; + } + + if (env.print_timestamp) { + if (start_ts == 0) + start_ts = event.ts_us; + printf("%-9.3f", (event.ts_us - start_ts) / 1000000.0); + } + + if (env.print_uid) + printf("%-6d", event.uid); + + printf("%-6d %-12.12s %-2d %-16s %-16s", + event.pid, event.task, + event.af == AF_INET ? 
4 : 6, + inet_ntop(event.af, &s, src, sizeof(src)), + inet_ntop(event.af, &d, dst, sizeof(dst))); + + if (env.source_port) + printf(" %-5d", event.sport); + + printf(" %-5d", ntohs(event.dport)); + + printf("\n"); +} + +static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) +{ + warn("Lost %llu events on CPU #%d!\n", lost_cnt, cpu); +} + +static void print_events(int perf_map_fd) +{ + struct perf_buffer *pb; + int err; + + pb = perf_buffer__new(perf_map_fd, 128, + handle_event, handle_lost_events, NULL, NULL); + if (!pb) { + err = -errno; + warn("failed to open perf buffer: %d\n", err); + goto cleanup; + } + + print_events_header(); + while (!exiting) { + err = perf_buffer__poll(pb, 100); + if (err < 0 && err != -EINTR) { + warn("error polling perf buffer: %s\n", strerror(-err)); + goto cleanup; + } + /* reset err to return 0 if exiting */ + err = 0; + } + +cleanup: + perf_buffer__free(pb); +} + +int main(int argc, char **argv) +{ + LIBBPF_OPTS(bpf_object_open_opts, open_opts); + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + .args_doc = NULL, + }; + struct tcpconnect_bpf *obj; + int i, err; + + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + libbpf_set_print(libbpf_print_fn); + + err = ensure_core_btf(&open_opts); + if (err) { + fprintf(stderr, "failed to fetch necessary BTF for CO-RE: %s\n", strerror(-err)); + return 1; + } + + obj = tcpconnect_bpf__open_opts(&open_opts); + if (!obj) { + warn("failed to open BPF object\n"); + return 1; + } + + // ec takes precedence over count + if (env.ec) + obj->rodata->do_ec = true; + if (env.count) + obj->rodata->do_count = true; + if (env.pid) + obj->rodata->filter_pid = env.pid; + if (env.uid != (uid_t) -1) + obj->rodata->filter_uid = env.uid; + if (env.nports > 0) { + obj->rodata->filter_ports_len = env.nports; + for (i = 0; i < env.nports; i++) { + obj->rodata->filter_ports[i] = htons(env.ports[i]); + } + } + // count 
per unique source_port not applicable for extended count + if (env.source_port && !env.ec) + obj->rodata->source_port = true; + + err = tcpconnect_bpf__load(obj); + if (err) { + warn("failed to load BPF object: %d\n", err); + goto cleanup; + } + + err = tcpconnect_bpf__attach(obj); + if (err) { + warn("failed to attach BPF programs: %s\n", strerror(-err)); + goto cleanup; + } + + if (signal(SIGINT, sig_int) == SIG_ERR) { + warn("can't set signal handler: %s\n", strerror(errno)); + err = 1; + goto cleanup; + } + + // ec takes precedence over count + if (env.ec) { + print_extended_count(bpf_map__fd(obj->maps.ipv4_count), + bpf_map__fd(obj->maps.ipv6_count)); + } else if (env.count) { + print_count(bpf_map__fd(obj->maps.ipv4_count), + bpf_map__fd(obj->maps.ipv6_count)); + } else { + print_events(bpf_map__fd(obj->maps.events)); + } + +cleanup: + tcpconnect_bpf__destroy(obj); + cleanup_core_btf(&open_opts); + + return err != 0; +} \ No newline at end of file diff --git a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.h b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.h new file mode 100644 index 0000000..6e31a73 --- /dev/null +++ b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.h @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Anton Protopopov +// +// Copyright (c) 2025, Oracle and/or its affiliates. +// Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. 
+#ifndef __TCPCONNECT_H +#define __TCPCONNECT_H + +/* The maximum number of items in maps */ +#define MAX_ENTRIES 8192 + +/* The maximum number of ports to filter */ +#define MAX_PORTS 64 + +#define TASK_COMM_LEN 16 + +struct ipv4_flow_key { + __u32 saddr; + __u32 daddr; + __u16 sport; + __u16 dport; + char task[TASK_COMM_LEN]; +}; + +struct ipv6_flow_key { + __u8 saddr[16]; + __u8 daddr[16]; + __u16 sport; + __u16 dport; + char task[TASK_COMM_LEN]; +}; + +struct event { + union { + __u32 saddr_v4; + __u8 saddr_v6[16]; + }; + union { + __u32 daddr_v4; + __u8 daddr_v6[16]; + }; + char task[TASK_COMM_LEN]; + __u64 ts_us; + __u32 af; // AF_INET or AF_INET6 + __u32 pid; + __u32 uid; + __u16 sport; + __u16 dport; +}; + +#endif /* __TCPCONNECT_H */ \ No newline at end of file From e97000ac1a636f14ef9ab8fe31c9eb5615a20faa Mon Sep 17 00:00:00 2001 From: paliwalparitosh Date: Wed, 7 May 2025 15:22:00 +0530 Subject: [PATCH 3/8] Revert "apply default resource limit to discovery job" This reverts commit e4253b0f0abc998ed345204d230820ff2f7602c0. 
--- charts/logan/templates/discovery-cronjob.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/charts/logan/templates/discovery-cronjob.yaml b/charts/logan/templates/discovery-cronjob.yaml index b280e47..5b6444f 100644 --- a/charts/logan/templates/discovery-cronjob.yaml +++ b/charts/logan/templates/discovery-cronjob.yaml @@ -36,9 +36,6 @@ spec: mountPath: {{ .Values.oci.path }} readOnly: true {{- end }} - {{- if .Values.resources }} - resources: {{- toYaml .Values.resources | nindent 14 }} - {{- end }} command: {{- /* object discovery script */}} - bundle From 1ffb373f5c9ba29237073cc4ab731fcb6d41b832 Mon Sep 17 00:00:00 2001 From: paliwalparitosh Date: Thu, 29 May 2025 12:09:27 +0530 Subject: [PATCH 4/8] app topology enhancements/fixes; policy updates; helm outout fix --- charts/logan/templates/_helpers.tpl | 2 +- charts/logan/templates/discovery-cronjob.yaml | 2 ++ charts/logan/templates/fluentd-daemonset.yaml | 6 ++-- .../logan/templates/fluentd-deployment.yaml | 6 ++-- charts/logan/templates/logs-configmap.yaml | 4 +-- .../logan/templates/tcpconnect-daemonset.yaml | 19 +++++------ charts/logan/values.yaml | 32 ++++++++++++++++--- charts/oci-onm/values.yaml | 2 +- .../v1.0/oraclelinux/8-slim/Dockerfile | 2 +- .../v1.0/oraclelinux/8-slim/tcpconnect.bpf.c | 6 ++-- .../v1.0/oraclelinux/8-slim/tcpconnect.c | 7 ++-- .../v1.0/oraclelinux/8-slim/tcpconnect.h | 6 ++-- terraform/modules/helm/helm-outputs.tf | 2 +- terraform/modules/iam/iam.tf | 2 +- 14 files changed, 64 insertions(+), 34 deletions(-) diff --git a/charts/logan/templates/_helpers.tpl b/charts/logan/templates/_helpers.tpl index 033b9bf..fcc9642 100644 --- a/charts/logan/templates/_helpers.tpl +++ b/charts/logan/templates/_helpers.tpl @@ -1,5 +1,5 @@ -# Copyright (c) 2023, 2024, Oracle and/or its affiliates. +# Copyright (c) 2023, 2025, Oracle and/or its affiliates. # Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. 
# tpl render function diff --git a/charts/logan/templates/discovery-cronjob.yaml b/charts/logan/templates/discovery-cronjob.yaml index 5b6444f..e5290f3 100644 --- a/charts/logan/templates/discovery-cronjob.yaml +++ b/charts/logan/templates/discovery-cronjob.yaml @@ -1,3 +1,5 @@ +# Copyright (c) 2025, Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. {{- $authtype := .Values.authtype | lower }} {{- $resourceNamePrefix := .Values.global.resourceNamePrefix }} {{- $kubernetesClusterName := (include "logan.kubernetesClusterName" .) }} diff --git a/charts/logan/templates/fluentd-daemonset.yaml b/charts/logan/templates/fluentd-daemonset.yaml index 3338789..558da02 100644 --- a/charts/logan/templates/fluentd-daemonset.yaml +++ b/charts/logan/templates/fluentd-daemonset.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2023, 2024, Oracle and/or its affiliates. +# Copyright (c) 2023, 2025, Oracle and/or its affiliates. # Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. --- @@ -69,7 +69,9 @@ spec: {{- if .Values.extraEnv }} {{- toYaml .Values.extraEnv | nindent 10 }} {{- end }} - {{- if .Values.resources }} + {{- if .Values.resourceOverrides.fluentdDaemonset }} + resources: {{- toYaml .Values.resourceOverrides.fluentdDaemonset | nindent 10 }} + {{- else if .Values.resources }} resources: {{- toYaml .Values.resources | nindent 10 }} {{- end }} volumeMounts: diff --git a/charts/logan/templates/fluentd-deployment.yaml b/charts/logan/templates/fluentd-deployment.yaml index 251a6eb..7166e3a 100644 --- a/charts/logan/templates/fluentd-deployment.yaml +++ b/charts/logan/templates/fluentd-deployment.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2023, 2024, Oracle and/or its affiliates. +# Copyright (c) 2023, 2025, Oracle and/or its affiliates. # Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. 
--- @@ -61,7 +61,9 @@ spec: {{- if .Values.extraEnv }} {{- toYaml .Values.extraEnv | nindent 10 }} {{- end }} - {{- if .Values.resources }} + {{- if .Values.resourceOverrides.fluentdDeployment }} + resources: {{- toYaml .Values.resourceOverrides.fluentdDeployment | nindent 10 }} + {{- else if .Values.resources }} resources: {{- toYaml .Values.resources | nindent 10 }} {{- end }} volumeMounts: diff --git a/charts/logan/templates/logs-configmap.yaml b/charts/logan/templates/logs-configmap.yaml index f18ee07..84296b1 100644 --- a/charts/logan/templates/logs-configmap.yaml +++ b/charts/logan/templates/logs-configmap.yaml @@ -1,6 +1,6 @@ -# Copyright (c) 2023, 2024, Oracle and/or its affiliates. +# Copyright (c) 2023, 2025, Oracle and/or its affiliates. # Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. - +--- {{- $kubernetesClusterName := (include "logan.kubernetesClusterName" .) }} {{- $kubernetesClusterId := (include "logan.kubernetesClusterId" .) }} {{- $ociLAClusterEntityID := (include "logan.ociLAClusterEntityID" .) }} diff --git a/charts/logan/templates/tcpconnect-daemonset.yaml b/charts/logan/templates/tcpconnect-daemonset.yaml index 1aa8b5f..b5f8565 100644 --- a/charts/logan/templates/tcpconnect-daemonset.yaml +++ b/charts/logan/templates/tcpconnect-daemonset.yaml @@ -8,20 +8,20 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: {{ $resourceNamePrefix }}-tcpconnect + name: {{ $resourceNamePrefix }}-logan-tcpconnect namespace: {{ include "logan.namespace" . }} labels: - app: {{ $resourceNamePrefix }}-tcpconnect + app: {{ $resourceNamePrefix }}-logan-tcpconnect version: v1 spec: selector: matchLabels: - app: {{ $resourceNamePrefix }}-tcpconnect + app: {{ $resourceNamePrefix }}-logan-tcpconnect version: v1 template: metadata: labels: - app: {{ $resourceNamePrefix }}-tcpconnect + app: {{ $resourceNamePrefix }}-logan-tcpconnect version: v1 spec: serviceAccountName: {{ include "logan.serviceAccount" . 
}} @@ -35,7 +35,7 @@ spec: - name: {{ .Values.image.imagePullSecrets }} {{- end}} containers: - - name: {{ $resourceNamePrefix }}-tcpconnect + - name: {{ $resourceNamePrefix }}-logan-tcpconnect image: {{ .Values.image.url }} command: - /bin/bash @@ -50,10 +50,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: spec.nodeName - resources: - requests: - cpu: 50m - memory: 50Mi + {{- if .Values.resourceOverrides.tcpconnectDaemonset }} + resources: {{- toYaml .Values.resourceOverrides.tcpconnectDaemonset | nindent 10 }} + {{- else if .Values.resources }} + resources: {{- toYaml .Values.resources | nindent 10 }} + {{- end }} imagePullPolicy: {{ default "IfNotPresent" .Values.image.imagePullPolicy }} securityContext: capabilities: diff --git a/charts/logan/values.yaml b/charts/logan/values.yaml index 3f4968f..c6bfab6 100644 --- a/charts/logan/values.yaml +++ b/charts/logan/values.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2023, 2024, Oracle and/or its affiliates. +# Copyright (c) 2023, 2025, Oracle and/or its affiliates. # Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. global: @@ -97,7 +97,7 @@ enableEKSControlPlaneLogs: false # value: ENV_VARIABLE_VALUE extraEnv: [] -# Requests and limits for Memory and CPU +# Requests and limits for Memory and CPU [Defaults] resources: # -- Limits limits: @@ -107,6 +107,30 @@ resources: cpu: 100m memory: 250Mi +# Requests and limits for Memory and CPU [Overrides] +resourceOverrides: + tcpconnectDaemonset: + # -- Resource requests + requests: + cpu: 10m + memory: 50Mi + fluentdDaemonset: + # -- Limits + limits: + memory: 500Mi + # -- Resource requests + requests: + cpu: 100m + memory: 250Mi + fluentdDeployment: + # -- Limits + limits: + memory: 500Mi + # -- Resource requests + requests: + cpu: 100m + memory: 250Mi + # -- @param extraVolumes Extra volumes. 
# Example: # - name: tmpDir @@ -367,7 +391,7 @@ fluentd: # Config specific to EBPF TCPCONNECT Network logs collection tcpconnect: # The path to the source files. - path: /var/log/containers/*-tcpconnect*.log # TODO- Check if this will render as expected, suspecting it won't + path: /var/log/containers/*-logan-tcpconnect*.log # Logging Analytics log source to use for parsing and processing the logs: TCP CONNECT Logs ociLALogSourceName: "Kubernetes TCP Connect Logs" # Network logs Polling frequency in seconds @@ -609,7 +633,7 @@ fluentd: - '"/var/log/containers/etcd-*.log"' - '"/var/log/containers/kube-controller-manager-*.log"' - '"/var/log/containers/kube-scheduler-*.log"' - - '"/var/log/containers/*-tcpconnect-*.log"' + - '"/var/log/containers/*-logan-tcpconnect-*.log"' # Worker number in case of multi process workers enabled. If not set when multi process workers enabled, then it defaults to 0. #worker: 1 # -- To set timezone override for genericContainerLogs (applies only to log records without explicit timezone identifier in the record itself) diff --git a/charts/oci-onm/values.yaml b/charts/oci-onm/values.yaml index 130a100..0aaf84f 100644 --- a/charts/oci-onm/values.yaml +++ b/charts/oci-onm/values.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2023, 2024, Oracle and/or its affiliates. +# Copyright (c) 2023, 2025, Oracle and/or its affiliates. # Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. global: diff --git a/logan/docker-images/v1.0/oraclelinux/8-slim/Dockerfile b/logan/docker-images/v1.0/oraclelinux/8-slim/Dockerfile index 0883647..baf4c5c 100644 --- a/logan/docker-images/v1.0/oraclelinux/8-slim/Dockerfile +++ b/logan/docker-images/v1.0/oraclelinux/8-slim/Dockerfile @@ -1,4 +1,4 @@ -# Copyright (c) 2023, 2024, Oracle and/or its affiliates. +# Copyright (c) 2023, 2025, Oracle and/or its affiliates. # Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. 
### Build the docker image using multi-stage build diff --git a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.bpf.c b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.bpf.c index 54ed8ad..2821ad4 100644 --- a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.bpf.c +++ b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.bpf.c @@ -1,10 +1,10 @@ +// Copyright (c) 2025, Oracle and/or its affiliates. +// Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. +// // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Anton Protopopov // // Based on tcpconnect(8) from BCC by Brendan Gregg -// -// Copyright (c) 2025, Oracle and/or its affiliates. -// Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. #include #include diff --git a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.c b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.c index e4e8ed3..a80e942 100644 --- a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.c +++ b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.c @@ -1,11 +1,10 @@ +// Copyright (c) 2025, Oracle and/or its affiliates. +// Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. +// // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Anton Protopopov // // Based on tcpconnect(8) from BCC by Brendan Gregg -// -// Copyright (c) 2025, Oracle and/or its affiliates. -// Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. 
-#include #include #include #include diff --git a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.h b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.h index 6e31a73..3415c45 100644 --- a/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.h +++ b/logan/docker-images/v1.0/oraclelinux/8-slim/tcpconnect.h @@ -1,8 +1,8 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2020 Anton Protopopov -// // Copyright (c) 2025, Oracle and/or its affiliates. // Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. +// +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Anton Protopopov #ifndef __TCPCONNECT_H #define __TCPCONNECT_H diff --git a/terraform/modules/helm/helm-outputs.tf b/terraform/modules/helm/helm-outputs.tf index 1277fec..007115d 100644 --- a/terraform/modules/helm/helm-outputs.tf +++ b/terraform/modules/helm/helm-outputs.tf @@ -9,7 +9,7 @@ locals { "helm install oci-kubernetes-monitoring oci-onm/oci-onm", "--set global.namespace=${var.kubernetes_namespace}", "--set global.kubernetesClusterID=${var.kubernetes_cluster_id}", - "--set global.kubernetesClusterName=${local.kubernetes_cluster_name}", + "--set global.kubernetesClusterName='${local.kubernetes_cluster_name}'", "--set oci-onm-logan.ociLALogGroupID=${var.oci_la_log_group_ocid}", "--set oci-onm-logan.ociLANamespace=${var.oci_la_namespace}", "--set oci-onm-logan.ociLAClusterEntityID=${var.oci_la_cluster_entity_ocid}", diff --git a/terraform/modules/iam/iam.tf b/terraform/modules/iam/iam.tf index 1b1dbb3..c1227a8 100644 --- a/terraform/modules/iam/iam.tf +++ b/terraform/modules/iam/iam.tf @@ -34,7 +34,7 @@ locals { # Allows log analytics service to query OKE infra resources # TODO: check if CLUSTER_READ will lead to duplicate ENTITY creation via service connector flow # Ref - 
https://docs.oracle.com/en-us/iaas/logging-analytics/doc/ingest-logs-other-oci-services-using-service-connector.html#LOGAN-GUID-3848C538-28AC-4F53-B217-90129278D84F - "Allow service loganalytics to {VCN_READ,SUBNET_READ,LOAD_BALANCER_READ,CLUSTER_READ,VNIC_READ} in ${local.oke_compartment_scope}", + "Allow resource loganalyticsvrp LogAnalyticsVirtualResource to {VCN_READ,SUBNET_READ,LOAD_BALANCER_READ,CLUSTER_READ,VNIC_READ} in ${local.oke_compartment_scope}", # https://docs.oracle.com/en-us/iaas/Content/Identity/Reference/contengpolicyreference.htm "Allow dynamic-group ${local.dynamic_group_name} to {CLUSTER_READ} in ${local.oke_compartment_scope} where target.cluster.id='${var.oke_cluster_ocid}'", "Allow dynamic-group ${local.dynamic_group_name} to read cluster-node-pools in ${local.oke_compartment_scope}", From eb3eaa091aad51223100ae4a88f9fa23aec3dd50 Mon Sep 17 00:00:00 2001 From: paliwalparitosh Date: Thu, 29 May 2025 19:49:51 +0530 Subject: [PATCH 5/8] option to disable tcpconnect logs --- charts/logan/Chart.yaml | 2 +- charts/logan/templates/tcpconnect-daemonset.yaml | 4 +++- charts/logan/values.yaml | 5 +++++ charts/oci-onm/Chart.yaml | 4 ++-- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/charts/logan/Chart.yaml b/charts/logan/Chart.yaml index 25b9902..7a5bdba 100644 --- a/charts/logan/Chart.yaml +++ b/charts/logan/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: oci-onm-logan description: Charts for sending Kubernetes platform logs, compute logs, and Kubernetes Objects information to OCI Logging Analytics. 
type: application -version: 3.6.0 +version: 4.0.0 appVersion: "3.0.0" dependencies: diff --git a/charts/logan/templates/tcpconnect-daemonset.yaml b/charts/logan/templates/tcpconnect-daemonset.yaml index b5f8565..00947ea 100644 --- a/charts/logan/templates/tcpconnect-daemonset.yaml +++ b/charts/logan/templates/tcpconnect-daemonset.yaml @@ -2,6 +2,7 @@ # Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. --- +{{- if .Values.enableTCPConnectLogs }} {{- $authtype := .Values.authtype | lower }} {{- $imagePullSecrets := .Values.image.imagePullSecrets }} {{- $resourceNamePrefix := (include "logan.resourceNamePrefix" .) }} @@ -68,4 +69,5 @@ spec: restartPolicy: Always schedulerName: default-scheduler securityContext: {} - terminationGracePeriodSeconds: 30 \ No newline at end of file + terminationGracePeriodSeconds: 30 +{{- end }} \ No newline at end of file diff --git a/charts/logan/values.yaml b/charts/logan/values.yaml index c6bfab6..3e637b5 100644 --- a/charts/logan/values.yaml +++ b/charts/logan/values.yaml @@ -82,6 +82,11 @@ ociLAClusterEntityID: # In Kubernetes environments where SELinux mode is enforced, set this flag to 'true' to allow fluentd pods to access log files. privileged: false +# -- Enables collection of TCPConnect logs +# Default: 'true'. +# This is required for automated discovery of workload <-> workload relationships in your cluster +enableTCPConnectLogs: true + # -- Enables collection of AWS EKS Control Plane logs through CloudWatch or S3 Fluentd plugin enableEKSControlPlaneLogs: false diff --git a/charts/oci-onm/Chart.yaml b/charts/oci-onm/Chart.yaml index ada4fec..369854a 100644 --- a/charts/oci-onm/Chart.yaml +++ b/charts/oci-onm/Chart.yaml @@ -18,7 +18,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. 
# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 3.6.0 +version: 4.0.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to @@ -32,7 +32,7 @@ dependencies: repository: "file://../common" condition: oci-onm-common.enabled - name: oci-onm-logan - version: "3.6.0" + version: "4.0.0" repository: "file://../logan" condition: oci-onm-logan.enabled - name: oci-onm-mgmt-agent From e7c95b4b7e5f9482ebb763045276219ca333b2ea Mon Sep 17 00:00:00 2001 From: paliwalparitosh Date: Thu, 29 May 2025 20:23:44 +0530 Subject: [PATCH 6/8] build script updates --- terraform/oke/stack.auto.tfvars | 5 +++++ terraform/oke/version.auto.tfvars | 7 ------- terraform/oke/version.txt | 1 + util/build_stack.sh | 16 +++++++++++----- 4 files changed, 17 insertions(+), 12 deletions(-) create mode 100644 terraform/oke/stack.auto.tfvars delete mode 100644 terraform/oke/version.auto.tfvars create mode 100644 terraform/oke/version.txt diff --git a/terraform/oke/stack.auto.tfvars b/terraform/oke/stack.auto.tfvars new file mode 100644 index 0000000..3294155 --- /dev/null +++ b/terraform/oke/stack.auto.tfvars @@ -0,0 +1,5 @@ +# Copyright (c) 2023, 2025, Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. + +# DO NOT MODIFY: This file is updated by build script while creating the stack artefact +template_id = "COMMIT_ID_PLACEHOLDER" \ No newline at end of file diff --git a/terraform/oke/version.auto.tfvars b/terraform/oke/version.auto.tfvars deleted file mode 100644 index e4934b2..0000000 --- a/terraform/oke/version.auto.tfvars +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) 2023, 2024, Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v1.0 as shown at https://oss.oracle.com/licenses/upl. 
- -# The "template_id" is only to identity the version of template in a particular production region. -# This version does not control the version of the template to be used by the stack. -# This is auto managed by build script -template_id = "COMMIT_ID_PLACEHOLDER" \ No newline at end of file diff --git a/terraform/oke/version.txt b/terraform/oke/version.txt new file mode 100644 index 0000000..cc6612c --- /dev/null +++ b/terraform/oke/version.txt @@ -0,0 +1 @@ +2.3.0 \ No newline at end of file diff --git a/util/build_stack.sh b/util/build_stack.sh index 9b45228..0b4b668 100755 --- a/util/build_stack.sh +++ b/util/build_stack.sh @@ -37,13 +37,14 @@ RELEASE_PATH="$ROOT_DIR/releases" UTIL_PATH="$ROOT_DIR/util" BUILD_ZIP="${UTIL_PATH}/temp.zip" BUILD_DIR="${UTIL_PATH}/temp" +VERSION_FILE="$ROOT_DIR/terraform/oke/version.txt" HELM_SOURCE="$BUILD_DIR/charts" MODULES_SOURCE="$BUILD_DIR/terraform/modules" STACK_BUILD_PATH="$BUILD_DIR/terraform/oke" HELM_SYMLINK="$STACK_BUILD_PATH/charts" -TEMPLATE_ID_FILE="$STACK_BUILD_PATH/version.auto.tfvars" +TEMPLATE_ID_FILE="$STACK_BUILD_PATH/stack.auto.tfvars" MODULES_SYMLINK="$STACK_BUILD_PATH/modules" # Usage Instructions @@ -52,8 +53,9 @@ $(basename "$0") [-h][-n name][-d][-s][-b] -- program to build OCI RMS stack zip where: -h show this help text - -n name of output zip file without extention (Optional) - -d flag to generate dev build; contains local helm chart + -n name of output zip file without extension (Optional) + -l flag to generate build alongside local helm chart + -r flag to generate release build; generates artefact with release name and version -s flag to turn-off output; only final build file path is printed to stdout -b flag to generate additional base64 string of stack @@ -61,7 +63,7 @@ The zip artifacts shall be stored at - $RELEASE_PATH" # Parse inputs -while getopts "hn:dsb" option; do +while getopts "hn:lsbr" option; do case $option in h) # display Help echo "$usage" @@ -70,7 +72,11 @@ while getopts "hn:dsb" 
option; do n) release_name=$OPTARG ;; - d) + r) + VERSION="$(head -n 1 $VERSION_FILE)" + release_name="oci-kubernetes-monitoring-rms-template-$VERSION" + ;; + l) INCLUDE_LOCAL_HELM=true ;; s) # Run SILENT_MODE From 65bea2a1679eb6eb67493ec36aa8ab2fca4d1eb9 Mon Sep 17 00:00:00 2001 From: paliwalparitosh Date: Fri, 6 Jun 2025 19:22:57 +0530 Subject: [PATCH 7/8] doc updates --- CHANGELOG.md | 20 +++++ charts/logan/templates/_helpers.tpl | 6 +- .../logan/templates/tcpconnect-daemonset.yaml | 2 + charts/logan/values.schema.json | 6 +- charts/logan/values.yaml | 9 ++- docs/FAQ.md | 24 +++++- docs/helm-chart-upgrade-guide.md | 79 +++++++++++++++++++ 7 files changed, 136 insertions(+), 10 deletions(-) create mode 100644 docs/helm-chart-upgrade-guide.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 061a64f..544de34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Change Log +# 2025-06-09 + +### Added +- Introduced a new DaemonSet that uses eBPF (Extended Berkeley Packet Filter) to capture TCP connection logs, enabling visualization of application-level communication within the Kubernetes cluster. +- OCI Console integration supporting new features: + - **Network View:** Dynamically discover and visualize workload-to-workload communication within the cluster. + - **Infrastructure View:** Visualize OKE infrastructure components such as Subnets, Load Balancers, Nodes, and their interactions. + - **Kubernetes Spec Change Detection (View Insights):** Monitor changes/diffs of 50+ key properties across primary Kubernetes workload types: + - DaemonSet + - Deployment + - ReplicaSet + - StatefulSet + - CronJob & Job + - Exclusion: Managed workloads (ex - A Job created via a CronJob) are not tracked + + **Note:** Additional enhancements and features are available in the OCI Console beyond those listed here. Please refer to the OCI Log Analytics Release Notes for more details. + +### Changed +- `kubernetesClusterID` (in the Helm chart) is now a mandatory field. 
*(This is not backward compatible.)* +- Updated resource limits for Log Analytics pods and workloads. ## 2025-03-19 ### Added diff --git a/charts/logan/templates/_helpers.tpl b/charts/logan/templates/_helpers.tpl index fcc9642..b129814 100644 --- a/charts/logan/templates/_helpers.tpl +++ b/charts/logan/templates/_helpers.tpl @@ -45,11 +45,7 @@ #ociLAClusterEntityID {{- define "logan.ociLAClusterEntityID" -}} - {{- if .Values.ociLAClusterEntityID -}} - {{ include "common.tplvalues.render" ( dict "value" .Values.ociLAClusterEntityID "context" .) }} - {{- else -}} - {{- "UNDEFINED" -}} - {{- end -}} + {{ include "common.tplvalues.render" ( dict "value" .Values.ociLAClusterEntityID "context" .) }} {{- end -}} #kubernetesClusterName diff --git a/charts/logan/templates/tcpconnect-daemonset.yaml b/charts/logan/templates/tcpconnect-daemonset.yaml index 00947ea..cc2ffaf 100644 --- a/charts/logan/templates/tcpconnect-daemonset.yaml +++ b/charts/logan/templates/tcpconnect-daemonset.yaml @@ -57,6 +57,8 @@ spec: resources: {{- toYaml .Values.resources | nindent 10 }} {{- end }} imagePullPolicy: {{ default "IfNotPresent" .Values.image.imagePullPolicy }} + # The container runs in privileged mode, but with only the CAP_BPF capability enabled. + # This allows it to execute the required BPF programs while maintaining a minimal security footprint. 
securityContext: capabilities: add: diff --git a/charts/logan/values.schema.json b/charts/logan/values.schema.json index 8c539e6..ac7b053 100644 --- a/charts/logan/values.schema.json +++ b/charts/logan/values.schema.json @@ -7,7 +7,8 @@ "image", "ociLANamespace", "ociLALogGroupID", - "fluentd" + "fluentd", + "ociLAClusterEntityID" ], "properties": { "image": { @@ -64,6 +65,9 @@ "type": "string", "minLength": 3, "maxLength": 63 + }, + "ociLAClusterEntityID": { + "type": "string" } } } diff --git a/charts/logan/values.yaml b/charts/logan/values.yaml index 3e637b5..2d1c1e3 100644 --- a/charts/logan/values.yaml +++ b/charts/logan/values.yaml @@ -82,9 +82,9 @@ ociLAClusterEntityID: # In Kubernetes environments where SELinux mode is enforced, set this flag to 'true' to allow fluentd pods to access log files. privileged: false -# -- Enables collection of TCPConnect logs -# Default: 'true'. -# This is required for automated discovery of workload <-> workload relationships in your cluster +# -- Enables the collection of TCP connect logs. +# Default: true +# Warning: Disabling this will prevent automatic discovery of workload-to-workload communication within the cluster. enableTCPConnectLogs: true # -- Enables collection of AWS EKS Control Plane logs through CloudWatch or S3 Fluentd plugin @@ -114,11 +114,13 @@ resources: # Requests and limits for Memory and CPU [Overrides] resourceOverrides: + # Responsible for TCP connection events collection. tcpconnectDaemonset: # -- Resource requests requests: cpu: 10m memory: 50Mi + # Responsible for log collection. fluentdDaemonset: # -- Limits limits: @@ -127,6 +129,7 @@ resourceOverrides: requests: cpu: 100m memory: 250Mi + # Responsible for the collection of EKS control plane logs. fluentdDeployment: # -- Limits limits: diff --git a/docs/FAQ.md b/docs/FAQ.md index fa0d1d6..49960d9 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -18,6 +18,7 @@ Refer [here](../README.md#installation-instructions). 
| :----: | :----: | :----: | :----: | :----: | | Namespace | All | oci-onm | Namespace in which all the resources would be installed. | There is a provision to choose pre-created namespace or to create a different namespace and then use it. | | DaemonSet | Logs | oci-onm-logan | Responsible for log collection. | | +| DaemonSet | Logs | oci-onm-logan-tcpconnect | Responsible for TCP connection events collection. | The pods in this DaemonSet run in privileged mode, but with only the CAP_BPF capability enabled. This allows them to execute the required BPF programs while maintaining a minimal security footprint. | | CronJob | Discovery, Kubernetes Objects State | oci-onm-discovery | Responsible for Kubernetes discovery and objects state collection. | | | StatefulSet | Metrics | oci-onm-mgmt-agent | Responsible for metrics collection. | | | ConfigMap | Logs | oci-onm-logs | Contains Fluentd configuration aiding the log collection. | | @@ -28,7 +29,7 @@ Refer [here](../README.md#installation-instructions). | Role | Discovery, Kubernetes Objects State | oci-onm | Contains pre-defined set of required rules/permissions at namespace level for the solution to work. | | | RoleBinding | Discovery, Kubernetes Objects State | oci-onm | Binding between Role and ServiceAccount. | | | Secret | Logs, Discovery, Kubernetes Objects State | oci-onm-oci-config | To store OCI config credentials. | Created only when configFile based auth is chosen over the default instancePrincipal based auth. | -| Deployment | Logs | oci-onm | Responsible for the collection of EKS control plane logs. | Created only when installing on EKS and setting `oci-onm-logan.enableEKSControlPlaneLogs` helm variable set to true. | +| Deployment | Logs | oci-onm-logan | Responsible for the collection of EKS control plane logs. | Created only when installing on EKS and setting `oci-onm-logan.enableEKSControlPlaneLogs` helm variable set to true. 
| | ConfigMap | Logs | oci-onm-ekscp-logs | Contains Fluentd configuration aiding EKS control plane log collection. | Created only when installing on EKS and setting `oci-onm-logan.enableEKSControlPlaneLogs` helm variable set to true. | | Service | Metrics | oci-onm-mgmt-agent | Kubernetes Service for Mgmt Agent Pods. | | | ConfigMap | Metrics | oci-onm-metrics | Configuration aiding Mgmt Agent Pods. | | @@ -622,6 +623,27 @@ Allow dynamic-group ${OKE_DYNAMIC_GROUP} to read log-content in tenancy Allow service loganalytics to {VCN_READ,SUBNET_READ,VNIC_READ} in tenancy ``` +### Why does the TcpConnect DaemonSet use privileged mode? Can it be disabled? + +The tcpconnect DaemonSet runs an eBPF program to collect TCP connection events, which are essential for dynamically mapping communication between workloads in the cluster. These relationships are visualized in the network topology view. + +To enable the eBPF program, the DaemonSet requires privileged mode with the CAP_BPF capability. + +You can disable this feature by setting the following property to false: + +```yaml +... +... +oci-onm-logan: + .. + .. + enableTCPConnectLogs: false + .. + .. +``` + +**Warning:** Warning: Disabling this will prevent automatic discovery of workload-to-workload communication within the cluster, resulting in an empty network topology view in the OCI Console. + ### Control plane log collection for AWS EKS (Amazon Elastic Kubernetes Service) AWS EKS control plane logs are available in CloudWatch. diff --git a/docs/helm-chart-upgrade-guide.md b/docs/helm-chart-upgrade-guide.md new file mode 100644 index 0000000..8b1809b --- /dev/null +++ b/docs/helm-chart-upgrade-guide.md @@ -0,0 +1,79 @@ + +# OCI-ONM Helm Chart Upgrade Guide + +This guide provides step-by-step instructions, version-specific changes, and important considerations for upgrading the `oci-onm` Helm chart. + +> Important: Always test upgrades in a staging environment before applying them to production clusters. 
+ +## Upgrade: v3.6.0 → v4.0.0 + +### What's New + + +- TCP Connection Logging with eBPF: + A new DaemonSet leverages eBPF (Extended Berkeley Packet Filter) to capture TCP connection logs, enabling enhanced visualization of application-level communication within your Kubernetes cluster. + +- OCI Console Integration Enhancements: + - Network View: Discover and visualize real-time communication between workloads in the cluster. + - Infrastructure View: Visualize OKE infrastructure components such as subnets, load balancers, and nodes, and how they interact. + - Kubernetes Spec Change Detection (View Insights): Track changes to over 50+ key properties across primary workload types: + - DaemonSet + - Deployment + - ReplicaSet + - StatefulSet + - CronJob & Job + + Note: Managed workloads (e.g., a Job created by a CronJob) are excluded. + +Additional features are available in the OCI Console beyond what’s listed here. Refer to the OCI Log Analytics Release Notes for more details. + +## Upgrade Instructions + + +1. Update IAM Policies: + - This version requires additional policy statements for infrastructure discovery. + See the pre-requisites section in the [README](../README.md#0-pre-requisites) for details. + +2. Create Logging Analytics Cluster Entity: + - Follow [these steps](../README.md#1-create-logging-analytics-entity-of-type-kubernetes-cluster) to create a Kubernetes Cluster entity in Logging Analytics. + Note: If this was already configured in earlier versions, no further action is required, you can upgrade your chart. (step 4) + +3. Update your `values.yaml` file: + + ```yaml + ... + ... + oci-onm-logan: + .. + .. + ociLAClusterEntityID: + .. + .. + ``` + +4. 
Upgrade the Helm chart: + + ```bash + helm upgrade oci/oci-onm -f values.yaml + ``` + +## Post-upgrade Checklist + +- [ ] Ensure `tcpconnect` DaemonSet pods are running (not in CrashLoopBackOff) +- [ ] Review logs of `tcpconnect` pods for any errors +- [ ] Review logs of `discovery` pods for any errors +- [ ] Verify that Network View and Application Topology are functional in the OCI Console + +## Warnings & Considerations + + +- Disabling TCPConnect Logs: + Setting `enableTCPConnectLogs: false` disables automatic discovery of workload communication, resulting in an empty topology view. + +- Privileged Mode Required: + The TCPConnect DaemonSet requires privileged mode to execute eBPF programs. + +## Resources & Support + +- Project Documentation: https://github.com/oracle-quickstart/oci-kubernetes-monitoring +- Report Issues on GitHub: https://github.com/oracle-quickstart/oci-kubernetes-monitoring/issues From a82bd433f9d484585c7aa060b6b845a14162ab98 Mon Sep 17 00:00:00 2001 From: paliwalparitosh Date: Tue, 17 Jun 2025 15:38:31 +0530 Subject: [PATCH 8/8] doc updates and helm chart changes --- CHANGELOG.md | 22 ++---- README.md | 30 ++++++- .../logan/templates/tcpconnect-daemonset.yaml | 2 +- charts/logan/values.yaml | 10 +-- docs/FAQ.md | 12 +-- docs/helm-chart-upgrade-guide.md | 79 ------------------- 6 files changed, 47 insertions(+), 108 deletions(-) delete mode 100644 docs/helm-chart-upgrade-guide.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 544de34..cfadfbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,25 +1,17 @@ # Change Log -# 2025-06-09 - +# 2025-06-17 ### Added -- Introduced a new DaemonSet that uses eBPF (Extended Berkeley Packet Filter) to capture TCP connection logs, enabling visualization of application-level communication within the Kubernetes cluster. 
+- Introduced a new DaemonSet that uses eBPF (Extended Berkeley Packet Filter) to capture TCP connection logs and build an application/network topology representing workload-to-workload relationships within the Kubernetes cluster. + - To be able to run the required eBPF program, the pods need to run in privileged mode, restricted to the CAP_BPF capability only. +- New helm variable to control the resource limits for individual logan workloads. +- Enables OKE infra discovery and service logs collection (default) - OCI Console integration supporting new features: - - **Network View:** Dynamically discover and visualize workload-to-workload communication within the cluster. - - **Infrastructure View:** Visualize OKE infrastructure components such as Subnets, Load Balancers, Nodes, and their interactions. - - **Kubernetes Spec Change Detection (View Insights):** Monitor changes/diffs of 50+ key properties across primary Kubernetes workload types: - - DaemonSet - - Deployment - - ReplicaSet - - StatefulSet - - CronJob & Job - - Exclusion: Managed workloads (ex - A Job created via a CronJob) are not tracked - - **Note:** Additional enhancements and features are available in the OCI Console beyond those listed here. Please refer to the OCI Log Analytics Release Notes for more details. + - Topology: New Views (Infra and Network) along with Platform. + - View Insights for Workloads, including capabilities to view the detailed spec of a workload, monitor the changes to the spec of a workload, create in-line labels for issues, etc. ### Changed - `kubernetesClusterID` (in the Helm chart) is now a mandatory field. *(This is not backward compatible.)* -- Updated resource limits for Log Analytics pods and workloads.
## 2025-03-19 ### Added diff --git a/README.md b/README.md index cf42594..9fefb75 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ It does extensive enrichment of logs, metrics and object information to enable c ## Get Started :rocket: -:stop_sign: Upgrading to a major version (like 2.x to 3.x)? See [upgrade](#upgrading-to-a-major-version) section below for details. :warning: +:stop_sign: Upgrading to a major version (like 3.x to 4.x)? See [upgrade](#upgrading-to-a-major-version) section below for details. :warning: ### Pre-requisites @@ -366,6 +366,34 @@ Refer [here](#3c-import-dashboards). ### Upgrading to a major version +#### 3.6.0 to 4.0.0 + +For changes in this release, refer to [CHANGELOG.md](CHANGELOG.md) + +##### Upgrade instructions + +1. Update IAM Policies: + * This version requires additional policy statements for infrastructure discovery. + * See the [pre-requisites](#0-pre-requisites) section of this README for details. + +1. As mentioned in the change log, this version introduces a new DaemonSet that uses eBPF (Extended Berkeley Packet Filter) to capture TCP connection logs and build an application/network topology representing workload-to-workload relationships within the Kubernetes cluster. + * To be able to run the required eBPF program, the pods need to run in privileged mode, restricted to the CAP_BPF capability only. + * In your environment, if you have any restrictions with respect to running pods in privileged mode, you may need to adjust your cluster configuration accordingly. + +2. Upgrade the Helm chart: + + ```sh + # fetch latest (4.x) helm repo for oci + helm repo update oci-onm + + # fetch the current release configuration + helm get values <release-name> -n <namespace> > override_values.yaml + + # Upgrade the helm chart + helm upgrade <release-name> oci/oci-onm -n <namespace> -f override_values.yaml + ``` + + #### 2.x to 3.x One of the major changes introduced in 3.0.0 is refactoring of helm chart where major features of the solution got split into separate sub-charts.
2.x has only support for logs and objects collection using Fluentd and OCI Logging Analytics and this is now moved into a separate chart oci-onm-logan and included as a sub-chart to the main chart oci-onm. This is a breaking change w.r.t the values.yaml and any customisations that you might have done on top of it. There is no breaking change w.r.t functionality offered in 2.x. For full list of changes in 3.x, refer to [changelog](CHANGELOG.md). diff --git a/charts/logan/templates/tcpconnect-daemonset.yaml b/charts/logan/templates/tcpconnect-daemonset.yaml index cc2ffaf..f034deb 100644 --- a/charts/logan/templates/tcpconnect-daemonset.yaml +++ b/charts/logan/templates/tcpconnect-daemonset.yaml @@ -44,7 +44,7 @@ spec: - -- args: - /usr/bin/tcpconnect -e - - -i {{ .Values.fluentd.kubernetesSystem.logs.tcpconnect.interval }} + - -i 30 env: - name: K8S_NODE_NAME valueFrom: diff --git a/charts/logan/values.yaml b/charts/logan/values.yaml index 2d1c1e3..578ae01 100644 --- a/charts/logan/values.yaml +++ b/charts/logan/values.yaml @@ -84,7 +84,7 @@ privileged: false # -- Enables the collection of TCP connect logs. # Default: true -# Warning: Disabling this will prevent automatic discovery of workload-to-workload communication within the cluster. +# Note: Disabling this will prevent automatic discovery of workload-to-workload communication within the cluster. enableTCPConnectLogs: true # -- Enables collection of AWS EKS Control Plane logs through CloudWatch or S3 Fluentd plugin @@ -114,13 +114,13 @@ resources: # Requests and limits for Memory and CPU [Overrides] resourceOverrides: - # Responsible for TCP connection events collection. + # Responsible for TCP connect logs collection aiding discovery of workload to workload relationships. tcpconnectDaemonset: # -- Resource requests requests: cpu: 10m memory: 50Mi - # Responsible for log collection. + # Responsible for various logs collection. 
fluentdDaemonset: # -- Limits limits: @@ -402,8 +402,6 @@ fluentd: path: /var/log/containers/*-logan-tcpconnect*.log # Logging Analytics log source to use for parsing and processing the logs: TCP CONNECT Logs ociLALogSourceName: "Kubernetes TCP Connect Logs" - # Network logs Polling frequency in seconds - interval: 30 # Config specific to Kubernetes Audit Logs Collection kube-audit: @@ -727,7 +725,7 @@ k8sDiscovery: infra: # Enable Logs collection for OKE's OCI infra components - LB, OKE Cluster control plane, Subnet logs etc # Not supported for Non OKE clusters - enable_service_log: false + enable_service_log: true # Discovers OKE Node Pools in all compartments of tenant # when false, Node Pools present in OKE's compartment are discovered probe_all_compartments: false diff --git a/docs/FAQ.md b/docs/FAQ.md index 49960d9..fe877dd 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -18,7 +18,7 @@ Refer [here](../README.md#installation-instructions). | :----: | :----: | :----: | :----: | :----: | | Namespace | All | oci-onm | Namespace in which all the resources would be installed. | There is a provision to choose pre-created namespace or to create a different namespace and then use it. | | DaemonSet | Logs | oci-onm-logan | Responsible for log collection. | | -| DaemonSet | Logs | oci-onm-logan-tcpconnect | Responsible for TCP connection events collection. | The pods in this DaemonSet run in privileged mode, but with only the CAP_BPF capability enabled. This allows them to execute the required BPF programs while maintaining a minimal security footprint. | +| DaemonSet | Logs | oci-onm-logan-tcpconnect | Responsible for TCP connect logs collection aiding discovery of workload to workload relationships. | The pods in this DaemonSet run in privileged mode, but with only the CAP_BPF capability which enables the pods to run the required eBPF program. 
| | CronJob | Discovery, Kubernetes Objects State | oci-onm-discovery | Responsible for Kubernetes discovery and objects state collection. | | | StatefulSet | Metrics | oci-onm-mgmt-agent | Responsible for metrics collection. | | | ConfigMap | Logs | oci-onm-logs | Contains Fluentd configuration aiding the log collection. | | @@ -625,11 +625,13 @@ Allow service loganalytics to {VCN_READ,SUBNET_READ,VNIC_READ} in tenancy ### Why does the TcpConnect DaemonSet use privileged mode? Can it be disabled? -The tcpconnect DaemonSet runs an eBPF program to collect TCP connection events, which are essential for dynamically mapping communication between workloads in the cluster. These relationships are visualized in the network topology view. +The TcpConnect DaemonSet is responsible for collecting TCP connect logs, which aid the discovery of workload-to-workload relationships. -To enable the eBPF program, the DaemonSet requires privileged mode with the CAP_BPF capability. +To be able to run the required eBPF program, the pods need to run in privileged mode, restricted to the CAP_BPF capability only. -You can disable this feature by setting the following property to false: +If you need to disable this feature, set the following property to false: + +> Note: Disabling this will prevent automatic discovery of workload-to-workload communication within the cluster, resulting in an empty network topology view in the OCI Console. ```yaml ... @@ -642,8 +644,6 @@ oci-onm-logan: .. ``` -**Warning:** Warning: Disabling this will prevent automatic discovery of workload-to-workload communication within the cluster, resulting in an empty network topology view in the OCI Console. - ### Control plane log collection for AWS EKS (Amazon Elastic Kubernetes Service) AWS EKS control plane logs are available in CloudWatch.
diff --git a/docs/helm-chart-upgrade-guide.md b/docs/helm-chart-upgrade-guide.md deleted file mode 100644 index 8b1809b..0000000 --- a/docs/helm-chart-upgrade-guide.md +++ /dev/null @@ -1,79 +0,0 @@ - -# OCI-ONM Helm Chart Upgrade Guide - -This guide provides step-by-step instructions, version-specific changes, and important considerations for upgrading the `oci-onm` Helm chart. - -> Important: Always test upgrades in a staging environment before applying them to production clusters. - -## Upgrade: v3.6.0 → v4.0.0 - -### What's New - - -- TCP Connection Logging with eBPF: - A new DaemonSet leverages eBPF (Extended Berkeley Packet Filter) to capture TCP connection logs, enabling enhanced visualization of application-level communication within your Kubernetes cluster. - -- OCI Console Integration Enhancements: - - Network View: Discover and visualize real-time communication between workloads in the cluster. - - Infrastructure View: Visualize OKE infrastructure components such as subnets, load balancers, and nodes, and how they interact. - - Kubernetes Spec Change Detection (View Insights): Track changes to over 50+ key properties across primary workload types: - - DaemonSet - - Deployment - - ReplicaSet - - StatefulSet - - CronJob & Job - - Note: Managed workloads (e.g., a Job created by a CronJob) are excluded. - -Additional features are available in the OCI Console beyond what’s listed here. Refer to the OCI Log Analytics Release Notes for more details. - -## Upgrade Instructions - - -1. Update IAM Policies: - - This version requires additional policy statements for infrastructure discovery. - See the pre-requisites section in the [README](../README.md#0-pre-requisites) for details. - -2. Create Logging Analytics Cluster Entity: - - Follow [these steps](../README.md#1-create-logging-analytics-entity-of-type-kubernetes-cluster) to create a Kubernetes Cluster entity in Logging Analytics. 
- Note: If this was already configured in earlier versions, no further action is required, you can upgrade your chart. (step 4) - -3. Update your `values.yaml` file: - - ```yaml - ... - ... - oci-onm-logan: - .. - .. - ociLAClusterEntityID: - .. - .. - ``` - -4. Upgrade the Helm chart: - - ```bash - helm upgrade oci/oci-onm -f values.yaml - ``` - -## Post-upgrade Checklist - -- [ ] Ensure `tcpconnect` DaemonSet pods are running (not in CrashLoopBackOff) -- [ ] Review logs of `tcpconnect` pods for any errors -- [ ] Review logs of `discovery` pods for any errors -- [ ] Verify that Network View and Application Topology are functional in the OCI Console - -## Warnings & Considerations - - -- Disabling TCPConnect Logs: - Setting `enableTCPConnectLogs: false` disables automatic discovery of workload communication, resulting in an empty topology view. - -- Privileged Mode Required: - The TCPConnect DaemonSet requires privileged mode to execute eBPF programs. - -## Resources & Support - -- Project Documentation: https://github.com/oracle-quickstart/oci-kubernetes-monitoring -- Report Issues on GitHub: https://github.com/oracle-quickstart/oci-kubernetes-monitoring/issues