Merge pull request #13943 from mburke5678/nodes-node-problem-detector

mburke5678 · web-flow · commit 5b3ec8f68551 · 2019-03-08T17:18:54.000-05:00
Document Node Problem Detector for 4.0
diff --git a/modules/nodes-nodes-problem-detector-customizing.adoc b/modules/nodes-nodes-problem-detector-customizing.adoc
@@ -5,27 +5,57 @@
 [id='nodes-nodes-problem-detector-customizing_{context}']
 = Customizing Node Problem Detector conditions
 
-You can configure the Node Problem Detector to watch for any log string by editing the Node Problem Detector configuration map.
+You can configure the Node Problem Detector to watch for any log string by editing the Node Problem Detector custom resource (CR).
+
+.Prerequisites
+
+* The Node Problem Detector Operator must be installed.
+
+* If needed, get the name of the Node Problem Detector CR:
++
+----
+$ oc get NodeProblemDetector
+NAME                    AGE
+node-problem-detector   6m6s
+----
+
+* Set the Node Problem Detector to the unmanaged state. In the managed state, the Node Problem Detector Operator reverts any changes made to the Node Problem Detector configuration map.
 
 .Procedure
 
-To configure the Node Problem Detector, add or remove problem conditions and events.
+To modify the Node Problem Detector:
 
-. Edit the Node Problem Detector configuration map with a text editor.
+. Open the Node Problem Detector CR for editing.
 +
 ----
-$ oc edit configmap /openshift-node-problem-detector
+$ oc edit NodeProblemDetector <name>
 ----
 +
-.Sample Node Problem Detector Configuration Map
-[source,yaml]
+For example:
++
 ----
-apiVersion: v1
-kind: ConfigMap
+$ oc edit NodeProblemDetector node-problem-detector
+
+apiVersion: node-problem-detector.operator.k8s.io/v1alpha1
+kind: NodeProblemDetector
 metadata:
+  creationTimestamp: 2019-03-04T00:18:48Z
+  generation: 1
   name: node-problem-detector
-data:
-    kernel-monitor.json: |  <1>
+  namespace: default
+  resourceVersion: "47179"
+  selfLink: /apis/node-problem-detector.operator.k8s.io/v1alpha1/namespaces/default/nodeproblemdetectors/node-problem-detector
+  uid: 14acef47-3e13-11e9-a640-0a4ad769663a
+namespace: openshift-node-problem-detector
+----
+
+. Change the parameters and values as needed: 
++
+.Sample Node Problem Detector Configuration Map
+[source,yaml]
+----
+spec:
+  kernel-monitor.json: |  <8>
     {
         "plugin": "journald", <2>
         "pluginConfig": {
@@ -42,7 +72,7 @@ data:
                         "message": "kernel has no deadlock"  <7>
                 }
         ],
-        "rules": [ <8>
+        "rules": [
                 {
                         "type": "temporary",
                         "reason": "OOMKilling",
@@ -76,6 +106,37 @@ data:
                 },
         ]
     }
+
+  kubelet-monitor.json: |-
+    {
+        "plugin": "custom",
+        "pluginConfig": {
+            "invoke_interval": "120s",
+            "timeout": "60s",
+            "concurrency": 1
+        },
+        "source": "kubelet-custom-plugin-monitor",
+        "conditions": [{
+            "type": "KubeletProblem",
+            "reason": "KubeletIsUp",
+            "message": "kubelet is up"
+        }],
+        "rules": [{
+                "type": "temporary",
+                "reason": "KubeletIsDown",
+                "path": "/etc/npd-plugins/kubelet-health.sh",
+                "timeout": "30s"
+            },
+            {
+                "type": "permanent",
+                "condition": "KubeletProblem",
+                "reason": "KubeletIsDown",
+                "path": "/etc/npd-plugins/kubelet-health.sh",
+                "timeout": "45s"
+            }
+        ]
+    }
+
 ----
 
 <1> Rules and conditions that apply to container images.
@@ -92,7 +153,7 @@ https://kubernetes.io/docs/tasks/debug-application-cluster/monitor-node-health/#
 The Node Problem Detector supports file-based kernel logging. However, it is easy to extend it to support other log formats.
 ////
 
-. Remove, add, or edit any node conditions or events as needed.
+. Optionally, you can add new node conditions or events:
 +
 [source,yaml]
 ----
@@ -134,3 +195,4 @@ spec:
 <1> Sends the output to standard output (stdout).
 <2> Path to the error log.
 <3> Comma-separated path to the plug-in configuration files.
+
diff --git a/modules/nodes-nodes-problem-detector-installing.adoc b/modules/nodes-nodes-problem-detector-installing.adoc
@@ -5,14 +5,14 @@
 [id='nodes-nodes-problem-detector-installing_{context}']
 = Installing the {product-title} Node Problem Detector
 
-You can use the {product-title} console to install the Node Problem Detector (NPD), which creates the Node Problem Detector Operator.
+You can use the {product-title} console to install the Node Problem Detector Operator.
 
 .Prerequisites
 
 . Create a Project for the NPD:
 +
 ----
-$ oc create ns openshift-node-problem-detector
+$ oc adm new-project openshift-node-problem-detector --node-selector=""
 ----
 
 . Create an Operator Group
@@ -32,18 +32,192 @@ EOF
 
 .Procedure
 
-To install the Node Problem Detector:
+The process to install the Node Problem Detector involves installing the Node Problem Detector Operator and creating a Node Problem Detector instance.
 
 . In the {product-title} console, click *Catalog* -> *Operator Hub*.
 
-. Choose  *node-problem-detector* from the list of available Operators, and click Install.
-
 . On the *Create Operator Subscription* page: 
 
 .. Select the `openshift-node-problem-detector` project from the *A specific namespace on the cluster* drop-down list.
 
 .. Click *Subscribe*.
 
-. On the *Catalog* → *Installed Operators* page, verify that the NodeProblemDetector (CSV) eventually shows up and its *Status* ultimately resolves to *InstallSucceeded*.
+.. Click *Subscribe*.
 
+. On the *Catalog* → *Installed Operators* page, verify that the NodeProblemDetector (CSV) eventually shows up and its *Status* ultimately resolves to *InstallSucceeded*.
++
 If it does not, switch to the *Catalog* → *Operator Management* page and inspect the *Operator Subscriptions* and *Install Plans* tabs for any failure or errors under *Status*. Then, check the logs in any Pods in the openshift-operators project (on the *Workloads* → *Pods* page) that are reporting issues to troubleshoot further.
+
+. Click *Administration* -> *CRD*.
+
+. On the *Custom Resource Definitions* page, click *NodeProblemDetector*.
+
+. On the *Node Problem Detector* page, click *Create Node Problem Detector*.
+
+. Specify a name and enter the *openshift-node-problem-detector* namespace.
++
+[source,yaml]
+----
+apiVersion: node-problem-detector.operator.k8s.io/v1alpha1
+kind: NodeProblemDetector
+metadata:
+  name: example <1>
+  namespace: default <2>
+spec: {}
+----
+<1> Specify a name for the Node Problem Detector.
+<2> Specify `openshift-node-problem-detector` as the namespace.
++
+For example:
++
+[source,yaml]
+----
+apiVersion: node-problem-detector.operator.k8s.io/v1alpha1
+kind: NodeProblemDetector
+metadata:
+  name: node-problem-detector
+  namespace: openshift-node-problem-detector
+spec: {}
+----
+
+//Beta steps https://bugzilla.redhat.com/show_bug.cgi?id=1679467
+
+. Create a Node Problem Detector Custom Resource Definition (CRD):
++
+[source,yaml]
+----
+apiVersion: apiextensions.k8s.io/v1beta1
+kind: CustomResourceDefinition
+metadata:
+  name: nodeproblemdetectors.node-problem-detector.operator.k8s.io
+spec:
+  group: node-problem-detector.operator.k8s.io
+  names:
+    kind: NodeProblemDetector
+    listKind: NodeProblemDetectorList
+    plural: nodeproblemdetectors
+    singular: nodeproblemdetector
+  scope: Namespaced
+  version: v1alpha1
+----
+
+. Create a Node Problem Detector Service Account (SA):
++
+[source,yaml]
+----
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: node-problem-detector-operator
+  namespace: openshift-node-problem-detector
+----
+
+. Create the role-based access control (RBAC) resources for the Node Problem Detector Operator:
++
+[source,yaml]
+----
+kind: Role
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: node-problem-detector-operator
+  namespace: openshift-node-problem-detector
+rules:
+- apiGroups:
+  - node-problem-detector.operator.k8s.io
+  resources:
+  - "*"
+  verbs:
+  - "*"
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  - events
+  - configmaps
+  - secrets
+  - services
+  - endpoints
+  - serviceaccounts
+  verbs:
+  - "*"
+- apiGroups:
+  - apps
+  resources:
+  - daemonsets
+  verbs:
+  - "*"
+
+---
+
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: node-problem-detector-operator
+  namespace: openshift-node-problem-detector
+subjects:
+- kind: ServiceAccount
+  name: node-problem-detector-operator
+roleRef:
+  kind: Role
+  name: node-problem-detector-operator
+  apiGroup: rbac.authorization.k8s.io
+
+---
+
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: openshift-node-problem-detector-operator
+rules:
+- apiGroups:
+  - rbac.authorization.k8s.io
+  resources:
+  # the operator needs to be able to bind the cluster role
+  # system:node-problem-detector to the node-problem-detector service account
+  - clusterrolebindings
+  verbs:
+  - "*"
+- apiGroups:
+  - security.openshift.io
+  resources:
+  # the operator needs to be able to add the node-problem-detector service account
+  # to the list of accounts that can use the privileged SCC
+  - securitycontextconstraints
+  verbs:
+  - "*"
+
+---
+
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: openshift-node-problem-detector-operator-1
+subjects:
+- kind: ServiceAccount
+  name: node-problem-detector-operator
+  namespace: openshift-node-problem-detector
+roleRef:
+  kind: ClusterRole
+  name: openshift-node-problem-detector-operator
+  apiGroup: rbac.authorization.k8s.io
+
+----
++
+----
+$ oc create -f deploy/rbac.yaml
+$ oc create -f deploy/operator.yaml
+$ oc create -f deploy/cr.yaml
+----
+. Create a Node Problem Detector custom resource (CR):
++
+[source,yaml]
+----
+apiVersion: node-problem-detector.operator.k8s.io/v1alpha1
+kind: NodeProblemDetector
+metadata:
+  name: node-problem-detector
+  namespace: openshift-node-problem-detector
+----
+
+. Configure the Node Problem Detector policy as needed and click *Create*. 
+
diff --git a/nodes/nodes/nodes-nodes-problem-detector.adoc b/nodes/nodes/nodes-nodes-problem-detector.adoc
@@ -27,6 +27,11 @@ https://access.redhat.com/support/offerings/techpreview/.
 endif::[]
 ====
 
+[NOTE]
+====
+Procedures in this topic require your cluster to be in an unmanaged state. 
+====
+
 // The following include statements pull in the module files that comprise
 // the assembly. Include any combination of concept, procedure, or reference
 // modules required to cover the user story. You can also include other