1
+ #! /bin/bash
2
+
3
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ # SPDX-License-Identifier: MIT-0
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of this
7
+ # software and associated documentation files (the "Software"), to deal in the Software
8
+ # without restriction, including without limitation the rights to use, copy, modify,
9
+ # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so.
11
+ #
12
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
13
+ # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
14
+ # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
15
+ # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
16
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
17
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18
+
19
# ----------------------------------------------------------------------------
# Global settings shared by the functions below.
#
# The sourced cfnconfig file is expected to define stack_name, cfn_region,
# cfn_postinstall and cfn_cluster_user (they are used below without being
# defined in this script). SHARED_FS_DIR, post_install_base and dna_json are
# presumably exported by the caller -- TODO confirm.
# ----------------------------------------------------------------------------
source /etc/parallelcluster/cfnconfig

chef_dna="/etc/chef/dna.json"

# Look the key up with jq (at any nesting depth) instead of scraping the JSON
# text with grep/awk/sed; yields an empty string when the key is absent.
cfn_fsx_fs_id=$(jq -r 'first(.. | objects | select(has("cfn_fsx_fs_id")) | .cfn_fsx_fs_id) // empty' "${chef_dna}")
master_instance_id=$(ec2-metadata -i | awk '{print $2}')
# Not referenced elsewhere in this script; kept so the set of globals is unchanged.
cfn_max_queue_size=$(aws cloudformation describe-stacks --stack-name "${stack_name}" --region "${cfn_region}" \
  | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "MaxSize"))[0].ParameterValue')
monitoring_dir_name="monitoring"
monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}"

# Bucket name of the post-install location: drop the s3:// scheme, then
# everything from the first "/" onwards.
s3_bucket="${cfn_postinstall#s3://}"
s3_bucket="${s3_bucket%%/*}"

# The secret id is the stack name. This runs before `set -x` below, so the
# value is not written to the trace here.
grafana_password=$(aws secretsmanager get-secret-value --secret-id "${stack_name}" --query SecretString --output text --region "${cfn_region}")

# Falls back to the chef dna file when dna_json is not provided, and to the
# default location when the enginframe section (or its nice_root key) is missing.
NICE_ROOT=$(jq --arg default "${SHARED_FS_DIR}/nice" -r '(.post_install.enginframe // {}) | if has("nice_root") then .nice_root else $default end' "${dna_json:-${chef_dna}}")


# From here on: trace every command and abort on the first failing one.
set -x
set -e
33
+
34
#######################################
# Install and enable Docker, install the Go toolchain, and download the
# docker-compose binary.
# Globals:   cfn_cluster_user (read) - user added to the docker group
# Arguments: none
#######################################
installPreReq() {
  local compose_version="1.29.2"
  local compose_url="https://github.com/docker/compose/releases/download/${compose_version}/docker-compose-$(uname -s)-$(uname -m)"

  yum -y install docker golang-bin
  service docker start
  chkconfig docker on
  usermod -a -G docker "${cfn_cluster_user}"

  # TODO: replace with `yum -y install docker-compose` once the repository
  # problem is fixed.
  # -f makes curl fail on an HTTP error instead of saving the error page as
  # the docker-compose "binary".
  curl -fL "${compose_url}" -o /usr/local/bin/docker-compose
  chmod +x /usr/local/bin/docker-compose
}
44
+
45
#######################################
# Download the versioned cluster configuration referenced by the chef dna
# file to ${monitoring_home}/parallelcluster/cluster-config.json.
# Globals:
#   chef_dna, stack_name, monitoring_home, cfn_region (read)
#   log_group_names (written; read later by configureMonitoring)
# Arguments: none
#######################################
saveClusterConfigLocally() {
  local cluster_s3_bucket cluster_config_s3_key cluster_config_version
  # Value of the first object member named $key, at any nesting depth; prints
  # nothing when the key does not exist.
  local lookup='first(.. | objects | select(has($key)) | .[$key]) // empty'

  cluster_s3_bucket=$(jq -r --arg key cluster_s3_bucket "${lookup}" "${chef_dna}")
  cluster_config_s3_key=$(jq -r --arg key cluster_config_s3_key "${lookup}" "${chef_dna}")
  cluster_config_version=$(jq -r --arg key cluster_config_version "${lookup}" "${chef_dna}")

  # Intentionally NOT local. The slashes are pre-escaped because the value is
  # spliced into a "/"-delimited sed expression. The suffix is the stack name
  # with everything up to its first "-" removed.
  log_group_names="\/aws\/parallelcluster\/${stack_name#*-}"

  mkdir -p "${monitoring_home}/parallelcluster"
  aws s3api get-object \
    --bucket "${cluster_s3_bucket}" \
    --key "${cluster_config_s3_key}" \
    --region "${cfn_region}" \
    --version-id "${cluster_config_version}" \
    "${monitoring_home}/parallelcluster/cluster-config.json"
}
55
+
56
#######################################
# Copy the monitoring bundle from the post-install location into
# ${monitoring_home} and install its pieces: custom metric scripts into
# /usr/local/bin, the slurm_exporter unit into /etc/systemd/system, and the
# www content into the EnginFrame web root under ${NICE_ROOT}.
# Globals:   post_install_base, monitoring_home, cfn_region,
#            cfn_cluster_user, NICE_ROOT (all read)
# Arguments: none
#######################################
installMonitoring() {
  local metrics_dir="${monitoring_home}/custom-metrics"

  aws s3 cp --recursive "${post_install_base}/monitoring" "${monitoring_home}" --region "${cfn_region}" || exit 1
  chown -R "${cfn_cluster_user}:${cfn_cluster_user}" "${monitoring_home}"

  # Directory parts are quoted so a path containing whitespace is not split;
  # only the trailing * is left to the shell to expand.
  chmod +x "${metrics_dir}"/*
  cp -rp "${metrics_dir}"/* /usr/local/bin/

  mv -f "${monitoring_home}/prometheus-slurm-exporter/slurm_exporter.service" /etc/systemd/system/

  cp -rp "${monitoring_home}"/www/* "${NICE_ROOT}/enginframe/conf/tomcat/webapps/ROOT/"
}
67
+
68
+
69
+
70
# Escape a string for literal use as the replacement part of a sed `s`
# command delimited by "/" or "~": backslash, "&" and both delimiters get a
# leading backslash.
_sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's|[\\&/~]|\\&|g'
}

# Append one line to a user's crontab unless that exact line is already there.
# Arguments: $1 - user name, $2 - crontab line
_add_cron_entry() {
  local user=$1 entry=$2 current

  # `crontab -l` fails when the user has no crontab yet; start from an empty
  # table in that case instead of losing the entry.
  current=$(crontab -l -u "${user}" 2>/dev/null) || current=""
  if grep -qxF -- "${entry}" <<<"${current}"; then
    return 0
  fi
  {
    if [[ -n "${current}" ]]; then
      printf '%s\n' "${current}"
    fi
    printf '%s\n' "${entry}"
  } | crontab -u "${user}" -
}

#######################################
# Schedule the cost-metric scripts, fill in the placeholder tokens of the
# Grafana / Prometheus / docker-compose files under ${monitoring_home}, and
# build prometheus-slurm-exporter from source.
# Globals (read):
#   cfn_cluster_user, monitoring_home, s3_bucket, master_instance_id,
#   cfn_fsx_fs_id, cfn_region, stack_name, grafana_password,
#   log_group_names (must already be escaped for a "/"-delimited sed
#   replacement; it is inserted verbatim)
# Arguments: none
# Side effects: changes the current directory to the exporter checkout.
#######################################
configureMonitoring() {
  local dashboards="${monitoring_home}/grafana/dashboards"
  local compose_file="${monitoring_home}/docker-compose/docker-compose.master.yml"
  local bucket instance_id fsx_id region application monitoring_dir dashboard

  _add_cron_entry "${cfn_cluster_user}" "*/1 * * * * /usr/local/bin/1m-cost-metrics.sh"
  _add_cron_entry "${cfn_cluster_user}" "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh"

  # replace tokens
  bucket=$(_sed_escape_replacement "${s3_bucket}")
  instance_id=$(_sed_escape_replacement "${master_instance_id}")
  fsx_id=$(_sed_escape_replacement "${cfn_fsx_fs_id}")
  region=$(_sed_escape_replacement "${cfn_region}")
  application=$(_sed_escape_replacement "${stack_name}")
  monitoring_dir=$(_sed_escape_replacement "${monitoring_home}")

  sed -i \
    -e "s/_S3_BUCKET_/${bucket}/g" \
    -e "s/__INSTANCE_ID__/${instance_id}/g" \
    -e "s/__FSX_ID__/${fsx_id}/g" \
    -e "s/__AWS_REGION__/${region}/g" \
    "${dashboards}/ParallelCluster.json"

  sed -i \
    -e "s/__AWS_REGION__/${region}/g" \
    -e "s/__LOG_GROUP__NAMES__/${log_group_names}/g" \
    "${dashboards}/logs.json"

  sed -i "s/__Application__/${application}/g" "${monitoring_home}/prometheus/prometheus.yml"

  for dashboard in master-node-details compute-node-list compute-node-details; do
    sed -i "s/__INSTANCE_ID__/${instance_id}/g" "${dashboards}/${dashboard}.json"
  done

  sed -i "s~__MONITORING_DIR__~${monitoring_dir}~g" "${compose_file}"

  # Substitute the Grafana password with tracing switched off so the secret
  # does not end up in the `set -x` output. The subshell keeps the option
  # change local. NOTE: the value is still visible in sed's argv while it runs.
  (
    set +x
    sed -i "s~__GRAFANA_PASSWORD__~$(_sed_escape_replacement "${grafana_password}")~g" "${compose_file}"
  )

  # Download and build prometheus-slurm-exporter
  ##### Please note this software package is under GPLv3 License #####
  # More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE
  cd "${monitoring_home}"
  # FIXME: temporary
  rm -rf -- prometheus-slurm-exporter
  git clone https://github.com/vpenso/prometheus-slurm-exporter.git
  cd prometheus-slurm-exporter
  # Patch the field list in node.go before building.
  sed -i 's/NodeList,AllocMem,Memory,CPUsState,StateLong/NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:/' node.go
  GOPATH=/root/go-modules-cache HOME=/root go mod download
  GOPATH=/root/go-modules-cache HOME=/root go build
  mv -f "${monitoring_home}/prometheus-slurm-exporter/prometheus-slurm-exporter" /usr/bin/prometheus-slurm-exporter
}
107
+
108
+
109
#######################################
# Bring up the monitoring containers described by docker-compose.master.yml
# (compose project "monitoring-master") and enable and start the
# slurm_exporter systemd unit.
# Globals:   monitoring_home (read)
# Arguments: none
#######################################
startMonitoringDaemons() {
  local compose_file="${monitoring_home}/docker-compose/docker-compose.master.yml"

  # The cluster's cfnconfig file is passed to docker-compose as its env file.
  /usr/local/bin/docker-compose \
    --env-file /etc/parallelcluster/cfnconfig \
    -f "${compose_file}" \
    -p monitoring-master \
    up -d

  # Reload unit files before enabling and starting the exporter service.
  systemctl daemon-reload
  systemctl enable slurm_exporter
  systemctl start slurm_exporter
}
117
+
118
+ # main
119
+ # ----------------------------------------------------------------------------
120
#######################################
# Run the installation steps in order, writing a timestamped START line
# before and a STOP line after them to stderr.
# Arguments: none used
#######################################
main() {
  printf '[INFO][%s] 40.install.monitoring.master.sh: START\n' "$(date '+%Y-%m-%d %H:%M:%S')" >&2
  installPreReq
  saveClusterConfigLocally
  installMonitoring
  configureMonitoring
  startMonitoringDaemons
  printf '[INFO][%s] 40.install.monitoring.master.sh: STOP\n' "$(date '+%Y-%m-%d %H:%M:%S')" >&2
}
129
+
130
# Script entry point: forward the command-line arguments unchanged.
main "$@"
0 commit comments