
Commit d97718a

Merge pull request #5 from aws-samples/monitoring
Monitoring
2 parents 2150d30 + e8541df commit d97718a

38 files changed (+11598, -16 lines)

Templates/AWS-HPC-Cluster.yaml

Lines changed: 2 additions & 2 deletions
@@ -957,8 +957,8 @@ Outputs:
   Cloud9URL:
     Description: Cloud9 Environment
     Value: !Sub 'https://${AWS::Region}.console.aws.amazon.com/cloud9/ide/${Cloud9}'
-  EnginFrameURL:
+  WebURL:
     Description: "EnginFrame HPC Portal, default username: ec2-user , default password: Change_this!"
     Value: !Sub
-      - 'https://${ALB}/enginframe'
+      - 'https://${ALB}/'
       - ALB: !GetAtt ApplicationLoadBalancer.DNSName
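
With this change the stack output key is renamed from EnginFrameURL to WebURL, and the portal is served from the load balancer root rather than /enginframe. A minimal sketch of reading the renamed output with the AWS CLI, assuming a placeholder stack name (my-hpc-cluster); this is not part of the commit itself:

# Hypothetical check: fetch the renamed WebURL output (stack name is a placeholder).
aws cloudformation describe-stacks \
  --stack-name my-hpc-cluster \
  --query "Stacks[0].Outputs[?OutputKey=='WebURL'].OutputValue" \
  --output text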
40.install.monitoring.compute.sh

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

source /etc/parallelcluster/cfnconfig
compute_instance_type=$(ec2-metadata -t | awk '{print $2}')
gpu_instances="[pg][2-9].*\.[0-9]*[x]*large"

monitoring_dir_name="monitoring"
monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}"

set -x
set -e

installPreReq() {
    yum -y install docker golang-bin
    service docker start
    chkconfig docker on
    usermod -a -G docker $cfn_cluster_user

    # To be replaced with "yum -y install docker-compose" once the repository problem is fixed
    curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
    chmod +x /usr/local/bin/docker-compose
}

configureMonitoring() {

    # GPU instance types additionally get the NVIDIA container runtime and the GPU compose file
    if [[ $compute_instance_type =~ $gpu_instances ]]; then
        distribution=$(. /etc/os-release; echo $ID$VERSION_ID)
        curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo
        yum -y clean expire-cache
        yum -y install nvidia-docker2
        systemctl restart docker
        /usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/docker-compose.compute.gpu.yml" -p monitoring-compute up -d
    else
        /usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/docker-compose.compute.yml" -p monitoring-compute up -d
    fi
}

# main
# ----------------------------------------------------------------------------
main() {
    echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.compute.sh: START" >&2
    installPreReq
    configureMonitoring
    echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.compute.sh: STOP" >&2
}

main "$@"
40.install.monitoring.master.sh

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

source /etc/parallelcluster/cfnconfig
cfn_fsx_fs_id=$(cat /etc/chef/dna.json | grep \"cfn_fsx_fs_id\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
master_instance_id=$(ec2-metadata -i | awk '{print $2}')
cfn_max_queue_size=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "MaxSize"))[0].ParameterValue')
monitoring_dir_name="monitoring"
monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}"
chef_dna="/etc/chef/dna.json"
s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//")
grafana_password=$(aws secretsmanager get-secret-value --secret-id "${stack_name}" --query SecretString --output text --region "${cfn_region}")
NICE_ROOT=$(jq --arg default "${SHARED_FS_DIR}/nice" -r '.post_install.enginframe | if has("nice_root") then .nice_root else $default end' "${chef_dna}")


set -x
set -e

installPreReq() {
    yum -y install docker golang-bin
    service docker start
    chkconfig docker on
    usermod -a -G docker $cfn_cluster_user

    # To be replaced with "yum -y install docker-compose" once the repository problem is fixed
    curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
    chmod +x /usr/local/bin/docker-compose
}

saveClusterConfigLocally(){

    cluster_s3_bucket=$(cat "${chef_dna}" | grep \"cluster_s3_bucket\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
    cluster_config_s3_key=$(cat "${chef_dna}" | grep \"cluster_config_s3_key\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
    cluster_config_version=$(cat "${chef_dna}" | grep \"cluster_config_version\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
    log_group_names="\/aws\/parallelcluster\/$(echo ${stack_name} | cut -d "-" -f2-)"

    mkdir -p "${monitoring_home}/parallelcluster"
    aws s3api get-object --bucket $cluster_s3_bucket --key $cluster_config_s3_key --region $cfn_region --version-id $cluster_config_version "${monitoring_home}/parallelcluster/cluster-config.json"
}

installMonitoring(){

    aws s3 cp --recursive "${post_install_base}/monitoring" "${monitoring_home}" --region "${cfn_region}" || exit 1
    chown $cfn_cluster_user:$cfn_cluster_user -R "${monitoring_home}"
    chmod +x ${monitoring_home}/custom-metrics/*

    cp -rp ${monitoring_home}/custom-metrics/* /usr/local/bin/
    mv -f "${monitoring_home}/prometheus-slurm-exporter/slurm_exporter.service" /etc/systemd/system/

    cp -rp ${monitoring_home}/www/* "${NICE_ROOT}/enginframe/conf/tomcat/webapps/ROOT/"
}


configureMonitoring() {

    # Schedule the cost-metrics collectors
    (crontab -l -u $cfn_cluster_user; echo "*/1 * * * * /usr/local/bin/1m-cost-metrics.sh") | crontab -u $cfn_cluster_user -
    (crontab -l -u $cfn_cluster_user; echo "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh") | crontab -u $cfn_cluster_user -

    # Replace tokens in the Grafana dashboards and Prometheus configuration
    sed -i "s/_S3_BUCKET_/${s3_bucket}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
    sed -i "s/__FSX_ID__/${cfn_fsx_fs_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
    sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"

    sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/logs.json"
    sed -i "s/__LOG_GROUP__NAMES__/${log_group_names}/g" "${monitoring_home}/grafana/dashboards/logs.json"

    sed -i "s/__Application__/${stack_name}/g" "${monitoring_home}/prometheus/prometheus.yml"

    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/master-node-details.json"
    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-list.json"
    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-details.json"

    sed -i "s~__MONITORING_DIR__~${monitoring_home}~g" "${monitoring_home}/docker-compose/docker-compose.master.yml"
    sed -i "s~__GRAFANA_PASSWORD__~${grafana_password}~g" "${monitoring_home}/docker-compose/docker-compose.master.yml"


    # Download and build prometheus-slurm-exporter
    ##### Please note this software package is under GPLv3 License #####
    # More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE
    cd "${monitoring_home}"
    #FIXME: temporary
    rm -rf prometheus-slurm-exporter
    git clone https://github.com/vpenso/prometheus-slurm-exporter.git
    cd prometheus-slurm-exporter
    sed -i 's/NodeList,AllocMem,Memory,CPUsState,StateLong/NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:/' node.go
    GOPATH=/root/go-modules-cache HOME=/root go mod download
    GOPATH=/root/go-modules-cache HOME=/root go build
    mv -f "${monitoring_home}/prometheus-slurm-exporter/prometheus-slurm-exporter" /usr/bin/prometheus-slurm-exporter
}


startMonitoringDaemons() {

    /usr/local/bin/docker-compose --env-file /etc/parallelcluster/cfnconfig -f "${monitoring_home}/docker-compose/docker-compose.master.yml" -p monitoring-master up -d
    systemctl daemon-reload
    systemctl enable slurm_exporter
    systemctl start slurm_exporter

}

# main
# ----------------------------------------------------------------------------
main() {
    echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.master.sh: START" >&2
    installPreReq
    saveClusterConfigLocally
    installMonitoring
    configureMonitoring
    startMonitoringDaemons
    echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.master.sh: STOP" >&2
}

main "$@"
Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
#!/bin/bash
#
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
#

# Source the AWS ParallelCluster profile
. /etc/parallelcluster/cfnconfig

export AWS_DEFAULT_REGION=$cfn_region
aws_region_long_name=$(python /usr/local/bin/aws-region.py $cfn_region)
aws_region_long_name=${aws_region_long_name/Europe/EU}

masterInstanceType=$(ec2-metadata -t | awk '{print $2}')
masterInstanceId=$(ec2-metadata -i | awk '{print $2}')
s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//")
s3_size_gb=$(echo "$(aws s3api list-objects --bucket $s3_bucket --output json --query "[sum(Contents[].Size)]" | sed -n 2p | tr -d ' ') / 1024 / 1024 / 1024" | bc)


# Retrieve the S3 cost: pick the pricing tier (GB-month end range) that applies to the bucket size
if [[ $s3_size_gb -le 51200 ]]; then
    s3_range=51200
elif [[ $s3_size_gb -le 512000 ]]; then
    s3_range=512000
else
    s3_range="Inf"
fi

####################### S3 #########################

s3_cost_gb_month=$(aws --region us-east-1 pricing get-products \
    --service-code AmazonS3 \
    --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
              'Type=TERM_MATCH,Field=storageClass,Value=General Purpose' \
    --query 'PriceList[0]' --output text \
    | jq -r --arg endRange $s3_range '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[].value | select(.endRange==$endRange).pricePerUnit.USD')

# Convert the GB-month price to an hourly figure (720 = 30 days x 24 hours)
s3=$(echo "scale=2; $s3_cost_gb_month * $s3_size_gb / 720" | bc)
echo "s3_cost $s3" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost


####################### Master #########################
master_node_h_price=$(aws pricing get-products \
    --region us-east-1 \
    --service-code AmazonEC2 \
    --filters 'Type=TERM_MATCH,Field=instanceType,Value='$masterInstanceType \
              'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
              'Type=TERM_MATCH,Field=preInstalledSw,Value=NA' \
              'Type=TERM_MATCH,Field=operatingSystem,Value=Linux' \
              'Type=TERM_MATCH,Field=tenancy,Value=Shared' \
              'Type=TERM_MATCH,Field=capacitystatus,Value=UnusedCapacityReservation' \
    --output text \
    --query 'PriceList' \
    | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')

echo "master_node_cost $master_node_h_price" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost


####################### FSx #########################
fsx_id=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region \
    | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "FSXOptions"))[0].ParameterValue' \
    | awk -F "," '{print $2}')
fsx_summary=$(aws fsx describe-file-systems --region $cfn_region --file-system-ids $fsx_id)
fsx_size_gb=$(echo $fsx_summary | jq -r '.FileSystems[0].StorageCapacity')
fsx_type=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.DeploymentType')
fsx_throughput=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.PerUnitStorageThroughput')

if [[ $fsx_type = "SCRATCH_2" ]] || [[ $fsx_type = "SCRATCH_1" ]]; then
    fsx_cost_gb_month=$(aws pricing get-products \
        --region us-east-1 \
        --service-code AmazonFSx \
        --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
                  'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \
                  'Type=TERM_MATCH,Field=throughputCapacity,Value=N/A' \
        --output text \
        --query 'PriceList' \
        | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')

elif [ $fsx_type = "PERSISTENT_1" ]; then
    fsx_cost_gb_month=$(aws pricing get-products \
        --region us-east-1 \
        --service-code AmazonFSx \
        --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
                  'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \
                  'Type=TERM_MATCH,Field=throughputCapacity,Value='$fsx_throughput \
        --output text \
        --query 'PriceList' \
        | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')

else
    fsx_cost_gb_month=0
fi

fsx=$(echo "scale=2; $fsx_cost_gb_month * $fsx_size_gb / 720" | bc)
echo "fsx_cost $fsx" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost


####################### EBS #########################
#parametrize:
ebs_volume_total_cost=0
ebs_volume_ids=$(aws ec2 describe-instances --instance-ids $masterInstanceId \
    | jq -r '.Reservations | to_entries[].value | .Instances | to_entries[].value | .BlockDeviceMappings | to_entries[].value | .Ebs.VolumeId')

for ebs_volume_id in $ebs_volume_ids
do
    ebs_volume_type=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.VolumeType')
    #ebs_volume_iops=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.Iops')
    ebs_volume_size=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.Size')

    ebs_cost_gb_month=$(aws --region us-east-1 pricing get-products \
        --service-code AmazonEC2 \
        --query 'PriceList' \
        --output text \
        --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
                  'Type=TERM_MATCH,Field=productFamily,Value=Storage' \
                  'Type=TERM_MATCH,Field=volumeApiName,Value='$ebs_volume_type \
        | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')

    ebs_volume_cost=$(echo "scale=2; $ebs_cost_gb_month * $ebs_volume_size / 720" | bc)
    ebs_volume_total_cost=$(echo "scale=2; $ebs_volume_total_cost + $ebs_volume_cost" | bc)
done

echo "ebs_master_cost $ebs_volume_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
