Skip to content
This repository was archived by the owner on Apr 18, 2024. It is now read-only.

Commit 6d404f5

Browse files
committed
Added BDC export script
1 parent 782a13c commit 6d404f5

File tree

1 file changed

+359
-0
lines changed

1 file changed

+359
-0
lines changed

scripts/exportBDC.py

Lines changed: 359 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,359 @@
1+
#!/usr/bin/env python
2+
#
3+
# Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
4+
#
5+
# Utility script to export user data from Oracle Big Data Service.
6+
# This script exports Hive Metadata, Zeppelin Notebooks, Service Configuration and Version data
7+
# to a tar file.
8+
#
9+
# This must be run as root user on Ambari Host. Hive and Zeppelin Service must be in
10+
# stopped state, otherwise, script will exit.
11+
#
12+
# Usage - exportBDC.py <Config File>
13+
# Run this script on Ambari host as root user.
14+
#
15+
16+
import json
17+
import urllib2, base64
18+
import os, tarfile, shutil, glob, socket, sys, subprocess
19+
from contextlib import closing
20+
import datetime, logging as log
21+
22+
# ---------------------------------------------------------------------------
# Script setup: logging, CLI argument handling and module-level state.
# ---------------------------------------------------------------------------

LOG_FORMAT = '%(asctime)s %(levelname)-8s %(message)s'

if len(sys.argv) < 2:
    log.basicConfig(format=LOG_FORMAT, stream=sys.stdout, level=log.INFO)
    log.error("Usage: exportBDC.py <Config File> [-v]")
    log.error("Run this script on Ambari host as root user.")
    log.error("Use -v for more detailed log")
    sys.exit(1)  # was exit(0): a usage error must report failure to the shell

# -v anywhere on the command line enables debug-level logging.
log_level = log.DEBUG if "-v" in sys.argv else log.INFO
log.basicConfig(format=LOG_FORMAT, stream=sys.stdout, level=log_level)

ambari_ip = socket.gethostname()  # script is documented to run on the Ambari host
config_path = sys.argv[1]

# Sink for noisy subprocess output; swap in subprocess.STDOUT when
# debugging the hive export.
FNULL = open(os.devnull, 'w')

ambari_port = '8080'
# NOTE(review): https on port 8080 is unusual for Ambari -- confirm TLS is
# actually enabled on this endpoint.
ambari_url = 'https://' + ambari_ip + ':' + ambari_port + '/api/v1/clusters/'

# Module-level state, populated by readConfig()/loadConfig().
config_json = json.loads("{}")
output_path = ""
cluster_name = ""
services = []
components = {}
temp_config_tars = ""
temp_extract_path = ""
final_tarball = ""
51+
52+
53+
def readConfig():
    """Load the JSON config file named on the command line into config_json.

    Exits with a non-zero status when the file does not exist.
    """
    global config_json
    if os.path.exists(config_path):
        with open(config_path) as data_file:
            config_json = json.load(data_file)
    else:
        log.error("Config file, " + config_path + " not found...")
        sys.exit(1)  # was exit(0): a missing config file is an error
61+
62+
63+
def loadConfig():
    """Prepare export directories and derive cluster-wide globals.

    Reads export_dir from config_json, creates a clean per-cluster scratch
    directory under it, computes the timestamped final tarball path, and
    caches the cluster's service list.
    """
    global output_path, cluster_name, temp_extract_path, final_tarball, services

    output_path = config_json["export_dir"]
    # os.makedirs instead of shelling out to 'mkdir -p'.
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    cluster_name = getClusterName()
    log.debug("Cluster Name - " + cluster_name)

    # Always start from an empty <export_dir>/<cluster> scratch directory.
    temp_extract_path = output_path + "/" + cluster_name
    if os.path.exists(temp_extract_path):
        shutil.rmtree(temp_extract_path)
    os.mkdir(temp_extract_path)

    timestamp = datetime.datetime.now().strftime('%d_%b_%Y_%H_%M_%S')
    final_tarball = """%s/export_%s_%s.tar.gz""" % (output_path, cluster_name, timestamp)

    services = getServices()
    log.debug("List of services found in the cluster - " + ','.join(map(str, services)))

    log.info("Exporting Oracle Big Data Cloud Service : " + cluster_name)
    log.info("This may take a few minutes to complete.\n")
84+
85+
def ambariApiRes(url):
    """GET an Ambari REST endpoint and return the parsed JSON response.

    Authenticates with HTTP basic auth taken from config_json. On any HTTP
    or connection error an empty dict is returned so callers can degrade
    gracefully instead of crashing.
    """
    base64string = base64.encodestring('%s:%s' % (config_json["ambari_username"], config_json["ambari_password"])).replace('\n', '')
    req = urllib2.Request(url)
    req.add_header('X-Requested-By', 'ambari')
    req.add_header("Authorization", "Basic %s" % base64string)

    try:
        response = urllib2.urlopen(req).read()
    except urllib2.URLError:
        # URLError is the base of HTTPError, so this also covers plain
        # connection failures, which previously escaped and killed the script.
        log.debug("Ambari Rest Api failed for - " + url)
        response = "{}"

    return json.loads(response)
99+
100+
101+
def isServiceStopped(service_name):
    """Return True when the named Ambari service is stopped (state INSTALLED)."""
    url = """%s%s/services/%s?fields=ServiceInfo/state""" % (ambari_url, cluster_name, service_name)

    log.debug("""Url to check the status of service, %s - %s""" % (service_name, url))

    credentials = '%s:%s' % (config_json["ambari_username"], config_json["ambari_password"])
    auth_token = base64.encodestring(credentials).replace('\n', '')
    request = urllib2.Request(url)
    request.add_header('X-Requested-By', 'ambari')
    request.add_header("Authorization", "Basic %s" % auth_token)

    state_json = json.loads(urllib2.urlopen(request).read())

    # Ambari reports a stopped-but-installed service as INSTALLED.
    return state_json["ServiceInfo"]["state"] == "INSTALLED"
118+
119+
120+
def preCheck():
    """Abort the export unless Hive and Zeppelin are stopped.

    Both services must be down before their data can be exported safely;
    exits with a non-zero status when either is still running.
    """
    servicesToBeChecked = ["HIVE", "ZEPPELIN"]

    # Original debug message was truncated to just "Performing ".
    log.debug("Performing pre-check of service states")
    for service in servicesToBeChecked:
        # Checking if a service is stopped
        if not isServiceStopped(service):
            log.error("""%s service is not in stopped state in Ambari. Please stop it using Ambari and rerun""" % (service))
            sys.exit(1)  # was exit(0): a failed precondition is an error
129+
130+
131+
def ambari_config_download(url):
    """GET an Ambari client-config download URL and return the open response.

    Returns None when the request fails so the caller can skip the
    component instead of aborting the whole export.
    """
    base64string = base64.encodestring('%s:%s' % (config_json["ambari_username"], config_json["ambari_password"])).replace('\n', '')
    req = urllib2.Request(url)
    req.add_header('X-Requested-By', 'ambari')
    req.add_header("Authorization", "Basic %s" % base64string)

    try:
        response = urllib2.urlopen(req)
    except urllib2.URLError:
        # URLError also covers HTTPError; previously a plain connection
        # failure escaped this handler and killed the script.
        response = None

    return response
145+
146+
147+
def getClusterName():
    """Return the name of the (single) cluster managed by this Ambari server."""
    cluster_info = ambariApiRes(ambari_url)
    return cluster_info["items"][0]['Clusters']['cluster_name']
152+
153+
154+
def getServices():
    """Append every service name in the cluster to the module-level list and return it."""
    response = ambariApiRes(ambari_url + cluster_name + '/services')
    for entry in response['items']:
        services.append(entry['ServiceInfo']['service_name'])
    return services
160+
161+
162+
def populateComponents():
    """Build and return the service -> [component names] map for all services.

    Populates the module-level `components` dict from the Ambari REST API.
    """
    for service in services:
        url = ambari_url + cluster_name + '/services/' + service + '/components'
        responseJson = ambariApiRes(url)
        for item in responseJson["items"]:
            # setdefault replaces the deprecated dict.has_key() check-then-append.
            components.setdefault(service, []).append(item["ServiceComponentInfo"]["component_name"])
    return components
174+
175+
176+
def downloadFile(fileName, resp):
    """Write the body of *resp* (an open urllib2 response) to fileName.

    Opens the file in binary mode: the payloads are .tar.gz archives, and
    text mode ("w") would corrupt them on platforms that translate newlines.
    """
    with open(fileName, "wb") as local_file:
        local_file.write(resp.read())
179+
180+
181+
def getConfigs():
    """Download a client-config tarball for every known component.

    Recreates the <export_dir>/config_tars scratch directory, then fetches
    each component's config archive; components without a downloadable
    config are logged and skipped.
    """
    global temp_config_tars
    # Always start from an empty scratch directory.
    temp_config_tars = output_path + "/config_tars/"
    if os.path.isdir(temp_config_tars):
        shutil.rmtree(temp_config_tars)
    os.mkdir(temp_config_tars)

    for service in components:
        for component in components[service]:
            url = ambari_url + cluster_name + '/services/' + service + '/components/' + component + "?format=client_config_tar"
            resp = ambari_config_download(url)
            fileName = temp_config_tars + "/" + component + "-configs.tar.gz"
            if resp is not None:  # idiomatic None test (was != None)
                downloadFile(fileName, resp)
                log.debug("Configuration is downloaded to " + fileName + " ...")
            else:
                log.debug("No config found for service, " + service + " & component, " + component)
201+
202+
203+
def prepareForPackaging():
    """Extract each downloaded config tarball into <extract>/config/<component>."""
    temp_configs_path = temp_extract_path + "/" + "config"
    if os.path.exists(temp_configs_path):
        shutil.rmtree(temp_configs_path)
    os.mkdir(temp_configs_path)
    # Loop variable renamed: the original shadowed the builtin 'file'.
    for tar_path in glob.glob(temp_config_tars + "/*.tar.gz"):
        name = os.path.basename(tar_path).split("-configs.tar.gz")[0]
        # closing() guarantees the tarfile is released even if extractall fails.
        with closing(tarfile.open(tar_path)) as tf:
            tf.extractall(path=temp_configs_path + "/" + name)
    # Delete the temp config tars directory
    if os.path.exists(temp_config_tars):
        shutil.rmtree(temp_config_tars)
216+
217+
218+
def package():
    """Bundle the whole scratch directory into the final gzipped tarball."""
    log.debug("Creating the target tarball, " + final_tarball)
    with closing(tarfile.open(final_tarball, "w:gz")) as archive:
        archive.add(temp_extract_path, arcname='.')
222+
223+
224+
def cleanup():
    """Remove the scratch directory once the final tarball has been written."""
    log.debug("Perform final cleanup...")
    shutil.rmtree(temp_extract_path)
227+
228+
229+
def backupHDPConfigs():
    """Export the service-configuration section of the backup.

    Discovers components, downloads each client config tarball, and
    unpacks them into the scratch directory for packaging.
    """
    log.info("")
    printDottedLine()
    log.info("Configuration")
    printDottedLine()
    log.info("Exporting Service Configuration data ....")
    populateComponents()
    getConfigs()
    prepareForPackaging()
    # Fixed duplicated word ("exporting Exporting") in the completion message.
    log.info("Completed exporting Service Configuration data.")
239+
240+
241+
def getVersions():
    """Export the HDP stack component versions to stack/StackVersions.txt.

    Shells out to curl (rather than using urllib2) so the self-signed
    Ambari certificate can be ignored with -k.
    """
    log.info("")
    printDottedLine()
    log.info("Stack component versions")
    printDottedLine()
    log.info("Exporting stack component versions....")
    services_list = ",".join(services)
    version_file_path = temp_extract_path + "/stack"
    version_file = version_file_path + "/StackVersions.txt"
    if os.path.isdir(version_file_path):
        shutil.rmtree(version_file_path)
    os.mkdir(version_file_path)
    temp_file = temp_extract_path + "/StackVersions_temp"

    # SECURITY NOTE(review): the Ambari password appears on the curl command
    # line and is visible in the process list while this runs.
    command = """ curl -o %s -u %s:%s -1 -s -k 'https://%s:%s/api/v1/stacks/HDP/versions/2.4/services?StackServices/service_name.in(%s)&fields=StackServices/*' """ % (temp_file, config_json["ambari_username"], config_json["ambari_password"], ambari_ip, ambari_port, services_list)
    log.debug("Generated command to get the stack versions, " + command)
    subprocess.call(command, shell=True)

    # with-blocks close both files; the original leaked both handles.
    with open(temp_file, "r") as f:
        responseJson = json.loads(f.read())

    lines = []
    for service in responseJson["items"]:
        lines.append(service["StackServices"]["service_name"] + " : " + service["StackServices"]["service_version"] + "\n")

    with open(version_file, "w") as out:
        out.write("".join(lines))

    log.debug("Cleaning temporary files created for Stack component versions Export...")
    if os.path.exists(temp_file):
        os.remove(temp_file)
    log.info("Completed exporting stack component versions.")
273+
274+
275+
def backupZeppelinNotes():
    """Export Zeppelin notebooks from HDFS into <extract>/zeppelin/notebook."""
    log.info("")
    printDottedLine()
    log.info("Zeppelin Notebooks")
    printDottedLine()
    log.info("Exporting Zeppelin Notebooks....")

    temp_zeppelin_notes = temp_extract_path + "/zeppelin/notebook"
    if os.path.isdir(temp_zeppelin_notes):
        shutil.rmtree(temp_zeppelin_notes)
    # The copyToLocal below lands in the hdfs user's home directory
    # (/var/lib/hadoop-hdfs); clear any stale copy first.
    if os.path.isdir("/var/lib/hadoop-hdfs/notebook"):
        shutil.rmtree("/var/lib/hadoop-hdfs/notebook")

    log.debug("Taking the zeppelin notebooks from hdfs://user/zeppelin/notebook notebook")
    command = "su - hdfs -c 'hdfs dfs -copyToLocal /user/zeppelin/notebook notebook'"
    subprocess.call(command, shell=True)

    log.debug("Cleaning temporary files created for Zeppelin Notebook Export...")
    shutil.copytree("/var/lib/hadoop-hdfs/notebook", temp_zeppelin_notes)
    shutil.rmtree("/var/lib/hadoop-hdfs/notebook")
    log.info("Completed exporting Zeppelin Notebooks.")
296+
297+
298+
def getHiveMetaDBName():
    """Return the Hive metastore database name from the current Ambari config.

    Falls back to "hive" when the lookup fails or the property is absent.
    """
    lookup = "ambari.hive.db.schema.name"
    url = """%s%s/configurations/service_config_versions?service_name=HIVE""" % (ambari_url, cluster_name)
    log.debug("Url to get the hive metastore db name - " + url)

    try:
        response_json = ambariApiRes(url)
        for config in response_json["items"]:
            if config["is_current"] == True:
                for configuration in config["configurations"]:
                    if lookup in configuration["properties"]:
                        log.debug("Hive metastore DBName is - " + configuration["properties"][lookup])
                        return configuration["properties"][lookup]
    except Exception:
        # Narrowed from a bare except: so SystemExit/KeyboardInterrupt
        # still propagate.
        log.error("Failed to get hive metastore db name from Ambari. hive is the default metastore db name")
    # On failing to find return hive as default
    return "hive"
315+
316+
317+
def backupHiveMetadata():
    """Dump the Hive metastore MySQL database into <extract>/hive_metadata."""
    log.info("")
    printDottedLine()
    log.info("Hive metadata")
    printDottedLine()
    log.info("Exporting Hive metadata....")

    hive_metastore_db = getHiveMetaDBName()

    dump_dir = temp_extract_path + "/hive_metadata"
    if os.path.isdir(dump_dir):
        shutil.rmtree(dump_dir)
    os.mkdir(dump_dir)

    temp_extract_hive_file = dump_dir + "/hive_metadata_dump.sql"
    command = """mysqldump %s > %s""" % (hive_metastore_db, temp_extract_hive_file)
    subprocess.call(command, shell=True)

    log.info("Completed exporting Hive metadata.")
334+
335+
def printDottedLine():
    """Log a horizontal separator line used between report sections."""
    log.info("-------------------------------------------------------")
337+
338+
339+
# ---------------------------------------------------------------------------
# Main flow: banner, configuration, precondition check, each export stage,
# then packaging and final cleanup. Order matters: loadConfig() must run
# before any stage that uses the scratch directory or cluster name.
# ---------------------------------------------------------------------------
log.info("")
printDottedLine()
log.info("Utility to export metadata from Big Data Cloud Service")
printDottedLine()
log.info("")

# Read and apply the JSON config; exits if Ambari is unreachable or the
# config file is missing.
readConfig()
loadConfig()

# Verify Hive and Zeppelin are stopped before touching their data.
preCheck()

# The four export stages write into the per-cluster scratch directory.
backupHDPConfigs()
backupZeppelinNotes()
backupHiveMetadata()
getVersions()

# Bundle everything into the final tarball and remove the scratch area.
package()
cleanup()
log.info("")
log.info("")
log.info("""Completed export from Oracle Big Data Cloud Service : %s to %s."""%(cluster_name, final_tarball))

0 commit comments

Comments
 (0)