1
1
#!/usr/bin/env python
2
- # -*- coding: utf-8 -*--
3
2
4
3
# Copyright (c) 2024 Oracle and/or its affiliates.
5
4
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
6
5
7
- import argparse
8
6
import logging
9
7
import os
10
8
import shutil
11
9
import sys
12
10
import tempfile
13
- import time
14
- from string import Template
15
- from typing import Any , Dict , List , Tuple
16
- import pandas as pd
17
- from ads .opctl import logger
18
- import oracledb
11
+ from typing import List , Union
19
12
20
13
import fsspec
21
- import yaml
22
- from typing import Union
14
+ import oracledb
15
+ import pandas as pd
23
16
17
+ from ads .common .object_storage_details import ObjectStorageDetails
24
18
from ads .opctl import logger
19
+ from ads .opctl .operator .common .operator_config import OutputDirectory
25
20
from ads .opctl .operator .lowcode .common .errors import (
26
- InputDataError ,
27
21
InvalidParameterError ,
28
- PermissionsError ,
29
- DataMismatchError ,
30
22
)
31
- from ads .opctl .operator .common .operator_config import OutputDirectory
32
- from ads .common .object_storage_details import ObjectStorageDetails
33
23
from ads .secrets import ADBSecretKeeper
34
24
35
25
36
26
def call_pandas_fsspec (pd_fn , filename , storage_options , ** kwargs ):
37
- if fsspec .utils .get_protocol (filename ) == "file" :
38
- return pd_fn ( filename , ** kwargs )
39
- elif fsspec . utils . get_protocol ( filename ) in ["http" , "https" ]:
27
+ if fsspec .utils .get_protocol (filename ) == "file" or fsspec . utils . get_protocol (
28
+ filename
29
+ ) in ["http" , "https" ]:
40
30
return pd_fn (filename , ** kwargs )
41
31
42
32
storage_options = storage_options or (
@@ -48,7 +38,7 @@ def call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs):
48
38
49
39
def load_data (data_spec , storage_options = None , ** kwargs ):
50
40
if data_spec is None :
51
- raise InvalidParameterError (f "No details provided for this data source." )
41
+ raise InvalidParameterError ("No details provided for this data source." )
52
42
filename = data_spec .url
53
43
format = data_spec .format
54
44
columns = data_spec .columns
@@ -67,7 +57,7 @@ def load_data(data_spec, storage_options=None, **kwargs):
67
57
if not format :
68
58
_ , format = os .path .splitext (filename )
69
59
format = format [1 :]
70
- if format in ["json" , "clipboard" , "excel" , "csv" , "feather" , "hdf" ]:
60
+ if format in ["json" , "clipboard" , "excel" , "csv" , "feather" , "hdf" , "parquet" ]:
71
61
read_fn = getattr (pd , f"read_{ format } " )
72
62
data = call_pandas_fsspec (
73
63
read_fn , filename , storage_options = storage_options
@@ -84,19 +74,31 @@ def load_data(data_spec, storage_options=None, **kwargs):
84
74
with tempfile .TemporaryDirectory () as temp_dir :
85
75
if vault_secret_id is not None :
86
76
try :
87
- with ADBSecretKeeper .load_secret (vault_secret_id , wallet_dir = temp_dir ) as adwsecret :
88
- if 'wallet_location' in adwsecret and 'wallet_location' not in connect_args :
89
- shutil .unpack_archive (adwsecret ["wallet_location" ], temp_dir )
90
- connect_args ['wallet_location' ] = temp_dir
91
- if 'user_name' in adwsecret and 'user' not in connect_args :
92
- connect_args ['user' ] = adwsecret ['user_name' ]
93
- if 'password' in adwsecret and 'password' not in connect_args :
94
- connect_args ['password' ] = adwsecret ['password' ]
95
- if 'service_name' in adwsecret and 'service_name' not in connect_args :
96
- connect_args ['service_name' ] = adwsecret ['service_name' ]
77
+ with ADBSecretKeeper .load_secret (
78
+ vault_secret_id , wallet_dir = temp_dir
79
+ ) as adwsecret :
80
+ if (
81
+ "wallet_location" in adwsecret
82
+ and "wallet_location" not in connect_args
83
+ ):
84
+ shutil .unpack_archive (
85
+ adwsecret ["wallet_location" ], temp_dir
86
+ )
87
+ connect_args ["wallet_location" ] = temp_dir
88
+ if "user_name" in adwsecret and "user" not in connect_args :
89
+ connect_args ["user" ] = adwsecret ["user_name" ]
90
+ if "password" in adwsecret and "password" not in connect_args :
91
+ connect_args ["password" ] = adwsecret ["password" ]
92
+ if (
93
+ "service_name" in adwsecret
94
+ and "service_name" not in connect_args
95
+ ):
96
+ connect_args ["service_name" ] = adwsecret ["service_name" ]
97
97
98
98
except Exception as e :
99
- raise Exception (f"Could not retrieve database credentials from vault { vault_secret_id } : { e } " )
99
+ raise Exception (
100
+ f"Could not retrieve database credentials from vault { vault_secret_id } : { e } "
101
+ )
100
102
101
103
con = oracledb .connect (** connect_args )
102
104
if table_name is not None :
@@ -105,11 +107,11 @@ def load_data(data_spec, storage_options=None, **kwargs):
105
107
data = pd .read_sql (sql , con )
106
108
else :
107
109
raise InvalidParameterError (
108
- f "Database `connect_args` provided without sql query or table name. Please specify either `sql` or `table_name`."
110
+ "Database `connect_args` provided without sql query or table name. Please specify either `sql` or `table_name`."
109
111
)
110
112
else :
111
113
raise InvalidParameterError (
112
- f "No filename/url provided, and no connect_args provided. Please specify one of these if you want to read data from a file or a database respectively."
114
+ "No filename/url provided, and no connect_args provided. Please specify one of these if you want to read data from a file or a database respectively."
113
115
)
114
116
if columns :
115
117
# keep only these columns, done after load because only CSV supports stream filtering
@@ -232,7 +234,7 @@ def human_time_friendly(seconds):
232
234
accumulator .append (
233
235
"{} {}{}" .format (int (amount ), unit , "" if amount == 1 else "s" )
234
236
)
235
- accumulator .append ("{} secs" . format ( round (seconds , 2 )) )
237
+ accumulator .append (f" { round (seconds , 2 )} secs" )
236
238
return ", " .join (accumulator )
237
239
238
240
@@ -248,9 +250,7 @@ def find_output_dirname(output_dir: OutputDirectory):
248
250
unique_output_dir = f"{ output_dir } _{ counter } "
249
251
counter += 1
250
252
logger .warn (
251
- "Since the output directory was not specified, the output will be saved to {} directory." .format (
252
- unique_output_dir
253
- )
253
+ f"Since the output directory was not specified, the output will be saved to { unique_output_dir } directory."
254
254
)
255
255
return unique_output_dir
256
256
0 commit comments