Commit 4773b29

Merge pull request #131 from zypp-io/development
Release 1.0.0
2 parents 715aac3 + 349d1e9 commit 4773b29

11 files changed: +105 -67 lines changed

.pre-commit-config.yaml

Lines changed: 11 additions & 16 deletions
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
+    rev: v4.5.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
@@ -12,21 +12,16 @@ repos:
       - id: name-tests-test
        args: [--pytest-test-first]
      - id: requirements-txt-fixer
-  - repo: https://github.com/pycqa/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        args: ["--statistics", "--count", "--max-complexity=10", "--max-line-length=120", "--per-file-ignore=__init__.py: F401"]
-  - repo: https://github.com/psf/black
-    rev: 22.3.0
-    hooks:
-      - id: black
-        args: [--line-length=120]
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-        args: ["--profile", "black", --line-length=120]
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.4.4
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: [ --fix ]
+      # Run the formatter.
+      - id: ruff-format
+        args: [ --line-length=120 ]
   - repo: local
     hooks:
       - id: check-requirements
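The flake8, black, and isort hooks above are consolidated into Ruff, which both lints and formats. Below is a minimal sketch, not part of the commit, of running the same two commands the new hooks invoke, driven from Python for a quick local check; an installed ruff binary and "." as the project root are assumptions.

# Hypothetical local equivalent of the new pre-commit hooks; assumes ruff is installed.
import subprocess

# Mirrors the `ruff` hook with args: [ --fix ]  (lint and apply auto-fixes)
subprocess.run(["ruff", "check", "--fix", "."], check=True)

# Mirrors the `ruff-format` hook with args: [ --line-length=120 ]
subprocess.run(["ruff", "format", "--line-length", "120", "."], check=True)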

CHANGELOG.md

Lines changed: 45 additions & 0 deletions
@@ -15,3 +15,48 @@ Changelog
 # 0.5.2
 
 - Removed the required environment variables `ls_sql_name` and `ls_blob_name` since it's standard.
+
+# 0.5.3
+
+- Added `dtype` argument, so users can specify their own SqlAlchemy dtype for certain columns.
+
+# 0.5.4
+
+- Set requirements to specific version
+
+# 0.6.0
+
+- Always create linked services for blob and sql. This way the user can switch source blob storages and sink databases easier.
+- Use environment variable AZURE_STORAGE_CONNECTION_STRING for parquet upload
+- If parquet upload is a single file, place it in the root of the folder
+
+# 0.7.0
+
+- Add upsert for parquet files
+- Logging level to debug for query
+- Fix bugs and add requirements checks
+- Fix bug for staging schema with upsert
+- Add support all pandas int dtypes
+- Add customisable container name
+
+# 0.8.0
+
+- Upgrade dependency packages
+- Fix failing pipeline because of removed staging schema
+
+# 0.9.0
+
+- Add more dtypes
+- Upgrade package version
+- Fix bug when dataframe is empty
+
+# 0.9.1
+
+- Fix bug categorical dtype
+- Make pyodbc driver dynamic
+
+# 1.0.0
+
+- Upgrade packages and set minimal versions
+- Fix code to work with upgraded packages
+- Export to parquet on storage instead of csv

df_to_azure/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 import logging
 
-from .export import df_to_azure
+from .export import df_to_azure as df_to_azure
 
-__version__ = "0.9.1"
+__version__ = "1.0.0"
 
 logging.basicConfig(
     format="%(asctime)s.%(msecs)03d [%(levelname)-5s] [%(name)s] - %(message)s",

df_to_azure/adf.py

Lines changed: 10 additions & 17 deletions
@@ -19,6 +19,7 @@
     Factory,
     LinkedServiceReference,
     LinkedServiceResource,
+    ParquetFormat,
     PipelineResource,
     SecureString,
     SqlServerStoredProcedureActivity,
@@ -173,29 +174,20 @@ def create_linked_service_blob(self):
     def create_input_blob(self):
         ds_name = f"BLOB_dftoazure_{self.table_name}"
 
-        ds_ls = LinkedServiceReference(reference_name=self.ls_blob_name)
+        ds_ls = LinkedServiceReference(type="LinkedServiceReference", reference_name=self.ls_blob_name)
         ds_azure_blob = AzureBlobDataset(
             linked_service_name=ds_ls,
             folder_path=f"dftoazure/{self.table_name}",
-            file_name=f"{self.table_name}.csv",
-            format={
-                "type": "TextFormat",
-                "columnDelimiter": "^",
-                "rowDelimiter": "\n",
-                "treatEmptyAsNull": "true",
-                "skipLineCount": 0,
-                "firstRowAsHeader": "true",
-                "quoteChar": '"',
-            },
+            file_name=f"{self.table_name}.parquet",
+            format=ParquetFormat(),
         )
         ds_azure_blob = DatasetResource(properties=ds_azure_blob)
         self.adf_client.datasets.create_or_update(self.rg_name, self.df_name, ds_name, ds_azure_blob)
 
     def create_output_sql(self):
-
         ds_name = f"SQL_dftoazure_{self.table_name}"
 
-        ds_ls = LinkedServiceReference(reference_name=self.ls_sql_name)
+        ds_ls = LinkedServiceReference(type="LinkedServiceReference", reference_name=self.ls_sql_name)
         data_azure_sql = AzureSqlTableDataset(
             linked_service_name=ds_ls,
             table_name=f"{self.schema}.{self.table_name}",
@@ -204,7 +196,6 @@ def create_output_sql(self):
         self.adf_client.datasets.create_or_update(self.rg_name, self.df_name, ds_name, data_azure_sql)
 
     def create_pipeline(self, pipeline_name):
-
         activities = [self.create_copy_activity()]
         # If user wants to upsert, we append stored procedure activity to pipeline.
         if self.method == "upsert":
@@ -226,8 +217,8 @@ def create_copy_activity(self):
         blob_source = BlobSource()
         sql_sink = SqlSink()
 
-        ds_in_ref = DatasetReference(reference_name=f"BLOB_dftoazure_{self.table_name}")
-        ds_out_ref = DatasetReference(reference_name=f"SQL_dftoazure_{self.table_name}")
+        ds_in_ref = DatasetReference(type="DatasetReference", reference_name=f"BLOB_dftoazure_{self.table_name}")
+        ds_out_ref = DatasetReference(type="DatasetReference", reference_name=f"SQL_dftoazure_{self.table_name}")
         copy_activity = CopyActivity(
             name=act_name,
             inputs=[ds_in_ref],
@@ -243,7 +234,9 @@ def stored_procedure_activity(self):
         dependency = ActivityDependency(
             activity=f"Copy {self.table_name} to SQL", dependency_conditions=[dependency_condition]
         )
-        linked_service_reference = LinkedServiceReference(reference_name=self.ls_sql_name)
+        linked_service_reference = LinkedServiceReference(
+            type="LinkedServiceReference", reference_name=self.ls_sql_name
+        )
         activity = SqlServerStoredProcedureActivity(
             stored_procedure_name=f"UPSERT_{self.table_name}",
             name="UPSERT procedure",
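In azure-mgmt-datafactory 7.x the reference models expect their discriminator to be passed explicitly, and the blob dataset now uses ParquetFormat instead of a TextFormat dict. The following is a minimal sketch of that pattern drawn from the diff above, not the package's full ADF class; the client, resource group, factory, linked service, and table names are hypothetical placeholders.

# Sketch only: builds a Parquet blob dataset with explicit `type` discriminators,
# mirroring the change in create_input_blob() above. All arguments are placeholders.
from azure.mgmt.datafactory.models import (
    AzureBlobDataset,
    DatasetResource,
    LinkedServiceReference,
    ParquetFormat,
)


def create_parquet_blob_dataset(adf_client, rg_name, df_name, ls_blob_name, table_name):
    # Newer model versions want the discriminator passed explicitly.
    ds_ls = LinkedServiceReference(type="LinkedServiceReference", reference_name=ls_blob_name)
    dataset = AzureBlobDataset(
        linked_service_name=ds_ls,
        folder_path=f"dftoazure/{table_name}",
        file_name=f"{table_name}.parquet",
        format=ParquetFormat(),  # replaces the old TextFormat/CSV settings
    )
    adf_client.datasets.create_or_update(
        rg_name, df_name, f"BLOB_dftoazure_{table_name}", DatasetResource(properties=dataset)
    )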

df_to_azure/db.py

Lines changed: 2 additions & 3 deletions
@@ -49,11 +49,11 @@ def create_merge_query(self):
         """
         logging.debug(query)
 
-        return query
+        return text(query)
 
     def drop_procedure(self):
         query = f"DROP PROCEDURE IF EXISTS [UPSERT_{self.table_name}];"
-        return query
+        return text(query)
 
     def create_stored_procedure(self):
         with auth_azure() as con:
@@ -73,7 +73,6 @@ def create_stored_procedure(self):
 
 
 def auth_azure(driver: str = None):
-
     if driver is None:
         import pyodbc
 
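Wrapping the raw SQL strings in text() is needed because SQLAlchemy 2.0 no longer accepts plain strings in Connection.execute(). Here is a minimal, self-contained sketch of the pattern; the in-memory SQLite URL is a placeholder, df_to_azure itself connects to Azure SQL via pyodbc.

# Sketch of executing raw SQL under SQLAlchemy 2.x; plain strings must be wrapped in text().
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///:memory:")  # placeholder engine for illustration

with engine.begin() as con:  # begin() opens a transaction and commits on exit
    con.execute(text("CREATE TABLE demo (id INTEGER)"))
    con.execute(text("DROP TABLE IF EXISTS demo"))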

df_to_azure/export.py

Lines changed: 12 additions & 9 deletions
@@ -9,8 +9,7 @@
 from azure.storage.blob import BlobServiceClient
 from pandas import CategoricalDtype, DataFrame
 from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_float_dtype, is_integer_dtype, is_string_dtype
-from sqlalchemy.sql.visitors import VisitableType
-from sqlalchemy.types import BigInteger, Boolean, DateTime, Integer, Numeric, String
+from sqlalchemy.types import BigInteger, Boolean, DateTime, Integer, Numeric, String, TypeEngine
 
 from df_to_azure.adf import ADF
 from df_to_azure.db import SqlUpsert, auth_azure, execute_stmt
@@ -34,7 +33,6 @@ def df_to_azure(
     clean_staging=True,
     container_name="parquet",
 ):
-
     if parquet:
         DfToParquet(
             df=df,
@@ -96,13 +94,11 @@ def __init__(
         self.clean_staging = clean_staging
 
     def run(self):
-
         if self.df.empty:
             logging.info("Data empty, no new records to upload.")
             return None, None
 
         if self.create:
-
             # azure components
             self.create_resourcegroup()
             self.create_datafactory()
@@ -133,11 +129,10 @@ def run(self):
 
     def _checks(self):
         if self.dtypes:
-            if not all([type(given_type) == VisitableType for given_type in self.dtypes.keys()]):
+            if not all([type(given_type) == TypeEngine for given_type in self.dtypes.keys()]):
                 WrongDtypeError("Wrong dtype given, only SqlAlchemy types are accepted")
 
     def upload_dataset(self):
-
         if self.method == "create":
             self.create_schema()
             self.push_to_azure()
@@ -183,10 +178,18 @@ def upload_to_blob(self):
         blob_client = self.blob_service_client()
         blob_client = blob_client.get_blob_client(
             container="dftoazure",
-            blob=f"{self.table_name}/{self.table_name}.csv",
+            blob=f"{self.table_name}/{self.table_name}.parquet",
         )
 
-        data = self.df.to_csv(index=False, sep="^", quotechar='"', lineterminator="\n")
+        # This is needed because ADF converts datetime to Unix Epoch
+        # resulting in INT64 type,
+        # which conflicts with our Datetime column in the database
+        # https://shorturl.at/dtSm6
+        datetime_dtypes = self.df.select_dtypes("datetime")
+        if datetime_dtypes.empty is False:
+            for col in datetime_dtypes.columns:
+                self.df[col] = self.df[col].astype(str)
+        data = self.df.to_parquet(index=False)
         blob_client.upload_blob(data, overwrite=True)
 
     def create_schema(self):
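upload_to_blob() now serializes the frame to Parquet bytes instead of the old caret-delimited CSV, casting datetime columns to strings first so ADF does not map them to Unix-epoch INT64. Below is a standalone sketch of that flow, assuming the AZURE_STORAGE_CONNECTION_STRING environment variable is set and using hypothetical container and blob names.

# Sketch of the new upload path: datetimes to str, DataFrame to Parquet bytes, upload to blob.
import os

import pandas as pd
from azure.storage.blob import BlobServiceClient

df = pd.DataFrame({"id": [1, 2], "loaded_at": pd.to_datetime(["2024-01-01", "2024-01-02"])})

# Cast datetime columns to str so ADF does not read them back as Unix-epoch INT64.
for col in df.select_dtypes("datetime").columns:
    df[col] = df[col].astype(str)

data = df.to_parquet(index=False)  # no path given, so the Parquet file is returned as bytes

blob_service = BlobServiceClient.from_connection_string(os.environ["AZURE_STORAGE_CONNECTION_STRING"])
blob_client = blob_service.get_blob_client(container="dftoazure", blob="demo/demo.parquet")
blob_client.upload_blob(data, overwrite=True)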

df_to_azure/tests/test_append.py

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@
 # #### APPEND METHOD TESTS ####
 # #############################
 def test_append():
-
     df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4.0, 5.0, nan]})
 
     # 1. we create a new dataframe
df_to_azure/tests/test_zz_clean_up.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 from df_to_azure.db import auth_azure
+from sqlalchemy.sql import text
 
 
 # --- CLEAN UP ----
@@ -36,5 +37,5 @@ def test_clean_up_db():
         with con.begin():
             for schema, tables in tables_dict.items():
                 for table in tables:
-                    query = f"DROP TABLE IF EXISTS {schema}.{table};"
+                    query = text(f"DROP TABLE IF EXISTS {schema}.{table};")
                     con.execute(query)

requirements.txt

Lines changed: 8 additions & 8 deletions
@@ -1,8 +1,8 @@
-azure-identity>=1.7.1
-azure-mgmt-datafactory>=2.2.0,<2.7.0
-azure-mgmt-resource>=20.1.0
-azure-storage-blob>=12.8.1
-pandas>=1.5.0
-pyarrow>=7.0.0
-pyodbc>=4.0.32
-sqlalchemy>=1.4.31,<2.0.0
+azure-identity>=1.12.0
+azure-mgmt-datafactory>=7.1.0
+azure-mgmt-resource>=23.1.1
+azure-storage-blob>=12.20.0
+pandas>=2.2.2
+pyarrow>=16.1.0
+pyodbc>=5.1.0
+sqlalchemy>=2.0.30

ruff.toml

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+# Set the maximum line length to 120.
+line-length = 120
