Skip to content

Commit 759fc4d

Browse files
authored
Merge pull request #1 from LukasZahradnik/feature/blueprint
The blueprint model
2 parents 7c07163 + b21f656 commit 759fc4d

File tree

105 files changed

+6242
-2698
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

105 files changed

+6242
-2698
lines changed

.github/workflows/black.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
name: Lint
2+
3+
on:
4+
pull_request:
5+
push:
6+
branches:
7+
- master
8+
- 'releases/**'
9+
10+
jobs:
11+
lint:
12+
runs-on: ubuntu-latest
13+
steps:
14+
- uses: actions/checkout@v3
15+
- uses: psf/black@stable
16+
with:
17+
src: "./db_transformer"

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ __pycache__/
44
processed/
55
raw/
66

7-
results/
7+
ray_results/
88
logs/
99
lightning_logs/
1010
torch-models/

.vscode/settings.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"editor.formatOnSave": true,
3+
"[python]": {
4+
"editor.defaultFormatter": "ms-python.black-formatter"
5+
},
6+
"notebook.formatOnSave.enabled": true,
7+
"editor.detectIndentation": false,
8+
"editor.indentSize": "tabSize",
9+
"editor.tabSize": 4,
10+
}

db_transformer/data/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .dataset_defaults import *
2+
3+
from .ctu_dataset import CTUDataset
4+
from .fit_dataset import FITRelationalDataset

db_transformer/data/converter/column/cat_converter.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,22 @@
1212
from db_transformer.schema.columns import CategoricalColumnDef
1313
from db_transformer.schema.schema import ColumnDef
1414

15-
__ALL__ = ['CategoricalConverter']
15+
__ALL__ = ["CategoricalConverter"]
1616

1717

1818
class CategoricalConverter(SeriesConverter[CategoricalColumnDef]):
19-
def __init__(self,
20-
mapper: Optional[Union[SimpleStringSeriesMapper, SeriesMapper]] = None,
21-
) -> None:
19+
def __init__(
20+
self,
21+
mapper: Optional[Union[SimpleStringSeriesMapper, SeriesMapper]] = None,
22+
) -> None:
2223
super().__init__()
2324
self.mapper = get_string_mapper(mapper) if mapper is not None else None
2425

25-
def __call__(self,
26-
column_def: CategoricalColumnDef,
27-
column: pd.Series,
28-
) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
26+
def __call__(
27+
self,
28+
column_def: CategoricalColumnDef,
29+
column: pd.Series,
30+
) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
2931
distinct_vals, mapper = self._guess_value_set(column_def.card, column)
3032

3133
# give None index of 0
@@ -37,13 +39,15 @@ def __call__(self,
3739

3840
out_column = mapper(column).map(value_map)
3941

40-
return (out_column, ), (column_def, )
42+
return (out_column,), (column_def,)
4143

42-
def _guess_value_set(self, cardinality: int, column: pd.Series) -> Tuple[List[Any], SeriesMapper]:
44+
def _guess_value_set(
45+
self, cardinality: int, column: pd.Series
46+
) -> Tuple[List[Any], SeriesMapper]:
4347
failed_mappings: List[Tuple[str, int, Optional[Exception]]] = []
4448

4549
if self.mapper is not None:
46-
mappers = {'user_provided': self.mapper}
50+
mappers = {"user_provided": self.mapper}
4751
else:
4852
mappers = SIMPLE_STRING_SERIES_MAPPERS
4953

@@ -58,13 +62,17 @@ def _guess_value_set(self, cardinality: int, column: pd.Series) -> Tuple[List[An
5862

5963
def _exception_to_str(e: Optional[Exception]) -> str:
6064
if e is None:
61-
return ''
65+
return ""
6266

6367
return f" (failed: {e})"
6468

6569
errormsg = [
66-
f" -> {mapping_name} (cardinality {card}){_exception_to_str(e)}" for mapping_name, card, e in failed_mappings]
67-
68-
raise RuntimeError(f"Expected {cardinality} unique values, "
69-
f"but the following operations on values provided the following cardinalities instead:\n"
70-
+ '\n'.join(errormsg))
70+
f" -> {mapping_name} (cardinality {card}){_exception_to_str(e)}"
71+
for mapping_name, card, e in failed_mappings
72+
]
73+
74+
raise RuntimeError(
75+
f"Expected {cardinality} unique values, "
76+
f"but the following operations on values provided the following cardinalities instead:\n"
77+
+ "\n".join(errormsg)
78+
)

db_transformer/data/converter/column/converter_list.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@
77

88
from .series_converter import SeriesConverter
99

10-
__ALL__ = ['ConverterList']
10+
__ALL__ = ["ConverterList"]
1111

1212

1313
class ConverterList(SeriesConverter):
1414
def __init__(self, *converters: SeriesConverter) -> None:
1515
self.converters = converters
1616

17-
def __call__(self, column_def: ColumnDef, column: pd.Series) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
17+
def __call__(
18+
self, column_def: ColumnDef, column: pd.Series
19+
) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
1820
out: List[pd.Series] = []
1921
out_column_defs: List[ColumnDef] = []
2022

db_transformer/data/converter/column/default_datetime_converters.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,22 @@
77

88
from .pandas_converter import PandasConverter
99

10-
__ALL__ = ['DateConverter', 'DateTimeConverter', 'TimestampConverter', 'TimeConverter']
10+
__ALL__ = ["DateConverter", "DateTimeConverter", "TimestampConverter", "TimeConverter"]
1111

1212

1313
class DateConverter(PandasConverter):
1414
"""Converts column to year and day of year."""
1515

1616
def __init__(self, skip_if_allsame=True) -> None:
1717
super().__init__(
18-
('_year', lambda s: (s.dt.year, NumericColumnDef())),
19-
('_dayofyear', lambda s: (s.dt.dayofyear, NumericColumnDef())),
20-
skip_if_allsame=skip_if_allsame)
18+
("_year", lambda s: (s.dt.year, NumericColumnDef())),
19+
("_dayofyear", lambda s: (s.dt.dayofyear, NumericColumnDef())),
20+
skip_if_allsame=skip_if_allsame,
21+
)
2122

2223

2324
def _get_seconds_since_midnight(s: pd.Series) -> pd.Series:
24-
return ((s - s.dt.normalize()) / pd.Timedelta('1 second')).fillna(0).astype(int)
25+
return ((s - s.dt.normalize()) / pd.Timedelta("1 second")).fillna(0).astype(int)
2526

2627

2728
def _get_seconds_since_midnight_time(t: Optional[datetime.time]) -> Optional[int]:
@@ -36,19 +37,29 @@ class DateTimeConverter(PandasConverter):
3637

3738
def __init__(self, skip_if_allsame=True) -> None:
3839
super().__init__(
39-
('_year', lambda s: (s.dt.year, NumericColumnDef())),
40-
('_dayofyear', lambda s: (s.dt.dayofyear, NumericColumnDef())),
41-
('_seconds_since_midnight', lambda s: (_get_seconds_since_midnight(s), NumericColumnDef())),
42-
skip_if_allsame=skip_if_allsame)
40+
("_year", lambda s: (s.dt.year, NumericColumnDef())),
41+
("_dayofyear", lambda s: (s.dt.dayofyear, NumericColumnDef())),
42+
(
43+
"_seconds_since_midnight",
44+
lambda s: (_get_seconds_since_midnight(s), NumericColumnDef()),
45+
),
46+
skip_if_allsame=skip_if_allsame,
47+
)
4348

4449

4550
class TimeConverter(PandasConverter):
4651
"""Converts column to seconds since midnight."""
4752

4853
def __init__(self, skip_if_allsame=True) -> None:
4954
super().__init__(
50-
('', lambda s: (s.map(lambda v: _get_seconds_since_midnight_time(v)), NumericColumnDef())),
51-
skip_if_allsame=skip_if_allsame
55+
(
56+
"",
57+
lambda s: (
58+
s.map(lambda v: _get_seconds_since_midnight_time(v)),
59+
NumericColumnDef(),
60+
),
61+
),
62+
skip_if_allsame=skip_if_allsame,
5263
)
5364

5465

@@ -57,5 +68,6 @@ class TimestampConverter(PandasConverter):
5768

5869
def __init__(self, skip_if_allsame=True) -> None:
5970
super().__init__(
60-
('', lambda s: (s.astype('int64') // 10**9, NumericColumnDef())),
61-
skip_if_allsame=skip_if_allsame)
71+
("", lambda s: (s.astype("int64") // 10**9, NumericColumnDef())),
72+
skip_if_allsame=skip_if_allsame,
73+
)

db_transformer/data/converter/column/identity_converter.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66

77
from .series_converter import SeriesConverter
88

9-
__ALL__ = ['IdentityConverter']
9+
__ALL__ = ["IdentityConverter"]
1010

1111

1212
class IdentityConverter(SeriesConverter):
13-
def __call__(self, column_def: ColumnDef, column: pd.Series) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
14-
return (column, ), (column_def, )
13+
def __call__(
14+
self, column_def: ColumnDef, column: pd.Series
15+
) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
16+
return (column,), (column_def,)

db_transformer/data/converter/column/omit_converter.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66

77
from .series_converter import SeriesConverter
88

9-
__ALL__ = ['OmitConverter']
9+
__ALL__ = ["OmitConverter"]
1010

1111

1212
class OmitConverter(SeriesConverter):
13-
def __call__(self, column_def: ColumnDef, column: pd.Series) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
13+
def __call__(
14+
self, column_def: ColumnDef, column: pd.Series
15+
) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
1416
return (), ()

db_transformer/data/converter/column/pandas_converter.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,27 @@
66

77
from .series_converter import SeriesConverter
88

9-
__ALL__ = ['PandasConverter']
9+
__ALL__ = ["PandasConverter"]
1010

1111

1212
class PandasConverter(SeriesConverter):
13-
def __init__(self,
14-
*segments: Tuple[str, Callable[[pd.Series], Tuple[pd.Series, ColumnDef]]],
15-
skip_if_allsame=True) -> None:
13+
def __init__(
14+
self,
15+
*segments: Tuple[str, Callable[[pd.Series], Tuple[pd.Series, ColumnDef]]],
16+
skip_if_allsame=True
17+
) -> None:
1618
self.segments = segments
1719
self.skip_if_allsame = skip_if_allsame
1820

1921
@classmethod
20-
def single(cls, func: Callable[[pd.Series], Tuple[pd.Series, ColumnDef]], skip_if_allsame=True) -> SeriesConverter:
21-
return PandasConverter(('', func), skip_if_allsame=skip_if_allsame)
22-
23-
def __call__(self, column_def: ColumnDef, column: pd.Series) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
22+
def single(
23+
cls, func: Callable[[pd.Series], Tuple[pd.Series, ColumnDef]], skip_if_allsame=True
24+
) -> SeriesConverter:
25+
return PandasConverter(("", func), skip_if_allsame=skip_if_allsame)
26+
27+
def __call__(
28+
self, column_def: ColumnDef, column: pd.Series
29+
) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
2430
out: List[pd.Series] = []
2531
out_column_defs: List[ColumnDef] = []
2632

db_transformer/data/converter/column/per_type_converter.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,23 @@
33
import pandas as pd
44

55
from db_transformer.data.converter.column.series_converter import SeriesConverter
6-
from db_transformer.data.utils.column_def_matching import ColumnDefMatcherLike, find_value_for_matcher, get_matcher
6+
from db_transformer.data.utils.column_def_matching import (
7+
ColumnDefMatcherLike,
8+
find_value_for_matcher,
9+
get_matcher,
10+
)
711
from db_transformer.schema.schema import ColumnDef
812

9-
__ALL__ = ['PerTypeSeriesConverter']
13+
__ALL__ = ["PerTypeSeriesConverter"]
1014

1115

1216
class PerTypeSeriesConverter(SeriesConverter[ColumnDef]):
13-
def __init__(self,
14-
*converters: Tuple[ColumnDefMatcherLike, SeriesConverter]) -> None:
17+
def __init__(self, *converters: Tuple[ColumnDefMatcherLike, SeriesConverter]) -> None:
1518
self.converters = [(get_matcher(k), v) for k, v in converters]
1619

17-
def __call__(self,
18-
column_def: ColumnDef,
19-
column: pd.Series) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
20+
def __call__(
21+
self, column_def: ColumnDef, column: pd.Series
22+
) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
2023
converter = find_value_for_matcher(self.converters, column_def)
2124

2225
if converter is None:
@@ -28,9 +31,10 @@ def __call__(self,
2831
raise RuntimeError(f"Failed to convert {column_def} using {converter}") from e
2932

3033
if len(series) != len(this_column_defs):
31-
raise ValueError(f"{converter} returned {len(series)} pd.Series objects, "
32-
f"but {len(this_column_defs)} column definition objects "
33-
f"for column def {column_def}.")
34+
raise ValueError(
35+
f"{converter} returned {len(series)} pd.Series objects, "
36+
f"but {len(this_column_defs)} column definition objects "
37+
f"for column def {column_def}."
38+
)
3439

3540
return series, this_column_defs
36-

db_transformer/data/converter/column/series_converter.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,16 @@
55

66
from db_transformer.schema.columns import ColumnDef
77

8-
_TColumnDef = TypeVar('_TColumnDef', bound=ColumnDef)
8+
_TColumnDef = TypeVar("_TColumnDef", bound=ColumnDef)
99

1010
__ALL__ = [
11-
'SeriesConverter',
11+
"SeriesConverter",
1212
]
1313

1414

1515
class SeriesConverter(Generic[_TColumnDef], ABC):
1616
@abstractmethod
17-
def __call__(self, column_def: _TColumnDef, column: pd.Series) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
17+
def __call__(
18+
self, column_def: _TColumnDef, column: pd.Series
19+
) -> Tuple[Sequence[pd.Series], Sequence[ColumnDef]]:
1820
pass

0 commit comments

Comments
 (0)