Skip to content

Commit 21787ea

Browse files
authored
Merge pull request #149 from statisticsnorway/issues_march_2025
Klass XML - Round up - versions and Paths
2 parents e2cfeb5 + f6a7c99 commit 21787ea

File tree

11 files changed

+1602
-1149
lines changed

11 files changed

+1602
-1149
lines changed

README.md

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,43 @@ logger = logging.getLogger(__name__)
9090
logger.info("This is an info message")
9191
```
9292

93+
94+
95+
### Export XMLs that can be imported into the KLASS UI
96+
```python
97+
from fagfunksjoner import make_klass_xml_codelist
98+
99+
100+
make_klass_xml_codelist(path="kjoenn.xml",
101+
codes=["1", "2"],
102+
names_bokmaal=["Mann", "Kvinne"])
103+
```
104+
105+
106+
### Round data UP
107+
108+
```python
109+
import pandas as pd
110+
111+
from fagfunksjoner import round_up
112+
113+
114+
print(round(2.5, 0), round_up(2.5, 0))
115+
116+
round_up(pd.Series([1.5, 2.5, 3.5]), 0) # Datatype blir Int64 når man runder til 0 desimaler
117+
round_up(pd.Series([1.15, 2.15, 3.15]), 1) # Datatype blir Float64 når man runder til mer enn 0 desimaler
118+
119+
df = pd.DataFrame(
120+
{"col1": [1.5, 2.5, 1.2345, 1.2355],
121+
"col2": [3.5, 4.5, 5.6789, 6.7891]}
122+
).astype({"col1": "Float64", "col2": "Float64"})
123+
rounded = round_up(df, decimal_places=0, col_names="col1") # Avrunder kun col1, den endrer datatype til Int64
124+
125+
rounded2 = round_up(df, col_names={"col1": 1, "col2": 2}) # Avrunder col1 til 1 desimal, col2 til 2 desimaler
126+
127+
```
128+
129+
93130
### Aggregation / Categories
94131

95132
Aggregate on all exclusive combinations of codes in certain columns (maybe before sending to statbank? Like proc means?)
@@ -159,7 +196,7 @@ all_combos_agg_inclusive(
159196
grand_total=True)
160197
```
161198

162-
199+
### "Formats" like in SAS
163200

164201
Perform mapping using SsbFormat. Behaves like a dictionary. Has functionality for mapping ranges and 'other'-category and detecting different types of NaN-values. Does not handle non-exclusive / overlapping categories, please only use for exclusive categories.
165202

@@ -203,6 +240,7 @@ some_frmt = get_format(path+'format_name.json')
203240
```
204241

205242
### Opening archive-files based on Datadok-api in prodsone
243+
206244
We have "flat files", which are not comma-separated. These need metadata to be opened correctly. In SAS we do this with "lastescript". But there is an API to old Datadok in prodsone, so these functions let you just specify a path, and attempt to open the flat files directly into pandas, with the metadata also available.
207245

208246
```python
@@ -220,6 +258,9 @@ archive_object.datatypes # The datatypes the archivdata ended up having?
220258
archive_object.widths # Width of each column in the flat file
221259

222260
```
261+
262+
263+
223264
### Operation to Oracle database
224265

225266
Remember that any credential values to the database should not be stored

poetry.lock

Lines changed: 852 additions & 1085 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "ssb-fagfunksjoner"
3-
version = "1.0.9"
3+
version = "1.1.0"
44
description = "Fellesfunksjoner for ssb i Python"
55
authors = ["SSB-pythonistas <ssb-pythonistas@ssb.no>"]
66
license = "MIT"
@@ -40,8 +40,8 @@ types-beautifulsoup4 = ">=4.12.0.20240511"
4040
types-colorama = ">=0.4.15.20240205"
4141
types-openpyxl = ">=3.1.5.20240719"
4242
pyarrow-stubs = ">=10.0.1.9"
43-
nox = "^2025.2.9"
44-
nox-poetry = "^1.1.0"
43+
nox = ">=2025.2.9"
44+
nox-poetry = ">=1.1.0"
4545

4646
[tool.poetry.group.dev.dependencies]
4747
pygments = ">=2.10.0"

src/fagfunksjoner/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,16 +46,22 @@ def _try_getting_pyproject_toml(e: Exception | None = None) -> str:
4646
open_path_datadok,
4747
open_path_metapath_datadok,
4848
)
49+
from fagfunksjoner.data.klass_xml import make_klass_xml_codelist
4950
from fagfunksjoner.data.pandas_combinations import (
5051
all_combos_agg,
5152
all_combos_agg_inclusive,
5253
)
5354
from fagfunksjoner.data.pandas_dtypes import auto_dtype
55+
from fagfunksjoner.data.round_ssb import round_up
5456
from fagfunksjoner.data.view_dataframe import view_dataframe
5557
from fagfunksjoner.formats.formats import SsbFormat
5658
from fagfunksjoner.log.statlogger import StatLogger
5759
from fagfunksjoner.paths.project_root import ProjectRoot
58-
from fagfunksjoner.paths.versions import get_latest_fileversions, next_version_path
60+
from fagfunksjoner.paths.versions import (
61+
get_latest_fileversions,
62+
latest_version_path,
63+
next_version_path,
64+
)
5965
from fagfunksjoner.prodsone.check_env import check_env, linux_shortcuts
6066
from fagfunksjoner.prodsone.saspy_ssb import saspy_df_from_path, saspy_session
6167

@@ -69,10 +75,13 @@ def _try_getting_pyproject_toml(e: Exception | None = None) -> str:
6975
"auto_dtype",
7076
"check_env",
7177
"get_latest_fileversions",
78+
"latest_version_path",
7279
"linux_shortcuts",
80+
"make_klass_xml_codelist",
7381
"next_version_path",
7482
"open_path_datadok",
7583
"open_path_metapath_datadok",
84+
"round_up",
7685
"saspy_df_from_path",
7786
"saspy_session",
7887
"view_dataframe",
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""This module contains functions to create an XML file that can be loaded into the KLASS UI.
2+
3+
It passes data through a pandas DataFrame built from lists of codes and names, and then writes that DataFrame to XML.
4+
"""
5+
6+
import pandas as pd
7+
8+
9+
def make_klass_df_codelist(
10+
codes: list[str | int],
11+
names_bokmaal: list[str] | None = None,
12+
names_nynorsk: list[str] | None = None,
13+
names_engelsk: list[str] | None = None,
14+
) -> pd.DataFrame:
15+
"""Make a pandas Dataframe from lists of codes and names.
16+
17+
Args:
18+
codes: List of codes.
19+
names_bokmaal: List of names in Bokmål.
20+
names_nynorsk: List of names in Nynorsk.
21+
names_engelsk: List of names in English.
22+
23+
Returns:
24+
pd.DataFrame: Dataframe with columns for codes and names.
25+
26+
Raises:
27+
ValueError: If names_bokmaal and names_nynorsk are both None, or if the length of
28+
codes and names do not match.
29+
"""
30+
if names_bokmaal is None and names_nynorsk is None:
31+
raise ValueError("Must have content in names_bokmaal or names_nynorsk")
32+
for name in [names_bokmaal, names_nynorsk, names_engelsk]:
33+
if name and len(codes) != len(name):
34+
raise ValueError(
35+
"Length of the entered names must match the length of codes."
36+
)
37+
38+
cols = [
39+
"kode",
40+
"forelder",
41+
"navn_bokmål",
42+
"navn_nynorsk",
43+
"navn_engelsk",
44+
"kortnavn_bokmål",
45+
"kortnavn_nynorsk",
46+
"kortnavn_engelsk",
47+
"noter_bokmål",
48+
"noter_nynorsk",
49+
"noter_engelsk",
50+
"gyldig_fra",
51+
"gyldig_til",
52+
]
53+
54+
data = {col: [None] * len(codes) for col in cols} | {
55+
"kode": codes,
56+
"navn_bokmål": names_bokmaal,
57+
"navn_nynorsk": names_nynorsk,
58+
"navn_engelsk": names_engelsk,
59+
}
60+
61+
return pd.DataFrame({name: data for name, data in data.items()})
62+
63+
64+
def make_klass_xml_codelist(
    path: str,
    codes: list[str | int],
    names_bokmaal: list[str] | None = None,
    names_nynorsk: list[str] | None = None,
    names_engelsk: list[str] | None = None,
) -> pd.DataFrame:
    """Write a KLASS codelist XML file and return the DataFrame it was built from.

    The codes and names are first assembled into a DataFrame by
    ``make_klass_df_codelist``; that frame is then serialised with
    ``DataFrame.to_xml``. According to the original author, the XML can be
    loaded into the old KLASS UI under version -> import (top right).

    Args:
        path (str): Path to save the xml file.
        codes (list[str|int]): List of codes.
        names_bokmaal (list[str] | None): List of names in Bokmål.
        names_nynorsk (list[str] | None): List of names in Nynorsk.
        names_engelsk (list[str] | None): List of names in English.

    Returns:
        pd.DataFrame: Dataframe with columns for codes and names.
    """
    namespace_prefix = "ns1"
    codelist = make_klass_df_codelist(
        codes=codes,
        names_bokmaal=names_bokmaal,
        names_nynorsk=names_nynorsk,
        names_engelsk=names_engelsk,
    )
    # Root element <versjon>, one <element> per code, all in the ns1 namespace.
    codelist.to_xml(
        path,
        root_name="versjon",
        row_name="element",
        namespaces={namespace_prefix: "https://klass.ssb.no/version"},
        prefix=namespace_prefix,
    )
    return codelist
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
"""Reproduce the functionality of the default round function from Excel or SAS, rounding data up to a given number of decimal places.
2+
3+
Instead of Python's default of rounding to even.
4+
"""
5+
6+
from decimal import ROUND_HALF_UP, Decimal, localcontext
7+
from typing import TYPE_CHECKING, Any, overload
8+
9+
import pandas as pd
10+
11+
12+
# Series alias used in the annotations of this module.
if not TYPE_CHECKING:
    # Runtime fallback: avoids issues where pd.Series is not subscriptable.
    pd_Series = object
else:
    # Type checkers see the parametrised Series type.
    pd_Series = pd.Series[Any]
19+
20+
21+
# Overloads: the output type depends on the input type.
# Only the DataFrame variant accepts col_names.
@overload
def round_up(
    data: pd.DataFrame,
    decimal_places: int = ...,
    col_names: str | list[str] | dict[str, int] = ...,
) -> pd.DataFrame: ...
@overload
def round_up(data: pd_Series, decimal_places: int = ...) -> pd_Series: ...


# Mypy does not like getting specific with Literal[0], so scalars are typed as int | float.
@overload
def round_up(data: int | float, decimal_places: int = ...) -> int | float: ...
@overload
def round_up(
    data: pd._libs.missing.NAType, decimal_places: int = ...
) -> pd._libs.missing.NAType: ...


def round_up(
    data: pd.DataFrame | pd_Series | float | pd._libs.missing.NAType,
    decimal_places: int = 0,
    col_names: str | list[str] | dict[str, int] = "",
) -> pd.DataFrame | pd_Series | int | float | pd._libs.missing.NAType:
    """Round up a number, to a given number of decimal places. Avoids Pythons default of rounding to even.

    For a DataFrame, only the columns named in ``col_names`` are rounded, and
    the rounding is done on a copy: the DataFrame passed in is left untouched
    and the rounded copy is returned. This lets the same source frame be
    rounded several times with different settings.

    Args:
        data: The data to round up, can be a float, Series, or DataFrame.
        decimal_places: The number of decimal places to round up to. Ignored if you send a dictionary into col_names with column names and decimal places.
        col_names: The column names to round up. Only used when data is a DataFrame.
            If a dictionary is provided, it should map column names to the number of decimal places for each column.
            If a list is provided, it should contain the names of the columns to round up.
            If a string is provided, it should be the name of a single column to round up.

    Returns:
        pd.DataFrame | pd.Series | int | float: The rounded up number as an int, float, Series, or DataFrame.

    Raises:
        TypeError: If data is not a DataFrame, Series, int, float, or NAType.
    """
    if isinstance(data, pd.DataFrame):
        # Work on a copy so the caller's DataFrame is never modified in place.
        rounded_df = data.copy()
        if isinstance(col_names, dict):
            # Column name -> decimal places; decimal_places is ignored here.
            for col, dec in col_names.items():
                rounded_df = _apply_rounding_to_df_col(rounded_df, col, dec)
        elif isinstance(col_names, list):
            # Every listed column gets the same number of decimal places.
            for col in col_names:
                rounded_df = _apply_rounding_to_df_col(rounded_df, col, decimal_places)
        elif isinstance(col_names, str):
            # A single column name.
            rounded_df = _apply_rounding_to_df_col(
                rounded_df, col_names, decimal_places
            )
        return rounded_df

    if isinstance(data, pd.Series):
        # Round element by element, then pick the dtype from decimal_places.
        return _set_dtype_from_decimal_places(
            data.apply(_round, decimals=decimal_places), decimal_places
        )

    if isinstance(data, int | float | pd._libs.missing.NAType):
        return _round(data, decimals=decimal_places)

    raise TypeError(
        "data must be a DataFrame, Series, int, float, or NAType. "
        f"Got {type(data)} instead."
    )
81+
82+
83+
def _apply_rounding_to_df_col(
    df: pd.DataFrame, col_name: str, decimal_places: int
) -> pd.DataFrame:
    """Round one column of a DataFrame, assigning the result back into ``df``.

    A column name that is not present in ``df`` is ignored, and the frame is
    returned unchanged.

    Args:
        df: The DataFrame to round. The column is replaced on this object.
        col_name: The name of the column to round.
        decimal_places: The number of decimal places to round to.

    Returns:
        pd.DataFrame: The same DataFrame object, with the column rounded if it exists.
    """
    if col_name not in df.columns:
        return df

    rounded_values = df[col_name].apply(_round, decimals=decimal_places)
    df[col_name] = _set_dtype_from_decimal_places(rounded_values, decimal_places)
    return df
101+
102+
103+
def _set_dtype_from_decimal_places(
    data: pd_Series,
    decimal_places: int = 0,
) -> pd_Series:
    """Cast a Series to the dtype that matches the number of decimal places.

    Args:
        data: The column to set the dtype for.
        decimal_places: The number of decimal places. Zero gives "Int64",
            anything else gives "Float64".

    Returns:
        pd_Series: The data with the updated dtype.
    """
    target_dtype = "Int64" if decimal_places == 0 else "Float64"
    return data.astype(target_dtype)
120+
121+
122+
def _round(
123+
n: float | pd._libs.missing.NAType,
124+
decimals: int = 0,
125+
) -> float | int | pd._libs.missing.NAType:
126+
if pd.isna(n):
127+
return pd.NA
128+
elif n or n == 0:
129+
with localcontext() as ctx:
130+
ctx.rounding = ROUND_HALF_UP
131+
rounded = round(Decimal(n), decimals)
132+
if decimals == 0:
133+
return int(Decimal(rounded).to_integral_value())
134+
return float(rounded)
135+
return n

src/fagfunksjoner/paths/user.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# +
1+
"""Extract user information from the environment."""
2+
23
import getpass
34
import os
45
import subprocess
@@ -63,6 +64,3 @@ def verify_ssbmail(user: str | None) -> str | None:
6364
if "@" not in user and len(user) == 3:
6465
return user + "@ssb.no"
6566
return None
66-
67-
68-
# -

0 commit comments

Comments
 (0)