
Commit 916aebd

split functions for cognitive load
1 parent cb9a946 commit 916aebd

2 files changed (+67, -30 lines)

src/fagfunksjoner/api/statistikkregisteret.py (6 additions, 5 deletions)
@@ -14,6 +14,7 @@
 TEXT = "#text"
 ENDRET = "@endret"
 DESKFLYT = "@deskFlyt"
+SPACE_LANG = r"@{http://www.w3.org/XML/1998/namespace}lang"


 STATUS_MAP = {
@@ -214,7 +215,7 @@ def parse_lang_text_single(entry: dict[str, Any]) -> LangText:
         LangText: The parsed LangText object.
     """
     return LangText(
-        lang=entry["@{http://www.w3.org/XML/1998/namespace}lang"],
+        lang=entry[SPACE_LANG],
         text=entry.get(TEXT, None),
         name=entry.get("@navn", None),
     )
@@ -249,7 +250,7 @@ def parse_contact_single(entry: dict[str, Any]) -> Contact:
         cellphone=entry["@mobil"],
         email=entry["@epost"],
         initials=entry["@initialer"],
-        changed=entry.get("@endret", None),
+        changed=entry.get(ENDRET, None),
     )


@@ -276,7 +277,7 @@ def parse_triggerord_single(entry: dict[str, Any]) -> dict[str, str]:
         dict: The parsed trigger word dictionary.
     """
     return {
-        "lang": entry["@{http://www.w3.org/XML/1998/namespace}lang"],
+        "lang": entry[SPACE_LANG],
         "text": entry[TEXT],
     }

@@ -450,7 +451,7 @@ def parse_contacts(t: ET.Element) -> list[Contact]:
                 name=Name(
                     name_lang=[
                         LangText(
-                            lang=x["@{http://www.w3.org/XML/1998/namespace}lang"],
+                            lang=x[SPACE_LANG],
                             text=x.get("#text", None),
                             name=x.get("@navn", None),
                         )
@@ -462,7 +463,7 @@ def parse_contacts(t: ET.Element) -> list[Contact]:
                 cellphone=contact["@mobil"],
                 email=contact["@epost"],
                 initials=contact["@initialer"],
-                changed=dateutil.parser.parse(contact["@endret"]),
+                changed=dateutil.parser.parse(contact[ENDRET]),
             )
         )
     return result
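
For context on this hunk: the new SPACE_LANG constant is simply the namespaced dictionary key under which the parsed entries carry the xml:lang attribute, so the lookups themselves are unchanged. A minimal, self-contained sketch of the pattern, with a simplified LangText stand-in and a made-up entry dict (only the key names are taken from the diff):

from dataclasses import dataclass
from typing import Any

# Module-level constants mirroring the ones in statistikkregisteret.py.
TEXT = "#text"
SPACE_LANG = r"@{http://www.w3.org/XML/1998/namespace}lang"


@dataclass
class LangText:  # simplified stand-in for the repository's LangText model
    lang: str
    text: str | None = None
    name: str | None = None


def parse_lang_text_single(entry: dict[str, Any]) -> LangText:
    # Same lookup as before the commit, but through the shared constant
    # instead of repeating the raw namespaced-attribute string.
    return LangText(
        lang=entry[SPACE_LANG],
        text=entry.get(TEXT, None),
        name=entry.get("@navn", None),
    )


# Hypothetical input shaped like the attribute-prefixed dicts the parser receives.
entry = {SPACE_LANG: "nb", TEXT: "Eksempeltekst"}
print(parse_lang_text_single(entry))  # LangText(lang='nb', text='Eksempeltekst', name=None)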

src/fagfunksjoner/data/datadok_extract.py (61 additions, 25 deletions)
@@ -45,6 +45,8 @@
 #
 # Den interne metadataportalen http://www.byranettet.ssb.no/metadata/ har også alle filbeskrivelsene og filvariablene.

+YEARS_BACK_CHECK = -20
+

 # %%
 @dataclass
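
Worth noting about the new constant: it is used below as the stop value of range(-1, YEARS_BACK_CHECK, -1), and range excludes its stop value, so the loop visits offsets -1 through -19, nineteen years back rather than twenty. A quick check:

YEARS_BACK_CHECK = -20  # same value as the new module constant

offsets = list(range(-1, YEARS_BACK_CHECK, -1))
print(offsets[0], offsets[-1], len(offsets))  # -1 -19 19, the stop value -20 is never reached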
@@ -570,7 +572,6 @@ def open_path_datadok(path: str | Path, **read_fwf_params: Any) -> ArchiveData:

     Raises:
         ValueError: If no datadok-api endpoint is found for the path given.
-        FileNotFoundError: If more than one file matches, with different file extensions, we do not know which to pick.
     """
     path_lib = convert_to_pathlib(path)
     combinations = get_path_combinations(path_lib, file_exts=[""])
@@ -586,6 +587,24 @@ def open_path_datadok(path: str | Path, **read_fwf_params: Any) -> ArchiveData:
     url_address = url_from_path(url_path)
     logger.info(f"Found datadok-response for path {url_path}")

+    filepath = look_for_filepath(path_lib)
+
+    return import_archive_data(url_address, filepath, **read_fwf_params)
+
+
+def look_for_filepath(path_lib: Path) -> Path:
+    """Look for possible placements of the physical "flatfile" on disk.
+
+    Args:
+        path_lib (Path): The given path from the user as a pathlib.Path
+
+    Raises:
+        FileNotFoundError: If we find more than one matching file, we do not know which to pick.
+        FileNotFoundError: If we find zero matching files, we also do not know which to pick.
+
+    Returns:
+        Path: The found path of an actual physical file.
+    """
     file_combinations = get_path_combinations(
         path_lib.with_suffix(""), file_exts=None, add_dollar=False
     )  # file_exts=None gets replaced by dat, txt, ""
@@ -619,8 +638,7 @@ def open_path_datadok(path: str | Path, **read_fwf_params: Any) -> ArchiveData:
     filepath = filelist[0]

     logger.info(f"Found datafile at path {filepath}")
-
-    return import_archive_data(url_address, filepath, **read_fwf_params)
+    return filepath


 # Correcting path for API
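
The middle of look_for_filepath (building the candidate list and raising when zero or several files match) is unchanged context and therefore not shown in the diff. A rough standalone sketch of the behaviour the docstring describes, using plain pathlib and a hypothetical helper name; the real function goes through get_path_combinations and handles more extension variants:

from pathlib import Path


def find_single_flatfile(base: Path, exts: tuple[str, ...] = (".dat", ".txt", "")) -> Path:
    """Illustrative only: return the one existing file among candidate extensions."""
    candidates = [base.with_suffix(ext) for ext in exts]
    hits = [p for p in candidates if p.is_file()]
    if len(hits) > 1:
        # Several physical files match; we cannot know which one the metadata describes.
        raise FileNotFoundError(f"Ambiguous matches for {base}: {hits}")
    if not hits:
        # No physical file found for any candidate extension.
        raise FileNotFoundError(f"No file found for {base} among extensions {exts}")
    return hits[0]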
@@ -866,35 +884,53 @@ def go_back_in_time(
     yr_char_ranges = get_yr_char_ranges(path_lib)
     # Loop over the years we want to look at, changing all the year ranges in the path
     if yr_char_ranges:
-        curr_path = path_lib
-        # Looking 20 years back in time
-        for looking_back in range(-1, -20, -1):
-            for year_range in yr_char_ranges:
-                yr = curr_path.name[year_range[0] : year_range[1]]
-                name_update = (
-                    curr_path.name[: year_range[0]]
-                    + str(int(yr) - 1)
-                    + curr_path.name[year_range[1] :]
-                )
-                curr_path = Path(curr_path.parent, name_update)
-            logger.debug(f"Looking back at {looking_back}, {curr_path=}")
-            yr_combinations = get_path_combinations(curr_path, file_exts=exts)
-            for yrpath, ext in yr_combinations:
-                url_address = url_from_path(yrpath.with_suffix(ext))
-                if test_url(url_address):
-                    f"Looking back {looking_back} years, found a path at {yrpath.with_suffix(ext)}"
-                    return yrpath.with_suffix(ext)
-
-            logger.info(
-                f"Looking back {looking_back} years, DIDNT find a path at {yrpath.with_suffix(ext)}"
-            )
+        yrpath = bumpcheck_file_years_back(path_lib, yr_char_ranges, exts)
+        if yrpath is not None:
+            return yrpath
     else:
         logger.info(
             "Couldnt determine any year ranges in the pattern gXXXX (possibly repeating, like gXXXXgXXXX.)."
         )
     return None


+def bumpcheck_file_years_back(
+    curr_path: Path, yr_char_ranges: list[tuple[int, int]], exts: list[str]
+) -> Path | None:
+    """Modify the path to point at older versions of file, to look for valid datadok-api paths.
+
+    Args:
+        curr_path: The path given by user to look for.
+        yr_char_ranges: The placement of the year ranges in the paths.
+        exts: The base extensions to explore.
+
+    Returns:
+        Path | None :
+    """
+    # Looking X years back in time
+    for looking_back in range(-1, YEARS_BACK_CHECK, -1):
+        for year_range in yr_char_ranges:
+            yr = curr_path.name[year_range[0] : year_range[1]]
+            name_update = (
+                curr_path.name[: year_range[0]]
+                + str(int(yr) - 1)
+                + curr_path.name[year_range[1] :]
+            )
+            curr_path = Path(curr_path.parent, name_update)
+        logger.debug(f"Looking back at {looking_back}, {curr_path=}")
+        yr_combinations = get_path_combinations(curr_path, file_exts=exts)
+        for yrpath, ext in yr_combinations:
+            url_address = url_from_path(yrpath.with_suffix(ext))
+            if test_url(url_address):
+                f"Looking back {looking_back} years, found a path at {yrpath.with_suffix(ext)}"
+                return yrpath.with_suffix(ext)
+
+        logger.info(
+            f"Looking back {looking_back} years, DIDNT find a path at {yrpath.with_suffix(ext)}"
+        )
+    return None
+
+
 def get_yr_char_ranges(path: str | Path) -> list[tuple[int, int]]:
     """Find the character ranges containing years in the path. Usually 1-4 ranges.
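
The core of the extracted helper is the renaming step: every year substring located by get_yr_char_ranges (the gXXXX pattern mentioned in the log message) is decremented by one per iteration before the candidate URLs are probed. A self-contained sketch of just that step, with a made-up filename and character ranges; the real function additionally builds extension combinations and tests each datadok URL with test_url:

from pathlib import Path


def bump_years_back_once(path: Path, yr_char_ranges: list[tuple[int, int]]) -> Path:
    """Illustrative only: decrement each year found at the given character ranges by one."""
    name = path.name
    for start, stop in yr_char_ranges:
        year = int(name[start:stop])
        name = name[:start] + str(year - 1) + name[stop:]
    return Path(path.parent, name)


# Hypothetical example: years at name[1:5] and name[6:10].
print(bump_years_back_once(Path("/data/g2020g2021.dat"), [(1, 5), (6, 10)]))
# /data/g2019g2020.dat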
