Skip to content

Commit 1424dce

Browse files
committed
docs: add module docstrings for utils.trntype and utils.cleaning; changelog bumped
1 parent 7a0f3e4 commit 1424dce

File tree

2 files changed

+122
-0
lines changed

2 files changed

+122
-0
lines changed

utils/cleaning.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
"""Data cleaning helpers.

Contains vectorized utilities for parsing amounts and normalizing free-text
descriptions. Transaction-type inference has been moved to `utils.trntype`;
`utils.cleaning` re-exports the inference functions for compatibility.
"""
7+
18
import re
29
from typing import Optional
310

utils/trntype.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
"""Transaction-type inference helpers.

This module contains functions that apply the rule-driven transaction-type
inference logic. Rules are configured via `utils.rules.RuleSet` and can be
loaded/overridden from JSON/YAML using `utils.rules.load_rules`.

Prefer importing inference functions from `utils.trntype` in new code. The
legacy module `utils.cleaning` re-exports these functions to preserve
backwards compatibility.
"""
11+
12+
import re
13+
from typing import Optional
14+
15+
import numpy as np
16+
import pandas as pd
17+
18+
from utils.rules import DEFAULT_RULES, RuleSet
19+
20+
21+
_OFX_TYPE_WHITELIST = {
22+
"CASH",
23+
"INT",
24+
"DIV",
25+
"FEE",
26+
"SRVCHG",
27+
"DEP",
28+
"ATM",
29+
"POS",
30+
"XFER",
31+
"CHECK",
32+
"PAYMENT",
33+
"DIRECTDEP",
34+
"DIRECTDEBIT",
35+
"REPEATPMT",
36+
"OTHER",
37+
"CREDIT",
38+
"DEBIT",
39+
}
40+
41+
42+
def infer_trntype_series(
    amount: pd.Series,
    trntype_text: Optional[pd.Series],
    cleaned_desc: Optional[pd.Series] = None,
    rules: Optional[RuleSet] = None,
) -> pd.Series:
    """Infer OFX transaction type values for a series of transactions.

    This function was previously defined in `utils.cleaning`. It has been
    moved here to separate cleaning utilities from trntype inference logic.

    Each row is resolved by the first step that yields a value:

    1. The stripped, upper-cased source type, after `rules.source_aliases`
       replacement, when it is a member of `_OFX_TYPE_WHITELIST`.
    2. The first pattern from `rules.rules_regex`, then from
       `rules.keyword_rules`, that is found in the upper-cased
       "<source type> <description>" text. Patterns from both lists are
       matched as regular expressions.
    3. "OTHER" when the amount cannot be converted to a number; otherwise
       "DEBIT" for a negative amount and "CREDIT" for zero or positive.

    Args:
        amount: Transaction amounts; its index becomes the result's index.
        trntype_text: Source-provided type labels, or None when unavailable.
        cleaned_desc: Descriptions searched by the rules, or None.
        rules: Rule set to apply; `DEFAULT_RULES` is used when None.

    Returns:
        A "string"-dtype Series on `amount.index` with no missing values.
    """
    # Only a missing rule set selects the defaults, so a caller-supplied
    # RuleSet is honoured even if it happens to evaluate as falsy.
    if rules is None:
        rules = DEFAULT_RULES

    idx = amount.index
    trn_series = (
        trntype_text if trntype_text is not None else pd.Series(pd.NA, index=idx)
    )
    desc_series = (
        cleaned_desc if cleaned_desc is not None else pd.Series(pd.NA, index=idx)
    )

    trn_text = trn_series.astype("string").str.strip().str.upper()
    normalized = trn_text.replace(rules.source_aliases)

    # Step 1: source labels that already are valid codes are taken verbatim.
    result = pd.Series(pd.NA, index=idx, dtype="string")
    exact_mask = normalized.isin(_OFX_TYPE_WHITELIST)
    result.loc[exact_mask] = normalized.loc[exact_mask]

    # Step 2: rules search the raw (un-aliased) label plus the description.
    haystack = (
        trn_text.fillna("") + " " + desc_series.astype("string").fillna("")
    ).str.upper()
    haystack = haystack.str.strip()

    pending = result.isna()
    for rule_group in (rules.rules_regex, rules.keyword_rules):
        for pattern, output in rule_group:
            if not pending.any():
                break
            # Restricting to `pending` means earlier rules win over later ones.
            mask = pending & haystack.str.contains(pattern, regex=True, na=False)
            result.loc[mask] = output
            pending = result.isna()

    # Step 3: fall back to the sign of the amount.
    if pending.any():
        numeric_amounts = pd.to_numeric(amount, errors="coerce")
        # Rows whose amount is not numeric have no sign to decide on.
        other_mask = pending & numeric_amounts.isna()
        result.loc[other_mask] = "OTHER"
        pending = result.isna()
        if pending.any():
            amt_values = numeric_amounts.loc[pending]
            result.loc[pending] = np.where(amt_values < 0, "DEBIT", "CREDIT")

    return result.fillna("OTHER")
100+
101+
102+
def infer_trntype(
    amount,
    trntype_text: Optional[str],
    cleaned_desc: Optional[str] = None,
    rules: Optional[RuleSet] = None,
) -> str:
    """Infer the OFX transaction type for a single transaction.

    Scalar convenience wrapper: each argument is wrapped in a one-element
    Series and handed to `infer_trntype_series`.

    Args:
        amount: The transaction amount.
        trntype_text: Source-provided type label, or None.
        cleaned_desc: Transaction description, or None.
        rules: Rule set forwarded unchanged to `infer_trntype_series`.

    Returns:
        The inferred type as a plain `str`; "OTHER" if the inferred value
        is missing.
    """
    series = infer_trntype_series(
        pd.Series([amount]),
        pd.Series([trntype_text]),
        pd.Series([cleaned_desc]),
        rules=rules,
    )
    val = series.iloc[0]
    return "OTHER" if pd.isna(val) else str(val)

0 commit comments

Comments
 (0)