Skip to content

Commit 51cbd8d

Browse files
authored
Warm and cold users support (#77)
- supported warm users and items in `Dataset`
- removed `return_external_ids` parameter in `recommend` and `recommend_to_items` methods
- supported cold and warm targets in base model
- supported new dataset in cross validation

The first part of #87
1 parent eed8e69 commit 51cbd8d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1423
-565
lines changed

.github/workflows/test.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
uses: actions/cache@v3
2929
with:
3030
path: .venv
31-
key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
31+
key: venv-${{ runner.os }}-3.8-${{ hashFiles('**/poetry.lock') }}
3232

3333
- name: Install dependencies
3434
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
@@ -65,7 +65,7 @@ jobs:
6565
uses: actions/cache@v3
6666
with:
6767
path: .venv
68-
key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
68+
key: venv-${{ runner.os }}-${{ matrix.python-version }}-old-deps-${{ matrix.old-deps }}-${{ hashFiles('**/poetry.lock') }}
6969

7070
- name: Install dependencies
7171
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# MacOS
2+
.DS_Store
3+
14
# Byte-compiled / optimized / DLL files
25
__pycache__/
36
*.py[cod]

.pylintrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ disable=arguments-differ,
7373
unused-argument,
7474
use-implicit-booleaness-not-comparison,
7575
use-symbolic-message-instead,
76+
abstract-method
7677

7778
# Enable the message, report, category or checker with the given id(s).
7879
# You can either give multiple identifier separated by comma (,) or
@@ -446,7 +447,7 @@ max-args=15
446447
max-attributes=12
447448

448449
# Maximum number of boolean expressions in an if statement (see R0916).
449-
max-bool-expr=2
450+
max-bool-expr=3
450451

451452
# Maximum number of branch for function / method body.
452453
max-branches=9

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88

9+
## [0.6.0] - Unreleased
10+
11+
### Added
12+
- Warm users/items support in `Dataset` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
13+
- Warm and cold users/items support in `ModelBase` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
14+
- Warm and cold users/items support in `cross_validate` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
15+
16+
### Removed
17+
- `return_external_ids` parameter in `recommend` and `recommend_to_items` model methods ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
18+
19+
920
## [0.5.0] - 22.03.2024
1021

1122
### Added

poetry.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ tqdm = "^4.27.0"
6262
implicit = "^0.7.1"
6363
attrs = ">=19.1.0,<24.0.0"
6464
typeguard = "^2.0.1"
65+
typing-extensions = "4.7.1" # TODO: remove after dropping support for python 3.7
6566

6667

6768
lightfm = {version = ">=1.16,<=1.17", optional = true}

rectools/dataset/dataset.py

Lines changed: 52 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
from rectools import Columns
2424

25-
from .features import AbsentIdError, DenseFeatures, Features, SparseFeatures, UnknownIdError
25+
from .features import AbsentIdError, DenseFeatures, Features, SparseFeatures
2626
from .identifiers import IdMap
2727
from .interactions import Interactions
2828

@@ -36,8 +36,8 @@ class Dataset:
3636
user-item interactions, user and item features
3737
in special `rectools` structures for convenient future usage.
3838
39-
This is data class, so you can create it explicitly, but
40-
it's recommended to use `construct` method.
39+
WARNING: Creating a `Dataset` object directly is strongly discouraged.
40+
Use `construct` class method instead.
4141
4242
Parameters
4343
----------
@@ -59,6 +59,38 @@ class Dataset:
5959
user_features: tp.Optional[Features] = attr.ib(default=None)
6060
item_features: tp.Optional[Features] = attr.ib(default=None)
6161

62+
@property
63+
def n_hot_users(self) -> int:
64+
"""
65+
Return number of hot users in dataset.
66+
Users with internal ids from `0` to `n_hot_users - 1` are hot (they are present in interactions).
67+
Users with internal ids from `n_hot_users` to `dataset.user_id_map.size - 1` are warm
68+
(they aren't present in interactions, but they have features).
69+
"""
70+
return self.interactions.df[Columns.User].max() + 1
71+
72+
@property
73+
def n_hot_items(self) -> int:
74+
"""
75+
Return number of hot items in dataset.
76+
Items with internal ids from `0` to `n_hot_items - 1` are hot (they are present in interactions).
77+
Items with internal ids from `n_hot_items` to `dataset.item_id_map.size - 1` are warm
78+
(they aren't present in interactions, but they have features).
79+
"""
80+
return self.interactions.df[Columns.Item].max() + 1
81+
82+
def get_hot_user_features(self) -> tp.Optional[Features]:
83+
"""User features for hot users."""
84+
if self.user_features is None:
85+
return None
86+
return self.user_features.take(range(self.n_hot_users))
87+
88+
def get_hot_item_features(self) -> tp.Optional[Features]:
89+
"""Item features for hot items."""
90+
if self.item_features is None:
91+
return None
92+
return self.item_features.take(range(self.n_hot_items))
93+
6294
@classmethod
6395
def construct(
6496
cls,
@@ -112,15 +144,16 @@ def construct(
112144
user_id_map = IdMap.from_values(interactions_df[Columns.User].values)
113145
item_id_map = IdMap.from_values(interactions_df[Columns.Item].values)
114146
interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)
115-
user_features = cls._make_features(
147+
148+
user_features, user_id_map = cls._make_features(
116149
user_features_df,
117150
cat_user_features,
118151
make_dense_user_features,
119152
user_id_map,
120153
Columns.User,
121154
"user",
122155
)
123-
item_features = cls._make_features(
156+
item_features, item_id_map = cls._make_features(
124157
item_features_df,
125158
cat_item_features,
126159
make_dense_item_features,
@@ -138,32 +171,30 @@ def _make_features(
138171
id_map: IdMap,
139172
possible_id_col: str,
140173
feature_type: str,
141-
) -> tp.Optional[Features]:
174+
) -> tp.Tuple[tp.Optional[Features], IdMap]:
142175
if df is None:
143-
return None
176+
return None, id_map
144177

145178
id_col = possible_id_col if possible_id_col in df else "id"
179+
id_map = id_map.add_ids(df[id_col].values, raise_if_already_present=False)
146180

147181
if make_dense:
148182
try:
149-
return DenseFeatures.from_dataframe(df, id_map, id_col=id_col)
150-
except UnknownIdError:
151-
raise ValueError(f"Some ids from {feature_type} features table not present in interactions")
183+
return DenseFeatures.from_dataframe(df, id_map, id_col=id_col), id_map
152184
except AbsentIdError:
153185
raise ValueError(
154186
f"An error has occurred while constructing {feature_type} features: "
155-
"When using dense features all ids from interactions must present in features table"
187+
"When using dense features all ids from interactions must be present in features table"
156188
)
157189
except Exception as e: # pragma: no cover
158190
raise RuntimeError(f"An error has occurred while constructing {feature_type} features: {e!r}")
191+
159192
try:
160-
return SparseFeatures.from_flatten(df, id_map, cat_features, id_col=id_col)
161-
except UnknownIdError:
162-
raise ValueError(f"Some ids from {feature_type} features table not present in interactions")
193+
return SparseFeatures.from_flatten(df, id_map, cat_features, id_col=id_col), id_map
163194
except Exception as e: # pragma: no cover
164195
raise RuntimeError(f"An error has occurred while constructing {feature_type} features: {e!r}")
165196

166-
def get_user_item_matrix(self, include_weights: bool = True) -> sparse.csr_matrix:
197+
def get_user_item_matrix(self, include_weights: bool = True, include_warm: bool = False) -> sparse.csr_matrix:
167198
"""
168199
Construct user-item CSR matrix based on `interactions` attribute.
169200
@@ -177,14 +208,19 @@ def get_user_item_matrix(self, include_weights: bool = True) -> sparse.csr_matri
177208
include_weights : bool, default ``True``
178209
Whether include interaction weights in matrix or not.
179210
If False, all values in returned matrix will be equal to ``1``.
211+
include_warm : bool, default ``False``
212+
Whether to include warm users and items into the matrix or not.
213+
Rows and columns for warm users and items will be added to the end of matrix,
214+
they will contain only zeros.
180215
181216
Returns
182217
-------
183218
csr_matrix
184219
Resized user-item CSR matrix
185220
"""
186221
matrix = self.interactions.get_user_item_matrix(include_weights)
187-
matrix.resize(self.user_id_map.internal_ids.size, self.item_id_map.internal_ids.size)
222+
if include_warm:
223+
matrix.resize(self.user_id_map.size, self.item_id_map.size)
188224
return matrix
189225

190226
def get_raw_interactions(self, include_weight: bool = True, include_datetime: bool = True) -> pd.DataFrame:

rectools/dataset/features.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,10 @@ def take(self, ids: InternalIds) -> "DenseFeatures":
160160
names=self.names,
161161
)
162162

163+
def __len__(self) -> int:
164+
"""Return number of objects."""
165+
return self.values.shape[0]
166+
163167

164168
SparseFeatureName = tp.Tuple[str, tp.Any]
165169

@@ -442,5 +446,9 @@ def take(self, ids: InternalIds) -> "SparseFeatures":
442446
names=self.names,
443447
)
444448

449+
def __len__(self) -> int:
450+
"""Return number of objects."""
451+
return self.values.shape[0]
452+
445453

446454
Features = tp.Union[DenseFeatures, SparseFeatures]

rectools/dataset/identifiers.py

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import attr
2121
import numpy as np
2222
import pandas as pd
23+
import typing_extensions as tpe
2324

2425
from rectools import ExternalId, ExternalIds, InternalId, InternalIds
2526
from rectools.utils import fast_isin, get_from_series_by_index
@@ -97,6 +98,11 @@ def size(self) -> int:
9798
"""Return number of ids in map."""
9899
return self.external_ids.size
99100

101+
@property
102+
def external_dtype(self) -> tp.Type:
103+
"""Return dtype of external ids."""
104+
return self.external_ids.dtype
105+
100106
@property
101107
def to_internal(self) -> pd.Series:
102108
"""Map external->internal."""
@@ -120,7 +126,21 @@ def get_external_sorted_by_internal(self) -> np.ndarray:
120126
"""Return array of external ids sorted by internal ids."""
121127
return self.external_ids
122128

123-
def convert_to_internal(self, external: ExternalIds, strict: bool = True) -> np.ndarray:
129+
@tp.overload
130+
def convert_to_internal( # noqa: D102
131+
self, external: ExternalIds, strict: bool = ..., return_missing: tpe.Literal[False] = False
132+
) -> np.ndarray: # pragma: no cover
133+
...
134+
135+
@tp.overload
136+
def convert_to_internal( # noqa: D102
137+
self, external: ExternalIds, strict: bool = ..., *, return_missing: tpe.Literal[True]
138+
) -> tp.Tuple[np.ndarray, np.ndarray]: # pragma: no cover
139+
...
140+
141+
def convert_to_internal(
142+
self, external: ExternalIds, strict: bool = True, return_missing: bool = False
143+
) -> tp.Union[np.ndarray, tp.Tuple[np.ndarray, np.ndarray]]:
124144
"""
125145
Convert any sequence of external ids to array of internal ids (map external -> internal).
126146
@@ -132,21 +152,43 @@ def convert_to_internal(self, external: ExternalIds, strict: bool = True) -> np.
132152
Defines behaviour when some of given external ids do not exist in mapping.
133153
- If ``True``, `KeyError` will be raised;
134154
- If ``False``, nonexistent ids will be skipped.
155+
return_missing : bool, default ``False``
156+
If True, return a tuple of 2 arrays: internal ids and missing ids (that are not in map).
157+
Works only if `strict` is False.
135158
136159
Returns
137160
-------
138161
np.ndarray
139162
Array of internal ids.
163+
np.ndarray, np.ndarray
164+
Tuple of 2 arrays: internal ids and missing ids.
165+
Only if `strict` is False and `return_missing` is True.
140166
141167
Raises
142168
------
143169
KeyError
144170
If some of given external ids do not exist in mapping and `strict` flag is ``True``.
171+
ValueError
172+
If `strict` and `return_missing` are both ``True``.
145173
"""
146-
internal = get_from_series_by_index(self.to_internal, external, strict)
147-
return internal
148-
149-
def convert_to_external(self, internal: InternalIds, strict: bool = True) -> np.ndarray:
174+
result = get_from_series_by_index(self.to_internal, external, strict, return_missing)
175+
return result
176+
177+
@tp.overload
178+
def convert_to_external( # noqa: D102
179+
self, internal: InternalIds, strict: bool = ..., return_missing: tpe.Literal[False] = False
180+
) -> np.ndarray: # pragma: no cover
181+
...
182+
183+
@tp.overload
184+
def convert_to_external( # noqa: D102
185+
self, internal: InternalIds, strict: bool = ..., *, return_missing: tpe.Literal[True]
186+
) -> tp.Tuple[np.ndarray, np.ndarray]: # pragma: no cover
187+
...
188+
189+
def convert_to_external(
190+
self, internal: InternalIds, strict: bool = True, return_missing: bool = False
191+
) -> tp.Union[np.ndarray, tp.Tuple[np.ndarray, np.ndarray]]:
150192
"""
151193
Convert any sequence of internal ids to array of external ids (map internal -> external).
152194
@@ -158,19 +200,27 @@ def convert_to_external(self, internal: InternalIds, strict: bool = True) -> np.
158200
Defines behaviour when some of given internal ids do not exist in mapping.
159201
- If ``True``, `KeyError` will be raised;
160202
- If ``False``, nonexistent ids will be skipped.
203+
return_missing : bool, default ``False``
204+
If True, return a tuple of 2 arrays: external ids and missing ids (that are not in map).
205+
Works only if `strict` is False.
161206
162207
Returns
163208
-------
164209
np.ndarray
165210
Array of external ids.
211+
np.ndarray, np.ndarray
212+
Tuple of 2 arrays: external ids and missing ids.
213+
Only if `strict` is False and `return_missing` is True.
166214
167215
Raises
168216
------
169217
KeyError
170218
If some of given internal ids do not exist in mapping and `strict` flag is True.
219+
ValueError
220+
If `strict` and `return_missing` are both ``True``.
171221
"""
172-
external = get_from_series_by_index(self.to_external, internal, strict)
173-
return external
222+
result = get_from_series_by_index(self.to_external, internal, strict, return_missing)
223+
return result
174224

175225
def add_ids(self, values: ExternalIds, raise_if_already_present: bool = False) -> "IdMap":
176226
"""

rectools/dataset/torch_datasets.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,15 +70,15 @@ def __init__(
7070
@classmethod
7171
def from_dataset(cls: tp.Type[DD], dataset: Dataset) -> DD:
7272
ui_matrix = dataset.get_user_item_matrix()
73-
if dataset.item_features is not None:
74-
item_features = dataset.item_features.get_sparse()
75-
else:
73+
74+
# We take hot here since this dataset is used for fit only
75+
item_features = dataset.get_hot_item_features()
76+
user_features = dataset.get_hot_user_features()
77+
if item_features is None:
7678
raise AttributeError("Item features attribute of dataset could not be None")
77-
if dataset.user_features is not None:
78-
user_features = dataset.user_features.get_sparse()
79-
else:
79+
if user_features is None:
8080
raise AttributeError("User features attribute of dataset could not be None")
81-
return cls(items=item_features, users=user_features, interactions=ui_matrix)
81+
return cls(items=item_features.get_sparse(), users=user_features.get_sparse(), interactions=ui_matrix)
8282

8383
def __len__(self) -> int:
8484
return self.interactions.shape[0]
@@ -114,6 +114,7 @@ def __init__(self, items: sparse.csr_matrix):
114114

115115
@classmethod
116116
def from_dataset(cls: tp.Type[ID], dataset: Dataset) -> ID:
117+
# We take all features here since this dataset is used for recommend only, not for fit
117118
if dataset.item_features is not None:
118119
return cls(dataset.item_features.get_sparse())
119120
raise AttributeError("Item features attribute of dataset could not be None")
@@ -155,6 +156,7 @@ def from_dataset(
155156
dataset: Dataset,
156157
keep_users: tp.Optional[tp.Sequence[int]] = None,
157158
) -> UD:
159+
# We take all features here since this dataset is used for recommend only, not for fit
158160
if dataset.user_features is not None:
159161
return cls(
160162
dataset.user_features.get_sparse(),

0 commit comments

Comments
 (0)