Commit 5c3476d

Cold users/items support in non-personalized models (#120)
- Supported cold recommendations in non-personalized models (random, popular and popular-in-category)
- Added check for incorrect types of cold targets
- Changed the logic of choosing random sampler for `RandomModel` and increased the sampling speed
- Changed the logic of `RandomModel`: now the recommendations are different for repeated calls of recommend methods
1 parent 51cbd8d commit 5c3476d
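
To make the change concrete, here is a minimal usage sketch. It is not part of the commit: it assumes the public `Dataset.construct` / `fit` / `recommend` API, and the toy interactions and ids below are made up. With this commit, targets absent from the dataset ("cold" users) get recommendations from the non-personalized models instead of being rejected.

```python
import pandas as pd

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import PopularModel, RandomModel

# Hypothetical toy interactions; users 1-3 and items 10-30 are known, user 100 is not.
interactions = pd.DataFrame(
    {
        Columns.User: [1, 1, 2, 2, 3],
        Columns.Item: [10, 20, 10, 30, 20],
        Columns.Weight: [1, 1, 1, 1, 1],
        Columns.Datetime: ["2024-01-01"] * 5,
    }
)
dataset = Dataset.construct(interactions)

for model in (PopularModel(), RandomModel(random_state=32)):
    model.fit(dataset)
    # User 100 is absent from the interactions, i.e. "cold":
    # after this commit it receives recommendations as well.
    reco = model.recommend(users=[1, 100], dataset=dataset, k=2, filter_viewed=False)
    print(reco)
```

The same applies to `PopularInCategoryModel` and to cold target items passed to `recommend_to_items`.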

File tree

10 files changed: +327 additions, -217 deletions

10 files changed

+327
-217
lines changed

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
@@ -10,9 +10,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - Warm users/items support in `Dataset` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
-- Warm and cold users/items support in `ModelBase` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
+- Warm and cold users/items support in `ModelBase` and non-personalized models ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77), [#120](https://github.com/MobileTeleSystems/RecTools/pull/120))
 - Warm and cold users/items support in `cross_validate` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
 
+### Changed
+- Changed the logic of choosing random sampler for `RandomModel` and increased the sampling speed ([#120](https://github.com/MobileTeleSystems/RecTools/pull/120))
+- Changed the logic of `RandomModel`: now the recommendations are different for repeated calls of recommend methods ([#120](https://github.com/MobileTeleSystems/RecTools/pull/120))
+
 ### Removed
 - `return_external_ids` parameter in `recommend` and `recommend_to_items` model methods ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))

rectools/models/base.py

Lines changed: 18 additions & 2 deletions
@@ -147,7 +147,11 @@ def recommend(
 
         # Here for hot and warm we get internal ids, for cold we keep given ids
         hot_user_ids, warm_user_ids, cold_user_ids = self._split_targets_by_hot_warm_cold(
-            users, dataset.user_id_map, dataset.n_hot_users, assume_external_ids
+            users,
+            dataset.user_id_map,
+            dataset.n_hot_users,
+            assume_external_ids,
+            "user",
         )
         self._check_targets_are_valid(hot_user_ids, warm_user_ids, cold_user_ids, "user")
 
@@ -257,7 +261,11 @@ def recommend_to_items( # pylint: disable=too-many-branches
 
         # Here for hot and warm we get internal ids, for cold we keep given ids
         hot_target_ids, warm_target_ids, cold_target_ids = self._split_targets_by_hot_warm_cold(
-            target_items, dataset.item_id_map, dataset.n_hot_items, assume_external_ids
+            target_items,
+            dataset.item_id_map,
+            dataset.n_hot_items,
+            assume_external_ids,
+            "item",
        )
         self._check_targets_are_valid(hot_target_ids, warm_target_ids, cold_target_ids, "item")
 
@@ -344,9 +352,17 @@ def _split_targets_by_hot_warm_cold(
         id_map: IdMap,
         n_hot: int,
         assume_external_ids: bool,
+        entity: tpe.Literal["user", "item"],
     ) -> tp.Tuple[InternalIdsArray, InternalIdsArray, AnyIdsArray]:
         if assume_external_ids:
             known_ids, cold_ids = id_map.convert_to_internal(targets, strict=False, return_missing=True)
+            try:
+                cold_ids = cold_ids.astype(id_map.external_dtype)
+            except ValueError:
+                raise TypeError(
+                    f"Given {entity} ids must be convertible to the "
+                    f"{entity}_id` type in dataset ({id_map.external_dtype})"
+                )
         else:
             target_ids = cls._ensure_internal_ids_valid(targets)
             known_mask = target_ids < id_map.size
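
The `entity` argument added here feeds the message of the new dtype check: cold target ids that cannot be cast to the dataset's external id type now fail fast with a `TypeError`. A hedged illustration of that behaviour, reusing `dataset` from the sketch at the top of this page (the ids are made up):

```python
model = PopularModel().fit(dataset)  # `dataset` from the sketch above has integer user ids

# Cold id of a compatible type: 999 is unknown but castable to the dataset's user id dtype.
model.recommend(users=[999], dataset=dataset, k=2, filter_viewed=False)

# Cold id of an incompatible type is now rejected early:
# TypeError: Given user ids must be convertible to the ... type in dataset (int64)
model.recommend(users=["some_id"], dataset=dataset, k=2, filter_viewed=False)
```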

rectools/models/popular.py

Lines changed: 25 additions & 9 deletions
@@ -27,7 +27,7 @@
 from rectools.types import InternalIdsArray
 from rectools.utils import fast_isin_for_sorted_test_elements
 
-from .base import ModelBase, Scores, ScoresArray
+from .base import FixedColdRecoModelMixin, ModelBase, Scores, ScoresArray
 from .utils import get_viewed_item_ids
 
 
@@ -40,7 +40,7 @@ class Popularity(Enum):
     SUM_WEIGHT = "sum_weight"
 
 
-class PopularModel(ModelBase):
+class PopularModel(FixedColdRecoModelMixin, ModelBase):
     """
     Model generating recommendations based on popularity of items.
 
@@ -73,6 +73,9 @@ class PopularModel(ModelBase):
         Degree of verbose output. If ``0``, no output will be provided.
     """
 
+    recommends_for_warm = False
+    recommends_for_cold = True
+
     def __init__(
         self,
         popularity: str = "n_users",
@@ -116,12 +119,12 @@ def _fit(self, dataset: Dataset) -> None: # type: ignore
         items = items_scores.index.values
         scores = items_scores.values.astype(float)
 
-        if self.add_cold: # pragma: no cover # TODO: remove when added support for warm and cold
+        if self.add_cold:
             cold_items = np.setdiff1d(dataset.item_id_map.internal_ids, items)
             items = np.concatenate((items, cold_items))
             scores = np.concatenate((scores, np.zeros(cold_items.size)))
 
-        if self.inverse: # pragma: no cover # TODO: remove when added support for warm and cold
+        if self.inverse:
             items = items[::-1]
             scores = scores[::-1]
 
@@ -147,11 +150,7 @@ def _recommend_u2i(
         filter_viewed: bool,
         sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray],
     ) -> tp.Tuple[InternalIds, InternalIds, Scores]:
-        if sorted_item_ids_to_recommend is not None:
-            valid_items_mask = fast_isin_for_sorted_test_elements(self.popularity_list[0], sorted_item_ids_to_recommend)
-            popularity_list = (self.popularity_list[0][valid_items_mask], self.popularity_list[1][valid_items_mask])
-        else:
-            popularity_list = self.popularity_list
+        popularity_list = self._get_filtered_popularity_list(sorted_item_ids_to_recommend)
 
         if filter_viewed:
             user_items = dataset.get_user_item_matrix(include_weights=False)
@@ -215,3 +214,20 @@ def _recommend_i2i(
         all_reco_ids = np.tile(single_reco, n_targets)
         all_scores = np.tile(single_scores, n_targets)
         return all_target_ids, all_reco_ids, all_scores
+
+    def _get_filtered_popularity_list(
+        self, sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray]
+    ) -> tp.Tuple[InternalIdsArray, ScoresArray]:
+        popularity_list = self.popularity_list
+        if sorted_item_ids_to_recommend is not None:
+            valid_items_mask = fast_isin_for_sorted_test_elements(popularity_list[0], sorted_item_ids_to_recommend)
+            popularity_list = (popularity_list[0][valid_items_mask], popularity_list[1][valid_items_mask])
+        return popularity_list
+
+    def _get_cold_reco(
+        self, k: int, sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray]
+    ) -> tp.Tuple[InternalIds, Scores]:
+        popularity_list = self._get_filtered_popularity_list(sorted_item_ids_to_recommend)
+        reco_ids = popularity_list[0][:k]
+        scores = popularity_list[1][:k]
+        return reco_ids, scores
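
In effect, `_get_cold_reco` means a cold target simply receives the head of the fitted popularity list, after the optional `items_to_recommend` whitelist has been applied. A hedged sketch, reusing `dataset` from the sketch at the top of this page (the ids are made up):

```python
model = PopularModel().fit(dataset)

reco = model.recommend(
    users=[12345],                # id unknown to the dataset -> handled by the cold path
    dataset=dataset,
    k=2,
    filter_viewed=False,
    items_to_recommend=[10, 30],  # whitelist is applied to the popularity list before the top-k cut
)
print(reco)  # whitelisted items ranked by their popularity scores
```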

rectools/models/popular_in_category.py

Lines changed: 55 additions & 8 deletions
@@ -95,6 +95,9 @@ class PopularInCategoryModel(PopularModel):
         Degree of verbose output. If ``0``, no output will be provided.
     """
 
+    recommends_for_warm = False
+    recommends_for_cold = True
+
     def __init__(
         self,
         category_feature: str,
@@ -319,18 +322,62 @@ def _recommend_i2i(
         k: int,
         sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray],
     ) -> tp.Tuple[InternalIds, InternalIds, Scores]:
-        _, single_reco, single_scores = self._recommend_u2i(
-            user_ids=dataset.user_id_map.internal_ids[:1],
-            dataset=dataset,
-            k=k,
-            filter_viewed=False,
-            sorted_item_ids_to_recommend=sorted_item_ids_to_recommend,
-        )
-
+        single_reco, single_scores = self._get_cold_reco(k, sorted_item_ids_to_recommend)
         n_targets = len(target_ids)
         n_reco_per_target = len(single_reco)
 
         all_target_ids = np.repeat(target_ids, n_reco_per_target)
         all_reco_ids = np.tile(single_reco, n_targets)
         all_scores = np.tile(single_scores, n_targets)
         return all_target_ids, all_reco_ids, all_scores
+
+    def _get_cold_reco(
+        self, k: int, sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray]
+    ) -> tp.Tuple[InternalIds, Scores]:
+        num_recs = self._get_num_recs_for_each_category(k)
+        main_recs = []
+        fallback_recs = []
+        for priority, num_col in enumerate(num_recs.index):
+            model = self.models[num_col]
+            reco_ids, reco_scores = model._get_cold_reco(  # pylint: disable=protected-access
+                k, sorted_item_ids_to_recommend
+            )
+            reco_df = pd.DataFrame(
+                {
+                    Columns.Item: reco_ids,
+                    Columns.Score: reco_scores,
+                    "category_priority": priority,
+                }
+            )
+            reco_df["category_rank"] = range(len(reco_df))
+            main_mask = reco_df["category_rank"] < num_recs.loc[num_col]
+            main_recs.append(reco_df[main_mask])
+            fallback_recs.append(reco_df[~main_mask])
+        cat_recs = pd.concat(main_recs, sort=False)
+        cat_recs.drop_duplicates(subset=[Columns.Item], inplace=True)
+        if len(cat_recs) < k:
+            cat_recs["is_main_rec"] = True
+            extra_recs = pd.concat(fallback_recs, sort=False)
+            extra_recs["is_main_rec"] = False
+            full_recs = pd.concat([cat_recs, extra_recs], sort=False)
+            full_recs.drop_duplicates(subset=[Columns.Item], inplace=True)
+
+            # Extra recommendations are given in a specific logic to guarantee that fallback recommendations
+            # never replace main recommendations in final result. And popular category doesn't dominate
+            # over other categories in fallback recs. Thus `rotate` mixing strategy is applied before getting
+            # k recs for each user.
+            full_recs.sort_values(
+                by=["is_main_rec", "category_rank", "category_priority"],
+                ascending=[False, True, True],
+                inplace=True,
+            )
+            full_recs = full_recs.head(k)
+        else:
+            full_recs = cat_recs
+
+        if self.mixing_strategy == MixingStrategy.GROUP:
+            full_recs.sort_values(by=["category_priority", "category_rank"], inplace=True)
+        elif self.mixing_strategy == MixingStrategy.ROTATE:
+            full_recs["category_rank"] = full_recs.groupby(["category_priority"], sort=False).cumcount()
+            full_recs.sort_values(by=["category_rank", "category_priority"], inplace=True)
+        return full_recs[Columns.Item].values, full_recs[Columns.Score].values
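
The mixing logic above can be hard to read in diff form. The toy, self-contained pandas snippet below (not from the repository) reproduces the `rotate` idea: items keep their per-category rank, and sorting by `(category_rank, category_priority)` interleaves the categories.

```python
import pandas as pd

# Three hypothetical categories with priorities 0, 1, 2 and their per-category recommendations.
recs = pd.DataFrame(
    {
        "item_id": [1, 2, 3, 4, 5, 6],
        "category_priority": [0, 0, 0, 1, 1, 2],
    }
)
recs["category_rank"] = recs.groupby("category_priority", sort=False).cumcount()
recs = recs.sort_values(by=["category_rank", "category_priority"])
print(recs["item_id"].tolist())  # [1, 4, 6, 2, 5, 3] - one item per category, then the next round
```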

rectools/models/random.py

Lines changed: 50 additions & 25 deletions
@@ -22,17 +22,32 @@
 
 from rectools import InternalIds
 from rectools.dataset import Dataset
-from rectools.types import InternalIdsArray
+from rectools.types import AnyIdsArray, InternalId, InternalIdsArray
 from rectools.utils import fast_isin_for_sorted_test_elements
 
-from .base import ModelBase, Scores
+from .base import ModelBase, Scores, SemiInternalRecoTriplet
 from .utils import get_viewed_item_ids
 
-# Experiments have shown that for random sampling without replacement if k / n > 0.025
-# where n - size of population, k - required number of samples
-# it's faster to use `np.random.choice(population, k, replace=False)
-# otherwise it's better to use `random.sample(population, k)
-K_TO_N_MIN_NUMPY_RATIO = 0.025
+
+class _RandomGen:
+    def __init__(self, random_state: tp.Optional[int] = None) -> None:
+        self.python_gen = random.Random(random_state)
+        self.np_gen = np.random.default_rng(random_state)
+
+
+class _RandomSampler:
+    def __init__(self, values: np.ndarray, random_gen: _RandomGen) -> None:
+        self.python_gen = random_gen.python_gen
+        self.np_gen = random_gen.np_gen
+        self.values = values
+        self.values_list = list(values)  # for random.sample
+
+    def sample(self, n: int) -> np.ndarray:
+        if n < 25:  # Empiric value, for optimization
+            sampled = np.asarray(self.python_gen.sample(self.values_list, n))
+        else:
+            sampled = self.np_gen.choice(self.values, n, replace=False)
+        return sampled
 
 
 class RandomModel(ModelBase):
@@ -52,9 +67,14 @@ class RandomModel(ModelBase):
         Degree of verbose output. If ``0``, no output will be provided.
     """
 
+    recommends_for_warm = False
+    recommends_for_cold = True
+
     def __init__(self, random_state: tp.Optional[int] = None, verbose: int = 0):
         super().__init__(verbose=verbose)
         self.random_state = random_state
+        self.random_gen = _RandomGen(random_state)
+
         self.all_item_ids: np.ndarray
 
     def _fit(self, dataset: Dataset) -> None: # type: ignore
@@ -71,18 +91,11 @@ def _recommend_u2i(
         if filter_viewed:
             user_items = dataset.get_user_item_matrix(include_weights=False)
 
-        if sorted_item_ids_to_recommend is not None:
-            item_ids = np.unique(sorted_item_ids_to_recommend)
-        else:
-            item_ids = self.all_item_ids
-
-        item_indices = list(range(item_ids.size))  # for random.sample
-
-        np.random.seed(self.random_state)
-        random.seed(self.random_state, version=2)
+        item_ids = sorted_item_ids_to_recommend if sorted_item_ids_to_recommend is not None else self.all_item_ids
+        sampler = _RandomSampler(item_ids, self.random_gen)
 
         all_user_ids = []
-        all_reco_ids = []
+        all_reco_ids: tp.List[InternalId] = []
         all_scores: tp.List[float] = []
         for user_id in tqdm(user_ids, disable=self.verbose == 0):
             if filter_viewed:
@@ -92,21 +105,16 @@
                 n_reco = k
 
             n_reco = min(n_reco, item_ids.size)
-
-            if n_reco / item_ids.size < K_TO_N_MIN_NUMPY_RATIO:
-                reco_indices = random.sample(item_indices, n_reco)
-                reco_ids = item_ids[reco_indices]
-            else:
-                reco_ids = np.random.choice(item_ids, n_reco, replace=False)
+            reco_ids = sampler.sample(n_reco)
 
             if filter_viewed:
                 reco_ids = reco_ids[fast_isin_for_sorted_test_elements(reco_ids, viewed_ids, invert=True)][:k]
 
             reco_scores = np.arange(reco_ids.size, 0, -1)
 
             all_user_ids.extend([user_id] * len(reco_ids))
-            all_reco_ids.extend(reco_ids)
-            all_scores.extend(reco_scores)
+            all_reco_ids.extend(reco_ids.tolist())
+            all_scores.extend(reco_scores.tolist())
 
         return all_user_ids, all_reco_ids, all_scores
 
@@ -118,3 +126,20 @@ def _recommend_i2i(
         sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray],
     ) -> tp.Tuple[InternalIds, InternalIds, Scores]:
         return self._recommend_u2i(target_ids, dataset, k, False, sorted_item_ids_to_recommend)
+
+    def _recommend_cold(
+        self, target_ids: AnyIdsArray, k: int, sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray]
+    ) -> SemiInternalRecoTriplet:
+        item_ids = sorted_item_ids_to_recommend if sorted_item_ids_to_recommend is not None else self.all_item_ids
+        sampler = _RandomSampler(item_ids, self.random_gen)
+        n_reco = min(k, item_ids.size)
+
+        reco_ids_lst = []
+        for _ in tqdm(target_ids, disable=self.verbose == 0):
+            reco_ids = sampler.sample(n_reco)
+            reco_ids_lst.append(reco_ids)
+
+        reco_item_ids = np.concatenate(reco_ids_lst)
+        reco_target_ids = np.repeat(target_ids, n_reco)
+        reco_scores = np.tile(np.arange(n_reco, 0, -1), len(target_ids))
+        return reco_target_ids, reco_item_ids, reco_scores
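
Because the generator now lives on the model (`_RandomGen` is created in `__init__` and shared by every `_RandomSampler`), repeated `recommend` calls continue the same random stream instead of being re-seeded, while two models constructed with the same `random_state` still mirror each other. A hedged sketch, reusing `dataset` from the sketch at the top of this page:

```python
from rectools.models import RandomModel

model_a = RandomModel(random_state=42).fit(dataset)
model_b = RandomModel(random_state=42).fit(dataset)

first_a = model_a.recommend(users=[1], dataset=dataset, k=2, filter_viewed=False)
second_a = model_a.recommend(users=[1], dataset=dataset, k=2, filter_viewed=False)
first_b = model_b.recommend(users=[1], dataset=dataset, k=2, filter_viewed=False)

# Repeated calls on the same model may now differ from each other ...
print(first_a, second_a, sep="\n")
# ... but identically seeded models reproduce each other call by call.
print(first_a.equals(first_b))
```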

tests/model_selection/test_cross_validate.py

Lines changed: 1 addition & 1 deletion
@@ -160,7 +160,7 @@ def setup(self) -> None:
                 None,
                 [
                     {"model": "popular", "i_split": 0, "precision@2": 0.5, "recall@1": 0.5},
-                    {"model": "random", "i_split": 0, "precision@2": 0.5, "recall@1": 1.0},
+                    {"model": "random", "i_split": 0, "precision@2": 0.5, "recall@1": 0.0},
                     {"model": "popular", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25},
                     {"model": "random", "i_split": 1, "precision@2": 0.375, "recall@1": 0.5},
                 ],

tests/models/test_base.py

Lines changed: 7 additions & 0 deletions
@@ -468,6 +468,13 @@ def test_warm_only_model_raises_on_warm_without_features(self, kind: str) -> Non
         with pytest.raises(ValueError, match="doesn't support recommendations for cold"):
             self._get_reco(targets, "hot_warm", "no_features", kind)
 
+    @pytest.mark.parametrize("dataset_key", ("no_features", "with_features"))
+    @pytest.mark.parametrize("kind", ("u2i", "i2i"))
+    @pytest.mark.parametrize("model_key", ("hot_cold", "hot_warm_cold"))
+    def test_raises_on_incorrect_cold_targets_type(self, dataset_key: str, kind: str, model_key: str) -> None:
+        with pytest.raises(TypeError):
+            self._get_reco(["some_id"], model_key, dataset_key, kind)
+
 
 class TestFixedColdRecoModelMixin:
     def test_cold_reco_works(self) -> None:
