Skip to content

fix multicategorical stype inference and add test case #420

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions test/utils/test_infer_stype.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pandas as pd
import pytest

import torch_frame
Expand Down Expand Up @@ -38,3 +39,16 @@ def test_infer_df_stype(with_nan):
dataset = get_fake_dataset(num_rows, col_to_text_embedder_cfg, with_nan)
col_to_stype_inferred = infer_df_stype(dataset.df)
assert col_to_stype_inferred == dataset.col_to_stype


def test_infer_multicategorical_stype():
    """Check that a column whose cells are Python lists is inferred as
    ``torch_frame.multicategorical``.

    The four distinct rows are repeated 50 times, giving 200 rows, which
    matches the length of the ``id`` column.
    """
    # Test when multicategoricals are lists
    df = pd.DataFrame({
        # Note that one element ('Mystery, Thriller') itself contains a
        # comma inside a single list entry.
        'category': [['Books', 'Mystery, Thriller'],
                     ['Books', "Children's Books", 'Geography'],
                     ['Books', 'Health', 'Fitness & Dieting'],
                     ['Books', 'Teen & oung Adult']] * 50,
        'id': list(range(200)),
    })
    col_to_stype_inferred = infer_df_stype(df)
    assert col_to_stype_inferred['category'] == torch_frame.multicategorical
34 changes: 21 additions & 13 deletions torch_frame/utils/infer_stype.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import warnings
from typing import Any

import numpy as np
import pandas as pd
import pandas.api.types as ptypes
from dateutil.parser import ParserError
Expand Down Expand Up @@ -137,19 +138,26 @@
return stype.embedding

# Try different possible seps and pick the largest min_count.
min_count_list = []
for sep in POSSIBLE_SEPS:
try:
min_count_list.append(
_min_count(
ser.apply(lambda row: MultiCategoricalTensorMapper.
split_by_sep(row, sep)).explode()))
except Exception as e:
logging.warn(
"Mapping series into multicategorical stype "
f"with separator {sep} raised an exception {e}")
continue
if max(min_count_list) > cat_min_count_thresh:
if isinstance(ser.iloc[0], list) or isinstance(
ser.iloc[0], np.ndarray):
max_min_count = _min_count(ser.explode())

Check warning on line 143 in torch_frame/utils/infer_stype.py

View check run for this annotation

Codecov / codecov/patch

torch_frame/utils/infer_stype.py#L143

Added line #L143 was not covered by tests
else:
min_count_list = []
for sep in POSSIBLE_SEPS:
try:
min_count_list.append(
_min_count(
ser.apply(
lambda row: MultiCategoricalTensorMapper.
split_by_sep(row, sep)).explode()))
except Exception as e:
logging.warn(

Check warning on line 154 in torch_frame/utils/infer_stype.py

View check run for this annotation

Codecov / codecov/patch

torch_frame/utils/infer_stype.py#L153-L154

Added lines #L153 - L154 were not covered by tests
"Mapping series into multicategorical stype "
f"with separator {sep} raised an exception {e}")
continue

Check warning on line 157 in torch_frame/utils/infer_stype.py

View check run for this annotation

Codecov / codecov/patch

torch_frame/utils/infer_stype.py#L157

Added line #L157 was not covered by tests
max_min_count = max(min_count_list or [0])

if max_min_count > cat_min_count_thresh:
return stype.multicategorical
else:
return stype.text_embedded
Expand Down
Loading