Skip to content

feat: reduce duplicate fields on join #1184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions python/datafusion/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,7 @@ def join(
left_on: None = None,
right_on: None = None,
join_keys: None = None,
keep_duplicate_keys: bool = False,
) -> DataFrame: ...

@overload
Expand All @@ -655,6 +656,7 @@ def join(
left_on: str | Sequence[str],
right_on: str | Sequence[str],
join_keys: tuple[list[str], list[str]] | None = None,
keep_duplicate_keys: bool = False,
) -> DataFrame: ...

@overload
Expand All @@ -667,6 +669,7 @@ def join(
join_keys: tuple[list[str], list[str]],
left_on: None = None,
right_on: None = None,
keep_duplicate_keys: bool = False,
) -> DataFrame: ...

def join(
Expand All @@ -678,6 +681,7 @@ def join(
left_on: str | Sequence[str] | None = None,
right_on: str | Sequence[str] | None = None,
join_keys: tuple[list[str], list[str]] | None = None,
keep_duplicate_keys: bool = False,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name `keep_duplicate_keys` is somewhat confusing: when it is `False`, the right-side key columns are dropped.
A more direct name, such as `drop_duplicate_keys: bool = False` (default) or `deduplicate: bool = False`, may better express the intent.

Additionally, in many DataFrame libraries (Pandas, PySpark), parameters such as `suffixes` or `indicator` are used for duplicate-column handling. Consider whether a suffix-based approach (with default `('', '_right')`) could be more familiar to users than a boolean drop flag.

) -> DataFrame:
"""Join this :py:class:`DataFrame` with another :py:class:`DataFrame`.

Expand All @@ -690,11 +694,23 @@ def join(
"right", "full", "semi", "anti".
left_on: Join column of the left dataframe.
right_on: Join column of the right dataframe.
keep_duplicate_keys: When False, the columns from the right DataFrame
that have identical names in the ``on`` fields to the left DataFrame
will be dropped.
join_keys: Tuple of two lists of column names to join on. [Deprecated]

Returns:
DataFrame after join.
"""
if join_keys is not None:
warnings.warn(
"`join_keys` is deprecated, use `on` or `left_on` with `right_on`",
category=DeprecationWarning,
stacklevel=2,
)
left_on = join_keys[0]
right_on = join_keys[1]

# This check is to prevent breaking API changes where users prior to
# DF 43.0.0 would pass the join_keys as a positional argument instead
# of a keyword argument.
Expand All @@ -705,18 +721,10 @@ def join(
and isinstance(on[1], list)
):
# We know this is safe because we've checked the types
join_keys = on # type: ignore[assignment]
left_on = on[0]
right_on = on[1]
on = None

if join_keys is not None:
warnings.warn(
"`join_keys` is deprecated, use `on` or `left_on` with `right_on`",
category=DeprecationWarning,
stacklevel=2,
)
left_on = join_keys[0]
right_on = join_keys[1]

if on is not None:
if left_on is not None or right_on is not None:
error_msg = "`left_on` or `right_on` should not provided with `on`"
Expand All @@ -735,7 +743,9 @@ def join(
if isinstance(right_on, str):
right_on = [right_on]

return DataFrame(self.df.join(right.df, how, left_on, right_on))
return DataFrame(
self.df.join(right.df, how, left_on, right_on, keep_duplicate_keys)
)

def join_on(
self,
Expand Down
2 changes: 1 addition & 1 deletion python/datafusion/dataframe_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ def _build_table_container_start(self) -> list[str]:
f"max-height: {self.max_height}px; overflow: auto; border: "
'1px solid #ccc;">'
)
html.append('<table style="border-collapse: collapse; min-width: 100%">')
html.append('<table style="border-collapse: collapse">')
return html

def _build_table_header(self, schema: Any) -> list[str]:
Expand Down
1 change: 0 additions & 1 deletion python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,6 @@ def test_unnest_without_nulls(nested_df):
assert result.column(1) == pa.array([7, 8, 8, 9, 9, 9])


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to add new tests covering both `keep_duplicate_keys=False` and `keep_duplicate_keys=True`. To ensure coverage, add tests verifying that passing `keep_duplicate_keys=True` preserves both columns.

@pytest.mark.filterwarnings("ignore:`join_keys`:DeprecationWarning")
def test_join():
ctx = SessionContext()

Expand Down
3 changes: 3 additions & 0 deletions python/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ def test_register_parquet(ctx, tmp_path):
assert result.to_pydict() == {"cnt": [100]}


@pytest.mark.filterwarnings(
"ignore:using literals for table_partition_cols data types:DeprecationWarning"
)
@pytest.mark.parametrize(
("path_to_str", "legacy_data_type"), [(True, False), (False, False), (False, True)]
)
Expand Down
36 changes: 35 additions & 1 deletion src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ impl PyDataFrame {
how: &str,
left_on: Vec<PyBackedStr>,
right_on: Vec<PyBackedStr>,
keep_duplicate_keys: bool,
) -> PyDataFusionResult<Self> {
let join_type = match how {
"inner" => JoinType::Inner,
Expand All @@ -584,13 +585,46 @@ impl PyDataFrame {
let left_keys = left_on.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
let right_keys = right_on.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();

let df = self.df.as_ref().clone().join(
let mut df = self.df.as_ref().clone().join(
right.df.as_ref().clone(),
join_type,
&left_keys,
&right_keys,
None,
)?;

if !keep_duplicate_keys {
let mutual_keys = left_keys
.iter()
.zip(right_keys.iter())
.filter(|(l, r)| l == r)
.map(|(key, _)| *key)
.collect::<Vec<_>>();

let fields_to_drop = mutual_keys
.iter()
.map(|name| {
df.logical_plan()
.schema()
.qualified_fields_with_unqualified_name(name)
})
.filter(|r| r.len() == 2)
.map(|r| r[1])
.collect::<Vec<_>>();

let expr: Vec<Expr> = df
.logical_plan()
.schema()
.fields()
.into_iter()
.enumerate()
.map(|(idx, _)| df.logical_plan().schema().qualified_field(idx))
.filter(|(qualifier, f)| !fields_to_drop.contains(&(*qualifier, f)))
.map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field))))
.collect();
df = df.select(expr)?;
}

Ok(Self::new(df))
}

Expand Down