Skip to content

Commit 853f064

Browse files
bors[bot]petrosagg
andauthored
Merge #502
502: add .duplicates() and .duplicates_by(..) operations r=jswrenn a=petrosagg Uses a HashMap to detect duplicates in an iterator and emits them only once. The implementation is similar to the `unique()` and `unique_by()` methods but in this case the items are never cloned. Co-authored-by: Petros Angelatos <petrosagg@gmail.com>
2 parents a71fce5 + e1090aa commit 853f064

File tree

4 files changed

+290
-0
lines changed

4 files changed

+290
-0
lines changed

src/duplicates_impl.rs

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
use std::hash::Hash;
2+
3+
mod private {
4+
use std::collections::HashMap;
5+
use std::hash::Hash;
6+
use std::fmt;
7+
8+
#[derive(Clone)]
9+
#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
10+
pub struct DuplicatesBy<I: Iterator, Key, F> {
11+
pub(crate) iter: I,
12+
pub(crate) meta: Meta<Key, F>,
13+
}
14+
15+
impl<I, V, F> fmt::Debug for DuplicatesBy<I, V, F>
16+
where
17+
I: Iterator + fmt::Debug,
18+
V: fmt::Debug + Hash + Eq,
19+
{
20+
debug_fmt_fields!(DuplicatesBy, iter, meta.used);
21+
}
22+
23+
impl<I: Iterator, Key: Eq + Hash, F> DuplicatesBy<I, Key, F> {
24+
pub(crate) fn new(iter: I, key_method: F) -> Self {
25+
DuplicatesBy {
26+
iter,
27+
meta: Meta {
28+
used: HashMap::new(),
29+
pending: 0,
30+
key_method,
31+
},
32+
}
33+
}
34+
}
35+
36+
#[derive(Clone)]
37+
pub struct Meta<Key, F> {
38+
used: HashMap<Key, bool>,
39+
pending: usize,
40+
key_method: F,
41+
}
42+
43+
impl<Key, F> Meta<Key, F>
44+
where
45+
Key: Eq + Hash,
46+
{
47+
/// Takes an item and returns it back to the caller if it's the second time we see it.
48+
/// Otherwise the item is consumed and None is returned
49+
#[inline(always)]
50+
fn filter<I>(&mut self, item: I) -> Option<I>
51+
where
52+
F: KeyMethod<Key, I>,
53+
{
54+
let kv = self.key_method.make(item);
55+
match self.used.get_mut(kv.key_ref()) {
56+
None => {
57+
self.used.insert(kv.key(), false);
58+
self.pending += 1;
59+
None
60+
}
61+
Some(true) => None,
62+
Some(produced) => {
63+
*produced = true;
64+
self.pending -= 1;
65+
Some(kv.value())
66+
}
67+
}
68+
}
69+
}
70+
71+
impl<I, Key, F> Iterator for DuplicatesBy<I, Key, F>
72+
where
73+
I: Iterator,
74+
Key: Eq + Hash,
75+
F: KeyMethod<Key, I::Item>,
76+
{
77+
type Item = I::Item;
78+
79+
fn next(&mut self) -> Option<Self::Item> {
80+
let DuplicatesBy { iter, meta } = self;
81+
iter.find_map(|v| meta.filter(v))
82+
}
83+
84+
#[inline]
85+
fn size_hint(&self) -> (usize, Option<usize>) {
86+
let (_, hi) = self.iter.size_hint();
87+
// There are `hi` number of items left in the base iterator. In the best case scenario,
88+
// these items are exactly the same as the ones pending (i.e items seen exactly once so
89+
// far), plus (hi - pending) / 2 pairs of never seen before items.
90+
let hi = hi.map(|hi| {
91+
let max_pending = std::cmp::min(self.meta.pending, hi);
92+
let max_new = std::cmp::max(hi - self.meta.pending, 0) / 2;
93+
max_pending + max_new
94+
});
95+
// The lower bound is always 0 since we might only get unique items from now on
96+
(0, hi)
97+
}
98+
}
99+
100+
impl<I, Key, F> DoubleEndedIterator for DuplicatesBy<I, Key, F>
101+
where
102+
I: DoubleEndedIterator,
103+
Key: Eq + Hash,
104+
F: KeyMethod<Key, I::Item>,
105+
{
106+
fn next_back(&mut self) -> Option<Self::Item> {
107+
let DuplicatesBy { iter, meta } = self;
108+
iter.rev().find_map(|v| meta.filter(v))
109+
}
110+
}
111+
112+
/// A keying method for use with `DuplicatesBy`
113+
pub trait KeyMethod<K, V> {
114+
type Container: KeyXorValue<K, V>;
115+
116+
fn make(&mut self, value: V) -> Self::Container;
117+
}
118+
119+
/// Apply the identity function to elements before checking them for equality.
120+
pub struct ById;
121+
impl<V> KeyMethod<V, V> for ById {
122+
type Container = JustValue<V>;
123+
124+
fn make(&mut self, v: V) -> Self::Container {
125+
JustValue(v)
126+
}
127+
}
128+
129+
/// Apply a user-supplied function to elements before checking them for equality.
130+
pub struct ByFn<F>(pub(crate) F);
131+
impl<K, V, F> KeyMethod<K, V> for ByFn<F>
132+
where
133+
F: FnMut(&V) -> K,
134+
{
135+
type Container = KeyValue<K, V>;
136+
137+
fn make(&mut self, v: V) -> Self::Container {
138+
KeyValue((self.0)(&v), v)
139+
}
140+
}
141+
142+
// Implementors of this trait can hold onto a key and a value but only give access to one of them
143+
// at a time. This allows the key and the value to be the same value internally
144+
pub trait KeyXorValue<K, V> {
145+
fn key_ref(&self) -> &K;
146+
fn key(self) -> K;
147+
fn value(self) -> V;
148+
}
149+
150+
pub struct KeyValue<K, V>(K, V);
151+
impl<K, V> KeyXorValue<K, V> for KeyValue<K, V> {
152+
fn key_ref(&self) -> &K {
153+
&self.0
154+
}
155+
fn key(self) -> K {
156+
self.0
157+
}
158+
fn value(self) -> V {
159+
self.1
160+
}
161+
}
162+
163+
pub struct JustValue<V>(V);
164+
impl<V> KeyXorValue<V, V> for JustValue<V> {
165+
fn key_ref(&self) -> &V {
166+
&self.0
167+
}
168+
fn key(self) -> V {
169+
self.0
170+
}
171+
fn value(self) -> V {
172+
self.0
173+
}
174+
}
175+
}
176+
177+
/// An iterator adapter to filter for duplicate elements.
///
/// See [`.duplicates_by()`](../trait.Itertools.html#method.duplicates_by) for more information.
#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
pub type DuplicatesBy<I, V, F> = private::DuplicatesBy<I, V, private::ByFn<F>>;
182+
183+
/// Create a new `DuplicatesBy` iterator.
184+
pub fn duplicates_by<I, Key, F>(iter: I, f: F) -> DuplicatesBy<I, Key, F>
185+
where
186+
Key: Eq + Hash,
187+
F: FnMut(&I::Item) -> Key,
188+
I: Iterator,
189+
{
190+
DuplicatesBy::new(iter, private::ByFn(f))
191+
}
192+
193+
/// An iterator adapter to filter for duplicate elements.
///
/// (The adaptor *yields* the duplicated elements; unlike `Unique`, it does not filter them out.)
///
/// See [`.duplicates()`](../trait.Itertools.html#method.duplicates) for more information.
pub type Duplicates<I> = private::DuplicatesBy<I, <I as Iterator>::Item, private::ById>;
197+
198+
/// Create a new `Duplicates` iterator.
199+
pub fn duplicates<I>(iter: I) -> Duplicates<I>
200+
where
201+
I: Iterator,
202+
I::Item: Eq + Hash,
203+
{
204+
Duplicates::new(iter, private::ById)
205+
}
206+

src/lib.rs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ pub mod structs {
149149
pub use crate::tee::Tee;
150150
pub use crate::tuple_impl::{TupleBuffer, TupleWindows, CircularTupleWindows, Tuples};
151151
#[cfg(feature = "use_std")]
152+
pub use crate::duplicates_impl::{Duplicates, DuplicatesBy};
153+
#[cfg(feature = "use_std")]
152154
pub use crate::unique_impl::{Unique, UniqueBy};
153155
pub use crate::with_position::WithPosition;
154156
pub use crate::zip_eq_impl::ZipEq;
@@ -230,6 +232,8 @@ mod sources;
230232
mod tee;
231233
mod tuple_impl;
232234
#[cfg(feature = "use_std")]
235+
mod duplicates_impl;
236+
#[cfg(feature = "use_std")]
233237
mod unique_impl;
234238
mod with_position;
235239
mod zip_eq_impl;
@@ -1147,6 +1151,54 @@ pub trait Itertools : Iterator {
11471151
adaptors::dedup_by_with_count(self, cmp)
11481152
}
11491153

1154+
/// Return an iterator adaptor that produces elements that appear more than once during the
/// iteration. Duplicates are detected using hash and equality.
///
/// The iterator is stable, returning the duplicate items in the order in which they occur in
/// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more
/// than twice, the second item is the item retained and the rest are discarded.
///
/// ```
/// use itertools::Itertools;
///
/// let data = vec![10, 20, 30, 20, 40, 10, 50];
/// itertools::assert_equal(data.into_iter().duplicates(),
///                         vec![20, 10]);
/// ```
#[cfg(feature = "use_std")]
fn duplicates(self) -> Duplicates<Self>
    where Self: Sized,
          Self::Item: Eq + Hash
{
    duplicates_impl::duplicates(self)
}
1176+
/// Return an iterator adaptor that produces elements that appear more than once during the
/// iteration. Duplicates are detected using hash and equality.
///
/// Duplicates are detected by comparing the key they map to with the keying function `f` by
/// hash and equality. The keys are stored in a hash map in the iterator.
///
/// The iterator is stable, returning the duplicate items in the order in which they occur in
/// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more
/// than twice, the second item is the item retained and the rest are discarded.
///
/// ```
/// use itertools::Itertools;
///
/// let data = vec!["a", "bb", "aa", "c", "ccc"];
/// itertools::assert_equal(data.into_iter().duplicates_by(|s| s.len()),
///                         vec!["aa", "c"]);
/// ```
#[cfg(feature = "use_std")]
fn duplicates_by<V, F>(self, f: F) -> DuplicatesBy<Self, V, F>
    where Self: Sized,
          V: Eq + Hash,
          F: FnMut(&Self::Item) -> V
{
    duplicates_impl::duplicates_by(self, f)
}
1201+
11501202
/// Return an iterator adaptor that filters out elements that have
11511203
/// already been produced once during the iteration. Duplicates
11521204
/// are detected using hash and equality.

tests/quick.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,12 @@ quickcheck! {
915915
}
916916
}
917917

918+
quickcheck! {
    // The size hint reported by `Duplicates` must bracket the number of items
    // it actually yields, for arbitrary inputs.
    fn size_duplicates(it: Iter<i8>) -> bool {
        correct_size_hint(it.duplicates())
    }
}
923+
918924
quickcheck! {
919925
fn size_unique(it: Iter<i8>) -> bool {
920926
correct_size_hint(it.unique())

tests/test_std.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,32 @@ fn interleave_shortest() {
5959
assert_eq!(it.size_hint(), (6, Some(6)));
6060
}
6161

62+
#[test]
fn duplicates_by() {
    // Items are keyed by their first two characters; the duplicate is the
    // second occurrence of each key, in encounter order.
    let input = ["aaa", "bbbbb", "aa", "ccc", "bbbb", "aaaaa", "cccc"];
    let expected = ["aa", "bbbb", "cccc"];
    let key = |x: &&str| x[..2].to_string();
    it::assert_equal(expected.iter(), input.iter().duplicates_by(key));
    // Reversing both the input and the output restores forward order.
    it::assert_equal(expected.iter(), input.iter().rev().duplicates_by(key).rev());
    // Iterating from the back picks the *last*-but-one occurrence of each key.
    let expected_rev = ["ccc", "aa", "bbbbb"];
    it::assert_equal(expected_rev.iter(), input.iter().duplicates_by(key).rev());
}
71+
72+
#[test]
fn duplicates() {
    // Each duplicated value appears once, at the position of its second occurrence.
    let input = [0, 1, 2, 3, 2, 1, 3];
    let expected = [2, 1, 3];
    it::assert_equal(expected.iter(), input.iter().duplicates());
    // Double reversal restores forward order.
    it::assert_equal(expected.iter(), input.iter().rev().duplicates().rev());
    // Backward iteration reports duplicates in reverse encounter order.
    let expected_rev = [3, 2, 1];
    it::assert_equal(expected_rev.iter(), input.iter().duplicates().rev());

    // Every element duplicated exactly once: output preserves first-pair order.
    let input = [0, 1, 0, 1];
    let expected = [0, 1];
    it::assert_equal(expected.iter(), input.iter().duplicates());
    it::assert_equal(expected.iter(), input.iter().rev().duplicates().rev());
    let expected_rev = [1, 0];
    it::assert_equal(expected_rev.iter(), input.iter().duplicates().rev());
}
6288

6389
#[test]
6490
fn unique_by() {

0 commit comments

Comments
 (0)