28
28
//! Eventually, we hope that `mz_join_core_v2` proves itself sufficiently to become the only join
29
29
//! implementation.
30
30
31
+ use std:: cell:: Cell ;
31
32
use std:: cell:: RefCell ;
32
33
use std:: cmp:: Ordering ;
33
34
use std:: collections:: VecDeque ;
35
+ use std:: marker:: PhantomData ;
36
+ use std:: pin:: Pin ;
34
37
use std:: rc:: Rc ;
35
38
use std:: time:: Instant ;
36
39
@@ -41,12 +44,12 @@ use differential_dataflow::difference::Multiply;
41
44
use differential_dataflow:: lattice:: Lattice ;
42
45
use differential_dataflow:: operators:: arrange:: arrangement:: Arranged ;
43
46
use differential_dataflow:: trace:: { BatchReader , Cursor , TraceReader } ;
47
+ use mz_ore:: future:: yield_now;
44
48
use mz_repr:: Diff ;
45
49
use timely:: PartialOrder ;
46
50
use timely:: container:: { CapacityContainerBuilder , PushInto , SizableContainer } ;
47
51
use timely:: dataflow:: channels:: pact:: Pipeline ;
48
52
use timely:: dataflow:: channels:: pushers:: Tee ;
49
- use timely:: dataflow:: channels:: pushers:: buffer:: Session ;
50
53
use timely:: dataflow:: operators:: generic:: OutputHandleCore ;
51
54
use timely:: dataflow:: operators:: { Capability , Operator } ;
52
55
use timely:: dataflow:: { Scope , StreamCore } ;
75
78
+ Clone
76
79
+ ' static ,
77
80
L : FnMut ( Tr1 :: Key < ' _ > , Tr1 :: Val < ' _ > , Tr2 :: Val < ' _ > ) -> I + ' static ,
78
- I : IntoIterator ,
79
- I :: Item : Data ,
81
+ I : IntoIterator < Item : Data > + ' static ,
80
82
YFn : Fn ( Instant , usize ) -> bool + ' static ,
81
83
C : SizableContainer + PushInto < ( I :: Item , G :: Timestamp , Diff ) > + Data ,
82
84
{
@@ -497,23 +499,36 @@ where
497
499
C2 : Cursor ,
498
500
{
499
501
/// Pending work.
500
- todo : VecDeque < Deferred < C1 , C2 , D > > ,
502
+ todo : VecDeque < ( Pin < Box < dyn Future < Output = ( ) > > > , Capability < C1 :: Time > ) > ,
501
503
/// A function that transforms raw join matches into join results.
502
504
result_fn : Rc < RefCell < L > > ,
505
+ /// A buffer holding the join results.
506
+ ///
507
+ /// Written by the work futures, drained by `Work::process`.
508
+ output : Rc < RefCell < Vec < ( D , C1 :: Time , Diff ) > > > ,
509
+ /// The number of join results produced by work futures.
510
+ ///
511
+ /// Used with `yield_fn` to inform when `Work::process` should yield.
512
+ produced : Rc < Cell < usize > > ,
513
+
514
+ _cursors : PhantomData < ( C1 , C2 ) > ,
503
515
}
504
516
505
517
impl < C1 , C2 , D , L , I > Work < C1 , C2 , D , L >
506
518
where
507
- C1 : Cursor < Diff = Diff > ,
508
- C2 : for < ' a > Cursor < Key < ' a > = C1 :: Key < ' a > , Time = C1 :: Time , Diff = Diff > ,
519
+ C1 : Cursor < Diff = Diff > + ' static ,
520
+ C2 : for < ' a > Cursor < Key < ' a > = C1 :: Key < ' a > , Time = C1 :: Time , Diff = Diff > + ' static ,
509
521
D : Data ,
510
- L : FnMut ( C1 :: Key < ' _ > , C1 :: Val < ' _ > , C2 :: Val < ' _ > ) -> I ,
511
- I : IntoIterator < Item = D > ,
522
+ L : FnMut ( C1 :: Key < ' _ > , C1 :: Val < ' _ > , C2 :: Val < ' _ > ) -> I + ' static ,
523
+ I : IntoIterator < Item = D > + ' static ,
512
524
{
513
525
/// Creates a `Work` that holds the given `result_fn` and starts with every other field at its default value (no pending work queued).
fn new ( result_fn : Rc < RefCell < L > > ) -> Self {
514
526
Self {
515
527
todo : Default :: default ( ) ,
516
528
result_fn,
529
+ output : Default :: default ( ) ,
530
+ produced : Default :: default ( ) ,
531
+ _cursors : PhantomData ,
517
532
}
518
533
}
519
534
@@ -536,15 +551,20 @@ where
536
551
storage2 : C2 :: Storage ,
537
552
capability : Capability < C1 :: Time > ,
538
553
) {
539
- self . todo . push_back ( Deferred {
554
+ let deferred = Deferred {
540
555
cursor1,
541
556
storage1,
542
557
cursor2,
543
558
storage2,
544
- capability,
545
- done : false ,
546
- temp : Default :: default ( ) ,
547
- } ) ;
559
+ capability : capability. clone ( ) ,
560
+ } ;
561
+ let fut = deferred. work (
562
+ Rc :: clone ( & self . result_fn ) ,
563
+ Rc :: clone ( & self . output ) ,
564
+ Rc :: clone ( & self . produced ) ,
565
+ ) ;
566
+
567
+ self . todo . push_back ( ( Box :: pin ( fut) , capability) ) ;
548
568
}
549
569
550
570
/// Discard all pending work.
@@ -562,20 +582,43 @@ where
562
582
YFn : Fn ( Instant , usize ) -> bool ,
563
583
{
564
584
let start_time = Instant :: now ( ) ;
565
- let mut produced = 0 ;
566
-
567
- while !yield_fn ( start_time, produced)
568
- && let Some ( mut deferred) = self . todo . pop_front ( )
569
- {
570
- deferred. work (
571
- output,
572
- & mut * self . result_fn . borrow_mut ( ) ,
573
- |w| yield_fn ( start_time, w) ,
574
- & mut produced,
575
- ) ;
585
+ self . produced . set ( 0 ) ;
586
+
587
+ let waker = futures:: task:: noop_waker ( ) ;
588
+ let mut ctx = std:: task:: Context :: from_waker ( & waker) ;
589
+
590
+ while let Some ( ( mut fut, cap) ) = self . todo . pop_front ( ) {
591
+ // Drive the work future until it's done or it's time to yield.
592
+ let mut done = false ;
593
+ let mut should_yield = false ;
594
+ while !done && !should_yield {
595
+ done = fut. as_mut ( ) . poll ( & mut ctx) . is_ready ( ) ;
596
+ should_yield = yield_fn ( start_time, self . produced . get ( ) ) ;
597
+ }
598
+
599
+ // Drain the produced join results.
600
+ let mut output_buf = self . output . borrow_mut ( ) ;
601
+
602
+ // Consolidating here is important when the join closure produces data that
603
+ // consolidates well, for example when projecting columns.
604
+ let old_len = output_buf. len ( ) ;
605
+ consolidate_updates ( & mut output_buf) ;
606
+ let recovered = old_len - output_buf. len ( ) ;
607
+ self . produced . update ( |x| x - recovered) ;
608
+
609
+ output. session ( & cap) . give_iterator ( output_buf. drain ( ..) ) ;
610
+
611
+ if done {
612
+ // We have finished processing a chunk of work. Use this opportunity to truncate
613
+ // the output buffer, so we don't keep excess memory allocated forever.
614
+ * output_buf = Default :: default ( ) ;
615
+ } else if !done {
616
+ // Still work to do in this chunk.
617
+ self . todo . push_front ( ( fut, cap) ) ;
618
+ }
576
619
577
- if !deferred . done {
578
- self . todo . push_front ( deferred ) ;
620
+ if should_yield {
621
+ break ;
579
622
}
580
623
}
581
624
}
@@ -586,7 +629,7 @@ where
586
629
/// The structure wraps cursors which allow us to play out join computation at whatever rate we like.
587
630
/// This allows us to avoid producing and buffering massive amounts of data, without giving the timely
588
631
/// dataflow system a chance to run operators that can consume and aggregate the data.
589
- struct Deferred < C1 , C2 , D >
632
+ struct Deferred < C1 , C2 >
590
633
where
591
634
C1 : Cursor ,
592
635
C2 : Cursor ,
@@ -596,63 +639,45 @@ where
596
639
cursor2 : C2 ,
597
640
storage2 : C2 :: Storage ,
598
641
capability : Capability < C1 :: Time > ,
599
- done : bool ,
600
- temp : Vec < ( D , C1 :: Time , Diff ) > ,
601
642
}
602
643
603
- impl < C1 , C2 , D > Deferred < C1 , C2 , D >
644
+ impl < C1 , C2 > Deferred < C1 , C2 >
604
645
where
605
646
C1 : Cursor < Diff = Diff > ,
606
647
C2 : for < ' a > Cursor < Key < ' a > = C1 :: Key < ' a > , Time = C1 :: Time , Diff = Diff > ,
607
- D : Data ,
608
648
{
609
649
/// Process matching keys until the work is exhausted, yielding after each `val1` so the driver can pause the join.
610
- fn work < L , I , YFn , C > (
611
- & mut self ,
612
- output : & mut OutputHandleCore < C1 :: Time , CapacityContainerBuilder < C > , Tee < C1 :: Time , C > > ,
613
- mut logic : L ,
614
- yield_fn : YFn ,
615
- produced : & mut usize ,
650
+ async fn work < L , I , D > (
651
+ mut self ,
652
+ logic : Rc < RefCell < L > > ,
653
+ output : Rc < RefCell < Vec < ( D , C1 :: Time , Diff ) > > > ,
654
+ produced : Rc < Cell < usize > > ,
616
655
) where
617
656
I : IntoIterator < Item = D > ,
618
657
L : FnMut ( C1 :: Key < ' _ > , C1 :: Val < ' _ > , C2 :: Val < ' _ > ) -> I ,
619
- YFn : Fn ( usize ) -> bool ,
620
- C : SizableContainer + PushInto < ( D , C1 :: Time , Diff ) > + Data ,
658
+ D : Data ,
621
659
{
622
660
let meet = self . capability . time ( ) ;
623
661
624
- let mut session = output. session ( & self . capability ) ;
625
-
626
662
let storage1 = & self . storage1 ;
627
663
let storage2 = & self . storage2 ;
628
664
629
665
let cursor1 = & mut self . cursor1 ;
630
666
let cursor2 = & mut self . cursor2 ;
631
667
632
- let temp = & mut self . temp ;
633
-
634
- let flush = |data : & mut Vec < _ > , session : & mut Session < _ , _ , _ > | {
635
- let old_len = data. len ( ) ;
636
- // Consolidating here is important when the join closure produces data that
637
- // consolidates well, for example when projecting columns.
638
- consolidate_updates ( data) ;
639
- let recovered = old_len - data. len ( ) ;
640
- session. give_iterator ( data. drain ( ..) ) ;
641
- recovered
642
- } ;
643
-
644
- assert_eq ! ( temp. len( ) , 0 ) ;
645
-
646
668
let mut buffer = Vec :: default ( ) ;
647
669
648
670
while cursor1. key_valid ( storage1) && cursor2. key_valid ( storage2) {
649
671
match cursor1. key ( storage1) . cmp ( & cursor2. key ( storage2) ) {
650
672
Ordering :: Less => cursor1. seek_key ( storage1, cursor2. key ( storage2) ) ,
651
673
Ordering :: Greater => cursor2. seek_key ( storage2, cursor1. key ( storage1) ) ,
652
674
Ordering :: Equal => {
653
- // Populate `temp ` with the results, until we should yield.
675
+ // Populate `output ` with the results, until we should yield.
654
676
let key = cursor2. key ( storage2) ;
655
677
while let Some ( val1) = cursor1. get_val ( storage1) {
678
+ let mut logic = logic. borrow_mut ( ) ;
679
+ let mut output = output. borrow_mut ( ) ;
680
+
656
681
while let Some ( val2) = cursor2. get_val ( storage2) {
657
682
// Evaluate logic on `key, val1, val2`. Note the absence of time and diff.
658
683
let mut result = logic ( key, val1, val2) . into_iter ( ) . peekable ( ) ;
@@ -673,55 +698,42 @@ where
673
698
} ) ;
674
699
consolidate ( & mut buffer) ;
675
700
701
+ produced. update ( |x| x + buffer. len ( ) ) ;
702
+
676
703
// Special case no results, one result, and potentially many results
677
704
match ( result. peek ( ) . is_some ( ) , buffer. len ( ) ) {
678
705
// Certainly no output
679
706
( _, 0 ) => { }
680
707
// Single element, single time
681
708
( false , 1 ) => {
682
709
let ( time, diff) = buffer. pop ( ) . unwrap ( ) ;
683
- temp . push ( ( first, time, diff) ) ;
710
+ output . push ( ( first, time, diff) ) ;
684
711
}
685
712
// Multiple elements or multiple times
686
713
( _, _) => {
687
714
for d in std:: iter:: once ( first) . chain ( result) {
688
- temp. extend ( buffer. iter ( ) . map ( |( time, diff) | {
689
- ( d. clone ( ) , time. clone ( ) , diff. clone ( ) )
690
- } ) )
715
+ let updates = buffer
716
+ . drain ( ..)
717
+ . map ( |( time, diff) | ( d. clone ( ) , time, diff) ) ;
718
+ output. extend ( updates) ;
691
719
}
692
720
}
693
721
}
694
- buffer. clear ( ) ;
695
722
}
696
723
cursor2. step_val ( storage2) ;
697
724
}
698
725
cursor1. step_val ( storage1) ;
699
726
cursor2. rewind_vals ( storage2) ;
700
727
701
- * produced = produced. saturating_add ( temp. len ( ) ) ;
702
-
703
- if yield_fn ( * produced) {
704
- // Returning here is only allowed because we leave the cursors in a
705
- // state that will let us pick up the work correctly on the next
706
- // invocation.
707
- * produced -= flush ( temp, & mut session) ;
708
- if yield_fn ( * produced) {
709
- return ;
710
- }
711
- }
728
+ // Drop all shared state before yielding.
729
+ drop ( ( logic, output) ) ;
730
+ yield_now ( ) . await ;
712
731
}
713
732
714
733
cursor1. step_key ( storage1) ;
715
734
cursor2. step_key ( storage2) ;
716
735
}
717
736
}
718
737
}
719
-
720
- if !temp. is_empty ( ) {
721
- * produced -= flush ( temp, & mut session) ;
722
- }
723
-
724
- // We only get here after having iterated through all keys.
725
- self . done = true ;
726
738
}
727
739
}
0 commit comments