@@ -39,131 +39,13 @@ struct TMatchRecognizeProcessorParameters {
39
39
TMeasureInputColumnOrder MeasureInputColumnOrder;
40
40
TComputationNodePtrVector Measures;
41
41
TOutputColumnOrder OutputColumnOrder;
42
- };
43
-
44
- class TBackTrackingMatchRecognize {
45
- using TPartitionList = TSimpleList;
46
- using TRange = TPartitionList::TRange;
47
- using TMatchedVars = TMatchedVars<TRange>;
48
- public:
49
- // TODO(YQL-16486): create a tree for backtracking(replace var names with indexes)
50
-
51
- struct TPatternConfiguration {
52
- void Save (TMrOutputSerializer& /* serializer*/ ) const {
53
- }
54
-
55
- void Load (TMrInputSerializer& /* serializer*/ ) {
56
- }
57
-
58
- friend bool operator ==(const TPatternConfiguration&, const TPatternConfiguration&) {
59
- return true ;
60
- }
61
- };
62
-
63
- struct TPatternConfigurationBuilder {
64
- using TPatternConfigurationPtr = std::shared_ptr<TPatternConfiguration>;
65
- static TPatternConfigurationPtr Create (const TRowPattern& pattern, const THashMap<TString, size_t >& varNameToIndex) {
66
- Y_UNUSED (pattern);
67
- Y_UNUSED (varNameToIndex);
68
- return std::make_shared<TPatternConfiguration>();
69
- }
70
- };
71
-
72
- TBackTrackingMatchRecognize (
73
- NUdf::TUnboxedValue&& partitionKey,
74
- const TMatchRecognizeProcessorParameters& parameters,
75
- const TPatternConfigurationBuilder::TPatternConfigurationPtr pattern,
76
- const TContainerCacheOnContext& cache
77
- )
78
- : PartitionKey(std::move(partitionKey))
79
- , Parameters(parameters)
80
- , Cache(cache)
81
- , CurMatchedVars(parameters.Defines.size())
82
- , MatchNumber(0 )
83
- {
84
- // TODO(YQL-16486)
85
- Y_UNUSED (pattern);
86
- }
87
-
88
- bool ProcessInputRow (NUdf::TUnboxedValue&& row, TComputationContext& ctx) {
89
- Y_UNUSED (ctx);
90
- Rows.Append (std::move (row));
91
- return false ;
92
- }
93
- NUdf::TUnboxedValue GetOutputIfReady (TComputationContext& ctx) {
94
- if (Matches.empty ())
95
- return NUdf::TUnboxedValue{};
96
- Parameters.MatchedVarsArg ->SetValue (ctx, ToValue (ctx.HolderFactory , std::move (Matches.front ())));
97
- Matches.pop_front ();
98
- Parameters.MeasureInputDataArg ->SetValue (ctx, ctx.HolderFactory .Create <TMeasureInputDataValue>(
99
- Parameters.InputDataArg ->GetValue (ctx),
100
- Parameters.MeasureInputColumnOrder ,
101
- Parameters.MatchedVarsArg ->GetValue (ctx),
102
- Parameters.VarNames ,
103
- ++MatchNumber
104
- ));
105
- NUdf::TUnboxedValue *itemsPtr = nullptr ;
106
- const auto result = Cache.NewArray (ctx, Parameters.OutputColumnOrder .size (), itemsPtr);
107
- for (auto const & c: Parameters.OutputColumnOrder ) {
108
- switch (c.first ) {
109
- case EOutputColumnSource::Measure:
110
- *itemsPtr++ = Parameters.Measures [c.second ]->GetValue (ctx);
111
- break ;
112
- case EOutputColumnSource::PartitionKey:
113
- *itemsPtr++ = PartitionKey.GetElement (c.second );
114
- break ;
115
- }
116
- }
117
- return result;
118
- }
119
- bool ProcessEndOfData (TComputationContext& ctx) {
120
- // Assume, that data moved to IComputationExternalNode node, will not be modified or released
121
- // till the end of the current function
122
- auto rowsSize = Rows.Size ();
123
- Parameters.InputDataArg ->SetValue (ctx, ctx.HolderFactory .Create <TListValue<TPartitionList>>(Rows));
124
- for (size_t i = 0 ; i != rowsSize; ++i) {
125
- Parameters.CurrentRowIndexArg ->SetValue (ctx, NUdf::TUnboxedValuePod (static_cast <ui64>(i)));
126
- for (size_t v = 0 ; v != Parameters.Defines .size (); ++v) {
127
- const auto &d = Parameters.Defines [v]->GetValue (ctx);
128
- if (d && d.GetOptionalValue ().Get <bool >()) {
129
- Extend (CurMatchedVars[v], TRange{i});
130
- }
131
- }
132
- // for the sake of dummy usage assume non-overlapped matches at every 5th row of any partition
133
- if (i % 5 == 0 ) {
134
- TMatchedVars temp;
135
- temp.swap (CurMatchedVars);
136
- Matches.emplace_back (std::move (temp));
137
- CurMatchedVars.resize (Parameters.Defines .size ());
138
- }
139
- }
140
- return not Matches.empty ();
141
- }
142
-
143
- void Save (TOutputSerializer& /* serializer*/ ) const {
144
- // Not used in not streaming mode.
145
- }
146
-
147
- void Load (TMrInputSerializer& /* serializer*/ ) {
148
- // Not used in not streaming mode.
149
- }
150
-
151
- private:
152
- const NUdf::TUnboxedValue PartitionKey;
153
- const TMatchRecognizeProcessorParameters& Parameters;
154
- const TContainerCacheOnContext& Cache;
155
- TSimpleList Rows;
156
- TMatchedVars CurMatchedVars;
157
- std::deque<TMatchedVars, TMKQLAllocator<TMatchedVars>> Matches;
158
- ui64 MatchNumber;
42
+ TAfterMatchSkipTo SkipTo;
159
43
};
160
44
161
45
class TStreamingMatchRecognize {
162
46
using TPartitionList = TSparseList;
163
47
using TRange = TPartitionList::TRange;
164
48
public:
165
- using TPatternConfiguration = TNfaTransitionGraph;
166
- using TPatternConfigurationBuilder = TNfaTransitionGraphBuilder;
167
49
TStreamingMatchRecognize (
168
50
NUdf::TUnboxedValue&& partitionKey,
169
51
const TMatchRecognizeProcessorParameters& parameters,
@@ -213,6 +95,9 @@ class TStreamingMatchRecognize {
213
95
break ;
214
96
}
215
97
}
98
+ if (EAfterMatchSkipTo::PastLastRow == Parameters.SkipTo .To ) {
99
+ Nfa.Clear ();
100
+ }
216
101
return result;
217
102
}
218
103
bool ProcessEndOfData (TComputationContext& ctx) {
@@ -243,11 +128,9 @@ class TStreamingMatchRecognize {
243
128
ui64 MatchNumber = 0 ;
244
129
};
245
130
246
- template <typename Algo>
247
131
class TStateForNonInterleavedPartitions
248
- : public TComputationValue<TStateForNonInterleavedPartitions<Algo> >
132
+ : public TComputationValue<TStateForNonInterleavedPartitions>
249
133
{
250
- using TRowPatternConfigurationBuilder = typename Algo::TPatternConfigurationBuilder;
251
134
public:
252
135
TStateForNonInterleavedPartitions (
253
136
TMemoryUsageInfo* memInfo,
@@ -265,7 +148,7 @@ class TStateForNonInterleavedPartitions
265
148
, PartitionKey(partitionKey)
266
149
, PartitionKeyPacker(true , partitionKeyType)
267
150
, Parameters(parameters)
268
- , RowPatternConfiguration(TRowPatternConfigurationBuilder ::Create(parameters.Pattern, parameters.VarNamesLookup))
151
+ , RowPatternConfiguration(TNfaTransitionGraphBuilder ::Create(parameters.Pattern, parameters.VarNamesLookup))
269
152
, Cache(cache)
270
153
, Terminating(false )
271
154
, SerializerContext(ctx, rowType, rowPacker)
@@ -301,7 +184,7 @@ class TStateForNonInterleavedPartitions
301
184
bool validPartitionHandler = in.Read <bool >();
302
185
if (validPartitionHandler) {
303
186
NUdf::TUnboxedValue key = PartitionKeyPacker.Unpack (CurPartitionPackedKey, SerializerContext.Ctx .HolderFactory );
304
- PartitionHandler.reset (new Algo (
187
+ PartitionHandler.reset (new TStreamingMatchRecognize (
305
188
std::move (key),
306
189
Parameters,
307
190
RowPatternConfiguration,
@@ -313,7 +196,7 @@ class TStateForNonInterleavedPartitions
313
196
if (validDelayedRow) {
314
197
in (DelayedRow);
315
198
}
316
- auto restoredRowPatternConfiguration = std::make_shared<typename Algo::TPatternConfiguration >();
199
+ auto restoredRowPatternConfiguration = std::make_shared<TNfaTransitionGraph >();
317
200
restoredRowPatternConfiguration->Load (in);
318
201
MKQL_ENSURE (*restoredRowPatternConfiguration == *RowPatternConfiguration, " Restored and current RowPatternConfiguration is different" );
319
202
MKQL_ENSURE (in.Empty (), " State is corrupted" );
@@ -367,12 +250,11 @@ class TStateForNonInterleavedPartitions
367
250
InputRowArg->SetValue (ctx, NUdf::TUnboxedValue (temp));
368
251
auto partitionKey = PartitionKey->GetValue (ctx);
369
252
CurPartitionPackedKey = PartitionKeyPacker.Pack (partitionKey);
370
- PartitionHandler.reset (new Algo (
253
+ PartitionHandler.reset (new TStreamingMatchRecognize (
371
254
std::move (partitionKey),
372
255
Parameters,
373
256
RowPatternConfiguration,
374
- Cache
375
- ));
257
+ Cache));
376
258
PartitionHandler->ProcessInputRow (std::move (temp), ctx);
377
259
}
378
260
if (Terminating) {
@@ -382,12 +264,12 @@ class TStateForNonInterleavedPartitions
382
264
}
383
265
private:
384
266
TString CurPartitionPackedKey;
385
- std::unique_ptr<Algo > PartitionHandler;
267
+ std::unique_ptr<TStreamingMatchRecognize > PartitionHandler;
386
268
IComputationExternalNode* InputRowArg;
387
269
IComputationNode* PartitionKey;
388
270
TValuePackerGeneric<false > PartitionKeyPacker;
389
271
const TMatchRecognizeProcessorParameters& Parameters;
390
- const typename TRowPatternConfigurationBuilder::TPatternConfigurationPtr RowPatternConfiguration;
272
+ const TNfaTransitionGraph::TPtr RowPatternConfiguration;
391
273
const TContainerCacheOnContext& Cache;
392
274
NUdf::TUnboxedValue DelayedRow;
393
275
bool Terminating;
@@ -768,6 +650,11 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation
768
650
defines.push_back (callable.GetInput (inputIndex++));
769
651
}
770
652
const auto & streamingMode = callable.GetInput (inputIndex++);
653
+ NYql::NMatchRecognize::TAfterMatchSkipTo skipTo = {NYql::NMatchRecognize::EAfterMatchSkipTo::NextRow, " " };
654
+ if (inputIndex + 2 <= callable.GetInputsCount ()) {
655
+ skipTo.To = static_cast <EAfterMatchSkipTo>(AS_VALUE (TDataLiteral, callable.GetInput (inputIndex++))->AsValue ().Get <i32 >());
656
+ skipTo.Var = AS_VALUE (TDataLiteral, callable.GetInput (inputIndex++))->AsValue ().AsStringRef ();
657
+ }
771
658
MKQL_ENSURE (callable.GetInputsCount () == inputIndex, " Wrong input count" );
772
659
773
660
const auto & [vars, varsLookup] = ConvertListOfStrings (varNames);
@@ -788,6 +675,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation
788
675
)
789
676
, ConvertVectorOfCallables (measures, ctx)
790
677
, GetOutputColumnOrder (partitionColumnIndexes, measureColumnIndexes)
678
+ , skipTo
791
679
};
792
680
if (AS_VALUE (TDataLiteral, streamingMode)->AsValue ().Get <bool >()) {
793
681
return new TMatchRecognizeWrapper<TStateForInterleavedPartitions>(ctx.Mutables
@@ -800,28 +688,15 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation
800
688
, rowType
801
689
);
802
690
} else {
803
- const bool useNfaForTables = true ; // TODO(YQL-16486) get this flag from an optimizer
804
- if (useNfaForTables) {
805
- return new TMatchRecognizeWrapper<TStateForNonInterleavedPartitions<TStreamingMatchRecognize>>(ctx.Mutables
806
- , GetValueRepresentation (inputFlow.GetStaticType ())
807
- , LocateNode (ctx.NodeLocator , *inputFlow.GetNode ())
808
- , static_cast <IComputationExternalNode*>(LocateNode (ctx.NodeLocator , *inputRowArg.GetNode ()))
809
- , LocateNode (ctx.NodeLocator , *partitionKeySelector.GetNode ())
810
- , partitionKeySelector.GetStaticType ()
811
- , std::move (parameters)
812
- , rowType
813
- );
814
- } else {
815
- return new TMatchRecognizeWrapper<TStateForNonInterleavedPartitions<TBackTrackingMatchRecognize>>(ctx.Mutables
816
- , GetValueRepresentation (inputFlow.GetStaticType ())
817
- , LocateNode (ctx.NodeLocator , *inputFlow.GetNode ())
818
- , static_cast <IComputationExternalNode*>(LocateNode (ctx.NodeLocator , *inputRowArg.GetNode ()))
819
- , LocateNode (ctx.NodeLocator , *partitionKeySelector.GetNode ())
820
- , partitionKeySelector.GetStaticType ()
821
- , std::move (parameters)
822
- , rowType
823
- );
824
- }
691
+ return new TMatchRecognizeWrapper<TStateForNonInterleavedPartitions>(ctx.Mutables
692
+ , GetValueRepresentation (inputFlow.GetStaticType ())
693
+ , LocateNode (ctx.NodeLocator , *inputFlow.GetNode ())
694
+ , static_cast <IComputationExternalNode*>(LocateNode (ctx.NodeLocator , *inputRowArg.GetNode ()))
695
+ , LocateNode (ctx.NodeLocator , *partitionKeySelector.GetNode ())
696
+ , partitionKeySelector.GetStaticType ()
697
+ , std::move (parameters)
698
+ , rowType
699
+ );
825
700
}
826
701
}
827
702
0 commit comments