@@ -17,8 +17,10 @@ data Pattern
17
17
| Eof -- succeed if given the empty text, fail otherwise
18
18
| Literal Text -- succeed if input starts with the given text, advance by that text
19
19
| Char CharPattern -- succeed if input starts with a char matching the given pattern, advance by 1 char
20
- | Lookahead Pattern -- Positive lookahead
21
- | NegativeLookahead Pattern -- Negative lookahead
20
+ | Lookahead Pattern -- Succeed if the given pattern matches the input, but don't consume any input
21
+ | NegativeLookahead Pattern -- Succeed if the given pattern does not match the input, but don't consume any input
22
+ | Lookbehind1 CharPattern -- Succeed if the previous char matches.
23
+ | NegativeLookbehind1 CharPattern -- Succeed if the previous char does not match. Needed because lookbehind with negation is not exactly semantically equivalent.
22
24
deriving (Show , Eq , Ord )
23
25
24
26
data CharPattern
@@ -67,28 +69,28 @@ cpattern p = CP p (run p)
67
69
68
70
run :: Pattern -> Text -> Maybe ([Text ], Text )
69
71
run p =
70
- let cp = compile p (\ _ _ -> Nothing ) (\ acc rem -> Just (s acc, rem ))
72
+ let cp = compile p (\ _ _ _ -> Nothing ) (\ acc _ rem -> Just (s acc, rem ))
71
73
s = reverse . capturesToList . stackCaptures
72
- in \ t -> cp (Empty emptyCaptures) t
74
+ in \ t -> cp (Empty emptyCaptures) Nothing t
73
75
74
76
-- Stack used to track captures and to support backtracking.
75
77
-- A `try` will push a `Mark` that allows the old state
76
78
-- (both the list of captures and the current remainder)
77
79
-- to be restored on failure.
78
- data Stack = Empty ! Captures | Mark ! Captures ! Text ! Stack
80
+ data Stack = Empty ! Captures | Mark ! Captures ! ( Maybe Char ) ! Text ! Stack
79
81
80
82
-- A difference list for representing the captures of a pattern.
81
83
-- So that capture lists can be appended in O(1).
82
84
type Captures = [Text ] -> [Text ]
83
85
84
86
stackCaptures :: Stack -> Captures
85
- stackCaptures (Mark cs _ _) = cs
87
+ stackCaptures (Mark cs _ _ _ ) = cs
86
88
stackCaptures (Empty cs) = cs
87
89
{-# INLINE stackCaptures #-}
88
90
89
91
pushCaptures :: Captures -> Stack -> Stack
90
92
pushCaptures c (Empty cs) = Empty (appendCaptures c cs)
91
- pushCaptures c (Mark cs t s) = Mark (appendCaptures c cs) t s
93
+ pushCaptures c (Mark cs oc t s) = Mark (appendCaptures c cs) oc t s
92
94
{-# INLINE pushCaptures #-}
93
95
94
96
pushCapture :: Text -> Stack -> Stack
@@ -105,38 +107,47 @@ emptyCaptures = id
105
107
capturesToList :: Captures -> [Text ]
106
108
capturesToList c = c []
107
109
108
- type Compiled r = (Stack -> Text -> r ) -> (Stack -> Text -> r ) -> Stack -> Text -> r
110
+ type Compiled r = (Stack -> Maybe Char -> Text -> r ) -> (Stack -> Maybe Char -> Text -> r ) -> Stack -> Maybe Char -> Text -> r
109
111
110
112
compile :: Pattern -> Compiled r
111
113
compile Eof ! err ! success = go
112
114
where
113
- go acc t
114
- | Text. size t == 0 = success acc t
115
- | otherwise = err acc t
115
+ go acc c t
116
+ | Text. size t == 0 = success acc c t
117
+ | otherwise = err acc c t
116
118
compile (Literal txt) ! err ! success = go
117
119
where
118
- go acc t
119
- | Text. take (Text. size txt) t == txt = success acc (Text. drop (Text. size txt) t)
120
- | otherwise = err acc t
120
+ go acc oc t =
121
+ let candidate = Text. take (Text. size txt) t
122
+ in if candidate == txt
123
+ then
124
+ let t' = Text. drop (Text. size txt) t
125
+ in case Text. unsnoc candidate of
126
+ Just (_, c) -> success acc (Just c) t'
127
+ Nothing -> success acc oc t'
128
+ else
129
+ err acc oc t
121
130
compile (Char Any ) ! err ! success = go
122
131
where
123
- go acc t = case Text. drop 1 t of
124
- rem
125
- | Text. size t > Text. size rem -> success acc rem
126
- | otherwise -> err acc rem
132
+ go acc oc t = case Text. uncons t of
133
+ Just (c', rem ) -> success acc (Just c') rem
134
+ Nothing -> err acc oc t
127
135
compile (CaptureAs t p) ! err ! success = go
128
136
where
129
- err' _ _ acc0 t0 = err acc0 t0
130
- success' _ rem acc0 _ = success (pushCapture t acc0) rem
137
+ err' _ _ _ acc0 c0 t0 = err acc0 c0 t0
138
+ success' _ cr rem acc0 _ _ = success (pushCapture t acc0) cr rem
131
139
compiled = compile p err' success'
132
- go acc t = compiled acc t acc t
133
- compile (Capture (Many _ (Char Any ))) ! _ ! success = \ acc t -> success (pushCapture t acc) Text. empty
140
+ go acc c t = compiled acc c t acc c t
141
+ compile (Capture (Many _ (Char Any ))) ! _ ! success = \ acc c t ->
142
+ case Text. unsnoc t of
143
+ Just (_, c) -> success (pushCapture t acc) (Just c) Text. empty
144
+ Nothing -> success (pushCapture t acc) c t
134
145
compile (Capture c) ! err ! success = go
135
146
where
136
- err' _ _ acc0 t0 = err acc0 t0
137
- success' _ rem acc0 t0 = success (pushCapture (Text. take (Text. size t0 - Text. size rem ) t0) acc0) rem
147
+ err' _ _ _ acc0 c0 t0 = err acc0 c0 t0
148
+ success' _ cr rem acc0 _ t0 = success (pushCapture (Text. take (Text. size t0 - Text. size rem ) t0) acc0) cr rem
138
149
compiled = compile c err' success'
139
- go acc t = compiled acc t acc t
150
+ go acc c t = compiled acc c t acc c t
140
151
compile (Or p1 p2) err success = cp1
141
152
where
142
153
cp2 = compile p2 err success
@@ -151,57 +162,93 @@ compile (Join ps) !err !success = go ps
151
162
compile (Char cp) ! err ! success = go
152
163
where
153
164
ok = charPatternPred cp
154
- go acc t = case Text. uncons t of
155
- Just (ch, rem ) | ok ch -> success acc rem
156
- _ -> err acc t
165
+ go acc c t = case Text. uncons t of
166
+ Just (ch, rem ) | ok ch -> success acc ( Just ch) rem
167
+ _ -> err acc c t
157
168
compile (Many correct p) ! _ ! success = case p of
158
- Char Any -> (\ acc _ -> success acc Text. empty)
169
+ Char Any ->
170
+ ( \ acc c t -> case Text. unsnoc t of
171
+ Just (_, c) -> success acc (Just c) Text. empty
172
+ Nothing -> success acc c t
173
+ )
159
174
Char cp -> walker (charPatternPred cp)
160
175
p -> go
161
176
where
162
177
go
163
178
| correct = try " Many" (compile p) success success'
164
179
| otherwise = compile p success success'
165
- success' acc rem
166
- | Text. size rem == 0 = success acc rem
167
- | otherwise = go acc rem
180
+ success' acc c rem =
181
+ if Text. size rem == 0
182
+ then
183
+ success acc c rem
184
+ else go acc c rem
168
185
where
169
186
walker ok = go
170
187
where
171
- go acc t = case Text. unconsChunk t of
172
- Nothing -> success acc t
173
- Just (Text. chunkToText -> txt, t) -> case DT. dropWhile ok txt of
174
- rem
175
- | DT. null rem -> go acc t
176
- | otherwise ->
177
- -- moving the remainder to the root of the tree is much more efficient
178
- -- since the next uncons will be O(1) rather than O(log n)
179
- -- this can't unbalance the tree too badly since these promoted chunks
180
- -- are being consumed and will get removed by a subsequent uncons
181
- success acc (Text. appendUnbalanced (Text. fromText rem ) t)
188
+ go acc c t = case Text. unconsChunk t of
189
+ Nothing -> success acc c t
190
+ Just (Text. chunkToText -> txt, t) -> case DT. span ok txt of
191
+ (prefix, rem ) -> case DT. unsnoc prefix of
192
+ -- moving the remainder to the root of the tree is much more efficient
193
+ -- since the next uncons will be O(1) rather than O(log n)
194
+ -- this can't unbalance the tree too badly since these promoted chunks
195
+ -- are being consumed and will get removed by a subsequent uncons
196
+ Just (_, c)
197
+ | DT. null rem -> go acc (Just c) t
198
+ | otherwise -> success acc (Just c) (Text. appendUnbalanced (Text. fromText rem ) t)
199
+ Nothing
200
+ | DT. null rem -> go acc c t
201
+ | otherwise -> success acc c (Text. appendUnbalanced (Text. fromText rem ) t)
182
202
{-# INLINE walker #-}
183
203
compile (Replicate m n p) ! err ! success = case p of
184
- Char Any -> \ acc t ->
185
- if Text. size t < m
186
- then err acc t
187
- else success acc (Text. drop n t)
204
+ Char Any -> \ acc oc t ->
205
+ let sz = Text. size t
206
+ in if sz < m
207
+ then err acc oc t
208
+ else
209
+ if n < 1
210
+ then success acc oc t
211
+ else case Text. uncons (Text. drop (min (n - 1 ) (sz - 1 )) t) of
212
+ Just (c, rem ) -> success acc (Just c) rem
213
+ Nothing -> success acc oc Text. empty
188
214
Char cp -> dropper (charPatternPred cp)
189
215
_ -> try " Replicate" (go1 m) err (go2 (n - m))
190
216
where
191
- go1 0 = \ _err success stk rem -> success stk rem
217
+ go1 0 = \ _err success stk oc rem -> success stk oc rem
192
218
go1 n = \ err success -> compile p err (go1 (n - 1 ) err success)
193
219
go2 0 = success
194
220
go2 n = try " Replicate" (compile p) success (go2 (n - 1 ))
195
221
196
- dropper ok acc t
197
- | (i, rest) <- Text. dropWhileMax ok n t, i >= m = success acc rest
198
- | otherwise = err acc t
222
+ dropper ok acc oc t
223
+ | (i, rest) <- Text. dropWhileMax ok n t,
224
+ i >= m =
225
+ let lastDropped = if i > 0 then Text. at (i - 1 ) t else oc
226
+ in success acc lastDropped rest
227
+ | otherwise = err acc oc t
199
228
compile (Lookahead p) ! err ! success = cp
200
229
where
201
230
cp = lookahead " Lookahead" (compile p) err success
202
231
compile (NegativeLookahead p) ! err ! success = cp
203
232
where
204
233
cp = lookahead " NegativeLookahead" (compile p) success err
234
+ compile (Lookbehind1 cp) ! err ! success = \ acc oc t ->
235
+ case oc of
236
+ Just c ->
237
+ if charPatternPred cp c
238
+ then
239
+ success acc oc t
240
+ else
241
+ err acc oc t
242
+ Nothing -> err acc oc t
243
+ compile (NegativeLookbehind1 cp) ! err ! success = \ acc oc t ->
244
+ case oc of
245
+ Just c ->
246
+ if charPatternPred cp c
247
+ then
248
+ err acc oc t
249
+ else
250
+ success acc oc t
251
+ Nothing -> success acc oc t
205
252
206
253
charInPred , charNotInPred :: [Char ] -> Char -> Bool
207
254
charInPred [] = const False
@@ -234,27 +281,27 @@ charClassPred Letter = isLetter
234
281
235
282
-- runs c and if it fails, restores state to what it was before
236
283
try :: String -> Compiled r -> Compiled r
237
- try msg c err success stk rem =
238
- c err' success' (Mark id rem stk) rem
284
+ try msg c err success stk oc rem =
285
+ c err' success' (Mark id oc rem stk) oc rem
239
286
where
240
- success' stk rem = case stk of
241
- Mark caps _ stk -> success (pushCaptures caps stk) rem
287
+ success' stk oc rem = case stk of
288
+ Mark caps _ _ stk -> success (pushCaptures caps stk) oc rem
242
289
_ -> error $ " Pattern compiler error in: " <> msg
243
- err' stk _ = case stk of
244
- Mark _ rem stk -> err stk rem
290
+ err' stk _ _ = case stk of
291
+ Mark _ oc rem stk -> err stk oc rem
245
292
_ -> error $ " Pattern compiler error in: " <> msg
246
293
{-# INLINE try #-}
247
294
248
295
-- runs c and restores state to what it was before,
249
296
-- regardless of whether it succeeds or not
250
297
lookahead :: String -> Compiled r -> Compiled r
251
- lookahead msg c err success stk rem =
252
- c err' success' (Mark id rem stk) rem
298
+ lookahead msg c err success stk oc rem =
299
+ c err' success' (Mark id oc rem stk) oc rem
253
300
where
254
- success' stk _ = case stk of
255
- Mark _ rem stk -> success stk rem
301
+ success' stk _ _ = case stk of
302
+ Mark caps oc rem stk -> success (pushCaptures caps stk) oc rem
256
303
_ -> error $ " Pattern compiler error in: " <> msg
257
- err' stk _ = case stk of
258
- Mark _ rem stk -> err stk rem
304
+ err' stk _ _ = case stk of
305
+ Mark _ oc rem stk -> err stk oc rem
259
306
_ -> error $ " Pattern compiler error in: " <> msg
260
307
{-# INLINE lookahead #-}
0 commit comments