Skip to content

Commit 3103f68

Browse files
authored
Merge pull request #5656 from unisonweb/lookbehind
Add lookbehind to builtin patterns
2 parents 8d3be96 + 23e55cd commit 3103f68

File tree

21 files changed

+356
-290
lines changed

21 files changed

+356
-290
lines changed

parser-typechecker/src/Unison/Builtin.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,8 @@ builtinsSrc =
513513
B "Text.patterns.notCharRange" $ char --> char --> pat text,
514514
B "Text.patterns.charIn" $ list char --> pat text,
515515
B "Text.patterns.notCharIn" $ list char --> pat text,
516+
B "Text.patterns.lookbehind" $ charClass --> pat text,
517+
B "Text.patterns.negativeLookbehind" $ charClass --> pat text,
516518
-- Pattern.many : Pattern a -> Pattern a
517519
B "Pattern.many" $ forall1 "a" (\a -> pat a --> pat a),
518520
B "Pattern.many.corrected" $ forall1 "a" (\a -> pat a --> pat a),

parser-typechecker/src/Unison/Util/Text/Pattern.hs

Lines changed: 110 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ data Pattern
1717
| Eof -- succeed if given the empty text, fail otherwise
1818
| Literal Text -- succeed if input starts with the given text, advance by that text
1919
| Char CharPattern -- succeed if input starts with a char matching the given pattern, advance by 1 char
20-
| Lookahead Pattern -- Positive lookahead
21-
| NegativeLookahead Pattern -- Negative lookahead
20+
| Lookahead Pattern -- Succeed if the given pattern matches the input, but don't consume any input
21+
| NegativeLookahead Pattern -- Succeed if the given pattern does not match the input, but don't consume any input
22+
| Lookbehind1 CharPattern -- Succeed if the previous char matches.
23+
| NegativeLookbehind1 CharPattern -- Succeed if the previous char does not match. Needed because lookbehind with negation is not exactly semantically equivalent.
2224
deriving (Show, Eq, Ord)
2325

2426
data CharPattern
@@ -67,28 +69,28 @@ cpattern p = CP p (run p)
6769

6870
run :: Pattern -> Text -> Maybe ([Text], Text)
6971
run p =
70-
let cp = compile p (\_ _ -> Nothing) (\acc rem -> Just (s acc, rem))
72+
let cp = compile p (\_ _ _ -> Nothing) (\acc _ rem -> Just (s acc, rem))
7173
s = reverse . capturesToList . stackCaptures
72-
in \t -> cp (Empty emptyCaptures) t
74+
in \t -> cp (Empty emptyCaptures) Nothing t
7375

7476
-- Stack used to track captures and to support backtracking.
7577
-- A `try` will push a `Mark` that allows the old state
7678
-- (both the list of captures and the current remainder)
7779
-- to be restored on failure.
78-
data Stack = Empty !Captures | Mark !Captures !Text !Stack
80+
data Stack = Empty !Captures | Mark !Captures !(Maybe Char) !Text !Stack
7981

8082
-- A difference list for representing the captures of a pattern.
8183
-- So that capture lists can be appended in O(1).
8284
type Captures = [Text] -> [Text]
8385

8486
stackCaptures :: Stack -> Captures
85-
stackCaptures (Mark cs _ _) = cs
87+
stackCaptures (Mark cs _ _ _) = cs
8688
stackCaptures (Empty cs) = cs
8789
{-# INLINE stackCaptures #-}
8890

8991
pushCaptures :: Captures -> Stack -> Stack
9092
pushCaptures c (Empty cs) = Empty (appendCaptures c cs)
91-
pushCaptures c (Mark cs t s) = Mark (appendCaptures c cs) t s
93+
pushCaptures c (Mark cs oc t s) = Mark (appendCaptures c cs) oc t s
9294
{-# INLINE pushCaptures #-}
9395

9496
pushCapture :: Text -> Stack -> Stack
@@ -105,38 +107,47 @@ emptyCaptures = id
105107
capturesToList :: Captures -> [Text]
106108
capturesToList c = c []
107109

108-
type Compiled r = (Stack -> Text -> r) -> (Stack -> Text -> r) -> Stack -> Text -> r
110+
type Compiled r = (Stack -> Maybe Char -> Text -> r) -> (Stack -> Maybe Char -> Text -> r) -> Stack -> Maybe Char -> Text -> r
109111

110112
compile :: Pattern -> Compiled r
111113
compile Eof !err !success = go
112114
where
113-
go acc t
114-
| Text.size t == 0 = success acc t
115-
| otherwise = err acc t
115+
go acc c t
116+
| Text.size t == 0 = success acc c t
117+
| otherwise = err acc c t
116118
compile (Literal txt) !err !success = go
117119
where
118-
go acc t
119-
| Text.take (Text.size txt) t == txt = success acc (Text.drop (Text.size txt) t)
120-
| otherwise = err acc t
120+
go acc oc t =
121+
let candidate = Text.take (Text.size txt) t
122+
in if candidate == txt
123+
then
124+
let t' = Text.drop (Text.size txt) t
125+
in case Text.unsnoc candidate of
126+
Just (_, c) -> success acc (Just c) t'
127+
Nothing -> success acc oc t'
128+
else
129+
err acc oc t
121130
compile (Char Any) !err !success = go
122131
where
123-
go acc t = case Text.drop 1 t of
124-
rem
125-
| Text.size t > Text.size rem -> success acc rem
126-
| otherwise -> err acc rem
132+
go acc oc t = case Text.uncons t of
133+
Just (c', rem) -> success acc (Just c') rem
134+
Nothing -> err acc oc t
127135
compile (CaptureAs t p) !err !success = go
128136
where
129-
err' _ _ acc0 t0 = err acc0 t0
130-
success' _ rem acc0 _ = success (pushCapture t acc0) rem
137+
err' _ _ _ acc0 c0 t0 = err acc0 c0 t0
138+
success' _ cr rem acc0 _ _ = success (pushCapture t acc0) cr rem
131139
compiled = compile p err' success'
132-
go acc t = compiled acc t acc t
133-
compile (Capture (Many _ (Char Any))) !_ !success = \acc t -> success (pushCapture t acc) Text.empty
140+
go acc c t = compiled acc c t acc c t
141+
compile (Capture (Many _ (Char Any))) !_ !success = \acc c t ->
142+
case Text.unsnoc t of
143+
Just (_, c) -> success (pushCapture t acc) (Just c) Text.empty
144+
Nothing -> success (pushCapture t acc) c t
134145
compile (Capture c) !err !success = go
135146
where
136-
err' _ _ acc0 t0 = err acc0 t0
137-
success' _ rem acc0 t0 = success (pushCapture (Text.take (Text.size t0 - Text.size rem) t0) acc0) rem
147+
err' _ _ _ acc0 c0 t0 = err acc0 c0 t0
148+
success' _ cr rem acc0 _ t0 = success (pushCapture (Text.take (Text.size t0 - Text.size rem) t0) acc0) cr rem
138149
compiled = compile c err' success'
139-
go acc t = compiled acc t acc t
150+
go acc c t = compiled acc c t acc c t
140151
compile (Or p1 p2) err success = cp1
141152
where
142153
cp2 = compile p2 err success
@@ -151,57 +162,93 @@ compile (Join ps) !err !success = go ps
151162
compile (Char cp) !err !success = go
152163
where
153164
ok = charPatternPred cp
154-
go acc t = case Text.uncons t of
155-
Just (ch, rem) | ok ch -> success acc rem
156-
_ -> err acc t
165+
go acc c t = case Text.uncons t of
166+
Just (ch, rem) | ok ch -> success acc (Just ch) rem
167+
_ -> err acc c t
157168
compile (Many correct p) !_ !success = case p of
158-
Char Any -> (\acc _ -> success acc Text.empty)
169+
Char Any ->
170+
( \acc c t -> case Text.unsnoc t of
171+
Just (_, c) -> success acc (Just c) Text.empty
172+
Nothing -> success acc c t
173+
)
159174
Char cp -> walker (charPatternPred cp)
160175
p -> go
161176
where
162177
go
163178
| correct = try "Many" (compile p) success success'
164179
| otherwise = compile p success success'
165-
success' acc rem
166-
| Text.size rem == 0 = success acc rem
167-
| otherwise = go acc rem
180+
success' acc c rem =
181+
if Text.size rem == 0
182+
then
183+
success acc c rem
184+
else go acc c rem
168185
where
169186
walker ok = go
170187
where
171-
go acc t = case Text.unconsChunk t of
172-
Nothing -> success acc t
173-
Just (Text.chunkToText -> txt, t) -> case DT.dropWhile ok txt of
174-
rem
175-
| DT.null rem -> go acc t
176-
| otherwise ->
177-
-- moving the remainder to the root of the tree is much more efficient
178-
-- since the next uncons will be O(1) rather than O(log n)
179-
-- this can't unbalance the tree too badly since these promoted chunks
180-
-- are being consumed and will get removed by a subsequent uncons
181-
success acc (Text.appendUnbalanced (Text.fromText rem) t)
188+
go acc c t = case Text.unconsChunk t of
189+
Nothing -> success acc c t
190+
Just (Text.chunkToText -> txt, t) -> case DT.span ok txt of
191+
(prefix, rem) -> case DT.unsnoc prefix of
192+
-- moving the remainder to the root of the tree is much more efficient
193+
-- since the next uncons will be O(1) rather than O(log n)
194+
-- this can't unbalance the tree too badly since these promoted chunks
195+
-- are being consumed and will get removed by a subsequent uncons
196+
Just (_, c)
197+
| DT.null rem -> go acc (Just c) t
198+
| otherwise -> success acc (Just c) (Text.appendUnbalanced (Text.fromText rem) t)
199+
Nothing
200+
| DT.null rem -> go acc c t
201+
| otherwise -> success acc c (Text.appendUnbalanced (Text.fromText rem) t)
182202
{-# INLINE walker #-}
183203
compile (Replicate m n p) !err !success = case p of
184-
Char Any -> \acc t ->
185-
if Text.size t < m
186-
then err acc t
187-
else success acc (Text.drop n t)
204+
Char Any -> \acc oc t ->
205+
let sz = Text.size t
206+
in if sz < m
207+
then err acc oc t
208+
else
209+
if n < 1
210+
then success acc oc t
211+
else case Text.uncons (Text.drop (min (n - 1) (sz - 1)) t) of
212+
Just (c, rem) -> success acc (Just c) rem
213+
Nothing -> success acc oc Text.empty
188214
Char cp -> dropper (charPatternPred cp)
189215
_ -> try "Replicate" (go1 m) err (go2 (n - m))
190216
where
191-
go1 0 = \_err success stk rem -> success stk rem
217+
go1 0 = \_err success stk oc rem -> success stk oc rem
192218
go1 n = \err success -> compile p err (go1 (n - 1) err success)
193219
go2 0 = success
194220
go2 n = try "Replicate" (compile p) success (go2 (n - 1))
195221

196-
dropper ok acc t
197-
| (i, rest) <- Text.dropWhileMax ok n t, i >= m = success acc rest
198-
| otherwise = err acc t
222+
dropper ok acc oc t
223+
| (i, rest) <- Text.dropWhileMax ok n t,
224+
i >= m =
225+
let lastDropped = if i > 0 then Text.at (i - 1) t else oc
226+
in success acc lastDropped rest
227+
| otherwise = err acc oc t
199228
compile (Lookahead p) !err !success = cp
200229
where
201230
cp = lookahead "Lookahead" (compile p) err success
202231
compile (NegativeLookahead p) !err !success = cp
203232
where
204233
cp = lookahead "NegativeLookahead" (compile p) success err
234+
compile (Lookbehind1 cp) !err !success = \acc oc t ->
235+
case oc of
236+
Just c ->
237+
if charPatternPred cp c
238+
then
239+
success acc oc t
240+
else
241+
err acc oc t
242+
Nothing -> err acc oc t
243+
compile (NegativeLookbehind1 cp) !err !success = \acc oc t ->
244+
case oc of
245+
Just c ->
246+
if charPatternPred cp c
247+
then
248+
err acc oc t
249+
else
250+
success acc oc t
251+
Nothing -> success acc oc t
205252

206253
charInPred, charNotInPred :: [Char] -> Char -> Bool
207254
charInPred [] = const False
@@ -234,27 +281,27 @@ charClassPred Letter = isLetter
234281

235282
-- runs c and if it fails, restores state to what it was before
236283
try :: String -> Compiled r -> Compiled r
237-
try msg c err success stk rem =
238-
c err' success' (Mark id rem stk) rem
284+
try msg c err success stk oc rem =
285+
c err' success' (Mark id oc rem stk) oc rem
239286
where
240-
success' stk rem = case stk of
241-
Mark caps _ stk -> success (pushCaptures caps stk) rem
287+
success' stk oc rem = case stk of
288+
Mark caps _ _ stk -> success (pushCaptures caps stk) oc rem
242289
_ -> error $ "Pattern compiler error in: " <> msg
243-
err' stk _ = case stk of
244-
Mark _ rem stk -> err stk rem
290+
err' stk _ _ = case stk of
291+
Mark _ oc rem stk -> err stk oc rem
245292
_ -> error $ "Pattern compiler error in: " <> msg
246293
{-# INLINE try #-}
247294

248295
-- runs c and restores state to what it was before,
249296
-- regardless of whether it succeeds or not
250297
lookahead :: String -> Compiled r -> Compiled r
251-
lookahead msg c err success stk rem =
252-
c err' success' (Mark id rem stk) rem
298+
lookahead msg c err success stk oc rem =
299+
c err' success' (Mark id oc rem stk) oc rem
253300
where
254-
success' stk _ = case stk of
255-
Mark _ rem stk -> success stk rem
301+
success' stk _ _ = case stk of
302+
Mark caps oc rem stk -> success (pushCaptures caps stk) oc rem
256303
_ -> error $ "Pattern compiler error in: " <> msg
257-
err' stk _ = case stk of
258-
Mark _ rem stk -> err stk rem
304+
err' stk _ _ = case stk of
305+
Mark _ oc rem stk -> err stk oc rem
259306
_ -> error $ "Pattern compiler error in: " <> msg
260307
{-# INLINE lookahead #-}

unison-runtime/src/Unison/Runtime/Builtin.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1278,6 +1278,8 @@ declareForeigns = do
12781278
declareForeignWrap Untracked direct Char_Class_letter
12791279
declareForeign Untracked 2 Char_Class_is
12801280
declareForeign Untracked 1 Text_patterns_char
1281+
declareForeign Untracked 1 Text_patterns_lookbehind1
1282+
declareForeign Untracked 1 Text_patterns_negativeLookbehind1
12811283

12821284
-- replacements
12831285
declareForeign Untracked 3 Map_insert

unison-runtime/src/Unison/Runtime/Foreign/Function.hs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -871,6 +871,10 @@ foreignCallHelper = \case
871871
Char_Class_is -> mkForeign $ \(cl, c) -> evaluate $ TPat.charPatternPred cl c
872872
Text_patterns_char -> mkForeign $ \c ->
873873
let v = TPat.cpattern (TPat.Char c) in pure v
874+
Text_patterns_lookbehind1 -> mkForeign $ \cp ->
875+
let v = TPat.cpattern (TPat.Lookbehind1 cp) in pure v
876+
Text_patterns_negativeLookbehind1 -> mkForeign $ \cp ->
877+
let v = TPat.cpattern (TPat.NegativeLookbehind1 cp) in pure v
874878
Map_tip -> mkForeign $ \() -> pure Map.empty
875879
Map_bin -> mkForeign $ \(sz :: Word64, k :: Val, v :: Val, l, r) ->
876880
pure (Map.Bin (fromIntegral sz) k v l r)

unison-runtime/src/Unison/Runtime/Foreign/Function/Type.hs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,8 @@ data ForeignFunc
256256
| Char_Class_letter
257257
| Char_Class_is
258258
| Text_patterns_char
259+
| Text_patterns_lookbehind1
260+
| Text_patterns_negativeLookbehind1
259261
| Map_tip
260262
| Map_bin
261263
| Map_insert
@@ -524,6 +526,8 @@ foreignFuncBuiltinName = \case
524526
Char_Class_letter -> "Char.Class.letter"
525527
Char_Class_is -> "Char.Class.is"
526528
Text_patterns_char -> "Text.patterns.char"
529+
Text_patterns_lookbehind1 -> "Text.patterns.lookbehind"
530+
Text_patterns_negativeLookbehind1 -> "Text.patterns.negativeLookbehind"
527531
Map_tip -> "Map.Tip"
528532
Map_bin -> "Map.Bin"
529533
Map_insert -> "Map.insert"

unison-src/transcripts-manual/gen-racket-libs.output.md

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,32 @@ Next, we'll download the jit project and generate a few Racket files from it.
55
``` ucm
66
jit-setup/main> lib.install @unison/internal/releases/0.0.25
77
8-
Downloaded 14942 entities.
9-
108
I installed @unison/internal/releases/0.0.25 as
119
unison_internal_0_0_25.
12-
1310
```
11+
1412
``` unison
1513
go = generateSchemeBoot "scheme-libs/racket"
1614
```
1715

18-
``` ucm
19-
16+
``` ucm :added-by-ucm
2017
Loading changes detected in scratch.u.
2118
2219
I found and typechecked these definitions in scratch.u. If you
2320
do an `add` or `update`, here's how your codebase would
2421
change:
25-
22+
2623
⍟ These new definitions are ok to `add`:
2724
2825
go : '{IO, Exception} ()
29-
3026
```
27+
3128
``` ucm
3229
jit-setup/main> run go
3330
3431
()
35-
3632
```
33+
3734
After executing this, `scheme-libs/racket` will contain the full
3835
complement of unison libraries for a given combination of ucm version
3936
and @unison/internal version.
@@ -59,4 +56,3 @@ raco distribute <output-dir> scheme-libs/racket/unison-runtime
5956

6057
At that point, <output-dir> should contain the executable and all
6158
dependencies necessary to run it.
62-

0 commit comments

Comments
 (0)