From c60c1cf5f5c1e13e028f87b9fcae71c94aabdac9 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 17 Aug 2024 20:32:41 +0100 Subject: [PATCH 1/5] Fix some warnings in the test suite --- tests/Tests/Properties/Basics.hs | 1 + tests/Tests/ShareEmpty.hs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tests/Tests/Properties/Basics.hs b/tests/Tests/Properties/Basics.hs index 72d916bc..6310bb10 100644 --- a/tests/Tests/Properties/Basics.hs +++ b/tests/Tests/Properties/Basics.hs @@ -4,6 +4,7 @@ {-# OPTIONS_GHC -Wno-missing-signatures #-} {-# OPTIONS_GHC -Wno-warnings-deprecations #-} +{-# OPTIONS_GHC -Wno-unrecognised-warning-flags #-} {-# OPTIONS_GHC -Wno-x-partial #-} module Tests.Properties.Basics diff --git a/tests/Tests/ShareEmpty.hs b/tests/Tests/ShareEmpty.hs index 6667aee9..bfbf7cdb 100644 --- a/tests/Tests/ShareEmpty.hs +++ b/tests/Tests/ShareEmpty.hs @@ -2,6 +2,8 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE BangPatterns #-} + +{-# OPTIONS_GHC -Wno-unrecognised-warning-flags #-} {-# OPTIONS_GHC -Wno-x-partial #-} module Tests.ShareEmpty From c8e5b1d2d9d27d990c2121c8eb34e63c9c844f4a Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 17 Aug 2024 20:31:55 +0100 Subject: [PATCH 2/5] Add benchmarks for toTitle --- benchmarks/haskell/Benchmarks/Pure.hs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/haskell/Benchmarks/Pure.hs b/benchmarks/haskell/Benchmarks/Pure.hs index b5a31fe8..cec8a2d1 100644 --- a/benchmarks/haskell/Benchmarks/Pure.hs +++ b/benchmarks/haskell/Benchmarks/Pure.hs @@ -192,6 +192,10 @@ benchmark kind ~Env{..} = [ benchT $ nf T.toUpper ta , benchTL $ nf TL.toUpper tla ] + , bgroup "toTitle" + [ benchT $ nf T.toTitle ta + , benchTL $ nf TL.toTitle tla + ] , bgroup "uncons" [ benchT $ nf T.uncons ta , benchTL $ nf TL.uncons tla @@ -269,6 +273,10 @@ benchmark kind ~Env{..} = [ benchT $ nf (T.length . T.toUpper) ta , benchTL $ nf (TL.length . TL.toUpper) tla ] + , bgroup "toTitle" + [ benchT $ nf (T.length . T.toTitle) ta + , benchTL $ nf (TL.length . TL.toTitle) tla + ] , bgroup "words" [ benchT $ nf (L.length . T.words) ta , benchTL $ nf (L.length . TL.words) tla From e2fd1307867189690fd438efc5ac88ebc5f65e50 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 17 Aug 2024 20:31:02 +0100 Subject: [PATCH 3/5] More callstacks --- src/Data/Text/Unsafe.hs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Data/Text/Unsafe.hs b/src/Data/Text/Unsafe.hs index 8f6cff86..918f0a61 100644 --- a/src/Data/Text/Unsafe.hs +++ b/src/Data/Text/Unsafe.hs @@ -80,7 +80,11 @@ iter (Text arr off _len) i = iterArray arr (off + i) {-# INLINE iter #-} -- | @since 2.0 -iterArray :: A.Array -> Int -> Iter +iterArray :: +#if defined(ASSERTS) + HasCallStack => +#endif + A.Array -> Int -> Iter iterArray arr j = Iter chr l where m0 = A.unsafeIndex arr j m1 = A.unsafeIndex arr (j+1) From c455e21e1f43fa630a8e18c29ea4694e86f7a140 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 14 Aug 2024 23:05:05 +0100 Subject: [PATCH 4/5] Implement Data.Text.unpack directly, without streaming --- src/Data/Text/Show.hs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/Data/Text/Show.hs b/src/Data/Text/Show.hs index f0996914..696d1354 100644 --- a/src/Data/Text/Show.hs +++ b/src/Data/Text/Show.hs @@ -1,6 +1,8 @@ -{-# LANGUAGE CPP, MagicHash #-} -{-# LANGUAGE Trustworthy #-} +{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CApiFFI #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE Trustworthy #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_GHC -fno-warn-orphans #-} @@ -26,12 +28,11 @@ module Data.Text.Show import Control.Monad.ST (ST, runST) import Data.Text.Internal (Text(..), empty, safe, pack) import Data.Text.Internal.Encoding.Utf8 (utf8Length) -import Data.Text.Internal.Fusion (stream) import Data.Text.Internal.Unsafe.Char (unsafeWrite) +import Data.Text.Unsafe (Iter(..), iterArray) import GHC.Exts (Ptr(..), Int(..), Addr#, indexWord8OffAddr#) import GHC.Word (Word8(..)) import qualified Data.Text.Array as A -import qualified Data.Text.Internal.Fusion.Common as S #if !MIN_VERSION_ghc_prim(0,7,0) import Foreign.C.String (CString) import Foreign.C.Types (CSize(..)) @@ -52,7 +53,11 @@ unpack :: HasCallStack => #endif Text -> String -unpack = S.unstreamList . stream +unpack (Text arr off len) = go off + where + go !i + | i >= off + len = [] + | otherwise = let !(Iter c l) = iterArray arr i in c : go (i + l) {-# INLINE [1] unpack #-} -- | /O(n)/ Convert a null-terminated From 0d52a32456a0e6d51983bc4d84bf9db448e2976e Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 17 Aug 2024 20:30:21 +0100 Subject: [PATCH 5/5] Implement Data.Text.toTitle directly, without streaming --- src/Data/Text.hs | 6 +- src/Data/Text/Internal/Transformation.hs | 152 +++++++++++++++++++---- 2 files changed, 130 insertions(+), 28 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 790e9d1a..3a37db91 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -255,7 +255,7 @@ import qualified Prelude as P import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter, reverseIter_, unsafeHead, unsafeTail, iterArray, reverseIterArray) import Data.Text.Internal.Search (indices) -import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, filter_) +import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, toTitleNonEmpty, filter_) #if defined(__HADDOCK__) import Data.ByteString (ByteString) import qualified Data.Text.Lazy as L @@ -900,7 +900,9 @@ toUpper = \t -> -- -- @since 1.0.0.0 toTitle :: Text -> Text -toTitle t = unstream (S.toTitle (stream t)) +toTitle = \t -> + if null t then empty + else toTitleNonEmpty t {-# INLINE toTitle #-} -- | /O(n)/ Left-justify a string to the given length, using the diff --git a/src/Data/Text/Internal/Transformation.hs b/src/Data/Text/Internal/Transformation.hs index c9f14c48..e67abd9e 100644 --- a/src/Data/Text/Internal/Transformation.hs +++ b/src/Data/Text/Internal/Transformation.hs @@ -1,8 +1,11 @@ -{-# LANGUAGE BangPatterns, CPP, MagicHash #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE PartialTypeSignatures #-} +{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE Trustworthy #-} +{-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE UnliftedFFITypes #-} -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE PartialTypeSignatures #-} {-# OPTIONS_GHC -fno-warn-orphans #-} {-# OPTIONS_GHC -Wno-partial-type-signatures #-} @@ -25,24 +28,26 @@ module Data.Text.Internal.Transformation , toCaseFoldNonEmpty , toLowerNonEmpty , toUpperNonEmpty + , toTitleNonEmpty , filter_ ) where import Prelude (Char, Bool(..), Int, Ord(..), Monad(..), pure, - (+), (-), ($), - not, return, otherwise) + (+), (-), ($), (&&), (||), (==), + not, return, otherwise, fromIntegral, (/=), const) import Data.Bits ((.&.), shiftR, shiftL) +import Data.Char (isLetter, isSpace, ord) import Control.Monad.ST (ST, runST) import qualified Data.Text.Array as A import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader, chr2, chr3, chr4) -import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping) +import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping, titleMapping) import Data.Text.Internal (Text(..), safe) import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8) import qualified Prelude as P import Data.Text.Unsafe (Iter(..), iterArray) -import Data.Word (Word8) +import Data.Word (Word8, Word) import qualified GHC.Exts as Exts import GHC.Int (Int64(..)) @@ -113,7 +118,7 @@ caseConvert ascii remap (Text src o l) = runST $ do A.unsafeWrite dst dstOff m0 A.unsafeWrite dst (dstOff + 1) m1 pure $ dstOff + 2 - i -> writeMapping i dstOff + i -> writeMapping dst i dstOff inner (srcOff + 2) dstOff' 3 -> do let !(Exts.C# c) = chr3 m0 m1 m2 @@ -123,7 +128,7 @@ caseConvert ascii remap (Text src o l) = runST $ do A.unsafeWrite dst (dstOff + 1) m1 A.unsafeWrite dst (dstOff + 2) m2 pure $ dstOff + 3 - i -> writeMapping i dstOff + i -> writeMapping dst i dstOff inner (srcOff + 3) dstOff' _ -> do let !(Exts.C# c) = chr4 m0 m1 m2 m3 @@ -134,45 +139,140 @@ caseConvert ascii remap (Text src o l) = runST $ do A.unsafeWrite dst (dstOff + 2) m2 A.unsafeWrite dst (dstOff + 3) m3 pure $ dstOff + 4 - i -> writeMapping i dstOff + i -> writeMapping dst i dstOff inner (srcOff + 4) dstOff' - writeMapping :: Int64 -> Int -> ST s Int - writeMapping 0 dstOff = pure dstOff - writeMapping i dstOff = do - let (ch, j) = chopOffChar i - d <- unsafeWrite dst dstOff ch - writeMapping j (dstOff + d) - - chopOffChar :: Int64 -> (Char, Int64) - chopOffChar ab = (chr a, ab `shiftR` 21) - where - chr (Exts.I# n) = Exts.C# (Exts.chr# n) - mask = (1 `shiftL` 21) - 1 - a = P.fromIntegral $ ab .&. mask {-# INLINE caseConvert #-} +writeMapping :: A.MArray s -> Int64 -> Int -> ST s Int +writeMapping !_ 0 !dstOff = pure dstOff +writeMapping dst i dstOff = do + let (ch, j) = chopOffChar i + d <- unsafeWrite dst dstOff ch + writeMapping dst j (dstOff + d) + +chopOffChar :: Int64 -> (Char, Int64) +chopOffChar ab = (chr a, ab `shiftR` 21) + where + chr (Exts.I# n) = Exts.C# (Exts.chr# n) + mask = (1 `shiftL` 21) - 1 + a = P.fromIntegral $ ab .&. mask -- | /O(n)/ Convert a string to folded case. -- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty. toCaseFoldNonEmpty :: Text -> Text -toCaseFoldNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) foldMapping xs +toCaseFoldNonEmpty = \xs -> caseConvert asciiToLower foldMapping xs {-# INLINE toCaseFoldNonEmpty #-} -- | /O(n)/ Convert a string to lower case, using simple case -- conversion. -- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty. toLowerNonEmpty :: Text -> Text -toLowerNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) lowerMapping xs +toLowerNonEmpty = \xs -> caseConvert asciiToLower lowerMapping xs {-# INLINE toLowerNonEmpty #-} -- | /O(n)/ Convert a string to upper case, using simple case -- conversion. -- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty. toUpperNonEmpty :: Text -> Text -toUpperNonEmpty = \xs -> caseConvert (\w -> if w - 97 <= 25 then w - 32 else w) upperMapping xs +toUpperNonEmpty = \xs -> caseConvert asciiToUpper upperMapping xs {-# INLINE toUpperNonEmpty #-} +asciiToLower :: Word8 -> Word8 +asciiToLower w = if w - 65 <= 25 then w + 32 else w + +asciiToUpper :: Word8 -> Word8 +asciiToUpper w = if w - 97 <= 25 then w - 32 else w + +isAsciiLetter :: Word8 -> Bool +isAsciiLetter w = w - 65 <= 25 || w - 97 <= 25 + +isAsciiSpace :: Word8 -> Bool +isAsciiSpace w = w .&. 0x50 == 0 && w < 0x80 && (w == 0x20 || w - 0x09 < 5) + +-- | /O(n)/ Convert a string to title case, see 'Data.Text.toTitle' for discussion. +-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty. +toTitleNonEmpty :: Text -> Text +toTitleNonEmpty (Text src o l) = runST $ do + -- Case conversion a single code point may produce up to 3 code-points, + -- each up to 4 bytes, so 12 in total. + dst <- A.new (l + 12) + outer dst l o 0 False + where + outer :: forall s. A.MArray s -> Int -> Int -> Int -> Bool -> ST s Text + outer !dst !dstLen = inner + where + inner !srcOff !dstOff !mode + | srcOff >= o + l = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + return (Text arr 0 dstOff) + | dstOff + 12 > dstLen = do + -- Ensure to extend the buffer by at least 12 bytes. + let !dstLen' = dstLen + max 12 (l + o - srcOff) + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff mode + -- If a character is to remain unchanged, no need to decode Char back into UTF8, + -- just copy bytes from input. + | otherwise = do + let m0 = A.unsafeIndex src srcOff + m1 = A.unsafeIndex src (srcOff + 1) + m2 = A.unsafeIndex src (srcOff + 2) + m3 = A.unsafeIndex src (srcOff + 3) + !d = utf8LengthByLeader m0 + + case d of + 1 -> do + let (mode', m0') = asciiAdvance mode m0 + A.unsafeWrite dst dstOff m0' + inner (srcOff + 1) (dstOff + 1) mode' + 2 -> do + let !(Exts.C# c) = chr2 m0 m1 + !(# mode', c' #) = advance (\_ -> m0 == 0xC2 && m1 == 0xA0) mode c + dstOff' <- case I64# c' of + 0 -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + pure $ dstOff + 2 + i -> writeMapping dst i dstOff + inner (srcOff + 2) dstOff' mode' + 3 -> do + let !(Exts.C# c) = chr3 m0 m1 m2 + isSpace3 ch + = m0 == 0xE1 && m1 == 0x9A && m2 == 0x80 + || m0 == 0xE2 && (m1 == 0x80 && isSpace (Exts.C# ch) || m1 == 0x81 && m2 == 0x9F) + || m0 == 0xE3 && m1 == 0x80 && m2 == 0x80 + !(# mode', c' #) = advance isSpace3 mode c + dstOff' <- case I64# c' of + 0 -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + A.unsafeWrite dst (dstOff + 2) m2 + pure $ dstOff + 3 + i -> writeMapping dst i dstOff + inner (srcOff + 3) dstOff' mode' + _ -> do + let !(Exts.C# c) = chr4 m0 m1 m2 m3 + !(# mode', c' #) = advance (\_ -> False) mode c + dstOff' <- case I64# c' of + 0 -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + A.unsafeWrite dst (dstOff + 2) m2 + A.unsafeWrite dst (dstOff + 3) m3 + pure $ dstOff + 4 + i -> writeMapping dst i dstOff + inner (srcOff + 4) dstOff' mode' + + asciiAdvance :: Bool -> Word8 -> (Bool, Word8) + asciiAdvance False w = (isAsciiLetter w, asciiToUpper w) + asciiAdvance True w = (not (isAsciiSpace w), asciiToLower w) + + advance :: (Exts.Char# -> Bool) -> Bool -> Exts.Char# -> (# Bool, _ {- unboxed Int64 -} #) + advance _ False c = (# isLetter (Exts.C# c), titleMapping c #) + advance isSpaceChar True c = (# not (isSpaceChar c), lowerMapping c #) + {-# INLINE advance #-} + -- | /O(n)/ 'filter_', applied to a continuation, a predicate and a @Text@, -- calls the continuation with the @Text@ containing only the characters satisfying the predicate. filter_ :: forall a. (A.Array -> Int -> Int -> a) -> (Char -> Bool) -> Text -> a