Copyright | © Herbert Valerio Riedel 2017 |
---|---|
License | BSD3 |
Maintainer | hvr@gnu.org |
Stability | stable |
Safe Haskell | Trustworthy |
Language | Haskell2010 |
Memory-efficient representation of Unicode text strings.
This module is intended to be imported qualified
, to avoid name
clashes with Prelude functions, e.g.
import qualified Data.Text.Short as TS
import qualified Data.Text.Short (ShortText)
This module deliberately omits (common) partial functions, which can be found in Data.Text.Short.Partial instead.
Since: 0.1
Synopsis
- data ShortText
- empty :: ShortText
- singleton :: Char -> ShortText
- pack :: [Char] -> ShortText
- append :: ShortText -> ShortText -> ShortText
- concat :: [ShortText] -> ShortText
- cons :: Char -> ShortText -> ShortText
- snoc :: ShortText -> Char -> ShortText
- replicate :: Int -> ShortText -> ShortText
- unpack :: ShortText -> [Char]
- uncons :: ShortText -> Maybe (Char, ShortText)
- unsnoc :: ShortText -> Maybe (ShortText, Char)
- null :: ShortText -> Bool
- length :: ShortText -> Int
- isAscii :: ShortText -> Bool
- all :: (Char -> Bool) -> ShortText -> Bool
- any :: (Char -> Bool) -> ShortText -> Bool
- find :: (Char -> Bool) -> ShortText -> Maybe Char
- isPrefixOf :: ShortText -> ShortText -> Bool
- isSuffixOf :: ShortText -> ShortText -> Bool
- (!?) :: ShortText -> Int -> Maybe Char
- indexMaybe :: ShortText -> Int -> Maybe Char
- indexEndMaybe :: ShortText -> Int -> Maybe Char
- findIndex :: (Char -> Bool) -> ShortText -> Maybe Int
- take :: Int -> ShortText -> ShortText
- takeEnd :: Int -> ShortText -> ShortText
- drop :: Int -> ShortText -> ShortText
- dropEnd :: Int -> ShortText -> ShortText
- takeWhile :: (Char -> Bool) -> ShortText -> ShortText
- takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
- dropWhile :: (Char -> Bool) -> ShortText -> ShortText
- dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
- dropAround :: (Char -> Bool) -> ShortText -> ShortText
- splitAt :: Int -> ShortText -> (ShortText, ShortText)
- splitAtEnd :: Int -> ShortText -> (ShortText, ShortText)
- span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- stripPrefix :: ShortText -> ShortText -> Maybe ShortText
- stripSuffix :: ShortText -> ShortText -> Maybe ShortText
- intersperse :: Char -> ShortText -> ShortText
- intercalate :: ShortText -> [ShortText] -> ShortText
- reverse :: ShortText -> ShortText
- filter :: (Char -> Bool) -> ShortText -> ShortText
- foldl :: (a -> Char -> a) -> a -> ShortText -> a
- foldl' :: (a -> Char -> a) -> a -> ShortText -> a
- foldr :: (Char -> a -> a) -> a -> ShortText -> a
- fromString :: String -> ShortText
- toString :: ShortText -> String
- fromText :: Text -> ShortText
- toText :: ShortText -> Text
- fromShortByteString :: ShortByteString -> Maybe ShortText
- toShortByteString :: ShortText -> ShortByteString
- fromByteString :: ByteString -> Maybe ShortText
- toByteString :: ShortText -> ByteString
- toBuilder :: ShortText -> Builder
The ShortText
type
A compact representation of Unicode strings.
A ShortText
value is a sequence of Unicode scalar values, as defined in
§3.9, definition D76 of the Unicode 5.2 standard.
This means that a ShortText
is a list of (scalar) Unicode code-points (i.e. code-points in the range [U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]
).
This type relates to Text
as ShortByteString
relates to ByteString
by providing a more compact type. Please consult the documentation of Data.ByteString.Short for more information.
Currently, a boxed unshared Text
has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 2 or 4 bytes per code-point (due to the internal UTF-16 representation). Each Text
value which can share its payload with another Text
requires only 4 words additionally. Unlike ByteString
, Text
uses unpinned memory.
In comparison, the footprint of a boxed ShortText
is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
It can be shown that for realistic data UTF-16 has a space overhead of 50% over UTF-8.
Since: 0.1
Instances
IsList ShortText # | Note: Surrogate pairs (code-points U+D800 through U+DFFF) are replaced by the replacement character U+FFFD. Since: 0.1.2 |
Eq ShortText # | |
Ord ShortText # | |
Defined in Data.Text.Short.Internal | |
Read ShortText # | |
Show ShortText # | |
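IsString ShortText # | Note: Surrogate pairs (code-points U+D800 through U+DFFF) in string literals are replaced by the replacement character U+FFFD. This matches the behaviour of the IsString instance for Text. |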
Defined in Data.Text.Short.Internal fromString :: String -> ShortText # | |
Semigroup ShortText # | |
Monoid ShortText # | |
PrintfArg ShortText # | Since: 0.1.2 |
Defined in Data.Text.Short.Internal formatArg :: ShortText -> FieldFormatter # parseFormat :: ShortText -> ModifierParser # | |
Binary ShortText # | |
NFData ShortText # | |
Defined in Data.Text.Short.Internal | |
Hashable ShortText # | |
Defined in Data.Text.Short.Internal | |
type Item ShortText # | |
Defined in Data.Text.Short.Internal |
Basic operations
Construction
singleton :: Char -> ShortText #
\(\mathcal{O}(1)\) Construct ShortText
from single codepoint.
singleton c == pack [c]
length (singleton c) == 1
>>>
singleton 'A'
"A"
>>>
map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
["\55295","\65533","\65533","\57344"]
Note: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
Since: 0.1.2
\(\mathcal{O}(n)\) Construct a ShortText
from a list of Char
s.
This is an alias for fromString
.
Since: 0.1.2
snoc :: ShortText -> Char -> ShortText #
\(\mathcal{O}(n)\) Append a character to the end of a ShortText
.
snoc t c == t <> singleton c
Since: 0.1.2
replicate :: Int -> ShortText -> ShortText #
\(\mathcal{O}(n*m)\) Replicate a ShortText
.
A repetition count smaller than 1 results in an empty string.
>>>
replicate 3 "jobs!"
"jobs!jobs!jobs!"
>>>
replicate 10000 ""
""
>>>
replicate 0 "nothing"
""
length (replicate n t) == max 0 n * length t
Since: 0.1.2
Deconstruction
Querying & predicates
\(\mathcal{O}(1)\) Test whether a ShortText
is empty.
>>>
null ""
True
null (singleton c) == False
null t == (length t == 0)
Since: 0.1
\(\mathcal{O}(n)\) Count the number of Unicode code-points in a ShortText
.
>>>
length "abcd€"
5
>>>
length ""
0
length t >= 0
Since: 0.1
all :: (Char -> Bool) -> ShortText -> Bool #
\(\mathcal{O}(n)\) Test whether all code points in ShortText
satisfy a predicate.
>>>
all (const False) ""
True
>>>
all (> 'c') "abcdabcd"
False
>>>
all (/= 'c') "abdabd"
True
Since: 0.1.2
any :: (Char -> Bool) -> ShortText -> Bool #
\(\mathcal{O}(n)\) Test whether any code points in ShortText
satisfy a predicate.
>>>
any (> 'c') "abcdabcd"
True
>>>
any (const True) ""
False
>>>
any (== 'c') "abdabd"
False
any p t == not (all (not . p) t)
Since: 0.1.2
find :: (Char -> Bool) -> ShortText -> Maybe Char #
\(\mathcal{O}(n)\) Return the left-most codepoint in ShortText
that satisfies the given predicate.
>>>
find (> 'b') "abcdabcd"
Just 'c'
>>>
find (> 'b') "ababab"
Nothing
Since: 0.1.2
isPrefixOf :: ShortText -> ShortText -> Bool #
isSuffixOf :: ShortText -> ShortText -> Bool #
Lookup & indexing
(!?) :: ShortText -> Int -> Maybe Char #
\(\mathcal{O}(n)\) Index i-th code-point in ShortText
.
Infix operator alias of indexMaybe
>>>
"abcdefg" !? 2
Just 'c'
Since: 0.1.2
findIndex :: (Char -> Bool) -> ShortText -> Maybe Int #
\(\mathcal{O}(n)\) Return the index of the left-most codepoint in ShortText
that satisfies the given predicate.
>>>
findIndex (> 'b') "abcdabcdef"
Just 2
>>>
findIndex (> 'b') "ababab"
Nothing
(indexMaybe t =<< findIndex p t) == find p t
Since: 0.1.2
Splitting ShortText
s
Basic functions
take :: Int -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Take prefix of given length or return whole ShortText
if too short.
>>>
take 3 "abcdef"
"abc"
>>>
take 3 "ab"
"ab"
Since: 0.1.2
takeEnd :: Int -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Take suffix of given length or return whole ShortText
if too short.
>>>
takeEnd 3 "abcdefg"
"efg"
>>>
takeEnd 3 "ab"
"ab"
Since: 0.1.2
takeWhile :: (Char -> Bool) -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Take longest prefix satisfying given predicate.
takeWhile p t == fst (span p t)
>>>
takeWhile (< 'c') "abcdabcd"
"ab"
Since: 0.1.2
takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Take longest suffix satisfying given predicate.
takeWhileEnd p t == snd (spanEnd p t)
>>>
takeWhileEnd (>= 'c') "abcdabcd"
"cd"
Since: 0.1.2
dropWhile :: (Char -> Bool) -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Remove longest prefix satisfying given predicate.
dropWhile p t == snd (span p t)
>>>
dropWhile (< 'c') "abcdabcd"
"cdabcd"
Since: 0.1.2
dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Remove longest suffix satisfying given predicate.
dropWhileEnd p t == fst (spanEnd p t)
>>>
dropWhileEnd (>= 'c') "abcdabcd"
"abcdab"
Since: 0.1.2
dropAround :: (Char -> Bool) -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Strip characters from the beginning and end of ShortText
which satisfy given predicate.
>>>
dropAround (== ' ') " white space "
"white space"
>>>
dropAround (> 'a') "bcdefghi"
""
Since: 0.1.2
Pair-valued functions
splitAt :: Int -> ShortText -> (ShortText, ShortText) #
\(\mathcal{O}(n)\) Split ShortText
into two halves.
splitAt n t
returns a pair of ShortText
with the following properties:
length (fst (splitAt n t)) == min (length t) (max 0 n)
fst (splitAt n t) <> snd (splitAt n t) == t
>>>
splitAt 2 "abcdef"
("ab","cdef")
>>>
splitAt 10 "abcdef"
("abcdef","")
>>>
splitAt (-1) "abcdef"
("","abcdef")
Since: 0.1.2
splitAtEnd :: Int -> ShortText -> (ShortText, ShortText) #
\(\mathcal{O}(n)\) Split ShortText
into two halves.
splitAtEnd n t
returns a pair of ShortText
with the following properties:
length (snd (splitAtEnd n t)) == min (length t) (max 0 n)
fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t
splitAtEnd n t == splitAt (length t - n) t
>>>
splitAtEnd 2 "abcdef"
("abcd","ef")
>>>
splitAtEnd 10 "abcdef"
("","abcdef")
>>>
splitAtEnd (-1) "abcdef"
("abcdef","")
Since: 0.1.2
span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) #
\(\mathcal{O}(n)\) Split ShortText
into longest prefix satisfying the given predicate and the remaining suffix.
>>>
span (< 'c') "abcdabcd"
("ab","cdabcd")
fst (span p t) <> snd (span p t) == t
Since: 0.1.2
break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) #
\(\mathcal{O}(n)\) Variant of span
with negated predicate.
>>>
break (> 'c') "abcdabcd"
("abc","dabcd")
break p t == span (not . p) t
fst (break p t) <> snd (break p t) == t
Since: 0.1.2
spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) #
\(\mathcal{O}(n)\) Split ShortText
into longest suffix satisfying the given predicate and the preceding prefix.
>>>
spanEnd (> 'c') "abcdabcd"
("abcdabc","d")
fst (spanEnd p t) <> snd (spanEnd p t) == t
Since: 0.1.2
breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) #
\(\mathcal{O}(n)\) Variant of spanEnd
with negated predicate.
>>>
breakEnd (< 'c') "abcdabcd"
("abcdab","cd")
breakEnd p t == spanEnd (not . p) t
fst (breakEnd p t) <> snd (breakEnd p t) == t
Since: 0.1.2
Suffix & Prefix operations
Transformations
intersperse :: Char -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Insert character between characters of ShortText
.
>>>
intersperse '*' "_"
"_"
>>>
intersperse '*' "MASH"
"M*A*S*H"
Since: 0.1.2
intercalate :: ShortText -> [ShortText] -> ShortText #
reverse :: ShortText -> ShortText #
\(\mathcal{O}(n)\) Reverse characters in ShortText
.
>>>
reverse "star live desserts"
"stressed evil rats"
reverse (singleton c) == singleton c
reverse (reverse t) == t
Since: 0.1.2
filter :: (Char -> Bool) -> ShortText -> ShortText #
\(\mathcal{O}(n)\) Remove characters from ShortText
which don't satisfy given predicate.
>>>
filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!"
"Y dn't nd vwls t cnvy nfrmtn!"
filter (const False) t == ""
filter (const True) t == t
length (filter p t) <= length t
filter p t == pack [ c | c <- unpack t, p c ]
Since: 0.1.2
Folds
foldl :: (a -> Char -> a) -> a -> ShortText -> a #
\(\mathcal{O}(n)\) Reduces the characters of the ShortText
with
the binary operator and an initial value in forward direction (i.e. from
left to right).
>>>
foldl (\_ _ -> True) False ""
False
>>>
foldl (\s c -> c : s) ['.'] "abcd"
"dcba."
Since: 0.1.2
foldl' :: (a -> Char -> a) -> a -> ShortText -> a #
\(\mathcal{O}(n)\) Strict version of foldl
.
Since: 0.1.2
foldr :: (Char -> a -> a) -> a -> ShortText -> a #
\(\mathcal{O}(n)\) Reduces the characters of the ShortText
with
the binary operator and an initial value in reverse direction (i.e. from
right to left).
>>>
foldr (\_ _ -> True) False ""
False
>>>
foldr (:) ['.'] "abcd"
"abcd."
Since: 0.1.2
Conversions
String
fromString :: String -> ShortText #
\(\mathcal{O}(n)\) Construct/pack from String
>>>
fromString []
""
>>>
fromString ['a','b','c']
"abc"
>>>
fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
"\55295\65533\65533\57344"
Note: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
Since: 0.1
toString :: ShortText -> String #
\(\mathcal{O}(n)\) Convert to String
(fromString . toString) t == t
Note: See documentation of fromString
for why (toString . fromString)
is not an identity function.
Since: 0.1
Text
\(\mathcal{O}(n)\) Convert to Text
(fromText . toText) t == t
(toText . fromText) t == t
This is currently not \(\mathcal{O}(1)\) because currently Text
uses UTF-16 as its internal representation.
In the event that Text
changes its internal representation to UTF-8, this operation will become \(\mathcal{O}(1)\).
Since: 0.1
ByteString
fromShortByteString :: ShortByteString -> Maybe ShortText #
\(\mathcal{O}(n)\) Construct ShortText
from UTF-8 encoded ShortByteString
This operation doesn't copy the input ShortByteString
but it
cannot be \(\mathcal{O}(1)\) because we need to validate the UTF-8 encoding.
Returns Nothing
in case of invalid UTF-8 encoding.
>>>
fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A
Just "\NUL8\66330"
>>>
fromShortByteString "\xC0\x80" -- invalid denormalised U+00
Nothing
>>>
fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point)
Nothing
>>>
fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF
Just "\1114111"
>>>
fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid)
Nothing
fromShortByteString (toShortByteString t) == Just t
Since: 0.1
toShortByteString :: ShortText -> ShortByteString #
\(\mathcal{O}(0)\) Converts to UTF-8 encoded ShortByteString
This operation has effectively no overhead, as it's currently merely a newtype
-cast.
Since: 0.1
fromByteString :: ByteString -> Maybe ShortText #
\(\mathcal{O}(n)\) Construct ShortText
from UTF-8 encoded ByteString
fromByteString
accepts (or rejects) the same input data as fromShortByteString
.
Returns Nothing
in case of invalid UTF-8 encoding.
Since: 0.1
toByteString :: ShortText -> ByteString #
\(\mathcal{O}(n)\) Converts to UTF-8 encoded ByteString
Since: 0.1