diff --git a/CHANGELOG b/CHANGELOG index 8e4f7f5a..cfc97bba 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,58 @@ +1.23.1 --> 1.23.2 +================= +Performance: lexer/parser internals are now Text-native (Phase 1). +The Syntax AST stays String-valued, so this is a non-breaking change +for AST consumers. Phase 2 (Text-valued AST + lazy String compat +shim) is a separate later change. + +What changed: +* Lexer input tape, all Token payloads, the keyword/operator/pragma + lookup tables, and the numeric/identifier/escape/raw-pragma lex + workers operate on Data.Text directly. +* Strict 'discard' and a single-pass 'lexWhileT' (using T.span + + foldl' for line/column) replace the previous lazy formulations, + removing a per-character thunk chain in the tokenizer hot loop. +* Token payload fields are strict (!Text), so the [Char]-cons + accumulator inside lexString is materialized to a strict Text and + freed at token yield rather than being kept alive by the token + stream. +* lexString uses a scan-and-splice strategy: each plain run is + sliced from the input Text via T.span, and only the parsed + escape characters allocate new Text records. Per-token + allocation is O(escapes), not O(chars). + +What's new: +* Language.Haskell.Exts.Parser.Text exposing Text-input parser + entry points (parseModuleText, parseExpText, ...) that skip the + eager Data.Text.pack at the String boundary. +* lexTokenStreamText / lexTokenStreamTextWithMode in + Language.Haskell.Exts.Lexer, the lexer-only counterparts. + +Measurements (15-trial bench-mutex 2σ-gated, GHC 9.10, -O), Text +API vs master parseModule: + + Issue #478 stress (1 x 3 MB string literal): + master: 234 MB residency, 1156 MB allocated, 1.0 s + Text API: 52 KB residency, 7 MB allocated, 13 ms + delta: -99.98% / -99.40% / -98.71% + + Multi-literal stress (200 x 50 kB literals, 9.6 MB): + master: 628 MB residency, 3858 MB allocated, 2.9 s + Text API: 6 MB residency, 30 MB allocated, 47 ms + delta: -99.05% / -99.22% / -98.39% + + Identifier-heavy stress (5.1 MB, 1M ids / 100 unique): + no measurable change (the AST scaffolding dominates). + Atom table at lex time would address this; not in this PR. + + Real haskell-src-exts library files via the Text API: + InternalLexer.hs (58 kB): resid -74%, alloc -27%, time -51% + ParseSyntax.hs (17 kB): resid -54%, alloc +13% (small file + T.unpack overhead at AST construction; + eliminated by Phase 2) + Build.hs, Comments.hs, SrcLoc.hs (small): resid neutral to -38%, + alloc +11-13% (same small-file T.unpack overhead). + 1.23.0 --> 1.23.1 ================= * show instance for SrcLoc and SrcSpan renders "(-1)" instead of "-1" diff --git a/haskell-src-exts.cabal b/haskell-src-exts.cabal index fd059739..1aa63b16 100644 --- a/haskell-src-exts.cabal +++ b/haskell-src-exts.cabal @@ -1,5 +1,5 @@ Name: haskell-src-exts -Version: 1.23.1 +Version: 1.23.2 License: BSD3 License-File: LICENSE Build-Type: Simple @@ -50,6 +50,7 @@ Library Build-Tools: happy >= 1.19 Build-Depends: array >= 0.1, pretty >= 1.0, base >= 4.5 && < 5, + text >= 1.2, -- this is needed to access GHC.Generics on GHC 7.4 ghc-prim -- this is needed to access Data.Semigroup and Control.Monad.Fail on GHCs @@ -70,6 +71,7 @@ Library Language.Haskell.Exts.Fixity, Language.Haskell.Exts.ExactPrint, Language.Haskell.Exts.Parser, + Language.Haskell.Exts.Parser.Text, Language.Haskell.Exts.Comments Other-modules: Language.Haskell.Exts.ExtScheme, diff --git a/src/Language/Haskell/Exts/InternalLexer.hs b/src/Language/Haskell/Exts/InternalLexer.hs index 6f054c8f..6a68d6b4 100644 --- a/src/Language/Haskell/Exts/InternalLexer.hs +++ b/src/Language/Haskell/Exts/InternalLexer.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_HADDOCK hide #-} ----------------------------------------------------------------------------- -- | @@ -29,34 +31,36 @@ import Language.Haskell.Exts.ExtScheme import Prelude hiding (id, exponent) import Data.Char import Data.Ratio -import Data.List (intercalate, isPrefixOf) +import Data.List (intercalate) import Control.Monad (when) +import qualified Data.Text as T +import Data.Text (Text) -- import Debug.Trace (trace) data Token - = VarId String - | LabelVarId String - | QVarId (String,String) - | IDupVarId (String) -- duplicable implicit parameter - | ILinVarId (String) -- linear implicit parameter - | ConId String - | QConId (String,String) - | DVarId [String] -- to enable varid's with '-' in them - | VarSym String - | ConSym String - | QVarSym (String,String) - | QConSym (String,String) - | IntTok (Integer, String) - | FloatTok (Rational, String) - | Character (Char, String) - | StringTok (String, String) - | IntTokHash (Integer, String) -- 1# - | WordTokHash (Integer, String) -- 1## - | FloatTokHash (Rational, String) -- 1.0# - | DoubleTokHash (Rational, String) -- 1.0## - | CharacterHash (Char, String) -- c# - | StringHash (String, String) -- "Hello world!"# + = VarId !Text + | LabelVarId !Text + | QVarId !(Text,Text) + | IDupVarId !Text -- duplicable implicit parameter + | ILinVarId !Text -- linear implicit parameter + | ConId !Text + | QConId !(Text,Text) + | DVarId ![Text] -- to enable varid's with '-' in them + | VarSym !Text + | ConSym !Text + | QVarSym !(Text,Text) + | QConSym !(Text,Text) + | IntTok !(Integer, Text) + | FloatTok !(Rational, Text) + | Character !(Char, Text) + | StringTok !(Text, Text) + | IntTokHash !(Integer, Text) -- 1# + | WordTokHash !(Integer, Text) -- 1## + | FloatTokHash !(Rational, Text) -- 1.0# + | DoubleTokHash !(Rational, Text) -- 1.0## + | CharacterHash !(Char, Text) -- c# + | StringHash !(Text, Text) -- "Hello world!"# -- Symbols @@ -110,13 +114,13 @@ data Token | THTypQuote -- [t| | THCloseQuote -- |] | THTCloseQuote -- ||] - | THIdEscape (String) -- dollar x + | THIdEscape (Text) -- dollar x | THParenEscape -- dollar ( - | THTIdEscape String -- dollar dollar x + | THTIdEscape Text -- dollar dollar x | THTParenEscape -- double dollar ( | THVarQuote -- 'x (but without the x) | THTyQuote -- ''T (but without the T) - | THQuasiQuote (String,String) -- [$...|...] + | THQuasiQuote (Text,Text) -- [$...|...] -- HaRP | RPGuardOpen -- (| @@ -131,7 +135,7 @@ data Token | XCloseTagOpen -- | XChildTagOpen -- <%> (note that close doesn't exist, it's XCloseTagOpen followed by XCodeTagClose) - | XPCDATA String + | XPCDATA Text | XRPatOpen -- <[ | XRPatClose -- ]> @@ -151,7 +155,7 @@ data Token | CORE | UNPACK | NOUNPACK - | OPTIONS (Maybe String,String) + | OPTIONS (Maybe Text,Text) -- | CFILES String -- | INCLUDE String | LANGUAGE @@ -224,7 +228,7 @@ data Token | EOF deriving (Eq,Show) -reserved_ops :: [(String,(Token, Maybe ExtScheme))] +reserved_ops :: [(Text,(Token, Maybe ExtScheme))] reserved_ops = [ ( "..", (DotDot, Nothing) ), ( ":", (Colon, Nothing) ), @@ -260,7 +264,7 @@ reserved_ops = [ ( "\x2200", (KW_Forall, Just (All [UnicodeSyntax, ExplicitForAll])) ) ] -special_varops :: [(String,(Token, Maybe ExtScheme))] +special_varops :: [(Text,(Token, Maybe ExtScheme))] special_varops = [ -- the dot is only a special symbol together with forall, but can still be used as function composition ( ".", (Dot, Just (Any [ExplicitForAll, ExistentialQuantification])) ), @@ -268,7 +272,7 @@ special_varops = [ ( "!", (Exclamation, Nothing) ) ] -reserved_ids :: [(String,(Token, Maybe ExtScheme))] +reserved_ids :: [(Text,(Token, Maybe ExtScheme))] reserved_ids = [ ( "_", (Underscore, Nothing) ), ( "by", (KW_By, Just (Any [TransformListComp])) ), @@ -311,7 +315,7 @@ reserved_ids = [ ] -special_varids :: [(String,(Token, Maybe ExtScheme))] +special_varids :: [(Text,(Token, Maybe ExtScheme))] special_varids = [ ( "as", (KW_As, Nothing) ), ( "qualified", (KW_Qualified, Nothing) ), @@ -333,7 +337,7 @@ special_varids = [ ( "capi", (KW_CApi, Just (Any [CApiFFI])) ) ] -pragmas :: [(String,Token)] +pragmas :: [(Text,Token)] pragmas = [ ( "rules", RULES ), ( "inline", INLINE True ), @@ -367,7 +371,7 @@ pragmas = [ isIdent, isHSymbol, isPragmaChar :: Char -> Bool isIdent c = isAlphaNum c || c == '\'' || c == '_' -isHSymbol c = c `elem` ":!#%&*./?@\\-" || ((isSymbol c || isPunctuation c) && not (c `elem` "(),;[]`{}_\"'")) +isHSymbol c = c `elem` (":!#%&*./?@\\-" :: String) || ((isSymbol c || isPunctuation c) && not (c `elem` ("(),;[]`{}_\"'" :: String))) isPragmaChar c = isAlphaNum c || c == '_' @@ -379,7 +383,7 @@ isIdentStart c = isAlpha c && not (isUpper c) || c == '_' -- Why is it like this? I don't know exactly but this is how it is in -- GHC's parser. isOpSymbol :: Char -> Bool -isOpSymbol c = c `elem` "!#$%&*+./<=>?@\\^|-~" +isOpSymbol c = c `elem` ("!#$%&*+./<=>?@\\^|-~" :: String) -- | Checks whether the character would be legal in some position of a qvar. -- Means that '..' and "AAA" will pass the test. @@ -477,7 +481,7 @@ lexWhiteSpace_ bol = do _ <- lexWhiteSpace bol return () isRecognisedPragma, isLinePragma :: String -> Bool -isRecognisedPragma str = let pragma = takeWhile isPragmaChar . dropWhile isSpace $ str +isRecognisedPragma str = let pragma = T.pack . takeWhile isPragmaChar . dropWhile isSpace $ str in case lookupKnownPragma pragma of Nothing -> False _ -> True @@ -500,7 +504,7 @@ lexLinePragma = do fn <- lexWhile (/= '"') matchChar '"' "Impossible - lexLinePragma" lexWhile_ isSpace - mapM_ (flip matchChar "Improperly formatted LINE pragma") "#-}" + mapM_ (flip matchChar "Improperly formatted LINE pragma") ("#-}" :: String) lexNewline return (read i, fn) @@ -589,16 +593,16 @@ lexPCDATA = do '\n':_ -> do x <- lexNewline >> lexPCDATA case x of - XPCDATA p -> return $ XPCDATA $ '\n':p + XPCDATA p -> return $ XPCDATA $ T.cons '\n' p EOF -> return EOF _ -> fail $ "lexPCDATA: unexpected token: " ++ show x '<':_ -> return $ XPCDATA "" - _ -> do let pcd = takeWhile (\c -> c `notElem` "<\n") s + _ -> do let pcd = takeWhile (\c -> c `notElem` ("<\n" :: String)) s l = length pcd discard l x <- lexPCDATA case x of - XPCDATA pcd' -> return $ XPCDATA $ pcd ++ pcd' + XPCDATA pcd' -> return $ XPCDATA $ T.pack pcd <> pcd' EOF -> return EOF _ -> fail $ "lexPCDATA: unexpected token: " ++ show x @@ -662,27 +666,27 @@ lexStdToken = do discard 2 (n, str) <- lexOctal con <- intHash - return (con (n, '0':c:str)) + return (con (n, T.cons '0' (T.cons c str))) | toLower c == 'b' && isBinDigit d && BinaryLiterals `elem` exts -> do discard 2 (n, str) <- lexBinary con <- intHash - return (con (n, '0':c:str)) + return (con (n, T.cons '0' (T.cons c str))) | toLower c == 'x' && isHexDigit d -> do discard 2 (n, str) <- lexHexadecimal con <- intHash - return (con (n, '0':c:str)) + return (con (n, T.cons '0' (T.cons c str))) -- implicit parameters '?':c:_ | isIdentStart c && ImplicitParams `elem` exts -> do discard 1 - id <- lexWhile isIdent + id <- lexWhileT isIdent return $ IDupVarId id '%':c:_ | isIdentStart c && ImplicitParams `elem` exts -> do discard 1 - id <- lexWhile isIdent + id <- lexWhileT isIdent return $ ILinVarId id -- end implicit parameters @@ -741,14 +745,14 @@ lexStdToken = do '$':c1:c2:_ | isIdentStart c1 && TemplateHaskell `elem` exts -> do discard 1 - id <- lexWhile isIdent + id <- lexWhileT isIdent return $ THIdEscape id | c1 == '(' && TemplateHaskell `elem` exts -> do discard 2 return THParenEscape | c1 == '$' && isIdentStart c2 && TemplateHaskell `elem` exts -> do discard 2 - id <- lexWhile isIdent + id <- lexWhileT isIdent return $ THTIdEscape id | c1 == '$' && c2 == '(' && TemplateHaskell `elem` exts -> do discard 3 @@ -805,7 +809,7 @@ lexStdToken = do c:_ | isDigit c -> lexDecimalOrFloat - | isUpper c -> lexConIdOrQual "" + | isUpper c -> lexConIdOrQual T.empty | isIdentStart c -> do idents <- lexIdents @@ -820,7 +824,7 @@ lexStdToken = do _ -> return $ DVarId idents | isHSymbol c -> do - sym <- lexWhile isHSymbol + sym <- lexWhileT isHSymbol return $ case lookup sym (reserved_ops ++ special_varops) of Just (t , scheme) -> -- check if an extension op is enabled @@ -857,9 +861,9 @@ lexStdToken = do _ -> fail ("Illegal character \'" ++ show c ++ "\'\n") - where lexIdents :: Lex a [String] + where lexIdents :: Lex a [Text] lexIdents = do - ident <- lexWhile isIdent + ident <- lexWhileT isIdent s <- getInput exts <- getExtensionsL case s of @@ -870,53 +874,47 @@ lexStdToken = do idents <- lexIdents return $ ident : idents '#':_ | MagicHash `elem` exts -> do - hashes <- lexWhile (== '#') - return [ident ++ hashes] + hashes <- lexWhileT (== '#') + return [ident <> hashes] _ -> return [ident] lexQuasiQuote :: Char -> Lex a Token - lexQuasiQuote c = do - -- We've seen and dropped [$ already - ident <- lexQuoter - matchChar '|' "Malformed quasi-quote quoter" - body <- lexQQBody - return $ THQuasiQuote (ident, body) + lexQuasiQuote c = do ident <- lexQuoter -- We've seen and dropped [$ + matchChar '|' "Malformed quasi-quote quoter" + body <- lexQQBody + return $ THQuasiQuote (ident, T.pack body) where lexQuoter - | isIdentStart c = lexWhile isIdent - | otherwise = do - qualThing <- lexConIdOrQual "" - case qualThing of - QVarId (s1,s2) -> return $ s1 ++ '.':s2 - QVarSym (s1, s2) -> return $ s1 ++ '.':s2 - _ -> fail "Malformed quasi-quote quoter" - + | isIdentStart c = lexWhileT isIdent + | otherwise = do qualThing <- lexConIdOrQual "" + case qualThing of + QVarId (s1,s2) -> return $ s1 <> T.cons '.' s2 + QVarSym (s1,s2) -> return $ s1 <> T.cons '.' s2 + _ -> fail "Malformed quasi-quote quoter" + + -- [Char]-cons accumulator (same rationale as lexString): per-char + -- cons is cheaper than per-char T.singleton, and the body dies + -- when the THQuasiQuote token is yielded. lexQQBody :: Lex a String - lexQQBody = do - s <- getInput - case s of - '\\':']':_ -> do discard 2 - str <- lexQQBody - return (']':str) - '\\':'|':_ -> do discard 2 - str <- lexQQBody - return ('|':str) - '|':']':_ -> discard 2 >> return "" - '|':_ -> do discard 1 - str <- lexQQBody - return ('|':str) - ']':_ -> do discard 1 - str <- lexQQBody - return (']':str) - '\\':_ -> do discard 1 - str <- lexQQBody - return ('\\':str) - '\n':_ -> do lexNewline - str <- lexQQBody - return ('\n':str) - [] -> fail "Unexpected end of input while lexing quasi-quoter" - _ -> do str <- lexWhile (not . (`elem` "\\|\n")) - rest <- lexQQBody - return (str++rest) + lexQQBody = do s <- getInput + case s of + '\\':']':_ -> do discard 2 + fmap (']' :) lexQQBody + '\\':'|':_ -> do discard 2 + fmap ('|' :) lexQQBody + '|':']':_ -> do discard 2 + return "" + '|':_ -> do discard 1 + fmap ('|' :) lexQQBody + ']':_ -> do discard 1 + fmap (']' :) lexQQBody + '\\':_ -> do discard 1 + fmap ('\\':) lexQQBody + '\n':_ -> do lexNewline + fmap ('\n':) lexQQBody + [] -> fail "Unexpected end of input while lexing quasi-quoter" + _ -> do str <- lexWhile (not . (`elem` ("\\|\n" :: String))) + rest <- lexQQBody + return (str ++ rest) unboxed :: [KnownExtension] -> Bool unboxed exts = UnboxedSums `elem` exts || UnboxedTuples `elem` exts @@ -925,17 +923,17 @@ unboxed exts = UnboxedSums `elem` exts || UnboxedTuples `elem` exts -- with our representation: the thing after the underscore is a parameter. -- Strip off the parameters to option pragmas by hand here, everything else -- sits in the pragmas map. -lookupKnownPragma :: String -> Maybe Token +lookupKnownPragma :: Text -> Maybe Token lookupKnownPragma s = - case map toLower s of - x | "options_" `isPrefixOf` x -> Just $ OPTIONS (Just $ drop 8 s, undefined) - | "options" == x -> Just $ OPTIONS (Nothing, undefined) - | otherwise -> lookup x pragmas + case T.toLower s of + x | "options_" `T.isPrefixOf` x -> Just $ OPTIONS (Just $ T.drop 8 s, undefined) + | "options" == x -> Just $ OPTIONS (Nothing, undefined) + | otherwise -> lookup x pragmas lexPragmaStart :: Lex a Token lexPragmaStart = do lexWhile_ isSpace - pr <- lexWhile isPragmaChar + pr <- lexWhileT isPragmaChar case lookupKnownPragma pr of Just (INLINE True) -> do s <- getInput @@ -965,8 +963,9 @@ lexPragmaStart = do -- We do not want to store necessary whitespace in the datatype -- but if the pragma starts with a newline then we must keep -- it to differentiate the two cases. - let dropIfSpace (' ':xs) = xs - dropIfSpace xs = xs + let dropIfSpace t = case T.uncons t of + Just (' ', xs) -> xs + _ -> t in case fst opt of Just opt' -> do @@ -996,10 +995,10 @@ lexPragmaStart = do -- discard 3 -- #-} -- topLexer -- we just discard it as a comment for now and restart -} -lexRawPragma :: Lex a String +lexRawPragma :: Lex a Text lexRawPragma = lexRawPragmaAux where lexRawPragmaAux = do - rpr <- lexWhile (/='#') + rpr <- lexWhileT (/='#') s <- getInput case s of '#':'-':'}':_ -> return rpr @@ -1007,37 +1006,37 @@ lexRawPragma = lexRawPragmaAux _ -> do discard 1 rpr' <- lexRawPragma - return $ rpr ++ '#':rpr' + return $ rpr <> T.cons '#' rpr' lexDecimalOrFloat :: Lex a Token lexDecimalOrFloat = do - ds <- lexWhile isDigit + ds <- lexWhileT isDigit rest <- getInput exts <- getExtensionsL case rest of ('.':d:_) | isDigit d -> do discard 1 - frac <- lexWhile isDigit - let num = parseInteger 10 (ds ++ frac) - decimals = toInteger (length frac) + frac <- lexWhileT isDigit + let num = parseInteger 10 (ds <> frac) + decimals = toInteger (T.length frac) (exponent, estr) <- do rest2 <- getInput case rest2 of 'e':_ -> lexExponent 'E':_ -> lexExponent - _ -> return (0,"") + _ -> return (0, T.empty) con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash) - return $ con ((num%1) * 10^^(exponent - decimals), ds ++ '.':frac ++ estr) + return $ con ((num%1) * 10^^(exponent - decimals), ds <> T.cons '.' frac <> estr) e:_ | toLower e == 'e' -> do (exponent, estr) <- lexExponent con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash) - return $ con ((parseInteger 10 ds%1) * 10^^exponent, ds ++ estr) + return $ con ((parseInteger 10 ds%1) * 10^^exponent, ds <> estr) '#':'#':_ | MagicHash `elem` exts -> discard 2 >> return (WordTokHash (parseInteger 10 ds, ds)) '#':_ | MagicHash `elem` exts -> discard 1 >> return (IntTokHash (parseInteger 10 ds, ds)) _ -> return (IntTok (parseInteger 10 ds, ds)) where - lexExponent :: Lex a (Integer, String) + lexExponent :: Lex a (Integer, Text) lexExponent = do (e:r) <- getInput discard 1 -- 'e' or 'E' @@ -1045,12 +1044,12 @@ lexDecimalOrFloat = do '+':d:_ | isDigit d -> do discard 1 (n, str) <- lexDecimal - return (n, e:'+':str) + return (n, T.cons e (T.cons '+' str)) '-':d:_ | isDigit d -> do discard 1 (n, str) <- lexDecimal - return (negate n, e:'-':str) - d:_ | isDigit d -> lexDecimal >>= \(n,str) -> return (n, e:str) + return (negate n, T.cons e (T.cons '-' str)) + d:_ | isDigit d -> lexDecimal >>= \(n,str) -> return (n, T.cons e str) _ -> fail "Float with missing exponent" lexHash :: (b -> Token) -> (b -> Token) -> Either String (b -> Token) -> Lex a (b -> Token) @@ -1067,13 +1066,13 @@ lexHash a b c = do _ -> return a else return a -lexConIdOrQual :: String -> Lex a Token +lexConIdOrQual :: Text -> Lex a Token lexConIdOrQual qual = do - con <- lexWhile isIdent - let conid | null qual = ConId con - | otherwise = QConId (qual,con) - qual' | null qual = con - | otherwise = qual ++ '.':con + con <- lexWhileT isIdent + let conid | T.null qual = ConId con + | otherwise = QConId (qual, con) + qual' | T.null qual = con + | otherwise = qual <> T.cons '.' con just_a_conid <- alternative (return conid) rest <- getInput exts <- getExtensionsL @@ -1081,11 +1080,11 @@ lexConIdOrQual qual = do '.':c:_ | isIdentStart c -> do -- qualified varid? discard 1 - ident <- lexWhile isIdent + ident <- lexWhileT isIdent s <- getInput exts' <- getExtensionsL ident' <- case s of - '#':_ | MagicHash `elem` exts' -> discard 1 >> return (ident ++ "#") + '#':_ | MagicHash `elem` exts' -> discard 1 >> return (ident <> T.singleton '#') _ -> return ident case lookup ident' reserved_ids of -- cannot qualify a reserved word @@ -1098,7 +1097,7 @@ lexConIdOrQual qual = do | isHSymbol c -> do -- qualified symbol? discard 1 - sym <- lexWhile isHSymbol + sym <- lexWhileT isHSymbol exts' <- getExtensionsL case lookup sym reserved_ops of -- cannot qualify a reserved operator @@ -1113,8 +1112,8 @@ lexConIdOrQual qual = do not (isIdent $ head cs) && MagicHash `elem` exts -> do discard 1 case conid of - ConId con' -> return $ ConId $ con' ++ "#" - QConId (q,con') -> return $ QConId (q,con' ++ "#") + ConId con' -> return $ ConId $ con' <> "#" + QConId (q,con') -> return $ QConId (q, con' <> "#") _ -> fail $ "lexConIdOrQual: unexpected token: " ++ show conid _ -> return conid -- not a qualified thing @@ -1130,66 +1129,86 @@ lexCharacter = do -- We need to keep track of not only character constants but matchQuote con <- lexHash Character CharacterHash (Left "Double hash not available for character literals") - return (con (c, '\\':raw)) + return (con (c, T.cons '\\' raw)) c:'\'':_ -> do discard 2 con <- lexHash Character CharacterHash (Left "Double hash not available for character literals") - return (con (c, [c])) + return (con (c, T.singleton c)) _ | any (`elem` exts) [TemplateHaskell, DataKinds] -> return THVarQuote _ -> fail "Improper character constant or misplaced \'" where matchQuote = matchChar '\'' "Improperly terminated character constant" +-- Scan-and-splice: walk the input 'Text' looking for the longest run +-- of "plain" characters (not '"', '\\', '\n'), slice it as a single +-- 'Text' chunk that shares the input 'ByteArray', then handle the +-- terminator or escape. Per-token allocation is O(escapes), not +-- O(chars) -- a literal with no escapes allocates a single 'Text' +-- record; one with K escapes allocates O(K) chunks plus K +-- 'T.singleton' values for the parsed escape characters. +-- +-- The accumulators are reverse-ordered chunk lists (newest chunk +-- first); we 'T.concat . reverse' once at the end to materialize the +-- final strict 'Text' payload. lexString :: Lex a Token -lexString = loop ("","") - where - loop (s,raw) = do - r <- getInput - exts <- getExtensionsL - case r of - '\\':'&':_ -> do - discard 2 - loop (s, '&':'\\':raw) - '\\':c:_ | isSpace c -> do - discard 1 - wcs <- lexWhiteChars - matchChar '\\' "Illegal character in string gap" - loop (s, '\\':reverse wcs ++ '\\':raw) - | otherwise -> do - (ce, str) <- lexEscape - loop (ce:s, reverse str ++ '\\':raw) - '"':'#':_ | MagicHash `elem` exts -> do - discard 2 - return (StringHash (reverse s, reverse raw)) - '"':_ -> do - discard 1 - return (StringTok (reverse s, reverse raw)) - c:_ | c /= '\n' -> do - discard 1 - loop (c:s, c:raw) - _ -> fail "Improperly terminated string" - - lexWhiteChars :: Lex a String - lexWhiteChars = do - s <- getInput - case s of - '\n':_ -> do - lexNewline - wcs <- lexWhiteChars - return $ '\n':wcs - '\t':_ -> do - lexTab - wcs <- lexWhiteChars - return $ '\t':wcs - c:_ | isSpace c -> do - discard 1 - wcs <- lexWhiteChars - return $ c:wcs - _ -> return "" - -lexEscape :: Lex a (Char, String) +lexString = loop [] [] + where + isPlain c = c /= '"' && c /= '\\' && c /= '\n' + + -- Fast-merge for the very common case of a literal with no escapes. + -- 's' and 'raw' would both be a single chunk, so we'd otherwise + -- 'T.concat [taken]' which is identity but still allocates. Skip. + finish [taken] = taken + finish chunks = T.concat (reverse chunks) + + loop sChunks rawChunks = do + inp <- getInputT + let (taken, rest) = T.span isPlain inp + let !sChunks' = if T.null taken then sChunks else taken : sChunks + let !rawChunks' = if T.null taken then rawChunks else taken : rawChunks + discard (T.length taken) + exts <- getExtensionsL + case T.uncons rest of + Just ('"', after) -> do + discard 1 + case T.uncons after of + Just ('#', _) | MagicHash `elem` exts -> do + discard 1 + return (StringHash (finish sChunks', finish rawChunks')) + _ -> return (StringTok (finish sChunks', finish rawChunks')) + Just ('\\', _) -> do + inp2 <- getInputT + case T.unpack (T.take 2 inp2) of + "\\&" -> do + discard 2 + loop sChunks' (T.pack "\\&" : rawChunks') + '\\':c:[] | isSpace c -> do + discard 1 + wcs <- lexWhiteChars + matchChar '\\' "Illegal character in string gap" + -- Raw rep: '\' + wcs + '\' + loop sChunks' (T.singleton '\\' : wcs : T.singleton '\\' : rawChunks') + _ -> do + (ce, str) <- lexEscape + loop (T.singleton ce : sChunks') (str : T.singleton '\\' : rawChunks') + _ -> fail "Improperly terminated string" + + lexWhiteChars :: Lex a Text + lexWhiteChars = T.concat . reverse <$> go [] + where + go acc = do s <- getInput + case s of + '\n':_ -> do lexNewline + go (T.singleton '\n' : acc) + '\t':_ -> do lexTab + go (T.singleton '\t' : acc) + c:_ | isSpace c -> do discard 1 + go (T.singleton c : acc) + _ -> return acc + +lexEscape :: Lex a (Char, Text) lexEscape = do discard 1 r <- getInput @@ -1252,12 +1271,12 @@ lexEscape = do discard 1 (n, raw) <- lexOctal n' <- checkChar n - return (n', 'o':raw) + return (n', T.cons 'o' raw) 'x':c:_ | isHexDigit c -> do discard 1 (n, raw) <- lexHexadecimal n' <- checkChar n - return (n', 'x':raw) + return (n', T.cons 'x' raw) c:_ | isDigit c -> do (n, raw) <- lexDecimal n' <- checkChar n @@ -1271,38 +1290,38 @@ lexEscape = do -- Production cntrl from section B.2 - cntrl :: Char -> Lex a (Char, String) - cntrl c | c >= '@' && c <= '_' = return (chr (ord c - ord '@'), '^':c:[]) + cntrl :: Char -> Lex a (Char, Text) + cntrl c | c >= '@' && c <= '_' = return (chr (ord c - ord '@'), T.pack ('^':c:[])) cntrl _ = fail "Illegal control character" -- assumes at least one octal digit -lexOctal :: Lex a (Integer, String) +lexOctal :: Lex a (Integer, Text) lexOctal = do - ds <- lexWhile isOctDigit + ds <- lexWhileT isOctDigit return (parseInteger 8 ds, ds) -- assumes at least one binary digit -lexBinary :: Lex a (Integer, String) +lexBinary :: Lex a (Integer, Text) lexBinary = do - ds <- lexWhile isBinDigit + ds <- lexWhileT isBinDigit return (parseInteger 2 ds, ds) -- assumes at least one hexadecimal digit -lexHexadecimal :: Lex a (Integer, String) +lexHexadecimal :: Lex a (Integer, Text) lexHexadecimal = do - ds <- lexWhile isHexDigit + ds <- lexWhileT isHexDigit return (parseInteger 16 ds, ds) -- assumes at least one decimal digit -lexDecimal :: Lex a (Integer, String) +lexDecimal :: Lex a (Integer, Text) lexDecimal = do - ds <- lexWhile isDigit + ds <- lexWhileT isDigit return (parseInteger 10 ds, ds) -- Stolen from Hugs's Prelude -parseInteger :: Integer -> String -> Integer -parseInteger radix ds = - foldl1 (\n d -> n * radix + d) (map (toInteger . digitToInt) ds) +parseInteger :: Integer -> Text -> Integer +parseInteger radix = + T.foldl' (\n c -> n * radix + toInteger (digitToInt c)) 0 flagKW :: Token -> Lex a () flagKW t = @@ -1318,28 +1337,28 @@ isBinDigit c = c >= '0' && c <= '1' showToken :: Token -> String showToken t = case t of - VarId s -> s - LabelVarId s -> '#':s - QVarId (q,s) -> q ++ '.':s - IDupVarId s -> '?':s - ILinVarId s -> '%':s - ConId s -> s - QConId (q,s) -> q ++ '.':s - DVarId ss -> intercalate "-" ss - VarSym s -> s - ConSym s -> s - QVarSym (q,s) -> q ++ '.':s - QConSym (q,s) -> q ++ '.':s - IntTok (_, s) -> s - FloatTok (_, s) -> s - Character (_, s) -> '\'':s ++ "'" - StringTok (_, s) -> '"':s ++ "\"" - IntTokHash (_, s) -> s ++ "#" - WordTokHash (_, s) -> s ++ "##" - FloatTokHash (_, s) -> s ++ "#" - DoubleTokHash (_, s) -> s ++ "##" - CharacterHash (_, s) -> '\'':s ++ "'#" - StringHash (_, s) -> '"':s ++ "\"#" + VarId s -> T.unpack s + LabelVarId s -> '#' : T.unpack s + QVarId (q,s) -> T.unpack q ++ '.' : T.unpack s + IDupVarId s -> '?' : T.unpack s + ILinVarId s -> '%' : T.unpack s + ConId s -> T.unpack s + QConId (q,s) -> T.unpack q ++ '.' : T.unpack s + DVarId ss -> intercalate "-" (map T.unpack ss) + VarSym s -> T.unpack s + ConSym s -> T.unpack s + QVarSym (q,s) -> T.unpack q ++ '.' : T.unpack s + QConSym (q,s) -> T.unpack q ++ '.' : T.unpack s + IntTok (_, s) -> T.unpack s + FloatTok (_, s) -> T.unpack s + Character (_, s) -> '\'' : T.unpack s ++ "'" + StringTok (_, s) -> '"' : T.unpack s ++ "\"" + IntTokHash (_, s) -> T.unpack s ++ "#" + WordTokHash (_, s) -> T.unpack s ++ "##" + FloatTokHash (_, s) -> T.unpack s ++ "#" + DoubleTokHash (_, s) -> T.unpack s ++ "##" + CharacterHash (_, s) -> '\'' : T.unpack s ++ "'#" + StringHash (_, s) -> '"' : T.unpack s ++ "\"#" LeftParen -> "(" RightParen -> ")" LeftHashParen -> "(#" @@ -1385,13 +1404,13 @@ showToken t = case t of THTypQuote -> "[t|" THCloseQuote -> "|]" THTCloseQuote -> "||]" - THIdEscape s -> '$':s + THIdEscape s -> '$' : T.unpack s THParenEscape -> "$(" - THTIdEscape s -> "$$" ++ s + THTIdEscape s -> "$$" ++ T.unpack s THTParenEscape -> "$$(" THVarQuote -> "'" THTyQuote -> "''" - THQuasiQuote (n,q) -> "[$" ++ n ++ "|" ++ q ++ "]" + THQuasiQuote (n,q) -> "[$" ++ T.unpack n ++ "|" ++ T.unpack q ++ "]" RPGuardOpen -> "(|" RPGuardClose -> "|)" RPCAt -> "@:" @@ -1401,7 +1420,7 @@ showToken t = case t of XStdTagClose -> ">" XCloseTagOpen -> " "/>" - XPCDATA s -> "PCDATA " ++ s + XPCDATA s -> "PCDATA " ++ T.unpack s XRPatOpen -> "<[" XRPatClose -> "]>" PragmaEnd -> "#-}" @@ -1418,7 +1437,7 @@ showToken t = case t of CORE -> "{-# CORE" UNPACK -> "{-# UNPACK" NOUNPACK -> "{-# NOUNPACK" - OPTIONS (mt,_) -> "{-# OPTIONS" ++ maybe "" (':':) mt ++ " ..." + OPTIONS (mt,_) -> "{-# OPTIONS" ++ maybe "" ((':':) . T.unpack) mt ++ " ..." -- CFILES s -> "{-# CFILES ..." -- INCLUDE s -> "{-# INCLUDE ..." LANGUAGE -> "{-# LANGUAGE" diff --git a/src/Language/Haskell/Exts/InternalParser.ly b/src/Language/Haskell/Exts/InternalParser.ly index 523b8ef9..9c2ae71d 100644 --- a/src/Language/Haskell/Exts/InternalParser.ly +++ b/src/Language/Haskell/Exts/InternalParser.ly @@ -41,6 +41,7 @@ > import Control.Monad ( liftM, (<=<), when ) > import Control.Applicative ( (<$>) ) > import Data.Maybe +> import qualified Data.Text as T > #if MIN_VERSION_base(4,11,0) > import Prelude hiding ((<>)) > #endif @@ -367,7 +368,7 @@ TODO: Yuck, this is messy, needs fixing in the AST! > toppragma :: { ModulePragma L } > : '{-# LANGUAGE' conids optsemis '#-}' { LanguagePragma ($1 <^^> $4 <** ($1:snd $2 ++ reverse $3 ++ [$4])) (fst $2) } > | '{-# OPTIONS' optsemis '#-}' { let Loc l (OPTIONS (mc, s)) = $1 -> in OptionsPragma (l <^^> $3 <** (l:reverse $2 ++ [$3])) (readTool mc) s } +> in OptionsPragma (l <^^> $3 <** (l:reverse $2 ++ [$3])) (readTool (fmap T.unpack mc)) (T.unpack s) } > | '{-# ANN' annotation '#-}' { AnnModulePragma ($1 <^^> $3 <** [$1,$3]) $2 } @@ -388,8 +389,8 @@ Module Header > | {- empty -} { Nothing } > maybemodwarning :: { Maybe (WarningText L) } -> : '{-# DEPRECATED' STRING '#-}' { let Loc l (StringTok (s,_)) = $2 in Just $ DeprText ($1 <^^> $3 <** [$1,l,$3]) s } -> | '{-# WARNING' STRING '#-}' { let Loc l (StringTok (s,_)) = $2 in Just $ WarnText ($1 <^^> $3 <** [$1,l,$3]) s } +> : '{-# DEPRECATED' STRING '#-}' { let Loc l (StringTok (s,_)) = $2 in Just $ DeprText ($1 <^^> $3 <** [$1,l,$3]) (T.unpack s) } +> | '{-# WARNING' STRING '#-}' { let Loc l (StringTok (s,_)) = $2 in Just $ WarnText ($1 <^^> $3 <** [$1,l,$3]) (T.unpack s) } > | {- empty -} { Nothing } > body :: { ([ImportDecl L],[Decl L],[S],L) } @@ -487,7 +488,7 @@ Requires the PackageImports extension enabled. > maybepkg :: { (Maybe String,[S]) } > : STRING {% do { checkEnabled PackageImports ; > let { Loc l (StringTok (s,_)) = $1 } ; -> return $ (Just s,[l]) } } +> return $ (Just (T.unpack s),[l]) } } > | {- empty -} { (Nothing,[]) } > maybeas :: { (Maybe (ModuleName L),[S],Maybe L) } @@ -726,7 +727,7 @@ Role annotations > -- read it in as a varid for better error messages > role :: { (Maybe String, L) } -> role : VARID { let (VarId v) = unLoc $1 in (Just v, nIS $ loc $1) } +> role : VARID { let (VarId v) = unLoc $1 in (Just (T.unpack v), nIS $ loc $1) } > | '_' { (Nothing, nIS $1) } @@ -888,7 +889,7 @@ so no need to check for extensions. > | {- empty -} { Nothing } > fspec :: { (Maybe String, Name L, Type L, [S]) } -> : STRING var_no_safety '::' truedtype { let Loc l (StringTok (s,_)) = $1 in (Just s, $2, $4, [l,$3]) } +> : STRING var_no_safety '::' truedtype { let Loc l (StringTok (s,_)) = $1 in (Just (T.unpack s), $2, $4, [l,$3]) } > | var_no_safety '::' truedtype { (Nothing, $1, $3, [$2]) } ----------------------------------------------------------------------------- @@ -903,7 +904,7 @@ Pragmas > rule :: { Rule L } > : STRING activation ruleforall exp0 '=' trueexp {% do { let {Loc l (StringTok (s,_)) = $1}; > e <- checkRuleExpr $4; -> return $ Rule (nIS l <++> ann $6 <** l:snd $3 ++ [$5]) s $2 (fst $3) e $6 } } +> return $ Rule (nIS l <++> ann $6 <** l:snd $3 ++ [$5]) (T.unpack s) $2 (fst $3) e $6 } } > activation :: { Maybe (Activation L) } > : {- empty -} { Nothing } @@ -929,7 +930,7 @@ Pragmas > | {- empty -} { ([],[]) } > warndepr :: { (([Name L], String),[S]) } -> : namevars STRING { let Loc l (StringTok (s,_)) = $2 in ((fst $1,s),snd $1 ++ [l]) } +> : namevars STRING { let Loc l (StringTok (s,_)) = $2 in ((fst $1, T.unpack s),snd $1 ++ [l]) } > namevars :: { ([Name L],[S]) } > : namevar { ([$1],[]) } @@ -1009,10 +1010,10 @@ the (# and #) lexemes. Kinds will be handled at the kind rule. > | '(' ctype_(ostar,kstar) '::' kind ')' { TyKind ($1 <^^> $5 <** [$1,$3,$5]) $2 $4 } > | '$(' trueexp ')' { let l = ($1 <^^> $3 <** [$1,$3]) in TySplice l $ ParenSplice l $2 } > | '$$(' trueexp ')' { let l = ($1 <^^> $3 <** [$1,$3]) in TySplice l $ TParenSplice l $2 } -> | IDSPLICE { let Loc l (THIdEscape s) = $1 in TySplice (nIS l) $ IdSplice (nIS l) s } -> | TIDSPLICE { let Loc l (THTIdEscape s) = $1 in TySplice (nIS l) $ TIdSplice (nIS l) s } +> | IDSPLICE { let Loc l (THIdEscape s) = $1 in TySplice (nIS l) $ IdSplice (nIS l) (T.unpack s) } +> | TIDSPLICE { let Loc l (THTIdEscape s) = $1 in TySplice (nIS l) $ TIdSplice (nIS l) (T.unpack s) } > | '_' { TyWildCard (nIS $1) Nothing } -> | QUASIQUOTE { let Loc l (THQuasiQuote (n,q)) = $1 in TyQuasiQuote (nIS l) n q } +> | QUASIQUOTE { let Loc l (THQuasiQuote (n,q)) = $1 in TyQuasiQuote (nIS l) (T.unpack n) (T.unpack q) } > | ptype_(ostar,kstar) { % checkEnabled DataKinds >> return (TyPromoted (ann $1) $1) } > ptype_(ostar,kstar) :: { Promoted L } @@ -1022,8 +1023,8 @@ the (# and #) lexemes. Kinds will be handled at the kind rule. > | VARQUOTE '[' ']' { PromotedList ($1 <^^> $3 <** [$1, $3]) True [] } | '[' ']' {% PromotedList ($1 <^^> $2 <** [$1, $2]) False [] } > | VARQUOTE '(' types1 ')' {% PromotedTuple ($1 <^^> $4 <** ($1:reverse($4:snd $3))) . reverse <\$> mapM checkType (fst $3) } -> | INT { let Loc l (IntTok (i,raw)) = $1 in PromotedInteger (nIS l) i raw } -> | STRING { let Loc l (StringTok (s,raw)) = $1 in PromotedString (nIS l) s raw } +> | INT { let Loc l (IntTok (i,raw)) = $1 in PromotedInteger (nIS l) i (T.unpack raw) } +> | STRING { let Loc l (StringTok (s,raw)) = $1 in PromotedString (nIS l) (T.unpack s) (T.unpack raw) } > strict_mark :: { (Maybe (L -> BangType L,S), Maybe (Unpackedness L)) } @@ -1476,8 +1477,8 @@ mdo blocks require the RecursiveDo extension enabled, but the lexer handles that > | 'mdo' stmtlist { let (sts, inf, ss) = $2 in MDo (nIS $1 <++> inf <** $1:ss) sts } > exppragma :: { PExp L } -> : '{-# CORE' STRING '#-}' exp { let Loc l (StringTok (s,_)) = $2 in CorePragma (nIS $1 <++> ann $4 <** [l,$3]) s $4 } -> | '{-# SCC' STRING '#-}' exp { let Loc l (StringTok (s,_)) = $2 in SCCPragma (nIS $1 <++> ann $4 <** [l,$3]) s $4 } +> : '{-# CORE' STRING '#-}' exp { let Loc l (StringTok (s,_)) = $2 in CorePragma (nIS $1 <++> ann $4 <** [l,$3]) (T.unpack s) $4 } +> | '{-# SCC' STRING '#-}' exp { let Loc l (StringTok (s,_)) = $2 in SCCPragma (nIS $1 <++> ann $4 <** [l,$3]) (T.unpack s) $4 } > | '{-# GENERATED' STRING INT ':' INT '-' INT ':' INT '#-}' exp > { let { Loc l0 (StringTok (s,_)) = $2; > Loc l1 (IntTok (i1,_)) = $3; @@ -1485,7 +1486,7 @@ mdo blocks require the RecursiveDo extension enabled, but the lexer handles that > Loc l3 (IntTok (i3,_)) = $7; > Loc l4 (IntTok (i4,_)) = $9} > in GenPragma (nIS $1 <++> ann $11 <** [$1,l0,l1,$4,l2,$6,l3,$8,l4,$10]) -> s (fromInteger i1, fromInteger i2) +> (T.unpack s) (fromInteger i1, fromInteger i2) > (fromInteger i3, fromInteger i4) $11 } > fexp :: { PExp L } @@ -1558,8 +1559,8 @@ thing we need to look at here is the erpats that use no non-standard lexemes. Template Haskell - all this is enabled in the lexer. -> | IDSPLICE { let Loc l (THIdEscape s) = $1 in SpliceExp (nIS l) $ IdSplice (nIS l) s } -> | TIDSPLICE { let Loc l (THTIdEscape s) = $1 in SpliceExp (nIS l) $ TIdSplice (nIS l) s } +> | IDSPLICE { let Loc l (THIdEscape s) = $1 in SpliceExp (nIS l) $ IdSplice (nIS l) (T.unpack s) } +> | TIDSPLICE { let Loc l (THTIdEscape s) = $1 in SpliceExp (nIS l) $ TIdSplice (nIS l) (T.unpack s) } > | '$(' trueexp ')' { let l = ($1 <^^> $3 <** [$1,$3]) in SpliceExp l $ ParenSplice l $2 } > | '$$(' trueexp ')' { let l = ($1 <^^> $3 <** [$1,$3]) in SpliceExp l $ TParenSplice l $2 } > | '[|' trueexp '|]' { let l = ($1 <^^> $3 <** [$1,$3]) in BracketExp l $ ExpBracket l $2 } @@ -1580,7 +1581,7 @@ Template Haskell - all this is enabled in the lexer. > | VARQUOTE qcon { VarQuote (nIS $1 <++> ann $2 <** [$1]) $2 } > | TYPQUOTE tyvar { TypQuote (nIS $1 <++> ann $2 <** [$1]) (UnQual (ann $2) $2) } > | TYPQUOTE gtycon { TypQuote (nIS $1 <++> ann $2 <** [$1]) $2 } -> | QUASIQUOTE { let Loc l (THQuasiQuote (n,q)) = $1 in QuasiQuote (nIS l) n q } +> | QUASIQUOTE { let Loc l (THQuasiQuote (n,q)) = $1 in QuasiQuote (nIS l) (T.unpack n) (T.unpack q) } End Template Haskell > tup_exprs :: { ([S], SumOrTuple L) } @@ -1646,7 +1647,7 @@ Hsx Extensions - requires XmlSyntax, but the lexer handles all that. > | {- empty -} { [] } > child :: { PExp L } -> : PCDATA { let Loc l (XPCDATA pcd) = $1 in XPcdata (nIS l) pcd } +> : PCDATA { let Loc l (XPCDATA pcd) = $1 in XPcdata (nIS l) (T.unpack pcd) } > | '<[' sexps ']>' { XRPats ($1 <^^> $3 <** (snd $2 ++ [$1,$3])) $ reverse (fst $2) } > | xml { $1 } @@ -1656,9 +1657,9 @@ Hsx Extensions - requires XmlSyntax, but the lexer handles all that. > | xmlname { let Loc l str = $1 in XName (nIS l) str } > xmlname :: { Loc String } -> : VARID { let Loc l (VarId s) = $1 in Loc l s } -> | CONID { let Loc l (ConId s) = $1 in Loc l s } -> | DVARID { let Loc l (DVarId s) = $1 in Loc l $ mkDVar s } +> : VARID { let Loc l (VarId s) = $1 in Loc l (T.unpack s) } +> | CONID { let Loc l (ConId s) = $1 in Loc l (T.unpack s) } +> | DVARID { let Loc l (DVarId s) = $1 in Loc l $ mkDVar (map T.unpack s) } > | xmlkeyword { $1 } > xmlkeyword :: { Loc String } @@ -1986,7 +1987,7 @@ Implicit parameter > overloaded_label :: { PExp L } > : LABELVARID { let Loc l (LabelVarId v) = $1 in OverloadedLabel -> (nIS l) v } +> (nIS l) (T.unpack v) } ----------------------------------------------------------------------------- Identifiers and Symbols @@ -1994,11 +1995,11 @@ Identifiers and Symbols > qvarid :: { QName L } > : varid { UnQual (ann $1) $1 } > | QVARID { let {Loc l (QVarId q) = $1; nis = nIS l} -> in Qual nis (ModuleName nis (fst q)) (Ident nis (snd q)) } +> in Qual nis (ModuleName nis (T.unpack (fst q))) (Ident nis (T.unpack (snd q))) } > | '_' { hole_name (nIS $1) } > varid_no_safety :: { Name L } -> : VARID { let Loc l (VarId v) = $1 in Ident (nIS l) v } +> : VARID { let Loc l (VarId v) = $1 in Ident (nIS l) (T.unpack v) } > | 'as' { as_name (nIS $1) } > | 'qualified' { qualified_name (nIS $1) } > | 'hiding' { hiding_name (nIS $1) } @@ -2027,22 +2028,22 @@ Identifiers and Symbols Implicit parameter > ivarid :: { IPName L } -> : IDUPID { let Loc l (IDupVarId i) = $1 in IPDup (nIS l) i } -> | ILINID { let Loc l (ILinVarId i) = $1 in IPLin (nIS l) i } +> : IDUPID { let Loc l (IDupVarId i) = $1 in IPDup (nIS l) (T.unpack i) } +> | ILINID { let Loc l (ILinVarId i) = $1 in IPLin (nIS l) (T.unpack i) } > qconid :: { QName L } > : conid { UnQual (ann $1) $1 } -> | QCONID { let {Loc l (QConId q) = $1; nis = nIS l} in Qual nis (ModuleName nis (fst q)) (Ident nis (snd q)) } +> | QCONID { let {Loc l (QConId q) = $1; nis = nIS l} in Qual nis (ModuleName nis (T.unpack (fst q))) (Ident nis (T.unpack (snd q))) } > conid :: { Name L } -> : CONID { let Loc l (ConId c) = $1 in Ident (nIS l) c } +> : CONID { let Loc l (ConId c) = $1 in Ident (nIS l) (T.unpack c) } > qconsym :: { QName L } > : consym { UnQual (ann $1) $1 } -> | QCONSYM { let {Loc l (QConSym q) = $1; nis = nIS l} in Qual nis (ModuleName nis (fst q)) (Symbol nis (snd q)) } +> | QCONSYM { let {Loc l (QConSym q) = $1; nis = nIS l} in Qual nis (ModuleName nis (T.unpack (fst q))) (Symbol nis (T.unpack (snd q))) } > consym :: { Name L } -> : CONSYM { let Loc l (ConSym c) = $1 in Symbol (nIS l) c } +> : CONSYM { let Loc l (ConSym c) = $1 in Symbol (nIS l) (T.unpack c) } > qvarsym :: { QName L } > : qvarsym_('*') { $1 } @@ -2066,25 +2067,25 @@ Implicit parameter > : varsymm_('*') { $1 } > varsymm_(ostar) :: { Name L } -- varsym not including '-' -> : VARSYM { let Loc l (VarSym v) = $1 in Symbol (nIS l) v } +> : VARSYM { let Loc l (VarSym v) = $1 in Symbol (nIS l) (T.unpack v) } > | '!' { bang_name (nIS $1) } > | '.' { dot_name (nIS $1) } > | ostar { star_name (nIS $1) } > qvarsym1 :: { QName L } -> : QVARSYM { let {Loc l (QVarSym q) = $1; nis = nIS l} in Qual nis (ModuleName nis (fst q)) (Symbol nis (snd q)) } +> : QVARSYM { let {Loc l (QVarSym q) = $1; nis = nIS l} in Qual nis (ModuleName nis (T.unpack (fst q))) (Symbol nis (T.unpack (snd q))) } > literal :: { Literal L } -> : INT { let Loc l (IntTok (i,raw)) = $1 in Int (nIS l) i raw } -> | CHAR { let Loc l (Character (c,raw)) = $1 in Char (nIS l) c raw } -> | RATIONAL { let Loc l (FloatTok (r,raw)) = $1 in Frac (nIS l) r raw } -> | STRING { let Loc l (StringTok (s,raw)) = $1 in String (nIS l) s raw } -> | PRIMINT { let Loc l (IntTokHash (i,raw)) = $1 in PrimInt (nIS l) i raw } -> | PRIMWORD { let Loc l (WordTokHash (w,raw)) = $1 in PrimWord (nIS l) w raw } -> | PRIMFLOAT { let Loc l (FloatTokHash (f,raw)) = $1 in PrimFloat (nIS l) f raw } -> | PRIMDOUBLE { let Loc l (DoubleTokHash (d,raw)) = $1 in PrimDouble (nIS l) d raw } -> | PRIMCHAR { let Loc l (CharacterHash (c,raw)) = $1 in PrimChar (nIS l) c raw } -> | PRIMSTRING { let Loc l (StringHash (s,raw)) = $1 in PrimString (nIS l) s raw } +> : INT { let Loc l (IntTok (i,raw)) = $1 in Int (nIS l) i (T.unpack raw) } +> | CHAR { let Loc l (Character (c,raw)) = $1 in Char (nIS l) c (T.unpack raw) } +> | RATIONAL { let Loc l (FloatTok (r,raw)) = $1 in Frac (nIS l) r (T.unpack raw) } +> | STRING { let Loc l (StringTok (s,raw)) = $1 in String (nIS l) (T.unpack s) (T.unpack raw) } +> | PRIMINT { let Loc l (IntTokHash (i,raw)) = $1 in PrimInt (nIS l) i (T.unpack raw) } +> | PRIMWORD { let Loc l (WordTokHash (w,raw)) = $1 in PrimWord (nIS l) w (T.unpack raw) } +> | PRIMFLOAT { let Loc l (FloatTokHash (f,raw)) = $1 in PrimFloat (nIS l) f (T.unpack raw) } +> | PRIMDOUBLE { let Loc l (DoubleTokHash (d,raw)) = $1 in PrimDouble (nIS l) d (T.unpack raw) } +> | PRIMCHAR { let Loc l (CharacterHash (c,raw)) = $1 in PrimChar (nIS l) c (T.unpack raw) } +> | PRIMSTRING { let Loc l (StringHash (s,raw)) = $1 in PrimString (nIS l) (T.unpack s) (T.unpack raw) } ----------------------------------------------------------------------------- Layout @@ -2195,8 +2196,8 @@ Deriving strategies Miscellaneous (mostly renamings) > modid :: { ModuleName L } -> : CONID { let Loc l (ConId n) = $1 in ModuleName (nIS l) n } -> | QCONID { let Loc l (QConId n) = $1 in ModuleName (nIS l) (fst n ++ '.':snd n) } +> : CONID { let Loc l (ConId n) = $1 in ModuleName (nIS l) (T.unpack n) } +> | QCONID { let Loc l (QConId n) = $1 in ModuleName (nIS l) (T.unpack (fst n) ++ '.':T.unpack (snd n)) } > tyconorcls :: { Name L } > : con { $1 } @@ -2220,7 +2221,7 @@ Miscellaneous (mostly renamings) > | tyvarsym_(ostar) { UnQual (ann $1) $1 } > tyvarsym_(ostar) :: { Name L } -> tyvarsym : VARSYM { let Loc l (VarSym x) = $1 in Symbol (nIS l) x } +> tyvarsym : VARSYM { let Loc l (VarSym x) = $1 in Symbol (nIS l) (T.unpack x) } > | '-' { Symbol (nIS $1) "-" } > | ostar { Symbol (nIS $1) "*" } diff --git a/src/Language/Haskell/Exts/Lexer.hs b/src/Language/Haskell/Exts/Lexer.hs index ac41669d..86d9fc7a 100644 --- a/src/Language/Haskell/Exts/Lexer.hs +++ b/src/Language/Haskell/Exts/Lexer.hs @@ -16,6 +16,7 @@ module Language.Haskell.Exts.Lexer ( lexTokenStream, lexTokenStreamWithMode, + lexTokenStreamText, lexTokenStreamTextWithMode, Token(..), Loc(..), @@ -26,6 +27,7 @@ module Language.Haskell.Exts.Lexer import Language.Haskell.Exts.InternalLexer import Language.Haskell.Exts.ParseMonad import Language.Haskell.Exts.SrcLoc +import Data.Text (Text) -- | Lex a string into a list of Haskell 2010 source tokens. lexTokenStream :: String -> ParseResult [Loc Token] @@ -34,12 +36,21 @@ lexTokenStream = lexTokenStreamWithMode defaultParseMode -- | Lex a string into a list of Haskell source tokens, using an explicit mode. lexTokenStreamWithMode :: ParseMode -> String -> ParseResult [Loc Token] lexTokenStreamWithMode mode = runParserWithMode mode lexIt - where lexIt :: P [Loc Token] - lexIt = runL go return - go :: Lex [Loc Token] [Loc Token] - go = do - ltok <- topLexer - case ltok of - Loc _ EOF -> return [] - _ -> do ts <- go - return (ltok:ts) + +-- | 'Text'-input variant of 'lexTokenStream'. Skips the eager 'T.pack' +-- that the 'String' entry point performs at the boundary. +lexTokenStreamText :: Text -> ParseResult [Loc Token] +lexTokenStreamText = lexTokenStreamTextWithMode defaultParseMode + +-- | 'Text'-input variant of 'lexTokenStreamWithMode'. +lexTokenStreamTextWithMode :: ParseMode -> Text -> ParseResult [Loc Token] +lexTokenStreamTextWithMode mode = fmap fst . runParserWithModeCommentsText mode lexIt + +lexIt :: P [Loc Token] +lexIt = runL go return + where go :: Lex [Loc Token] [Loc Token] + go = do ltok <- topLexer + case ltok of + Loc _ EOF -> return [] + _ -> do ts <- go + return (ltok:ts) diff --git a/src/Language/Haskell/Exts/ParseMonad.hs b/src/Language/Haskell/Exts/ParseMonad.hs index bab71a45..80c3134e 100644 --- a/src/Language/Haskell/Exts/ParseMonad.hs +++ b/src/Language/Haskell/Exts/ParseMonad.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# OPTIONS_HADDOCK hide #-} ----------------------------------------------------------------------------- @@ -22,11 +23,12 @@ module Language.Haskell.Exts.ParseMonad( P, ParseResult(..), atSrcLoc, LexContext(..), ParseMode(..), defaultParseMode, fromParseResult, runParserWithMode, runParserWithModeComments, runParser, + runParserWithModeCommentsText, getSrcLoc, pushCurrentContext, popContext, getExtensions, getIgnoreFunctionArity, -- * Lexing - Lex(runL), getInput, discard, getLastChar, lexNewline, - lexTab, lexWhile, lexWhile_, + Lex(runL), getInput, getInputT, discard, getLastChar, lexNewline, + lexTab, lexWhile, lexWhile_, lexWhileT, alternative, checkBOL, setBOL, startToken, getOffside, pushContextL, popContextL, getExtensionsL, addExtensionL, saveExtensionsL, restoreExtensionsL, pushComment, @@ -44,6 +46,8 @@ import Language.Haskell.Exts.Comments import Language.Haskell.Exts.Extension -- (Extension, impliesExts, haskell2010) import Data.List (intercalate) +import Data.Text (Text) +import qualified Data.Text as T import Control.Applicative import Control.Monad (when, liftM, ap) import qualified Control.Monad.Fail as Fail @@ -70,6 +74,13 @@ class Parseable ast where -- with the AST. parseWithComments :: ParseMode -> String -> ParseResult (ast, [Comment]) parseWithComments mode = runParserWithModeComments mode . parser $ fixities mode + -- | 'Text'-input variant of 'parseWithMode'. Avoids the eager + -- 'T.pack'\/'T.unpack' cycle when the caller already has 'Text'. + parseTextWithMode :: ParseMode -> Text -> ParseResult ast + parseTextWithMode mode = fmap fst . parseTextWithComments mode + -- | 'Text'-input variant of 'parseWithComments'. + parseTextWithComments :: ParseMode -> Text -> ParseResult (ast, [Comment]) + parseTextWithComments mode = runParserWithModeCommentsText mode . parser $ fixities mode -- | Internal parser, used to provide default definitions for the others. parser :: Maybe [Fixity] -> P ast @@ -199,7 +210,7 @@ toInternalParseMode (ParseMode pf bLang exts _ilang iline _fx farity) = -- | Monad for parsing newtype P a = P { runP :: - String -- input string + Text -- input text -> Int -- current column -> Int -- current line -> SrcLoc -- location of last token read @@ -225,7 +236,13 @@ runParser :: P a -> String -> ParseResult a runParser = runParserWithMode defaultParseMode runParserWithModeComments :: ParseMode -> P a -> String -> ParseResult (a, [Comment]) -runParserWithModeComments mode = let mode2 = toInternalParseMode mode in \(P m) s -> +runParserWithModeComments mode pm s = runParserWithModeCommentsText mode pm (T.pack s) + +-- | 'Text'-input variant of 'runParserWithModeComments'. Avoids the +-- intermediate 'String' allocation when the caller already has the source +-- as 'Text'. +runParserWithModeCommentsText :: ParseMode -> P a -> Text -> ParseResult (a, [Comment]) +runParserWithModeCommentsText mode = let mode2 = toInternalParseMode mode in \(P m) s -> case m s 0 1 start '\n' ([],[],[],(False,False),[]) mode2 of Ok (_,_,_,_,cs) a -> ParseOk (a, reverse cs) Failed loc msg -> ParseFailed loc msg @@ -370,14 +387,28 @@ instance Fail.MonadFail (Lex r) where -- Operations on this monad +-- | Lazy-'String' view of the remaining input. Kept for compatibility — +-- pattern matching on the result forces only as many characters as examined, +-- so short lookaheads are cheap. The hot loops use 'getInputT' instead. getInput :: Lex r String -getInput = Lex $ \cont -> P $ \r -> runP (cont r) r +getInput = Lex $ \cont -> P $ \r -> runP (cont (T.unpack r)) r + +-- | 'Text' view of the remaining input. Preferred for productions that +-- bulk-consume a run of characters (e.g. string literals), because the +-- backing storage is an O(1) slice of the source buffer. +getInputT :: Lex r Text +getInputT = Lex $ \cont -> P $ \r -> runP (cont r) r -- | Discard some input characters (these must not include tabs or newlines). discard :: Int -> Lex r () discard n = Lex $ \cont -> P $ \r x y loc ch - -> let (newCh:rest)= if n > 0 then drop (n-1) r else (ch:r) + -> let !rest = T.drop n r + !newCh = if n > 0 + then case T.uncons (T.drop (n-1) r) of + Just (c,_) -> c + Nothing -> ch + else ch in runP (cont ()) rest (x+n) y loc newCh -- | Get the last discarded character. @@ -391,14 +422,14 @@ getLastChar = Lex $ \cont -> P $ \r x y loc ch -> runP (cont ch) r x y loc ch lexNewline :: Lex a () lexNewline = Lex $ \cont -> P $ \rs _x y loc -> - case rs of - (_:r) -> runP (cont ()) r 1 (y+1) loc - [] -> \_ _ _ -> Failed loc "Lexer: expected newline." + case T.uncons rs of + Just (_,r) -> runP (cont ()) r 1 (y+1) loc + Nothing -> \_ _ _ -> Failed loc "Lexer: expected newline." -- | Discard the next character, which must be a tab. lexTab :: Lex a () -lexTab = Lex $ \cont -> P $ \(_:r) x -> runP (cont ()) r (nextTab x) +lexTab = Lex $ \cont -> P $ \rs x -> runP (cont ()) (T.drop 1 rs) (nextTab x) nextTab :: Int -> Int nextTab x = x + (tAB_LENGTH - (x-1) `mod` tAB_LENGTH) @@ -409,24 +440,27 @@ tAB_LENGTH = 8 -- Consume and return the largest string of characters satisfying p lexWhile :: (Char -> Bool) -> Lex a String -lexWhile p = Lex $ \cont -> P $ \rss c l loc char -> - case rss of - [] -> runP (cont []) [] c l loc char - (r:rs) -> - let - l' = case r of - '\n' -> l + 1 - _ -> l - c' = case r of - '\n' -> 1 - _ -> c + 1 - in if p r - then runP (runL ((r:) <$> lexWhile p) cont) rs c' l' loc r - else runP (cont []) (r:rs) c l loc char +lexWhile p = T.unpack <$> lexWhileT p + +-- | 'Text'-producing variant of 'lexWhile'. Avoids the intermediate 'String' +-- allocation when the caller is happy with a 'Text' span. Implemented as a +-- single 'T.span' so the result shares the underlying input buffer and we +-- pay no per-char allocation; column/line counters are advanced in one pass. +lexWhileT :: (Char -> Bool) -> Lex a Text +lexWhileT p = Lex $ \cont -> P $ \rss c l loc char -> + let (taken, rest) = T.span p rss + in if T.null taken + then runP (cont T.empty) rss c l loc char + else let !lastCh = T.last taken + (!l', !c') = T.foldl' advance (l, c) taken + advance (!ln, !cn) ch = case ch of + '\n' -> (ln + 1, 1) + _ -> (ln, cn + 1) + in runP (cont taken) rest c' l' loc lastCh -- | lexWhile without the return value. lexWhile_ :: (Char -> Bool) -> Lex a () -lexWhile_ p = do _ <- lexWhile p +lexWhile_ p = do _ <- lexWhileT p return () -- An alternative scan, to which we can return if subsequent scanning diff --git a/src/Language/Haskell/Exts/Parser/Text.hs b/src/Language/Haskell/Exts/Parser/Text.hs new file mode 100644 index 00000000..dff98dc0 --- /dev/null +++ b/src/Language/Haskell/Exts/Parser/Text.hs @@ -0,0 +1,109 @@ +----------------------------------------------------------------------------- +-- | +-- Module : Language.Haskell.Exts.Parser.Text +-- License : BSD-style (see the file LICENSE.txt) +-- +-- Stability : stable +-- Portability : portable +-- +-- 'Text'-input variants of the parser entry points in +-- "Language.Haskell.Exts.Parser". These skip the eager 'T.pack' that +-- the 'String' entry points do at the boundary; the returned AST is +-- still the existing 'String'-valued 'Syntax' AST. +-- +----------------------------------------------------------------------------- +module Language.Haskell.Exts.Parser.Text + ( parseTextWithMode + , parseTextWithComments + , parseModuleText + , parseModuleTextWithMode + , parseModuleTextWithComments + , parseExpText + , parseExpTextWithMode + , parseExpTextWithComments + , parseDeclText + , parseDeclTextWithMode + , parseDeclTextWithComments + , parseTypeText + , parseTypeTextWithMode + , parseTypeTextWithComments + , parsePatText + , parsePatTextWithMode + , parsePatTextWithComments + , parseStmtText + , parseStmtTextWithMode + , parseStmtTextWithComments + , parseImportDeclText + , parseImportDeclTextWithMode + , parseImportDeclTextWithComments + ) where + +import Data.Text (Text) +import Language.Haskell.Exts.Comments +import Language.Haskell.Exts.ParseMonad +import Language.Haskell.Exts.Parser () +import Language.Haskell.Exts.SrcLoc +import Language.Haskell.Exts.Syntax + +parseModuleText :: Text -> ParseResult (Module SrcSpanInfo) +parseModuleText = parseTextWithMode defaultParseMode + +parseModuleTextWithMode :: ParseMode -> Text -> ParseResult (Module SrcSpanInfo) +parseModuleTextWithMode = parseTextWithMode + +parseModuleTextWithComments :: ParseMode -> Text -> ParseResult (Module SrcSpanInfo, [Comment]) +parseModuleTextWithComments = parseTextWithComments + +parseExpText :: Text -> ParseResult (Exp SrcSpanInfo) +parseExpText = parseTextWithMode defaultParseMode + +parseExpTextWithMode :: ParseMode -> Text -> ParseResult (Exp SrcSpanInfo) +parseExpTextWithMode = parseTextWithMode + +parseExpTextWithComments :: ParseMode -> Text -> ParseResult (Exp SrcSpanInfo, [Comment]) +parseExpTextWithComments = parseTextWithComments + +parseDeclText :: Text -> ParseResult (Decl SrcSpanInfo) +parseDeclText = parseTextWithMode defaultParseMode + +parseDeclTextWithMode :: ParseMode -> Text -> ParseResult (Decl SrcSpanInfo) +parseDeclTextWithMode = parseTextWithMode + +parseDeclTextWithComments :: ParseMode -> Text -> ParseResult (Decl SrcSpanInfo, [Comment]) +parseDeclTextWithComments = parseTextWithComments + +parseTypeText :: Text -> ParseResult (Type SrcSpanInfo) +parseTypeText = parseTextWithMode defaultParseMode + +parseTypeTextWithMode :: ParseMode -> Text -> ParseResult (Type SrcSpanInfo) +parseTypeTextWithMode = parseTextWithMode + +parseTypeTextWithComments :: ParseMode -> Text -> ParseResult (Type SrcSpanInfo, [Comment]) +parseTypeTextWithComments = parseTextWithComments + +parsePatText :: Text -> ParseResult (Pat SrcSpanInfo) +parsePatText = parseTextWithMode defaultParseMode + +parsePatTextWithMode :: ParseMode -> Text -> ParseResult (Pat SrcSpanInfo) +parsePatTextWithMode = parseTextWithMode + +parsePatTextWithComments :: ParseMode -> Text -> ParseResult (Pat SrcSpanInfo, [Comment]) +parsePatTextWithComments = parseTextWithComments + +parseStmtText :: Text -> ParseResult (Stmt SrcSpanInfo) +parseStmtText = parseTextWithMode defaultParseMode + +parseStmtTextWithMode :: ParseMode -> Text -> ParseResult (Stmt SrcSpanInfo) +parseStmtTextWithMode = parseTextWithMode + +parseStmtTextWithComments :: ParseMode -> Text -> ParseResult (Stmt SrcSpanInfo, [Comment]) +parseStmtTextWithComments = parseTextWithComments + +parseImportDeclText :: Text -> ParseResult (ImportDecl SrcSpanInfo) +parseImportDeclText = parseTextWithMode defaultParseMode + +parseImportDeclTextWithMode :: ParseMode -> Text -> ParseResult (ImportDecl SrcSpanInfo) +parseImportDeclTextWithMode = parseTextWithMode + +parseImportDeclTextWithComments :: ParseMode -> Text -> ParseResult (ImportDecl SrcSpanInfo, [Comment]) +parseImportDeclTextWithComments = parseTextWithComments