From e532fa1b076be6795db33d3756e6e621834a8dc4 Mon Sep 17 00:00:00 2001
From: "Michal J. Gajda" <mjgajda@gmail.com>
Date: Fri, 8 May 2026 14:43:56 +0000
Subject: [PATCH 1/8] Move parser internals and entry points to Text

Make the lexer's input tape, all Token payloads, and the P/Lex monad
state operate on Data.Text rather than [Char].  This is the
foundation for the Text-input parser API.

Changes:

* Token type's String fields become Text fields.  Parser actions
  T.unpack at AST construction sites to keep the existing String-
  valued Syntax AST intact, so this is non-breaking for AST
  consumers.
* New module Language.Haskell.Exts.Parser.Text exposes Text-input
  variants of parseModule, parseExp, parseDecl, parseType, parsePat,
  parseStmt, parseImportDecl (with -WithMode and -WithComments
  variants for each).  These skip the eager T.pack the existing
  String entry points perform at the boundary.
* lexWhile gains a Text-producing companion lexWhileT.
* getInput's String form is preserved for short lookaheads;
  hot loops (lexWhileT, discard, lexNewline, lexTab) work on Text
  natively.

The Syntax AST stays String-valued in this commit -- a Text-valued
AST is a separate later change.

Adds dependency on the 'text' package.
---
 haskell-src-exts.cabal                      |   2 +
 src/Language/Haskell/Exts/InternalLexer.hs  | 202 ++++++++++----------
 src/Language/Haskell/Exts/InternalParser.ly |  95 ++++-----
 src/Language/Haskell/Exts/ParseMonad.hs     |  69 +++++--
 src/Language/Haskell/Exts/Parser/Text.hs    | 111 +++++++++++
 5 files changed, 316 insertions(+), 163 deletions(-)
 create mode 100644 src/Language/Haskell/Exts/Parser/Text.hs

diff --git a/haskell-src-exts.cabal b/haskell-src-exts.cabal
index fd059739..1cf7c5da 100644
--- a/haskell-src-exts.cabal
+++ b/haskell-src-exts.cabal
@@ -50,6 +50,7 @@ Library
   Build-Tools:          happy >= 1.19
   Build-Depends:        array >= 0.1, pretty >= 1.0,
                         base >= 4.5 && < 5,
+                        text >= 1.2,
                         -- this is needed to access GHC.Generics on GHC 7.4
                         ghc-prim
   -- this is needed to access Data.Semigroup and Control.Monad.Fail on GHCs
@@ -70,6 +71,7 @@ Library
                         Language.Haskell.Exts.Fixity,
                         Language.Haskell.Exts.ExactPrint,
                         Language.Haskell.Exts.Parser,
+                        Language.Haskell.Exts.Parser.Text,
                         Language.Haskell.Exts.Comments
 
   Other-modules:        Language.Haskell.Exts.ExtScheme,
diff --git a/src/Language/Haskell/Exts/InternalLexer.hs b/src/Language/Haskell/Exts/InternalLexer.hs
index 6f054c8f..54c0e31b 100644
--- a/src/Language/Haskell/Exts/InternalLexer.hs
+++ b/src/Language/Haskell/Exts/InternalLexer.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE OverloadedStrings #-}
 {-# OPTIONS_HADDOCK hide #-}
 -----------------------------------------------------------------------------
 -- |
@@ -31,32 +33,34 @@ import Data.Char
 import Data.Ratio
 import Data.List (intercalate, isPrefixOf)
 import Control.Monad (when)
+import qualified Data.Text as T
+import Data.Text (Text)
 
 -- import Debug.Trace (trace)
 
 data Token
-        = VarId String
-        | LabelVarId String
-        | QVarId (String,String)
-        | IDupVarId (String)        -- duplicable implicit parameter
-        | ILinVarId (String)        -- linear implicit parameter
-        | ConId String
-        | QConId (String,String)
-        | DVarId [String]       -- to enable varid's with '-' in them
-        | VarSym String
-        | ConSym String
-        | QVarSym (String,String)
-        | QConSym (String,String)
-        | IntTok (Integer, String)
-        | FloatTok (Rational, String)
-        | Character (Char, String)
-        | StringTok (String, String)
-        | IntTokHash (Integer, String)        -- 1#
-        | WordTokHash (Integer, String)       -- 1##
-        | FloatTokHash (Rational, String)     -- 1.0#
-        | DoubleTokHash (Rational, String)    -- 1.0##
-        | CharacterHash (Char, String)        -- c#
-        | StringHash (String, String)         -- "Hello world!"#
+        = VarId Text
+        | LabelVarId Text
+        | QVarId (Text,Text)
+        | IDupVarId (Text)        -- duplicable implicit parameter
+        | ILinVarId (Text)        -- linear implicit parameter
+        | ConId Text
+        | QConId (Text,Text)
+        | DVarId [Text]       -- to enable varid's with '-' in them
+        | VarSym Text
+        | ConSym Text
+        | QVarSym (Text,Text)
+        | QConSym (Text,Text)
+        | IntTok (Integer, Text)
+        | FloatTok (Rational, Text)
+        | Character (Char, Text)
+        | StringTok (Text, Text)
+        | IntTokHash (Integer, Text)        -- 1#
+        | WordTokHash (Integer, Text)       -- 1##
+        | FloatTokHash (Rational, Text)     -- 1.0#
+        | DoubleTokHash (Rational, Text)    -- 1.0##
+        | CharacterHash (Char, Text)        -- c#
+        | StringHash (Text, Text)         -- "Hello world!"#
 
 -- Symbols
 
@@ -110,13 +114,13 @@ data Token
         | THTypQuote            -- [t|
         | THCloseQuote          -- |]
         | THTCloseQuote         -- ||]
-        | THIdEscape (String)   -- dollar x
+        | THIdEscape (Text)   -- dollar x
         | THParenEscape         -- dollar (
-        | THTIdEscape String    -- dollar dollar x
+        | THTIdEscape Text    -- dollar dollar x
         | THTParenEscape        -- double dollar (
         | THVarQuote            -- 'x (but without the x)
         | THTyQuote             -- ''T (but without the T)
-        | THQuasiQuote (String,String)  -- [$...|...]
+        | THQuasiQuote (Text,Text)  -- [$...|...]
 
 -- HaRP
         | RPGuardOpen       -- (|
@@ -131,7 +135,7 @@ data Token
         | XCloseTagOpen     -- </
         | XEmptyTagClose    -- />
         | XChildTagOpen     -- <%> (note that close doesn't exist, it's XCloseTagOpen followed by XCodeTagClose)
-        | XPCDATA String
+        | XPCDATA Text
         | XRPatOpen             -- <[
         | XRPatClose            -- ]>
 
@@ -151,7 +155,7 @@ data Token
         | CORE
         | UNPACK
         | NOUNPACK
-        | OPTIONS (Maybe String,String)
+        | OPTIONS (Maybe Text,Text)
 --        | CFILES  String
 --        | INCLUDE String
         | LANGUAGE
@@ -367,7 +371,7 @@ pragmas = [
 isIdent, isHSymbol, isPragmaChar :: Char -> Bool
 isIdent   c = isAlphaNum c || c == '\'' || c == '_'
 
-isHSymbol c = c `elem` ":!#%&*./?@\\-" || ((isSymbol c || isPunctuation c) && not (c `elem` "(),;[]`{}_\"'"))
+isHSymbol c = c `elem` (":!#%&*./?@\\-" :: String) || ((isSymbol c || isPunctuation c) && not (c `elem` ("(),;[]`{}_\"'" :: String)))
 
 isPragmaChar c = isAlphaNum c || c == '_'
 
@@ -379,7 +383,7 @@ isIdentStart c = isAlpha c && not (isUpper c) || c == '_'
 -- Why is it like this? I don't know exactly but this is how it is in
 -- GHC's parser.
 isOpSymbol :: Char -> Bool
-isOpSymbol c = c `elem` "!#$%&*+./<=>?@\\^|-~"
+isOpSymbol c = c `elem` ("!#$%&*+./<=>?@\\^|-~" :: String)
 
 -- | Checks whether the character would be legal in some position of a qvar.
 --   Means that '..' and "AAA" will pass the test.
@@ -500,7 +504,7 @@ lexLinePragma = do
     fn <- lexWhile (/= '"')
     matchChar '"' "Impossible - lexLinePragma"
     lexWhile_ isSpace
-    mapM_ (flip matchChar "Improperly formatted LINE pragma") "#-}"
+    mapM_ (flip matchChar "Improperly formatted LINE pragma") ("#-}" :: String)
     lexNewline
     return (read i, fn)
 
@@ -589,16 +593,16 @@ lexPCDATA = do
             '\n':_ -> do
                 x <- lexNewline >> lexPCDATA
                 case x of
-                 XPCDATA p -> return $ XPCDATA $ '\n':p
+                 XPCDATA p -> return $ XPCDATA $ T.cons '\n' p
                  EOF -> return EOF
                  _ -> fail $ "lexPCDATA: unexpected token: " ++ show x
             '<':_ -> return $ XPCDATA ""
-            _ -> do let pcd = takeWhile (\c -> c `notElem` "<\n") s
+            _ -> do let pcd = takeWhile (\c -> c `notElem` ("<\n" :: String)) s
                         l = length pcd
                     discard l
                     x <- lexPCDATA
                     case x of
-                     XPCDATA pcd' -> return $ XPCDATA $ pcd ++ pcd'
+                     XPCDATA pcd' -> return $ XPCDATA $ T.pack pcd <> pcd'
                      EOF -> return EOF
                      _ -> fail $ "lexPCDATA: unexpected token: " ++ show x
 
@@ -662,27 +666,27 @@ lexStdToken = do
                         discard 2
                         (n, str) <- lexOctal
                         con <- intHash
-                        return (con (n, '0':c:str))
+                        return (con (n, T.pack ('0':c:str)))
                   | toLower c == 'b' && isBinDigit d && BinaryLiterals `elem` exts -> do
                         discard 2
                         (n, str) <- lexBinary
                         con <- intHash
-                        return (con (n, '0':c:str))
+                        return (con (n, T.pack ('0':c:str)))
                   | toLower c == 'x' && isHexDigit d -> do
                         discard 2
                         (n, str) <- lexHexadecimal
                         con <- intHash
-                        return (con (n, '0':c:str))
+                        return (con (n, T.pack ('0':c:str)))
 
         -- implicit parameters
         '?':c:_ | isIdentStart c && ImplicitParams `elem` exts -> do
                         discard 1
-                        id <- lexWhile isIdent
+                        id <- lexWhileT isIdent
                         return $ IDupVarId id
 
         '%':c:_ | isIdentStart c && ImplicitParams `elem` exts -> do
                         discard 1
-                        id <- lexWhile isIdent
+                        id <- lexWhileT isIdent
                         return $ ILinVarId id
         -- end implicit parameters
 
@@ -741,14 +745,14 @@ lexStdToken = do
 
         '$':c1:c2:_ | isIdentStart c1 && TemplateHaskell `elem` exts -> do
                         discard 1
-                        id <- lexWhile isIdent
+                        id <- lexWhileT isIdent
                         return $ THIdEscape id
                     | c1 == '(' && TemplateHaskell `elem` exts -> do
                         discard 2
                         return THParenEscape
                     | c1 == '$' && isIdentStart c2 && TemplateHaskell `elem` exts -> do
                         discard 2
-                        id <- lexWhile isIdent
+                        id <- lexWhileT isIdent
                         return $ THTIdEscape id
                     | c1 == '$' && c2 == '(' && TemplateHaskell `elem` exts -> do
                         discard 3
@@ -800,7 +804,7 @@ lexStdToken = do
                    && isIdentStart c -> do
                                                   discard 1
                                                   [ident] <- lexIdents
-                                                  return $ LabelVarId ident
+                                                  return $ LabelVarId (T.pack ident)
 
 
         c:_ | isDigit c -> lexDecimalOrFloat
@@ -815,9 +819,9 @@ lexStdToken = do
                                     -- check if an extension keyword is enabled
                                     if isEnabled scheme exts
                                      then flagKW keyword >> return keyword
-                                     else return $ VarId ident
-                                 Nothing -> return $ VarId ident
-                     _ -> return $ DVarId idents
+                                     else return $ VarId (T.pack ident)
+                                 Nothing -> return $ VarId (T.pack ident)
+                     _ -> return $ DVarId (map T.pack idents)
 
             | isHSymbol c -> do
                     sym <- lexWhile isHSymbol
@@ -827,11 +831,11 @@ lexStdToken = do
                                 if isEnabled scheme exts
                                  then t
                                  else case c of
-                                        ':' -> ConSym sym
-                                        _   -> VarSym sym
+                                        ':' -> ConSym (T.pack sym)
+                                        _   -> VarSym (T.pack sym)
                               Nothing -> case c of
-                                          ':' -> ConSym sym
-                                          _   -> VarSym sym
+                                          ':' -> ConSym (T.pack sym)
+                                          _   -> VarSym (T.pack sym)
 
             | otherwise -> do
                     discard 1
@@ -880,14 +884,14 @@ lexStdToken = do
                 ident <- lexQuoter
                 matchChar '|' "Malformed quasi-quote quoter"
                 body <- lexQQBody
-                return $ THQuasiQuote (ident, body)
+                return $ THQuasiQuote (ident, T.pack body)
                   where lexQuoter
-                         | isIdentStart c = lexWhile isIdent
+                         | isIdentStart c = lexWhileT isIdent
                          | otherwise = do
                             qualThing <- lexConIdOrQual ""
                             case qualThing of
-                                QVarId (s1,s2) -> return $ s1 ++ '.':s2
-                                QVarSym (s1, s2) -> return $ s1 ++ '.':s2
+                                QVarId (s1,s2) -> return $ s1 <> T.cons '.' s2
+                                QVarSym (s1, s2) -> return $ s1 <> T.cons '.' s2
                                 _                -> fail "Malformed quasi-quote quoter"
 
             lexQQBody :: Lex a String
@@ -914,7 +918,7 @@ lexStdToken = do
                                str <- lexQQBody
                                return ('\n':str)
                   []     -> fail "Unexpected end of input while lexing quasi-quoter"
-                  _ -> do str <- lexWhile (not . (`elem` "\\|\n"))
+                  _ -> do str <- lexWhile (not . (`elem` ("\\|\n" :: String)))
                           rest <- lexQQBody
                           return (str++rest)
 
@@ -928,7 +932,7 @@ unboxed exts = UnboxedSums `elem` exts || UnboxedTuples `elem` exts
 lookupKnownPragma :: String -> Maybe Token
 lookupKnownPragma s =
     case map toLower s of
-      x | "options_" `isPrefixOf` x -> Just $ OPTIONS (Just $ drop 8 s, undefined)
+      x | "options_" `isPrefixOf` x -> Just $ OPTIONS (Just $ T.pack $ drop 8 s, undefined)
         | "options" == x            -> Just $ OPTIONS (Nothing, undefined)
         | otherwise                 -> lookup x pragmas
 
@@ -971,13 +975,13 @@ lexPragmaStart = do
               case fst opt of
                 Just opt' -> do
                   rest <- lexRawPragma
-                  return $ OPTIONS (Just opt', dropIfSpace rest)
+                  return $ OPTIONS (Just opt', T.pack (dropIfSpace rest))
                 Nothing -> do
                   s <- getInput
                   case s of
                     x:_ | isSpace x -> do
                       rest <- lexRawPragma
-                      return $ OPTIONS (Nothing, dropIfSpace rest)
+                      return $ OPTIONS (Nothing, T.pack (dropIfSpace rest))
                     _  -> fail "Malformed Options pragma"
      Just RULES -> do -- Rules enable ScopedTypeVariables locally.
             addExtensionL ScopedTypeVariables
@@ -1027,14 +1031,14 @@ lexDecimalOrFloat = do
                         'E':_ -> lexExponent
                         _     -> return (0,"")
                 con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash)
-                return $ con ((num%1) * 10^^(exponent - decimals), ds ++ '.':frac ++ estr)
+                return $ con ((num%1) * 10^^(exponent - decimals), T.pack (ds ++ '.':frac ++ estr))
         e:_ | toLower e == 'e' -> do
                 (exponent, estr) <- lexExponent
                 con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash)
-                return $ con ((parseInteger 10 ds%1) * 10^^exponent, ds ++ estr)
-        '#':'#':_ | MagicHash `elem` exts -> discard 2 >> return (WordTokHash (parseInteger 10 ds, ds))
-        '#':_     | MagicHash `elem` exts -> discard 1 >> return (IntTokHash  (parseInteger 10 ds, ds))
-        _         ->              return (IntTok      (parseInteger 10 ds, ds))
+                return $ con ((parseInteger 10 ds%1) * 10^^exponent, T.pack (ds ++ estr))
+        '#':'#':_ | MagicHash `elem` exts -> discard 2 >> return (WordTokHash (parseInteger 10 ds, T.pack ds))
+        '#':_     | MagicHash `elem` exts -> discard 1 >> return (IntTokHash  (parseInteger 10 ds, T.pack ds))
+        _         ->              return (IntTok      (parseInteger 10 ds, T.pack ds))
 
     where
     lexExponent :: Lex a (Integer, String)
@@ -1070,8 +1074,8 @@ lexHash a b c = do
 lexConIdOrQual :: String -> Lex a Token
 lexConIdOrQual qual = do
         con <- lexWhile isIdent
-        let conid | null qual = ConId con
-                  | otherwise = QConId (qual,con)
+        let conid | null qual = ConId (T.pack con)
+                  | otherwise = QConId (T.pack qual, T.pack con)
             qual' | null qual = con
                   | otherwise = qual ++ '.':con
         just_a_conid <- alternative (return conid)
@@ -1090,7 +1094,7 @@ lexConIdOrQual qual = do
                     case lookup ident' reserved_ids of
                        -- cannot qualify a reserved word
                        Just (_,scheme) | isEnabled scheme exts'  -> just_a_conid
-                       _ -> return (QVarId (qual', ident'))
+                       _ -> return (QVarId (T.pack qual', T.pack ident'))
 
              | isUpper c -> do      -- qualified conid?
                     discard 1
@@ -1104,8 +1108,8 @@ lexConIdOrQual qual = do
                         -- cannot qualify a reserved operator
                         Just (_,scheme) | isEnabled scheme exts' -> just_a_conid
                         _        -> return $ case c of
-                                              ':' -> QConSym (qual', sym)
-                                              _   -> QVarSym (qual', sym)
+                                              ':' -> QConSym (T.pack qual', T.pack sym)
+                                              _   -> QVarSym (T.pack qual', T.pack sym)
 
           '#':cs
             | null cs ||
@@ -1113,8 +1117,8 @@ lexConIdOrQual qual = do
               not (isIdent $ head cs) && MagicHash `elem` exts -> do
                 discard 1
                 case conid of
-                 ConId con' -> return $ ConId $ con' ++ "#"
-                 QConId (q,con') -> return $ QConId (q,con' ++ "#")
+                 ConId con' -> return $ ConId $ con' <> "#"
+                 QConId (q,con') -> return $ QConId (q, con' <> "#")
                  _ -> fail $ "lexConIdOrQual: unexpected token: " ++ show conid
           _ ->  return conid -- not a qualified thing
 
@@ -1130,12 +1134,12 @@ lexCharacter = do   -- We need to keep track of not only character constants but
                     matchQuote
                     con <- lexHash Character CharacterHash
                             (Left "Double hash not available for character literals")
-                    return (con (c, '\\':raw))
+                    return (con (c, T.pack ('\\':raw)))
          c:'\'':_ -> do
                     discard 2
                     con <- lexHash Character CharacterHash
                             (Left "Double hash not available for character literals")
-                    return (con (c, [c]))
+                    return (con (c, T.singleton c))
          _ | any (`elem` exts) [TemplateHaskell, DataKinds] -> return THVarQuote
          _ -> fail "Improper character constant or misplaced \'"
 
@@ -1162,10 +1166,10 @@ lexString = loop ("","")
                         loop (ce:s, reverse str ++ '\\':raw)
             '"':'#':_ | MagicHash `elem` exts -> do
                         discard 2
-                        return (StringHash (reverse s, reverse raw))
+                        return (StringHash (T.pack (reverse s), T.pack (reverse raw)))
             '"':_ -> do
                 discard 1
-                return (StringTok (reverse s, reverse raw))
+                return (StringTok (T.pack (reverse s), T.pack (reverse raw)))
             c:_ | c /= '\n' -> do
                 discard 1
                 loop (c:s, c:raw)
@@ -1318,28 +1322,28 @@ isBinDigit c =  c >= '0' && c <= '1'
 
 showToken :: Token -> String
 showToken t = case t of
-  VarId s           -> s
-  LabelVarId s      -> '#':s
-  QVarId (q,s)      -> q ++ '.':s
-  IDupVarId s       -> '?':s
-  ILinVarId s       -> '%':s
-  ConId s           -> s
-  QConId (q,s)      -> q ++ '.':s
-  DVarId ss         -> intercalate "-" ss
-  VarSym s          -> s
-  ConSym s          -> s
-  QVarSym (q,s)     -> q ++ '.':s
-  QConSym (q,s)     -> q ++ '.':s
-  IntTok (_, s)         -> s
-  FloatTok (_, s)       -> s
-  Character (_, s)      -> '\'':s ++ "'"
-  StringTok (_, s)      -> '"':s ++ "\""
-  IntTokHash (_, s)     -> s ++ "#"
-  WordTokHash (_, s)    -> s ++ "##"
-  FloatTokHash (_, s)   -> s ++ "#"
-  DoubleTokHash (_, s)  -> s ++ "##"
-  CharacterHash (_, s)  -> '\'':s ++ "'#"
-  StringHash (_, s)     -> '"':s ++ "\"#"
+  VarId s           -> T.unpack s
+  LabelVarId s      -> '#' : T.unpack s
+  QVarId (q,s)      -> T.unpack q ++ '.' : T.unpack s
+  IDupVarId s       -> '?' : T.unpack s
+  ILinVarId s       -> '%' : T.unpack s
+  ConId s           -> T.unpack s
+  QConId (q,s)      -> T.unpack q ++ '.' : T.unpack s
+  DVarId ss         -> intercalate "-" (map T.unpack ss)
+  VarSym s          -> T.unpack s
+  ConSym s          -> T.unpack s
+  QVarSym (q,s)     -> T.unpack q ++ '.' : T.unpack s
+  QConSym (q,s)     -> T.unpack q ++ '.' : T.unpack s
+  IntTok (_, s)         -> T.unpack s
+  FloatTok (_, s)       -> T.unpack s
+  Character (_, s)      -> '\'' : T.unpack s ++ "'"
+  StringTok (_, s)      -> '"' : T.unpack s ++ "\""
+  IntTokHash (_, s)     -> T.unpack s ++ "#"
+  WordTokHash (_, s)    -> T.unpack s ++ "##"
+  FloatTokHash (_, s)   -> T.unpack s ++ "#"
+  DoubleTokHash (_, s)  -> T.unpack s ++ "##"
+  CharacterHash (_, s)  -> '\'' : T.unpack s ++ "'#"
+  StringHash (_, s)     -> '"' : T.unpack s ++ "\"#"
   LeftParen         -> "("
   RightParen        -> ")"
   LeftHashParen     -> "(#"
@@ -1385,13 +1389,13 @@ showToken t = case t of
   THTypQuote        -> "[t|"
   THCloseQuote      -> "|]"
   THTCloseQuote     -> "||]"
-  THIdEscape s      -> '$':s
+  THIdEscape s      -> '$' : T.unpack s
   THParenEscape     -> "$("
-  THTIdEscape s     -> "$$" ++ s
+  THTIdEscape s     -> "$$" ++ T.unpack s
   THTParenEscape    -> "$$("
   THVarQuote        -> "'"
   THTyQuote         -> "''"
-  THQuasiQuote (n,q) -> "[$" ++ n ++ "|" ++ q ++ "]"
+  THQuasiQuote (n,q) -> "[$" ++ T.unpack n ++ "|" ++ T.unpack q ++ "]"
   RPGuardOpen       -> "(|"
   RPGuardClose      -> "|)"
   RPCAt             -> "@:"
@@ -1401,7 +1405,7 @@ showToken t = case t of
   XStdTagClose      -> ">"
   XCloseTagOpen     -> "</"
   XEmptyTagClose    -> "/>"
-  XPCDATA s         -> "PCDATA " ++ s
+  XPCDATA s         -> "PCDATA " ++ T.unpack s
   XRPatOpen         -> "<["
   XRPatClose        -> "]>"
   PragmaEnd         -> "#-}"
@@ -1418,7 +1422,7 @@ showToken t = case t of
   CORE              -> "{-# CORE"
   UNPACK            -> "{-# UNPACK"
   NOUNPACK          -> "{-# NOUNPACK"
-  OPTIONS (mt,_)    -> "{-# OPTIONS" ++ maybe "" (':':) mt ++ " ..."
+  OPTIONS (mt,_)    -> "{-# OPTIONS" ++ maybe "" ((':':) . T.unpack) mt ++ " ..."
 --  CFILES  s         -> "{-# CFILES ..."
 --  INCLUDE s         -> "{-# INCLUDE ..."
   LANGUAGE          -> "{-# LANGUAGE"
diff --git a/src/Language/Haskell/Exts/InternalParser.ly b/src/Language/Haskell/Exts/InternalParser.ly
index 523b8ef9..9c2ae71d 100644
--- a/src/Language/Haskell/Exts/InternalParser.ly
+++ b/src/Language/Haskell/Exts/InternalParser.ly
@@ -41,6 +41,7 @@
 > import Control.Monad ( liftM, (<=<), when )
 > import Control.Applicative ( (<$>) )
 > import Data.Maybe
+> import qualified Data.Text as T
 > #if MIN_VERSION_base(4,11,0)
 > import Prelude hiding ((<>))
 > #endif
@@ -367,7 +368,7 @@ TODO: Yuck, this is messy, needs fixing in the AST!
 > toppragma :: { ModulePragma L }
 >           : '{-# LANGUAGE' conids optsemis '#-}'   { LanguagePragma ($1 <^^> $4 <** ($1:snd $2 ++ reverse $3 ++ [$4])) (fst $2) }
 >           | '{-# OPTIONS' optsemis '#-}'           { let Loc l (OPTIONS (mc, s)) = $1
->                                                       in OptionsPragma (l <^^> $3 <** (l:reverse $2 ++ [$3])) (readTool mc) s }
+>                                                       in OptionsPragma (l <^^> $3 <** (l:reverse $2 ++ [$3])) (readTool (fmap T.unpack mc)) (T.unpack s) }
 >           | '{-# ANN' annotation '#-}'             { AnnModulePragma ($1 <^^> $3 <** [$1,$3]) $2 }
 
 
@@ -388,8 +389,8 @@ Module Header
 >       | {- empty -}                                           { Nothing }
 
 > maybemodwarning ::  { Maybe (WarningText L) }
->       : '{-# DEPRECATED' STRING '#-}'         { let Loc l (StringTok (s,_)) = $2 in Just $ DeprText ($1 <^^> $3 <** [$1,l,$3]) s }
->       | '{-# WARNING'    STRING '#-}'         { let Loc l (StringTok (s,_)) = $2 in Just $ WarnText ($1 <^^> $3 <** [$1,l,$3]) s }
+>       : '{-# DEPRECATED' STRING '#-}'         { let Loc l (StringTok (s,_)) = $2 in Just $ DeprText ($1 <^^> $3 <** [$1,l,$3]) (T.unpack s) }
+>       | '{-# WARNING'    STRING '#-}'         { let Loc l (StringTok (s,_)) = $2 in Just $ WarnText ($1 <^^> $3 <** [$1,l,$3]) (T.unpack s) }
 >       | {- empty -}                           { Nothing }
 
 > body :: { ([ImportDecl L],[Decl L],[S],L) }
@@ -487,7 +488,7 @@ Requires the PackageImports extension enabled.
 > maybepkg :: { (Maybe String,[S]) }
 >       : STRING                                {% do { checkEnabled PackageImports ;
 >                                                       let { Loc l (StringTok (s,_)) = $1 } ;
->                                                       return $ (Just s,[l]) } }
+>                                                       return $ (Just (T.unpack s),[l]) } }
 >       | {- empty -}                           { (Nothing,[]) }
 
 > maybeas :: { (Maybe (ModuleName L),[S],Maybe L) }
@@ -726,7 +727,7 @@ Role annotations
 
 > -- read it in as a varid for better error messages
 > role :: { (Maybe String, L) }
-> role : VARID             { let (VarId v) = unLoc $1 in (Just v, nIS $ loc $1) }
+> role : VARID             { let (VarId v) = unLoc $1 in (Just (T.unpack v), nIS $ loc $1) }
 >      | '_'               { (Nothing, nIS $1) }
 
 
@@ -888,7 +889,7 @@ so no need to check for extensions.
 >        | {- empty -}                  { Nothing }
 
 > fspec :: { (Maybe String, Name L, Type L, [S]) }
->       : STRING var_no_safety '::' truedtype               { let Loc l (StringTok (s,_)) = $1 in (Just s, $2, $4, [l,$3]) }
+>       : STRING var_no_safety '::' truedtype               { let Loc l (StringTok (s,_)) = $1 in (Just (T.unpack s), $2, $4, [l,$3]) }
 >       |        var_no_safety '::' truedtype               { (Nothing, $1, $3, [$2]) }
 
 -----------------------------------------------------------------------------
@@ -903,7 +904,7 @@ Pragmas
 > rule :: { Rule L }
 >      : STRING activation ruleforall exp0 '=' trueexp      {% do { let {Loc l (StringTok (s,_)) = $1};
 >                                                                   e <- checkRuleExpr $4;
->                                                                   return $ Rule (nIS l <++> ann $6 <** l:snd $3 ++ [$5]) s $2 (fst $3) e $6 } }
+>                                                                   return $ Rule (nIS l <++> ann $6 <** l:snd $3 ++ [$5]) (T.unpack s) $2 (fst $3) e $6 } }
 
 > activation :: { Maybe (Activation L) }
 >        : {- empty -}          { Nothing }
@@ -929,7 +930,7 @@ Pragmas
 >   | {- empty -}                       { ([],[]) }
 
 > warndepr :: { (([Name L], String),[S]) }
->       : namevars STRING               { let Loc l (StringTok (s,_)) = $2 in ((fst $1,s),snd $1 ++ [l]) }
+>       : namevars STRING               { let Loc l (StringTok (s,_)) = $2 in ((fst $1, T.unpack s),snd $1 ++ [l]) }
 
 > namevars :: { ([Name L],[S]) }
 >           : namevar                   { ([$1],[]) }
@@ -1009,10 +1010,10 @@ the (# and #) lexemes. Kinds will be handled at the kind rule.
 >       | '(' ctype_(ostar,kstar) '::' kind ')'         { TyKind  ($1 <^^> $5 <** [$1,$3,$5]) $2 $4 }
 >       | '$(' trueexp ')'              { let l = ($1 <^^> $3 <** [$1,$3]) in TySplice l $ ParenSplice l $2 }
 >       | '$$(' trueexp ')'             { let l = ($1 <^^> $3 <** [$1,$3]) in TySplice l $ TParenSplice l $2 }
->       | IDSPLICE                      { let Loc l (THIdEscape s) = $1 in TySplice (nIS l) $ IdSplice (nIS l) s }
->       | TIDSPLICE                     { let Loc l (THTIdEscape s) = $1 in TySplice (nIS l) $ TIdSplice (nIS l) s }
+>       | IDSPLICE                      { let Loc l (THIdEscape s) = $1 in TySplice (nIS l) $ IdSplice (nIS l) (T.unpack s) }
+>       | TIDSPLICE                     { let Loc l (THTIdEscape s) = $1 in TySplice (nIS l) $ TIdSplice (nIS l) (T.unpack s) }
 >       | '_'                           { TyWildCard (nIS $1) Nothing }
->       | QUASIQUOTE                    { let Loc l (THQuasiQuote (n,q)) = $1 in TyQuasiQuote (nIS l) n q }
+>       | QUASIQUOTE                    { let Loc l (THQuasiQuote (n,q)) = $1 in TyQuasiQuote (nIS l) (T.unpack n) (T.unpack q) }
 >       | ptype_(ostar,kstar)           { % checkEnabled DataKinds >> return (TyPromoted (ann $1) $1) }
 
 > ptype_(ostar,kstar) :: { Promoted L }
@@ -1022,8 +1023,8 @@ the (# and #) lexemes. Kinds will be handled at the kind rule.
 >       | VARQUOTE '[' ']'              { PromotedList  ($1 <^^> $3 <** [$1, $3]) True [] }
        | '[' ']'                       {% PromotedList  ($1 <^^> $2 <** [$1, $2]) False [] }
 >       | VARQUOTE '(' types1 ')'       {% PromotedTuple ($1 <^^> $4 <** ($1:reverse($4:snd $3))) . reverse <\$> mapM checkType (fst $3) }
->       | INT                           { let Loc l (IntTok  (i,raw)) = $1 in PromotedInteger (nIS l) i raw }
->       | STRING                        { let Loc l (StringTok (s,raw)) = $1 in PromotedString (nIS l) s raw }
+>       | INT                           { let Loc l (IntTok  (i,raw)) = $1 in PromotedInteger (nIS l) i (T.unpack raw) }
+>       | STRING                        { let Loc l (StringTok (s,raw)) = $1 in PromotedString (nIS l) (T.unpack s) (T.unpack raw) }
 
 
 > strict_mark :: { (Maybe (L -> BangType L,S), Maybe (Unpackedness L)) }
@@ -1476,8 +1477,8 @@ mdo blocks require the RecursiveDo extension enabled, but the lexer handles that
 >       | 'mdo' stmtlist                { let (sts, inf, ss) = $2 in MDo  (nIS $1 <++> inf <** $1:ss) sts }
 
 > exppragma :: { PExp L }
->       : '{-# CORE' STRING '#-}' exp   { let Loc l (StringTok (s,_)) = $2 in CorePragma (nIS $1 <++> ann $4 <** [l,$3]) s $4 }
->       | '{-# SCC'  STRING '#-}' exp   { let Loc l (StringTok (s,_)) = $2 in SCCPragma  (nIS $1 <++> ann $4 <** [l,$3]) s $4 }
+>       : '{-# CORE' STRING '#-}' exp   { let Loc l (StringTok (s,_)) = $2 in CorePragma (nIS $1 <++> ann $4 <** [l,$3]) (T.unpack s) $4 }
+>       | '{-# SCC'  STRING '#-}' exp   { let Loc l (StringTok (s,_)) = $2 in SCCPragma  (nIS $1 <++> ann $4 <** [l,$3]) (T.unpack s) $4 }
 >       | '{-# GENERATED' STRING INT ':' INT '-' INT ':' INT '#-}' exp
 >                                           { let { Loc l0 (StringTok (s,_)) = $2;
 >                                                   Loc l1 (IntTok (i1,_))   = $3;
@@ -1485,7 +1486,7 @@ mdo blocks require the RecursiveDo extension enabled, but the lexer handles that
 >                                                   Loc l3 (IntTok (i3,_))   = $7;
 >                                                   Loc l4 (IntTok (i4,_))   = $9}
 >                                              in GenPragma (nIS $1 <++> ann $11 <** [$1,l0,l1,$4,l2,$6,l3,$8,l4,$10])
->                                                       s (fromInteger i1, fromInteger i2)
+>                                                       (T.unpack s) (fromInteger i1, fromInteger i2)
 >                                                         (fromInteger i3, fromInteger i4) $11 }
 
 > fexp :: { PExp L }
@@ -1558,8 +1559,8 @@ thing we need to look at here is the erpats that use no non-standard lexemes.
 
 
 Template Haskell - all this is enabled in the lexer.
->       | IDSPLICE                      { let Loc l (THIdEscape s) = $1 in SpliceExp (nIS l) $ IdSplice (nIS l) s }
->       | TIDSPLICE                     { let Loc l (THTIdEscape s) = $1 in SpliceExp (nIS l) $ TIdSplice (nIS l) s }
+>       | IDSPLICE                      { let Loc l (THIdEscape s) = $1 in SpliceExp (nIS l) $ IdSplice (nIS l) (T.unpack s) }
+>       | TIDSPLICE                     { let Loc l (THTIdEscape s) = $1 in SpliceExp (nIS l) $ TIdSplice (nIS l) (T.unpack s) }
 >       | '$(' trueexp ')'              { let l = ($1 <^^> $3 <** [$1,$3]) in SpliceExp l $ ParenSplice l $2 }
 >       | '$$(' trueexp ')'             { let l = ($1 <^^> $3 <** [$1,$3]) in SpliceExp l $ TParenSplice l $2 }
 >       | '[|' trueexp '|]'             { let l = ($1 <^^> $3 <** [$1,$3]) in BracketExp l $ ExpBracket l $2 }
@@ -1580,7 +1581,7 @@ Template Haskell - all this is enabled in the lexer.
 >       | VARQUOTE qcon                 { VarQuote (nIS $1 <++> ann $2 <** [$1]) $2 }
 >       | TYPQUOTE tyvar                { TypQuote (nIS $1 <++> ann $2 <** [$1]) (UnQual (ann $2) $2) }
 >       | TYPQUOTE gtycon               { TypQuote (nIS $1 <++> ann $2 <** [$1]) $2 }
->       | QUASIQUOTE                    { let Loc l (THQuasiQuote (n,q)) = $1 in QuasiQuote (nIS l) n q }
+>       | QUASIQUOTE                    { let Loc l (THQuasiQuote (n,q)) = $1 in QuasiQuote (nIS l) (T.unpack n) (T.unpack q) }
 End Template Haskell
 
 > tup_exprs :: { ([S], SumOrTuple L) }
@@ -1646,7 +1647,7 @@ Hsx Extensions - requires XmlSyntax, but the lexer handles all that.
 >       | {- empty -}                   { [] }
 
 > child :: { PExp L }
->       : PCDATA                        { let Loc l (XPCDATA pcd) = $1 in XPcdata (nIS l) pcd }
+>       : PCDATA                        { let Loc l (XPCDATA pcd) = $1 in XPcdata (nIS l) (T.unpack pcd) }
 >       | '<[' sexps ']>'               { XRPats ($1 <^^> $3 <** (snd $2 ++ [$1,$3])) $ reverse (fst $2) }
 >       | xml                           { $1 }
 
@@ -1656,9 +1657,9 @@ Hsx Extensions - requires XmlSyntax, but the lexer handles all that.
 >       | xmlname                       { let Loc l str = $1 in XName (nIS l) str }
 
 > xmlname :: { Loc String }
->       : VARID                         { let Loc l (VarId  s) = $1 in Loc l s }
->       | CONID                         { let Loc l (ConId  s) = $1 in Loc l s }
->       | DVARID                        { let Loc l (DVarId s) = $1 in Loc l $ mkDVar s }
+>       : VARID                         { let Loc l (VarId  s) = $1 in Loc l (T.unpack s) }
+>       | CONID                         { let Loc l (ConId  s) = $1 in Loc l (T.unpack s) }
+>       | DVARID                        { let Loc l (DVarId s) = $1 in Loc l $ mkDVar (map T.unpack s) }
 >       | xmlkeyword                    { $1 }
 
 > xmlkeyword :: { Loc String }
@@ -1986,7 +1987,7 @@ Implicit parameter
 
 > overloaded_label :: { PExp L }
 >       : LABELVARID            { let Loc l (LabelVarId v) = $1 in OverloadedLabel
->                                                                      (nIS l) v }
+>                                                                      (nIS l) (T.unpack v) }
 
 -----------------------------------------------------------------------------
 Identifiers and Symbols
@@ -1994,11 +1995,11 @@ Identifiers and Symbols
 > qvarid :: { QName L }
 >       : varid                 { UnQual (ann $1) $1 }
 >       | QVARID                { let {Loc l (QVarId q) = $1; nis = nIS l}
->                                  in Qual nis (ModuleName nis (fst q)) (Ident nis (snd q)) }
+>                                  in Qual nis (ModuleName nis (T.unpack (fst q))) (Ident nis (T.unpack (snd q))) }
 >       | '_'                   { hole_name       (nIS $1) }
 
 > varid_no_safety :: { Name L }
->       : VARID                 { let Loc l (VarId v) = $1 in Ident (nIS l) v }
+>       : VARID                 { let Loc l (VarId v) = $1 in Ident (nIS l) (T.unpack v) }
 >       | 'as'                  { as_name         (nIS $1) }
 >       | 'qualified'           { qualified_name  (nIS $1) }
 >       | 'hiding'              { hiding_name     (nIS $1) }
@@ -2027,22 +2028,22 @@ Identifiers and Symbols
 
 Implicit parameter
 > ivarid :: { IPName L }
->       : IDUPID                { let Loc l (IDupVarId i) = $1 in IPDup (nIS l) i }
->       | ILINID                { let Loc l (ILinVarId i) = $1 in IPLin (nIS l) i }
+>       : IDUPID                { let Loc l (IDupVarId i) = $1 in IPDup (nIS l) (T.unpack i) }
+>       | ILINID                { let Loc l (ILinVarId i) = $1 in IPLin (nIS l) (T.unpack i) }
 
 > qconid :: { QName L }
 >       : conid                 { UnQual (ann $1) $1 }
->       | QCONID                { let {Loc l (QConId q) = $1; nis = nIS l} in Qual nis (ModuleName nis (fst q)) (Ident nis (snd q)) }
+>       | QCONID                { let {Loc l (QConId q) = $1; nis = nIS l} in Qual nis (ModuleName nis (T.unpack (fst q))) (Ident nis (T.unpack (snd q))) }
 
 > conid :: { Name L }
->       : CONID                 { let Loc l (ConId c) = $1 in Ident (nIS l) c }
+>       : CONID                 { let Loc l (ConId c) = $1 in Ident (nIS l) (T.unpack c) }
 
 > qconsym :: { QName L }
 >       : consym                { UnQual (ann $1) $1 }
->       | QCONSYM               { let {Loc l (QConSym q) = $1; nis = nIS l} in Qual nis (ModuleName nis (fst q)) (Symbol nis (snd q)) }
+>       | QCONSYM               { let {Loc l (QConSym q) = $1; nis = nIS l} in Qual nis (ModuleName nis (T.unpack (fst q))) (Symbol nis (T.unpack (snd q))) }
 
 > consym :: { Name L }
->       : CONSYM                { let Loc l (ConSym c) = $1 in Symbol (nIS l) c }
+>       : CONSYM                { let Loc l (ConSym c) = $1 in Symbol (nIS l) (T.unpack c) }
 
 > qvarsym :: { QName L }
 >       : qvarsym_('*')         { $1 }
@@ -2066,25 +2067,25 @@ Implicit parameter
 >       : varsymm_('*')         { $1 }
 
 > varsymm_(ostar) :: { Name L } -- varsym not including '-'
->       : VARSYM                { let Loc l (VarSym v) = $1 in Symbol (nIS l) v }
+>       : VARSYM                { let Loc l (VarSym v) = $1 in Symbol (nIS l) (T.unpack v) }
 >       | '!'                   { bang_name (nIS $1) }
 >       | '.'                   { dot_name  (nIS $1) }
 >       | ostar                 { star_name (nIS $1) }
 
 > qvarsym1 :: { QName L }
->       : QVARSYM               { let {Loc l (QVarSym q) = $1; nis = nIS l} in Qual nis (ModuleName nis (fst q)) (Symbol nis (snd q)) }
+>       : QVARSYM               { let {Loc l (QVarSym q) = $1; nis = nIS l} in Qual nis (ModuleName nis (T.unpack (fst q))) (Symbol nis (T.unpack (snd q))) }
 
 > literal :: { Literal L }
->       : INT                   { let Loc l (IntTok        (i,raw)) = $1 in Int        (nIS l) i raw }
->       | CHAR                  { let Loc l (Character     (c,raw)) = $1 in Char       (nIS l) c raw }
->       | RATIONAL              { let Loc l (FloatTok      (r,raw)) = $1 in Frac       (nIS l) r raw }
->       | STRING                { let Loc l (StringTok     (s,raw)) = $1 in String     (nIS l) s raw }
->       | PRIMINT               { let Loc l (IntTokHash    (i,raw)) = $1 in PrimInt    (nIS l) i raw }
->       | PRIMWORD              { let Loc l (WordTokHash   (w,raw)) = $1 in PrimWord   (nIS l) w raw }
->       | PRIMFLOAT             { let Loc l (FloatTokHash  (f,raw)) = $1 in PrimFloat  (nIS l) f raw }
->       | PRIMDOUBLE            { let Loc l (DoubleTokHash (d,raw)) = $1 in PrimDouble (nIS l) d raw }
->       | PRIMCHAR              { let Loc l (CharacterHash (c,raw)) = $1 in PrimChar   (nIS l) c raw }
->       | PRIMSTRING            { let Loc l (StringHash    (s,raw)) = $1 in PrimString (nIS l) s raw }
+>       : INT                   { let Loc l (IntTok        (i,raw)) = $1 in Int        (nIS l) i (T.unpack raw) }
+>       | CHAR                  { let Loc l (Character     (c,raw)) = $1 in Char       (nIS l) c (T.unpack raw) }
+>       | RATIONAL              { let Loc l (FloatTok      (r,raw)) = $1 in Frac       (nIS l) r (T.unpack raw) }
+>       | STRING                { let Loc l (StringTok     (s,raw)) = $1 in String     (nIS l) (T.unpack s) (T.unpack raw) }
+>       | PRIMINT               { let Loc l (IntTokHash    (i,raw)) = $1 in PrimInt    (nIS l) i (T.unpack raw) }
+>       | PRIMWORD              { let Loc l (WordTokHash   (w,raw)) = $1 in PrimWord   (nIS l) w (T.unpack raw) }
+>       | PRIMFLOAT             { let Loc l (FloatTokHash  (f,raw)) = $1 in PrimFloat  (nIS l) f (T.unpack raw) }
+>       | PRIMDOUBLE            { let Loc l (DoubleTokHash (d,raw)) = $1 in PrimDouble (nIS l) d (T.unpack raw) }
+>       | PRIMCHAR              { let Loc l (CharacterHash (c,raw)) = $1 in PrimChar   (nIS l) c (T.unpack raw) }
+>       | PRIMSTRING            { let Loc l (StringHash    (s,raw)) = $1 in PrimString (nIS l) (T.unpack s) (T.unpack raw) }
 
 -----------------------------------------------------------------------------
 Layout
@@ -2195,8 +2196,8 @@ Deriving strategies
 Miscellaneous (mostly renamings)
 
 > modid :: { ModuleName L }
->       : CONID                 { let Loc l (ConId  n) = $1 in ModuleName (nIS l) n }
->       | QCONID                { let Loc l (QConId n) = $1 in ModuleName (nIS l) (fst n ++ '.':snd n) }
+>       : CONID                 { let Loc l (ConId  n) = $1 in ModuleName (nIS l) (T.unpack n) }
+>       | QCONID                { let Loc l (QConId n) = $1 in ModuleName (nIS l) (T.unpack (fst n) ++ '.':T.unpack (snd n)) }
 
 > tyconorcls :: { Name L }
 >       : con                   { $1 }
@@ -2220,7 +2221,7 @@ Miscellaneous (mostly renamings)
 >          | tyvarsym_(ostar)   { UnQual (ann $1) $1 }
 
 > tyvarsym_(ostar) :: { Name L }
-> tyvarsym : VARSYM              { let Loc l (VarSym x) = $1 in Symbol (nIS l) x }
+> tyvarsym : VARSYM              { let Loc l (VarSym x) = $1 in Symbol (nIS l) (T.unpack x) }
 >          | '-'                 { Symbol (nIS $1) "-" }
 >          | ostar                   { Symbol (nIS $1) "*" }
 
diff --git a/src/Language/Haskell/Exts/ParseMonad.hs b/src/Language/Haskell/Exts/ParseMonad.hs
index bab71a45..bb03e2f4 100644
--- a/src/Language/Haskell/Exts/ParseMonad.hs
+++ b/src/Language/Haskell/Exts/ParseMonad.hs
@@ -22,11 +22,12 @@ module Language.Haskell.Exts.ParseMonad(
         P, ParseResult(..), atSrcLoc, LexContext(..),
         ParseMode(..), defaultParseMode, fromParseResult,
         runParserWithMode, runParserWithModeComments, runParser,
+        runParserWithModeCommentsText,
         getSrcLoc, pushCurrentContext, popContext,
         getExtensions, getIgnoreFunctionArity,
         -- * Lexing
-        Lex(runL), getInput, discard, getLastChar, lexNewline,
-        lexTab, lexWhile, lexWhile_,
+        Lex(runL), getInput, getInputT, discard, getLastChar, lexNewline,
+        lexTab, lexWhile, lexWhile_, lexWhileT,
         alternative, checkBOL, setBOL, startToken, getOffside,
         pushContextL, popContextL, getExtensionsL, addExtensionL,
         saveExtensionsL, restoreExtensionsL, pushComment,
@@ -44,6 +45,8 @@ import Language.Haskell.Exts.Comments
 import Language.Haskell.Exts.Extension -- (Extension, impliesExts, haskell2010)
 
 import Data.List (intercalate)
+import Data.Text (Text)
+import qualified Data.Text as T
 import Control.Applicative
 import Control.Monad (when, liftM, ap)
 import qualified Control.Monad.Fail as Fail
@@ -70,6 +73,13 @@ class Parseable ast where
   --   with the AST.
   parseWithComments :: ParseMode -> String -> ParseResult (ast, [Comment])
   parseWithComments mode = runParserWithModeComments mode . parser $ fixities mode
+  -- | 'Text'-input variant of 'parseWithMode'.  Avoids the eager
+  --   'T.pack'\/'T.unpack' cycle when the caller already has 'Text'.
+  parseTextWithMode :: ParseMode -> Text -> ParseResult ast
+  parseTextWithMode mode = fmap fst . parseTextWithComments mode
+  -- | 'Text'-input variant of 'parseWithComments'.
+  parseTextWithComments :: ParseMode -> Text -> ParseResult (ast, [Comment])
+  parseTextWithComments mode = runParserWithModeCommentsText mode . parser $ fixities mode
   -- | Internal parser, used to provide default definitions for the others.
   parser :: Maybe [Fixity] -> P ast
 
@@ -199,7 +209,7 @@ toInternalParseMode (ParseMode pf bLang exts _ilang iline _fx farity) =
 -- | Monad for parsing
 
 newtype P a = P { runP ::
-                String              -- input string
+                Text                -- input text
              -> Int                 -- current column
              -> Int                 -- current line
              -> SrcLoc              -- location of last token read
@@ -225,7 +235,13 @@ runParser :: P a -> String -> ParseResult a
 runParser = runParserWithMode defaultParseMode
 
 runParserWithModeComments :: ParseMode -> P a -> String -> ParseResult (a, [Comment])
-runParserWithModeComments mode = let mode2 = toInternalParseMode mode in \(P m) s ->
+runParserWithModeComments mode pm s = runParserWithModeCommentsText mode pm (T.pack s)
+
+-- | 'Text'-input variant of 'runParserWithModeComments'.  Avoids the
+-- intermediate 'String' allocation when the caller already has the source
+-- as 'Text'.
+runParserWithModeCommentsText :: ParseMode -> P a -> Text -> ParseResult (a, [Comment])
+runParserWithModeCommentsText mode = let mode2 = toInternalParseMode mode in \(P m) s ->
   case m s 0 1 start '\n' ([],[],[],(False,False),[]) mode2 of
     Ok (_,_,_,_,cs) a -> ParseOk (a, reverse cs)
     Failed loc msg    -> ParseFailed loc msg
@@ -370,14 +386,28 @@ instance Fail.MonadFail (Lex r) where
 
 -- Operations on this monad
 
+-- | Lazy-'String' view of the remaining input.  Kept for compatibility —
+-- pattern matching on the result forces only as many characters as examined,
+-- so short lookaheads are cheap.  The hot loops use 'getInputT' instead.
 getInput :: Lex r String
-getInput = Lex $ \cont -> P $ \r -> runP (cont r) r
+getInput = Lex $ \cont -> P $ \r -> runP (cont (T.unpack r)) r
+
+-- | 'Text' view of the remaining input.  Preferred for productions that
+-- bulk-consume a run of characters (e.g. string literals), because the
+-- backing storage is an O(1) slice of the source buffer.
+getInputT :: Lex r Text
+getInputT = Lex $ \cont -> P $ \r -> runP (cont r) r
 
 -- | Discard some input characters (these must not include tabs or newlines).
 
 discard :: Int -> Lex r ()
 discard n = Lex $ \cont -> P $ \r x y loc ch
-                        -> let (newCh:rest)= if n > 0 then drop (n-1) r else (ch:r)
+                        -> let rest   = T.drop n r
+                               newCh  = if n > 0
+                                          then case T.uncons (T.drop (n-1) r) of
+                                                 Just (c,_) -> c
+                                                 Nothing    -> ch
+                                          else ch
                            in runP (cont ()) rest (x+n) y loc newCh
 
 -- | Get the last discarded character.
@@ -391,14 +421,14 @@ getLastChar = Lex $ \cont -> P $ \r x y loc ch -> runP (cont ch) r x y loc ch
 
 lexNewline :: Lex a ()
 lexNewline = Lex $ \cont -> P $ \rs _x y loc  ->
-  case rs of
-    (_:r) -> runP (cont ()) r 1 (y+1) loc
-    []    -> \_ _ _ -> Failed loc "Lexer: expected newline."
+  case T.uncons rs of
+    Just (_,r) -> runP (cont ()) r 1 (y+1) loc
+    Nothing    -> \_ _ _ -> Failed loc "Lexer: expected newline."
 
 -- | Discard the next character, which must be a tab.
 
 lexTab :: Lex a ()
-lexTab = Lex $ \cont -> P $ \(_:r) x -> runP (cont ()) r (nextTab x)
+lexTab = Lex $ \cont -> P $ \rs x -> runP (cont ()) (T.drop 1 rs) (nextTab x)
 
 nextTab :: Int -> Int
 nextTab x = x + (tAB_LENGTH - (x-1) `mod` tAB_LENGTH)
@@ -409,10 +439,15 @@ tAB_LENGTH = 8
 -- Consume and return the largest string of characters satisfying p
 
 lexWhile :: (Char -> Bool) -> Lex a String
-lexWhile p = Lex $ \cont -> P $ \rss c l loc char ->
-  case rss of
-    [] -> runP (cont []) [] c l loc char
-    (r:rs) ->
+lexWhile p = T.unpack <$> lexWhileT p
+
+-- | 'Text'-producing variant of 'lexWhile'.  Avoids the intermediate 'String'
+-- allocation when the caller is happy with a 'Text' span.
+lexWhileT :: (Char -> Bool) -> Lex a Text
+lexWhileT p = Lex $ \cont -> P $ \rss c l loc char ->
+  case T.uncons rss of
+    Nothing -> runP (cont T.empty) rss c l loc char
+    Just (r,rs) ->
       let
         l' = case r of
               '\n' -> l + 1
@@ -421,12 +456,12 @@ lexWhile p = Lex $ \cont -> P $ \rss c l loc char ->
               '\n' -> 1
               _    -> c + 1
        in if p r
-            then runP (runL ((r:) <$> lexWhile p) cont) rs c' l' loc r
-            else runP (cont []) (r:rs) c l loc char
+            then runP (runL (T.cons r <$> lexWhileT p) cont) rs c' l' loc r
+            else runP (cont T.empty) rss c l loc char
 
 -- | lexWhile without the return value.
 lexWhile_ :: (Char -> Bool) -> Lex a ()
-lexWhile_ p = do _ <- lexWhile p
+lexWhile_ p = do _ <- lexWhileT p
                  return ()
 
 -- An alternative scan, to which we can return if subsequent scanning
diff --git a/src/Language/Haskell/Exts/Parser/Text.hs b/src/Language/Haskell/Exts/Parser/Text.hs
new file mode 100644
index 00000000..7986737d
--- /dev/null
+++ b/src/Language/Haskell/Exts/Parser/Text.hs
@@ -0,0 +1,111 @@
+-----------------------------------------------------------------------------
+-- |
+-- Module      :  Language.Haskell.Exts.Parser.Text
+-- License     :  BSD-style (see the file LICENSE.txt)
+--
+-- Stability   :  stable
+-- Portability :  portable
+--
+-- 'Text'-input variants of the parser entry points in
+-- "Language.Haskell.Exts.Parser".  These skip the eager 'T.pack' that
+-- the 'String' entry points do at the boundary.  The returned AST is
+-- the existing 'String'-valued 'Syntax' AST; 'Syntax.Text' exists as a
+-- separate module for users who wish to represent the AST in 'Text'
+-- as well.
+--
+-----------------------------------------------------------------------------
+module Language.Haskell.Exts.Parser.Text
+    ( parseTextWithMode
+    , parseTextWithComments
+    , parseModuleText
+    , parseModuleTextWithMode
+    , parseModuleTextWithComments
+    , parseExpText
+    , parseExpTextWithMode
+    , parseExpTextWithComments
+    , parseDeclText
+    , parseDeclTextWithMode
+    , parseDeclTextWithComments
+    , parseTypeText
+    , parseTypeTextWithMode
+    , parseTypeTextWithComments
+    , parsePatText
+    , parsePatTextWithMode
+    , parsePatTextWithComments
+    , parseStmtText
+    , parseStmtTextWithMode
+    , parseStmtTextWithComments
+    , parseImportDeclText
+    , parseImportDeclTextWithMode
+    , parseImportDeclTextWithComments
+    ) where
+
+import Data.Text (Text)
+import Language.Haskell.Exts.Comments
+import Language.Haskell.Exts.ParseMonad
+import Language.Haskell.Exts.Parser ()
+import Language.Haskell.Exts.SrcLoc
+import Language.Haskell.Exts.Syntax
+
+parseModuleText :: Text -> ParseResult (Module SrcSpanInfo)
+parseModuleText = parseTextWithMode defaultParseMode
+
+parseModuleTextWithMode :: ParseMode -> Text -> ParseResult (Module SrcSpanInfo)
+parseModuleTextWithMode = parseTextWithMode
+
+parseModuleTextWithComments :: ParseMode -> Text -> ParseResult (Module SrcSpanInfo, [Comment])
+parseModuleTextWithComments = parseTextWithComments
+
+parseExpText :: Text -> ParseResult (Exp SrcSpanInfo)
+parseExpText = parseTextWithMode defaultParseMode
+
+parseExpTextWithMode :: ParseMode -> Text -> ParseResult (Exp SrcSpanInfo)
+parseExpTextWithMode = parseTextWithMode
+
+parseExpTextWithComments :: ParseMode -> Text -> ParseResult (Exp SrcSpanInfo, [Comment])
+parseExpTextWithComments = parseTextWithComments
+
+parseDeclText :: Text -> ParseResult (Decl SrcSpanInfo)
+parseDeclText = parseTextWithMode defaultParseMode
+
+parseDeclTextWithMode :: ParseMode -> Text -> ParseResult (Decl SrcSpanInfo)
+parseDeclTextWithMode = parseTextWithMode
+
+parseDeclTextWithComments :: ParseMode -> Text -> ParseResult (Decl SrcSpanInfo, [Comment])
+parseDeclTextWithComments = parseTextWithComments
+
+parseTypeText :: Text -> ParseResult (Type SrcSpanInfo)
+parseTypeText = parseTextWithMode defaultParseMode
+
+parseTypeTextWithMode :: ParseMode -> Text -> ParseResult (Type SrcSpanInfo)
+parseTypeTextWithMode = parseTextWithMode
+
+parseTypeTextWithComments :: ParseMode -> Text -> ParseResult (Type SrcSpanInfo, [Comment])
+parseTypeTextWithComments = parseTextWithComments
+
+parsePatText :: Text -> ParseResult (Pat SrcSpanInfo)
+parsePatText = parseTextWithMode defaultParseMode
+
+parsePatTextWithMode :: ParseMode -> Text -> ParseResult (Pat SrcSpanInfo)
+parsePatTextWithMode = parseTextWithMode
+
+parsePatTextWithComments :: ParseMode -> Text -> ParseResult (Pat SrcSpanInfo, [Comment])
+parsePatTextWithComments = parseTextWithComments
+
+parseStmtText :: Text -> ParseResult (Stmt SrcSpanInfo)
+parseStmtText = parseTextWithMode defaultParseMode
+
+parseStmtTextWithMode :: ParseMode -> Text -> ParseResult (Stmt SrcSpanInfo)
+parseStmtTextWithMode = parseTextWithMode
+
+parseStmtTextWithComments :: ParseMode -> Text -> ParseResult (Stmt SrcSpanInfo, [Comment])
+parseStmtTextWithComments = parseTextWithComments
+
+parseImportDeclText :: Text -> ParseResult (ImportDecl SrcSpanInfo)
+parseImportDeclText = parseTextWithMode defaultParseMode
+
+parseImportDeclTextWithMode :: ParseMode -> Text -> ParseResult (ImportDecl SrcSpanInfo)
+parseImportDeclTextWithMode = parseTextWithMode
+
+parseImportDeclTextWithComments :: ParseMode -> Text -> ParseResult (ImportDecl SrcSpanInfo, [Comment])
+parseImportDeclTextWithComments = parseTextWithComments

From b5265a6664751a2a89bb15ffc42aa94d62d81319 Mon Sep 17 00:00:00 2001
From: "Michal J. Gajda" <mjgajda@gmail.com>
Date: Wed, 6 May 2026 08:48:10 +0000
Subject: [PATCH 2/8] Make lexer keyword tables and lex workers Text-native

Keyword/operator/pragma lookup tables (reserved_ops, special_varops,
reserved_ids, special_varids, pragmas) now keep their keys as Text, so
the lexer can look up directly against the Text spans produced by
lexWhileT and skip a per-token T.pack/unpack round-trip.

Numeric, escape, raw-pragma and identifier workers (lexOctal,
lexBinary, lexHexadecimal, lexDecimal, lexExponent, lexEscape,
lexRawPragma, lexIdents, lexConIdOrQual) now produce Text directly via
lexWhileT instead of the [Char] -> T.pack pattern.  parseInteger is
generalised to fold over Text.  String-cons accumulators in lexString
and lexQQBody are kept (cons over [Char] is O(1); only the final
T.pack pays for the conversion).
---
 src/Language/Haskell/Exts/InternalLexer.hs | 163 +++++++++++----------
 1 file changed, 82 insertions(+), 81 deletions(-)

diff --git a/src/Language/Haskell/Exts/InternalLexer.hs b/src/Language/Haskell/Exts/InternalLexer.hs
index 54c0e31b..dfa36fc8 100644
--- a/src/Language/Haskell/Exts/InternalLexer.hs
+++ b/src/Language/Haskell/Exts/InternalLexer.hs
@@ -31,7 +31,7 @@ import Language.Haskell.Exts.ExtScheme
 import Prelude hiding (id, exponent)
 import Data.Char
 import Data.Ratio
-import Data.List (intercalate, isPrefixOf)
+import Data.List (intercalate)
 import Control.Monad (when)
 import qualified Data.Text as T
 import Data.Text (Text)
@@ -228,7 +228,7 @@ data Token
         | EOF
         deriving (Eq,Show)
 
-reserved_ops :: [(String,(Token, Maybe ExtScheme))]
+reserved_ops :: [(Text,(Token, Maybe ExtScheme))]
 reserved_ops = [
  ( "..", (DotDot,       Nothing) ),
  ( ":",  (Colon,        Nothing) ),
@@ -264,7 +264,7 @@ reserved_ops = [
  ( "\x2200",    (KW_Forall,         Just (All [UnicodeSyntax, ExplicitForAll])) )
  ]
 
-special_varops :: [(String,(Token, Maybe ExtScheme))]
+special_varops :: [(Text,(Token, Maybe ExtScheme))]
 special_varops = [
  -- the dot is only a special symbol together with forall, but can still be used as function composition
  ( ".",  (Dot,          Just (Any [ExplicitForAll, ExistentialQuantification])) ),
@@ -272,7 +272,7 @@ special_varops = [
  ( "!",  (Exclamation,  Nothing) )
  ]
 
-reserved_ids :: [(String,(Token, Maybe ExtScheme))]
+reserved_ids :: [(Text,(Token, Maybe ExtScheme))]
 reserved_ids = [
  ( "_",         (Underscore,    Nothing) ),
  ( "by",        (KW_By,         Just (Any [TransformListComp])) ),
@@ -315,7 +315,7 @@ reserved_ids = [
  ]
 
 
-special_varids :: [(String,(Token, Maybe ExtScheme))]
+special_varids :: [(Text,(Token, Maybe ExtScheme))]
 special_varids = [
  ( "as",        (KW_As,         Nothing) ),
  ( "qualified", (KW_Qualified,  Nothing) ),
@@ -337,7 +337,7 @@ special_varids = [
  ( "capi",          (KW_CApi,          Just (Any [CApiFFI])) )
  ]
 
-pragmas :: [(String,Token)]
+pragmas :: [(Text,Token)]
 pragmas = [
  ( "rules",             RULES           ),
  ( "inline",            INLINE True     ),
@@ -481,7 +481,7 @@ lexWhiteSpace_ bol =  do _ <- lexWhiteSpace bol
                          return ()
 
 isRecognisedPragma, isLinePragma :: String -> Bool
-isRecognisedPragma str = let pragma = takeWhile isPragmaChar . dropWhile isSpace $ str
+isRecognisedPragma str = let pragma = T.pack . takeWhile isPragmaChar . dropWhile isSpace $ str
                           in case lookupKnownPragma pragma of
                               Nothing -> False
                               _       -> True
@@ -666,17 +666,17 @@ lexStdToken = do
                         discard 2
                         (n, str) <- lexOctal
                         con <- intHash
-                        return (con (n, T.pack ('0':c:str)))
+                        return (con (n, T.cons '0' (T.cons c str)))
                   | toLower c == 'b' && isBinDigit d && BinaryLiterals `elem` exts -> do
                         discard 2
                         (n, str) <- lexBinary
                         con <- intHash
-                        return (con (n, T.pack ('0':c:str)))
+                        return (con (n, T.cons '0' (T.cons c str)))
                   | toLower c == 'x' && isHexDigit d -> do
                         discard 2
                         (n, str) <- lexHexadecimal
                         con <- intHash
-                        return (con (n, T.pack ('0':c:str)))
+                        return (con (n, T.cons '0' (T.cons c str)))
 
         -- implicit parameters
         '?':c:_ | isIdentStart c && ImplicitParams `elem` exts -> do
@@ -804,12 +804,12 @@ lexStdToken = do
                    && isIdentStart c -> do
                                                   discard 1
                                                   [ident] <- lexIdents
-                                                  return $ LabelVarId (T.pack ident)
+                                                  return $ LabelVarId ident
 
 
         c:_ | isDigit c -> lexDecimalOrFloat
 
-            | isUpper c -> lexConIdOrQual ""
+            | isUpper c -> lexConIdOrQual T.empty
 
             | isIdentStart c -> do
                     idents <- lexIdents
@@ -819,23 +819,23 @@ lexStdToken = do
                                     -- check if an extension keyword is enabled
                                     if isEnabled scheme exts
                                      then flagKW keyword >> return keyword
-                                     else return $ VarId (T.pack ident)
-                                 Nothing -> return $ VarId (T.pack ident)
-                     _ -> return $ DVarId (map T.pack idents)
+                                     else return $ VarId ident
+                                 Nothing -> return $ VarId ident
+                     _ -> return $ DVarId idents
 
             | isHSymbol c -> do
-                    sym <- lexWhile isHSymbol
+                    sym <- lexWhileT isHSymbol
                     return $ case lookup sym (reserved_ops ++ special_varops) of
                               Just (t , scheme) ->
                                 -- check if an extension op is enabled
                                 if isEnabled scheme exts
                                  then t
                                  else case c of
-                                        ':' -> ConSym (T.pack sym)
-                                        _   -> VarSym (T.pack sym)
+                                        ':' -> ConSym sym
+                                        _   -> VarSym sym
                               Nothing -> case c of
-                                          ':' -> ConSym (T.pack sym)
-                                          _   -> VarSym (T.pack sym)
+                                          ':' -> ConSym sym
+                                          _   -> VarSym sym
 
             | otherwise -> do
                     discard 1
@@ -861,9 +861,9 @@ lexStdToken = do
 
                         _ ->    fail ("Illegal character \'" ++ show c ++ "\'\n")
 
-      where lexIdents :: Lex a [String]
+      where lexIdents :: Lex a [Text]
             lexIdents = do
-                ident <- lexWhile isIdent
+                ident <- lexWhileT isIdent
                 s <- getInput
                 exts <- getExtensionsL
                 case s of
@@ -874,8 +874,8 @@ lexStdToken = do
                         idents <- lexIdents
                         return $ ident : idents
                  '#':_ | MagicHash `elem` exts -> do
-                        hashes <- lexWhile (== '#')
-                        return [ident ++ hashes]
+                        hashes <- lexWhileT (== '#')
+                        return [ident <> hashes]
                  _ -> return [ident]
 
             lexQuasiQuote :: Char -> Lex a Token
@@ -929,17 +929,17 @@ unboxed exts = UnboxedSums `elem` exts || UnboxedTuples `elem` exts
 -- with our representation: the thing after the underscore is a parameter.
 -- Strip off the parameters to option pragmas by hand here, everything else
 -- sits in the pragmas map.
-lookupKnownPragma :: String -> Maybe Token
+lookupKnownPragma :: Text -> Maybe Token
 lookupKnownPragma s =
-    case map toLower s of
-      x | "options_" `isPrefixOf` x -> Just $ OPTIONS (Just $ T.pack $ drop 8 s, undefined)
-        | "options" == x            -> Just $ OPTIONS (Nothing, undefined)
-        | otherwise                 -> lookup x pragmas
+    case T.toLower s of
+      x | "options_" `T.isPrefixOf` x -> Just $ OPTIONS (Just $ T.drop 8 s, undefined)
+        | "options" == x              -> Just $ OPTIONS (Nothing, undefined)
+        | otherwise                   -> lookup x pragmas
 
 lexPragmaStart :: Lex a Token
 lexPragmaStart = do
     lexWhile_ isSpace
-    pr <- lexWhile isPragmaChar
+    pr <- lexWhileT isPragmaChar
     case lookupKnownPragma pr of
      Just (INLINE True) -> do
             s <- getInput
@@ -969,19 +969,20 @@ lexPragmaStart = do
             -- We do not want to store necessary whitespace in the datatype
             -- but if the pragma starts with a newline then we must keep
             -- it to differentiate the two cases.
-            let dropIfSpace (' ':xs) = xs
-                dropIfSpace xs       = xs
+            let dropIfSpace t = case T.uncons t of
+                                  Just (' ', xs) -> xs
+                                  _              -> t
              in
               case fst opt of
                 Just opt' -> do
                   rest <- lexRawPragma
-                  return $ OPTIONS (Just opt', T.pack (dropIfSpace rest))
+                  return $ OPTIONS (Just opt', dropIfSpace rest)
                 Nothing -> do
                   s <- getInput
                   case s of
                     x:_ | isSpace x -> do
                       rest <- lexRawPragma
-                      return $ OPTIONS (Nothing, T.pack (dropIfSpace rest))
+                      return $ OPTIONS (Nothing, dropIfSpace rest)
                     _  -> fail "Malformed Options pragma"
      Just RULES -> do -- Rules enable ScopedTypeVariables locally.
             addExtensionL ScopedTypeVariables
@@ -1000,10 +1001,10 @@ lexPragmaStart = do
                   -- discard 3 -- #-}
                   -- topLexer -- we just discard it as a comment for now and restart -}
 
-lexRawPragma :: Lex a String
+lexRawPragma :: Lex a Text
 lexRawPragma = lexRawPragmaAux
  where lexRawPragmaAux = do
-        rpr <- lexWhile (/='#')
+        rpr <- lexWhileT (/='#')
         s <- getInput
         case s of
          '#':'-':'}':_  -> return rpr
@@ -1011,37 +1012,37 @@ lexRawPragma = lexRawPragmaAux
          _ -> do
             discard 1
             rpr' <- lexRawPragma
-            return $ rpr ++ '#':rpr'
+            return $ rpr <> T.cons '#' rpr'
 
 lexDecimalOrFloat :: Lex a Token
 lexDecimalOrFloat = do
-    ds <- lexWhile isDigit
+    ds <- lexWhileT isDigit
     rest <- getInput
     exts <- getExtensionsL
     case rest of
         ('.':d:_) | isDigit d -> do
                 discard 1
-                frac <- lexWhile isDigit
-                let num = parseInteger 10 (ds ++ frac)
-                    decimals = toInteger (length frac)
+                frac <- lexWhileT isDigit
+                let num = parseInteger 10 (ds <> frac)
+                    decimals = toInteger (T.length frac)
                 (exponent, estr) <- do
                     rest2 <- getInput
                     case rest2 of
                         'e':_ -> lexExponent
                         'E':_ -> lexExponent
-                        _     -> return (0,"")
+                        _     -> return (0, T.empty)
                 con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash)
-                return $ con ((num%1) * 10^^(exponent - decimals), T.pack (ds ++ '.':frac ++ estr))
+                return $ con ((num%1) * 10^^(exponent - decimals), ds <> T.cons '.' frac <> estr)
         e:_ | toLower e == 'e' -> do
                 (exponent, estr) <- lexExponent
                 con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash)
-                return $ con ((parseInteger 10 ds%1) * 10^^exponent, T.pack (ds ++ estr))
-        '#':'#':_ | MagicHash `elem` exts -> discard 2 >> return (WordTokHash (parseInteger 10 ds, T.pack ds))
-        '#':_     | MagicHash `elem` exts -> discard 1 >> return (IntTokHash  (parseInteger 10 ds, T.pack ds))
-        _         ->              return (IntTok      (parseInteger 10 ds, T.pack ds))
+                return $ con ((parseInteger 10 ds%1) * 10^^exponent, ds <> estr)
+        '#':'#':_ | MagicHash `elem` exts -> discard 2 >> return (WordTokHash (parseInteger 10 ds, ds))
+        '#':_     | MagicHash `elem` exts -> discard 1 >> return (IntTokHash  (parseInteger 10 ds, ds))
+        _         ->              return (IntTok      (parseInteger 10 ds, ds))
 
     where
-    lexExponent :: Lex a (Integer, String)
+    lexExponent :: Lex a (Integer, Text)
     lexExponent = do
         (e:r) <- getInput
         discard 1   -- 'e' or 'E'
@@ -1049,12 +1050,12 @@ lexDecimalOrFloat = do
          '+':d:_ | isDigit d -> do
             discard 1
             (n, str) <- lexDecimal
-            return (n, e:'+':str)
+            return (n, T.cons e (T.cons '+' str))
          '-':d:_ | isDigit d -> do
             discard 1
             (n, str) <- lexDecimal
-            return (negate n, e:'-':str)
-         d:_ | isDigit d -> lexDecimal >>= \(n,str) -> return (n, e:str)
+            return (negate n, T.cons e (T.cons '-' str))
+         d:_ | isDigit d -> lexDecimal >>= \(n,str) -> return (n, T.cons e str)
          _ -> fail "Float with missing exponent"
 
 lexHash :: (b -> Token) -> (b -> Token) -> Either String (b -> Token) -> Lex a (b -> Token)
@@ -1071,13 +1072,13 @@ lexHash a b c = do
          _         ->              return a
      else return a
 
-lexConIdOrQual :: String -> Lex a Token
+lexConIdOrQual :: Text -> Lex a Token
 lexConIdOrQual qual = do
-        con <- lexWhile isIdent
-        let conid | null qual = ConId (T.pack con)
-                  | otherwise = QConId (T.pack qual, T.pack con)
-            qual' | null qual = con
-                  | otherwise = qual ++ '.':con
+        con <- lexWhileT isIdent
+        let conid | T.null qual = ConId con
+                  | otherwise   = QConId (qual, con)
+            qual' | T.null qual = con
+                  | otherwise   = qual <> T.cons '.' con
         just_a_conid <- alternative (return conid)
         rest <- getInput
         exts <- getExtensionsL
@@ -1085,16 +1086,16 @@ lexConIdOrQual qual = do
           '.':c:_
              | isIdentStart c -> do  -- qualified varid?
                     discard 1
-                    ident <- lexWhile isIdent
+                    ident <- lexWhileT isIdent
                     s <- getInput
                     exts' <- getExtensionsL
                     ident' <- case s of
-                               '#':_ | MagicHash `elem` exts' -> discard 1 >> return (ident ++ "#")
+                               '#':_ | MagicHash `elem` exts' -> discard 1 >> return (ident <> T.singleton '#')
                                _ -> return ident
                     case lookup ident' reserved_ids of
                        -- cannot qualify a reserved word
                        Just (_,scheme) | isEnabled scheme exts'  -> just_a_conid
-                       _ -> return (QVarId (T.pack qual', T.pack ident'))
+                       _ -> return (QVarId (qual', ident'))
 
              | isUpper c -> do      -- qualified conid?
                     discard 1
@@ -1102,14 +1103,14 @@ lexConIdOrQual qual = do
 
              | isHSymbol c -> do    -- qualified symbol?
                     discard 1
-                    sym <- lexWhile isHSymbol
+                    sym <- lexWhileT isHSymbol
                     exts' <- getExtensionsL
                     case lookup sym reserved_ops of
                         -- cannot qualify a reserved operator
                         Just (_,scheme) | isEnabled scheme exts' -> just_a_conid
                         _        -> return $ case c of
-                                              ':' -> QConSym (T.pack qual', T.pack sym)
-                                              _   -> QVarSym (T.pack qual', T.pack sym)
+                                              ':' -> QConSym (qual', sym)
+                                              _   -> QVarSym (qual', sym)
 
           '#':cs
             | null cs ||
@@ -1134,7 +1135,7 @@ lexCharacter = do   -- We need to keep track of not only character constants but
                     matchQuote
                     con <- lexHash Character CharacterHash
                             (Left "Double hash not available for character literals")
-                    return (con (c, T.pack ('\\':raw)))
+                    return (con (c, T.cons '\\' raw))
          c:'\'':_ -> do
                     discard 2
                     con <- lexHash Character CharacterHash
@@ -1149,7 +1150,7 @@ lexCharacter = do   -- We need to keep track of not only character constants but
 lexString :: Lex a Token
 lexString = loop ("","")
     where
-    loop (s,raw) = do
+    loop (s, raw) = do
         r <- getInput
         exts <- getExtensionsL
         case r of
@@ -1163,7 +1164,7 @@ lexString = loop ("","")
                         loop (s, '\\':reverse wcs ++ '\\':raw)
                      | otherwise -> do
                         (ce, str) <- lexEscape
-                        loop (ce:s, reverse str ++ '\\':raw)
+                        loop (ce:s, reverse (T.unpack str) ++ '\\':raw)
             '"':'#':_ | MagicHash `elem` exts -> do
                         discard 2
                         return (StringHash (T.pack (reverse s), T.pack (reverse raw)))
@@ -1193,7 +1194,7 @@ lexString = loop ("","")
                     return $ c:wcs
             _ -> return ""
 
-lexEscape :: Lex a (Char, String)
+lexEscape :: Lex a (Char, Text)
 lexEscape = do
     discard 1
     r <- getInput
@@ -1256,12 +1257,12 @@ lexEscape = do
                     discard 1
                     (n, raw) <- lexOctal
                     n' <- checkChar n
-                    return (n', 'o':raw)
+                    return (n', T.cons 'o' raw)
         'x':c:_ | isHexDigit c -> do
                     discard 1
                     (n, raw) <- lexHexadecimal
                     n' <- checkChar n
-                    return (n', 'x':raw)
+                    return (n', T.cons 'x' raw)
         c:_ | isDigit c -> do
                     (n, raw) <- lexDecimal
                     n' <- checkChar n
@@ -1275,38 +1276,38 @@ lexEscape = do
 
 -- Production cntrl from section B.2
 
-    cntrl :: Char -> Lex a (Char, String)
-    cntrl c | c >= '@' && c <= '_' = return (chr (ord c - ord '@'), '^':c:[])
+    cntrl :: Char -> Lex a (Char, Text)
+    cntrl c | c >= '@' && c <= '_' = return (chr (ord c - ord '@'), T.pack ('^':c:[]))
     cntrl _                        = fail "Illegal control character"
 
 -- assumes at least one octal digit
-lexOctal :: Lex a (Integer, String)
+lexOctal :: Lex a (Integer, Text)
 lexOctal = do
-    ds <- lexWhile isOctDigit
+    ds <- lexWhileT isOctDigit
     return (parseInteger 8 ds, ds)
 
 -- assumes at least one binary digit
-lexBinary :: Lex a (Integer, String)
+lexBinary :: Lex a (Integer, Text)
 lexBinary = do
-    ds <- lexWhile isBinDigit
+    ds <- lexWhileT isBinDigit
     return (parseInteger 2 ds, ds)
 
 -- assumes at least one hexadecimal digit
-lexHexadecimal :: Lex a (Integer, String)
+lexHexadecimal :: Lex a (Integer, Text)
 lexHexadecimal = do
-    ds <- lexWhile isHexDigit
+    ds <- lexWhileT isHexDigit
     return (parseInteger 16 ds, ds)
 
 -- assumes at least one decimal digit
-lexDecimal :: Lex a (Integer, String)
+lexDecimal :: Lex a (Integer, Text)
 lexDecimal = do
-    ds <- lexWhile isDigit
+    ds <- lexWhileT isDigit
     return (parseInteger 10 ds, ds)
 
 -- Stolen from Hugs's Prelude
-parseInteger :: Integer -> String -> Integer
+parseInteger :: Integer -> Text -> Integer
 parseInteger radix ds =
-    foldl1 (\n d -> n * radix + d) (map (toInteger . digitToInt) ds)
+    T.foldl' (\n c -> n * radix + toInteger (digitToInt c)) 0 ds
 
 flagKW :: Token -> Lex a ()
 flagKW t =

From 0e1a772a22461e300df0fd546b08e23db1cad0c5 Mon Sep 17 00:00:00 2001
From: "Michal J. Gajda" <mjgajda@gmail.com>
Date: Fri, 8 May 2026 14:46:01 +0000
Subject: [PATCH 3/8] Strict tokenizer: bang discard, T.span-based lexWhileT

Two surgical fixes to the lexer hot loops to remove allocation
patterns that the Text-native rewrite would otherwise regress on.

* discard: strict bang on the let-bound 'rest' and 'newCh' so the
  monad continuation receives forced values, rather than a lazy
  T.drop closure that holds the input alive across continuations.
* lexWhileT: replace per-character recursion with a single T.span
  followed by a strict foldl' over the matched text for line/column
  tracking.  This removes the O(n) thunk chain that recursing
  through the Lex CPS monad produces on long identifier-class runs.
---
 haskell-src-exts.cabal                     |  2 +-
 src/Language/Haskell/Exts/InternalLexer.hs |  4 +--
 src/Language/Haskell/Exts/ParseMonad.hs    | 31 +++++++++++-----------
 src/Language/Haskell/Exts/Parser/Text.hs   |  6 ++---
 4 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/haskell-src-exts.cabal b/haskell-src-exts.cabal
index 1cf7c5da..1aa63b16 100644
--- a/haskell-src-exts.cabal
+++ b/haskell-src-exts.cabal
@@ -1,5 +1,5 @@
 Name:                   haskell-src-exts
-Version:                1.23.1
+Version:                1.23.2
 License:                BSD3
 License-File:           LICENSE
 Build-Type:             Simple
diff --git a/src/Language/Haskell/Exts/InternalLexer.hs b/src/Language/Haskell/Exts/InternalLexer.hs
index dfa36fc8..74f15cbc 100644
--- a/src/Language/Haskell/Exts/InternalLexer.hs
+++ b/src/Language/Haskell/Exts/InternalLexer.hs
@@ -1306,8 +1306,8 @@ lexDecimal = do
 
 -- Stolen from Hugs's Prelude
 parseInteger :: Integer -> Text -> Integer
-parseInteger radix ds =
-    T.foldl' (\n c -> n * radix + toInteger (digitToInt c)) 0 ds
+parseInteger radix =
+    T.foldl' (\n c -> n * radix + toInteger (digitToInt c)) 0
 
 flagKW :: Token -> Lex a ()
 flagKW t =
diff --git a/src/Language/Haskell/Exts/ParseMonad.hs b/src/Language/Haskell/Exts/ParseMonad.hs
index bb03e2f4..80c3134e 100644
--- a/src/Language/Haskell/Exts/ParseMonad.hs
+++ b/src/Language/Haskell/Exts/ParseMonad.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE CPP #-}
 {-# OPTIONS_HADDOCK hide #-}
 -----------------------------------------------------------------------------
@@ -402,8 +403,8 @@ getInputT = Lex $ \cont -> P $ \r -> runP (cont r) r
 
 discard :: Int -> Lex r ()
 discard n = Lex $ \cont -> P $ \r x y loc ch
-                        -> let rest   = T.drop n r
-                               newCh  = if n > 0
+                        -> let !rest   = T.drop n r
+                               !newCh  = if n > 0
                                           then case T.uncons (T.drop (n-1) r) of
                                                  Just (c,_) -> c
                                                  Nothing    -> ch
@@ -442,22 +443,20 @@ lexWhile :: (Char -> Bool) -> Lex a String
 lexWhile p = T.unpack <$> lexWhileT p
 
 -- | 'Text'-producing variant of 'lexWhile'.  Avoids the intermediate 'String'
--- allocation when the caller is happy with a 'Text' span.
+-- allocation when the caller is happy with a 'Text' span.  Implemented as a
+-- single 'T.span' so the result shares the underlying input buffer and we
+-- pay no per-char allocation; column/line counters are advanced in one pass.
 lexWhileT :: (Char -> Bool) -> Lex a Text
 lexWhileT p = Lex $ \cont -> P $ \rss c l loc char ->
-  case T.uncons rss of
-    Nothing -> runP (cont T.empty) rss c l loc char
-    Just (r,rs) ->
-      let
-        l' = case r of
-              '\n' -> l + 1
-              _    -> l
-        c' = case r of
-              '\n' -> 1
-              _    -> c + 1
-       in if p r
-            then runP (runL (T.cons r <$> lexWhileT p) cont) rs c' l' loc r
-            else runP (cont T.empty) rss c l loc char
+  let (taken, rest) = T.span p rss
+  in if T.null taken
+       then runP (cont T.empty) rss c l loc char
+       else let !lastCh = T.last taken
+                (!l', !c') = T.foldl' advance (l, c) taken
+                advance (!ln, !cn) ch = case ch of
+                  '\n' -> (ln + 1, 1)
+                  _    -> (ln, cn + 1)
+            in runP (cont taken) rest c' l' loc lastCh
 
 -- | lexWhile without the return value.
 lexWhile_ :: (Char -> Bool) -> Lex a ()
diff --git a/src/Language/Haskell/Exts/Parser/Text.hs b/src/Language/Haskell/Exts/Parser/Text.hs
index 7986737d..dff98dc0 100644
--- a/src/Language/Haskell/Exts/Parser/Text.hs
+++ b/src/Language/Haskell/Exts/Parser/Text.hs
@@ -8,10 +8,8 @@
 --
 -- 'Text'-input variants of the parser entry points in
 -- "Language.Haskell.Exts.Parser".  These skip the eager 'T.pack' that
--- the 'String' entry points do at the boundary.  The returned AST is
--- the existing 'String'-valued 'Syntax' AST; 'Syntax.Text' exists as a
--- separate module for users who wish to represent the AST in 'Text'
--- as well.
+-- the 'String' entry points do at the boundary; the returned AST is
+-- still the existing 'String'-valued 'Syntax' AST.
 --
 -----------------------------------------------------------------------------
 module Language.Haskell.Exts.Parser.Text

From 7d273e62a1ea2dc686e913cc6b3e0b7730af266c Mon Sep 17 00:00:00 2001
From: "Michal J. Gajda" <mjgajda@gmail.com>
Date: Wed, 6 May 2026 13:50:59 +0000
Subject: [PATCH 4/8] Strict Token fields; tidy lexString/lexQQBody style

* Make every Token payload field strict (!Text, !(Text,Text)).  This
  forces the T.pack at token construction so the [Char] accumulator in
  lexString/lexQQBody is freed as soon as the token is yielded; the
  AST then holds only the materialized strict Text.
* Reformat lexString.loop, lexQQBody and lexWhiteChars to keep the
  one-action-per-line house style: first action immediately after 'do',
  vertically aligned matrix when consecutive case branches share shape.

A separate experiment with [Text] reverse-list accumulators (true
strict-Text path, no Builder/Lazy) showed the [Char]-cons design is
already optimal for lexString: per-char T.singleton allocates a
~48-byte Text record + ByteArray vs a 16-byte cons cell, making the
strict-Text version 3.4x slower and using 90% more peak residency on
the issue #478 stress case.  The lower per-token-stream residency the
[Text] approach showed on multi-literal corpora turned out to come
from the AST's 'Literal l String String' representation, not from the
lexer itself, and is left as Phase 2 work.
---
 src/Language/Haskell/Exts/InternalLexer.hs | 192 ++++++++++-----------
 1 file changed, 90 insertions(+), 102 deletions(-)

diff --git a/src/Language/Haskell/Exts/InternalLexer.hs b/src/Language/Haskell/Exts/InternalLexer.hs
index 74f15cbc..2f951390 100644
--- a/src/Language/Haskell/Exts/InternalLexer.hs
+++ b/src/Language/Haskell/Exts/InternalLexer.hs
@@ -39,28 +39,28 @@ import Data.Text (Text)
 -- import Debug.Trace (trace)
 
 data Token
-        = VarId Text
-        | LabelVarId Text
-        | QVarId (Text,Text)
-        | IDupVarId (Text)        -- duplicable implicit parameter
-        | ILinVarId (Text)        -- linear implicit parameter
-        | ConId Text
-        | QConId (Text,Text)
-        | DVarId [Text]       -- to enable varid's with '-' in them
-        | VarSym Text
-        | ConSym Text
-        | QVarSym (Text,Text)
-        | QConSym (Text,Text)
-        | IntTok (Integer, Text)
-        | FloatTok (Rational, Text)
-        | Character (Char, Text)
-        | StringTok (Text, Text)
-        | IntTokHash (Integer, Text)        -- 1#
-        | WordTokHash (Integer, Text)       -- 1##
-        | FloatTokHash (Rational, Text)     -- 1.0#
-        | DoubleTokHash (Rational, Text)    -- 1.0##
-        | CharacterHash (Char, Text)        -- c#
-        | StringHash (Text, Text)         -- "Hello world!"#
+        = VarId          !Text
+        | LabelVarId     !Text
+        | QVarId         !(Text,Text)
+        | IDupVarId      !Text        -- duplicable implicit parameter
+        | ILinVarId      !Text        -- linear implicit parameter
+        | ConId          !Text
+        | QConId         !(Text,Text)
+        | DVarId         ![Text]      -- to enable varid's with '-' in them
+        | VarSym         !Text
+        | ConSym         !Text
+        | QVarSym        !(Text,Text)
+        | QConSym        !(Text,Text)
+        | IntTok         !(Integer, Text)
+        | FloatTok       !(Rational, Text)
+        | Character      !(Char, Text)
+        | StringTok      !(Text, Text)
+        | IntTokHash     !(Integer, Text)        -- 1#
+        | WordTokHash    !(Integer, Text)        -- 1##
+        | FloatTokHash   !(Rational, Text)       -- 1.0#
+        | DoubleTokHash  !(Rational, Text)       -- 1.0##
+        | CharacterHash  !(Char, Text)           -- c#
+        | StringHash     !(Text, Text)           -- "Hello world!"#
 
 -- Symbols
 
@@ -879,48 +879,42 @@ lexStdToken = do
                  _ -> return [ident]
 
             lexQuasiQuote :: Char -> Lex a Token
-            lexQuasiQuote c = do
-                -- We've seen and dropped [$ already
-                ident <- lexQuoter
-                matchChar '|' "Malformed quasi-quote quoter"
-                body <- lexQQBody
-                return $ THQuasiQuote (ident, T.pack body)
+            lexQuasiQuote c = do ident <- lexQuoter   -- We've seen and dropped [$
+                                 matchChar '|' "Malformed quasi-quote quoter"
+                                 body <- lexQQBody
+                                 return $ THQuasiQuote (ident, T.pack body)
                   where lexQuoter
                          | isIdentStart c = lexWhileT isIdent
-                         | otherwise = do
-                            qualThing <- lexConIdOrQual ""
-                            case qualThing of
-                                QVarId (s1,s2) -> return $ s1 <> T.cons '.' s2
-                                QVarSym (s1, s2) -> return $ s1 <> T.cons '.' s2
-                                _                -> fail "Malformed quasi-quote quoter"
-
+                         | otherwise = do qualThing <- lexConIdOrQual ""
+                                          case qualThing of
+                                            QVarId  (s1,s2) -> return $ s1 <> T.cons '.' s2
+                                            QVarSym (s1,s2) -> return $ s1 <> T.cons '.' s2
+                                            _               -> fail "Malformed quasi-quote quoter"
+
+            -- [Char]-cons accumulator (same rationale as lexString): per-char
+            -- cons is cheaper than per-char T.singleton, and the body dies
+            -- when the THQuasiQuote token is yielded.
             lexQQBody :: Lex a String
-            lexQQBody = do
-                s <- getInput
-                case s of
-                  '\\':']':_ -> do discard 2
-                                   str <- lexQQBody
-                                   return (']':str)
-                  '\\':'|':_ -> do discard 2
-                                   str <- lexQQBody
-                                   return ('|':str)
-                  '|':']':_  -> discard 2 >> return ""
-                  '|':_ -> do discard 1
-                              str <- lexQQBody
-                              return ('|':str)
-                  ']':_ -> do discard 1
-                              str <- lexQQBody
-                              return (']':str)
-                  '\\':_ -> do discard 1
-                               str <- lexQQBody
-                               return ('\\':str)
-                  '\n':_ -> do lexNewline
-                               str <- lexQQBody
-                               return ('\n':str)
-                  []     -> fail "Unexpected end of input while lexing quasi-quoter"
-                  _ -> do str <- lexWhile (not . (`elem` ("\\|\n" :: String)))
-                          rest <- lexQQBody
-                          return (str++rest)
+            lexQQBody = do s <- getInput
+                           case s of
+                             '\\':']':_ -> do discard   2
+                                              fmap (']' :) lexQQBody
+                             '\\':'|':_ -> do discard   2
+                                              fmap ('|' :) lexQQBody
+                             '|':']':_  -> do discard   2
+                                              return ""
+                             '|':_      -> do discard   1
+                                              fmap ('|' :) lexQQBody
+                             ']':_      -> do discard   1
+                                              fmap (']' :) lexQQBody
+                             '\\':_     -> do discard   1
+                                              fmap ('\\':) lexQQBody
+                             '\n':_     -> do lexNewline
+                                              fmap ('\n':) lexQQBody
+                             []         -> fail "Unexpected end of input while lexing quasi-quoter"
+                             _          -> do str  <- lexWhile (not . (`elem` ("\\|\n" :: String)))
+                                              rest <- lexQQBody
+                                              return (str ++ rest)
 
 unboxed :: [KnownExtension] -> Bool
 unboxed exts = UnboxedSums `elem` exts || UnboxedTuples `elem` exts
@@ -1147,52 +1141,46 @@ lexCharacter = do   -- We need to keep track of not only character constants but
     where matchQuote = matchChar '\'' "Improperly terminated character constant"
 
 
+-- The accumulator is a reversed [Char] list rather than [Text]/Builder:
+-- a [Char] cons cell is ~16 B vs ~48 B + ByteArray for T.singleton, and
+-- the literal's cons list dies as soon as the token is yielded (the
+-- single 'T.pack . reverse' at the end materializes the strict 'Text'
+-- payload), so per-literal residency is already bounded.
 lexString :: Lex a Token
 lexString = loop ("","")
     where
-    loop (s, raw) = do
-        r <- getInput
-        exts <- getExtensionsL
-        case r of
-            '\\':'&':_ -> do
-                    discard 2
-                    loop (s, '&':'\\':raw)
-            '\\':c:_ | isSpace c -> do
-                        discard 1
-                        wcs <- lexWhiteChars
-                        matchChar '\\' "Illegal character in string gap"
-                        loop (s, '\\':reverse wcs ++ '\\':raw)
-                     | otherwise -> do
-                        (ce, str) <- lexEscape
-                        loop (ce:s, reverse (T.unpack str) ++ '\\':raw)
-            '"':'#':_ | MagicHash `elem` exts -> do
-                        discard 2
-                        return (StringHash (T.pack (reverse s), T.pack (reverse raw)))
-            '"':_ -> do
-                discard 1
-                return (StringTok (T.pack (reverse s), T.pack (reverse raw)))
-            c:_ | c /= '\n' -> do
-                discard 1
-                loop (c:s, c:raw)
-            _ ->   fail "Improperly terminated string"
+    loop (s, raw) = do r    <- getInput
+                       exts <- getExtensionsL
+                       case r of
+                         '\\':'&':_ -> do discard 2
+                                          loop (s, '&':'\\':raw)
+                         '\\':c:_
+                           | isSpace c -> do discard 1
+                                             wcs <- lexWhiteChars
+                                             matchChar '\\' "Illegal character in string gap"
+                                             loop (s, '\\':reverse wcs ++ '\\':raw)
+                           | otherwise -> do (ce, str) <- lexEscape
+                                             loop (ce:s, reverse (T.unpack str) ++ '\\':raw)
+                         '"':'#':_
+                           | MagicHash `elem` exts -> do discard 2
+                                                         return (StringHash (T.pack (reverse s), T.pack (reverse raw)))
+                         '"':_ -> do discard 1
+                                     return (StringTok (T.pack (reverse s), T.pack (reverse raw)))
+                         c:_
+                           | c /= '\n' -> do discard 1
+                                             loop (c:s, c:raw)
+                         _ -> fail "Improperly terminated string"
 
     lexWhiteChars :: Lex a String
-    lexWhiteChars = do
-        s <- getInput
-        case s of
-            '\n':_ -> do
-                    lexNewline
-                    wcs <- lexWhiteChars
-                    return $ '\n':wcs
-            '\t':_ -> do
-                    lexTab
-                    wcs <- lexWhiteChars
-                    return $ '\t':wcs
-            c:_ | isSpace c -> do
-                    discard 1
-                    wcs <- lexWhiteChars
-                    return $ c:wcs
-            _ -> return ""
+    lexWhiteChars = do s <- getInput
+                       case s of
+                         '\n':_          -> do lexNewline
+                                               fmap ('\n':) lexWhiteChars
+                         '\t':_          -> do lexTab
+                                               fmap ('\t':) lexWhiteChars
+                         c:_ | isSpace c -> do discard 1
+                                               fmap (c:) lexWhiteChars
+                         _               -> return ""
 
 lexEscape :: Lex a (Char, Text)
 lexEscape = do

From 674ff8cc476de9e2f047c9ba9a4ef59e7eae3a59 Mon Sep 17 00:00:00 2001
From: "Michal J. Gajda" <mjgajda@gmail.com>
Date: Fri, 8 May 2026 14:46:54 +0000
Subject: [PATCH 5/8] Expose Text-input lexer entry points

Adds lexTokenStreamText and lexTokenStreamTextWithMode in
Language.Haskell.Exts.Lexer, the lexer-only counterparts of the
Text-input parser entry points in Language.Haskell.Exts.Parser.Text.

These let consumers that only need a token stream (linters, syntax
highlighters, exact-print tooling) skip the eager Data.Text.pack at
the String boundary that the existing String entry point performs.

Also factors lexIt out as a top-level helper shared by both the
String and Text variants.
---
 src/Language/Haskell/Exts/Lexer.hs | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/Language/Haskell/Exts/Lexer.hs b/src/Language/Haskell/Exts/Lexer.hs
index ac41669d..86d9fc7a 100644
--- a/src/Language/Haskell/Exts/Lexer.hs
+++ b/src/Language/Haskell/Exts/Lexer.hs
@@ -16,6 +16,7 @@
 module Language.Haskell.Exts.Lexer
     (
       lexTokenStream, lexTokenStreamWithMode,
+      lexTokenStreamText, lexTokenStreamTextWithMode,
 
       Token(..), Loc(..),
 
@@ -26,6 +27,7 @@ module Language.Haskell.Exts.Lexer
 import Language.Haskell.Exts.InternalLexer
 import Language.Haskell.Exts.ParseMonad
 import Language.Haskell.Exts.SrcLoc
+import Data.Text (Text)
 
 -- | Lex a string into a list of Haskell 2010 source tokens.
 lexTokenStream :: String -> ParseResult [Loc Token]
@@ -34,12 +36,21 @@ lexTokenStream = lexTokenStreamWithMode defaultParseMode
 -- | Lex a string into a list of Haskell source tokens, using an explicit mode.
 lexTokenStreamWithMode :: ParseMode -> String -> ParseResult [Loc Token]
 lexTokenStreamWithMode mode = runParserWithMode mode lexIt
-  where lexIt :: P [Loc Token]
-        lexIt = runL go return
-        go :: Lex [Loc Token] [Loc Token]
-        go = do
-          ltok <- topLexer
-          case ltok of
-            Loc _ EOF -> return []
-            _ -> do ts <- go
-                    return (ltok:ts)
+
+-- | 'Text'-input variant of 'lexTokenStream'.  Skips the eager 'T.pack'
+-- that the 'String' entry point performs at the boundary.
+lexTokenStreamText :: Text -> ParseResult [Loc Token]
+lexTokenStreamText = lexTokenStreamTextWithMode defaultParseMode
+
+-- | 'Text'-input variant of 'lexTokenStreamWithMode'.
+lexTokenStreamTextWithMode :: ParseMode -> Text -> ParseResult [Loc Token]
+lexTokenStreamTextWithMode mode = fmap fst . runParserWithModeCommentsText mode lexIt
+
+lexIt :: P [Loc Token]
+lexIt = runL go return
+  where go :: Lex [Loc Token] [Loc Token]
+        go = do ltok <- topLexer
+                case ltok of
+                  Loc _ EOF -> return []
+                  _         -> do ts <- go
+                                  return (ltok:ts)

From b56c0470c97b540e975079e85726edbddb433a58 Mon Sep 17 00:00:00 2001
From: "Michal J. Gajda" <mjgajda@gmail.com>
Date: Fri, 8 May 2026 14:00:34 +0000
Subject: [PATCH 6/8] lexString: fast-path slice from input Text when literal
 has no escapes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most string literals contain neither backslash escapes nor literal
newlines.  Previously every such literal was lexed character-by-
character into a [Char] cons accumulator and then T.pack'd at the
end, allocating one cons cell per character plus a fresh ByteArray.

This commit walks the input Text with T.span looking for the run of
"plain" characters (not '"', '\\', or '\n').  If the run is followed
by a closing '"', we emit StringTok with the slice itself -- no
allocation per character, no T.pack copy, just a Text record (32 B)
pointing at the existing input ByteArray.

When an escape, gap or premature EOF is encountered, we fall back to
the original cons-list 'loop' starting from the prefix already
consumed.

Measured (15-trial 2σ-gated, vs Text PR tip without this fast path):
  big.hs   (1 x 3 MB literal):   alloc -99%, resid -99%, time -98%
  many.hs  (200 x 50 kB liters): alloc -99%, resid -98%, time -97%
  repeat.hs (id-heavy, no liters): unchanged (fast path doesn't apply)

vs master (String API):
  big.hs:   234 MB -> 4 MB residency
  many.hs:  628 MB -> 17 MB residency

The slice keeps the input ByteArray alive while any token derived
from it remains live; this is acceptable because the parser holds
the whole input until parse completion anyway, and O(1) sharing
beats the O(n) cons-and-pack cost we previously paid per literal.
---
 src/Language/Haskell/Exts/InternalLexer.hs | 41 ++++++++++++++++++----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/Language/Haskell/Exts/InternalLexer.hs b/src/Language/Haskell/Exts/InternalLexer.hs
index 2f951390..535b214f 100644
--- a/src/Language/Haskell/Exts/InternalLexer.hs
+++ b/src/Language/Haskell/Exts/InternalLexer.hs
@@ -1141,14 +1141,41 @@ lexCharacter = do   -- We need to keep track of not only character constants but
     where matchQuote = matchChar '\'' "Improperly terminated character constant"
 
 
--- The accumulator is a reversed [Char] list rather than [Text]/Builder:
--- a [Char] cons cell is ~16 B vs ~48 B + ByteArray for T.singleton, and
--- the literal's cons list dies as soon as the token is yielded (the
--- single 'T.pack . reverse' at the end materializes the strict 'Text'
--- payload), so per-literal residency is already bounded.
+-- Fast path: scan the input 'Text' with 'T.span' for the longest run
+-- of characters that are neither escape introducers nor terminators
+-- nor newlines, and slice it directly.  Most string literals have no
+-- escapes, so this avoids building any intermediate '[Char]' or
+-- 'Text' for the body and emits a 'Text' that shares the input
+-- 'ByteArray'.  When an escape or string-gap is encountered the
+-- function falls back to the cons-list 'loop' implementation.
+--
+-- The cons-list fallback uses a reversed [Char] accumulator: a [Char]
+-- cons cell is ~16 B vs ~48 B + ByteArray for T.singleton, and the
+-- literal's cons list dies as soon as the token is yielded.
 lexString :: Lex a Token
-lexString = loop ("","")
-    where
+lexString = do
+    inp <- getInputT
+    let (taken, rest) = T.span isPlain inp
+    case T.uncons rest of
+        Just ('"', after) -> do
+            -- Fast path: simple literal closed with '"'.
+            discard (T.length taken + 1)
+            exts <- getExtensionsL
+            case T.uncons after of
+                Just ('#', _) | MagicHash `elem` exts -> do
+                    discard 1
+                    return (StringHash (taken, taken))
+                _ -> return (StringTok (taken, taken))
+        _ ->
+            -- Slow path: escape, newline, or EOF in the body.  Hand
+            -- off the prefix we have so far (reversed for the loop's
+            -- accumulator convention) and continue char-by-char.
+            let !rs = reverse (T.unpack taken)
+            in do discard (T.length taken)
+                  loop (rs, rs)
+  where
+    isPlain c = c /= '"' && c /= '\\' && c /= '\n'
+
     loop (s, raw) = do r    <- getInput
                        exts <- getExtensionsL
                        case r of

From 414661048a1aea16c5ffd766a2c1ba9fc09938a4 Mon Sep 17 00:00:00 2001
From: "Michal J. Gajda" <mjgajda@gmail.com>
Date: Fri, 8 May 2026 14:14:57 +0000
Subject: [PATCH 7/8] lexString: scan-and-splice replaces cons-list fallback

The original fast-path bailed to a [Char]-cons loop on the first
escape or string-gap.  This commit unifies fast and slow paths into
a single scan-and-splice: at each iteration, slice the next plain
run via T.span, then handle the terminator or escape, and continue.

Per-token allocation is now O(escapes), not O(chars):
- A literal with no escapes allocates one Text record (slice).
- A literal with K escapes allocates O(K) chunks + K T.singleton
  parsed-value records; intermediate runs are slices of the input.
- The reverse-ordered chunk list collapses to one strict Text via
  T.concat at token emission.

Wall-clock improvement vs the simpler fast-path (already 99% faster
than master) on test corpora:
  big.hs   -13.33% (0.015s -> 0.013s)
  many.hs  -12.96% (0.054s -> 0.047s)
  repeat.hs no-effect
Allocation and residency unchanged on these escape-free corpora; the
new path matters most on literals with sparse escapes (no test
corpus available).
---
 src/Language/Haskell/Exts/InternalLexer.hs | 123 ++++++++++-----------
 1 file changed, 61 insertions(+), 62 deletions(-)

diff --git a/src/Language/Haskell/Exts/InternalLexer.hs b/src/Language/Haskell/Exts/InternalLexer.hs
index 535b214f..6a68d6b4 100644
--- a/src/Language/Haskell/Exts/InternalLexer.hs
+++ b/src/Language/Haskell/Exts/InternalLexer.hs
@@ -1141,73 +1141,72 @@ lexCharacter = do   -- We need to keep track of not only character constants but
     where matchQuote = matchChar '\'' "Improperly terminated character constant"
 
 
--- Fast path: scan the input 'Text' with 'T.span' for the longest run
--- of characters that are neither escape introducers nor terminators
--- nor newlines, and slice it directly.  Most string literals have no
--- escapes, so this avoids building any intermediate '[Char]' or
--- 'Text' for the body and emits a 'Text' that shares the input
--- 'ByteArray'.  When an escape or string-gap is encountered the
--- function falls back to the cons-list 'loop' implementation.
+-- Scan-and-splice: walk the input 'Text' looking for the longest run
+-- of "plain" characters (not '"', '\\', '\n'), slice it as a single
+-- 'Text' chunk that shares the input 'ByteArray', then handle the
+-- terminator or escape.  Per-token allocation is O(escapes), not
+-- O(chars) -- a literal with no escapes allocates a single 'Text'
+-- record; one with K escapes allocates O(K) chunks plus K
+-- 'T.singleton' values for the parsed escape characters.
 --
--- The cons-list fallback uses a reversed [Char] accumulator: a [Char]
--- cons cell is ~16 B vs ~48 B + ByteArray for T.singleton, and the
--- literal's cons list dies as soon as the token is yielded.
+-- The accumulators are reverse-ordered chunk lists (newest chunk
+-- first); we 'T.concat . reverse' once at the end to materialize the
+-- final strict 'Text' payload.
 lexString :: Lex a Token
-lexString = do
-    inp <- getInputT
-    let (taken, rest) = T.span isPlain inp
-    case T.uncons rest of
-        Just ('"', after) -> do
-            -- Fast path: simple literal closed with '"'.
-            discard (T.length taken + 1)
-            exts <- getExtensionsL
-            case T.uncons after of
-                Just ('#', _) | MagicHash `elem` exts -> do
-                    discard 1
-                    return (StringHash (taken, taken))
-                _ -> return (StringTok (taken, taken))
-        _ ->
-            -- Slow path: escape, newline, or EOF in the body.  Hand
-            -- off the prefix we have so far (reversed for the loop's
-            -- accumulator convention) and continue char-by-char.
-            let !rs = reverse (T.unpack taken)
-            in do discard (T.length taken)
-                  loop (rs, rs)
+lexString = loop [] []
   where
     isPlain c = c /= '"' && c /= '\\' && c /= '\n'
 
-    loop (s, raw) = do r    <- getInput
-                       exts <- getExtensionsL
-                       case r of
-                         '\\':'&':_ -> do discard 2
-                                          loop (s, '&':'\\':raw)
-                         '\\':c:_
-                           | isSpace c -> do discard 1
-                                             wcs <- lexWhiteChars
-                                             matchChar '\\' "Illegal character in string gap"
-                                             loop (s, '\\':reverse wcs ++ '\\':raw)
-                           | otherwise -> do (ce, str) <- lexEscape
-                                             loop (ce:s, reverse (T.unpack str) ++ '\\':raw)
-                         '"':'#':_
-                           | MagicHash `elem` exts -> do discard 2
-                                                         return (StringHash (T.pack (reverse s), T.pack (reverse raw)))
-                         '"':_ -> do discard 1
-                                     return (StringTok (T.pack (reverse s), T.pack (reverse raw)))
-                         c:_
-                           | c /= '\n' -> do discard 1
-                                             loop (c:s, c:raw)
-                         _ -> fail "Improperly terminated string"
-
-    lexWhiteChars :: Lex a String
-    lexWhiteChars = do s <- getInput
-                       case s of
-                         '\n':_          -> do lexNewline
-                                               fmap ('\n':) lexWhiteChars
-                         '\t':_          -> do lexTab
-                                               fmap ('\t':) lexWhiteChars
-                         c:_ | isSpace c -> do discard 1
-                                               fmap (c:) lexWhiteChars
-                         _               -> return ""
+    -- Fast-merge for the very common case of a literal with no escapes.
+    -- 's' and 'raw' would both be a single chunk, so we'd otherwise
+    -- 'T.concat [taken]' which is identity but still allocates.  Skip.
+    finish [taken] = taken
+    finish chunks  = T.concat (reverse chunks)
+
+    loop sChunks rawChunks = do
+      inp <- getInputT
+      let (taken, rest) = T.span isPlain inp
+      let !sChunks'   = if T.null taken then sChunks   else taken : sChunks
+      let !rawChunks' = if T.null taken then rawChunks else taken : rawChunks
+      discard (T.length taken)
+      exts <- getExtensionsL
+      case T.uncons rest of
+        Just ('"', after) -> do
+          discard 1
+          case T.uncons after of
+            Just ('#', _) | MagicHash `elem` exts -> do
+              discard 1
+              return (StringHash (finish sChunks', finish rawChunks'))
+            _ -> return (StringTok (finish sChunks', finish rawChunks'))
+        Just ('\\', _) -> do
+          inp2 <- getInputT
+          case T.unpack (T.take 2 inp2) of
+            "\\&" -> do
+              discard 2
+              loop sChunks' (T.pack "\\&" : rawChunks')
+            '\\':c:[] | isSpace c -> do
+              discard 1
+              wcs <- lexWhiteChars
+              matchChar '\\' "Illegal character in string gap"
+              -- Raw rep: '\' + wcs + '\'
+              loop sChunks' (T.singleton '\\' : wcs : T.singleton '\\' : rawChunks')
+            _ -> do
+              (ce, str) <- lexEscape
+              loop (T.singleton ce : sChunks') (str : T.singleton '\\' : rawChunks')
+        _ -> fail "Improperly terminated string"
+
+    lexWhiteChars :: Lex a Text
+    lexWhiteChars = T.concat . reverse <$> go []
+      where
+      go acc = do s <- getInput
+                  case s of
+                    '\n':_          -> do lexNewline
+                                          go (T.singleton '\n' : acc)
+                    '\t':_          -> do lexTab
+                                          go (T.singleton '\t' : acc)
+                    c:_ | isSpace c -> do discard 1
+                                          go (T.singleton c : acc)
+                    _               -> return acc
 
 lexEscape :: Lex a (Char, Text)
 lexEscape = do

From 635133f214bed97e71033918cd6fb607ad7d2512 Mon Sep 17 00:00:00 2001
From: "Michal J. Gajda" <mjgajda@gmail.com>
Date: Fri, 8 May 2026 14:49:08 +0000
Subject: [PATCH 8/8] CHANGELOG: 1.23.2 entry with measured Phase 1 numbers

---
 CHANGELOG | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 8e4f7f5a..cfc97bba 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,58 @@
+1.23.1 --> 1.23.2
+=================
+Performance: lexer/parser internals are now Text-native (Phase 1).
+The Syntax AST stays String-valued, so this is a non-breaking change
+for AST consumers.  Phase 2 (Text-valued AST + lazy String compat
+shim) is a separate later change.
+
+What changed:
+* Lexer input tape, all Token payloads, the keyword/operator/pragma
+  lookup tables, and the numeric/identifier/escape/raw-pragma lex
+  workers operate on Data.Text directly.
+* Strict 'discard' and a single-pass 'lexWhileT' (using T.span +
+  foldl' for line/column) replace the previous lazy formulations,
+  removing a per-character thunk chain in the tokenizer hot loop.
+* Token payload fields are strict (!Text), so the [Char]-cons
+  accumulator inside lexString is materialized to a strict Text and
+  freed at token yield rather than being kept alive by the token
+  stream.
+* lexString uses a scan-and-splice strategy: each plain run is
+  sliced from the input Text via T.span, and only the parsed
+  escape characters allocate new Text records.  Per-token
+  allocation is O(escapes), not O(chars).
+
+What's new:
+* Language.Haskell.Exts.Parser.Text exposing Text-input parser
+  entry points (parseModuleText, parseExpText, ...) that skip the
+  eager Data.Text.pack at the String boundary.
+* lexTokenStreamText / lexTokenStreamTextWithMode in
+  Language.Haskell.Exts.Lexer, the lexer-only counterparts.
+
+Measurements (15-trial bench-mutex 2σ-gated, GHC 9.10, -O), Text
+API vs master parseModule:
+
+  Issue #478 stress (1 x 3 MB string literal):
+    master:     234 MB residency, 1156 MB allocated, 1.0 s
+    Text API:    52 KB residency,    7 MB allocated, 13 ms
+    delta:      -99.98% / -99.40% / -98.71%
+
+  Multi-literal stress (200 x 50 kB literals, 9.6 MB):
+    master:     628 MB residency, 3858 MB allocated, 2.9 s
+    Text API:     6 MB residency,   30 MB allocated, 47 ms
+    delta:      -99.05% / -99.22% / -98.39%
+
+  Identifier-heavy stress (5.1 MB, 1M ids / 100 unique):
+    no measurable change (the AST scaffolding dominates).
+    Atom table at lex time would address this; not in this PR.
+
+  Real haskell-src-exts library files via the Text API:
+    InternalLexer.hs (58 kB):  resid -74%, alloc -27%, time -51%
+    ParseSyntax.hs   (17 kB):  resid -54%, alloc +13% (small file
+                                T.unpack overhead at AST construction;
+                                eliminated by Phase 2)
+    Build.hs, Comments.hs, SrcLoc.hs (small): resid neutral to -38%,
+    alloc +11-13% (same small-file T.unpack overhead).
+
 1.23.0 --> 1.23.1
 =================
 * show instance for SrcLoc and SrcSpan renders "(-1)" instead of "-1"