diff --git a/elm-format-lib/elm-format-lib.cabal b/elm-format-lib/elm-format-lib.cabal index 50586441a..c1a7568e1 100644 --- a/elm-format-lib/elm-format-lib.cabal +++ b/elm-format-lib/elm-format-lib.cabal @@ -59,20 +59,12 @@ common common-options Parse.Literal Parse.Module Parse.Parse + Parse.ParsecAdapter Parse.Pattern Parse.Primitives Parse.State Parse.Type Parse.Whitespace - Text.Parsec - Text.Parsec.Char - Text.Parsec.Combinator - Text.Parsec.Error - Text.Parsec.Indent - Text.Parsec.Pos - Text.Parsec.Prim - Text.ParserCombinators.Parsec.Char - Text.ParserCombinators.Parsec.Combinator build-depends: aeson >= 1.5.5.1 && < 1.6, diff --git a/elm-format-lib/src/Parse/Binop.hs b/elm-format-lib/src/Parse/Binop.hs index 3141d2a10..4a1a6edb2 100644 --- a/elm-format-lib/src/Parse/Binop.hs +++ b/elm-format-lib/src/Parse/Binop.hs @@ -1,7 +1,7 @@ {-# LANGUAGE DataKinds #-} module Parse.Binop (binops) where -import Text.Parsec ((<|>), choice, try) +import Parse.ParsecAdapter ((<|>), choice, try) import AST.V0_16 import AST.Structure (FixAST) diff --git a/elm-format-lib/src/Parse/Common.hs b/elm-format-lib/src/Parse/Common.hs index 4284d91a5..4f05792bb 100644 --- a/elm-format-lib/src/Parse/Common.hs +++ b/elm-format-lib/src/Parse/Common.hs @@ -5,7 +5,7 @@ module Parse.Common ) where import AST.V0_16 -import Text.Parsec +import Parse.ParsecAdapter import Parse.Helpers import Parse.Whitespace import Parse.IParser diff --git a/elm-format-lib/src/Parse/Declaration.hs b/elm-format-lib/src/Parse/Declaration.hs index 37f5d3e16..8b257a45c 100644 --- a/elm-format-lib/src/Parse/Declaration.hs +++ b/elm-format-lib/src/Parse/Declaration.hs @@ -1,7 +1,7 @@ {-# LANGUAGE DataKinds #-} module Parse.Declaration where -import Text.Parsec ( (<|>), (), choice, digit, optionMaybe, string, try ) +import Parse.ParsecAdapter ( (<|>), (), choice, digit, optionMaybe, string, try ) import AST.Structure import qualified Data.Indexed as I diff --git a/elm-format-lib/src/Parse/Expression.hs b/elm-format-lib/src/Parse/Expression.hs index e63016467..da973ee2d 100644 --- a/elm-format-lib/src/Parse/Expression.hs +++ b/elm-format-lib/src/Parse/Expression.hs @@ -6,9 +6,8 @@ module Parse.Expression (term, typeAnnotation, definition, expr) where import Data.Coapplicative import qualified Data.Indexed as I import Data.Maybe (fromMaybe) -import Text.Parsec hiding (newline, spaces) -import Text.Parsec.Indent (block, withPos, checkIndent) +import Parse.ParsecAdapter hiding (newline, spaces) import qualified Parse.Binop as Binop import Parse.Helpers import Parse.Common diff --git a/elm-format-lib/src/Parse/Helpers.hs b/elm-format-lib/src/Parse/Helpers.hs index 4a53e0218..164edbe17 100644 --- a/elm-format-lib/src/Parse/Helpers.hs +++ b/elm-format-lib/src/Parse/Helpers.hs @@ -6,8 +6,7 @@ import Prelude hiding (until) import Control.Monad (guard) import qualified Data.Indexed as I import Data.Map.Strict hiding (foldl) -import Text.Parsec hiding (newline, spaces, State) -import Text.Parsec.Indent (indented, runIndent) +import Parse.ParsecAdapter hiding (newline, spaces, State) import AST.V0_16 import qualified AST.Helpers as Help @@ -17,6 +16,7 @@ import qualified Parse.State as State import Parse.Comments import Parse.IParser import Parse.Whitespace +import qualified Parse.Primitives as EP import qualified Reporting.Annotation as A import qualified Reporting.Error.Syntax as Syntax import qualified Reporting.Region as R @@ -512,12 +512,18 @@ commentedKeyword elmVersion word parser = -- ODD COMBINATORS +-- Behaves the same as `Parse.ParsecAdapter.fail` except that the consumed +-- continuation is called instead of the empty continuation. failure :: String -> IParser String -failure msg = do - inp <- getInput - setInput ('x':inp) - _ <- anyToken - fail msg +failure msg = + EP.Parser $ \s _ _ cerr _ -> + let + (EP.Parser p) = fail msg + in + -- This looks really unsound, but `p` which was created with `fail` will + -- only ever call the empty error continuation (which in this case + -- re-routes to the consumed error continuation) + p s undefined undefined undefined cerr until :: IParser a -> IParser b -> IParser b diff --git a/elm-format-lib/src/Parse/IParser.hs b/elm-format-lib/src/Parse/IParser.hs index cb03cc787..d426f0c1d 100644 --- a/elm-format-lib/src/Parse/IParser.hs +++ b/elm-format-lib/src/Parse/IParser.hs @@ -1,6 +1,7 @@ module Parse.IParser where -import Text.Parsec.Prim (Parser) +import Parse.Primitives (Parser) +import Parse.ParsecAdapter (ParseError) -type IParser a = Parser a +type IParser a = Parser ParseError a diff --git a/elm-format-lib/src/Parse/Literal.hs b/elm-format-lib/src/Parse/Literal.hs index 2bc126114..ddb2ef531 100644 --- a/elm-format-lib/src/Parse/Literal.hs +++ b/elm-format-lib/src/Parse/Literal.hs @@ -2,8 +2,7 @@ module Parse.Literal (literal) where import Prelude hiding (exponent) import Data.Char (digitToInt, isSpace) -import Text.Parsec ((<|>), (), digit, hexDigit, lookAhead, many1, option, string, try, char, notFollowedBy, choice, anyChar, satisfy, manyTill, many, between, skipMany, skipMany1) -import Text.Parsec.Char (octDigit, space, upper) +import Parse.ParsecAdapter import Parse.Helpers (processAs, escaped, expecting, sandwich, betwixt) import Parse.IParser diff --git a/elm-format-lib/src/Parse/Module.hs b/elm-format-lib/src/Parse/Module.hs index 0d3c329dc..2dd4bf8ae 100644 --- a/elm-format-lib/src/Parse/Module.hs +++ b/elm-format-lib/src/Parse/Module.hs @@ -4,7 +4,7 @@ module Parse.Module (moduleDecl, elmModule, topLevel, import') where import qualified Control.Applicative import Data.Map.Strict ( Map, empty, insert, insertWith ) import Elm.Utils ((|>)) -import Text.Parsec ( char, letter, string, choice, eof, option, optionMaybe, (), (<|>), many, try ) +import Parse.ParsecAdapter ( char, letter, string, choice, eof, option, optionMaybe, (), (<|>), many, try ) import Parse.Helpers import qualified Parse.Declaration as Decl import AST.Listing (Listing(..), mergeCommentedMap, mergeListing) diff --git a/elm-format-lib/src/Parse/Parse.hs b/elm-format-lib/src/Parse/Parse.hs index a841c158f..25da45a5c 100644 --- a/elm-format-lib/src/Parse/Parse.hs +++ b/elm-format-lib/src/Parse/Parse.hs @@ -1,7 +1,8 @@ {-# LANGUAGE DataKinds #-} module Parse.Parse (parse, parseModule, parseDeclarations, parseExpressions) where -import qualified Text.Parsec.Error as Parsec +import Parse.ParsecAdapter (eof) +import qualified Parse.ParsecAdapter as Parsec import AST.V0_16 import AST.Module (Module) @@ -17,7 +18,6 @@ import qualified Reporting.Region as R import qualified Reporting.Error.Syntax as Error import qualified Reporting.Result as Result import Parse.IParser -import Text.Parsec (eof) parseModule :: ElmVersion -> String -> Result.Result () Error.Error (Module [UppercaseIdentifier] (ASTNS Located [UppercaseIdentifier] 'TopLevelNK)) diff --git a/elm-format-lib/src/Parse/ParsecAdapter.hs b/elm-format-lib/src/Parse/ParsecAdapter.hs new file mode 100644 index 000000000..bcdd5c9d4 --- /dev/null +++ b/elm-format-lib/src/Parse/ParsecAdapter.hs @@ -0,0 +1,800 @@ +-- This adapter module allows one to interact with the new Elm parser with a +-- parsec-like API +-- +-- Historically the Elm compiler and elm-format have shared the same parsing logic +-- and used parsec[1] + indents[2] as parser. Since Elm 0.15 the compiler got +-- its own custom parser[3] and the dependency on parsec was removed. This +-- change was not integrated into elm-format however which has continued using +-- parsec[3]. It is desirable for elm-format to utilize the new parser however, and +-- this module is the first step in making that transition. With this module +-- it has been possible to replace parsec with the new parser without having to +-- rewrite all of the higher level parser, they just interact with the new parser +-- through this module instead. +-- +-- 1. https://hackage.haskell.org/package/parsec-3.1.14.0 +-- 2. https://hackage.haskell.org/package/indents-0.3.3 +-- 3. https://github.com/elm/compiler/blob/94715a520f499591ac6901c8c822bc87cd1af24f/compiler/src/Parse/Primitives.hs + + +{-# LANGUAGE BangPatterns #-} + +module Parse.ParsecAdapter + -- Text.Parsec.Prim + ( (<|>) + , () + , lookAhead + , try + , many + , skipMany + , runParserT + , getPosition + , getState + , updateState + -- Text.Parsec.Pos + , SourcePos + , SourceName + , sourceLine + , sourceColumn + -- Text.Parsec.Error + , ParseError + , Message + , errorPos + , errorMessages + -- Text.Parsec.Combinator + , many1 + , manyTill + , skipMany1 + , option + , optionMaybe + , anyToken + , choice + , notFollowedBy + , between + , eof + -- Text.Parsec.Char + , oneOf + , space + , upper + , lower + , alphaNum + , letter + , digit + , hexDigit + , octDigit + , char + , anyChar + , satisfy + , string + -- Text.Parsec.Indents + , runIndent + , block + , indented + , checkIndent + , withPos + ) + where + +import Parse.Primitives (Row, Col) +import qualified Parse.Primitives as EP +import Parse.State (State(..)) + +import qualified Control.Applicative as Applicative +import Control.Monad (MonadPlus(..), mzero, liftM) +import qualified Control.Monad.Fail as Fail + +import Data.List (nub) +import Data.Typeable (Typeable) +import Data.Bits (shiftL, shiftR, (.&.), (.|.)) +import Data.Word (Word8, Word16) +import Data.Char (chr, ord) +import qualified Data.Char as C +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B + +import Foreign.Ptr (plusPtr) +import Foreign.ForeignPtr (touchForeignPtr) +import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) + + + + +-- Text.Parsec.Prim + + +unknownError :: EP.Row -> EP.Col -> ParseError +unknownError row col = + newErrorUnknown "" row col + + +unexpected :: String -> Parser a +unexpected msg = + EP.Parser $ \(EP.State _ _ _ _ row col sourceName _) _ _ _ eerr -> + eerr row col (newErrorMessage (UnExpect msg) sourceName) + + +type Parser a = EP.Parser ParseError a + + +instance Applicative.Alternative (EP.Parser ParseError) where + empty = mzero + (<|>) = mplus + + +instance Fail.MonadFail (EP.Parser ParseError) where + fail = parserFail + + +parserFail :: String -> Parser a +parserFail msg = + EP.Parser $ \(EP.State _ _ _ _ row col sourceName _) _ _ _ eerr -> + eerr row col $ newErrorMessage (Message msg) sourceName + + +instance MonadPlus (EP.Parser ParseError) where + mzero = parserZero + mplus = parserPlus + + +parserZero :: Parser a +parserZero = + EP.Parser $ \state _ _ _ eerr -> + let + (EP.State _ _ _ _ row col _ _) = state + in + eerr row col unknownError + + +parserPlus :: Parser a -> Parser a -> Parser a +parserPlus (EP.Parser p) (EP.Parser q) = + EP.Parser $ \s cok eok cerr eerr -> + let + meerr r1 c1 toErr1 = + let + neerr r2 c2 toErr2 = + -- This error merging behavior from parsec is really tricky for + -- me to understand, especially the error positions. + -- I doubt that I got this 100% correct. + let + err = mergeError (toErr1 r1 c1) (toErr2 r2 c2) + row = fromIntegral $ sourceLine $ errorPos err + col = fromIntegral $ sourceColumn $ errorPos err + in + eerr row col (\_ _ -> err) + in q s cok eok cerr neerr + in + p s cok eok cerr meerr + + +infixr 1 <|> +infix 0 + + +(<|>) :: Parser a -> Parser a -> Parser a +(<|>) = mplus + + +-- TODO: Can the implementation be improved? +-- +-- It's probable that this behaviour doesn't 100% match the original parsec +-- behaviour. In particular, parsec stores error information in the _ok_ +-- continuations as well, why is that? And is it possible to get the same +-- behaviour with the new parser which only stores errors in the _err_ +-- continuations. +() :: Parser a -> String -> Parser a +() (EP.Parser p) msg = + EP.Parser $ \s@(EP.State _ _ _ _ _ _ sn _) cok eok cerr eerr -> + let + eerr' row col _ = + eerr row col (newErrorMessage (Expect msg) sn) + in + p s cok eok cerr eerr' + + +lookAhead :: Parser a -> Parser a +lookAhead (EP.Parser p) = + EP.Parser $ \s _ eok cerr eerr -> + let + eok' a _ = eok a s + in + p s eok' eok' cerr eerr + + +try :: Parser a -> Parser a +try (EP.Parser parser) = + EP.Parser $ \s cok eok _ err -> + parser s cok eok err err + + +-- TODO: See if this can be implemented more eloquently +-- +-- The many_ helper and the code after the `in` are very similar and it's not +-- obvious how they differ from looking at the code. Is there a way to make this +-- implementation more obvious? Maybe the code in the `in` could be replaced +-- with a single `many_` call? +many :: Parser a -> Parser [a] +many (EP.Parser p) = + EP.Parser $ \s cok eok cerr _ -> + let + many_ acc x s' = + p + s' + (many_ (x:acc)) + parserDoesNotConsumeErr + cerr + (\_ _ _ -> cok (reverse (x:acc)) s') + in + p + s + (many_ []) + parserDoesNotConsumeErr + cerr + (\_ _ _ -> eok [] s) + + +skipMany ::Parser a -> Parser () +skipMany (EP.Parser p) = + EP.Parser $ \s cok _ cerr _ -> + let + skipMany_ s' = + p + s' + (\_ -> skipMany_) + parserDoesNotConsumeErr + cerr + (\_ _ _ -> cok () s') + in + skipMany_ s + + +-- Note that causing a runtime crash when using `many` or `skipMany` with a +-- parser that does not consume is the same behaviour as it was with parsec. +parserDoesNotConsumeErr :: a +parserDoesNotConsumeErr = error "Text.Parsec.Prim.many: combinator 'many' is applied to a parser that accepts an empty string." + + +-- This function is very similar to `Parse.Primitives.fromByteString`. +runParserT :: Parser a -> State -> SourceName -> String -> Either ParseError a +runParserT (EP.Parser p) (State newline) name source = + B.accursedUnutterablePerformIO $ + let + (B.PS fptr offset length) = stringToByteString source + !pos = plusPtr (unsafeForeignPtrToPtr fptr) offset + !end = plusPtr pos length + !result = p (EP.State fptr pos end 1 1 1 name newline) toOk toOk toErr toErr + in + do touchForeignPtr fptr + return result + + +toOk :: a -> EP.State -> Either x a +toOk !a _ = + Right a + + +toErr :: EP.Row -> EP.Col -> (EP.Row -> EP.Col -> x) -> Either x a +toErr row col toError = + Left (toError row col) + + +stringToByteString :: String -> B.ByteString +stringToByteString = B.pack . concatMap encodeChar + + +-- https://hackage.haskell.org/package/utf8-string-1.0.2/docs/src/Codec.Binary.UTF8.String.html#encodeChar +encodeChar :: Char -> [Word8] +encodeChar = map fromIntegral . go . ord + where + go oc + | oc <= 0x7f = [oc] + + | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) + , 0x80 + oc .&. 0x3f + ] + + | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) + , 0x80 + ((oc `shiftR` 6) .&. 0x3f) + , 0x80 + oc .&. 0x3f + ] + + | otherwise = [ 0xf0 + (oc `shiftR` 18) + , 0x80 + ((oc `shiftR` 12) .&. 0x3f) + , 0x80 + ((oc `shiftR` 6) .&. 0x3f) + , 0x80 + oc .&. 0x3f + ] + + +getPosition :: Parser SourcePos +getPosition = + do (EP.State _ _ _ _ row col sourceName _) <- getParserState + return $ newPos sourceName row col + + +getState :: Parser State +getState = + do (EP.State _ _ _ _ _ _ _ newline) <- getParserState + return (State newline) + + +updateState :: (State -> State) -> Parser () +updateState f = + do _ <- updateParserState + (\(EP.State src pos end indent row col sourceName newline) -> + let + (State newline') = f (State newline) + in + EP.State src pos end indent row col sourceName newline' + ) + return () + + +getParserState :: Parser EP.State +getParserState = updateParserState id + + +updateParserState :: (EP.State -> EP.State) -> Parser EP.State +updateParserState f = + EP.Parser $ \s _ eok _ _ -> eok (f s) (f s) + + + +-- Text.Parsec.Pos + + +type SourceName = String + + +data SourcePos = SourcePos SourceName !EP.Row !EP.Col + + +newPos :: SourceName -> EP.Row -> EP.Col -> SourcePos +newPos = + SourcePos + + +sourceLine :: SourcePos -> Int +sourceLine (SourcePos _ row _) = + fromIntegral row + + +sourceColumn :: SourcePos -> Int +sourceColumn (SourcePos _ _ col) = + fromIntegral col + + +instance Show SourcePos where + show (SourcePos name line column) + | null name = showLineColumn + | otherwise = "\"" ++ name ++ "\" " ++ showLineColumn + where + showLineColumn = "(line " ++ show line ++ + ", column " ++ show column ++ + ")" +-- Text.Parsec.Error + + +data Message + = SysUnExpect !String -- @ library generated unexpect + | UnExpect !String -- @ unexpected something + | Expect !String -- @ expecting something + | Message !String -- @ raw message + deriving ( Typeable ) + + +instance Enum Message where + fromEnum (SysUnExpect _) = 0 + fromEnum (UnExpect _) = 1 + fromEnum (Expect _) = 2 + fromEnum (Message _) = 3 + toEnum _ = error "toEnum is undefined for Message" + + +instance Eq Message where + m1 == m2 = fromEnum m1 == fromEnum m2 + + +instance Ord Message where + compare msg1 msg2 = compare (fromEnum msg1) (fromEnum msg2) + + +messageString :: Message -> String +messageString (SysUnExpect s) = s +messageString (UnExpect s) = s +messageString (Expect s) = s +messageString (Message s) = s + + +data ParseError = ParseError String Row Col [Message] + + +errorPos :: ParseError -> SourcePos +errorPos (ParseError sourceName row col _) = newPos sourceName row col + + +errorMessages :: ParseError -> [Message] +errorMessages (ParseError _ _ _ messages) = messages + + + +-- Create parse errors + + +newErrorMessage :: Message -> String -> Row -> Col -> ParseError +newErrorMessage msg sourceName row col + = ParseError sourceName row col [msg] + + +newErrorUnknown :: String -> Row -> Col -> ParseError +newErrorUnknown sourceName row col + = ParseError sourceName row col [] + + +setErrorMessage :: Message -> ParseError -> ParseError +setErrorMessage msg (ParseError sourceName row col msgs) + = ParseError sourceName row col (msg : filter (msg /=) msgs) + + +mergeError :: ParseError -> ParseError -> ParseError +mergeError e1@(ParseError sn1 r1 c1 msgs1) e2@(ParseError _ r2 c2 msgs2) + -- prefer meaningful errors + | null msgs2 && not (null msgs1) = e1 + | null msgs1 && not (null msgs2) = e2 + | otherwise + = case (r1, c1) `compare` (r2, c2) of + -- select the longest match + EQ -> ParseError sn1 r1 c1 (msgs1 ++ msgs2) + GT -> e1 + LT -> e2 + + +instance Show ParseError where + show err + = show (errorPos err) ++ ":" ++ + showErrorMessages "or" "unknown parse error" + "expecting" "unexpected" "end of input" + (errorMessages err) + + +showErrorMessages :: + String -> String -> String -> String -> String -> [Message] -> String +showErrorMessages msgOr msgUnknown msgExpecting msgUnExpected msgEndOfInput msgs + | null msgs = msgUnknown + | otherwise = concat $ map ("\n"++) $ clean $ + [showSysUnExpect,showUnExpect,showExpect,showMessages] + where + (sysUnExpect,msgs1) = span ((SysUnExpect "") ==) msgs + (unExpect,msgs2) = span ((UnExpect "") ==) msgs1 + (expect,messages) = span ((Expect "") ==) msgs2 + + showExpect = showMany msgExpecting expect + showUnExpect = showMany msgUnExpected unExpect + showSysUnExpect | not (null unExpect) || + null sysUnExpect = "" + | null firstMsg = msgUnExpected ++ " " ++ msgEndOfInput + | otherwise = msgUnExpected ++ " " ++ firstMsg + where + firstMsg = messageString (head sysUnExpect) + + showMessages = showMany "" messages + + -- helpers + showMany pre msgs3 = case clean (map messageString msgs3) of + [] -> "" + ms | null pre -> commasOr ms + | otherwise -> pre ++ " " ++ commasOr ms + + commasOr [] = "" + commasOr [m] = m + commasOr ms = commaSep (init ms) ++ " " ++ msgOr ++ " " ++ last ms + + commaSep = separate ", " . clean + + separate _ [] = "" + separate _ [m] = m + separate sep (m:ms) = m ++ sep ++ separate sep ms + + clean = nub . filter (not . null) + + + +-- Text.Parsec.Combinator + + +choice :: [Parser a] -> Parser a +choice ps = foldr (<|>) mzero ps + + +many1 :: Parser a -> Parser [a] +many1 p = + do x <- p + xs <- many p + return (x:xs) + + +manyTill :: Parser a -> Parser end -> Parser [a] +manyTill p end = + scan + where + scan = + do{ _ <- end; return [] } + <|> + do{ x <- p; xs <- scan; return (x:xs) } + + +skipMany1 :: Parser a -> Parser () +skipMany1 p = + do _ <- p + skipMany p + + +option :: a -> Parser a -> Parser a +option x p = p <|> return x + + +optionMaybe :: Parser a -> Parser (Maybe a) +optionMaybe p = option Nothing (liftM Just p) + + +anyToken :: Parser Char +anyToken = anyChar + + +notFollowedBy :: Show a => Parser a -> Parser () +notFollowedBy p = + try $ do{ c <- try p; unexpected (show c) } <|> return () + + +between :: Parser open -> Parser close -> Parser a -> Parser a +between open close p = + do{ _ <- open; x <- p; _ <- close; return x } + + + --- `eof` makes the parser fail if the entire input hasn't been consumed. + --- This function sits in an odd position right now because the new parser + --- (`Parse.Primiteves.fromByteString` and `Parse.Primitives.fromSnippet`) + --- automatically does this whereas the adapter (`Parse.ParsecAdapter.runParsercT`) + --- does not. + --- + --- I think the solution is to remove the eof behaviour from the new parser, + --- but we'll see +eof :: Parser () +eof = notFollowedBy anyToken "end of input" + + +-- Text.Parsec.Char + + +oneOf :: [Char] -> Parser Char +oneOf cs = satisfy (\c -> elem c cs) + + +space :: Parser Char +space = satisfy C.isSpace "space" + + +upper :: Parser Char +upper = satisfy C.isUpper "uppercase letter" + + +lower :: Parser Char +lower = satisfy C.isLower "lowercase letter" + + +alphaNum :: Parser Char +alphaNum = satisfy C.isAlphaNum "letter or digit" + + +letter :: Parser Char +letter = satisfy C.isAlpha "letter" + + +digit :: Parser Char +digit = satisfy C.isDigit "digit" + + +hexDigit :: Parser Char +hexDigit = satisfy C.isHexDigit "hexadecimal digit" + + +octDigit :: Parser Char +octDigit = satisfy C.isOctDigit "octal digit" + + +char :: Char -> Parser Char +char c = satisfy (==c) show [c] + + +anyChar :: Parser Char +anyChar = satisfy (const True) + + +satisfy :: (Char -> Bool) -> Parser Char +satisfy f = + EP.Parser $ \s@(EP.State _ pos end _ row col sourceName _) cok _ _ eerr -> + let + (char, width) = extractChar s + + errEof = newErrorMessage (SysUnExpect "") sourceName + + errExpect = newErrorMessage (SysUnExpect $ [char]) sourceName + in + if pos == end then + eerr row col errEof + else if f char then + cok char (updatePos width char s) + else + eerr row col errExpect + + +string :: String -> Parser String +string "" = return "" +string match@(c:cs) = + EP.Parser $ \s cok _ cerr eerr -> + stringHelp + [c] + s + (\s' -> + stringHelp + cs + s' + (cok match) + cerr + ) + eerr + + +stringHelp :: forall b. + String + -> EP.State + -> (EP.State -> b) + -> (Row -> Col -> (Row -> Col -> ParseError) -> b) + -> b +stringHelp "" s toOk _ = toOk s +stringHelp (c:cs) s@(EP.State _ pos end _ row col sourceName _) toOk toError = + let + errEof _ _ = setErrorMessage (Expect (show (c:cs))) + (newErrorMessage (SysUnExpect "") sourceName row col) + + errExpect x _ _ = setErrorMessage (Expect (show (c:cs))) + (newErrorMessage (SysUnExpect (show x)) sourceName row col) + + (char, width) = extractChar s + in + if pos == end then + toError row col errEof + else if char == c then + stringHelp cs (updatePos width char s) toOk toError + else + toError row col (errExpect c) + + +updatePos :: Int -> Char -> EP.State -> EP.State +updatePos width c (EP.State src pos end indent row col sourceName newline) = + let + (row', col') = + case c of + '\n' -> (row + 1, 1) + + -- The parsec docs states that CR increments line just like an LF does, + -- this is not what happens in the code though, + -- see: https://github.com/haskell/parsec/issues/129 for details. + -- + -- Here we've opted for following the behaviour of parsec, and not the + -- doccumentation even though this behaviour might be considered a bug. + '\r' -> (row, col + 1) + + -- The parsec behaviour for tabs is to increment to the nearest + -- 8'th collumn. Shoud we do this as well? + -- Let's not implement this unless it turns out that elm-format + -- needs it. + '\t' -> (row, (col + 8 - ((col-1) `mod` 8))) + + _ -> (row, col + 1) + in + EP.State src (plusPtr pos width) end indent row' col' sourceName newline + + +-- Inspired by https://hackage.haskell.org/package/utf8-string-1.0.2/docs/src/Codec.Binary.UTF8.String.html#decode +-- +-- TODO: "Gracefully" crash on incomplete multibyte codepoint +-- +-- If there's an incomplete multibyte codepoint at the end of the file this +-- function will attempt to index ´Word8´'s outside the buffer, resulting in +-- some nasty things. While 100% proper handling for utf-8 is not super important +-- (or even desirable) for elm-format, crashing with a descriptive error message +-- instead of indexing outside the buffer might be worth implementing. +-- +-- w0, 4 byte char w1 w2 w3, outside buffer +-- v v v v +-- | ..., 11110xxx, 10xxxxxx, 10xxxxxx | ... +extractChar :: EP.State -> (Char, Int) +extractChar (EP.State _ pos _ _ _ _ _ _) = + -- 1 byte codepoint + if w0 < 0xc0 then + (chr (fromEnum w0), 1) + -- 2 byte codepoint + else if w0 < 0xe0 then + (multi1, 2) + -- 3 byte codepoint + else if w0 < 0xf0 then + (multi_byte [w1, w2] 0xf 0x800, 3) + -- 4 byte codepoint + else if w0 < 0xf8 then + (multi_byte [w1, w2, w3] 0x7 0x10000, 4) + else + error "invalid utf-8" + where + w0 = EP.unsafeIndex pos + w1 = EP.unsafeIndex (plusPtr pos 1) + w2 = EP.unsafeIndex (plusPtr pos 2) + w3 = EP.unsafeIndex (plusPtr pos 3) + + -- `Codec.Binary.UTF8.String.decode` has this special case function for + -- a 2 byte codepoint, why is that? Will it behave the same way if we use + -- the general `multi_byte` instead? + multi1 = + if w1 .&. 0xc0 == 0x80 then + let d = (fromEnum w0 .&. 0x1f) `shiftL` 6 .|. fromEnum (w1 .&. 0x3f) + in + if d >= 0x000080 then + toEnum d + else + error "invalid utf-8" + else + error "invalid utf-8" + + multi_byte words mask overlong = aux words (fromEnum (w0 .&. mask)) + where + aux [] acc + | overlong <= acc && acc <= 0x10ffff && + (acc < 0xd800 || 0xdfff < acc) && + (acc < 0xfffe || 0xffff < acc) = chr acc + | otherwise = error "invalid utf-8" + + aux (w:ws) acc + | w .&. 0xc0 == 0x80 = aux ws + $ shiftL acc 6 .|. fromEnum (w .&. 0x3f) + | otherwise = error "invalid utf-8" + + + +-- Text.Parsec.Indents + + +-- indents adds additional data onto parsecs `ParsecT` in order to track +-- indentation information. The new parser tracks this information by itself +-- now, which is why this function becomes a no-op. +runIndent :: s -> a -> a +runIndent _ = id + + +block :: Parser a -> Parser [a] +block p = withPos $ do + r <- many1 (checkIndent >> p) + return r + + +indented :: Parser () +indented = + do (EP.State _ _ _ indent _ col _ _) <- getParserState + if col <= indent then fail "not indented" else do return () + + +checkIndent :: Parser () +checkIndent = + do (EP.State _ _ _ indent _ col _ _) <- getParserState + if indent == col then return () else fail "indentation doesn't match" + + +withPos :: Parser a -> Parser a +withPos (EP.Parser p) = + EP.Parser $ \s@(EP.State _ _ _ indent _ col _ _) cok eok cerr eerr -> + let + cok' x s' = cok x (setIndent indent s') + eok' x s' = eok x (setIndent indent s') + in + p (setIndent col s) cok' eok' cerr eerr + + +setIndent :: Word16 -> EP.State -> EP.State +setIndent indent (EP.State s p e _ r c nl sn) = + EP.State s p e indent r c nl sn diff --git a/elm-format-lib/src/Parse/Pattern.hs b/elm-format-lib/src/Parse/Pattern.hs index bf0017fd6..5390a619d 100644 --- a/elm-format-lib/src/Parse/Pattern.hs +++ b/elm-format-lib/src/Parse/Pattern.hs @@ -1,7 +1,7 @@ {-# LANGUAGE DataKinds #-} module Parse.Pattern (term, expr) where -import Text.Parsec ((<|>), (), char, choice, optionMaybe, try) +import Parse.ParsecAdapter ((<|>), (), char, choice, optionMaybe, try) import AST.V0_16 import AST.Structure diff --git a/elm-format-lib/src/Parse/Primitives.hs b/elm-format-lib/src/Parse/Primitives.hs index 1657d6a5b..998cfe1bc 100644 --- a/elm-format-lib/src/Parse/Primitives.hs +++ b/elm-format-lib/src/Parse/Primitives.hs @@ -57,7 +57,10 @@ data State = -- PERF try taking some out to avoid allocation , _indent :: !Word16 , _row :: !Row , _col :: !Col + -- _sourceName is needed by the parsec errors. At some point we should + -- migrate to our own error type an refactor this field out of the parser state. , _sourceName :: String + -- _newline is needed by elm-format to track some formatting information. , _newline :: [Bool] } diff --git a/elm-format-lib/src/Parse/Type.hs b/elm-format-lib/src/Parse/Type.hs index af336f619..686999e2c 100644 --- a/elm-format-lib/src/Parse/Type.hs +++ b/elm-format-lib/src/Parse/Type.hs @@ -1,7 +1,7 @@ {-# LANGUAGE DataKinds #-} module Parse.Type where -import Text.Parsec ((<|>), (), char, many1, string, try, optionMaybe) +import Parse.ParsecAdapter ((<|>), (), char, many1, string, try, optionMaybe) import Parse.Helpers import Reporting.Annotation (Located) diff --git a/elm-format-lib/src/Parse/Whitespace.hs b/elm-format-lib/src/Parse/Whitespace.hs index c49433b5b..84f15475a 100644 --- a/elm-format-lib/src/Parse/Whitespace.hs +++ b/elm-format-lib/src/Parse/Whitespace.hs @@ -7,7 +7,7 @@ import Parse.IParser import qualified Parse.Markdown as Markdown import qualified Parse.State as State import qualified Reporting.Error.Syntax as Syntax -import Text.Parsec hiding (newline, spaces, State) +import Parse.ParsecAdapter hiding (newline, spaces, State) padded :: IParser a -> IParser (C2 before after a) diff --git a/elm-format-lib/src/Reporting/Error/Syntax.hs b/elm-format-lib/src/Reporting/Error/Syntax.hs index 45a1d757d..f2b98fd70 100644 --- a/elm-format-lib/src/Reporting/Error/Syntax.hs +++ b/elm-format-lib/src/Reporting/Error/Syntax.hs @@ -2,7 +2,7 @@ module Reporting.Error.Syntax where import AST.V0_16 -import qualified Text.Parsec.Error as Parsec +import qualified Parse.ParsecAdapter as Parsec data Error diff --git a/elm-format-lib/src/Reporting/Region.hs b/elm-format-lib/src/Reporting/Region.hs index 99d77c6ab..61ae04fbf 100644 --- a/elm-format-lib/src/Reporting/Region.hs +++ b/elm-format-lib/src/Reporting/Region.hs @@ -6,7 +6,7 @@ import Relude import Text.Show (showParen, showString, showsPrec) import qualified Data.String as String -import qualified Text.Parsec.Pos as Parsec +import qualified Parse.ParsecAdapter as Parsec data Region = Region diff --git a/elm-format-lib/src/Text/Parsec.hs b/elm-format-lib/src/Text/Parsec.hs deleted file mode 100644 index b8197efa1..000000000 --- a/elm-format-lib/src/Text/Parsec.hs +++ /dev/null @@ -1,13 +0,0 @@ -module Text.Parsec - ( module Text.Parsec.Pos - , module Text.Parsec.Error - , module Text.Parsec.Prim - , module Text.Parsec.Char - , module Text.Parsec.Combinator - ) where - -import Text.Parsec.Pos -import Text.Parsec.Error -import Text.Parsec.Prim -import Text.Parsec.Char -import Text.Parsec.Combinator diff --git a/elm-format-lib/src/Text/Parsec/Char.hs b/elm-format-lib/src/Text/Parsec/Char.hs deleted file mode 100644 index e3747fa9e..000000000 --- a/elm-format-lib/src/Text/Parsec/Char.hs +++ /dev/null @@ -1,201 +0,0 @@ -{-# LANGUAGE CPP, FlexibleContexts, Rank2Types #-} - -module Text.Parsec.Char - ( oneOf - , space - , upper - , lower - , alphaNum - , letter - , digit - , hexDigit - , octDigit - , char - , anyChar - , satisfy - , string - ) where - -import Parse.Primitives (Row, Col, State) -import qualified Parse.Primitives as EP -import Text.Parsec.Prim (Parser, ()) -import Text.Parsec.Error (ParseError, Message(SysUnExpect, Expect), newErrorMessage, setErrorMessage) - -import Foreign.Ptr (plusPtr) -import Data.Bits (shiftL, (.&.), (.|.)) -import Data.Word (Word8) -import Data.Char (chr, ord) -import qualified Data.Char as C - - -oneOf :: [Char] -> Parser Char -oneOf cs = satisfy (\c -> elem c cs) - - -space :: Parser Char -space = satisfy C.isSpace "space" - - -upper :: Parser Char -upper = satisfy C.isUpper "uppercase letter" - - -lower :: Parser Char -lower = satisfy C.isLower "lowercase letter" - - -alphaNum :: Parser Char -alphaNum = satisfy C.isAlphaNum "letter or digit" - - -letter :: Parser Char -letter = satisfy C.isAlpha "letter" - - -digit :: Parser Char -digit = satisfy C.isDigit "digit" - - -hexDigit :: Parser Char -hexDigit = satisfy C.isHexDigit "hexadecimal digit" - - -octDigit :: Parser Char -octDigit = satisfy C.isOctDigit "octal digit" - - -char :: Char -> Parser Char -char c = satisfy (==c) show [c] - - -anyChar :: Parser Char -anyChar = satisfy (const True) - - -satisfy :: (Char -> Bool) -> Parser Char -satisfy f = - EP.Parser $ \s@(EP.State _ pos end _ row col sourceName _) cok _ _ eerr -> - let - (char, width) = extractChar s - - errEof = newErrorMessage (SysUnExpect "") sourceName - - errExpect = newErrorMessage (SysUnExpect $ [char]) sourceName - in - if pos == end then - eerr row col errEof - else if f char then - cok char (updatePos width char s) - else - eerr row col errExpect - - -string :: String -> Parser String -string "" = return "" -string match@(c:cs) = - EP.Parser $ \s cok _ cerr eerr -> - stringHelp - [c] - s - (\s' -> - stringHelp - cs - s' - (cok match) - cerr - ) - eerr - - -stringHelp :: forall b. - String - -> EP.State - -> (State -> b) - -> (Row -> Col -> (Row -> Col -> ParseError) -> b) - -> b -stringHelp "" s toOk _ = toOk s -stringHelp (c:cs) s@(EP.State _ pos end _ row col sourceName _) toOk toError = - let - errEof _ _ = setErrorMessage (Expect (show (c:cs))) - (newErrorMessage (SysUnExpect "") sourceName row col) - - errExpect x _ _ = setErrorMessage (Expect (show (c:cs))) - (newErrorMessage (SysUnExpect (show x)) sourceName row col) - - (char, width) = extractChar s - in - if pos == end then - toError row col errEof - else if char == c then - stringHelp cs (updatePos width char s) toOk toError - else - toError row col (errExpect c) - - -updatePos :: Int -> Char -> EP.State -> EP.State -updatePos width c (EP.State src pos end indent row col sourceName newline) = - let - (row', col') = - case c of - '\n' -> (row + 1, 1) - - -- The parsec docs states that CR increments line just like an LF does, - -- this is not what happens in the code though, - -- see: https://github.com/haskell/parsec/issues/129 for details. - -- - -- Here we've opted for following the behaviour of parsec, and not the - -- doccumentation even though this behaviour might be considered a bug. - '\r' -> (row, col + 1) - - -- The parsec behaviour for tabs is to increment to the nearest - -- 8'th collumn. Shoud we do this as well? - -- Let's follow the parsec behaviour - '\t' -> (row, (col + 8 - ((col-1) `mod` 8))) - - _ -> (row, col + 1) - in - EP.State src (plusPtr pos width) end indent row' col' sourceName newline - - --- Inspired by https://hackage.haskell.org/package/utf8-string-1.0.2/docs/src/Codec.Binary.UTF8.String.html#decode -extractChar :: EP.State -> (Char, Int) -extractChar (EP.State _ pos _ _ _ _ _ _) = - if w0 < 0xc0 then - (chr (fromEnum w0), 1) - else if w0 < 0xe0 then - (multi1, 2) - else if w0 < 0xf0 then - (multi_byte [w1, w2] 0xf 0x800, 3) - else if w0 < 0xf8 then - (multi_byte [w1, w2, w3] 0x7 0x10000, 4) - else - error "invalid utf-8" - where - w0 = EP.unsafeIndex pos - w1 = EP.unsafeIndex (plusPtr pos 1) - w2 = EP.unsafeIndex (plusPtr pos 2) - w3 = EP.unsafeIndex (plusPtr pos 3) - - multi1 = - if w1 .&. 0xc0 == 0x80 then - let d = (fromEnum w0 .&. 0x1f) `shiftL` 6 .|. fromEnum (w1 .&. 0x3f) - in - if d >= 0x000080 then - toEnum d - else - error "invalid utf-8" - else - error "invalid utf-8" - - multi_byte words mask overlong = aux words (fromEnum (w0 .&. mask)) - where - aux [] acc - | overlong <= acc && acc <= 0x10ffff && - (acc < 0xd800 || 0xdfff < acc) && - (acc < 0xfffe || 0xffff < acc) = chr acc - | otherwise = error "invalid utf-8" - - aux (w:ws) acc - | w .&. 0xc0 == 0x80 = aux ws - $ shiftL acc 6 .|. fromEnum (w .&. 0x3f) - | otherwise = error "invalid utf-8" diff --git a/elm-format-lib/src/Text/Parsec/Combinator.hs b/elm-format-lib/src/Text/Parsec/Combinator.hs deleted file mode 100644 index ba77c2d28..000000000 --- a/elm-format-lib/src/Text/Parsec/Combinator.hs +++ /dev/null @@ -1,80 +0,0 @@ -module Text.Parsec.Combinator - ( choice - , many1 - , manyTill - , skipMany1 - , option - , optionMaybe - , anyToken - , notFollowedBy - , between - , eof - ) where - -import qualified Parse.Primitives as EP -import Text.Parsec.Prim (unexpected, Parser(..), (<|>), (), try, many, skipMany) -import Text.Parsec.Error (Message(UnExpect), newErrorMessage) -import Text.Parsec.Char (anyChar) - -import Control.Monad (mzero, liftM) - - -choice :: [Parser a] -> Parser a -choice ps = foldr (<|>) mzero ps - - -many1 :: Parser a -> Parser [a] -many1 p = - do x <- p - xs <- many p - return (x:xs) - - -manyTill :: Parser a -> Parser end -> Parser [a] -manyTill p end = - scan - where - scan = - do{ _ <- end; return [] } - <|> - do{ x <- p; xs <- scan; return (x:xs) } - - -skipMany1 :: Parser a -> Parser () -skipMany1 p = - do _ <- p - skipMany p - - -option :: a -> Parser a -> Parser a -option x p = p <|> return x - - -optionMaybe :: Parser a -> Parser (Maybe a) -optionMaybe p = option Nothing (liftM Just p) - - -anyToken :: Parser Char -anyToken = anyChar - - -notFollowedBy :: Show a => Parser a -> Parser () -notFollowedBy p = - try $ do{ c <- try p; unexpected (show c) } <|> return () - - -between :: Parser open -> Parser close -> Parser a -> Parser a -between open close p = - do{ _ <- open; x <- p; _ <- close; return x } - - --- `eof` makes the parser fail if the entire inpute hasn't been consumed. --- This function sits in an odd possition right now because the new parser --- (`Parse.Primiteves.fromByteString` and `Parse.Primitives.fromSnippet`) --- automatically does this whereas the adapter (`Text.Parsec.Prim.runParserT`) --- does not. --- --- I think the solution is to remove the eof behaviour from the new parser, --- but we'll see -eof :: Parser () -eof = notFollowedBy anyToken "end of input" diff --git a/elm-format-lib/src/Text/Parsec/Error.hs b/elm-format-lib/src/Text/Parsec/Error.hs deleted file mode 100644 index fc9fa342e..000000000 --- a/elm-format-lib/src/Text/Parsec/Error.hs +++ /dev/null @@ -1,141 +0,0 @@ --- Large parts of this module are copied from https://hackage.haskell.org/package/parsec-3.1.14.0/docs/src/Text.Parsec.Error.html#messageString - -module Text.Parsec.Error - ( Message(..) - , ParseError - , newErrorMessage - , newErrorUnknown - , errorPos - , errorMessages - , setErrorMessage - , mergeError - ) where - -import Data.List (nub, sort) -import Data.Typeable (Typeable) - -import Parse.Primitives (Row, Col) -import Text.Parsec.Pos (SourcePos, newPos) - - -data Message - = SysUnExpect !String -- @ library generated unexpect - | UnExpect !String -- @ unexpected something - | Expect !String -- @ expecting something - | Message !String -- @ raw message - deriving ( Typeable ) - - -instance Enum Message where - fromEnum (SysUnExpect _) = 0 - fromEnum (UnExpect _) = 1 - fromEnum (Expect _) = 2 - fromEnum (Message _) = 3 - toEnum _ = error "toEnum is undefined for Message" - - -instance Eq Message where - m1 == m2 = fromEnum m1 == fromEnum m2 - - -instance Ord Message where - compare msg1 msg2 = compare (fromEnum msg1) (fromEnum msg2) - - -messageString :: Message -> String -messageString (SysUnExpect s) = s -messageString (UnExpect s) = s -messageString (Expect s) = s -messageString (Message s) = s - - -data ParseError = ParseError String Row Col [Message] - - -errorPos :: ParseError -> SourcePos -errorPos (ParseError sourceName row col _) = newPos sourceName row col - - -errorMessages :: ParseError -> [Message] -errorMessages (ParseError _ _ _ messages) = messages - - - --- Create parse errors - - -newErrorMessage :: Message -> String -> Row -> Col -> ParseError -newErrorMessage msg sourceName row col - = ParseError sourceName row col [msg] - - -newErrorUnknown :: String -> Row -> Col -> ParseError -newErrorUnknown sourceName row col - = ParseError sourceName row col [] - - -setErrorMessage :: Message -> ParseError -> ParseError -setErrorMessage msg (ParseError sourceName row col msgs) - = ParseError sourceName row col (msg : filter (msg /=) msgs) - - -mergeError :: ParseError -> ParseError -> ParseError -mergeError e1@(ParseError sn1 r1 c1 msgs1) e2@(ParseError sn2 r2 c2 msgs2) - -- prefer meaningful errors - | null msgs2 && not (null msgs1) = e1 - | null msgs1 && not (null msgs2) = e2 - | otherwise - = case (r1, c1) `compare` (r2, c2) of - -- select the longest match - EQ -> ParseError sn1 r1 c1 (msgs1 ++ msgs2) - GT -> e1 - LT -> e2 - - -instance Show ParseError where - show err - = show (errorPos err) ++ ":" ++ - showErrorMessages "or" "unknown parse error" - "expecting" "unexpected" "end of input" - (errorMessages err) - - -showErrorMessages :: - String -> String -> String -> String -> String -> [Message] -> String -showErrorMessages msgOr msgUnknown msgExpecting msgUnExpected msgEndOfInput msgs - | null msgs = msgUnknown - | otherwise = concat $ map ("\n"++) $ clean $ - [showSysUnExpect,showUnExpect,showExpect,showMessages] - where - (sysUnExpect,msgs1) = span ((SysUnExpect "") ==) msgs - (unExpect,msgs2) = span ((UnExpect "") ==) msgs1 - (expect,messages) = span ((Expect "") ==) msgs2 - - showExpect = showMany msgExpecting expect - showUnExpect = showMany msgUnExpected unExpect - showSysUnExpect | not (null unExpect) || - null sysUnExpect = "" - | null firstMsg = msgUnExpected ++ " " ++ msgEndOfInput - | otherwise = msgUnExpected ++ " " ++ firstMsg - where - firstMsg = messageString (head sysUnExpect) - - showMessages = showMany "" messages - - -- helpers - showMany pre msgs3 = case clean (map messageString msgs3) of - [] -> "" - ms | null pre -> commasOr ms - | otherwise -> pre ++ " " ++ commasOr ms - - commasOr [] = "" - commasOr [m] = m - commasOr ms = commaSep (init ms) ++ " " ++ msgOr ++ " " ++ last ms - - commaSep = separate ", " . clean - - separate _ [] = "" - separate _ [m] = m - separate sep (m:ms) = m ++ sep ++ separate sep ms - - clean = nub . filter (not . null) diff --git a/elm-format-lib/src/Text/Parsec/Indent.hs b/elm-format-lib/src/Text/Parsec/Indent.hs deleted file mode 100644 index b13a44d57..000000000 --- a/elm-format-lib/src/Text/Parsec/Indent.hs +++ /dev/null @@ -1,52 +0,0 @@ -{-# LANGUAGE FlexibleContexts #-} - -module Text.Parsec.Indent - ( runIndent - , block - , indented - , checkIndent - , withPos - ) where - -import Text.Parsec.Prim (Parser, getParserState, updateParserState) -import Text.Parsec.Combinator (many1) -import qualified Parse.Primitives as EP - -import Data.Word (Word16) - - -runIndent :: s -> a -> a -runIndent _ = id - - -block :: Parser a -> Parser [a] -block p = withPos $ do - r <- many1 (checkIndent >> p) - return r - - -indented :: Parser () -indented = - do (EP.State _ _ _ indent _ col _ _) <- getParserState - if col <= indent then fail "not indented" else do return () - - -checkIndent :: Parser () -checkIndent = - do (EP.State _ _ _ indent _ col _ _) <- getParserState - if indent == col then return () else fail "indentation doesn't match" - - -withPos :: Parser a -> Parser a -withPos (EP.Parser p) = - EP.Parser $ \s@(EP.State _ _ _ indent _ col _ _) cok eok cerr eerr -> - let - cok' x s' = cok x (setIndent indent s') - eok' x s' = eok x (setIndent indent s') - in - p (setIndent col s) cok' eok' cerr eerr - - -setIndent :: Word16 -> EP.State -> EP.State -setIndent indent (EP.State s p e _ r c nl sn) = - EP.State s p e indent r c nl sn diff --git a/elm-format-lib/src/Text/Parsec/Pos.hs b/elm-format-lib/src/Text/Parsec/Pos.hs deleted file mode 100644 index 3900ed2ad..000000000 --- a/elm-format-lib/src/Text/Parsec/Pos.hs +++ /dev/null @@ -1,40 +0,0 @@ -module Text.Parsec.Pos - ( SourceName - , SourcePos - , newPos - , sourceLine - , sourceColumn - ) where - -import Parse.Primitives as EP - - -type SourceName = String - - -data SourcePos = SourcePos SourceName !EP.Row !EP.Col - - -newPos :: SourceName -> EP.Row -> EP.Col -> SourcePos -newPos = - SourcePos - - -sourceLine :: SourcePos -> Int -sourceLine (SourcePos _ row _) = - fromIntegral row - - -sourceColumn :: SourcePos -> Int -sourceColumn (SourcePos _ _ col) = - fromIntegral col - - -instance Show SourcePos where - show (SourcePos name line column) - | null name = showLineColumn - | otherwise = "\"" ++ name ++ "\" " ++ showLineColumn - where - showLineColumn = "(line " ++ show line ++ - ", column " ++ show column ++ - ")" diff --git a/elm-format-lib/src/Text/Parsec/Prim.hs b/elm-format-lib/src/Text/Parsec/Prim.hs deleted file mode 100644 index 1edbcba1f..000000000 --- a/elm-format-lib/src/Text/Parsec/Prim.hs +++ /dev/null @@ -1,269 +0,0 @@ -{-# LANGUAGE PolymorphicComponents #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE UndecidableInstances #-} -{-# LANGUAGE FunctionalDependencies #-} -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE FlexibleInstances #-} - -module Text.Parsec.Prim - ( unexpected - , Parser(..) - , () - , (<|>) - , lookAhead - , try - , many - , skipMany - , runParserT - , getPosition - , getInput - , setInput - , getState - , updateState - , getParserState - , updateParserState - ) where - -import qualified Control.Applicative as Applicative ( Applicative(..), Alternative(..) ) -import Control.Monad (MonadPlus(..)) -import qualified Control.Monad.Fail as Fail - -import Text.Parsec.Pos (SourceName, SourcePos, newPos, sourceLine, sourceColumn) -import Text.Parsec.Error (ParseError) -import qualified Text.Parsec.Error as Error - -import qualified Parse.Primitives as EP -import Parse.State (State(..)) - -import Data.Bits (shiftR, (.&.)) -import Data.Word (Word8) -import Data.Char (ord) - -import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as B -import Foreign.Ptr (Ptr, plusPtr) -import Foreign.Storable (peek) -import Foreign.ForeignPtr (ForeignPtr, touchForeignPtr) -import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) - - -unknownError :: EP.Row -> EP.Col -> ParseError -unknownError row col = - Error.newErrorUnknown "" row col - - -unexpected :: String -> Parser a -unexpected msg = - EP.Parser $ \(EP.State _ _ _ _ row col _ _) _ _ _ eerr -> - eerr row col (Error.newErrorMessage (Error.UnExpect msg) "TODO") - - -type Parser a = EP.Parser ParseError a - - -instance Applicative.Alternative (EP.Parser ParseError) where - empty = mzero - (<|>) = mplus - - -instance Fail.MonadFail (EP.Parser ParseError) where - fail = parserFail - - -parserFail :: String -> Parser a -parserFail msg = - EP.Parser $ \(EP.State _ _ _ _ row col sourceName _) _ _ _ eerr -> - eerr row col $ Error.newErrorMessage (Error.Message msg) sourceName - - -instance MonadPlus (EP.Parser ParseError) where - mzero = parserZero - mplus = parserPlus - - -parserZero :: Parser a -parserZero = - EP.Parser $ \state _ _ _ eerr -> - let - (EP.State _ _ _ _ row col _ _) = state - in - eerr row col unknownError - - -parserPlus :: Parser a -> Parser a -> Parser a -parserPlus (EP.Parser p) (EP.Parser q) = - EP.Parser $ \s cok eok cerr eerr -> - let - meerr r1 c1 toErr1 = - let - neerr r2 c2 toErr2 = - let - err = Error.mergeError (toErr1 r1 c1) (toErr2 r2 c2) - row = fromIntegral $ sourceLine $ Error.errorPos err - col = fromIntegral $ sourceColumn $ Error.errorPos err - in - eerr row col (\_ _ -> err) - in q s cok eok cerr neerr - in - p s cok eok cerr meerr - - -infixr 1 <|> -infix 0 - - -(<|>) :: Parser a -> Parser a -> Parser a -(<|>) = mplus - - -() :: Parser a -> String -> Parser a -() (EP.Parser p) msg = - EP.Parser $ \s@(EP.State _ _ _ _ _ _ sn _) cok eok cerr eerr -> - let - eerr' row col _ = - eerr row col (Error.newErrorMessage (Error.Expect msg) sn) - in - p s cok eok cerr eerr' - - -lookAhead :: Parser a -> Parser a -lookAhead (EP.Parser p) = - EP.Parser $ \s _ eok cerr eerr -> - let - eok' a _ = eok a s - in - p s eok' eok' cerr eerr - -try :: Parser a -> Parser a -try (EP.Parser parser) = - EP.Parser $ \s cok eok _ err -> - parser s cok eok err err - - -many :: Parser a -> Parser [a] -many (EP.Parser p) = - EP.Parser $ \s cok eok cerr eerr -> - let - many_ acc x s' = - p - s' - (many_ (x:acc)) - parserDoesNotConsumeErr - cerr - (\_ _ _ -> cok (reverse (x:acc)) s') - in - p - s - (many_ []) - parserDoesNotConsumeErr - cerr - (\_ _ _ -> eok [] s) - - -skipMany ::Parser a -> Parser () -skipMany (EP.Parser p) = - EP.Parser $ \s cok eok cerr eerr -> - let - skipMany_ s' = - p - s' - (\_ -> skipMany_) - parserDoesNotConsumeErr - cerr - (\_ _ _ -> cok () s') - in - skipMany_ s - - --- Note that causing a runtime crash when using `many` or `skipMany` with a --- parser that does not consume is the same behaviour as it was with parsec -parserDoesNotConsumeErr = error "Text.Parsec.Prim.many: combinator 'many' is applied to a parser that accepts an empty string." - -runParserT :: Parser a -> State -> SourceName -> String -> Either ParseError a -runParserT (EP.Parser p) (State newline) name source = - B.accursedUnutterablePerformIO $ - let - (B.PS fptr offset length) = stringToByteString source - !pos = plusPtr (unsafeForeignPtrToPtr fptr) offset - !end = plusPtr pos length - !result = p (EP.State fptr pos end 1 1 1 name newline) toOk toOk toErr toErr - in - do touchForeignPtr fptr - return result - - -toOk :: a -> EP.State -> Either x a -toOk !a _ = - Right a - - -toErr :: EP.Row -> EP.Col -> (EP.Row -> EP.Col -> x) -> Either x a -toErr row col toError = - Left (toError row col) - - -stringToByteString :: String -> B.ByteString -stringToByteString = B.pack . concatMap encodeChar - - --- https://hackage.haskell.org/package/utf8-string-1.0.2/docs/src/Codec.Binary.UTF8.String.html#encodeChar -encodeChar :: Char -> [Word8] -encodeChar = map fromIntegral . go . ord - where - go oc - | oc <= 0x7f = [oc] - - | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) - , 0x80 + oc .&. 0x3f - ] - - | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) - , 0x80 + ((oc `shiftR` 6) .&. 0x3f) - , 0x80 + oc .&. 0x3f - ] - - | otherwise = [ 0xf0 + (oc `shiftR` 18) - , 0x80 + ((oc `shiftR` 12) .&. 0x3f) - , 0x80 + ((oc `shiftR` 6) .&. 0x3f) - , 0x80 + oc .&. 0x3f - ] - - -getPosition :: Parser SourcePos -getPosition = - do (EP.State _ _ _ _ row col sourceName _) <- getParserState - return $ newPos sourceName row col - - -getInput :: Parser String -getInput = undefined - -setInput :: String -> Parser () -setInput = undefined - - -getState :: Parser State -getState = - do (EP.State _ _ _ _ _ _ _ newline) <- getParserState - return (State newline) - - -updateState :: (State -> State) -> Parser () -updateState f = - do updateParserState - (\(EP.State src pos end indent row col sourceName newline) -> - let - (State newline') = f (State newline) - in - EP.State src pos end indent row col sourceName newline' - ) - return () - - -getParserState :: Parser EP.State -getParserState = updateParserState id - - -updateParserState :: (EP.State -> EP.State) -> Parser EP.State -updateParserState f = - EP.Parser $ \s _ eok _ _ -> eok (f s) (f s) diff --git a/elm-format-lib/src/Text/ParserCombinators/Parsec/Char.hs b/elm-format-lib/src/Text/ParserCombinators/Parsec/Char.hs deleted file mode 100644 index 008fadb9f..000000000 --- a/elm-format-lib/src/Text/ParserCombinators/Parsec/Char.hs +++ /dev/null @@ -1,5 +0,0 @@ -module Text.ParserCombinators.Parsec.Char (lower) where - -import qualified Text.Parsec.Char - -lower = Text.Parsec.Char.lower diff --git a/elm-format-lib/src/Text/ParserCombinators/Parsec/Combinator.hs b/elm-format-lib/src/Text/ParserCombinators/Parsec/Combinator.hs deleted file mode 100644 index 8f80e06e9..000000000 --- a/elm-format-lib/src/Text/ParserCombinators/Parsec/Combinator.hs +++ /dev/null @@ -1,5 +0,0 @@ -module Text.ParserCombinators.Parsec.Combinator (eof) where - -import qualified Text.Parsec.Combinator - -eof = Text.Parsec.Combinator.eof diff --git a/elm-format-lib/test/Parse/ExpressionTest.hs b/elm-format-lib/test/Parse/ExpressionTest.hs index 11190137d..42f9c08a3 100644 --- a/elm-format-lib/test/Parse/ExpressionTest.hs +++ b/elm-format-lib/test/Parse/ExpressionTest.hs @@ -13,7 +13,7 @@ import Data.Coapplicative import Data.Functor.Identity import qualified Data.Indexed as I import qualified Data.Text as Text -import Text.Parsec.Char (string) +import Parse.ParsecAdapter (string) import ElmFormat.ImportInfo (ImportInfo(..)) import ElmFormat.Render.Box (formatExpression, ExpressionContext(..)) import ElmVersion diff --git a/elm-format-lib/test/Parse/HelpersTest.hs b/elm-format-lib/test/Parse/HelpersTest.hs index 0a4e1b50f..5c99b77a6 100644 --- a/elm-format-lib/test/Parse/HelpersTest.hs +++ b/elm-format-lib/test/Parse/HelpersTest.hs @@ -6,7 +6,7 @@ import Test.Tasty.HUnit import AST.V0_16 import Parse.Helpers (parens'') import Parse.IParser -import Text.ParserCombinators.Parsec.Char (lower) +import Parse.ParsecAdapter (lower) import Parse.TestHelpers diff --git a/elm-format-lib/test/Parse/TestHelpers.hs b/elm-format-lib/test/Parse/TestHelpers.hs index 092ce60a2..45b80e6d3 100644 --- a/elm-format-lib/test/Parse/TestHelpers.hs +++ b/elm-format-lib/test/Parse/TestHelpers.hs @@ -8,11 +8,11 @@ import Test.Tasty.HUnit import AST.V0_16 import AST.Structure import Data.Indexed as I +import Parse.ParsecAdapter (eof) import Parse.Helpers (iParse) import Parse.IParser import Reporting.Annotation hiding (at) import Reporting.Region -import Text.ParserCombinators.Parsec.Combinator (eof) import qualified Data.List as List import qualified Data.List.Split as List diff --git a/new-parser-2021-notes.md b/new-parser-2021-notes.md deleted file mode 100644 index 9404c65fe..000000000 --- a/new-parser-2021-notes.md +++ /dev/null @@ -1,165 +0,0 @@ -# GSOC Notes - -## About this document - -Here are various notes I (`@emmabastas`) have made during the effort to integrate the parser from the Elm compiler into elm-format as part of a [GSOC project](https://github.com/elm-tooling/gsoc-projects/issues/13). Eventually, all of these notes will live elsewhere, like source code comments of github issues, but for now they are centralized in this document. - -## Missing test coverage - -* elm-format tests unicode in string literals, but unicode can appear in variable names and possibly other places. We need to figure exactly where and how unicode can appear and add test coverage for that. - -* Should elm-format gracefully handle invalid utf-8? If so we might want test coverage for that. - - **UPDATE:** It's really uncomon for people to have invalid utf-8, and if so, they proably have bigger probles than elm-format crashing. For now, just crashig with a descriptive error message is enoug. - -## Notes on timeline - -As the timeline stands now, most of the time will be spent implementing the wrapper functions. After some more work, I don't think that this is how it will play out, I think we can be more ambitious. - -Turns out there isn't as much to implement for the adapter layer as it might seem. `Text.Parsec.Char` only really consists of two major functions to implement, `string` and `satisfy`, all the other functions are implemented in terms of `satisfy`. Functions in `Text.Parsec.Combinator` are all implemented in terms of `Text.Parsec.Prim` functions. `Text.Parsec.Pos` and `Text.Parsec.Error` will essentially be copies of their respective parsec modules with minor changes. `Text.Parsec.Indent` probably won't do much either. So that leaves us with `Text.Parsec.Prim` containing the functions that need genuine implementations, apart from the _get*_, _set*_ and _update*_ style functions. This all leaves is with very few time consuming functions to implement. The one thing that has the potential of consuming a lot of time is if these foundational functions turns out to be very tricky to get right, with failing integration tests that could be to broad to help us pinpoint the bugs. Getting the first integration test to pass feels like to most difficult thing to do right now. - -All this said, I think we can start to think about what we want to do with our time after the adapter layer has been implemented. - -## Notes on implemented functions - -* `Text.Parsec.Prim` - * `Functor`, `Applicative` and `Monad` instances for `Parser`. Uses the underlying elm parser. - - The implementations simply unwrap to the underlying `elm/compiler` parser, applies the relevant function and re-wraps. So for this to work `parsec` and `elm/compiler` need to have the same idea of how these instances should behave, which they seem to do from looking at the code. - - * instance `Applicative.Alternative` for `Parser`. Implemented exactly as it was by parsec. - - * instance `MonadPlus` for `Parser`. Risks of bugs being introduced. - - * instance `Fail.MonadFail` for `Parser`. Implementation closely matches that of parsec. - - The implementation of `mplus` is straightforward except for the error merging behaviour in parsec, where if the two parsers fails the errors are merged somehow. Not confident that I've got this right.. - - * `<|>`. Implemented exactly as it was by parsec. - - * ``. Possibly incomplete implementation. Parsec does error handling differently to the new parser, so how it's a bit unclear to me how to properly implement this, but I think this implementation should be acceptable. - - * `try`. Implemented exactly as it was by parsec. - - * `many`. Risk of bugs being introduced. - - * `skipMany`. Not to complicated. - - * `lookAhead`. Trivial implementation. - - The implementation of `many` is one of the more complex functions in parsec, and recursion is being used. Furthermore this implementation does not closely follow the original implementation. - - * `runParserT`. Closely follows the implementation of `Parse.Primitives.fromByteString` - - Difference being that `runParserT` converts the `String` to a `ByteString` first. Care has been taken that unicode points arent truncated when converting to `[Word8]` as an intermediate conversion step. - - * `getPosition`, `getState`, `updateState`. Trivial implementations. - -* `Text.Parsec.Combinator` - Almost all of the functions in this module are implemented in terms of functions found in `Text.Parsec.Prim`, and as such most of the functions are implemented exactly as they where by parserc (except for formatting changes). Only functions that somehow differ from the implementetion in parsec are listed. - - * `anyToken`. Same as `Text.Parsec.Char.anyChar` - Due to the adapter not being generic over the input type these two functions behave the same. - - * `eof`. Implemented exactly as it was by parsec - - The elm parser fails if all input isn't consumed, which makes sense in the context of compiling Elm. parsec however defaults to succeeding even if everyting isn't consumed, and that behaviour is changed by `eof`. So as of right now, `eof` becommes a NoOp, maybe `Parse.Primitives` will have to be changed to not fail on unconsumed input (for elm-format it can still make sense to not parse all input) at some point, but not right not. - -* `Text.Parsec.Char` - The functions in this module all deal with `Char` (AKA unicode). There's really only two important functions here: `satisfy` wich consumes chars as long as a predicate holds, and `string` which succeed if a given string exatcly matches what is being consumed, and fails otherwise. Some care has to be taken here because parsec deals with `Char`'s whereas the new parser deals with `Word8`'s. The current implementation handles valid utf-8, but not invalid utf-8. - -* `Text.Parsec.Indent` - * `indented`. Simple function. - - In indents `indented` function there is a `put $ setSourceLine s (sourceLine pos)` line which I don't really get. All of the functions from indents used by elm-format only care about the column of the reference, and not the row, so don't think this will be a problem. - - * `checkIndent`. Simple function. - - * `withPos`. Simple function, but I might have done in wrong.. - -## Mapping between the parsec and Elm parser - -`parsec` and `elm/compiler`'s parser very much operate on the same principles; continuation-passing style with four continuations in a parser: _empty ok_, _consumed ok_, _empty error_ and consumed error. `elm/compiler`'s parser if however less generic, it only parses bytestrings and has no concept of an "user" state. But `elm/compiler`s parser is general over the error type though, whereas `parsec` limits the user to string error messages - -Here is what the declarations for the two parsers look like: - -`parsec`. The type variables `s`, `u`, `m` and `a` represents _input data_, _user state_, _underlying monad_ and _output_ respectively. -```haskell --- The general parsec type -newtype ParsecT s u m a = - ParsecT ( - forall b . - State s u - -> (a -> State s u -> ParseError -> m b) -- consumed ok - -> (a -> State s u -> ParseError -> m b) -- empty ok - -> (ParseError -> m b) -- consumed err - -> (ParseError -> m b) -- empty err - -> m b - -data State s u = - State { - stateInput :: s, - statePos :: !SourcePos, - stateUser :: !u - } - -data SourcePos = SourcePos SourceName Line Column - -type SourceName = String -type Line = Int -type Column = Int - - --- The specific instance of parsec parser used in elm-format -type IParser a = ParsecT String UserState UnderlyingMonad a - -data UserState = UserState - { newline :: [Bool] - } - -type UnderlyingMonad = Control.Monad.State SourcePos -``` - -`elm/compiler`: -```haskell --- Elm -newtype Parser x a = - Parser ( - forall b. - State - -> (a -> State -> b) -- consumed ok - -> (a -> State -> b) -- empty ok - -> (Row -> Col -> (Row -> Col -> x) -> b) -- consumed err - -> (Row -> Col -> (Row -> Col -> x) -> b) -- empty err - -> b - ) - - -data State = - State - { _src :: ForeignPtr Word8 - , _pos :: !(Ptr Word8) - , _end :: !(Ptr Word8) - , _indent :: !Word16 - , _row :: !Row - , _col :: !Col - } - - -type Row = Word16 -type Col = Word16 -``` - -### Mapping state - -### Input -In `elm/compiler`'s parser, `_src`, `_pos` and `_end` are internals of the input bytestring to parse. For `IParser` the input is a `String`. Mapping between these datatypes is easy, all the is needed is for care to be taken that unicode data in `Char`'s isn't truncated when converting to `Word8` before creating a bytestring with `pack`. - -### Location -mapping between `elm/compiler`'s `_row`&`_col` the `Line`&`Column` in `parsec`'s `statePos :: SourcePos` is trivial. However, the `SourcePos` also stores a `SourceName`. I doubt that this is every used by `elm-format`, but if it is then the wrapper layer can store that information instead. - -### Indentation -`elm-format` uses `indents` for indentation aware parsing, which stores indentation information in the _underlying monad_, i.e. this bit `Control.Monad.State SourcePos`. `elm/compiler` stores indentation in `_indent`. The problem arises of how to map `Word16` to `SourcePos`.. Turns out that while `indents` does make use of both `Line` and `Column` in `SourcePos`, the functions actually used by `elm-format` only ever cares about `Column`, which is exactly what `_indent` maps to. So this works out as well! - -### User state -The user state in `elm-format` is the `newline :: [Bool]` bit, and I don't understand what that means. Will have to ask Aaron that.