Haskell Alex - 正则表达式匹配错误的字符串？答案

【问题标题】：Haskell Alex - regex matches wrong string?Haskell Alex - 正则表达式匹配错误的字符串？
【发布时间】：2012-06-12 16:20:54
【问题描述】：

我正在尝试为基于缩进的语法编写词法分析器，但无法匹配缩进。

这是我的代码：

{
module Lexer ( main ) where

import System.IO.Unsafe
}


%wrapper "monadUserState"

$whitespace = [\ \t\b]
$digit      = 0-9                                            -- digits
$alpha      = [A-Za-z]
$letter     = [a-zA-Z]                                       -- alphabetic characters
$ident      = [$letter $digit _]                             -- identifier character
$indent     = [\ \t]

@number     = [$digit]+
@identifier = $alpha($alpha|_|$digit)*

error:-

@identifier { mkL LVarId }

\n $whitespace* \n { skip }
\n $whitespace*    { setIndent }
$whitespace+       { skip }

{

data Lexeme = Lexeme AlexPosn LexemeClass (Maybe String)

instance Show Lexeme where
    show (Lexeme _ LEOF _)   = "  Lexeme EOF"
    show (Lexeme p cl  mbs) = " Lexeme class=" ++ show cl ++ showap p ++ showst mbs
      where
        showap pp = " posn=" ++ showPosn pp
        showst Nothing  = ""
        showst (Just s) = " string=" ++ show s

instance Eq Lexeme where
    (Lexeme _ cls1 _) == (Lexeme _ cls2 _) = cls1 == cls2

showPosn :: AlexPosn -> String
showPosn (AlexPn _ line col) = show line ++ ':': show col

tokPosn :: Lexeme -> AlexPosn
tokPosn (Lexeme p _ _) = p

data LexemeClass
    = LVarId
    | LTIndent Int
    | LTDedent Int
    | LIndent
    | LDedent
    | LEOF
    deriving (Show, Eq)

mkL :: LexemeClass -> AlexInput -> Int -> Alex Lexeme
mkL c (p, _, _, str) len = return (Lexeme p c (Just (take len str)))

data AlexUserState = AlexUserState { indent :: Int }

alexInitUserState :: AlexUserState
alexInitUserState = AlexUserState 0

type Action = AlexInput -> Int -> Alex Lexeme

getLexerIndentLevel :: Alex Int
getLexerIndentLevel = Alex $ \s@AlexState{alex_ust=ust} -> Right (s, indent ust)

setLexerIndentLevel :: Int -> Alex ()
setLexerIndentLevel i = Alex $ \s@AlexState{alex_ust=ust} -> Right (s{alex_ust=(AlexUserState i)}, ())

setIndent :: Action
setIndent input@(p, _, _, str) i = do
    --let !x = unsafePerformIO $ putStrLn $ "|matched string: " ++ str ++ "|"
    lastIndent <- getLexerIndentLevel
    currIndent <- countIndent (drop 1 str) 0 -- first char is always \n
    if (lastIndent < currIndent) then
        do setLexerIndentLevel currIndent
           mkL (LTIndent (currIndent - lastIndent)) input i
    else if (lastIndent > currIndent) then
        do setLexerIndentLevel currIndent
           mkL (LTDedent (lastIndent - currIndent)) input i
    else alexMonadScan
  where
    countIndent str total
        | take 1 str == "\t" = do skip input 1
                                  countIndent (drop 1 str) (total+1)
        | take 4 str == "    " = do skip input 4
                                    countIndent (drop 4 str) (total+1)
        | otherwise = return total

alexEOF :: Alex Lexeme
alexEOF = return (Lexeme undefined LEOF Nothing)

scanner :: String -> Either String [Lexeme]
scanner str =
    let loop = do
        tok@(Lexeme _ cl _) <- alexMonadScan
        if (cl == LEOF)
            then return [tok]
            else do toks <- loop
                    return (tok:toks)
    in runAlex str loop

addIndentations :: [Lexeme] -> [Lexeme]
addIndentations (lex@(Lexeme pos (LTIndent c) _):ls) =
    concat [iter lex c, addIndentations ls]
  where iter lex c = if c == 0 then []
                     else (Lexeme pos LIndent Nothing):(iter lex (c-1))
addIndentations (lex@(Lexeme pos (LTDedent c) _):ls) =
    concat [iter lex c, addIndentations ls]
  where iter lex c = if c == 0 then []
                     else (Lexeme pos LDedent Nothing):(iter lex (c-1))
addIndentations (l:ls) = l:(addIndentations ls)
addIndentations [] = []


main = do
    s <- getContents
    return ()
    print $ fmap addIndentations (scanner s)

}

问题是在\n $whitespace* { setIndent } 行中，正则表达式匹配错误的字符串，并用这个错误的字符串调用setIndent。出于调试目的，我在setIndent函数中添加了unsafePerformIO，下面是程序运行示例：

begin       
        first indent
|matched string: 
        first indent
                second indent
                second indent
dedent
dedent
|
|matched string: 
                second indent
dedent
|
|matched string: 
dedent
|
|matched string: 
|
Right [ Lexeme class=LVarId posn=1:1 string="begin", Lexeme class=LIndent posn=1:6, Lexeme class=LVarId posn=2:15 string="indent", Lexeme class=LIndent posn=2:21, Lexeme class=LDedent posn=3:30, Lexeme class=LDedent posn=3:30, Lexeme class=LVarId posn=4:1 string="dedent",  Lexeme EOF]

所以setIndent 被调用的不仅仅是空格。并且返回缩进的词位后，字符串的其他部分被省略。

这是 Alex 的错误吗？还是我做错了什么？

【问题讨论】：

标签： regex haskell lexical-analysis alex

【解决方案1】：

所以我没有详细分析你的代码，但我确实注意到了这一点：

setIndent :: Action
setIndent input@(p, _, _, str) i = do
    --let !x = unsafePerformIO $ putStrLn $ "|matched string: " ++ str ++ "|"

请注意，str 是输入的其余部分，而不仅仅是当前标记。要获取当前令牌，您需要take i str。也许这会给您一种印象，即令牌匹配的输入比实际更多。

当然，我们在 GHC 自己的词法分析器中处理缩进，因此您可能想要 look there for ideas（尽管您可能期望它相当大且复杂）。

【讨论】：

感谢您的回复，所以我更改了代码，现在它可以在take i str 上运行，但没有任何改变。在setIndent 返回缩进标记后，Alex 省略了该行的其余部分（因为 Alex 操作函数不决定输入字符串的下一部分，我认为这与我的 setIndent 无关函数）
好吧，你的正则表达式说它匹配一个换行符后跟零个或多个空格字符，这看起来正是你的调试输出中发生的事情。
它匹配空白字符，但它也匹配空白字符后面的一些附加字符，并且在换行符后计算空白字符的正则表达式之后的正则表达式不匹配这些字符。如果您实际运行代码并键入一些缩进的行（缩进是制表符的 4 个空格），您会看到缩进标记之后的标记以某种方式被省略了。