从正则表达式到 NFA 和 DFA [关闭]答案

【问题标题】：From a regular expression to NFA and to DFA [closed]从正则表达式到 NFA 和 DFA [关闭]
【发布时间】：2020-03-23 05:09:41
【问题描述】：

我有以下正则表达式：

[A-Z]*01*[^A-Z]{3}

字母表是[A-Z][0-9]。

有人能解释一下先转换为 NFA、再转换为 DFA 的正确方法吗？另外，如何用 C 实现 ε-闭包（epsilon closure）？

【问题讨论】：

标签： c regex dfa nfa

【解决方案1】：

使用 ε-moves (NFA-ε) 实现 NFA

首先，在这种特殊情况下，可以把 36 个字符的字母表简化为 4 个互不相交的字符类。

令 \1 表示 [A-Z]
令 \2 表示 [^A-Z01]，也就是 [2-9]

这会将我们的字母表缩减为\1、0、1 和\2。

[A-Z]*01*[^A-Z]{3} 变为 \1*01*[01\2]{3}

Thompson's construction 可用于将正则表达式转换为 NFA-ε。

\1*01*[01\2]{3} 等价于\1*01*[01\2][01\2][01\2]，因此我们得到：

我们可以使用上面描述的状态图为 NFA-ε 生成以下转换表：

      ε       \1      0       1       \2
      ------- ------- ------- ------- -------
S00   S01,S03                                   Starting State
S01           S02
S02   S01,S03
S03                   S04
S04   S05,S07
S05                           S06
S06   S05,S07
S07                   S08     S08     S08
S08                   S09     S09     S09
S09                   S10     S10     S10
S10                                             Accepting State

当然，显然也可以直接使用下面的 NFA：

实现起来会简单得多，因为实现 NFA 引擎比实现 NFA-ε 引擎要简单得多。但是，通过以下步骤将 NFA-ε 转换为 NFA 很简单：

展平递归 ε-moves。

例如，

S00--ε-->{S01}
S01--ε-->{S02}
S02--ε-->{S03}
S03--1-->{S04}

变成

S00--ε-->{S01,S02,S03}
S01--ε-->{S02,S03}
S02--ε-->{S03}
S03--1-->{S04}

合并通过 ε-moves 达到的状态转换。

例如，

S00--ε-->{S01,S02,S03}
S01--ε-->{S02,S03}
S02--ε-->{S03}
S03--1-->{S04}

变成

S00--ε-->{S01,S02,S03}
S00--1-->{S04}
S01--ε-->{S02,S03}
S01--1-->{S04}
S02--ε-->{S03}
S02--1-->{S04}
S03--1-->{S04}

将通过 ε-moves 达到接受状态的状态添加到接受状态集。
删除 ε-moves。

对于我们的 NFA-ε，我们得到以下结果：

我们可以更进一步，清理 NFA。

存在可以删除的无法访问的状态。（S01、S03、S05 和 S07 不可达。）
可以组合相同的状态。（S00 和 S02 具有相同的转换，S04 和 S06 也是如此。）

执行这些清理后，我们得到

是不是很眼熟？

是时候实施了！以下实现了上面的 NFA-ε，但在处理输入之前使用上述步骤自动将其转换为 NFA。（它不执行任何清理步骤。）

#include <stdio.h>
#include <stdint.h>


/*
   Input classes. Each class is a column index into the transition table:

   ε                 => 0
   'A'..'Z' = 41..5A => 1
   '0'      = 30     => 2
   '1'      = 31     => 3
   '2'..'9' = 32..39 => 4

   Every other 7-bit character (including NUL) has no class and maps to -1.
*/

#define NUM_INPUTS 5

// `_` marks characters outside the alphabet so the meaningful cells stand out.
#define _ (-1)

// Lookup table from a 7-bit character code to its input class (or -1).
// Read-only: nothing in the program ever needs to modify it.
static const int char_to_input[0x80] = {
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 00..0F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 10..1F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 20..2F
    2, 3, 4, 4, 4, 4, 4, 4, 4, 4, _, _, _, _, _, _,  // 30..3F  '0', '1', '2'..'9'
    _, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F  'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, _, _, _, _, _,  // 50..5F  'P'..'Z'
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 60..6F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 70..7F
};

#undef _


/*
   Each NFA state is identified by one bit, so a set of states is a single
   integer bit mask: the OR of the masks of its members.
*/
#define S(x) (1 << (x))   // Mask for state number x (argument parenthesized so any expression is safe).
#define S00 S( 0)
#define S01 S( 1)
#define S02 S( 2)
#define S03 S( 3)
#define S04 S( 4)
#define S05 S( 5)
#define S06 S( 6)
#define S07 S( 7)
#define S08 S( 8)
#define S09 S( 9)
#define S10 S(10)

// `_` is the empty state set; it is spelled this way so populated cells stand out.
#define _ 0

/*
   Transition table of the NFA-ε. transitions[state][input] is the set of
   states reachable from `state` on `input`. Column 0 holds the ε-moves;
   columns 1..4 are the input classes \1 ([A-Z]), 0, 1 and \2 ([2-9]).

   Deliberately not const: main() rewrites the table in place when it
   eliminates the ε-moves.
*/
static uint_least16_t transitions[][NUM_INPUTS] = {
   //       ε  ,   \1  ,    0  ,    1  ,   \2
   // -------  ,  ---  ,  ---  ,  ---  ,  ---
   {  S01|S03  ,  _    ,  _    ,  _    ,  _    },   // S00   Starting State
   {  _        ,  S02  ,  _    ,  _    ,  _    },   // S01
   {  S01|S03  ,  _    ,  _    ,  _    ,  _    },   // S02
   {  _        ,  _    ,  S04  ,  _    ,  _    },   // S03
   {  S05|S07  ,  _    ,  _    ,  _    ,  _    },   // S04
   {  _        ,  _    ,  _    ,  S06  ,  _    },   // S05
   {  S05|S07  ,  _    ,  _    ,  _    ,  _    },   // S06
   {  _        ,  _    ,  S08  ,  S08  ,  S08  },   // S07
   {  _        ,  _    ,  S09  ,  S09  ,  S09  },   // S08
   {  _        ,  _    ,  S10  ,  S10  ,  S10  },   // S09
   {  _        ,  _    ,  _    ,  _    ,  _    },   // S10   Accepting State
};

#undef _

// Set of states the automaton occupies before any input has been consumed.
uint_least16_t STATES_START  = S00;

// Set of accepting states. main() enlarges it while eliminating ε-moves.
uint_least16_t STATES_ACCEPT = S10;

// The empty state set: once no state is active, the input cannot match.
uint_least16_t STATES_REJECT = 0;

// Number of rows (states) in the transition table.
#define NUM_STATES (sizeof transitions / sizeof transitions[0])


/*
   Matches argv[1] against the automaton described by `transitions`.

   The NFA-ε table is first converted in place into a plain NFA (no ε-moves),
   then the input string is run through it one character at a time while
   tracking the set of active states as a bit mask.

   Exit status: 0 if the whole string matches, 1 if it does not,
   2 if the program was not given exactly one argument.
*/
int main(int argc, char** argv) {
   if (argc != 2) {
      fprintf(stderr, "usage\n");
      return 2;
   }

   // Flatten recursive ε-moves: replace each state's ε-set with its
   // transitive closure.
   for (size_t s1=0; s1<NUM_STATES; ++s1) {
      uint_least16_t e_states = transitions[s1][0];
      uint_least16_t prev_e_states;

      // Iterate to a fixed point. The comparison must be against the value
      // from the previous pass; comparing against the untouched table entry
      // would never terminate once the closure grows.
      do {
         prev_e_states = e_states;
         for (size_t s2=0; s2<NUM_STATES; ++s2) {
            if (e_states & S(s2)) {
               e_states |= transitions[s2][0];
            }
         }
      } while (e_states != prev_e_states);

      transitions[s1][0] = e_states;
   }

   // Convert NFA-ε into NFA.
   {
      // For each state s1,
      // for each state s2,
      // if s1 has an ε-move to s2,
      // merge s2's transitions into s1's.
      for (size_t s1=0; s1<NUM_STATES; ++s1) {
         uint_least16_t e_states = transitions[s1][0];
         for (size_t s2=0; s2<NUM_STATES; ++s2) {
            if (!(e_states & S(s2))) {
               continue;
            }

            // Column 0 (ε) is skipped: it is no longer consulted afterwards.
            for (size_t i=1; i<NUM_INPUTS; ++i) {
               transitions[s1][i] |= transitions[s2][i];
            }
         }
      }

      // Any state with an ε-move to an accepting state becomes accepting
      // itself. One mask intersection covers every candidate target.
      for (size_t s1=0; s1<NUM_STATES; ++s1) {
         if (transitions[s1][0] & STATES_ACCEPT) {
            STATES_ACCEPT |= S(s1);
         }
      }
   }

   // NFA engine
   {
      const char *str = argv[1];
      uint_least16_t states = STATES_START;

      for (size_t i = 0; ; ++i) {
         unsigned char ch = (unsigned char)str[i];

         // End of input: it is a match if any active state is accepting.
         if (!ch && (states & STATES_ACCEPT)) {
            fprintf(stdout, "Match.\n");
            return 0;
         }

         // Classify the character. The terminating NUL (input ended in a
         // non-accepting configuration) and bytes outside the 7-bit table
         // have no input class, which empties the state set below.
         int input = (ch != 0 && ch < 0x80) ? char_to_input[ch] : -1;

         // Union of the transitions of every active state on this input.
         uint_least16_t new_states = STATES_REJECT;
         if (input >= 0) {
            for (size_t s=0; s<NUM_STATES; ++s) {
               if (states & S(s)) {
                  new_states |= transitions[s][input];
               }
            }
         }
         states = new_states;

         if (states == STATES_REJECT) {
            fprintf(stderr, "Not a match. (Failed at offset %zu.)\n", i);
            return 1;
         }
      }
   }
}

（我使用_ 来突出重要的值。）

可以使用幂集构造（powerset construction）将 NFA 转换为 DFA，但这样做没有多大意义。

【讨论】：