11.2.0/bits/regex_compiler.tcc

// class template regex -*- C++ -*-

// Copyright (C) 2013-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

/**
 *  @file bits/regex_compiler.tcc
 *  This is an internal header file, included by other library headers.
 *  Do not attempt to use it directly. @headername{regex}
 */

// FIXME make comments doxygen format.

/*
// This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
// (http://swtch.com/~rsc/regexp/regexp1.html),
// but doesn't strictly follow it.
//
// When compiling, states are *chained* instead of tree- or graph-constructed.
// It's more like structured programs: there's if statement and loop statement.
//
// For alternative structure (say "a|b"), aka "if statement", two branches
// should be constructed. However, these two shall merge to an "end_tag" at
// the end of this operator:
//
//                branch1
//              /        \
// => begin_tag            end_tag =>
//              \        /
//                branch2
//
// This is the difference between this implementation and that in Russ's
// article.
//
// That's why we introduced dummy node here ------ "end_tag" is a dummy node.
// All dummy nodes will be eliminated at the end of compilation.
*/

namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION

namespace __detail
{
  template<typename _TraitsT>
    _Compiler<_TraitsT>::
    _Compiler(_IterT __b, _IterT __e,
              const typename _TraitsT::locale_type& __loc, _FlagT __flags)
    : _M_flags((__flags
                & (regex_constants::ECMAScript
                   | regex_constants::basic
                   | regex_constants::extended
                   | regex_constants::grep
                   | regex_constants::egrep
                   | regex_constants::awk))
               ? __flags
               : __flags | regex_constants::ECMAScript),
      _M_scanner(__b, __e, _M_flags, __loc),
      _M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
      _M_traits(_M_nfa->_M_traits),
      _M_ctype(std::use_facet<_CtypeT>(__loc))
    {
      _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
      __r._M_append(_M_nfa->_M_insert_subexpr_begin());
      this->_M_disjunction();
      if (!_M_match_token(_ScannerT::_S_token_eof))
        __throw_regex_error(regex_constants::error_paren);
      __r._M_append(_M_pop());
      __glibcxx_assert(_M_stack.empty());
      __r._M_append(_M_nfa->_M_insert_subexpr_end());
      __r._M_append(_M_nfa->_M_insert_accept());
      _M_nfa->_M_eliminate_dummy();
    }

  template<typename _TraitsT>
    void
    _Compiler<_TraitsT>::
    _M_disjunction()
    {
      this->_M_alternative();
      while (_M_match_token(_ScannerT::_S_token_or))
        {
          _StateSeqT __alt1 = _M_pop();
          this->_M_alternative();
          _StateSeqT __alt2 = _M_pop();
          auto __end = _M_nfa->_M_insert_dummy();
          __alt1._M_append(__end);
          __alt2._M_append(__end);
          // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
          // executes _M_alt before _M_next, as well as executing left
          // alternative before right one.
          _M_stack.push(_StateSeqT(*_M_nfa,
                                   _M_nfa->_M_insert_alt(
                                     __alt2._M_start, __alt1._M_start, false),
                                   __end));
        }
    }

  template<typename _TraitsT>
    void
    _Compiler<_TraitsT>::
    _M_alternative()
    {
      if (this->_M_term())
        {
          _StateSeqT __re = _M_pop();
          this->_M_alternative();
          __re._M_append(_M_pop());
          _M_stack.push(__re);
        }
      else
        _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
    }

  template<typename _TraitsT>
    bool
    _Compiler<_TraitsT>::
    _M_term()
    {
      if (this->_M_assertion())
        return true;
      if (this->_M_atom())
        {
          while (this->_M_quantifier());
          return true;
        }
      return false;
    }

  template<typename _TraitsT>
    bool
    _Compiler<_TraitsT>::
    _M_assertion()
    {
      if (_M_match_token(_ScannerT::_S_token_line_begin))
        _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
      else if (_M_match_token(_ScannerT::_S_token_line_end))
        _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
      else if (_M_match_token(_ScannerT::_S_token_word_bound))
        // _M_value[0] == 'n' means it's negative, say "not word boundary".
        _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
              _M_insert_word_bound(_M_value[0] == 'n')));
      else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
        {
          auto __neg = _M_value[0] == 'n';
          this->_M_disjunction();
          if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
            __throw_regex_error(regex_constants::error_paren,
                                "Parenthesis is not closed.");
          auto __tmp = _M_pop();
          __tmp._M_append(_M_nfa->_M_insert_accept());
          _M_stack.push(
              _StateSeqT(
                *_M_nfa,
                _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
        }
      else
        return false;
      return true;
    }

  template<typename _TraitsT>
    bool
    _Compiler<_TraitsT>::
    _M_quantifier()
    {
      bool __neg = (_M_flags & regex_constants::ECMAScript);
      auto __init = [this, &__neg]()
        {
          if (_M_stack.empty())
            __throw_regex_error(regex_constants::error_badrepeat,
                                "Nothing to repeat before a quantifier.");
          __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
        };
      if (_M_match_token(_ScannerT::_S_token_closure0))
        {
          __init();
          auto __e = _M_pop();
          _StateSeqT __r(*_M_nfa,
                         _M_nfa->_M_insert_repeat(_S_invalid_state_id,
                                                  __e._M_start, __neg));
          __e._M_append(__r);
          _M_stack.push(__r);
        }
      else if (_M_match_token(_ScannerT::_S_token_closure1))
        {
          __init();
          auto __e = _M_pop();
          __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
                                                 __e._M_start, __neg));
          _M_stack.push(__e);
        }
      else if (_M_match_token(_ScannerT::_S_token_opt))
        {
          __init();
          auto __e = _M_pop();
          auto __end = _M_nfa->_M_insert_dummy();
          _StateSeqT __r(*_M_nfa,
                         _M_nfa->_M_insert_repeat(_S_invalid_state_id,
                                                  __e._M_start, __neg));
          __e._M_append(__end);
          __r._M_append(__end);
          _M_stack.push(__r);
        }
      else if (_M_match_token(_ScannerT::_S_token_interval_begin))
        {
          if (_M_stack.empty())
            __throw_regex_error(regex_constants::error_badrepeat,
                                "Nothing to repeat before a quantifier.");
          if (!_M_match_token(_ScannerT::_S_token_dup_count))
            __throw_regex_error(regex_constants::error_badbrace,
                                "Unexpected token in brace expression.");
          _StateSeqT __r(_M_pop());
          _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
          long __min_rep = _M_cur_int_value(10);
          bool __infi = false;
          long __n = 0;

          // {3
          if (_M_match_token(_ScannerT::_S_token_comma))
            {
              if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
                __n = _M_cur_int_value(10) - __min_rep;
              else
                __infi = true;
            }
          if (!_M_match_token(_ScannerT::_S_token_interval_end))
            __throw_regex_error(regex_constants::error_brace,
                                "Unexpected end of brace expression.");

          __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);

          for (long __i = 0; __i < __min_rep; ++__i)
            __e._M_append(__r._M_clone());

          if (__infi)
            {
              auto __tmp = __r._M_clone();
              _StateSeqT __s(*_M_nfa,
                             _M_nfa->_M_insert_repeat(_S_invalid_state_id,
                                                      __tmp._M_start, __neg));
              __tmp._M_append(__s);
              __e._M_append(__s);
            }
          else
            {
              if (__n < 0)
                __throw_regex_error(regex_constants::error_badbrace,
                                    "Invalid range in brace expression.");
              auto __end = _M_nfa->_M_insert_dummy();
              // _M_alt is the "match more" branch, and _M_next is the
              // "match less" one. Switch _M_alt and _M_next of all created
              // nodes. This is a hack but IMO works well.
              std::stack<_StateIdT> __stack;
              for (long __i = 0; __i < __n; ++__i)
                {
                  auto __tmp = __r._M_clone();
                  auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
                                                        __end, __neg);
                  __stack.push(__alt);
                  __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
                }
              __e._M_append(__end);
              while (!__stack.empty())
                {
                  auto& __tmp = (*_M_nfa)[__stack.top()];
                  __stack.pop();
                  std::swap(__tmp._M_next, __tmp._M_alt);
                }
            }
          _M_stack.push(__e);
        }
      else
        return false;
      return true;
    }

#define __INSERT_REGEX_MATCHER(__func, ...)\
        do {\
          if (!(_M_flags & regex_constants::icase))\
            if (!(_M_flags & regex_constants::collate))\
              __func<false, false>(__VA_ARGS__);\
            else\
              __func<false, true>(__VA_ARGS__);\
          else\
            if (!(_M_flags & regex_constants::collate))\
              __func<true, false>(__VA_ARGS__);\
            else\
              __func<true, true>(__VA_ARGS__);\
        } while (false)

  template<typename _TraitsT>
    bool
    _Compiler<_TraitsT>::
    _M_atom()
    {
      if (_M_match_token(_ScannerT::_S_token_anychar))
        {
          if (!(_M_flags & regex_constants::ECMAScript))
            __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
          else
            __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
        }
      else if (_M_try_char())
        __INSERT_REGEX_MATCHER(_M_insert_char_matcher);
      else if (_M_match_token(_ScannerT::_S_token_backref))
        _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
                                 _M_insert_backref(_M_cur_int_value(10))));
      else if (_M_match_token(_ScannerT::_S_token_quoted_class))
        __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
      else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
        {
          _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
          this->_M_disjunction();
          if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
            __throw_regex_error(regex_constants::error_paren,
                                "Parenthesis is not closed.");
          __r._M_append(_M_pop());
          _M_stack.push(__r);
        }
      else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
        {
          _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
          this->_M_disjunction();
          if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
            __throw_regex_error(regex_constants::error_paren,
                                "Parenthesis is not closed.");
          __r._M_append(_M_pop());
          __r._M_append(_M_nfa->_M_insert_subexpr_end());
          _M_stack.push(__r);
        }
      else if (!_M_bracket_expression())
        return false;
      return true;
    }

  template<typename _TraitsT>
    bool
    _Compiler<_TraitsT>::
    _M_bracket_expression()
    {
      bool __neg =
        _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
      if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
        return false;
      __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
      return true;
    }
#undef __INSERT_REGEX_MATCHER

  template<typename _TraitsT>
  template<bool __icase, bool __collate>
    void
    _Compiler<_TraitsT>::
    _M_insert_any_matcher_ecma()
    {
      _M_stack.push(_StateSeqT(*_M_nfa,
        _M_nfa->_M_insert_matcher
          (_AnyMatcher<_TraitsT, true, __icase, __collate>
            (_M_traits))));
    }

  template<typename _TraitsT>
  template<bool __icase, bool __collate>
    void
    _Compiler<_TraitsT>::
    _M_insert_any_matcher_posix()
    {
      _M_stack.push(_StateSeqT(*_M_nfa,
        _M_nfa->_M_insert_matcher
          (_AnyMatcher<_TraitsT, false, __icase, __collate>
            (_M_traits))));
    }

  template<typename _TraitsT>
  template<bool __icase, bool __collate>
    void
    _Compiler<_TraitsT>::
    _M_insert_char_matcher()
    {
      _M_stack.push(_StateSeqT(*_M_nfa,
        _M_nfa->_M_insert_matcher
          (_CharMatcher<_TraitsT, __icase, __collate>
            (_M_value[0], _M_traits))));
    }

  template<typename _TraitsT>
  template<bool __icase, bool __collate>
    void
    _Compiler<_TraitsT>::
    _M_insert_character_class_matcher()
    {
      __glibcxx_assert(_M_value.size() == 1);
      _BracketMatcher<_TraitsT, __icase, __collate> __matcher
        (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
      __matcher._M_add_character_class(_M_value, false);
      __matcher._M_ready();
      _M_stack.push(_StateSeqT(*_M_nfa,
        _M_nfa->_M_insert_matcher(std::move(__matcher))));
    }

  template<typename _TraitsT>
  template<bool __icase, bool __collate>
    void
    _Compiler<_TraitsT>::
    _M_insert_bracket_matcher(bool __neg)
    {
      _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
      pair<bool, _CharT> __last_char; // Optional<_CharT>
      __last_char.first = false;
      if (!(_M_flags & regex_constants::ECMAScript))
        {
          if (_M_try_char())
            {
              __last_char.first = true;
              __last_char.second = _M_value[0];
            }
          else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
            {
              __last_char.first = true;
              __last_char.second = '-';
            }
        }
      while (_M_expression_term(__last_char, __matcher));
      if (__last_char.first)
        __matcher._M_add_char(__last_char.second);
      __matcher._M_ready();
      _M_stack.push(_StateSeqT(
                      *_M_nfa,
                      _M_nfa->_M_insert_matcher(std::move(__matcher))));
    }

  template<typename _TraitsT>
  template<bool __icase, bool __collate>
    bool
    _Compiler<_TraitsT>::
    _M_expression_term(pair<bool, _CharT>& __last_char,
                       _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
    {
      if (_M_match_token(_ScannerT::_S_token_bracket_end))
        return false;

      const auto __push_char = [&](_CharT __ch)
      {
        if (__last_char.first)
          __matcher._M_add_char(__last_char.second);
        else
          __last_char.first = true;
        __last_char.second = __ch;
      };
      const auto __flush = [&]
      {
        if (__last_char.first)
          {
            __matcher._M_add_char(__last_char.second);
            __last_char.first = false;
          }
      };

      if (_M_match_token(_ScannerT::_S_token_collsymbol))
        {
          auto __symbol = __matcher._M_add_collate_element(_M_value);
          if (__symbol.size() == 1)
            __push_char(__symbol[0]);
          else
            __flush();
        }
      else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
        {
          __flush();
          __matcher._M_add_equivalence_class(_M_value);
        }
      else if (_M_match_token(_ScannerT::_S_token_char_class_name))
        {
          __flush();
          __matcher._M_add_character_class(_M_value, false);
        }
      else if (_M_try_char())
        __push_char(_M_value[0]);
      // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
      // except when the '-' is the first or last character in the bracket
      // expression ([--0]). ECMAScript treats all '-' after a range as a
      // normal character. Also see above, where _M_expression_term gets called.
      //
      // As a result, POSIX rejects [-----], but ECMAScript doesn't.
      // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
      // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
      //
      // It turns out that no one reads BNFs ;)
      else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
        {
          if (!__last_char.first)
            {
              if (!(_M_flags & regex_constants::ECMAScript))
                {
                  if (_M_match_token(_ScannerT::_S_token_bracket_end))
                    {
                      __push_char('-');
                      return false;
                    }
                  __throw_regex_error(
                    regex_constants::error_range,
                    "Unexpected dash in bracket expression. For POSIX syntax, "
                    "a dash is not treated literally only when it is at "
                    "beginning or end.");
                }
              __push_char('-');
            }
          else
            {
              if (_M_try_char())
                {
                  __matcher._M_make_range(__last_char.second, _M_value[0]);
                  __last_char.first = false;
                }
              else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
                {
                  __matcher._M_make_range(__last_char.second, '-');
                  __last_char.first = false;
                }
              else
                {
                  if (_M_scanner._M_get_token()
                      != _ScannerT::_S_token_bracket_end)
                    __throw_regex_error(
                      regex_constants::error_range,
                      "Character is expected after a dash.");
                  __push_char('-');
                }
            }
        }
      else if (_M_match_token(_ScannerT::_S_token_quoted_class))
        {
          __flush();
          __matcher._M_add_character_class(_M_value,
                                           _M_ctype.is(_CtypeT::upper,
                                                       _M_value[0]));
        }
      else
        __throw_regex_error(regex_constants::error_brack,
                            "Unexpected character in bracket expression.");

      return true;
    }

  template<typename _TraitsT>
    bool
    _Compiler<_TraitsT>::
    _M_try_char()
    {
      bool __is_char = false;
      if (_M_match_token(_ScannerT::_S_token_oct_num))
        {
          __is_char = true;
          _M_value.assign(1, _M_cur_int_value(8));
        }
      else if (_M_match_token(_ScannerT::_S_token_hex_num))
        {
          __is_char = true;
          _M_value.assign(1, _M_cur_int_value(16));
        }
      else if (_M_match_token(_ScannerT::_S_token_ord_char))
        __is_char = true;
      return __is_char;
    }

  template<typename _TraitsT>
    bool
    _Compiler<_TraitsT>::
    _M_match_token(_TokenT token)
    {
      if (token == _M_scanner._M_get_token())
        {
          _M_value = _M_scanner._M_get_value();
          _M_scanner._M_advance();
          return true;
        }
      return false;
    }

  template<typename _TraitsT>
    int
    _Compiler<_TraitsT>::
    _M_cur_int_value(int __radix)
    {
      long __v = 0;
      for (typename _StringT::size_type __i = 0;
           __i < _M_value.length(); ++__i)
        __v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
      return __v;
    }

  template<typename _TraitsT, bool __icase, bool __collate>
    bool
    _BracketMatcher<_TraitsT, __icase, __collate>::
    _M_apply(_CharT __ch, false_type) const
    {
      return [this, __ch]
      {
        if (std::binary_search(_M_char_set.begin(), _M_char_set.end(),
                               _M_translator._M_translate(__ch)))
          return true;
        auto __s = _M_translator._M_transform(__ch);
        for (auto& __it : _M_range_set)
          if (_M_translator._M_match_range(__it.first, __it.second, __s))
            return true;
        if (_M_traits.isctype(__ch, _M_class_set))
          return true;
        if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
                      _M_traits.transform_primary(&__ch, &__ch+1))
            != _M_equiv_set.end())
          return true;
        for (auto& __it : _M_neg_class_set)
          if (!_M_traits.isctype(__ch, __it))
            return true;
        return false;
      }() ^ _M_is_non_matching;
    }
} // namespace __detail

_GLIBCXX_END_NAMESPACE_VERSION
} // namespace
Revision:	1166
Committed:	Tue Oct 26 14:22:36 2021 UTC (4 years ago) by rossy
File size:	18919 byte(s)
Log Message:	Daodan: Replace MinGW build env with an up-to-date MSYS2 env
#	Content
1	// class template regex -- C++ --
2
3	// Copyright (C) 2013-2021 Free Software Foundation, Inc.
4	//
5	// This file is part of the GNU ISO C++ Library. This library is free
6	// software; you can redistribute it and/or modify it under the
7	// terms of the GNU General Public License as published by the
8	// Free Software Foundation; either version 3, or (at your option)
9	// any later version.
10
11	// This library is distributed in the hope that it will be useful,
12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	// GNU General Public License for more details.
15
16	// Under Section 7 of GPL version 3, you are granted additional
17	// permissions described in the GCC Runtime Library Exception, version
18	// 3.1, as published by the Free Software Foundation.
19
20	// You should have received a copy of the GNU General Public License and
21	// a copy of the GCC Runtime Library Exception along with this program;
22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23	// <http://www.gnu.org/licenses/>.
24
25	/**
26	* @file bits/regex_compiler.tcc
27	* This is an internal header file, included by other library headers.
28	* Do not attempt to use it directly. @headername{regex}
29	*/
30
31	// FIXME make comments doxygen format.
32
33	/*
34	// This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
35	// (http://swtch.com/~rsc/regexp/regexp1.html),
36	// but doesn't strictly follow it.
37	//
38	// When compiling, states are chained instead of tree- or graph-constructed.
39	// It's more like structured programs: there's if statement and loop statement.
40	//
41	// For alternative structure (say "a\|b"), aka "if statement", two branches
42	// should be constructed. However, these two shall merge to an "end_tag" at
43	// the end of this operator:
44	//
45	// branch1
46	// / \
47	// => begin_tag end_tag =>
48	// \ /
49	// branch2
50	//
51	// This is the difference between this implementation and that in Russ's
52	// article.
53	//
54	// That's why we introduced dummy node here ------ "end_tag" is a dummy node.
55	// All dummy nodes will be eliminated at the end of compilation.
56	*/
57
58	namespace std _GLIBCXX_VISIBILITY(default)
59	{
60	_GLIBCXX_BEGIN_NAMESPACE_VERSION
61
62	namespace __detail
63	{
64	template<typename _TraitsT>
65	_Compiler<_TraitsT>::
66	_Compiler(_IterT __b, _IterT __e,
67	const typename _TraitsT::locale_type& __loc, _FlagT __flags)
68	: _M_flags((__flags
69	& (regex_constants::ECMAScript
70	\| regex_constants::basic
71	\| regex_constants::extended
72	\| regex_constants::grep
73	\| regex_constants::egrep
74	\| regex_constants::awk))
75	? __flags
76	: __flags \| regex_constants::ECMAScript),
77	_M_scanner(__b, __e, _M_flags, __loc),
78	_M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
79	_M_traits(_M_nfa->_M_traits),
80	_M_ctype(std::use_facet<_CtypeT>(__loc))
81	{
82	_StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
83	__r._M_append(_M_nfa->_M_insert_subexpr_begin());
84	this->_M_disjunction();
85	if (!_M_match_token(_ScannerT::_S_token_eof))
86	__throw_regex_error(regex_constants::error_paren);
87	__r._M_append(_M_pop());
88	__glibcxx_assert(_M_stack.empty());
89	__r._M_append(_M_nfa->_M_insert_subexpr_end());
90	__r._M_append(_M_nfa->_M_insert_accept());
91	_M_nfa->_M_eliminate_dummy();
92	}
93
94	template<typename _TraitsT>
95	void
96	_Compiler<_TraitsT>::
97	_M_disjunction()
98	{
99	this->_M_alternative();
100	while (_M_match_token(_ScannerT::_S_token_or))
101	{
102	_StateSeqT __alt1 = _M_pop();
103	this->_M_alternative();
104	_StateSeqT __alt2 = _M_pop();
105	auto __end = _M_nfa->_M_insert_dummy();
106	__alt1._M_append(__end);
107	__alt2._M_append(__end);
108	// __alt2 is state._M_next, __alt1 is state._M_alt. The executor
109	// executes _M_alt before _M_next, as well as executing left
110	// alternative before right one.
111	_M_stack.push(_StateSeqT(*_M_nfa,
112	_M_nfa->_M_insert_alt(
113	__alt2._M_start, __alt1._M_start, false),
114	__end));
115	}
116	}
117
118	template<typename _TraitsT>
119	void
120	_Compiler<_TraitsT>::
121	_M_alternative()
122	{
123	if (this->_M_term())
124	{
125	_StateSeqT __re = _M_pop();
126	this->_M_alternative();
127	__re._M_append(_M_pop());
128	_M_stack.push(__re);
129	}
130	else
131	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
132	}
133
134	template<typename _TraitsT>
135	bool
136	_Compiler<_TraitsT>::
137	_M_term()
138	{
139	if (this->_M_assertion())
140	return true;
141	if (this->_M_atom())
142	{
143	while (this->_M_quantifier());
144	return true;
145	}
146	return false;
147	}
148
149	template<typename _TraitsT>
150	bool
151	_Compiler<_TraitsT>::
152	_M_assertion()
153	{
154	if (_M_match_token(_ScannerT::_S_token_line_begin))
155	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
156	else if (_M_match_token(_ScannerT::_S_token_line_end))
157	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
158	else if (_M_match_token(_ScannerT::_S_token_word_bound))
159	// _M_value[0] == 'n' means it's negative, say "not word boundary".
160	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
161	_M_insert_word_bound(_M_value[0] == 'n')));
162	else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
163	{
164	auto __neg = _M_value[0] == 'n';
165	this->_M_disjunction();
166	if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
167	__throw_regex_error(regex_constants::error_paren,
168	"Parenthesis is not closed.");
169	auto __tmp = _M_pop();
170	__tmp._M_append(_M_nfa->_M_insert_accept());
171	_M_stack.push(
172	_StateSeqT(
173	*_M_nfa,
174	_M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
175	}
176	else
177	return false;
178	return true;
179	}
180
181	template<typename _TraitsT>
182	bool
183	_Compiler<_TraitsT>::
184	_M_quantifier()
185	{
186	bool __neg = (_M_flags & regex_constants::ECMAScript);
187	auto __init = [this, &__neg]()
188	{
189	if (_M_stack.empty())
190	__throw_regex_error(regex_constants::error_badrepeat,
191	"Nothing to repeat before a quantifier.");
192	__neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
193	};
194	if (_M_match_token(_ScannerT::_S_token_closure0))
195	{
196	__init();
197	auto __e = _M_pop();
198	_StateSeqT __r(*_M_nfa,
199	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
200	__e._M_start, __neg));
201	__e._M_append(__r);
202	_M_stack.push(__r);
203	}
204	else if (_M_match_token(_ScannerT::_S_token_closure1))
205	{
206	__init();
207	auto __e = _M_pop();
208	__e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
209	__e._M_start, __neg));
210	_M_stack.push(__e);
211	}
212	else if (_M_match_token(_ScannerT::_S_token_opt))
213	{
214	__init();
215	auto __e = _M_pop();
216	auto __end = _M_nfa->_M_insert_dummy();
217	_StateSeqT __r(*_M_nfa,
218	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
219	__e._M_start, __neg));
220	__e._M_append(__end);
221	__r._M_append(__end);
222	_M_stack.push(__r);
223	}
224	else if (_M_match_token(_ScannerT::_S_token_interval_begin))
225	{
226	if (_M_stack.empty())
227	__throw_regex_error(regex_constants::error_badrepeat,
228	"Nothing to repeat before a quantifier.");
229	if (!_M_match_token(_ScannerT::_S_token_dup_count))
230	__throw_regex_error(regex_constants::error_badbrace,
231	"Unexpected token in brace expression.");
232	_StateSeqT __r(_M_pop());
233	_StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
234	long __min_rep = _M_cur_int_value(10);
235	bool __infi = false;
236	long __n = 0;
237
238	// {3
239	if (_M_match_token(_ScannerT::_S_token_comma))
240	{
241	if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
242	__n = _M_cur_int_value(10) - __min_rep;
243	else
244	__infi = true;
245	}
246	if (!_M_match_token(_ScannerT::_S_token_interval_end))
247	__throw_regex_error(regex_constants::error_brace,
248	"Unexpected end of brace expression.");
249
250	__neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
251
252	for (long __i = 0; __i < __min_rep; ++__i)
253	__e._M_append(__r._M_clone());
254
255	if (__infi)
256	{
257	auto __tmp = __r._M_clone();
258	_StateSeqT __s(*_M_nfa,
259	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
260	__tmp._M_start, __neg));
261	__tmp._M_append(__s);
262	__e._M_append(__s);
263	}
264	else
265	{
266	if (__n < 0)
267	__throw_regex_error(regex_constants::error_badbrace,
268	"Invalid range in brace expression.");
269	auto __end = _M_nfa->_M_insert_dummy();
270	// _M_alt is the "match more" branch, and _M_next is the
271	// "match less" one. Switch _M_alt and _M_next of all created
272	// nodes. This is a hack but IMO works well.
273	std::stack<_StateIdT> __stack;
274	for (long __i = 0; __i < __n; ++__i)
275	{
276	auto __tmp = __r._M_clone();
277	auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
278	__end, __neg);
279	__stack.push(__alt);
280	__e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
281	}
282	__e._M_append(__end);
283	while (!__stack.empty())
284	{
285	auto& __tmp = (*_M_nfa)[__stack.top()];
286	__stack.pop();
287	std::swap(__tmp._M_next, __tmp._M_alt);
288	}
289	}
290	_M_stack.push(__e);
291	}
292	else
293	return false;
294	return true;
295	}
296
297	#define __INSERT_REGEX_MATCHER(__func, ...)\
298	do {\
299	if (!(_M_flags & regex_constants::icase))\
300	if (!(_M_flags & regex_constants::collate))\
301	__func<false, false>(__VA_ARGS__);\
302	else\
303	__func<false, true>(__VA_ARGS__);\
304	else\
305	if (!(_M_flags & regex_constants::collate))\
306	__func<true, false>(__VA_ARGS__);\
307	else\
308	__func<true, true>(__VA_ARGS__);\
309	} while (false)
310
311	template<typename _TraitsT>
312	bool
313	_Compiler<_TraitsT>::
314	_M_atom()
315	{
316	if (_M_match_token(_ScannerT::_S_token_anychar))
317	{
318	if (!(_M_flags & regex_constants::ECMAScript))
319	__INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
320	else
321	__INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
322	}
323	else if (_M_try_char())
324	__INSERT_REGEX_MATCHER(_M_insert_char_matcher);
325	else if (_M_match_token(_ScannerT::_S_token_backref))
326	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
327	_M_insert_backref(_M_cur_int_value(10))));
328	else if (_M_match_token(_ScannerT::_S_token_quoted_class))
329	__INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
330	else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
331	{
332	_StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
333	this->_M_disjunction();
334	if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
335	__throw_regex_error(regex_constants::error_paren,
336	"Parenthesis is not closed.");
337	__r._M_append(_M_pop());
338	_M_stack.push(__r);
339	}
340	else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
341	{
342	_StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
343	this->_M_disjunction();
344	if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
345	__throw_regex_error(regex_constants::error_paren,
346	"Parenthesis is not closed.");
347	__r._M_append(_M_pop());
348	__r._M_append(_M_nfa->_M_insert_subexpr_end());
349	_M_stack.push(__r);
350	}
351	else if (!_M_bracket_expression())
352	return false;
353	return true;
354	}
355
356	template<typename _TraitsT>
357	bool
358	_Compiler<_TraitsT>::
359	_M_bracket_expression()
360	{
361	bool __neg =
362	_M_match_token(_ScannerT::_S_token_bracket_neg_begin);
363	if (!(__neg \|\| _M_match_token(_ScannerT::_S_token_bracket_begin)))
364	return false;
365	__INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
366	return true;
367	}
368	#undef __INSERT_REGEX_MATCHER
369
370	template<typename _TraitsT>
371	template<bool __icase, bool __collate>
372	void
373	_Compiler<_TraitsT>::
374	_M_insert_any_matcher_ecma()
375	{
376	_M_stack.push(_StateSeqT(*_M_nfa,
377	_M_nfa->_M_insert_matcher
378	(_AnyMatcher<_TraitsT, true, __icase, __collate>
379	(_M_traits))));
380	}
381
382	template<typename _TraitsT>
383	template<bool __icase, bool __collate>
384	void
385	_Compiler<_TraitsT>::
386	_M_insert_any_matcher_posix()
387	{
388	_M_stack.push(_StateSeqT(*_M_nfa,
389	_M_nfa->_M_insert_matcher
390	(_AnyMatcher<_TraitsT, false, __icase, __collate>
391	(_M_traits))));
392	}
393
394	template<typename _TraitsT>
395	template<bool __icase, bool __collate>
396	void
397	_Compiler<_TraitsT>::
398	_M_insert_char_matcher()
399	{
400	_M_stack.push(_StateSeqT(*_M_nfa,
401	_M_nfa->_M_insert_matcher
402	(_CharMatcher<_TraitsT, __icase, __collate>
403	(_M_value[0], _M_traits))));
404	}
405
406	template<typename _TraitsT>
407	template<bool __icase, bool __collate>
408	void
409	_Compiler<_TraitsT>::
410	_M_insert_character_class_matcher()
411	{
412	__glibcxx_assert(_M_value.size() == 1);
413	_BracketMatcher<_TraitsT, __icase, __collate> __matcher
414	(_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
415	__matcher._M_add_character_class(_M_value, false);
416	__matcher._M_ready();
417	_M_stack.push(_StateSeqT(*_M_nfa,
418	_M_nfa->_M_insert_matcher(std::move(__matcher))));
419	}
420
421	template<typename _TraitsT>
422	template<bool __icase, bool __collate>
423	void
424	_Compiler<_TraitsT>::
425	_M_insert_bracket_matcher(bool __neg)
426	{
427	_BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
428	pair<bool, _CharT> __last_char; // Optional<_CharT>
429	__last_char.first = false;
430	if (!(_M_flags & regex_constants::ECMAScript))
431	{
432	if (_M_try_char())
433	{
434	__last_char.first = true;
435	__last_char.second = _M_value[0];
436	}
437	else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
438	{
439	__last_char.first = true;
440	__last_char.second = '-';
441	}
442	}
443	while (_M_expression_term(__last_char, __matcher));
444	if (__last_char.first)
445	__matcher._M_add_char(__last_char.second);
446	__matcher._M_ready();
447	_M_stack.push(_StateSeqT(
448	*_M_nfa,
449	_M_nfa->_M_insert_matcher(std::move(__matcher))));
450	}
451
452	template<typename _TraitsT>
453	template<bool __icase, bool __collate>
454	bool
455	_Compiler<_TraitsT>::
456	_M_expression_term(pair<bool, _CharT>& __last_char,
457	_BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
458	{
459	if (_M_match_token(_ScannerT::_S_token_bracket_end))
460	return false;
461
462	const auto __push_char = [&](_CharT __ch)
463	{
464	if (__last_char.first)
465	__matcher._M_add_char(__last_char.second);
466	else
467	__last_char.first = true;
468	__last_char.second = __ch;
469	};
470	const auto __flush = [&]
471	{
472	if (__last_char.first)
473	{
474	__matcher._M_add_char(__last_char.second);
475	__last_char.first = false;
476	}
477	};
478
479	if (_M_match_token(_ScannerT::_S_token_collsymbol))
480	{
481	auto __symbol = __matcher._M_add_collate_element(_M_value);
482	if (__symbol.size() == 1)
483	__push_char(__symbol[0]);
484	else
485	__flush();
486	}
487	else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
488	{
489	__flush();
490	__matcher._M_add_equivalence_class(_M_value);
491	}
492	else if (_M_match_token(_ScannerT::_S_token_char_class_name))
493	{
494	__flush();
495	__matcher._M_add_character_class(_M_value, false);
496	}
497	else if (_M_try_char())
498	__push_char(_M_value[0]);
499	// POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
500	// except when the '-' is the first or last character in the bracket
501	// expression ([--0]). ECMAScript treats all '-' after a range as a
502	// normal character. Also see above, where _M_expression_term gets called.
503	//
504	// As a result, POSIX rejects [-----], but ECMAScript doesn't.
505	// Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
506	// Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
507	//
508	// It turns out that no one reads BNFs ;)
509	else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
510	{
511	if (!__last_char.first)
512	{
513	if (!(_M_flags & regex_constants::ECMAScript))
514	{
515	if (_M_match_token(_ScannerT::_S_token_bracket_end))
516	{
517	__push_char('-');
518	return false;
519	}
520	__throw_regex_error(
521	regex_constants::error_range,
522	"Unexpected dash in bracket expression. For POSIX syntax, "
523	"a dash is not treated literally only when it is at "
524	"beginning or end.");
525	}
526	__push_char('-');
527	}
528	else
529	{
530	if (_M_try_char())
531	{
532	__matcher._M_make_range(__last_char.second, _M_value[0]);
533	__last_char.first = false;
534	}
535	else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
536	{
537	__matcher._M_make_range(__last_char.second, '-');
538	__last_char.first = false;
539	}
540	else
541	{
542	if (_M_scanner._M_get_token()
543	!= _ScannerT::_S_token_bracket_end)
544	__throw_regex_error(
545	regex_constants::error_range,
546	"Character is expected after a dash.");
547	__push_char('-');
548	}
549	}
550	}
551	else if (_M_match_token(_ScannerT::_S_token_quoted_class))
552	{
553	__flush();
554	__matcher._M_add_character_class(_M_value,
555	_M_ctype.is(_CtypeT::upper,
556	_M_value[0]));
557	}
558	else
559	__throw_regex_error(regex_constants::error_brack,
560	"Unexpected character in bracket expression.");
561
562	return true;
563	}
564
565	template<typename _TraitsT>
566	bool
567	_Compiler<_TraitsT>::
568	_M_try_char()
569	{
570	bool __is_char = false;
571	if (_M_match_token(_ScannerT::_S_token_oct_num))
572	{
573	__is_char = true;
574	_M_value.assign(1, _M_cur_int_value(8));
575	}
576	else if (_M_match_token(_ScannerT::_S_token_hex_num))
577	{
578	__is_char = true;
579	_M_value.assign(1, _M_cur_int_value(16));
580	}
581	else if (_M_match_token(_ScannerT::_S_token_ord_char))
582	__is_char = true;
583	return __is_char;
584	}
585
586	template<typename _TraitsT>
587	bool
588	_Compiler<_TraitsT>::
589	_M_match_token(_TokenT token)
590	{
591	if (token == _M_scanner._M_get_token())
592	{
593	_M_value = _M_scanner._M_get_value();
594	_M_scanner._M_advance();
595	return true;
596	}
597	return false;
598	}
599
600	template<typename _TraitsT>
601	int
602	_Compiler<_TraitsT>::
603	_M_cur_int_value(int __radix)
604	{
605	long __v = 0;
606	for (typename _StringT::size_type __i = 0;
607	__i < _M_value.length(); ++__i)
608	__v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
609	return __v;
610	}
611
612	template<typename _TraitsT, bool __icase, bool __collate>
613	bool
614	_BracketMatcher<_TraitsT, __icase, __collate>::
615	_M_apply(_CharT __ch, false_type) const
616	{
617	return [this, __ch]
618	{
619	if (std::binary_search(_M_char_set.begin(), _M_char_set.end(),
620	_M_translator._M_translate(__ch)))
621	return true;
622	auto __s = _M_translator._M_transform(__ch);
623	for (auto& __it : _M_range_set)
624	if (_M_translator._M_match_range(__it.first, __it.second, __s))
625	return true;
626	if (_M_traits.isctype(__ch, _M_class_set))
627	return true;
628	if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
629	_M_traits.transform_primary(&__ch, &__ch+1))
630	!= _M_equiv_set.end())
631	return true;
632	for (auto& __it : _M_neg_class_set)
633	if (!_M_traits.isctype(__ch, __it))
634	return true;
635	return false;
636	}() ^ _M_is_non_matching;
637	}
638	} // namespace __detail
639
640	_GLIBCXX_END_NAMESPACE_VERSION
641	} // namespace