Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log
Repository URL

regex_scanner.tcc@ 1182

Last change on this file since 1182 was 1166, checked in by rossy, 3 years ago
Daodan: Replace MinGW build env with an up-to-date MSYS2 env
File size: 14.7 KB

Rev	Line
[1166]	1	// class template regex -- C++ --
	2
	3	// Copyright (C) 2013-2021 Free Software Foundation, Inc.
	4	//
	5	// This file is part of the GNU ISO C++ Library. This library is free
	6	// software; you can redistribute it and/or modify it under the
	7	// terms of the GNU General Public License as published by the
	8	// Free Software Foundation; either version 3, or (at your option)
	9	// any later version.
	10
	11	// This library is distributed in the hope that it will be useful,
	12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	// GNU General Public License for more details.
	15
	16	// Under Section 7 of GPL version 3, you are granted additional
	17	// permissions described in the GCC Runtime Library Exception, version
	18	// 3.1, as published by the Free Software Foundation.
	19
	20	// You should have received a copy of the GNU General Public License and
	21	// a copy of the GCC Runtime Library Exception along with this program;
	22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	23	// <http://www.gnu.org/licenses/>.
	24
	25	/**
	26	* @file bits/regex_scanner.tcc
	27	* This is an internal header file, included by other library headers.
	28	* Do not attempt to use it directly. @headername{regex}
	29	*/
	30
	31	// FIXME make comments doxygen format.
	32
	33	// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
	34	// and awk
	35	// 1) grep is basic except '\n' is treated as '\|'
	36	// 2) egrep is extended except '\n' is treated as '\|'
	37	// 3) awk is extended except special escaping rules, and there's no
	38	// back-reference.
	39	//
	40	// References:
	41	//
	42	// ECMAScript: ECMA-262 15.10
	43	//
	44	// basic, extended:
	45	// http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
	46	//
	47	// awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
	48
	49	namespace std _GLIBCXX_VISIBILITY(default)
	50	{
	51	_GLIBCXX_BEGIN_NAMESPACE_VERSION
	52
	53	namespace __detail
	54	{
	55	template<typename _CharT>
	56	_Scanner<_CharT>::
	57	_Scanner(typename _Scanner::_IterT __begin,
	58	typename _Scanner::_IterT __end,
	59	_FlagT __flags, std::locale __loc)
	60	: _ScannerBase(__flags),
	61	_M_current(__begin), _M_end(__end),
	62	_M_ctype(std::use_facet<_CtypeT>(__loc)),
	63	_M_eat_escape(_M_is_ecma()
	64	? &_Scanner::_M_eat_escape_ecma
	65	: &_Scanner::_M_eat_escape_posix)
	66	{ _M_advance(); }
	67
	68	template<typename _CharT>
	69	void
	70	_Scanner<_CharT>::
	71	_M_advance()
	72	{
	73	if (_M_current == _M_end)
	74	{
	75	_M_token = _S_token_eof;
	76	return;
	77	}
	78
	79	if (_M_state == _S_state_normal)
	80	_M_scan_normal();
	81	else if (_M_state == _S_state_in_bracket)
	82	_M_scan_in_bracket();
	83	else if (_M_state == _S_state_in_brace)
	84	_M_scan_in_brace();
	85	else
	86	{
	87	__glibcxx_assert(false);
	88	}
	89	}
	90
	91	// Differences between styles:
	92	// 1) "\(", "\)", "\{" in basic. It's not escaping.
	93	// 2) "(?:", "(?=", "(?!" in ECMAScript.
	94	template<typename _CharT>
	95	void
	96	_Scanner<_CharT>::
	97	_M_scan_normal()
	98	{
	99	auto __c = *_M_current++;
	100
	101	if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
	102	{
	103	_M_token = _S_token_ord_char;
	104	_M_value.assign(1, __c);
	105	return;
	106	}
	107	if (__c == '\\')
	108	{
	109	if (_M_current == _M_end)
	110	__throw_regex_error(
	111	regex_constants::error_escape,
	112	"Unexpected end of regex when escaping.");
	113
	114	if (!_M_is_basic()
	115	\|\| (*_M_current != '('
	116	&& *_M_current != ')'
	117	&& *_M_current != '{'))
	118	{
	119	(this->*_M_eat_escape)();
	120	return;
	121	}
	122	__c = *_M_current++;
	123	}
	124	if (__c == '(')
	125	{
	126	if (_M_is_ecma() && *_M_current == '?')
	127	{
	128	if (++_M_current == _M_end)
	129	__throw_regex_error(
	130	regex_constants::error_paren,
	131	"Unexpected end of regex when in an open parenthesis.");
	132
	133	if (*_M_current == ':')
	134	{
	135	++_M_current;
	136	_M_token = _S_token_subexpr_no_group_begin;
	137	}
	138	else if (*_M_current == '=')
	139	{
	140	++_M_current;
	141	_M_token = _S_token_subexpr_lookahead_begin;
	142	_M_value.assign(1, 'p');
	143	}
	144	else if (*_M_current == '!')
	145	{
	146	++_M_current;
	147	_M_token = _S_token_subexpr_lookahead_begin;
	148	_M_value.assign(1, 'n');
	149	}
	150	else
	151	__throw_regex_error(
	152	regex_constants::error_paren,
	153	"Invalid special open parenthesis.");
	154	}
	155	else if (_M_flags & regex_constants::nosubs)
	156	_M_token = _S_token_subexpr_no_group_begin;
	157	else
	158	_M_token = _S_token_subexpr_begin;
	159	}
	160	else if (__c == ')')
	161	_M_token = _S_token_subexpr_end;
	162	else if (__c == '[')
	163	{
	164	_M_state = _S_state_in_bracket;
	165	_M_at_bracket_start = true;
	166	if (_M_current != _M_end && *_M_current == '^')
	167	{
	168	_M_token = _S_token_bracket_neg_begin;
	169	++_M_current;
	170	}
	171	else
	172	_M_token = _S_token_bracket_begin;
	173	}
	174	else if (__c == '{')
	175	{
	176	_M_state = _S_state_in_brace;
	177	_M_token = _S_token_interval_begin;
	178	}
	179	else if (__c != ']' && __c != '}')
	180	{
	181	auto __it = _M_token_tbl;
	182	auto __narrowc = _M_ctype.narrow(__c, '\0');
	183	for (; __it->first != '\0'; ++__it)
	184	if (__it->first == __narrowc)
	185	{
	186	_M_token = __it->second;
	187	return;
	188	}
	189	__glibcxx_assert(false);
	190	}
	191	else
	192	{
	193	_M_token = _S_token_ord_char;
	194	_M_value.assign(1, __c);
	195	}
	196	}
	197
	198	// Differences between styles:
	199	// 1) different semantics of "[]" and "[^]".
	200	// 2) Escaping in bracket expr.
	201	template<typename _CharT>
	202	void
	203	_Scanner<_CharT>::
	204	_M_scan_in_bracket()
	205	{
	206	if (_M_current == _M_end)
	207	__throw_regex_error(
	208	regex_constants::error_brack,
	209	"Unexpected end of regex when in bracket expression.");
	210
	211	auto __c = *_M_current++;
	212
	213	if (__c == '-')
	214	_M_token = _S_token_bracket_dash;
	215	else if (__c == '[')
	216	{
	217	if (_M_current == _M_end)
	218	__throw_regex_error(regex_constants::error_brack,
	219	"Unexpected character class open bracket.");
	220
	221	if (*_M_current == '.')
	222	{
	223	_M_token = _S_token_collsymbol;
	224	_M_eat_class(*_M_current++);
	225	}
	226	else if (*_M_current == ':')
	227	{
	228	_M_token = _S_token_char_class_name;
	229	_M_eat_class(*_M_current++);
	230	}
	231	else if (*_M_current == '=')
	232	{
	233	_M_token = _S_token_equiv_class_name;
	234	_M_eat_class(*_M_current++);
	235	}
	236	else
	237	{
	238	_M_token = _S_token_ord_char;
	239	_M_value.assign(1, __c);
	240	}
	241	}
	242	// In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
	243	// literally. So "[]]" and "[^]]" are valid regexes. See the testcases
	244	// `*/empty_range.cc`.
	245	else if (__c == ']' && (_M_is_ecma() \|\| !_M_at_bracket_start))
	246	{
	247	_M_token = _S_token_bracket_end;
	248	_M_state = _S_state_normal;
	249	}
	250	// ECMAScript and awk permits escaping in bracket.
	251	else if (__c == '\\' && (_M_is_ecma() \|\| _M_is_awk()))
	252	(this->*_M_eat_escape)();
	253	else
	254	{
	255	_M_token = _S_token_ord_char;
	256	_M_value.assign(1, __c);
	257	}
	258	_M_at_bracket_start = false;
	259	}
	260
	261	// Differences between styles:
	262	// 1) "\}" in basic style.
	263	template<typename _CharT>
	264	void
	265	_Scanner<_CharT>::
	266	_M_scan_in_brace()
	267	{
	268	if (_M_current == _M_end)
	269	__throw_regex_error(
	270	regex_constants::error_brace,
	271	"Unexpected end of regex when in brace expression.");
	272
	273	auto __c = *_M_current++;
	274
	275	if (_M_ctype.is(_CtypeT::digit, __c))
	276	{
	277	_M_token = _S_token_dup_count;
	278	_M_value.assign(1, __c);
	279	while (_M_current != _M_end
	280	&& _M_ctype.is(_CtypeT::digit, *_M_current))
	281	_M_value += *_M_current++;
	282	}
	283	else if (__c == ',')
	284	_M_token = _S_token_comma;
	285	// basic use \}.
	286	else if (_M_is_basic())
	287	{
	288	if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
	289	{
	290	_M_state = _S_state_normal;
	291	_M_token = _S_token_interval_end;
	292	++_M_current;
	293	}
	294	else
	295	__throw_regex_error(regex_constants::error_badbrace,
	296	"Unexpected character in brace expression.");
	297	}
	298	else if (__c == '}')
	299	{
	300	_M_state = _S_state_normal;
	301	_M_token = _S_token_interval_end;
	302	}
	303	else
	304	__throw_regex_error(regex_constants::error_badbrace,
	305	"Unexpected character in brace expression.");
	306	}
	307
	308	template<typename _CharT>
	309	void
	310	_Scanner<_CharT>::
	311	_M_eat_escape_ecma()
	312	{
	313	if (_M_current == _M_end)
	314	__throw_regex_error(regex_constants::error_escape,
	315	"Unexpected end of regex when escaping.");
	316
	317	auto __c = *_M_current++;
	318	auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
	319
	320	if (__pos != nullptr && (__c != 'b' \|\| _M_state == _S_state_in_bracket))
	321	{
	322	_M_token = _S_token_ord_char;
	323	_M_value.assign(1, *__pos);
	324	}
	325	else if (__c == 'b')
	326	{
	327	_M_token = _S_token_word_bound;
	328	_M_value.assign(1, 'p');
	329	}
	330	else if (__c == 'B')
	331	{
	332	_M_token = _S_token_word_bound;
	333	_M_value.assign(1, 'n');
	334	}
	335	// N3376 28.13
	336	else if (__c == 'd'
	337	\|\| __c == 'D'
	338	\|\| __c == 's'
	339	\|\| __c == 'S'
	340	\|\| __c == 'w'
	341	\|\| __c == 'W')
	342	{
	343	_M_token = _S_token_quoted_class;
	344	_M_value.assign(1, __c);
	345	}
	346	else if (__c == 'c')
	347	{
	348	if (_M_current == _M_end)
	349	__throw_regex_error(
	350	regex_constants::error_escape,
	351	"Unexpected end of regex when reading control code.");
	352	_M_token = _S_token_ord_char;
	353	_M_value.assign(1, *_M_current++);
	354	}
	355	else if (__c == 'x' \|\| __c == 'u')
	356	{
	357	_M_value.erase();
	358	for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
	359	{
	360	if (_M_current == _M_end
	361	\|\| !_M_ctype.is(_CtypeT::xdigit, *_M_current))
	362	__throw_regex_error(
	363	regex_constants::error_escape,
	364	"Unexpected end of regex when ascii character.");
	365	_M_value += *_M_current++;
	366	}
	367	_M_token = _S_token_hex_num;
	368	}
	369	// ECMAScript recognizes multi-digit back-references.
	370	else if (_M_ctype.is(_CtypeT::digit, __c))
	371	{
	372	_M_value.assign(1, __c);
	373	while (_M_current != _M_end
	374	&& _M_ctype.is(_CtypeT::digit, *_M_current))
	375	_M_value += *_M_current++;
	376	_M_token = _S_token_backref;
	377	}
	378	else
	379	{
	380	_M_token = _S_token_ord_char;
	381	_M_value.assign(1, __c);
	382	}
	383	}
	384
	385	// Differences between styles:
	386	// 1) Extended doesn't support backref, but basic does.
	387	template<typename _CharT>
	388	void
	389	_Scanner<_CharT>::
	390	_M_eat_escape_posix()
	391	{
	392	if (_M_current == _M_end)
	393	__throw_regex_error(regex_constants::error_escape,
	394	"Unexpected end of regex when escaping.");
	395
	396	auto __c = *_M_current;
	397	auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
	398
	399	if (__pos != nullptr && *__pos != '\0')
	400	{
	401	_M_token = _S_token_ord_char;
	402	_M_value.assign(1, __c);
	403	}
	404	// We MUST judge awk before handling backrefs. There's no backref in awk.
	405	else if (_M_is_awk())
	406	{
	407	_M_eat_escape_awk();
	408	return;
	409	}
	410	else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
	411	{
	412	_M_token = _S_token_backref;
	413	_M_value.assign(1, __c);
	414	}
	415	else
	416	{
	417	#ifdef __STRICT_ANSI__
	418	// POSIX says it is undefined to escape ordinary characters
	419	__throw_regex_error(regex_constants::error_escape,
	420	"Unexpected escape character.");
	421	#else
	422	_M_token = _S_token_ord_char;
	423	_M_value.assign(1, __c);
	424	#endif
	425	}
	426	++_M_current;
	427	}
	428
	429	template<typename _CharT>
	430	void
	431	_Scanner<_CharT>::
	432	_M_eat_escape_awk()
	433	{
	434	auto __c = *_M_current++;
	435	auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
	436
	437	if (__pos != nullptr)
	438	{
	439	_M_token = _S_token_ord_char;
	440	_M_value.assign(1, *__pos);
	441	}
	442	// \ddd for oct representation
	443	else if (_M_ctype.is(_CtypeT::digit, __c)
	444	&& __c != '8'
	445	&& __c != '9')
	446	{
	447	_M_value.assign(1, __c);
	448	for (int __i = 0;
	449	__i < 2
	450	&& _M_current != _M_end
	451	&& _M_ctype.is(_CtypeT::digit, *_M_current)
	452	&& *_M_current != '8'
	453	&& *_M_current != '9';
	454	__i++)
	455	_M_value += *_M_current++;
	456	_M_token = _S_token_oct_num;
	457	return;
	458	}
	459	else
	460	__throw_regex_error(regex_constants::error_escape,
	461	"Unexpected escape character.");
	462	}
	463
	464	// Eats a character class or throws an exception.
	465	// __ch could be ':', '.' or '=', _M_current is the char after ']' when
	466	// returning.
	467	template<typename _CharT>
	468	void
	469	_Scanner<_CharT>::
	470	_M_eat_class(char __ch)
	471	{
	472	for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
	473	_M_value += *_M_current++;
	474	if (_M_current == _M_end
	475	\|\| *_M_current++ != __ch
	476	\|\| _M_current == _M_end // skip __ch
	477	\|\| *_M_current++ != ']') // skip ']'
	478	{
	479	if (__ch == ':')
	480	__throw_regex_error(regex_constants::error_ctype,
	481	"Unexpected end of character class.");
	482	else
	483	__throw_regex_error(regex_constants::error_collate,
	484	"Unexpected end of character class.");
	485	}
	486	}
	487
	488	#ifdef _GLIBCXX_DEBUG
	489	template<typename _CharT>
	490	std::ostream&
	491	_Scanner<_CharT>::
	492	_M_print(std::ostream& ostr)
	493	{
	494	switch (_M_token)
	495	{
	496	case _S_token_anychar:
	497	ostr << "any-character\n";
	498	break;
	499	case _S_token_backref:
	500	ostr << "backref\n";
	501	break;
	502	case _S_token_bracket_begin:
	503	ostr << "bracket-begin\n";
	504	break;
	505	case _S_token_bracket_neg_begin:
	506	ostr << "bracket-neg-begin\n";
	507	break;
	508	case _S_token_bracket_end:
	509	ostr << "bracket-end\n";
	510	break;
	511	case _S_token_char_class_name:
	512	ostr << "char-class-name \"" << _M_value << "\"\n";
	513	break;
	514	case _S_token_closure0:
	515	ostr << "closure0\n";
	516	break;
	517	case _S_token_closure1:
	518	ostr << "closure1\n";
	519	break;
	520	case _S_token_collsymbol:
	521	ostr << "collsymbol \"" << _M_value << "\"\n";
	522	break;
	523	case _S_token_comma:
	524	ostr << "comma\n";
	525	break;
	526	case _S_token_dup_count:
	527	ostr << "dup count: " << _M_value << "\n";
	528	break;
	529	case _S_token_eof:
	530	ostr << "EOF\n";
	531	break;
	532	case _S_token_equiv_class_name:
	533	ostr << "equiv-class-name \"" << _M_value << "\"\n";
	534	break;
	535	case _S_token_interval_begin:
	536	ostr << "interval begin\n";
	537	break;
	538	case _S_token_interval_end:
	539	ostr << "interval end\n";
	540	break;
	541	case _S_token_line_begin:
	542	ostr << "line begin\n";
	543	break;
	544	case _S_token_line_end:
	545	ostr << "line end\n";
	546	break;
	547	case _S_token_opt:
	548	ostr << "opt\n";
	549	break;
	550	case _S_token_or:
	551	ostr << "or\n";
	552	break;
	553	case _S_token_ord_char:
	554	ostr << "ordinary character: \"" << _M_value << "\"\n";
	555	break;
	556	case _S_token_subexpr_begin:
	557	ostr << "subexpr begin\n";
	558	break;
	559	case _S_token_subexpr_no_group_begin:
	560	ostr << "no grouping subexpr begin\n";
	561	break;
	562	case _S_token_subexpr_lookahead_begin:
	563	ostr << "lookahead subexpr begin\n";
	564	break;
	565	case _S_token_subexpr_end:
	566	ostr << "subexpr end\n";
	567	break;
	568	case _S_token_unknown:
	569	ostr << "-- unknown token --\n";
	570	break;
	571	case _S_token_oct_num:
	572	ostr << "oct number " << _M_value << "\n";
	573	break;
	574	case _S_token_hex_num:
	575	ostr << "hex number " << _M_value << "\n";
	576	break;
	577	case _S_token_quoted_class:
	578	ostr << "quoted class " << "\\" << _M_value << "\n";
	579	break;
	580	default:
	581	_GLIBCXX_DEBUG_ASSERT(false);
	582	}
	583	return ostr;
	584	}
	585	#endif
	586
	587	} // namespace __detail
	588	_GLIBCXX_END_NAMESPACE_VERSION
	589	} // namespace

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: Daodan/MSYS2/mingw32/include/c++/11.2.0/bits/regex_scanner.tcc@ 1182

Download in other formats: