Blame - src/regexp_nfa.c - android_external_vim

blob: 142e1abe6c6eb0e7252e7fa41a2fa43045f449f1 [file] [log] [blame]

Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	1	/* vi:set ts=8 sts=4 sw=4:
				2	*
				3	* NFA regular expression implementation.
				4	*
				5	* This file is included in "regexp.c".
				6	*/
				7
				8	#ifdef DEBUG
				9	/* Comment this out to disable log files. They can get pretty big */
				10	# define ENABLE_LOG
				11	# define LOG_NAME "log_nfarun.log"
Bram Moolenaar	7fcff1f	2013-05-20 21:49:13 +0200	[diff] [blame^]	12	# define NFA_REGEXP_DEBUG_LOG
				13	# define NFA_REGEXP_DEBUG_LOG_NAME "nfa_regexp_debug.log"
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	14	#endif
				15
				16	/* Upper limit allowed for {m,n} repetitions handled by NFA */
				17	#define NFA_BRACES_MAXLIMIT 10
				18	/* For allocating space for the postfix representation */
				19	#define NFA_POSTFIX_MULTIPLIER (NFA_BRACES_MAXLIMIT + 2)*2
				20	/* Size of stack, used when converting the postfix regexp into NFA */
				21	#define NFA_STACK_SIZE 1024
				22
				23	enum
				24	{
				25	NFA_SPLIT = -1024,
				26	NFA_MATCH,
				27	NFA_SKIP_CHAR, /* matches a 0-length char */
				28	NFA_END_NEG_RANGE, /* Used when expanding [^ab] */
				29
				30	NFA_CONCAT,
				31	NFA_OR,
				32	NFA_STAR,
				33	NFA_PLUS,
				34	NFA_QUEST,
				35	NFA_QUEST_NONGREEDY, /* Non-greedy version of \? */
				36	NFA_NOT, /* used for [^ab] negated char ranges */
				37
				38	NFA_BOL, /* ^ Begin line */
				39	NFA_EOL, /* $ End line */
				40	NFA_BOW, /* \< Begin word */
				41	NFA_EOW, /* \> End word */
				42	NFA_BOF, /* \%^ Begin file */
				43	NFA_EOF, /* \%$ End file */
				44	NFA_NEWL,
				45	NFA_ZSTART, /* Used for \zs */
				46	NFA_ZEND, /* Used for \ze */
				47	NFA_NOPEN, /* Start of subexpression marked with \%( */
				48	NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
				49	NFA_START_INVISIBLE,
				50	NFA_END_INVISIBLE,
				51	NFA_MULTIBYTE, /* Next nodes in NFA are part of the same
				52	multibyte char */
				53	NFA_END_MULTIBYTE, /* End of multibyte char in the NFA */
				54	NFA_COMPOSING, /* Next nodes in NFA are part of the
				55	composing multibyte char */
				56	NFA_END_COMPOSING, /* End of a composing char in the NFA */
				57
				58	/* The following are used only in the postfix form, not in the NFA */
				59	NFA_PREV_ATOM_NO_WIDTH, /* Used for \@= */
				60	NFA_PREV_ATOM_NO_WIDTH_NEG, /* Used for \@! */
				61	NFA_PREV_ATOM_JUST_BEFORE, /* Used for \@<= */
				62	NFA_PREV_ATOM_JUST_BEFORE_NEG, /* Used for \@<! */
				63	NFA_PREV_ATOM_LIKE_PATTERN, /* Used for \@> */
				64
				65	NFA_MOPEN,
				66	NFA_MCLOSE = NFA_MOPEN + NSUBEXP,
				67
				68	/* NFA_FIRST_NL */
				69	NFA_ANY = NFA_MCLOSE + NSUBEXP, /* Match any one character. */
				70	NFA_ANYOF, /* Match any character in this string. */
				71	NFA_ANYBUT, /* Match any character not in this string. */
				72	NFA_IDENT, /* Match identifier char */
				73	NFA_SIDENT, /* Match identifier char but no digit */
				74	NFA_KWORD, /* Match keyword char */
				75	NFA_SKWORD, /* Match word char but no digit */
				76	NFA_FNAME, /* Match file name char */
				77	NFA_SFNAME, /* Match file name char but no digit */
				78	NFA_PRINT, /* Match printable char */
				79	NFA_SPRINT, /* Match printable char but no digit */
				80	NFA_WHITE, /* Match whitespace char */
				81	NFA_NWHITE, /* Match non-whitespace char */
				82	NFA_DIGIT, /* Match digit char */
				83	NFA_NDIGIT, /* Match non-digit char */
				84	NFA_HEX, /* Match hex char */
				85	NFA_NHEX, /* Match non-hex char */
				86	NFA_OCTAL, /* Match octal char */
				87	NFA_NOCTAL, /* Match non-octal char */
				88	NFA_WORD, /* Match word char */
				89	NFA_NWORD, /* Match non-word char */
				90	NFA_HEAD, /* Match head char */
				91	NFA_NHEAD, /* Match non-head char */
				92	NFA_ALPHA, /* Match alpha char */
				93	NFA_NALPHA, /* Match non-alpha char */
				94	NFA_LOWER, /* Match lowercase char */
				95	NFA_NLOWER, /* Match non-lowercase char */
				96	NFA_UPPER, /* Match uppercase char */
				97	NFA_NUPPER, /* Match non-uppercase char */
				98	NFA_FIRST_NL = NFA_ANY + ADD_NL,
				99	NFA_LAST_NL = NFA_NUPPER + ADD_NL,
				100
				101	/* Character classes [:alnum:] etc */
				102	NFA_CLASS_ALNUM,
				103	NFA_CLASS_ALPHA,
				104	NFA_CLASS_BLANK,
				105	NFA_CLASS_CNTRL,
				106	NFA_CLASS_DIGIT,
				107	NFA_CLASS_GRAPH,
				108	NFA_CLASS_LOWER,
				109	NFA_CLASS_PRINT,
				110	NFA_CLASS_PUNCT,
				111	NFA_CLASS_SPACE,
				112	NFA_CLASS_UPPER,
				113	NFA_CLASS_XDIGIT,
				114	NFA_CLASS_TAB,
				115	NFA_CLASS_RETURN,
				116	NFA_CLASS_BACKSPACE,
				117	NFA_CLASS_ESCAPE
				118	};
				119
				120	/* Keep in sync with classchars. */
				121	static int nfa_classcodes[] = {
				122	NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
				123	NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
				124	NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
				125	NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
				126	NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
				127	NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
				128	NFA_UPPER, NFA_NUPPER
				129	};
				130
				131	static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
				132
				133	/*
				134	* NFA errors can be of 3 types:
				135	* *** NFA runtime errors, when something unknown goes wrong. The NFA fails
				136	* silently and revert the to backtracking engine.
				137	* syntax_error = FALSE;
				138	* *** Regexp syntax errors, when the input regexp is not syntactically correct.
				139	* The NFA engine displays an error message, and nothing else happens.
				140	* syntax_error = TRUE
				141	* *** Unsupported features, when the input regexp uses an operator that is not
				142	* implemented in the NFA. The NFA engine fails silently, and reverts to the
				143	* old backtracking engine.
				144	* syntax_error = FALSE
				145	* "The NFA fails" means that "compiling the regexp with the NFA fails":
				146	* nfa_regcomp() returns FAIL.
				147	*/
				148	static int syntax_error = FALSE;
				149
				150	/* NFA regexp \ze operator encountered. */
				151	static int nfa_has_zend = FALSE;
				152
				153	static int post_start; / holds the postfix form of r.e. */
				154	static int *post_end;
				155	static int *post_ptr;
				156
				157	static int nstate; /* Number of states in the NFA. */
				158	static int istate; /* Index in the state vector, used in new_state() */
				159	static int nstate_max; /* Upper bound of estimated number of states. */
				160
				161
				162	static int nfa_regcomp_start __ARGS((char_u*expr, int re_flags));
				163	static int nfa_recognize_char_class __ARGS((char_u start, char_u end, int extra_newl));
				164	static int nfa_emit_equi_class __ARGS((int c, int neg));
				165	static void nfa_inc __ARGS((char_u **p));
				166	static void nfa_dec __ARGS((char_u **p));
				167	static int nfa_regatom __ARGS((void));
				168	static int nfa_regpiece __ARGS((void));
				169	static int nfa_regconcat __ARGS((void));
				170	static int nfa_regbranch __ARGS((void));
				171	static int nfa_reg __ARGS((int paren));
				172	#ifdef DEBUG
				173	static void nfa_set_code __ARGS((int c));
				174	static void nfa_postfix_dump __ARGS((char_u *expr, int retval));
				175	static void nfa_print_state __ARGS((FILE debugf, nfa_state_T state, int ident));
				176	static void nfa_dump __ARGS((nfa_regprog_T *prog));
				177	#endif
				178	static int *re2post __ARGS((void));
				179	static nfa_state_T new_state __ARGS((int c, nfa_state_T out, nfa_state_T *out1));
				180	static nfa_state_T post2nfa __ARGS((int postfix, int *end, int nfa_calc_size));
				181	static int check_char_class __ARGS((int class, int c));
				182	static void st_error __ARGS((int postfix, int end, int *p));
				183	static void nfa_save_listids __ARGS((nfa_state_T start, int list));
				184	static void nfa_restore_listids __ARGS((nfa_state_T start, int list));
				185	static void nfa_set_null_listids __ARGS((nfa_state_T *start));
				186	static void nfa_set_neg_listids __ARGS((nfa_state_T *start));
				187	static long nfa_regtry __ARGS((nfa_state_T *start, colnr_T col));
				188	static long nfa_regexec_both __ARGS((char_u *line, colnr_T col));
				189	static regprog_T nfa_regcomp __ARGS((char_u expr, int re_flags));
				190	static int nfa_regexec __ARGS((regmatch_T rmp, char_u line, colnr_T col));
				191	static long nfa_regexec_multi __ARGS((regmmatch_T rmp, win_T win, buf_T buf, linenr_T lnum, colnr_T col, proftime_T tm));
				192
				193	/* helper functions used when doing re2post() ... regatom() parsing */
				194	#define EMIT(c) do { \
				195	if (post_ptr >= post_end) \
				196	return FAIL; \
				197	*post_ptr++ = c; \
				198	} while (0)
				199
				200	#define EMIT_MBYTE(c) \
				201	len = (*mb_char2bytes)(c, buf); \
				202	EMIT(buf[0]); \
				203	for (i = 1; i < len; i++) \
				204	{ \
				205	EMIT(buf[i]); \
				206	EMIT(NFA_CONCAT); \
				207	} \
				208	EMIT(NFA_MULTIBYTE);
				209
				210	#define EMIT_COMPOSING_UTF(input) \
				211	len = utfc_ptr2len(input); \
				212	EMIT(input[0]); \
				213	for (i = 1; i < len; i++) \
				214	{ \
				215	EMIT(input[i]); \
				216	EMIT(NFA_CONCAT); \
				217	} \
				218	EMIT(NFA_COMPOSING);
				219
				220	/*
				221	* Initialize internal variables before NFA compilation.
				222	* Return OK on success, FAIL otherwise.
				223	*/
				224	static int
				225	nfa_regcomp_start(expr, re_flags)
				226	char_u *expr;
				227	int re_flags; /* see vim_regcomp() */
				228	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	229	size_t postfix_size;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	230
				231	nstate = 0;
				232	istate = 0;
				233	/* A reasonable estimation for size */
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	234	nstate_max = (int)(STRLEN(expr) + 1) * NFA_POSTFIX_MULTIPLIER;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	235
Bram Moolenaar	bc0ea8f	2013-05-20 13:44:29 +0200	[diff] [blame]	236	/* Some items blow up in size, such as [A-z]. Add more space for that.
				237	* TODO: some patterns may still fail. */
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	238	nstate_max += 1000;
Bram Moolenaar	bc0ea8f	2013-05-20 13:44:29 +0200	[diff] [blame]	239
				240	/* Size for postfix representation of expr. */
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	241	postfix_size = sizeof(post_start) nstate_max;
Bram Moolenaar	bc0ea8f	2013-05-20 13:44:29 +0200	[diff] [blame]	242
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	243	post_start = (int *)lalloc(postfix_size, TRUE);
				244	if (post_start == NULL)
				245	return FAIL;
				246	vim_memset(post_start, 0, postfix_size);
				247	post_ptr = post_start;
Bram Moolenaar	bc0ea8f	2013-05-20 13:44:29 +0200	[diff] [blame]	248	post_end = post_start + nstate_max;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	249	nfa_has_zend = FALSE;
				250
				251	regcomp_start(expr, re_flags);
				252
				253	return OK;
				254	}
				255
				256	/*
				257	* Search between "start" and "end" and try to recognize a
				258	* character class in expanded form. For example [0-9].
				259	* On success, return the id the character class to be emitted.
				260	* On failure, return 0 (=FAIL)
				261	* Start points to the first char of the range, while end should point
				262	* to the closing brace.
				263	*/
				264	static int
				265	nfa_recognize_char_class(start, end, extra_newl)
				266	char_u *start;
				267	char_u *end;
				268	int extra_newl;
				269	{
				270	int i;
				271	/* Each of these variables takes up a char in "config[]",
				272	* in the order they are here. */
				273	int not = FALSE, af = FALSE, AF = FALSE, az = FALSE, AZ = FALSE,
				274	o7 = FALSE, o9 = FALSE, underscore = FALSE, newl = FALSE;
				275	char_u *p;
				276	#define NCONFIGS 16
				277	int classid[NCONFIGS] = {
				278	NFA_DIGIT, NFA_NDIGIT, NFA_HEX, NFA_NHEX,
				279	NFA_OCTAL, NFA_NOCTAL, NFA_WORD, NFA_NWORD,
				280	NFA_HEAD, NFA_NHEAD, NFA_ALPHA, NFA_NALPHA,
				281	NFA_LOWER, NFA_NLOWER, NFA_UPPER, NFA_NUPPER
				282	};
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	283	char_u myconfig[10];
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	284	char_u config[NCONFIGS][9] = {
				285	"000000100", /* digit */
				286	"100000100", /* non digit */
				287	"011000100", /* hex-digit */
				288	"111000100", /* non hex-digit */
				289	"000001000", /* octal-digit */
				290	"100001000", /* [^0-7] */
				291	"000110110", /* [0-9A-Za-z_] */
				292	"100110110", /* [^0-9A-Za-z_] */
				293	"000110010", /* head of word */
				294	"100110010", /* not head of word */
				295	"000110000", /* alphabetic char a-z */
				296	"100110000", /* non alphabetic char */
				297	"000100000", /* lowercase letter */
				298	"100100000", /* non lowercase */
				299	"000010000", /* uppercase */
				300	"100010000" /* non uppercase */
				301	};
				302
				303	if (extra_newl == TRUE)
				304	newl = TRUE;
				305
				306	if (*end != ']')
				307	return FAIL;
				308	p = start;
				309	if (*p == '^')
				310	{
				311	not = TRUE;
				312	p ++;
				313	}
				314
				315	while (p < end)
				316	{
				317	if (p + 2 < end && *(p + 1) == '-')
				318	{
				319	switch (*p)
				320	{
				321	case '0':
				322	if (*(p + 2) == '9')
				323	{
				324	o9 = TRUE;
				325	break;
				326	}
				327	else
				328	if (*(p + 2) == '7')
				329	{
				330	o7 = TRUE;
				331	break;
				332	}
				333	case 'a':
				334	if (*(p + 2) == 'z')
				335	{
				336	az = TRUE;
				337	break;
				338	}
				339	else
				340	if (*(p + 2) == 'f')
				341	{
				342	af = TRUE;
				343	break;
				344	}
				345	case 'A':
				346	if (*(p + 2) == 'Z')
				347	{
				348	AZ = TRUE;
				349	break;
				350	}
				351	else
				352	if (*(p + 2) == 'F')
				353	{
				354	AF = TRUE;
				355	break;
				356	}
				357	/* FALLTHROUGH */
				358	default:
				359	return FAIL;
				360	}
				361	p += 3;
				362	}
				363	else if (p + 1 < end && p == '\\' && (p + 1) == 'n')
				364	{
				365	newl = TRUE;
				366	p += 2;
				367	}
				368	else if (*p == '_')
				369	{
				370	underscore = TRUE;
				371	p ++;
				372	}
				373	else if (*p == '\n')
				374	{
				375	newl = TRUE;
				376	p ++;
				377	}
				378	else
				379	return FAIL;
				380	} /* while (p < end) */
				381
				382	if (p != end)
				383	return FAIL;
				384
				385	/* build the config that represents the ranges we gathered */
				386	STRCPY(myconfig, "000000000");
				387	if (not == TRUE)
				388	myconfig[0] = '1';
				389	if (af == TRUE)
				390	myconfig[1] = '1';
				391	if (AF == TRUE)
				392	myconfig[2] = '1';
				393	if (az == TRUE)
				394	myconfig[3] = '1';
				395	if (AZ == TRUE)
				396	myconfig[4] = '1';
				397	if (o7 == TRUE)
				398	myconfig[5] = '1';
				399	if (o9 == TRUE)
				400	myconfig[6] = '1';
				401	if (underscore == TRUE)
				402	myconfig[7] = '1';
				403	if (newl == TRUE)
				404	{
				405	myconfig[8] = '1';
				406	extra_newl = ADD_NL;
				407	}
				408	/* try to recognize character classes */
				409	for (i = 0; i < NCONFIGS; i++)
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	410	if (STRNCMP(myconfig, config[i], 8) == 0)
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	411	return classid[i] + extra_newl;
				412
				413	/* fallthrough => no success so far */
				414	return FAIL;
				415
				416	#undef NCONFIGS
				417	}
				418
				419	/*
				420	* Produce the bytes for equivalence class "c".
				421	* Currently only handles latin1, latin9 and utf-8.
				422	* Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
				423	* equivalent to 'a OR b OR c'
				424	*
				425	* NOTE! When changing this function, also update reg_equi_class()
				426	*/
				427	static int
				428	nfa_emit_equi_class(c, neg)
				429	int c;
				430	int neg;
				431	{
				432	int first = TRUE;
				433	int glue = neg == TRUE ? NFA_CONCAT : NFA_OR;
				434	#define EMIT2(c) \
				435	EMIT(c); \
				436	if (neg == TRUE) { \
				437	EMIT(NFA_NOT); \
				438	} \
				439	if (first == FALSE) \
				440	EMIT(glue); \
				441	else \
				442	first = FALSE; \
				443
				444	#ifdef FEAT_MBYTE
				445	if (enc_utf8 \|\| STRCMP(p_enc, "latin1") == 0
				446	\|\| STRCMP(p_enc, "iso-8859-15") == 0)
				447	#endif
				448	{
				449	switch (c)
				450	{
				451	case 'A': case '\300': case '\301': case '\302':
				452	case '\303': case '\304': case '\305':
				453	EMIT2('A'); EMIT2('\300'); EMIT2('\301');
				454	EMIT2('\302'); EMIT2('\303'); EMIT2('\304');
				455	EMIT2('\305');
				456	return OK;
				457
				458	case 'C': case '\307':
				459	EMIT2('C'); EMIT2('\307');
				460	return OK;
				461
				462	case 'E': case '\310': case '\311': case '\312': case '\313':
				463	EMIT2('E'); EMIT2('\310'); EMIT2('\311');
				464	EMIT2('\312'); EMIT2('\313');
				465	return OK;
				466
				467	case 'I': case '\314': case '\315': case '\316': case '\317':
				468	EMIT2('I'); EMIT2('\314'); EMIT2('\315');
				469	EMIT2('\316'); EMIT2('\317');
				470	return OK;
				471
				472	case 'N': case '\321':
				473	EMIT2('N'); EMIT2('\321');
				474	return OK;
				475
				476	case 'O': case '\322': case '\323': case '\324': case '\325':
				477	case '\326':
				478	EMIT2('O'); EMIT2('\322'); EMIT2('\323');
				479	EMIT2('\324'); EMIT2('\325'); EMIT2('\326');
				480	return OK;
				481
				482	case 'U': case '\331': case '\332': case '\333': case '\334':
				483	EMIT2('U'); EMIT2('\331'); EMIT2('\332');
				484	EMIT2('\333'); EMIT2('\334');
				485	return OK;
				486
				487	case 'Y': case '\335':
				488	EMIT2('Y'); EMIT2('\335');
				489	return OK;
				490
				491	case 'a': case '\340': case '\341': case '\342':
				492	case '\343': case '\344': case '\345':
				493	EMIT2('a'); EMIT2('\340'); EMIT2('\341');
				494	EMIT2('\342'); EMIT2('\343'); EMIT2('\344');
				495	EMIT2('\345');
				496	return OK;
				497
				498	case 'c': case '\347':
				499	EMIT2('c'); EMIT2('\347');
				500	return OK;
				501
				502	case 'e': case '\350': case '\351': case '\352': case '\353':
				503	EMIT2('e'); EMIT2('\350'); EMIT2('\351');
				504	EMIT2('\352'); EMIT2('\353');
				505	return OK;
				506
				507	case 'i': case '\354': case '\355': case '\356': case '\357':
				508	EMIT2('i'); EMIT2('\354'); EMIT2('\355');
				509	EMIT2('\356'); EMIT2('\357');
				510	return OK;
				511
				512	case 'n': case '\361':
				513	EMIT2('n'); EMIT2('\361');
				514	return OK;
				515
				516	case 'o': case '\362': case '\363': case '\364': case '\365':
				517	case '\366':
				518	EMIT2('o'); EMIT2('\362'); EMIT2('\363');
				519	EMIT2('\364'); EMIT2('\365'); EMIT2('\366');
				520	return OK;
				521
				522	case 'u': case '\371': case '\372': case '\373': case '\374':
				523	EMIT2('u'); EMIT2('\371'); EMIT2('\372');
				524	EMIT2('\373'); EMIT2('\374');
				525	return OK;
				526
				527	case 'y': case '\375': case '\377':
				528	EMIT2('y'); EMIT2('\375'); EMIT2('\377');
				529	return OK;
				530
				531	default:
				532	return FAIL;
				533	}
				534	}
				535
				536	EMIT(c);
				537	return OK;
				538	#undef EMIT2
				539	}
				540
				541	/*
				542	* Code to parse regular expression.
				543	*
				544	* We try to reuse parsing functions in regexp.c to
				545	* minimize surprise and keep the syntax consistent.
				546	*/
				547
				548	/*
				549	* Increments the pointer "p" by one (multi-byte) character.
				550	*/
				551	static void
				552	nfa_inc(p)
				553	char_u **p;
				554	{
				555	#ifdef FEAT_MBYTE
				556	if (has_mbyte)
				557	mb_ptr2char_adv(p);
				558	else
				559	#endif
				560	p = p + 1;
				561	}
				562
				563	/*
				564	* Decrements the pointer "p" by one (multi-byte) character.
				565	*/
				566	static void
				567	nfa_dec(p)
				568	char_u **p;
				569	{
				570	#ifdef FEAT_MBYTE
				571	char_u p2, oldp;
				572
				573	if (has_mbyte)
				574	{
				575	oldp = *p;
				576	/* Try to find the multibyte char that advances to the current
				577	* position. */
				578	do
				579	{
				580	p = p - 1;
				581	p2 = *p;
				582	mb_ptr2char_adv(&p2);
				583	} while (p2 != oldp);
				584	}
				585	#else
				586	p = p - 1;
				587	#endif
				588	}
				589
				590	/*
				591	* Parse the lowest level.
				592	*
				593	* An atom can be one of a long list of items. Many atoms match one character
				594	* in the text. It is often an ordinary character or a character class.
				595	* Braces can be used to make a pattern into an atom. The "\z(\)" construct
				596	* is only for syntax highlighting.
				597	*
				598	* atom ::= ordinary-atom
				599	* or $ pattern $
				600	* or \%( pattern \)
				601	* or \z( pattern \)
				602	*/
				603	static int
				604	nfa_regatom()
				605	{
				606	int c;
				607	int charclass;
				608	int equiclass;
				609	int collclass;
				610	int got_coll_char;
				611	char_u *p;
				612	char_u *endp;
				613	#ifdef FEAT_MBYTE
				614	char_u *old_regparse = regparse;
				615	int clen;
				616	int len;
				617	static char_u buf[30];
				618	int i;
				619	#endif
				620	int extra = 0;
				621	int first;
				622	int emit_range;
				623	int negated;
				624	int result;
				625	int startc = -1;
				626	int endc = -1;
				627	int oldstartc = -1;
				628	int cpo_lit; /* 'cpoptions' contains 'l' flag */
				629	int cpo_bsl; /* 'cpoptions' contains '\' flag */
				630	int glue; /* ID that will "glue" nodes together */
				631
				632	cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
				633	cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
				634
				635	c = getchr();
				636
				637	#ifdef FEAT_MBYTE
				638	/* clen has the length of the current char, without composing chars */
				639	clen = (*mb_char2len)(c);
				640	if (has_mbyte && clen > 1)
				641	goto nfa_do_multibyte;
				642	#endif
				643	switch (c)
				644	{
				645	case Magic('^'):
				646	EMIT(NFA_BOL);
				647	break;
				648
				649	case Magic('$'):
				650	EMIT(NFA_EOL);
				651	#if defined(FEAT_SYN_HL) \|\| defined(PROTO)
				652	had_eol = TRUE;
				653	#endif
				654	break;
				655
				656	case Magic('<'):
				657	EMIT(NFA_BOW);
				658	break;
				659
				660	case Magic('>'):
				661	EMIT(NFA_EOW);
				662	break;
				663
				664	case Magic('_'):
				665	c = no_Magic(getchr());
				666	if (c == '^') /* "\_^" is start-of-line */
				667	{
				668	EMIT(NFA_BOL);
				669	break;
				670	}
				671	if (c == '$') /* "\_$" is end-of-line */
				672	{
				673	EMIT(NFA_EOL);
				674	#if defined(FEAT_SYN_HL) \|\| defined(PROTO)
				675	had_eol = TRUE;
				676	#endif
				677	break;
				678	}
				679
				680	extra = ADD_NL;
				681
				682	/* "\_[" is collection plus newline */
				683	if (c == '[')
				684	/* TODO: make this work
				685	* goto collection; */
				686	return FAIL;
				687
				688	/* "\_x" is character class plus newline */
				689	/FALLTHROUGH/
				690
				691	/*
				692	* Character classes.
				693	*/
				694	case Magic('.'):
				695	case Magic('i'):
				696	case Magic('I'):
				697	case Magic('k'):
				698	case Magic('K'):
				699	case Magic('f'):
				700	case Magic('F'):
				701	case Magic('p'):
				702	case Magic('P'):
				703	case Magic('s'):
				704	case Magic('S'):
				705	case Magic('d'):
				706	case Magic('D'):
				707	case Magic('x'):
				708	case Magic('X'):
				709	case Magic('o'):
				710	case Magic('O'):
				711	case Magic('w'):
				712	case Magic('W'):
				713	case Magic('h'):
				714	case Magic('H'):
				715	case Magic('a'):
				716	case Magic('A'):
				717	case Magic('l'):
				718	case Magic('L'):
				719	case Magic('u'):
				720	case Magic('U'):
				721	p = vim_strchr(classchars, no_Magic(c));
				722	if (p == NULL)
				723	{
				724	return FAIL; /* runtime error */
				725	}
				726	#ifdef FEAT_MBYTE
				727	/* When '.' is followed by a composing char ignore the dot, so that
				728	* the composing char is matched here. */
				729	if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
				730	{
				731	c = getchr();
				732	goto nfa_do_multibyte;
				733	}
				734	#endif
				735	EMIT(nfa_classcodes[p - classchars]);
				736	if (extra == ADD_NL)
				737	{
				738	EMIT(NFA_NEWL);
				739	EMIT(NFA_OR);
				740	regflags \|= RF_HASNL;
				741	}
				742	break;
				743
				744	case Magic('n'):
				745	if (reg_string)
				746	/* In a string "\n" matches a newline character. */
				747	EMIT(NL);
				748	else
				749	{
				750	/* In buffer text "\n" matches the end of a line. */
				751	EMIT(NFA_NEWL);
				752	regflags \|= RF_HASNL;
				753	}
				754	break;
				755
				756	case Magic('('):
				757	if (nfa_reg(REG_PAREN) == FAIL)
				758	return FAIL; /* cascaded error */
				759	break;
				760
				761	case NUL:
				762	syntax_error = TRUE;
				763	EMSG_RET_FAIL(_("E865: (NFA) Regexp end encountered prematurely"));
				764
				765	case Magic('\|'):
				766	case Magic('&'):
				767	case Magic(')'):
				768	syntax_error = TRUE;
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	769	EMSGN(_(e_misplaced), no_Magic(c));
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	770	return FAIL;
				771
				772	case Magic('='):
				773	case Magic('?'):
				774	case Magic('+'):
				775	case Magic('@'):
				776	case Magic('*'):
				777	case Magic('{'):
				778	/* these should follow an atom, not form an atom */
				779	syntax_error = TRUE;
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	780	EMSGN(_(e_misplaced), no_Magic(c));
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	781	return FAIL;
				782
				783	case Magic('~'): /* previous substitute pattern */
				784	/* Not supported yet */
				785	return FAIL;
				786
				787	case Magic('1'):
				788	case Magic('2'):
				789	case Magic('3'):
				790	case Magic('4'):
				791	case Magic('5'):
				792	case Magic('6'):
				793	case Magic('7'):
				794	case Magic('8'):
				795	case Magic('9'):
				796	/* not supported yet */
				797	return FAIL;
				798
				799	case Magic('z'):
				800	c = no_Magic(getchr());
				801	switch (c)
				802	{
				803	case 's':
				804	EMIT(NFA_ZSTART);
				805	break;
				806	case 'e':
				807	EMIT(NFA_ZEND);
				808	nfa_has_zend = TRUE;
				809	/* TODO: Currently \ze does not work properly. */
				810	return FAIL;
				811	/* break; */
				812	case '1':
				813	case '2':
				814	case '3':
				815	case '4':
				816	case '5':
				817	case '6':
				818	case '7':
				819	case '8':
				820	case '9':
				821	case '(':
				822	/* \z1...\z9 and \z( not yet supported */
				823	return FAIL;
				824	default:
				825	syntax_error = TRUE;
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	826	EMSGN(_("E867: (NFA) Unknown operator '\\z%c'"),
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	827	no_Magic(c));
				828	return FAIL;
				829	}
				830	break;
				831
				832	case Magic('%'):
				833	c = no_Magic(getchr());
				834	switch (c)
				835	{
				836	/* () without a back reference */
				837	case '(':
				838	if (nfa_reg(REG_NPAREN) == FAIL)
				839	return FAIL;
				840	EMIT(NFA_NOPEN);
				841	break;
				842
				843	case 'd': /* %d123 decimal */
				844	case 'o': /* %o123 octal */
				845	case 'x': /* %xab hex 2 */
				846	case 'u': /* %uabcd hex 4 */
				847	case 'U': /* %U1234abcd hex 8 */
				848	/* Not yet supported */
				849	return FAIL;
				850
				851	c = coll_get_char();
				852	#ifdef FEAT_MBYTE
				853	if ((*mb_char2len)(c) > 1)
				854	{
				855	EMIT_MBYTE(c);
				856	}
				857	else
				858	#endif
				859	EMIT(c);
				860	break;
				861
				862	/* Catch \%^ and \%$ regardless of where they appear in the
				863	* pattern -- regardless of whether or not it makes sense. */
				864	case '^':
				865	EMIT(NFA_BOF);
				866	/* Not yet supported */
				867	return FAIL;
				868	break;
				869
				870	case '$':
				871	EMIT(NFA_EOF);
				872	/* Not yet supported */
				873	return FAIL;
				874	break;
				875
				876	case '#':
				877	/* not supported yet */
				878	return FAIL;
				879	break;
				880
				881	case 'V':
				882	/* not supported yet */
				883	return FAIL;
				884	break;
				885
				886	case '[':
				887	/* \%[abc] not supported yet */
				888	return FAIL;
				889
				890	default:
				891	/* not supported yet */
				892	return FAIL;
				893	}
				894	break;
				895
				896	/* collection: */
				897	case Magic('['):
				898	/*
				899	* Glue is emitted between several atoms from the [].
				900	* It is either NFA_OR, or NFA_CONCAT.
				901	*
				902	* [abc] expands to 'a b NFA_OR c NFA_OR' (in postfix notation)
				903	* [^abc] expands to 'a NFA_NOT b NFA_NOT NFA_CONCAT c NFA_NOT
				904	* NFA_CONCAT NFA_END_NEG_RANGE NFA_CONCAT' (in postfix
				905	* notation)
				906	*
				907	*/
				908
				909
				910	/* Emit negation atoms, if needed.
				911	* The CONCAT below merges the NOT with the previous node. */
				912	#define TRY_NEG() \
				913	if (negated == TRUE) \
				914	{ \
				915	EMIT(NFA_NOT); \
				916	}
				917
				918	/* Emit glue between important nodes : CONCAT or OR. */
				919	#define EMIT_GLUE() \
				920	if (first == FALSE) \
				921	EMIT(glue); \
				922	else \
				923	first = FALSE;
				924
				925	p = regparse;
				926	endp = skip_anyof(p);
				927	if (*endp == ']')
				928	{
				929	/*
				930	* Try to reverse engineer character classes. For example,
				931	* recognize that [0-9] stands for \d and [A-Za-z_] with \h,
				932	* and perform the necessary substitutions in the NFA.
				933	*/
				934	result = nfa_recognize_char_class(regparse, endp,
				935	extra == ADD_NL);
				936	if (result != FAIL)
				937	{
				938	if (result >= NFA_DIGIT && result <= NFA_NUPPER)
				939	EMIT(result);
				940	else /* must be char class + newline */
				941	{
				942	EMIT(result - ADD_NL);
				943	EMIT(NFA_NEWL);
				944	EMIT(NFA_OR);
				945	}
				946	regparse = endp;
				947	nfa_inc(&regparse);
				948	return OK;
				949	}
				950	/*
				951	* Failed to recognize a character class. Use the simple
				952	* version that turns [abc] into 'a' OR 'b' OR 'c'
				953	*/
				954	startc = endc = oldstartc = -1;
				955	first = TRUE; /* Emitting first atom in this sequence? */
				956	negated = FALSE;
				957	glue = NFA_OR;
				958	if (regparse == '^') / negated range */
				959	{
				960	negated = TRUE;
				961	glue = NFA_CONCAT;
				962	nfa_inc(&regparse);
				963	}
				964	if (*regparse == '-')
				965	{
				966	startc = '-';
				967	EMIT(startc);
				968	TRY_NEG();
				969	EMIT_GLUE();
				970	nfa_inc(&regparse);
				971	}
				972	/* Emit the OR branches for each character in the [] */
				973	emit_range = FALSE;
				974	while (regparse < endp)
				975	{
				976	oldstartc = startc;
				977	startc = -1;
				978	got_coll_char = FALSE;
				979	if (*regparse == '[')
				980	{
				981	/* Check for [: :], [= =], [. .] */
				982	equiclass = collclass = 0;
				983	charclass = get_char_class(&regparse);
				984	if (charclass == CLASS_NONE)
				985	{
				986	equiclass = get_equi_class(&regparse);
				987	if (equiclass == 0)
				988	collclass = get_coll_element(&regparse);
				989	}
				990
				991	/* Character class like [:alpha:] */
				992	if (charclass != CLASS_NONE)
				993	{
				994	switch (charclass)
				995	{
				996	case CLASS_ALNUM:
				997	EMIT(NFA_CLASS_ALNUM);
				998	break;
				999	case CLASS_ALPHA:
				1000	EMIT(NFA_CLASS_ALPHA);
				1001	break;
				1002	case CLASS_BLANK:
				1003	EMIT(NFA_CLASS_BLANK);
				1004	break;
				1005	case CLASS_CNTRL:
				1006	EMIT(NFA_CLASS_CNTRL);
				1007	break;
				1008	case CLASS_DIGIT:
				1009	EMIT(NFA_CLASS_DIGIT);
				1010	break;
				1011	case CLASS_GRAPH:
				1012	EMIT(NFA_CLASS_GRAPH);
				1013	break;
				1014	case CLASS_LOWER:
				1015	EMIT(NFA_CLASS_LOWER);
				1016	break;
				1017	case CLASS_PRINT:
				1018	EMIT(NFA_CLASS_PRINT);
				1019	break;
				1020	case CLASS_PUNCT:
				1021	EMIT(NFA_CLASS_PUNCT);
				1022	break;
				1023	case CLASS_SPACE:
				1024	EMIT(NFA_CLASS_SPACE);
				1025	break;
				1026	case CLASS_UPPER:
				1027	EMIT(NFA_CLASS_UPPER);
				1028	break;
				1029	case CLASS_XDIGIT:
				1030	EMIT(NFA_CLASS_XDIGIT);
				1031	break;
				1032	case CLASS_TAB:
				1033	EMIT(NFA_CLASS_TAB);
				1034	break;
				1035	case CLASS_RETURN:
				1036	EMIT(NFA_CLASS_RETURN);
				1037	break;
				1038	case CLASS_BACKSPACE:
				1039	EMIT(NFA_CLASS_BACKSPACE);
				1040	break;
				1041	case CLASS_ESCAPE:
				1042	EMIT(NFA_CLASS_ESCAPE);
				1043	break;
				1044	}
				1045	TRY_NEG();
				1046	EMIT_GLUE();
				1047	continue;
				1048	}
				1049	/* Try equivalence class [=a=] and the like */
				1050	if (equiclass != 0)
				1051	{
				1052	result = nfa_emit_equi_class(equiclass, negated);
				1053	if (result == FAIL)
				1054	{
				1055	/* should never happen */
				1056	EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!"));
				1057	}
				1058	EMIT_GLUE();
				1059	continue;
				1060	}
				1061	/* Try collating class like [. .] */
				1062	if (collclass != 0)
				1063	{
				1064	startc = collclass; /* allow [.a.]-x as a range */
				1065	/* Will emit the proper atom at the end of the
				1066	* while loop. */
				1067	}
				1068	}
				1069	/* Try a range like 'a-x' or '\t-z' */
				1070	if (*regparse == '-')
				1071	{
				1072	emit_range = TRUE;
				1073	startc = oldstartc;
				1074	nfa_inc(&regparse);
				1075	continue; /* reading the end of the range */
				1076	}
				1077
				1078	/* Now handle simple and escaped characters.
				1079	* Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
				1080	* accepts "\t", "\e", etc., but only when the 'l' flag in
				1081	* 'cpoptions' is not included.
				1082	* Posix doesn't recognize backslash at all.
				1083	*/
				1084	if (*regparse == '\\'
				1085	&& !cpo_bsl
				1086	&& regparse + 1 <= endp
				1087	&& (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
				1088	\|\| (!cpo_lit
				1089	&& vim_strchr(REGEXP_ABBR, regparse[1])
				1090	!= NULL)
				1091	)
				1092	)
				1093	{
				1094	nfa_inc(&regparse);
				1095
				1096	if (regparse == 'n' \|\| regparse == 'n')
				1097	startc = reg_string ? NL : NFA_NEWL;
				1098	else
				1099	if (*regparse == 'd'
				1100	\|\| *regparse == 'o'
				1101	\|\| *regparse == 'x'
				1102	\|\| *regparse == 'u'
				1103	\|\| *regparse == 'U'
				1104	)
				1105	{
				1106	/* TODO(RE) This needs more testing */
				1107	startc = coll_get_char();
				1108	got_coll_char = TRUE;
				1109	nfa_dec(&regparse);
				1110	}
				1111	else
				1112	{
				1113	/* \r,\t,\e,\b */
				1114	startc = backslash_trans(*regparse);
				1115	}
				1116	}
				1117
				1118	/* Normal printable char */
				1119	if (startc == -1)
				1120	#ifdef FEAT_MBYTE
				1121	startc = (*mb_ptr2char)(regparse);
				1122	#else
				1123	startc = *regparse;
				1124	#endif
				1125
				1126	/* Previous char was '-', so this char is end of range. */
				1127	if (emit_range)
				1128	{
				1129	endc = startc; startc = oldstartc;
				1130	if (startc > endc)
				1131	EMSG_RET_FAIL(_(e_invrange));
				1132	#ifdef FEAT_MBYTE
				1133	if (has_mbyte && ((*mb_char2len)(startc) > 1
				1134	\|\| (*mb_char2len)(endc) > 1))
				1135	{
				1136	if (endc > startc + 256)
				1137	EMSG_RET_FAIL(_(e_invrange));
				1138	/* Emit the range. "startc" was already emitted, so
				1139	* skip it. */
				1140	for (c = startc + 1; c <= endc; c++)
				1141	{
				1142	if ((*mb_char2len)(c) > 1)
				1143	{
				1144	EMIT_MBYTE(c);
				1145	}
				1146	else
				1147	EMIT(c);
				1148	TRY_NEG();
				1149	EMIT_GLUE();
				1150	}
				1151	emit_range = FALSE;
				1152	}
				1153	else
				1154	#endif
				1155	{
				1156	#ifdef EBCDIC
				1157	int alpha_only = FALSE;
				1158
				1159	/* for alphabetical range skip the gaps
				1160	* 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
				1161	if (isalpha(startc) && isalpha(endc))
				1162	alpha_only = TRUE;
				1163	#endif
				1164	/* Emit the range. "startc" was already emitted, so
				1165	* skip it. */
				1166	for (c = startc + 1; c <= endc; c++)
				1167	#ifdef EBCDIC
				1168	if (!alpha_only \|\| isalpha(startc))
				1169	#endif
				1170	{
				1171	EMIT(c);
				1172	TRY_NEG();
				1173	EMIT_GLUE();
				1174	}
				1175	emit_range = FALSE;
				1176	}
				1177	}
				1178	else
				1179	{
				1180	/*
				1181	* This char (startc) is not part of a range. Just
				1182	* emit it.
				1183	*
				1184	* Normally, simply emit startc. But if we get char
				1185	* code=0 from a collating char, then replace it with
				1186	* 0x0a.
				1187	*
				1188	* This is needed to completely mimic the behaviour of
				1189	* the backtracking engine.
				1190	*/
				1191	if (got_coll_char == TRUE && startc == 0)
				1192	EMIT(0x0a);
				1193	else
				1194	#ifdef FEAT_MBYTE
				1195	if ((*mb_char2len)(startc) > 1)
				1196	{
				1197	EMIT_MBYTE(startc);
				1198	}
				1199	else
				1200	#endif
				1201	EMIT(startc);
				1202	TRY_NEG();
				1203	EMIT_GLUE();
				1204	}
				1205
				1206	nfa_inc(&regparse);
				1207	} /* while (p < endp) */
				1208
				1209	nfa_dec(&regparse);
				1210	if (regparse == '-') / if last, '-' is just a char */
				1211	{
				1212	EMIT('-');
				1213	TRY_NEG();
				1214	EMIT_GLUE();
				1215	}
				1216	nfa_inc(&regparse);
				1217
				1218	if (extra == ADD_NL) /* \_[] also matches \n */
				1219	{
				1220	EMIT(reg_string ? NL : NFA_NEWL);
				1221	TRY_NEG();
				1222	EMIT_GLUE();
				1223	}
				1224
				1225	/* skip the trailing ] */
				1226	regparse = endp;
				1227	nfa_inc(&regparse);
				1228	if (negated == TRUE)
				1229	{
				1230	/* Mark end of negated char range */
				1231	EMIT(NFA_END_NEG_RANGE);
				1232	EMIT(NFA_CONCAT);
				1233	}
				1234	return OK;
				1235	} /* if exists closing ] */
				1236	else if (reg_strict)
				1237	{
				1238	syntax_error = TRUE;
				1239	EMSG_RET_FAIL(_(e_missingbracket));
				1240	}
				1241
				1242	/* FALLTHROUGH */
				1243	default:
				1244	{
				1245	#ifdef FEAT_MBYTE
				1246	int plen;
				1247
				1248	nfa_do_multibyte:
				1249	/* length of current char, with composing chars,
				1250	* from pointer */
				1251	plen = (*mb_ptr2len)(old_regparse);
				1252	if (enc_utf8 && clen != plen)
				1253	{
				1254	/* A composing character is always handled as a
				1255	* separate atom, surrounded by NFA_COMPOSING and
				1256	* NFA_END_COMPOSING. Note that right now we are
				1257	* building the postfix form, not the NFA itself;
				1258	* a composing char could be: a, b, c, NFA_COMPOSING
				1259	* where 'a', 'b', 'c' are chars with codes > 256.
				1260	*/
				1261	EMIT_COMPOSING_UTF(old_regparse);
				1262	regparse = old_regparse + plen;
				1263	}
				1264	else
				1265	/* A multi-byte character is always handled as a
				1266	* separate atom, surrounded by NFA_MULTIBYTE and
				1267	* NFA_END_MULTIBYTE */
				1268	if (plen > 1)
				1269	{
				1270	EMIT_MBYTE(c);
				1271	}
				1272	else
				1273	#endif
				1274	{
				1275	c = no_Magic(c);
				1276	EMIT(c);
				1277	}
				1278	return OK;
				1279	}
				1280	}
				1281
				1282	#undef TRY_NEG
				1283	#undef EMIT_GLUE
				1284
				1285	return OK;
				1286	}
				1287
				1288	/*
				1289	* Parse something followed by possible [*+=].
				1290	*
				1291	* A piece is an atom, possibly followed by a multi, an indication of how many
				1292	* times the atom can be matched. Example: "a*" matches any sequence of "a"
				1293	* characters: "", "a", "aa", etc.
				1294	*
				1295	* piece ::= atom
				1296	* or atom multi
				1297	*/
				1298	static int
				1299	nfa_regpiece()
				1300	{
				1301	int i;
				1302	int op;
				1303	int ret;
				1304	long minval, maxval;
				1305	int greedy = TRUE; /* Braces are prefixed with '-' ? */
				1306	char_u old_regparse, new_regparse;
				1307	int c2;
				1308	int old_post_ptr, my_post_start;
				1309	int old_regnpar;
				1310	int quest;
				1311
				1312	/* Save the current position in the regexp, so that we can use it if
				1313	* <atom>{m,n} is next. */
				1314	old_regparse = regparse;
				1315	/* Save current number of open parenthesis, so we can use it if
				1316	* <atom>{m,n} is next */
				1317	old_regnpar = regnpar;
				1318	/* store current pos in the postfix form, for \{m,n} involving 0s */
				1319	my_post_start = post_ptr;
				1320
				1321	ret = nfa_regatom();
				1322	if (ret == FAIL)
				1323	return FAIL; /* cascaded error */
				1324
				1325	op = peekchr();
				1326	if (re_multi_type(op) == NOT_MULTI)
				1327	return OK;
				1328
				1329	skipchr();
				1330	switch (op)
				1331	{
				1332	case Magic('*'):
				1333	EMIT(NFA_STAR);
				1334	break;
				1335
				1336	case Magic('+'):
				1337	/*
				1338	* Trick: Normally, (a*)\+ would match the whole input "aaa". The
				1339	* first and only submatch would be "aaa". But the backtracking
				1340	* engine interprets the plus as "try matching one more time", and
				1341	* a* matches a second time at the end of the input, the empty
				1342	* string.
				1343	* The submatch will the empty string.
				1344	*
				1345	* In order to be consistent with the old engine, we disable
				1346	* NFA_PLUS, and replace <atom>+ with <atom><atom>*
				1347	*/
				1348	/* EMIT(NFA_PLUS); */
				1349	regnpar = old_regnpar;
				1350	regparse = old_regparse;
				1351	curchr = -1;
				1352	if (nfa_regatom() == FAIL)
				1353	return FAIL;
				1354	EMIT(NFA_STAR);
				1355	EMIT(NFA_CONCAT);
				1356	skipchr(); /* skip the \+ */
				1357	break;
				1358
				1359	case Magic('@'):
				1360	op = no_Magic(getchr());
				1361	switch(op)
				1362	{
				1363	case '=':
				1364	EMIT(NFA_PREV_ATOM_NO_WIDTH);
				1365	break;
				1366	case '!':
				1367	case '<':
				1368	case '>':
				1369	/* Not supported yet */
				1370	return FAIL;
				1371	default:
				1372	syntax_error = TRUE;
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	1373	EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op);
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	1374	return FAIL;
				1375	}
				1376	break;
				1377
				1378	case Magic('?'):
				1379	case Magic('='):
				1380	EMIT(NFA_QUEST);
				1381	break;
				1382
				1383	case Magic('{'):
				1384	/* a{2,5} will expand to 'aaa?a?a?'
				1385	* a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
				1386	* version of '?'
				1387	* \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
				1388	* parenthesis have the same id
				1389	*/
				1390
				1391	greedy = TRUE;
				1392	c2 = peekchr();
				1393	if (c2 == '-' \|\| c2 == Magic('-'))
				1394	{
				1395	skipchr();
				1396	greedy = FALSE;
				1397	}
				1398	if (!read_limits(&minval, &maxval))
				1399	{
				1400	syntax_error = TRUE;
				1401	EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
				1402	}
				1403	/* <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
				1404	* <atom>* */
				1405	if (minval == 0 && maxval == MAX_LIMIT && greedy)
				1406	{
				1407	EMIT(NFA_STAR);
				1408	break;
				1409	}
				1410
				1411	if (maxval > NFA_BRACES_MAXLIMIT)
				1412	{
				1413	/* This would yield a huge automaton and use too much memory.
				1414	* Revert to old engine */
				1415	return FAIL;
				1416	}
				1417
				1418	/* Special case: x{0} or x{-0} */
				1419	if (maxval == 0)
				1420	{
				1421	/* Ignore result of previous call to nfa_regatom() */
				1422	post_ptr = my_post_start;
				1423	/* NFA_SKIP_CHAR has 0-length and works everywhere */
				1424	EMIT(NFA_SKIP_CHAR);
				1425	return OK;
				1426	}
				1427
				1428	/* Ignore previous call to nfa_regatom() */
				1429	post_ptr = my_post_start;
				1430	/* Save pos after the repeated atom and the \{} */
				1431	new_regparse = regparse;
				1432
				1433	new_regparse = regparse;
				1434	quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
				1435	for (i = 0; i < maxval; i++)
				1436	{
				1437	/* Goto beginning of the repeated atom */
				1438	regparse = old_regparse;
				1439	curchr = -1;
				1440	/* Restore count of parenthesis */
				1441	regnpar = old_regnpar;
				1442	old_post_ptr = post_ptr;
				1443	if (nfa_regatom() == FAIL)
				1444	return FAIL;
				1445	/* after "minval" times, atoms are optional */
				1446	if (i + 1 > minval)
				1447	EMIT(quest);
				1448	if (old_post_ptr != my_post_start)
				1449	EMIT(NFA_CONCAT);
				1450	}
				1451
				1452	/* Go to just after the repeated atom and the \{} */
				1453	regparse = new_regparse;
				1454	curchr = -1;
				1455
				1456	break;
				1457
				1458
				1459	default:
				1460	break;
				1461	} /* end switch */
				1462
				1463	if (re_multi_type(peekchr()) != NOT_MULTI)
				1464	{
				1465	/* Can't have a multi follow a multi. */
				1466	syntax_error = TRUE;
				1467	EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi !"));
				1468	}
				1469
				1470	return OK;
				1471	}
				1472
				1473	/*
				1474	* Parse one or more pieces, concatenated. It matches a match for the
				1475	* first piece, followed by a match for the second piece, etc. Example:
				1476	* "f[0-9]b", first matches "f", then a digit and then "b".
				1477	*
				1478	* concat ::= piece
				1479	* or piece piece
				1480	* or piece piece piece
				1481	* etc.
				1482	*/
				1483	static int
				1484	nfa_regconcat()
				1485	{
				1486	int cont = TRUE;
				1487	int first = TRUE;
				1488
				1489	while (cont)
				1490	{
				1491	switch (peekchr())
				1492	{
				1493	case NUL:
				1494	case Magic('\|'):
				1495	case Magic('&'):
				1496	case Magic(')'):
				1497	cont = FALSE;
				1498	break;
				1499
				1500	case Magic('Z'):
				1501	#ifdef FEAT_MBYTE
				1502	regflags \|= RF_ICOMBINE;
				1503	#endif
				1504	skipchr_keepstart();
				1505	break;
				1506	case Magic('c'):
				1507	regflags \|= RF_ICASE;
				1508	skipchr_keepstart();
				1509	break;
				1510	case Magic('C'):
				1511	regflags \|= RF_NOICASE;
				1512	skipchr_keepstart();
				1513	break;
				1514	case Magic('v'):
				1515	reg_magic = MAGIC_ALL;
				1516	skipchr_keepstart();
				1517	curchr = -1;
				1518	break;
				1519	case Magic('m'):
				1520	reg_magic = MAGIC_ON;
				1521	skipchr_keepstart();
				1522	curchr = -1;
				1523	break;
				1524	case Magic('M'):
				1525	reg_magic = MAGIC_OFF;
				1526	skipchr_keepstart();
				1527	curchr = -1;
				1528	break;
				1529	case Magic('V'):
				1530	reg_magic = MAGIC_NONE;
				1531	skipchr_keepstart();
				1532	curchr = -1;
				1533	break;
				1534
				1535	default:
				1536	if (nfa_regpiece() == FAIL)
				1537	return FAIL;
				1538	if (first == FALSE)
				1539	EMIT(NFA_CONCAT);
				1540	else
				1541	first = FALSE;
				1542	break;
				1543	}
				1544	}
				1545
				1546	return OK;
				1547	}
				1548
				1549	/*
				1550	* Parse a branch, one or more concats, separated by "\&". It matches the
				1551	* last concat, but only if all the preceding concats also match at the same
				1552	* position. Examples:
				1553	* "foobeep\&..." matches "foo" in "foobeep".
				1554	* ".Peter\&.Bob" matches in a line containing both "Peter" and "Bob"
				1555	*
				1556	* branch ::= concat
				1557	* or concat \& concat
				1558	* or concat \& concat \& concat
				1559	* etc.
				1560	*/
				1561	static int
				1562	nfa_regbranch()
				1563	{
				1564	int ch;
				1565	int *old_post_ptr;
				1566
				1567	old_post_ptr = post_ptr;
				1568
				1569	/* First branch, possibly the only one */
				1570	if (nfa_regconcat() == FAIL)
				1571	return FAIL;
				1572
				1573	ch = peekchr();
				1574	/* Try next concats */
				1575	while (ch == Magic('&'))
				1576	{
				1577	skipchr();
				1578	EMIT(NFA_NOPEN);
				1579	EMIT(NFA_PREV_ATOM_NO_WIDTH);
				1580	old_post_ptr = post_ptr;
				1581	if (nfa_regconcat() == FAIL)
				1582	return FAIL;
				1583	/* if concat is empty, skip a input char. But do emit a node */
				1584	if (old_post_ptr == post_ptr)
				1585	EMIT(NFA_SKIP_CHAR);
				1586	EMIT(NFA_CONCAT);
				1587	ch = peekchr();
				1588	}
				1589
				1590	/* Even if a branch is empty, emit one node for it */
				1591	if (old_post_ptr == post_ptr)
				1592	EMIT(NFA_SKIP_CHAR);
				1593
				1594	return OK;
				1595	}
				1596
				1597	/*
				1598	* Parse a pattern, one or more branches, separated by "\\|". It matches
				1599	* anything that matches one of the branches. Example: "foo\\|beep" matches
				1600	* "foo" and matches "beep". If more than one branch matches, the first one
				1601	* is used.
				1602	*
				1603	* pattern ::= branch
				1604	* or branch \\| branch
				1605	* or branch \\| branch \\| branch
				1606	* etc.
				1607	*/
				1608	static int
				1609	nfa_reg(paren)
				1610	int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
				1611	{
				1612	int parno = 0;
				1613
				1614	#ifdef FEAT_SYN_HL
				1615	#endif
				1616	if (paren == REG_PAREN)
				1617	{
				1618	if (regnpar >= NSUBEXP) /* Too many `(' */
				1619	{
				1620	syntax_error = TRUE;
				1621	EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
				1622	}
				1623	parno = regnpar++;
				1624	}
				1625
				1626	if (nfa_regbranch() == FAIL)
				1627	return FAIL; /* cascaded error */
				1628
				1629	while (peekchr() == Magic('\|'))
				1630	{
				1631	skipchr();
				1632	if (nfa_regbranch() == FAIL)
				1633	return FAIL; /* cascaded error */
				1634	EMIT(NFA_OR);
				1635	}
				1636
				1637	/* Check for proper termination. */
				1638	if (paren != REG_NOPAREN && getchr() != Magic(')'))
				1639	{
				1640	syntax_error = TRUE;
				1641	if (paren == REG_NPAREN)
				1642	EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
				1643	else
				1644	EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
				1645	}
				1646	else if (paren == REG_NOPAREN && peekchr() != NUL)
				1647	{
				1648	syntax_error = TRUE;
				1649	if (peekchr() == Magic(')'))
				1650	EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
				1651	else
				1652	EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
				1653	}
				1654	/*
				1655	* Here we set the flag allowing back references to this set of
				1656	* parentheses.
				1657	*/
				1658	if (paren == REG_PAREN)
				1659	{
				1660	had_endbrace[parno] = TRUE; /* have seen the close paren */
				1661	EMIT(NFA_MOPEN + parno);
				1662	}
				1663
				1664	return OK;
				1665	}
				1666
				1667	typedef struct
				1668	{
				1669	char_u *start[NSUBEXP];
				1670	char_u *end[NSUBEXP];
				1671	lpos_T startpos[NSUBEXP];
				1672	lpos_T endpos[NSUBEXP];
				1673	} regsub_T;
				1674
				1675	static int nfa_regmatch __ARGS((nfa_state_T start, regsub_T submatch, regsub_T *m));
				1676
				1677	#ifdef DEBUG
				1678	static char_u code[50];
				1679
				1680	static void
				1681	nfa_set_code(c)
				1682	int c;
				1683	{
				1684	int addnl = FALSE;
				1685
				1686	if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
				1687	{
				1688	addnl = TRUE;
				1689	c -= ADD_NL;
				1690	}
				1691
				1692	STRCPY(code, "");
				1693	switch (c)
				1694	{
				1695	case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
				1696	case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
				1697	case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
				1698	case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
				1699	case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
				1700	case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
				1701
				1702	case NFA_PREV_ATOM_NO_WIDTH:
				1703	STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
				1704	case NFA_NOPEN: STRCPY(code, "NFA_MOPEN_INVISIBLE"); break;
				1705	case NFA_NCLOSE: STRCPY(code, "NFA_MCLOSE_INVISIBLE"); break;
				1706	case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
				1707	case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
				1708
				1709	case NFA_MULTIBYTE: STRCPY(code, "NFA_MULTIBYTE"); break;
				1710	case NFA_END_MULTIBYTE: STRCPY(code, "NFA_END_MULTIBYTE"); break;
				1711
				1712	case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
				1713	case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
				1714
				1715	case NFA_MOPEN + 0:
				1716	case NFA_MOPEN + 1:
				1717	case NFA_MOPEN + 2:
				1718	case NFA_MOPEN + 3:
				1719	case NFA_MOPEN + 4:
				1720	case NFA_MOPEN + 5:
				1721	case NFA_MOPEN + 6:
				1722	case NFA_MOPEN + 7:
				1723	case NFA_MOPEN + 8:
				1724	case NFA_MOPEN + 9:
				1725	STRCPY(code, "NFA_MOPEN(x)");
				1726	code[10] = c - NFA_MOPEN + '0';
				1727	break;
				1728	case NFA_MCLOSE + 0:
				1729	case NFA_MCLOSE + 1:
				1730	case NFA_MCLOSE + 2:
				1731	case NFA_MCLOSE + 3:
				1732	case NFA_MCLOSE + 4:
				1733	case NFA_MCLOSE + 5:
				1734	case NFA_MCLOSE + 6:
				1735	case NFA_MCLOSE + 7:
				1736	case NFA_MCLOSE + 8:
				1737	case NFA_MCLOSE + 9:
				1738	STRCPY(code, "NFA_MCLOSE(x)");
				1739	code[11] = c - NFA_MCLOSE + '0';
				1740	break;
				1741	case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
				1742	case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
				1743	case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
				1744	case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
				1745	case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
				1746	case NFA_PLUS: STRCPY(code, "NFA_PLUS "); break;
				1747	case NFA_NOT: STRCPY(code, "NFA_NOT "); break;
				1748	case NFA_SKIP_CHAR: STRCPY(code, "NFA_SKIP_CHAR"); break;
				1749	case NFA_OR: STRCPY(code, "NFA_OR"); break;
				1750	case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
				1751	case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
				1752	case NFA_END_NEG_RANGE: STRCPY(code, "NFA_END_NEG_RANGE"); break;
				1753	case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
				1754	case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
				1755	case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
				1756	case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
				1757	case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
				1758	case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
				1759	case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
				1760	case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
				1761	case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
				1762	case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
				1763	case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
				1764	case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
				1765	case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
				1766	case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
				1767	case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
				1768	case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
				1769
				1770	case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
				1771	case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
				1772	case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
				1773	case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
				1774	case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
				1775	case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
				1776	case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
				1777	case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
				1778	case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
				1779	case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
				1780	case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
				1781	case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
				1782	case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
				1783	case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
				1784	case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
				1785	case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
				1786	case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
				1787	case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
				1788	case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
				1789	case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
				1790	case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
				1791	case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
				1792	case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
				1793	case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
				1794	case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
				1795	case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
				1796	case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
				1797
				1798	default:
				1799	STRCPY(code, "CHAR(x)");
				1800	code[5] = c;
				1801	}
				1802
				1803	if (addnl == TRUE)
				1804	STRCAT(code, " + NEWLINE ");
				1805
				1806	}
				1807
				1808	#ifdef ENABLE_LOG
				1809	static FILE *log_fd;
				1810
				1811	/*
				1812	* Print the postfix notation of the current regexp.
				1813	*/
				1814	static void
				1815	nfa_postfix_dump(expr, retval)
				1816	char_u *expr;
				1817	int retval;
				1818	{
				1819	int *p;
				1820	FILE *f;
				1821
				1822	f = fopen("LOG.log", "a");
				1823	if (f != NULL)
				1824	{
				1825	fprintf(f, "\n-------------------------\n");
				1826	if (retval == FAIL)
				1827	fprintf(f, ">>> NFA engine failed ... \n");
				1828	else if (retval == OK)
				1829	fprintf(f, ">>> NFA engine succeeded !\n");
				1830	fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
				1831	for (p=post_start; *p; p++)
				1832	{
				1833	nfa_set_code(*p);
				1834	fprintf(f, "%s, ", code);
				1835	}
				1836	fprintf(f, "\"\nPostfix notation (int): ");
				1837	for (p=post_start; *p; p++)
				1838	fprintf(f, "%d ", *p);
				1839	fprintf(f, "\n\n");
				1840	fclose(f);
				1841	}
				1842	}
				1843
				1844	/*
				1845	* Print the NFA starting with a root node "state".
				1846	*/
				1847	static void
				1848	nfa_print_state(debugf, state, ident)
				1849	FILE *debugf;
				1850	nfa_state_T *state;
				1851	int ident;
				1852	{
				1853	int i;
				1854
				1855	if (state == NULL)
				1856	return;
				1857
				1858	fprintf(debugf, "(%2d)", abs(state->id));
				1859	for (i = 0; i < ident; i++)
				1860	fprintf(debugf, "%c", ' ');
				1861
				1862	nfa_set_code(state->c);
				1863	fprintf(debugf, "%s %s (%d) (id=%d)\n",
				1864	state->negated ? "NOT" : "", code, state->c, abs(state->id));
				1865	if (state->id < 0)
				1866	return;
				1867
				1868	state->id = abs(state->id) * -1;
				1869	nfa_print_state(debugf, state->out, ident + 4);
				1870	nfa_print_state(debugf, state->out1, ident + 4);
				1871	}
				1872
				1873	/*
				1874	* Print the NFA state machine.
				1875	*/
				1876	static void
				1877	nfa_dump(prog)
				1878	nfa_regprog_T *prog;
				1879	{
				1880	FILE *debugf = fopen("LOG.log", "a");
				1881
				1882	if (debugf != NULL)
				1883	{
				1884	nfa_print_state(debugf, prog->start, 0);
				1885	fclose(debugf);
				1886	}
				1887	}
				1888	#endif /* ENABLE_LOG */
				1889	#endif /* DEBUG */
				1890
				1891	/*
				1892	* Parse r.e. @expr and convert it into postfix form.
				1893	* Return the postfix string on success, NULL otherwise.
				1894	*/
				1895	static int *
				1896	re2post()
				1897	{
				1898	if (nfa_reg(REG_NOPAREN) == FAIL)
				1899	return NULL;
				1900	EMIT(NFA_MOPEN);
				1901	return post_start;
				1902	}
				1903
				1904	/* NB. Some of the code below is inspired by Russ's. */
				1905
				1906	/*
				1907	* Represents an NFA state plus zero or one or two arrows exiting.
				1908	* if c == MATCH, no arrows out; matching state.
				1909	* If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
				1910	* If c < 256, labeled arrow with character c to out.
				1911	*/
				1912
				1913	static nfa_state_T state_ptr; / points to nfa_prog->state */
				1914
				1915	/*
				1916	* Allocate and initialize nfa_state_T.
				1917	*/
				1918	static nfa_state_T *
				1919	new_state(c, out, out1)
				1920	int c;
				1921	nfa_state_T *out;
				1922	nfa_state_T *out1;
				1923	{
				1924	nfa_state_T *s;
				1925
				1926	if (istate >= nstate)
				1927	return NULL;
				1928
				1929	s = &state_ptr[istate++];
				1930
				1931	s->c = c;
				1932	s->out = out;
				1933	s->out1 = out1;
				1934
				1935	s->id = istate;
				1936	s->lastlist = 0;
				1937	s->lastthread = NULL;
				1938	s->visits = 0;
				1939	s->negated = FALSE;
				1940
				1941	return s;
				1942	}
				1943
				1944	/*
				1945	* A partially built NFA without the matching state filled in.
				1946	* Frag_T.start points at the start state.
				1947	* Frag_T.out is a list of places that need to be set to the
				1948	* next state for this fragment.
				1949	*/
				1950	typedef union Ptrlist Ptrlist;
				1951	struct Frag
				1952	{
				1953	nfa_state_T *start;
				1954	Ptrlist *out;
				1955	};
				1956	typedef struct Frag Frag_T;
				1957
				1958	static Frag_T frag __ARGS((nfa_state_T start, Ptrlist out));
				1959	static Ptrlist list1 __ARGS((nfa_state_T *outp));
				1960	static void patch __ARGS((Ptrlist l, nfa_state_T s));
				1961	static Ptrlist append __ARGS((Ptrlist l1, Ptrlist *l2));
				1962	static void st_push __ARGS((Frag_T s, Frag_T *p, Frag_T stack_end));
				1963	static Frag_T st_pop __ARGS((Frag_T *p, Frag_T stack));
				1964
				1965	/*
Bram Moolenaar	053bb60	2013-05-20 13:55:21 +0200	[diff] [blame]	1966	* Initialize a Frag_T struct and return it.
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	1967	*/
				1968	static Frag_T
				1969	frag(start, out)
				1970	nfa_state_T *start;
				1971	Ptrlist *out;
				1972	{
Bram Moolenaar	053bb60	2013-05-20 13:55:21 +0200	[diff] [blame]	1973	Frag_T n;
				1974
				1975	n.start = start;
				1976	n.out = out;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	1977	return n;
				1978	}
				1979
				1980	/*
				1981	* Since the out pointers in the list are always
				1982	* uninitialized, we use the pointers themselves
				1983	* as storage for the Ptrlists.
				1984	*/
				1985	union Ptrlist
				1986	{
				1987	Ptrlist *next;
				1988	nfa_state_T *s;
				1989	};
				1990
				1991	/*
				1992	* Create singleton list containing just outp.
				1993	*/
				1994	static Ptrlist *
				1995	list1(outp)
				1996	nfa_state_T **outp;
				1997	{
				1998	Ptrlist *l;
				1999
				2000	l = (Ptrlist *)outp;
				2001	l->next = NULL;
				2002	return l;
				2003	}
				2004
				2005	/*
				2006	* Patch the list of states at out to point to start.
				2007	*/
				2008	static void
				2009	patch(l, s)
				2010	Ptrlist *l;
				2011	nfa_state_T *s;
				2012	{
				2013	Ptrlist *next;
				2014
				2015	for (; l; l = next)
				2016	{
				2017	next = l->next;
				2018	l->s = s;
				2019	}
				2020	}
				2021
				2022
				2023	/*
				2024	* Join the two lists l1 and l2, returning the combination.
				2025	*/
				2026	static Ptrlist *
				2027	append(l1, l2)
				2028	Ptrlist *l1;
				2029	Ptrlist *l2;
				2030	{
				2031	Ptrlist *oldl1;
				2032
				2033	oldl1 = l1;
				2034	while (l1->next)
				2035	l1 = l1->next;
				2036	l1->next = l2;
				2037	return oldl1;
				2038	}
				2039
				2040	/*
				2041	* Stack used for transforming postfix form into NFA.
				2042	*/
				2043	static Frag_T empty;
				2044
				2045	static void
				2046	st_error(postfix, end, p)
				2047	int *postfix;
				2048	int *end;
				2049	int *p;
				2050	{
				2051	FILE *df;
				2052	int *p2;
				2053
				2054	df = fopen("stack.err", "a");
				2055	if (df)
				2056	{
				2057	fprintf(df, "Error popping the stack!\n");
				2058	#ifdef DEBUG
				2059	fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
				2060	#endif
				2061	fprintf(df, "Postfix form is: ");
				2062	#ifdef DEBUG
				2063	for (p2 = postfix; p2 < end; p2++)
				2064	{
				2065	nfa_set_code(*p2);
				2066	fprintf(df, "%s, ", code);
				2067	}
				2068	nfa_set_code(*p);
				2069	fprintf(df, "\nCurrent position is: ");
				2070	for (p2 = postfix; p2 <= p; p2 ++)
				2071	{
				2072	nfa_set_code(*p2);
				2073	fprintf(df, "%s, ", code);
				2074	}
				2075	#else
				2076	for (p2 = postfix; p2 < end; p2++)
				2077	{
				2078	fprintf(df, "%d, ", *p2);
				2079	}
				2080	fprintf(df, "\nCurrent position is: ");
				2081	for (p2 = postfix; p2 <= p; p2 ++)
				2082	{
				2083	fprintf(df, "%d, ", *p2);
				2084	}
				2085	#endif
				2086	fprintf(df, "\n--------------------------\n");
				2087	fclose(df);
				2088	}
				2089	EMSG(_("E874: (NFA) Could not pop the stack !"));
				2090	}
				2091
				2092	/*
				2093	* Push an item onto the stack.
				2094	*/
				2095	static void
				2096	st_push(s, p, stack_end)
				2097	Frag_T s;
				2098	Frag_T **p;
				2099	Frag_T *stack_end;
				2100	{
				2101	Frag_T stackp = p;
				2102
				2103	if (stackp >= stack_end)
				2104	return;
				2105	*stackp = s;
				2106	p = p + 1;
				2107	}
				2108
				2109	/*
				2110	* Pop an item from the stack.
				2111	*/
				2112	static Frag_T
				2113	st_pop(p, stack)
				2114	Frag_T **p;
				2115	Frag_T *stack;
				2116	{
				2117	Frag_T *stackp;
				2118
				2119	p = p - 1;
				2120	stackp = *p;
				2121	if (stackp < stack)
				2122	return empty;
				2123	return **p;
				2124	}
				2125
				2126	/*
				2127	* Convert a postfix form into its equivalent NFA.
				2128	* Return the NFA start state on success, NULL otherwise.
				2129	*/
				2130	static nfa_state_T *
				2131	post2nfa(postfix, end, nfa_calc_size)
				2132	int *postfix;
				2133	int *end;
				2134	int nfa_calc_size;
				2135	{
				2136	int *p;
				2137	int mopen;
				2138	int mclose;
				2139	Frag_T *stack = NULL;
				2140	Frag_T *stackp = NULL;
				2141	Frag_T *stack_end = NULL;
				2142	Frag_T e1;
				2143	Frag_T e2;
				2144	Frag_T e;
				2145	nfa_state_T *s;
				2146	nfa_state_T *s1;
				2147	nfa_state_T *matchstate;
				2148
				2149	if (postfix == NULL)
				2150	return NULL;
				2151
Bram Moolenaar	053bb60	2013-05-20 13:55:21 +0200	[diff] [blame]	2152	#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2153	#define POP() st_pop(&stackp, stack); \
				2154	if (stackp < stack) \
				2155	{ \
				2156	st_error(postfix, end, p); \
				2157	return NULL; \
				2158	}
				2159
				2160	if (nfa_calc_size == FALSE)
				2161	{
				2162	/* Allocate space for the stack. Max states on the stack : nstate */
				2163	stack = (Frag_T ) lalloc((nstate + 1)sizeof(Frag_T), TRUE);
				2164	stackp = stack;
				2165	stack_end = stack + NFA_STACK_SIZE;
				2166	}
				2167
				2168	for (p = postfix; p < end; ++p)
				2169	{
				2170	switch (*p)
				2171	{
				2172	case NFA_CONCAT:
				2173	/* Catenation.
				2174	* Pay attention: this operator does not exist
				2175	* in the r.e. itself (it is implicit, really).
				2176	* It is added when r.e. is translated to postfix
				2177	* form in re2post().
				2178	*
				2179	* No new state added here. */
				2180	if (nfa_calc_size == TRUE)
				2181	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2182	/* nstate += 0; */
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2183	break;
				2184	}
				2185	e2 = POP();
				2186	e1 = POP();
				2187	patch(e1.out, e2.start);
				2188	PUSH(frag(e1.start, e2.out));
				2189	break;
				2190
				2191	case NFA_NOT:
				2192	/* Negation of a character */
				2193	if (nfa_calc_size == TRUE)
				2194	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2195	/* nstate += 0; */
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2196	break;
				2197	}
				2198	e1 = POP();
				2199	e1.start->negated = TRUE;
				2200	if (e1.start->c == NFA_MULTIBYTE \|\| e1.start->c == NFA_COMPOSING)
				2201	e1.start->out1->negated = TRUE;
				2202	PUSH(e1);
				2203	break;
				2204
				2205	case NFA_OR:
				2206	/* Alternation */
				2207	if (nfa_calc_size == TRUE)
				2208	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2209	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2210	break;
				2211	}
				2212	e2 = POP();
				2213	e1 = POP();
				2214	s = new_state(NFA_SPLIT, e1.start, e2.start);
				2215	if (s == NULL)
				2216	return NULL;
				2217	PUSH(frag(s, append(e1.out, e2.out)));
				2218	break;
				2219
				2220	case NFA_STAR:
				2221	/* Zero or more */
				2222	if (nfa_calc_size == TRUE)
				2223	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2224	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2225	break;
				2226	}
				2227	e = POP();
				2228	s = new_state(NFA_SPLIT, e.start, NULL);
				2229	if (s == NULL)
				2230	return NULL;
				2231	patch(e.out, s);
				2232	PUSH(frag(s, list1(&s->out1)));
				2233	break;
				2234
				2235	case NFA_QUEST:
				2236	/* one or zero atoms=> greedy match */
				2237	if (nfa_calc_size == TRUE)
				2238	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2239	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2240	break;
				2241	}
				2242	e = POP();
				2243	s = new_state(NFA_SPLIT, e.start, NULL);
				2244	if (s == NULL)
				2245	return NULL;
				2246	PUSH(frag(s, append(e.out, list1(&s->out1))));
				2247	break;
				2248
				2249	case NFA_QUEST_NONGREEDY:
				2250	/* zero or one atoms => non-greedy match */
				2251	if (nfa_calc_size == TRUE)
				2252	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2253	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2254	break;
				2255	}
				2256	e = POP();
				2257	s = new_state(NFA_SPLIT, NULL, e.start);
				2258	if (s == NULL)
				2259	return NULL;
				2260	PUSH(frag(s, append(e.out, list1(&s->out))));
				2261	break;
				2262
				2263	case NFA_PLUS:
				2264	/* One or more */
				2265	if (nfa_calc_size == TRUE)
				2266	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2267	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2268	break;
				2269	}
				2270	e = POP();
				2271	s = new_state(NFA_SPLIT, e.start, NULL);
				2272	if (s == NULL)
				2273	return NULL;
				2274	patch(e.out, s);
				2275	PUSH(frag(e.start, list1(&s->out1)));
				2276	break;
				2277
				2278	case NFA_SKIP_CHAR:
				2279	/* Symbol of 0-length, Used in a repetition
				2280	* with max/min count of 0 */
				2281	if (nfa_calc_size == TRUE)
				2282	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2283	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2284	break;
				2285	}
				2286	s = new_state(NFA_SKIP_CHAR, NULL, NULL);
				2287	if (s == NULL)
				2288	return NULL;
				2289	PUSH(frag(s, list1(&s->out)));
				2290	break;
				2291
				2292	case NFA_PREV_ATOM_NO_WIDTH:
				2293	/* The \@= operator: match the preceding atom with 0 width.
				2294	* Surrounds the preceding atom with START_INVISIBLE and
				2295	* END_INVISIBLE, similarly to MOPEN.
				2296	*/
				2297	/* TODO: Maybe this drops the speed? */
				2298	return NULL;
				2299
				2300	if (nfa_calc_size == TRUE)
				2301	{
				2302	nstate += 2;
				2303	break;
				2304	}
				2305	e = POP();
				2306	s1 = new_state(NFA_END_INVISIBLE, NULL, NULL);
				2307	if (s1 == NULL)
				2308	return NULL;
				2309	patch(e.out, s1);
				2310
				2311	s = new_state(NFA_START_INVISIBLE, e.start, s1);
				2312	if (s == NULL)
				2313	return NULL;
				2314	PUSH(frag(s, list1(&s1->out)));
				2315	break;
				2316
				2317	case NFA_MOPEN + 0: /* Submatch */
				2318	case NFA_MOPEN + 1:
				2319	case NFA_MOPEN + 2:
				2320	case NFA_MOPEN + 3:
				2321	case NFA_MOPEN + 4:
				2322	case NFA_MOPEN + 5:
				2323	case NFA_MOPEN + 6:
				2324	case NFA_MOPEN + 7:
				2325	case NFA_MOPEN + 8:
				2326	case NFA_MOPEN + 9:
				2327	case NFA_NOPEN: /* \%( "Invisible Submatch" */
				2328	case NFA_MULTIBYTE: /* mbyte char */
				2329	case NFA_COMPOSING: /* composing char */
				2330	if (nfa_calc_size == TRUE)
				2331	{
				2332	nstate += 2;
				2333	break;
				2334	}
				2335
				2336	mopen = *p;
				2337	switch (*p)
				2338	{
				2339	case NFA_NOPEN:
				2340	mclose = NFA_NCLOSE;
				2341	break;
				2342	case NFA_MULTIBYTE:
				2343	mclose = NFA_END_MULTIBYTE;
				2344	break;
				2345	case NFA_COMPOSING:
				2346	mclose = NFA_END_COMPOSING;
				2347	break;
				2348	default:
				2349	/* NFA_MOPEN(0) ... NFA_MOPEN(9) */
				2350	mclose = *p + NSUBEXP;
				2351	break;
				2352	}
				2353
				2354	/* Allow "NFA_MOPEN" as a valid postfix representation for
				2355	* the empty regexp "". In this case, the NFA will be
				2356	* NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
				2357	* empty groups of parenthesis, and empty mbyte chars */
				2358	if (stackp == stack)
				2359	{
				2360	s = new_state(mopen, NULL, NULL);
				2361	if (s == NULL)
				2362	return NULL;
				2363	s1 = new_state(mclose, NULL, NULL);
				2364	if (s1 == NULL)
				2365	return NULL;
				2366	patch(list1(&s->out), s1);
				2367	PUSH(frag(s, list1(&s1->out)));
				2368	break;
				2369	}
				2370
				2371	/* At least one node was emitted before NFA_MOPEN, so
				2372	* at least one node will be between NFA_MOPEN and NFA_MCLOSE */
				2373	e = POP();
				2374	s = new_state(mopen, e.start, NULL); /* `(' */
				2375	if (s == NULL)
				2376	return NULL;
				2377
				2378	s1 = new_state(mclose, NULL, NULL); /* `)' */
				2379	if (s1 == NULL)
				2380	return NULL;
				2381	patch(e.out, s1);
				2382
				2383	if (mopen == NFA_MULTIBYTE \|\| mopen == NFA_COMPOSING)
				2384	/* MULTIBYTE->out1 = END_MULTIBYTE
				2385	* COMPOSING->out1 = END_COMPOSING */
				2386	patch(list1(&s->out1), s1);
				2387
				2388	PUSH(frag(s, list1(&s1->out)));
				2389	break;
				2390
				2391	case NFA_ZSTART:
				2392	case NFA_ZEND:
				2393	default:
				2394	/* Operands */
				2395	if (nfa_calc_size == TRUE)
				2396	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2397	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2398	break;
				2399	}
				2400	s = new_state(*p, NULL, NULL);
				2401	if (s == NULL)
				2402	return NULL;
				2403	PUSH(frag(s, list1(&s->out)));
				2404	break;
				2405
				2406	} /* switch(p) /
				2407
				2408	} /* for(p = postfix; p; ++p) /
				2409
				2410	if (nfa_calc_size == TRUE)
				2411	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2412	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2413	return NULL; /* Return value when counting size is ignored anyway */
				2414	}
				2415
				2416	e = POP();
				2417	if (stackp != stack)
				2418	EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack"));
				2419
				2420	if (istate >= nstate)
				2421	EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA "));
				2422
				2423	vim_free(stack);
				2424
				2425	matchstate = &state_ptr[istate++]; /* the match state */
				2426	matchstate->c = NFA_MATCH;
				2427	matchstate->out = matchstate->out1 = NULL;
				2428
				2429	patch(e.out, matchstate);
				2430	return e.start;
				2431
				2432	#undef POP1
				2433	#undef PUSH1
				2434	#undef POP2
				2435	#undef PUSH2
				2436	#undef POP
				2437	#undef PUSH
				2438	}
				2439
				2440	/****************************************************************
				2441	* NFA execution code.
				2442	****************************************************************/
				2443
				2444	/* thread_T: one entry of a runtime state list; addstate() stores the state and a copy of the submatch info in it */
				2445	struct thread
				2446	{
				2447	nfa_state_T *state; /* the NFA state this entry was created for */
				2448	regsub_T sub; /* submatch info */
				2449	};
				2450
				2451	typedef struct /* runtime state list, filled by addstate() */
				2452	{
				2453	thread_T *t; /* array of entries; nfa_regmatch() allocates room for nstate + 1 of them */
				2454	int n; /* number of entries of "t" in use */
				2455	} List;
				2456
				2457	static void addstate __ARGS((List l, nfa_state_T state, regsub_T m, int off, int lid, int match));
				2458
				2459	static void
				2460	addstate(l, state, m, off, lid, match)
				2461	List l; / runtime state list */
				2462	nfa_state_T state; / state to update */
				2463	regsub_T m; / pointers to subexpressions */
				2464	int off;
				2465	int lid;
				2466	int match; / found match? */
				2467	{
				2468	regsub_T save;
				2469	int subidx = 0;
				2470
				2471	if (l == NULL \|\| state == NULL)
				2472	return;
				2473
				2474	switch (state->c)
				2475	{
				2476	case NFA_SPLIT:
				2477	case NFA_NOT:
				2478	case NFA_NOPEN:
				2479	case NFA_NCLOSE:
				2480	case NFA_MCLOSE:
				2481	case NFA_MCLOSE + 1:
				2482	case NFA_MCLOSE + 2:
				2483	case NFA_MCLOSE + 3:
				2484	case NFA_MCLOSE + 4:
				2485	case NFA_MCLOSE + 5:
				2486	case NFA_MCLOSE + 6:
				2487	case NFA_MCLOSE + 7:
				2488	case NFA_MCLOSE + 8:
				2489	case NFA_MCLOSE + 9:
				2490	/* Do not remember these nodes in list "thislist" or "nextlist" */
				2491	break;
				2492
				2493	default:
				2494	if (state->lastlist == lid)
				2495	{
				2496	if (++state->visits > 2)
				2497	return;
				2498	}
				2499	else
				2500	{
				2501	/* add the state to the list */
				2502	state->lastlist = lid;
				2503	state->lastthread = &l->t[l->n++];
				2504	state->lastthread->state = state;
				2505	state->lastthread->sub = *m;
				2506	}
				2507	}
				2508
				2509	#ifdef ENABLE_LOG
				2510	nfa_set_code(state->c);
				2511	fprintf(log_fd, "> Adding state %d to list. Character %s, code %d\n",
				2512	abs(state->id), code, state->c);
				2513	#endif
				2514	switch (state->c)
				2515	{
				2516	case NFA_MATCH:
				2517	*match = TRUE;
				2518	break;
				2519
				2520	case NFA_SPLIT:
				2521	addstate(l, state->out, m, off, lid, match);
				2522	addstate(l, state->out1, m, off, lid, match);
				2523	break;
				2524
				2525	case NFA_SKIP_CHAR:
				2526	addstate(l, state->out, m, off, lid, match);
				2527	break;
				2528
				2529	#if 0
				2530	case NFA_END_NEG_RANGE:
				2531	/* Nothing to handle here. nfa_regmatch() will take care of it */
				2532	break;
				2533
				2534	case NFA_NOT:
				2535	EMSG(_("E999: (NFA regexp internal error) Should not process NOT node !"));
				2536	#ifdef ENABLE_LOG
				2537	fprintf(f, "\n\n>>> E999: Added state NFA_NOT to a list ... Something went wrong ! Why wasn't it processed already? \n\n");
				2538	#endif
				2539	break;
				2540
				2541	case NFA_COMPOSING:
				2542	/* nfa_regmatch() will match all the bytes of this composing char. */
				2543	break;
				2544
				2545	case NFA_MULTIBYTE:
				2546	/* nfa_regmatch() will match all the bytes of this multibyte char. */
				2547	break;
				2548	#endif
				2549
				2550	case NFA_END_MULTIBYTE:
				2551	/* Successfully matched this mbyte char */
				2552	addstate(l, state->out, m, off, lid, match);
				2553	break;
				2554
				2555	case NFA_NOPEN:
				2556	case NFA_NCLOSE:
				2557	addstate(l, state->out, m, off, lid, match);
				2558	break;
				2559
				2560	/* If this state is reached, then a recursive call of nfa_regmatch()
				2561	* succeeded. the next call saves the found submatches in the
				2562	* first state after the "invisible" branch. */
				2563	#if 0
				2564	case NFA_END_INVISIBLE:
				2565	break;
				2566	#endif
				2567
				2568	case NFA_MOPEN + 0:
				2569	case NFA_MOPEN + 1:
				2570	case NFA_MOPEN + 2:
				2571	case NFA_MOPEN + 3:
				2572	case NFA_MOPEN + 4:
				2573	case NFA_MOPEN + 5:
				2574	case NFA_MOPEN + 6:
				2575	case NFA_MOPEN + 7:
				2576	case NFA_MOPEN + 8:
				2577	case NFA_MOPEN + 9:
				2578	case NFA_ZSTART:
				2579	subidx = state->c - NFA_MOPEN;
				2580	if (state->c == NFA_ZSTART)
				2581	subidx = 0;
				2582
				2583	if (REG_MULTI)
				2584	{
				2585	save.startpos[subidx] = m->startpos[subidx];
				2586	save.endpos[subidx] = m->endpos[subidx];
				2587	m->startpos[subidx].lnum = reglnum;
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2588	m->startpos[subidx].col = (colnr_T)(reginput - regline + off);
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2589	}
				2590	else
				2591	{
				2592	save.start[subidx] = m->start[subidx];
				2593	save.end[subidx] = m->end[subidx];
				2594	m->start[subidx] = reginput + off;
				2595	}
				2596
				2597	addstate(l, state->out, m, off, lid, match);
				2598
				2599	if (REG_MULTI)
				2600	{
				2601	m->startpos[subidx] = save.startpos[subidx];
				2602	m->endpos[subidx] = save.endpos[subidx];
				2603	}
				2604	else
				2605	{
				2606	m->start[subidx] = save.start[subidx];
				2607	m->end[subidx] = save.end[subidx];
				2608	}
				2609	break;
				2610
				2611	case NFA_MCLOSE + 0:
				2612	if (nfa_has_zend == TRUE)
				2613	{
				2614	addstate(l, state->out, m, off, lid, match);
				2615	break;
				2616	}
				2617	case NFA_MCLOSE + 1:
				2618	case NFA_MCLOSE + 2:
				2619	case NFA_MCLOSE + 3:
				2620	case NFA_MCLOSE + 4:
				2621	case NFA_MCLOSE + 5:
				2622	case NFA_MCLOSE + 6:
				2623	case NFA_MCLOSE + 7:
				2624	case NFA_MCLOSE + 8:
				2625	case NFA_MCLOSE + 9:
				2626	case NFA_ZEND:
				2627	subidx = state->c - NFA_MCLOSE;
				2628	if (state->c == NFA_ZEND)
				2629	subidx = 0;
				2630
				2631	if (REG_MULTI)
				2632	{
				2633	save.startpos[subidx] = m->startpos[subidx];
				2634	save.endpos[subidx] = m->endpos[subidx];
				2635	m->endpos[subidx].lnum = reglnum;
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	2636	m->endpos[subidx].col = (colnr_T)(reginput - regline + off);
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2637	}
				2638	else
				2639	{
				2640	save.start[subidx] = m->start[subidx];
				2641	save.end[subidx] = m->end[subidx];
				2642	m->end[subidx] = reginput + off;
				2643	}
				2644
				2645	addstate(l, state->out, m, off, lid, match);
				2646
				2647	if (REG_MULTI)
				2648	{
				2649	m->startpos[subidx] = save.startpos[subidx];
				2650	m->endpos[subidx] = save.endpos[subidx];
				2651	}
				2652	else
				2653	{
				2654	m->start[subidx] = save.start[subidx];
				2655	m->end[subidx] = save.end[subidx];
				2656	}
				2657	break;
				2658	}
				2659	}
				2660
				2661	/*
				2662	* Check character class "class" against current character c.
				2663	*/
				2664	static int
				2665	check_char_class(class, c)
				2666	int class;
				2667	int c;
				2668	{
				2669	switch (class)
				2670	{
				2671	case NFA_CLASS_ALNUM:
				2672	if (isalnum(c))
				2673	return OK;
				2674	break;
				2675	case NFA_CLASS_ALPHA:
				2676	if (isalpha(c))
				2677	return OK;
				2678	break;
				2679	case NFA_CLASS_BLANK:
				2680	if (c == ' ' \|\| c == '\t')
				2681	return OK;
				2682	break;
				2683	case NFA_CLASS_CNTRL:
				2684	if (iscntrl(c))
				2685	return OK;
				2686	break;
				2687	case NFA_CLASS_DIGIT:
				2688	if (VIM_ISDIGIT(c))
				2689	return OK;
				2690	break;
				2691	case NFA_CLASS_GRAPH:
				2692	if (isgraph(c))
				2693	return OK;
				2694	break;
				2695	case NFA_CLASS_LOWER:
				2696	if (MB_ISLOWER(c))
				2697	return OK;
				2698	break;
				2699	case NFA_CLASS_PRINT:
				2700	if (vim_isprintc(c))
				2701	return OK;
				2702	break;
				2703	case NFA_CLASS_PUNCT:
				2704	if (ispunct(c))
				2705	return OK;
				2706	break;
				2707	case NFA_CLASS_SPACE:
				2708	if ((c >=9 && c <= 13) \|\| (c == ' '))
				2709	return OK;
				2710	break;
				2711	case NFA_CLASS_UPPER:
				2712	if (MB_ISUPPER(c))
				2713	return OK;
				2714	break;
				2715	case NFA_CLASS_XDIGIT:
				2716	if (vim_isxdigit(c))
				2717	return OK;
				2718	break;
				2719	case NFA_CLASS_TAB:
				2720	if (c == '\t')
				2721	return OK;
				2722	break;
				2723	case NFA_CLASS_RETURN:
				2724	if (c == '\r')
				2725	return OK;
				2726	break;
				2727	case NFA_CLASS_BACKSPACE:
				2728	if (c == '\b')
				2729	return OK;
				2730	break;
				2731	case NFA_CLASS_ESCAPE:
				2732	if (c == '\033')
				2733	return OK;
				2734	break;
				2735
				2736	default:
				2737	/* should not be here :P */
				2738	EMSG_RET_FAIL(_("E877: (NFA regexp) Invalid character class "));
				2739	}
				2740	return FAIL;
				2741	}
				2742
				2743	/*
				2744	* Set all NFA nodes' list ID equal to -1.
				2745	*/
				2746	static void
				2747	nfa_set_neg_listids(start)
				2748	nfa_state_T *start;
				2749	{
				2750	if (start == NULL)
				2751	return;
				2752	if (start->lastlist >= 0)
				2753	{
				2754	start->lastlist = -1;
				2755	nfa_set_neg_listids(start->out);
				2756	nfa_set_neg_listids(start->out1);
				2757	}
				2758	}
				2759
				2760	/*
				2761	* Set all NFA nodes' list ID equal to 0.
				2762	*/
				2763	static void
				2764	nfa_set_null_listids(start)
				2765	nfa_state_T *start;
				2766	{
				2767	if (start == NULL)
				2768	return;
				2769	if (start->lastlist == -1)
				2770	{
				2771	start->lastlist = 0;
				2772	nfa_set_null_listids(start->out);
				2773	nfa_set_null_listids(start->out1);
				2774	}
				2775	}
				2776
				2777	/*
				2778	* Save list IDs for all NFA states in "list".
				2779	*/
				2780	static void
				2781	nfa_save_listids(start, list)
				2782	nfa_state_T *start;
				2783	int *list;
				2784	{
				2785	if (start == NULL)
				2786	return;
				2787	if (start->lastlist != -1)
				2788	{
				2789	list[abs(start->id)] = start->lastlist;
				2790	start->lastlist = -1;
				2791	nfa_save_listids(start->out, list);
				2792	nfa_save_listids(start->out1, list);
				2793	}
				2794	}
				2795
				2796	/*
				2797	* Restore list IDs from "list" to all NFA states.
				2798	*/
				2799	static void
				2800	nfa_restore_listids(start, list)
				2801	nfa_state_T *start;
				2802	int *list;
				2803	{
				2804	if (start == NULL)
				2805	return;
				2806	if (start->lastlist == -1)
				2807	{
				2808	start->lastlist = list[abs(start->id)];
				2809	nfa_restore_listids(start->out, list);
				2810	nfa_restore_listids(start->out1, list);
				2811	}
				2812	}
				2813
				2814	/*
				2815	* Main matching routine.
				2816	*
				2817	* Run NFA to determine whether it matches reginput.
				2818	*
				2819	* Return TRUE if there is a match, FALSE otherwise.
				2820	* Note: Caller must ensure that: start != NULL.
				2821	*/
				2822	static int
				2823	nfa_regmatch(start, submatch, m)
				2824	nfa_state_T *start;
				2825	regsub_T *submatch;
				2826	regsub_T *m;
				2827	{
				2828	int c = -1;
				2829	int n;
				2830	int i = 0;
				2831	int result;
				2832	int size = 0;
				2833	int match = FALSE;
				2834	int flag = 0;
				2835	int old_reglnum = -1;
				2836	int reginput_updated = FALSE;
				2837	thread_T *t;
				2838	char_u *cc;
				2839	char_u *old_reginput = NULL;
				2840	char_u *old_regline = NULL;
				2841	nfa_state_T *sta;
				2842	nfa_state_T *end;
				2843	List list[3];
				2844	List *listtbl[2][2];
				2845	List *ll;
				2846	int listid = 1;
				2847	int endnode = 0;
				2848	List *thislist;
				2849	List *nextlist;
				2850	List *neglist;
				2851	int *listids = NULL;
				2852	int j = 0;
				2853	int len = 0;
Bram Moolenaar	7fcff1f	2013-05-20 21:49:13 +0200	[diff] [blame^]	2854	#ifdef NFA_REGEXP_DEBUG_LOG
				2855	FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2856
				2857	if (debug == NULL)
				2858	{
Bram Moolenaar	7fcff1f	2013-05-20 21:49:13 +0200	[diff] [blame^]	2859	EMSG2(_("(NFA) COULD NOT OPEN %s !"), NFA_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2860	return FALSE;
				2861	}
				2862	#endif
				2863
				2864	/* Allocate memory for the lists of nodes */
				2865	size = (nstate + 1) * sizeof(thread_T);
				2866	list[0].t = (thread_T *)lalloc(size, TRUE);
				2867	list[1].t = (thread_T *)lalloc(size, TRUE);
				2868	list[2].t = (thread_T *)lalloc(size, TRUE);
				2869	if (list[0].t == NULL \|\| list[1].t == NULL \|\| list[2].t == NULL)
				2870	goto theend;
				2871	vim_memset(list[0].t, 0, size);
				2872	vim_memset(list[1].t, 0, size);
				2873	vim_memset(list[2].t, 0, size);
				2874
				2875	#ifdef ENABLE_LOG
				2876	log_fd = fopen(LOG_NAME, "a");
				2877	if (log_fd != NULL)
				2878	{
				2879	fprintf(log_fd, "**********************************\n");
				2880	nfa_set_code(start->c);
				2881	fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
				2882	abs(start->id), code);
				2883	fprintf(log_fd, "**********************************\n");
				2884	}
				2885	else
				2886	{
				2887	EMSG(_("Could not open temporary log file for writing, displaying on stderr ... "));
				2888	log_fd = stderr;
				2889	}
				2890	#endif
				2891
				2892	thislist = &list[0];
				2893	thislist->n = 0;
				2894	nextlist = &list[1];
				2895	nextlist->n = 0;
				2896	neglist = &list[2];
				2897	neglist->n = 0;
				2898	#ifdef ENABLE_LOG
				2899	fprintf(log_fd, "(---) STARTSTATE\n");
				2900	#endif
				2901	addstate(thislist, start, m, 0, listid, &match);
				2902
				2903	/* There are two cases when the NFA advances: 1. input char matches the
				2904	* NFA node and 2. input char does not match the NFA node, but the next
				2905	* node is NFA_NOT. The following macro calls addstate() according to
				2906	* these rules. It is used A LOT, so use the "listtbl" table for speed */
				2907	listtbl[0][0] = NULL;
				2908	listtbl[0][1] = neglist;
				2909	listtbl[1][0] = nextlist;
				2910	listtbl[1][1] = NULL;
				2911	#define ADD_POS_NEG_STATE(node) \
				2912	ll = listtbl[result ? 1 : 0][node->negated]; \
				2913	if (ll != NULL) \
				2914	addstate(ll, node->out , &t->sub, n, listid + 1, &match);
				2915
				2916
				2917	/*
				2918	* Run for each character.
				2919	*/
				2920	do {
				2921	again:
				2922	#ifdef FEAT_MBYTE
				2923	if (has_mbyte)
				2924	{
				2925	c = (*mb_ptr2char)(reginput);
				2926	n = (*mb_ptr2len)(reginput);
				2927	}
				2928	else
				2929	#endif
				2930	{
				2931	c = *reginput;
				2932	n = 1;
				2933	}
				2934	if (c == NUL)
				2935	n = 0;
				2936	cc = (char_u *)&c;
				2937
				2938	/* swap lists */
				2939	thislist = &list[flag];
				2940	nextlist = &list[flag ^= 1];
				2941	nextlist->n = 0; /* `clear' nextlist */
				2942	listtbl[1][0] = nextlist;
				2943	++listid;
				2944
				2945	#ifdef ENABLE_LOG
				2946	fprintf(log_fd, "------------------------------------------\n");
				2947	fprintf(log_fd, ">>> Reginput is \"%s\"\n", reginput);
				2948	fprintf(log_fd, ">>> Advanced one character ... Current char is %c (code %d) \n", c, (int)c);
				2949	fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
				2950	for (i = 0; i< thislist->n; i++)
				2951	fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
				2952	fprintf(log_fd, "\n");
				2953	#endif
				2954
Bram Moolenaar	7fcff1f	2013-05-20 21:49:13 +0200	[diff] [blame^]	2955	#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2956	fprintf(debug, "\n-------------------\n");
				2957	#endif
				2958
				2959	/* compute nextlist */
				2960	for (i = 0; i < thislist->n \|\| neglist->n > 0; ++i)
				2961	{
				2962	if (neglist->n > 0)
				2963	{
				2964	t = &neglist->t[0];
				2965	neglist->n --;
				2966	i--;
				2967	}
				2968	else
				2969	t = &thislist->t[i];
				2970
Bram Moolenaar	7fcff1f	2013-05-20 21:49:13 +0200	[diff] [blame^]	2971	#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2972	nfa_set_code(t->state->c);
				2973	fprintf(debug, "%s, ", code);
				2974	#endif
				2975	#ifdef ENABLE_LOG
				2976	nfa_set_code(t->state->c);
				2977	fprintf(log_fd, "(%d) %s, code %d ... \n", abs(t->state->id),
				2978	code, (int)t->state->c);
				2979	#endif
				2980
				2981	/*
				2982	* Handle the possible codes of the current state.
				2983	* The most important is NFA_MATCH.
				2984	*/
				2985	switch (t->state->c)
				2986	{
				2987	case NFA_MATCH:
				2988	match = TRUE;
				2989	*submatch = t->sub;
				2990	#ifdef ENABLE_LOG
				2991	for (j = 0; j < 4; j++)
				2992	if (REG_MULTI)
				2993	fprintf(log_fd, "\n *** group %d, start: c=%d, l=%d, end: c=%d, l=%d",
				2994	j,
				2995	t->sub.startpos[j].col,
				2996	(int)t->sub.startpos[j].lnum,
				2997	t->sub.endpos[j].col,
				2998	(int)t->sub.endpos[j].lnum);
				2999	else
				3000	fprintf(log_fd, "\n *** group %d, start: \"%s\", end: \"%s\"",
				3001	j,
				3002	(char *)t->sub.start[j],
				3003	(char *)t->sub.end[j]);
				3004	fprintf(log_fd, "\n");
				3005	#endif
				3006	goto nextchar; /* found the left-most longest match */
				3007
				3008	case NFA_END_INVISIBLE:
				3009	/* This is only encountered after a NFA_START_INVISIBLE node.
				3010	* They surround a zero-width group, used with "\@=" and "\&".
				3011	* If we got here, it means that the current "invisible" group
				3012	* finished successfully, so return control to the parent
				3013	* nfa_regmatch(). Submatches are stored in *m, and used in
				3014	* the parent call. */
				3015	if (start->c == NFA_MOPEN + 0)
				3016	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3017	&match);
				3018	else
				3019	{
				3020	*m = t->sub;
				3021	match = TRUE;
				3022	}
				3023	break;
				3024
				3025	case NFA_START_INVISIBLE:
				3026	/* Save global variables, and call nfa_regmatch() to check if
				3027	* the current concat matches at this position. The concat
				3028	* ends with the node NFA_END_INVISIBLE */
				3029	old_reginput = reginput;
				3030	old_regline = regline;
				3031	old_reglnum = reglnum;
				3032	if (listids == NULL)
				3033	{
				3034	listids = (int ) lalloc(sizeof(int) nstate, TRUE);
				3035	if (listids == NULL)
				3036	{
				3037	EMSG(_("E878: (NFA) Could not allocate memory for branch traversal!"));
				3038	return 0;
				3039	}
				3040	}
				3041	#ifdef ENABLE_LOG
				3042	if (log_fd != stderr)
				3043	fclose(log_fd);
				3044	log_fd = NULL;
				3045	#endif
				3046	/* Have to clear the listid field of the NFA nodes, so that
				3047	* nfa_regmatch() and addstate() can run properly after
				3048	* recursion. */
				3049	nfa_save_listids(start, listids);
				3050	nfa_set_null_listids(start);
				3051	result = nfa_regmatch(t->state->out, submatch, m);
				3052	nfa_set_neg_listids(start);
				3053	nfa_restore_listids(start, listids);
				3054
				3055	#ifdef ENABLE_LOG
				3056	log_fd = fopen(LOG_NAME, "a");
				3057	if (log_fd != NULL)
				3058	{
				3059	fprintf(log_fd, "****************************\n");
				3060	fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
				3061	fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
				3062	fprintf(log_fd, "****************************\n");
				3063	}
				3064	else
				3065	{
				3066	EMSG(_("Could not open temporary log file for writing, displaying on stderr ... "));
				3067	log_fd = stderr;
				3068	}
				3069	#endif
				3070	if (result == TRUE)
				3071	{
				3072	/* Restore position in input text */
				3073	reginput = old_reginput;
				3074	regline = old_regline;
				3075	reglnum = old_reglnum;
				3076	/* Copy submatch info from the recursive call */
				3077	if (REG_MULTI)
				3078	for (j = 1; j < NSUBEXP; j++)
				3079	{
				3080	t->sub.startpos[j] = m->startpos[j];
				3081	t->sub.endpos[j] = m->endpos[j];
				3082	}
				3083	else
				3084	for (j = 1; j < NSUBEXP; j++)
				3085	{
				3086	t->sub.start[j] = m->start[j];
				3087	t->sub.end[j] = m->end[j];
				3088	}
				3089	/* t->state->out1 is the corresponding END_INVISIBLE node */
				3090	addstate(thislist, t->state->out1->out, &t->sub, 0, listid,
				3091	&match);
				3092	}
				3093	else
				3094	{
				3095	/* continue with next input char */
				3096	reginput = old_reginput;
				3097	}
				3098	break;
				3099
				3100	case NFA_BOL:
				3101	if (reginput == regline)
				3102	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3103	&match);
				3104	break;
				3105
				3106	case NFA_EOL:
				3107	if (c == NUL)
				3108	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3109	&match);
				3110	break;
				3111
				3112	case NFA_BOW:
				3113	{
				3114	int bow = TRUE;
				3115
				3116	if (c == NUL)
				3117	bow = FALSE;
				3118	#ifdef FEAT_MBYTE
				3119	else if (has_mbyte)
				3120	{
				3121	int this_class;
				3122
				3123	/* Get class of current and previous char (if it exists). */
				3124	this_class = mb_get_class(reginput);
				3125	if (this_class <= 1)
				3126	bow = FALSE;
				3127	else if (reg_prev_class() == this_class)
				3128	bow = FALSE;
				3129	}
				3130	#endif
				3131	else if (!vim_iswordc(c)
				3132	\|\| (reginput > regline && vim_iswordc(reginput[-1])))
				3133	bow = FALSE;
				3134	if (bow)
				3135	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3136	&match);
				3137	break;
				3138	}
				3139
				3140	case NFA_EOW:
				3141	{
				3142	int eow = TRUE;
				3143
				3144	if (reginput == regline)
				3145	eow = FALSE;
				3146	#ifdef FEAT_MBYTE
				3147	else if (has_mbyte)
				3148	{
				3149	int this_class, prev_class;
				3150
				3151	/* Get class of current and previous char (if it exists). */
				3152	this_class = mb_get_class(reginput);
				3153	prev_class = reg_prev_class();
				3154	if (this_class == prev_class
				3155	\|\| prev_class == 0 \|\| prev_class == 1)
				3156	eow = FALSE;
				3157	}
				3158	#endif
				3159	else if (!vim_iswordc(reginput[-1])
				3160	\|\| (reginput[0] != NUL && vim_iswordc(c)))
				3161	eow = FALSE;
				3162	if (eow)
				3163	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3164	&match);
				3165	break;
				3166	}
				3167
				3168	case NFA_MULTIBYTE:
				3169	case NFA_COMPOSING:
				3170	switch (t->state->c)
				3171	{
				3172	case NFA_MULTIBYTE: endnode = NFA_END_MULTIBYTE; break;
				3173	case NFA_COMPOSING: endnode = NFA_END_COMPOSING; break;
				3174	default: endnode = 0;
				3175	}
				3176
				3177	result = OK;
				3178	sta = t->state->out;
				3179	len = 1;
				3180	while (sta->c != endnode && len <= n)
				3181	{
				3182	if (reginput[len-1] != sta->c)
				3183	{
				3184	result = OK - 1;
				3185	break;
				3186	}
				3187	len++;
				3188	sta = sta->out;
				3189	}
				3190
				3191	/* if input char length doesn't match regexp char length */
				3192	if (len -1 < n \|\| sta->c != endnode)
				3193	result = OK - 1;
				3194	end = t->state->out1; /* NFA_END_MULTIBYTE or
				3195	NFA_END_COMPOSING */
				3196	/* If \Z was present, then ignore composing characters */
				3197	if (regflags & RF_ICOMBINE)
				3198	result = 1 ^ sta->negated;
				3199	ADD_POS_NEG_STATE(end);
				3200	break;
				3201
				3202	case NFA_NEWL:
				3203	if (!reg_line_lbr && REG_MULTI
				3204	&& c == NUL && reglnum <= reg_maxline)
				3205	{
				3206	if (reginput_updated == FALSE)
				3207	{
				3208	reg_nextline();
				3209	reginput_updated = TRUE;
				3210	}
				3211	addstate(nextlist, t->state->out, &t->sub, n, listid + 1,
				3212	&match);
				3213	}
				3214	break;
				3215
				3216	case NFA_CLASS_ALNUM:
				3217	case NFA_CLASS_ALPHA:
				3218	case NFA_CLASS_BLANK:
				3219	case NFA_CLASS_CNTRL:
				3220	case NFA_CLASS_DIGIT:
				3221	case NFA_CLASS_GRAPH:
				3222	case NFA_CLASS_LOWER:
				3223	case NFA_CLASS_PRINT:
				3224	case NFA_CLASS_PUNCT:
				3225	case NFA_CLASS_SPACE:
				3226	case NFA_CLASS_UPPER:
				3227	case NFA_CLASS_XDIGIT:
				3228	case NFA_CLASS_TAB:
				3229	case NFA_CLASS_RETURN:
				3230	case NFA_CLASS_BACKSPACE:
				3231	case NFA_CLASS_ESCAPE:
				3232	result = check_char_class(t->state->c, c);
				3233	ADD_POS_NEG_STATE(t->state);
				3234	break;
				3235
				3236	case NFA_END_NEG_RANGE:
				3237	/* This follows a series of negated nodes, like:
				3238	* CHAR(x), NFA_NOT, CHAR(y), NFA_NOT etc. */
				3239	if (c > 0)
				3240	addstate(nextlist, t->state->out, &t->sub, n, listid + 1,
				3241	&match);
				3242	break;
				3243
				3244	case NFA_ANY:
				3245	/* Any printable char, not just any char. '\0' (end of input)
				3246	* must not match */
				3247	if (c > 0)
				3248	addstate(nextlist, t->state->out, &t->sub, n, listid + 1,
				3249	&match);
				3250	break;
				3251
				3252	/*
				3253	* Character classes like \a for alpha, \d for digit etc.
				3254	*/
				3255	case NFA_IDENT: /* \i */
				3256	result = vim_isIDc(c);
				3257	ADD_POS_NEG_STATE(t->state);
				3258	break;
				3259
				3260	case NFA_SIDENT: /* \I */
				3261	result = !VIM_ISDIGIT(c) && vim_isIDc(c);
				3262	ADD_POS_NEG_STATE(t->state);
				3263	break;
				3264
				3265	case NFA_KWORD: /* \k */
				3266	result = vim_iswordp(cc);
				3267	ADD_POS_NEG_STATE(t->state);
				3268	break;
				3269
				3270	case NFA_SKWORD: /* \K */
				3271	result = !VIM_ISDIGIT(c) && vim_iswordp(cc);
				3272	ADD_POS_NEG_STATE(t->state);
				3273	break;
				3274
				3275	case NFA_FNAME: /* \f */
				3276	result = vim_isfilec(c);
				3277	ADD_POS_NEG_STATE(t->state);
				3278	break;
				3279
				3280	case NFA_SFNAME: /* \F */
				3281	result = !VIM_ISDIGIT(c) && vim_isfilec(c);
				3282	ADD_POS_NEG_STATE(t->state);
				3283	break;
				3284
				3285	case NFA_PRINT: /* \p */
				3286	result = ptr2cells(cc) == 1;
				3287	ADD_POS_NEG_STATE(t->state);
				3288	break;
				3289
				3290	case NFA_SPRINT: /* \P */
				3291	result = !VIM_ISDIGIT(c) && ptr2cells(cc) == 1;
				3292	ADD_POS_NEG_STATE(t->state);
				3293	break;
				3294
				3295	case NFA_WHITE: /* \s */
				3296	result = vim_iswhite(c);
				3297	ADD_POS_NEG_STATE(t->state);
				3298	break;
				3299
				3300	case NFA_NWHITE: /* \S */
				3301	result = c != NUL && !vim_iswhite(c);
				3302	ADD_POS_NEG_STATE(t->state);
				3303	break;
				3304
				3305	case NFA_DIGIT: /* \d */
				3306	result = ri_digit(c);
				3307	ADD_POS_NEG_STATE(t->state);
				3308	break;
				3309
				3310	case NFA_NDIGIT: /* \D */
				3311	result = c != NUL && !ri_digit(c);
				3312	ADD_POS_NEG_STATE(t->state);
				3313	break;
				3314
				3315	case NFA_HEX: /* \x */
				3316	result = ri_hex(c);
				3317	ADD_POS_NEG_STATE(t->state);
				3318	break;
				3319
				3320	case NFA_NHEX: /* \X */
				3321	result = c != NUL && !ri_hex(c);
				3322	ADD_POS_NEG_STATE(t->state);
				3323	break;
				3324
				3325	case NFA_OCTAL: /* \o */
				3326	result = ri_octal(c);
				3327	ADD_POS_NEG_STATE(t->state);
				3328	break;
				3329
				3330	case NFA_NOCTAL: /* \O */
				3331	result = c != NUL && !ri_octal(c);
				3332	ADD_POS_NEG_STATE(t->state);
				3333	break;
				3334
				3335	case NFA_WORD: /* \w */
				3336	result = ri_word(c);
				3337	ADD_POS_NEG_STATE(t->state);
				3338	break;
				3339
				3340	case NFA_NWORD: /* \W */
				3341	result = c != NUL && !ri_word(c);
				3342	ADD_POS_NEG_STATE(t->state);
				3343	break;
				3344
				3345	case NFA_HEAD: /* \h */
				3346	result = ri_head(c);
				3347	ADD_POS_NEG_STATE(t->state);
				3348	break;
				3349
				3350	case NFA_NHEAD: /* \H */
				3351	result = c != NUL && !ri_head(c);
				3352	ADD_POS_NEG_STATE(t->state);
				3353	break;
				3354
				3355	case NFA_ALPHA: /* \a */
				3356	result = ri_alpha(c);
				3357	ADD_POS_NEG_STATE(t->state);
				3358	break;
				3359
				3360	case NFA_NALPHA: /* \A */
				3361	result = c != NUL && !ri_alpha(c);
				3362	ADD_POS_NEG_STATE(t->state);
				3363	break;
				3364
				3365	case NFA_LOWER: /* \l */
				3366	result = ri_lower(c);
				3367	ADD_POS_NEG_STATE(t->state);
				3368	break;
				3369
				3370	case NFA_NLOWER: /* \L */
				3371	result = c != NUL && !ri_lower(c);
				3372	ADD_POS_NEG_STATE(t->state);
				3373	break;
				3374
				3375	case NFA_UPPER: /* \u */
				3376	result = ri_upper(c);
				3377	ADD_POS_NEG_STATE(t->state);
				3378	break;
				3379
				3380	case NFA_NUPPER: /* \U */
				3381	result = c != NUL && !ri_upper(c);
				3382	ADD_POS_NEG_STATE(t->state);
				3383	break;
				3384
				3385	default: /* regular character */
				3386	result = (no_Magic(t->state->c) == c);
				3387	if (!result)
				3388	result = ireg_ic == TRUE
				3389	&& MB_TOLOWER(t->state->c) == MB_TOLOWER(c);
				3390	ADD_POS_NEG_STATE(t->state);
				3391	break;
				3392	}
				3393
				3394	} /* for (thislist = thislist; thislist->state; thislist++) */
				3395
				3396	/* The first found match is the leftmost one, but there may be a
				3397	* longer one. Keep running the NFA, but don't start from the
				3398	* beginning. Also, do not add the start state in recursive calls of
				3399	* nfa_regmatch(), because recursive calls should only start in the
				3400	* first position. */
				3401	if (match == FALSE && start->c == NFA_MOPEN + 0)
				3402	{
				3403	#ifdef ENABLE_LOG
				3404	fprintf(log_fd, "(---) STARTSTATE\n");
				3405	#endif
				3406	addstate(nextlist, start, m, n, listid + 1, &match);
				3407	}
				3408
				3409	if (reginput_updated)
				3410	{
				3411	reginput_updated = FALSE;
				3412	goto again;
				3413	}
				3414
				3415	#ifdef ENABLE_LOG
				3416	fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
				3417	for (i = 0; i< thislist->n; i++)
				3418	fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
				3419	fprintf(log_fd, "\n");
				3420	#endif
				3421
				3422	nextchar:
				3423	reginput += n;
				3424	} while (c \|\| reginput_updated);
				3425
				3426	#ifdef ENABLE_LOG
				3427	if (log_fd != stderr)
				3428	fclose(log_fd);
				3429	log_fd = NULL;
				3430	#endif
				3431
				3432	theend:
				3433	/* Free memory */
				3434	vim_free(list[0].t);
				3435	vim_free(list[1].t);
				3436	vim_free(list[2].t);
				3437	list[0].t = list[1].t = list[2].t = NULL;
				3438	if (listids != NULL)
				3439	vim_free(listids);
				3440	#undef ADD_POS_NEG_STATE
Bram Moolenaar	7fcff1f	2013-05-20 21:49:13 +0200	[diff] [blame^]	3441	#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	3442	fclose(debug);
				3443	#endif
				3444
				3445	return match;
				3446	}
				3447
				3448	/*
				3449	* Try match of "prog" with at regline["col"].
				3450	* Returns 0 for failure, number of lines contained in the match otherwise.
				3451	*/
				3452	static long
				3453	nfa_regtry(start, col)
				3454	nfa_state_T *start;
				3455	colnr_T col;
				3456	{
				3457	int i;
				3458	regsub_T sub, m;
				3459	#ifdef ENABLE_LOG
				3460	FILE *f;
				3461	#endif
				3462
				3463	reginput = regline + col;
				3464	need_clear_subexpr = TRUE;
				3465
				3466	#ifdef ENABLE_LOG
				3467	f = fopen(LOG_NAME, "a");
				3468	if (f != NULL)
				3469	{
				3470	fprintf(f, "\n\n\n\n\n\n\t\t=======================================================\n");
				3471	fprintf(f, " =======================================================\n");
				3472	#ifdef DEBUG
				3473	fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
				3474	#endif
				3475	fprintf(f, "\tInput text is \"%s\" \n", reginput);
				3476	fprintf(f, " =======================================================\n\n\n\n\n\n\n");
				3477	nfa_print_state(f, start, 0);
				3478	fprintf(f, "\n\n");
				3479	fclose(f);
				3480	}
				3481	else
				3482	EMSG(_("Could not open temporary log file for writing "));
				3483	#endif
				3484
				3485	if (REG_MULTI)
				3486	{
				3487	/* Use 0xff to set lnum to -1 */
				3488	vim_memset(sub.startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
				3489	vim_memset(sub.endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
				3490	vim_memset(m.startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
				3491	vim_memset(m.endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
				3492	}
				3493	else
				3494	{
				3495	vim_memset(sub.start, 0, sizeof(char_u ) NSUBEXP);
				3496	vim_memset(sub.end, 0, sizeof(char_u ) NSUBEXP);
				3497	vim_memset(m.start, 0, sizeof(char_u ) NSUBEXP);
				3498	vim_memset(m.end, 0, sizeof(char_u ) NSUBEXP);
				3499	}
				3500
				3501	if (nfa_regmatch(start, &sub, &m) == FALSE)
				3502	return 0;
				3503
				3504	cleanup_subexpr();
				3505	if (REG_MULTI)
				3506	{
				3507	for (i = 0; i < NSUBEXP; i++)
				3508	{
				3509	reg_startpos[i] = sub.startpos[i];
				3510	reg_endpos[i] = sub.endpos[i];
				3511	}
				3512
				3513	if (reg_startpos[0].lnum < 0)
				3514	{
				3515	reg_startpos[0].lnum = 0;
				3516	reg_startpos[0].col = col;
				3517	}
				3518	if (reg_endpos[0].lnum < 0)
				3519	{
				3520	reg_endpos[0].lnum = reglnum;
				3521	reg_endpos[0].col = (int)(reginput - regline);
				3522	}
				3523	else
				3524	/* Use line number of "\ze". */
				3525	reglnum = reg_endpos[0].lnum;
				3526	}
				3527	else
				3528	{
				3529	for (i = 0; i < NSUBEXP; i++)
				3530	{
				3531	reg_startp[i] = sub.start[i];
				3532	reg_endp[i] = sub.end[i];
				3533	}
				3534
				3535	if (reg_startp[0] == NULL)
				3536	reg_startp[0] = regline + col;
				3537	if (reg_endp[0] == NULL)
				3538	reg_endp[0] = reginput;
				3539	}
				3540
				3541	return 1 + reglnum;
				3542	}
				3543
				3544	/*
				3545	* Match a regexp against a string ("line" points to the string) or multiple
				3546	* lines ("line" is NULL, use reg_getline()).
				3547	*
				3548	* Returns 0 for failure, number of lines contained in the match otherwise.
				3549	*/
				3550	static long
				3551	nfa_regexec_both(line, col)
				3552	char_u *line;
				3553	colnr_T col; /* column to start looking for match */
				3554	{
				3555	nfa_regprog_T *prog;
				3556	long retval = 0L;
				3557	int i;
				3558
				3559	if (REG_MULTI)
				3560	{
				3561	prog = (nfa_regprog_T *)reg_mmatch->regprog;
				3562	line = reg_getline((linenr_T)0); /* relative to the cursor */
				3563	reg_startpos = reg_mmatch->startpos;
				3564	reg_endpos = reg_mmatch->endpos;
				3565	}
				3566	else
				3567	{
				3568	prog = (nfa_regprog_T *)reg_match->regprog;
				3569	reg_startp = reg_match->startp;
				3570	reg_endp = reg_match->endp;
				3571	}
				3572
				3573	/* Be paranoid... */
				3574	if (prog == NULL \|\| line == NULL)
				3575	{
				3576	EMSG(_(e_null));
				3577	goto theend;
				3578	}
				3579
				3580	/* If the start column is past the maximum column: no need to try. */
				3581	if (ireg_maxcol > 0 && col >= ireg_maxcol)
				3582	goto theend;
				3583
				3584	/* If pattern contains "\c" or "\C": overrule value of ireg_ic */
				3585	if (prog->regflags & RF_ICASE)
				3586	ireg_ic = TRUE;
				3587	else if (prog->regflags & RF_NOICASE)
				3588	ireg_ic = FALSE;
				3589
				3590	#ifdef FEAT_MBYTE
				3591	/* If pattern contains "\Z" overrule value of ireg_icombine */
				3592	if (prog->regflags & RF_ICOMBINE)
				3593	ireg_icombine = TRUE;
				3594	#endif
				3595
				3596	regline = line;
				3597	reglnum = 0; /* relative to line */
				3598
				3599	nstate = prog->nstate;
				3600
				3601	for (i = 0; i < nstate; ++i)
				3602	{
				3603	prog->state[i].id = i;
				3604	prog->state[i].lastlist = 0;
				3605	prog->state[i].visits = 0;
				3606	prog->state[i].lastthread = NULL;
				3607	}
				3608
				3609	retval = nfa_regtry(prog->start, col);
				3610
				3611	theend:
				3612	return retval;
				3613	}
				3614
				3615	/*
				3616	* Compile a regular expression into internal code for the NFA matcher.
				3617	* Returns the program in allocated space. Returns NULL for an error.
				3618	*/
				3619	static regprog_T *
				3620	nfa_regcomp(expr, re_flags)
				3621	char_u *expr;
				3622	int re_flags;
				3623	{
				3624	nfa_regprog_T *prog;
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame]	3625	size_t prog_size;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	3626	int *postfix;
				3627
				3628	if (expr == NULL)
				3629	return NULL;
				3630
				3631	#ifdef DEBUG
				3632	nfa_regengine.expr = expr;
				3633	#endif
				3634
				3635	init_class_tab();
				3636
				3637	if (nfa_regcomp_start(expr, re_flags) == FAIL)
				3638	return NULL;
				3639
				3640	/* Space for compiled regexp */
				3641	prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * nstate_max;
				3642	prog = (nfa_regprog_T *)lalloc(prog_size, TRUE);
				3643	if (prog == NULL)
				3644	goto fail;
				3645	vim_memset(prog, 0, prog_size);
				3646
				3647	/* Build postfix form of the regexp. Needed to build the NFA
				3648	* (and count its size) */
				3649	postfix = re2post();
				3650	if (postfix == NULL)
				3651	goto fail; /* Cascaded (syntax?) error */
				3652
				3653	/*
				3654	* In order to build the NFA, we parse the input regexp twice:
				3655	* 1. first pass to count size (so we can allocate space)
				3656	* 2. second to emit code
				3657	*/
				3658	#ifdef ENABLE_LOG
				3659	{
				3660	FILE *f = fopen(LOG_NAME, "a");
				3661
				3662	if (f != NULL)
				3663	{
				3664	fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\" ... hold on !\n", expr);
				3665	fclose(f);
				3666	}
				3667	}
				3668	#endif
				3669
				3670	/*
				3671	* PASS 1
				3672	* Count number of NFA states in "nstate". Do not build the NFA.
				3673	*/
				3674	post2nfa(postfix, post_ptr, TRUE);
				3675	state_ptr = prog->state;
				3676
				3677	/*
				3678	* PASS 2
				3679	* Build the NFA
				3680	*/
				3681	prog->start = post2nfa(postfix, post_ptr, FALSE);
				3682	if (prog->start == NULL)
				3683	goto fail;
				3684
				3685	prog->regflags = regflags;
				3686	prog->engine = &nfa_regengine;
				3687	prog->nstate = nstate;
				3688	#ifdef ENABLE_LOG
				3689	nfa_postfix_dump(expr, OK);
				3690	nfa_dump(prog);
				3691	#endif
				3692
				3693	out:
				3694	vim_free(post_start);
				3695	post_start = post_ptr = post_end = NULL;
				3696	state_ptr = NULL;
				3697	return (regprog_T *)prog;
				3698
				3699	fail:
				3700	vim_free(prog);
				3701	prog = NULL;
				3702	#ifdef ENABLE_LOG
				3703	nfa_postfix_dump(expr, FAIL);
				3704	#endif
				3705	#ifdef DEBUG
				3706	nfa_regengine.expr = NULL;
				3707	#endif
				3708	goto out;
				3709	}
				3710
				3711
				3712	/*
				3713	* Match a regexp against a string.
				3714	* "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
				3715	* Uses curbuf for line count and 'iskeyword'.
				3716	*
				3717	* Return TRUE if there is a match, FALSE if not.
				3718	*/
				3719	static int
				3720	nfa_regexec(rmp, line, col)
				3721	regmatch_T *rmp;
				3722	char_u line; / string to match against */
				3723	colnr_T col; /* column to start looking for match */
				3724	{
				3725	reg_match = rmp;
				3726	reg_mmatch = NULL;
				3727	reg_maxline = 0;
				3728	reg_line_lbr = FALSE;
				3729	reg_buf = curbuf;
				3730	reg_win = NULL;
				3731	ireg_ic = rmp->rm_ic;
				3732	#ifdef FEAT_MBYTE
				3733	ireg_icombine = FALSE;
				3734	#endif
				3735	ireg_maxcol = 0;
				3736	return (nfa_regexec_both(line, col) != 0);
				3737	}
				3738
				3739	#if defined(FEAT_MODIFY_FNAME) \|\| defined(FEAT_EVAL) \
				3740	\|\| defined(FIND_REPLACE_DIALOG) \|\| defined(PROTO)
				3741
				3742	static int nfa_regexec_nl __ARGS((regmatch_T rmp, char_u line, colnr_T col));
				3743
				3744	/*
				3745	* Like nfa_regexec(), but consider a "\n" in "line" to be a line break.
				3746	*/
				3747	static int
				3748	nfa_regexec_nl(rmp, line, col)
				3749	regmatch_T *rmp;
				3750	char_u line; / string to match against */
				3751	colnr_T col; /* column to start looking for match */
				3752	{
				3753	reg_match = rmp;
				3754	reg_mmatch = NULL;
				3755	reg_maxline = 0;
				3756	reg_line_lbr = TRUE;
				3757	reg_buf = curbuf;
				3758	reg_win = NULL;
				3759	ireg_ic = rmp->rm_ic;
				3760	#ifdef FEAT_MBYTE
				3761	ireg_icombine = FALSE;
				3762	#endif
				3763	ireg_maxcol = 0;
				3764	return (nfa_regexec_both(line, col) != 0);
				3765	}
				3766	#endif
				3767
				3768
				3769	/*
				3770	* Match a regexp against multiple lines.
				3771	* "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
				3772	* Uses curbuf for line count and 'iskeyword'.
				3773	*
				3774	* Return zero if there is no match. Return number of lines contained in the
				3775	* match otherwise.
				3776	*
				3777	* Note: the body is the same as bt_regexec() except for nfa_regexec_both()
				3778	*
				3779	* ! Also NOTE : match may actually be in another line. e.g.:
				3780	* when r.e. is \nc, cursor is at 'a' and the text buffer looks like
				3781	*
				3782	* +-------------------------+
				3783	* \|a \|
				3784	* \|b \|
				3785	* \|c \|
				3786	* \| \|
				3787	* +-------------------------+
				3788	*
				3789	* then nfa_regexec_multi() returns 3. while the original
				3790	* vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
				3791	*
				3792	* FIXME if this behavior is not compatible.
				3793	*/
				3794	static long
				3795	nfa_regexec_multi(rmp, win, buf, lnum, col, tm)
				3796	regmmatch_T *rmp;
				3797	win_T win; / window in which to search or NULL */
				3798	buf_T buf; / buffer in which to search */
				3799	linenr_T lnum; /* nr of line to start looking for match */
				3800	colnr_T col; /* column to start looking for match */
				3801	proftime_T tm UNUSED; / timeout limit or NULL */
				3802	{
				3803	long r;
				3804	buf_T *save_curbuf = curbuf;
				3805
				3806	reg_match = NULL;
				3807	reg_mmatch = rmp;
				3808	reg_buf = buf;
				3809	reg_win = win;
				3810	reg_firstlnum = lnum;
				3811	reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
				3812	reg_line_lbr = FALSE;
				3813	ireg_ic = rmp->rmm_ic;
				3814	#ifdef FEAT_MBYTE
				3815	ireg_icombine = FALSE;
				3816	#endif
				3817	ireg_maxcol = rmp->rmm_maxcol;
				3818
				3819	/* Need to switch to buffer "buf" to make vim_iswordc() work. */
				3820	curbuf = buf;
				3821	r = nfa_regexec_both(NULL, col);
				3822	curbuf = save_curbuf;
				3823
				3824	return r;
				3825	}
				3826
				3827	#ifdef DEBUG
				3828	# undef ENABLE_LOG
				3829	#endif