Blame - src/regexp_nfa.c - android_external_vim

blob: b50697aa3dfbfc29dd8b765bf9e535ab1758cd57 [file] [log] [blame]

Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	1	/* vi:set ts=8 sts=4 sw=4:
				2	*
				3	* NFA regular expression implementation.
				4	*
				5	* This file is included in "regexp.c".
				6	*/
				7
				8	#ifdef DEBUG
				9	/* Comment this out to disable log files. They can get pretty big */
				10	# define ENABLE_LOG
				11	# define LOG_NAME "log_nfarun.log"
				12	#endif
				13
				14	/* Upper limit allowed for {m,n} repetitions handled by NFA */
				15	#define NFA_BRACES_MAXLIMIT 10
				16	/* For allocating space for the postfix representation */
				17	#define NFA_POSTFIX_MULTIPLIER (NFA_BRACES_MAXLIMIT + 2)*2
				18	/* Size of stack, used when converting the postfix regexp into NFA */
				19	#define NFA_STACK_SIZE 1024
				20
				21	enum
				22	{
				23	NFA_SPLIT = -1024,
				24	NFA_MATCH,
				25	NFA_SKIP_CHAR, /* matches a 0-length char */
				26	NFA_END_NEG_RANGE, /* Used when expanding [^ab] */
				27
				28	NFA_CONCAT,
				29	NFA_OR,
				30	NFA_STAR,
				31	NFA_PLUS,
				32	NFA_QUEST,
				33	NFA_QUEST_NONGREEDY, /* Non-greedy version of \? */
				34	NFA_NOT, /* used for [^ab] negated char ranges */
				35
				36	NFA_BOL, /* ^ Begin line */
				37	NFA_EOL, /* $ End line */
				38	NFA_BOW, /* \< Begin word */
				39	NFA_EOW, /* \> End word */
				40	NFA_BOF, /* \%^ Begin file */
				41	NFA_EOF, /* \%$ End file */
				42	NFA_NEWL,
				43	NFA_ZSTART, /* Used for \zs */
				44	NFA_ZEND, /* Used for \ze */
				45	NFA_NOPEN, /* Start of subexpression marked with \%( */
				46	NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
				47	NFA_START_INVISIBLE,
				48	NFA_END_INVISIBLE,
				49	NFA_MULTIBYTE, /* Next nodes in NFA are part of the same
				50	multibyte char */
				51	NFA_END_MULTIBYTE, /* End of multibyte char in the NFA */
				52	NFA_COMPOSING, /* Next nodes in NFA are part of the
				53	composing multibyte char */
				54	NFA_END_COMPOSING, /* End of a composing char in the NFA */
				55
				56	/* The following are used only in the postfix form, not in the NFA */
				57	NFA_PREV_ATOM_NO_WIDTH, /* Used for \@= */
				58	NFA_PREV_ATOM_NO_WIDTH_NEG, /* Used for \@! */
				59	NFA_PREV_ATOM_JUST_BEFORE, /* Used for \@<= */
				60	NFA_PREV_ATOM_JUST_BEFORE_NEG, /* Used for \@<! */
				61	NFA_PREV_ATOM_LIKE_PATTERN, /* Used for \@> */
				62
				63	NFA_MOPEN,
				64	NFA_MCLOSE = NFA_MOPEN + NSUBEXP,
				65
				66	/* NFA_FIRST_NL */
				67	NFA_ANY = NFA_MCLOSE + NSUBEXP, /* Match any one character. */
				68	NFA_ANYOF, /* Match any character in this string. */
				69	NFA_ANYBUT, /* Match any character not in this string. */
				70	NFA_IDENT, /* Match identifier char */
				71	NFA_SIDENT, /* Match identifier char but no digit */
				72	NFA_KWORD, /* Match keyword char */
				73	NFA_SKWORD, /* Match word char but no digit */
				74	NFA_FNAME, /* Match file name char */
				75	NFA_SFNAME, /* Match file name char but no digit */
				76	NFA_PRINT, /* Match printable char */
				77	NFA_SPRINT, /* Match printable char but no digit */
				78	NFA_WHITE, /* Match whitespace char */
				79	NFA_NWHITE, /* Match non-whitespace char */
				80	NFA_DIGIT, /* Match digit char */
				81	NFA_NDIGIT, /* Match non-digit char */
				82	NFA_HEX, /* Match hex char */
				83	NFA_NHEX, /* Match non-hex char */
				84	NFA_OCTAL, /* Match octal char */
				85	NFA_NOCTAL, /* Match non-octal char */
				86	NFA_WORD, /* Match word char */
				87	NFA_NWORD, /* Match non-word char */
				88	NFA_HEAD, /* Match head char */
				89	NFA_NHEAD, /* Match non-head char */
				90	NFA_ALPHA, /* Match alpha char */
				91	NFA_NALPHA, /* Match non-alpha char */
				92	NFA_LOWER, /* Match lowercase char */
				93	NFA_NLOWER, /* Match non-lowercase char */
				94	NFA_UPPER, /* Match uppercase char */
				95	NFA_NUPPER, /* Match non-uppercase char */
				96	NFA_FIRST_NL = NFA_ANY + ADD_NL,
				97	NFA_LAST_NL = NFA_NUPPER + ADD_NL,
				98
				99	/* Character classes [:alnum:] etc */
				100	NFA_CLASS_ALNUM,
				101	NFA_CLASS_ALPHA,
				102	NFA_CLASS_BLANK,
				103	NFA_CLASS_CNTRL,
				104	NFA_CLASS_DIGIT,
				105	NFA_CLASS_GRAPH,
				106	NFA_CLASS_LOWER,
				107	NFA_CLASS_PRINT,
				108	NFA_CLASS_PUNCT,
				109	NFA_CLASS_SPACE,
				110	NFA_CLASS_UPPER,
				111	NFA_CLASS_XDIGIT,
				112	NFA_CLASS_TAB,
				113	NFA_CLASS_RETURN,
				114	NFA_CLASS_BACKSPACE,
				115	NFA_CLASS_ESCAPE
				116	};
				117
				118	/* Keep in sync with classchars. */
				119	static int nfa_classcodes[] = {
				120	NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
				121	NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
				122	NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
				123	NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
				124	NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
				125	NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
				126	NFA_UPPER, NFA_NUPPER
				127	};
				128
				129	static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
				130
				131	/*
				132	* NFA errors can be of 3 types:
				133	* *** NFA runtime errors, when something unknown goes wrong. The NFA fails
				134	* silently and revert the to backtracking engine.
				135	* syntax_error = FALSE;
				136	* *** Regexp syntax errors, when the input regexp is not syntactically correct.
				137	* The NFA engine displays an error message, and nothing else happens.
				138	* syntax_error = TRUE
				139	* *** Unsupported features, when the input regexp uses an operator that is not
				140	* implemented in the NFA. The NFA engine fails silently, and reverts to the
				141	* old backtracking engine.
				142	* syntax_error = FALSE
				143	* "The NFA fails" means that "compiling the regexp with the NFA fails":
				144	* nfa_regcomp() returns FAIL.
				145	*/
				146	static int syntax_error = FALSE;
				147
				148	/* NFA regexp \ze operator encountered. */
				149	static int nfa_has_zend = FALSE;
				150
				151	static int post_start; / holds the postfix form of r.e. */
				152	static int *post_end;
				153	static int *post_ptr;
				154
				155	static int nstate; /* Number of states in the NFA. */
				156	static int istate; /* Index in the state vector, used in new_state() */
				157	static int nstate_max; /* Upper bound of estimated number of states. */
				158
				159
				160	static int nfa_regcomp_start __ARGS((char_u*expr, int re_flags));
				161	static int nfa_recognize_char_class __ARGS((char_u start, char_u end, int extra_newl));
				162	static int nfa_emit_equi_class __ARGS((int c, int neg));
				163	static void nfa_inc __ARGS((char_u **p));
				164	static void nfa_dec __ARGS((char_u **p));
				165	static int nfa_regatom __ARGS((void));
				166	static int nfa_regpiece __ARGS((void));
				167	static int nfa_regconcat __ARGS((void));
				168	static int nfa_regbranch __ARGS((void));
				169	static int nfa_reg __ARGS((int paren));
				170	#ifdef DEBUG
				171	static void nfa_set_code __ARGS((int c));
				172	static void nfa_postfix_dump __ARGS((char_u *expr, int retval));
				173	static void nfa_print_state __ARGS((FILE debugf, nfa_state_T state, int ident));
				174	static void nfa_dump __ARGS((nfa_regprog_T *prog));
				175	#endif
				176	static int *re2post __ARGS((void));
				177	static nfa_state_T new_state __ARGS((int c, nfa_state_T out, nfa_state_T *out1));
				178	static nfa_state_T post2nfa __ARGS((int postfix, int *end, int nfa_calc_size));
				179	static int check_char_class __ARGS((int class, int c));
				180	static void st_error __ARGS((int postfix, int end, int *p));
				181	static void nfa_save_listids __ARGS((nfa_state_T start, int list));
				182	static void nfa_restore_listids __ARGS((nfa_state_T start, int list));
				183	static void nfa_set_null_listids __ARGS((nfa_state_T *start));
				184	static void nfa_set_neg_listids __ARGS((nfa_state_T *start));
				185	static long nfa_regtry __ARGS((nfa_state_T *start, colnr_T col));
				186	static long nfa_regexec_both __ARGS((char_u *line, colnr_T col));
				187	static regprog_T nfa_regcomp __ARGS((char_u expr, int re_flags));
				188	static int nfa_regexec __ARGS((regmatch_T rmp, char_u line, colnr_T col));
				189	static long nfa_regexec_multi __ARGS((regmmatch_T rmp, win_T win, buf_T buf, linenr_T lnum, colnr_T col, proftime_T tm));
				190
				191	/* helper functions used when doing re2post() ... regatom() parsing */
				192	#define EMIT(c) do { \
				193	if (post_ptr >= post_end) \
				194	return FAIL; \
				195	*post_ptr++ = c; \
				196	} while (0)
				197
				198	#define EMIT_MBYTE(c) \
				199	len = (*mb_char2bytes)(c, buf); \
				200	EMIT(buf[0]); \
				201	for (i = 1; i < len; i++) \
				202	{ \
				203	EMIT(buf[i]); \
				204	EMIT(NFA_CONCAT); \
				205	} \
				206	EMIT(NFA_MULTIBYTE);
				207
				208	#define EMIT_COMPOSING_UTF(input) \
				209	len = utfc_ptr2len(input); \
				210	EMIT(input[0]); \
				211	for (i = 1; i < len; i++) \
				212	{ \
				213	EMIT(input[i]); \
				214	EMIT(NFA_CONCAT); \
				215	} \
				216	EMIT(NFA_COMPOSING);
				217
				218	/*
				219	* Initialize internal variables before NFA compilation.
				220	* Return OK on success, FAIL otherwise.
				221	*/
				222	static int
				223	nfa_regcomp_start(expr, re_flags)
				224	char_u *expr;
				225	int re_flags; /* see vim_regcomp() */
				226	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	227	size_t postfix_size;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	228
				229	nstate = 0;
				230	istate = 0;
				231	/* A reasonable estimation for size */
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	232	nstate_max = (int)(STRLEN(expr) + 1) * NFA_POSTFIX_MULTIPLIER;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	233
Bram Moolenaar	bc0ea8f	2013-05-20 13:44:29 +0200	[diff] [blame]	234	/* Some items blow up in size, such as [A-z]. Add more space for that.
				235	* TODO: some patterns may still fail. */
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	236	nstate_max += 1000;
Bram Moolenaar	bc0ea8f	2013-05-20 13:44:29 +0200	[diff] [blame]	237
				238	/* Size for postfix representation of expr. */
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	239	postfix_size = sizeof(post_start) nstate_max;
Bram Moolenaar	bc0ea8f	2013-05-20 13:44:29 +0200	[diff] [blame]	240
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	241	post_start = (int *)lalloc(postfix_size, TRUE);
				242	if (post_start == NULL)
				243	return FAIL;
				244	vim_memset(post_start, 0, postfix_size);
				245	post_ptr = post_start;
Bram Moolenaar	bc0ea8f	2013-05-20 13:44:29 +0200	[diff] [blame]	246	post_end = post_start + nstate_max;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	247	nfa_has_zend = FALSE;
				248
				249	regcomp_start(expr, re_flags);
				250
				251	return OK;
				252	}
				253
				254	/*
				255	* Search between "start" and "end" and try to recognize a
				256	* character class in expanded form. For example [0-9].
				257	* On success, return the id the character class to be emitted.
				258	* On failure, return 0 (=FAIL)
				259	* Start points to the first char of the range, while end should point
				260	* to the closing brace.
				261	*/
				262	static int
				263	nfa_recognize_char_class(start, end, extra_newl)
				264	char_u *start;
				265	char_u *end;
				266	int extra_newl;
				267	{
				268	int i;
				269	/* Each of these variables takes up a char in "config[]",
				270	* in the order they are here. */
				271	int not = FALSE, af = FALSE, AF = FALSE, az = FALSE, AZ = FALSE,
				272	o7 = FALSE, o9 = FALSE, underscore = FALSE, newl = FALSE;
				273	char_u *p;
				274	#define NCONFIGS 16
				275	int classid[NCONFIGS] = {
				276	NFA_DIGIT, NFA_NDIGIT, NFA_HEX, NFA_NHEX,
				277	NFA_OCTAL, NFA_NOCTAL, NFA_WORD, NFA_NWORD,
				278	NFA_HEAD, NFA_NHEAD, NFA_ALPHA, NFA_NALPHA,
				279	NFA_LOWER, NFA_NLOWER, NFA_UPPER, NFA_NUPPER
				280	};
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	281	char_u myconfig[10];
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	282	char_u config[NCONFIGS][9] = {
				283	"000000100", /* digit */
				284	"100000100", /* non digit */
				285	"011000100", /* hex-digit */
				286	"111000100", /* non hex-digit */
				287	"000001000", /* octal-digit */
				288	"100001000", /* [^0-7] */
				289	"000110110", /* [0-9A-Za-z_] */
				290	"100110110", /* [^0-9A-Za-z_] */
				291	"000110010", /* head of word */
				292	"100110010", /* not head of word */
				293	"000110000", /* alphabetic char a-z */
				294	"100110000", /* non alphabetic char */
				295	"000100000", /* lowercase letter */
				296	"100100000", /* non lowercase */
				297	"000010000", /* uppercase */
				298	"100010000" /* non uppercase */
				299	};
				300
				301	if (extra_newl == TRUE)
				302	newl = TRUE;
				303
				304	if (*end != ']')
				305	return FAIL;
				306	p = start;
				307	if (*p == '^')
				308	{
				309	not = TRUE;
				310	p ++;
				311	}
				312
				313	while (p < end)
				314	{
				315	if (p + 2 < end && *(p + 1) == '-')
				316	{
				317	switch (*p)
				318	{
				319	case '0':
				320	if (*(p + 2) == '9')
				321	{
				322	o9 = TRUE;
				323	break;
				324	}
				325	else
				326	if (*(p + 2) == '7')
				327	{
				328	o7 = TRUE;
				329	break;
				330	}
				331	case 'a':
				332	if (*(p + 2) == 'z')
				333	{
				334	az = TRUE;
				335	break;
				336	}
				337	else
				338	if (*(p + 2) == 'f')
				339	{
				340	af = TRUE;
				341	break;
				342	}
				343	case 'A':
				344	if (*(p + 2) == 'Z')
				345	{
				346	AZ = TRUE;
				347	break;
				348	}
				349	else
				350	if (*(p + 2) == 'F')
				351	{
				352	AF = TRUE;
				353	break;
				354	}
				355	/* FALLTHROUGH */
				356	default:
				357	return FAIL;
				358	}
				359	p += 3;
				360	}
				361	else if (p + 1 < end && p == '\\' && (p + 1) == 'n')
				362	{
				363	newl = TRUE;
				364	p += 2;
				365	}
				366	else if (*p == '_')
				367	{
				368	underscore = TRUE;
				369	p ++;
				370	}
				371	else if (*p == '\n')
				372	{
				373	newl = TRUE;
				374	p ++;
				375	}
				376	else
				377	return FAIL;
				378	} /* while (p < end) */
				379
				380	if (p != end)
				381	return FAIL;
				382
				383	/* build the config that represents the ranges we gathered */
				384	STRCPY(myconfig, "000000000");
				385	if (not == TRUE)
				386	myconfig[0] = '1';
				387	if (af == TRUE)
				388	myconfig[1] = '1';
				389	if (AF == TRUE)
				390	myconfig[2] = '1';
				391	if (az == TRUE)
				392	myconfig[3] = '1';
				393	if (AZ == TRUE)
				394	myconfig[4] = '1';
				395	if (o7 == TRUE)
				396	myconfig[5] = '1';
				397	if (o9 == TRUE)
				398	myconfig[6] = '1';
				399	if (underscore == TRUE)
				400	myconfig[7] = '1';
				401	if (newl == TRUE)
				402	{
				403	myconfig[8] = '1';
				404	extra_newl = ADD_NL;
				405	}
				406	/* try to recognize character classes */
				407	for (i = 0; i < NCONFIGS; i++)
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	408	if (STRNCMP(myconfig, config[i], 8) == 0)
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	409	return classid[i] + extra_newl;
				410
				411	/* fallthrough => no success so far */
				412	return FAIL;
				413
				414	#undef NCONFIGS
				415	}
				416
				417	/*
				418	* Produce the bytes for equivalence class "c".
				419	* Currently only handles latin1, latin9 and utf-8.
				420	* Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
				421	* equivalent to 'a OR b OR c'
				422	*
				423	* NOTE! When changing this function, also update reg_equi_class()
				424	*/
				425	static int
				426	nfa_emit_equi_class(c, neg)
				427	int c;
				428	int neg;
				429	{
				430	int first = TRUE;
				431	int glue = neg == TRUE ? NFA_CONCAT : NFA_OR;
				432	#define EMIT2(c) \
				433	EMIT(c); \
				434	if (neg == TRUE) { \
				435	EMIT(NFA_NOT); \
				436	} \
				437	if (first == FALSE) \
				438	EMIT(glue); \
				439	else \
				440	first = FALSE; \
				441
				442	#ifdef FEAT_MBYTE
				443	if (enc_utf8 \|\| STRCMP(p_enc, "latin1") == 0
				444	\|\| STRCMP(p_enc, "iso-8859-15") == 0)
				445	#endif
				446	{
				447	switch (c)
				448	{
				449	case 'A': case '\300': case '\301': case '\302':
				450	case '\303': case '\304': case '\305':
				451	EMIT2('A'); EMIT2('\300'); EMIT2('\301');
				452	EMIT2('\302'); EMIT2('\303'); EMIT2('\304');
				453	EMIT2('\305');
				454	return OK;
				455
				456	case 'C': case '\307':
				457	EMIT2('C'); EMIT2('\307');
				458	return OK;
				459
				460	case 'E': case '\310': case '\311': case '\312': case '\313':
				461	EMIT2('E'); EMIT2('\310'); EMIT2('\311');
				462	EMIT2('\312'); EMIT2('\313');
				463	return OK;
				464
				465	case 'I': case '\314': case '\315': case '\316': case '\317':
				466	EMIT2('I'); EMIT2('\314'); EMIT2('\315');
				467	EMIT2('\316'); EMIT2('\317');
				468	return OK;
				469
				470	case 'N': case '\321':
				471	EMIT2('N'); EMIT2('\321');
				472	return OK;
				473
				474	case 'O': case '\322': case '\323': case '\324': case '\325':
				475	case '\326':
				476	EMIT2('O'); EMIT2('\322'); EMIT2('\323');
				477	EMIT2('\324'); EMIT2('\325'); EMIT2('\326');
				478	return OK;
				479
				480	case 'U': case '\331': case '\332': case '\333': case '\334':
				481	EMIT2('U'); EMIT2('\331'); EMIT2('\332');
				482	EMIT2('\333'); EMIT2('\334');
				483	return OK;
				484
				485	case 'Y': case '\335':
				486	EMIT2('Y'); EMIT2('\335');
				487	return OK;
				488
				489	case 'a': case '\340': case '\341': case '\342':
				490	case '\343': case '\344': case '\345':
				491	EMIT2('a'); EMIT2('\340'); EMIT2('\341');
				492	EMIT2('\342'); EMIT2('\343'); EMIT2('\344');
				493	EMIT2('\345');
				494	return OK;
				495
				496	case 'c': case '\347':
				497	EMIT2('c'); EMIT2('\347');
				498	return OK;
				499
				500	case 'e': case '\350': case '\351': case '\352': case '\353':
				501	EMIT2('e'); EMIT2('\350'); EMIT2('\351');
				502	EMIT2('\352'); EMIT2('\353');
				503	return OK;
				504
				505	case 'i': case '\354': case '\355': case '\356': case '\357':
				506	EMIT2('i'); EMIT2('\354'); EMIT2('\355');
				507	EMIT2('\356'); EMIT2('\357');
				508	return OK;
				509
				510	case 'n': case '\361':
				511	EMIT2('n'); EMIT2('\361');
				512	return OK;
				513
				514	case 'o': case '\362': case '\363': case '\364': case '\365':
				515	case '\366':
				516	EMIT2('o'); EMIT2('\362'); EMIT2('\363');
				517	EMIT2('\364'); EMIT2('\365'); EMIT2('\366');
				518	return OK;
				519
				520	case 'u': case '\371': case '\372': case '\373': case '\374':
				521	EMIT2('u'); EMIT2('\371'); EMIT2('\372');
				522	EMIT2('\373'); EMIT2('\374');
				523	return OK;
				524
				525	case 'y': case '\375': case '\377':
				526	EMIT2('y'); EMIT2('\375'); EMIT2('\377');
				527	return OK;
				528
				529	default:
				530	return FAIL;
				531	}
				532	}
				533
				534	EMIT(c);
				535	return OK;
				536	#undef EMIT2
				537	}
				538
				539	/*
				540	* Code to parse regular expression.
				541	*
				542	* We try to reuse parsing functions in regexp.c to
				543	* minimize surprise and keep the syntax consistent.
				544	*/
				545
				546	/*
				547	* Increments the pointer "p" by one (multi-byte) character.
				548	*/
				549	static void
				550	nfa_inc(p)
				551	char_u **p;
				552	{
				553	#ifdef FEAT_MBYTE
				554	if (has_mbyte)
				555	mb_ptr2char_adv(p);
				556	else
				557	#endif
				558	p = p + 1;
				559	}
				560
				561	/*
				562	* Decrements the pointer "p" by one (multi-byte) character.
				563	*/
				564	static void
				565	nfa_dec(p)
				566	char_u **p;
				567	{
				568	#ifdef FEAT_MBYTE
				569	char_u p2, oldp;
				570
				571	if (has_mbyte)
				572	{
				573	oldp = *p;
				574	/* Try to find the multibyte char that advances to the current
				575	* position. */
				576	do
				577	{
				578	p = p - 1;
				579	p2 = *p;
				580	mb_ptr2char_adv(&p2);
				581	} while (p2 != oldp);
				582	}
				583	#else
				584	p = p - 1;
				585	#endif
				586	}
				587
				588	/*
				589	* Parse the lowest level.
				590	*
				591	* An atom can be one of a long list of items. Many atoms match one character
				592	* in the text. It is often an ordinary character or a character class.
				593	* Braces can be used to make a pattern into an atom. The "\z(\)" construct
				594	* is only for syntax highlighting.
				595	*
				596	* atom ::= ordinary-atom
				597	* or $ pattern $
				598	* or \%( pattern \)
				599	* or \z( pattern \)
				600	*/
				601	static int
				602	nfa_regatom()
				603	{
				604	int c;
				605	int charclass;
				606	int equiclass;
				607	int collclass;
				608	int got_coll_char;
				609	char_u *p;
				610	char_u *endp;
				611	#ifdef FEAT_MBYTE
				612	char_u *old_regparse = regparse;
				613	int clen;
				614	int len;
				615	static char_u buf[30];
				616	int i;
				617	#endif
				618	int extra = 0;
				619	int first;
				620	int emit_range;
				621	int negated;
				622	int result;
				623	int startc = -1;
				624	int endc = -1;
				625	int oldstartc = -1;
				626	int cpo_lit; /* 'cpoptions' contains 'l' flag */
				627	int cpo_bsl; /* 'cpoptions' contains '\' flag */
				628	int glue; /* ID that will "glue" nodes together */
				629
				630	cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
				631	cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
				632
				633	c = getchr();
				634
				635	#ifdef FEAT_MBYTE
				636	/* clen has the length of the current char, without composing chars */
				637	clen = (*mb_char2len)(c);
				638	if (has_mbyte && clen > 1)
				639	goto nfa_do_multibyte;
				640	#endif
				641	switch (c)
				642	{
				643	case Magic('^'):
				644	EMIT(NFA_BOL);
				645	break;
				646
				647	case Magic('$'):
				648	EMIT(NFA_EOL);
				649	#if defined(FEAT_SYN_HL) \|\| defined(PROTO)
				650	had_eol = TRUE;
				651	#endif
				652	break;
				653
				654	case Magic('<'):
				655	EMIT(NFA_BOW);
				656	break;
				657
				658	case Magic('>'):
				659	EMIT(NFA_EOW);
				660	break;
				661
				662	case Magic('_'):
				663	c = no_Magic(getchr());
				664	if (c == '^') /* "\_^" is start-of-line */
				665	{
				666	EMIT(NFA_BOL);
				667	break;
				668	}
				669	if (c == '$') /* "\_$" is end-of-line */
				670	{
				671	EMIT(NFA_EOL);
				672	#if defined(FEAT_SYN_HL) \|\| defined(PROTO)
				673	had_eol = TRUE;
				674	#endif
				675	break;
				676	}
				677
				678	extra = ADD_NL;
				679
				680	/* "\_[" is collection plus newline */
				681	if (c == '[')
				682	/* TODO: make this work
				683	* goto collection; */
				684	return FAIL;
				685
				686	/* "\_x" is character class plus newline */
				687	/FALLTHROUGH/
				688
				689	/*
				690	* Character classes.
				691	*/
				692	case Magic('.'):
				693	case Magic('i'):
				694	case Magic('I'):
				695	case Magic('k'):
				696	case Magic('K'):
				697	case Magic('f'):
				698	case Magic('F'):
				699	case Magic('p'):
				700	case Magic('P'):
				701	case Magic('s'):
				702	case Magic('S'):
				703	case Magic('d'):
				704	case Magic('D'):
				705	case Magic('x'):
				706	case Magic('X'):
				707	case Magic('o'):
				708	case Magic('O'):
				709	case Magic('w'):
				710	case Magic('W'):
				711	case Magic('h'):
				712	case Magic('H'):
				713	case Magic('a'):
				714	case Magic('A'):
				715	case Magic('l'):
				716	case Magic('L'):
				717	case Magic('u'):
				718	case Magic('U'):
				719	p = vim_strchr(classchars, no_Magic(c));
				720	if (p == NULL)
				721	{
				722	return FAIL; /* runtime error */
				723	}
				724	#ifdef FEAT_MBYTE
				725	/* When '.' is followed by a composing char ignore the dot, so that
				726	* the composing char is matched here. */
				727	if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
				728	{
				729	c = getchr();
				730	goto nfa_do_multibyte;
				731	}
				732	#endif
				733	EMIT(nfa_classcodes[p - classchars]);
				734	if (extra == ADD_NL)
				735	{
				736	EMIT(NFA_NEWL);
				737	EMIT(NFA_OR);
				738	regflags \|= RF_HASNL;
				739	}
				740	break;
				741
				742	case Magic('n'):
				743	if (reg_string)
				744	/* In a string "\n" matches a newline character. */
				745	EMIT(NL);
				746	else
				747	{
				748	/* In buffer text "\n" matches the end of a line. */
				749	EMIT(NFA_NEWL);
				750	regflags \|= RF_HASNL;
				751	}
				752	break;
				753
				754	case Magic('('):
				755	if (nfa_reg(REG_PAREN) == FAIL)
				756	return FAIL; /* cascaded error */
				757	break;
				758
				759	case NUL:
				760	syntax_error = TRUE;
				761	EMSG_RET_FAIL(_("E865: (NFA) Regexp end encountered prematurely"));
				762
				763	case Magic('\|'):
				764	case Magic('&'):
				765	case Magic(')'):
				766	syntax_error = TRUE;
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	767	EMSGN(_(e_misplaced), no_Magic(c));
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	768	return FAIL;
				769
				770	case Magic('='):
				771	case Magic('?'):
				772	case Magic('+'):
				773	case Magic('@'):
				774	case Magic('*'):
				775	case Magic('{'):
				776	/* these should follow an atom, not form an atom */
				777	syntax_error = TRUE;
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	778	EMSGN(_(e_misplaced), no_Magic(c));
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	779	return FAIL;
				780
				781	case Magic('~'): /* previous substitute pattern */
				782	/* Not supported yet */
				783	return FAIL;
				784
				785	case Magic('1'):
				786	case Magic('2'):
				787	case Magic('3'):
				788	case Magic('4'):
				789	case Magic('5'):
				790	case Magic('6'):
				791	case Magic('7'):
				792	case Magic('8'):
				793	case Magic('9'):
				794	/* not supported yet */
				795	return FAIL;
				796
				797	case Magic('z'):
				798	c = no_Magic(getchr());
				799	switch (c)
				800	{
				801	case 's':
				802	EMIT(NFA_ZSTART);
				803	break;
				804	case 'e':
				805	EMIT(NFA_ZEND);
				806	nfa_has_zend = TRUE;
				807	/* TODO: Currently \ze does not work properly. */
				808	return FAIL;
				809	/* break; */
				810	case '1':
				811	case '2':
				812	case '3':
				813	case '4':
				814	case '5':
				815	case '6':
				816	case '7':
				817	case '8':
				818	case '9':
				819	case '(':
				820	/* \z1...\z9 and \z( not yet supported */
				821	return FAIL;
				822	default:
				823	syntax_error = TRUE;
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	824	EMSGN(_("E867: (NFA) Unknown operator '\\z%c'"),
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	825	no_Magic(c));
				826	return FAIL;
				827	}
				828	break;
				829
				830	case Magic('%'):
				831	c = no_Magic(getchr());
				832	switch (c)
				833	{
				834	/* () without a back reference */
				835	case '(':
				836	if (nfa_reg(REG_NPAREN) == FAIL)
				837	return FAIL;
				838	EMIT(NFA_NOPEN);
				839	break;
				840
				841	case 'd': /* %d123 decimal */
				842	case 'o': /* %o123 octal */
				843	case 'x': /* %xab hex 2 */
				844	case 'u': /* %uabcd hex 4 */
				845	case 'U': /* %U1234abcd hex 8 */
				846	/* Not yet supported */
				847	return FAIL;
				848
				849	c = coll_get_char();
				850	#ifdef FEAT_MBYTE
				851	if ((*mb_char2len)(c) > 1)
				852	{
				853	EMIT_MBYTE(c);
				854	}
				855	else
				856	#endif
				857	EMIT(c);
				858	break;
				859
				860	/* Catch \%^ and \%$ regardless of where they appear in the
				861	* pattern -- regardless of whether or not it makes sense. */
				862	case '^':
				863	EMIT(NFA_BOF);
				864	/* Not yet supported */
				865	return FAIL;
				866	break;
				867
				868	case '$':
				869	EMIT(NFA_EOF);
				870	/* Not yet supported */
				871	return FAIL;
				872	break;
				873
				874	case '#':
				875	/* not supported yet */
				876	return FAIL;
				877	break;
				878
				879	case 'V':
				880	/* not supported yet */
				881	return FAIL;
				882	break;
				883
				884	case '[':
				885	/* \%[abc] not supported yet */
				886	return FAIL;
				887
				888	default:
				889	/* not supported yet */
				890	return FAIL;
				891	}
				892	break;
				893
				894	/* collection: */
				895	case Magic('['):
				896	/*
				897	* Glue is emitted between several atoms from the [].
				898	* It is either NFA_OR, or NFA_CONCAT.
				899	*
				900	* [abc] expands to 'a b NFA_OR c NFA_OR' (in postfix notation)
				901	* [^abc] expands to 'a NFA_NOT b NFA_NOT NFA_CONCAT c NFA_NOT
				902	* NFA_CONCAT NFA_END_NEG_RANGE NFA_CONCAT' (in postfix
				903	* notation)
				904	*
				905	*/
				906
				907
				908	/* Emit negation atoms, if needed.
				909	* The CONCAT below merges the NOT with the previous node. */
				910	#define TRY_NEG() \
				911	if (negated == TRUE) \
				912	{ \
				913	EMIT(NFA_NOT); \
				914	}
				915
				916	/* Emit glue between important nodes : CONCAT or OR. */
				917	#define EMIT_GLUE() \
				918	if (first == FALSE) \
				919	EMIT(glue); \
				920	else \
				921	first = FALSE;
				922
				923	p = regparse;
				924	endp = skip_anyof(p);
				925	if (*endp == ']')
				926	{
				927	/*
				928	* Try to reverse engineer character classes. For example,
				929	* recognize that [0-9] stands for \d and [A-Za-z_] with \h,
				930	* and perform the necessary substitutions in the NFA.
				931	*/
				932	result = nfa_recognize_char_class(regparse, endp,
				933	extra == ADD_NL);
				934	if (result != FAIL)
				935	{
				936	if (result >= NFA_DIGIT && result <= NFA_NUPPER)
				937	EMIT(result);
				938	else /* must be char class + newline */
				939	{
				940	EMIT(result - ADD_NL);
				941	EMIT(NFA_NEWL);
				942	EMIT(NFA_OR);
				943	}
				944	regparse = endp;
				945	nfa_inc(&regparse);
				946	return OK;
				947	}
				948	/*
				949	* Failed to recognize a character class. Use the simple
				950	* version that turns [abc] into 'a' OR 'b' OR 'c'
				951	*/
				952	startc = endc = oldstartc = -1;
				953	first = TRUE; /* Emitting first atom in this sequence? */
				954	negated = FALSE;
				955	glue = NFA_OR;
				956	if (regparse == '^') / negated range */
				957	{
				958	negated = TRUE;
				959	glue = NFA_CONCAT;
				960	nfa_inc(&regparse);
				961	}
				962	if (*regparse == '-')
				963	{
				964	startc = '-';
				965	EMIT(startc);
				966	TRY_NEG();
				967	EMIT_GLUE();
				968	nfa_inc(&regparse);
				969	}
				970	/* Emit the OR branches for each character in the [] */
				971	emit_range = FALSE;
				972	while (regparse < endp)
				973	{
				974	oldstartc = startc;
				975	startc = -1;
				976	got_coll_char = FALSE;
				977	if (*regparse == '[')
				978	{
				979	/* Check for [: :], [= =], [. .] */
				980	equiclass = collclass = 0;
				981	charclass = get_char_class(&regparse);
				982	if (charclass == CLASS_NONE)
				983	{
				984	equiclass = get_equi_class(&regparse);
				985	if (equiclass == 0)
				986	collclass = get_coll_element(&regparse);
				987	}
				988
				989	/* Character class like [:alpha:] */
				990	if (charclass != CLASS_NONE)
				991	{
				992	switch (charclass)
				993	{
				994	case CLASS_ALNUM:
				995	EMIT(NFA_CLASS_ALNUM);
				996	break;
				997	case CLASS_ALPHA:
				998	EMIT(NFA_CLASS_ALPHA);
				999	break;
				1000	case CLASS_BLANK:
				1001	EMIT(NFA_CLASS_BLANK);
				1002	break;
				1003	case CLASS_CNTRL:
				1004	EMIT(NFA_CLASS_CNTRL);
				1005	break;
				1006	case CLASS_DIGIT:
				1007	EMIT(NFA_CLASS_DIGIT);
				1008	break;
				1009	case CLASS_GRAPH:
				1010	EMIT(NFA_CLASS_GRAPH);
				1011	break;
				1012	case CLASS_LOWER:
				1013	EMIT(NFA_CLASS_LOWER);
				1014	break;
				1015	case CLASS_PRINT:
				1016	EMIT(NFA_CLASS_PRINT);
				1017	break;
				1018	case CLASS_PUNCT:
				1019	EMIT(NFA_CLASS_PUNCT);
				1020	break;
				1021	case CLASS_SPACE:
				1022	EMIT(NFA_CLASS_SPACE);
				1023	break;
				1024	case CLASS_UPPER:
				1025	EMIT(NFA_CLASS_UPPER);
				1026	break;
				1027	case CLASS_XDIGIT:
				1028	EMIT(NFA_CLASS_XDIGIT);
				1029	break;
				1030	case CLASS_TAB:
				1031	EMIT(NFA_CLASS_TAB);
				1032	break;
				1033	case CLASS_RETURN:
				1034	EMIT(NFA_CLASS_RETURN);
				1035	break;
				1036	case CLASS_BACKSPACE:
				1037	EMIT(NFA_CLASS_BACKSPACE);
				1038	break;
				1039	case CLASS_ESCAPE:
				1040	EMIT(NFA_CLASS_ESCAPE);
				1041	break;
				1042	}
				1043	TRY_NEG();
				1044	EMIT_GLUE();
				1045	continue;
				1046	}
				1047	/* Try equivalence class [=a=] and the like */
				1048	if (equiclass != 0)
				1049	{
				1050	result = nfa_emit_equi_class(equiclass, negated);
				1051	if (result == FAIL)
				1052	{
				1053	/* should never happen */
				1054	EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!"));
				1055	}
				1056	EMIT_GLUE();
				1057	continue;
				1058	}
				1059	/* Try collating class like [. .] */
				1060	if (collclass != 0)
				1061	{
				1062	startc = collclass; /* allow [.a.]-x as a range */
				1063	/* Will emit the proper atom at the end of the
				1064	* while loop. */
				1065	}
				1066	}
				1067	/* Try a range like 'a-x' or '\t-z' */
				1068	if (*regparse == '-')
				1069	{
				1070	emit_range = TRUE;
				1071	startc = oldstartc;
				1072	nfa_inc(&regparse);
				1073	continue; /* reading the end of the range */
				1074	}
				1075
				1076	/* Now handle simple and escaped characters.
				1077	* Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
				1078	* accepts "\t", "\e", etc., but only when the 'l' flag in
				1079	* 'cpoptions' is not included.
				1080	* Posix doesn't recognize backslash at all.
				1081	*/
				1082	if (*regparse == '\\'
				1083	&& !cpo_bsl
				1084	&& regparse + 1 <= endp
				1085	&& (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
				1086	\|\| (!cpo_lit
				1087	&& vim_strchr(REGEXP_ABBR, regparse[1])
				1088	!= NULL)
				1089	)
				1090	)
				1091	{
				1092	nfa_inc(&regparse);
				1093
				1094	if (regparse == 'n' \|\| regparse == 'n')
				1095	startc = reg_string ? NL : NFA_NEWL;
				1096	else
				1097	if (*regparse == 'd'
				1098	\|\| *regparse == 'o'
				1099	\|\| *regparse == 'x'
				1100	\|\| *regparse == 'u'
				1101	\|\| *regparse == 'U'
				1102	)
				1103	{
				1104	/* TODO(RE) This needs more testing */
				1105	startc = coll_get_char();
				1106	got_coll_char = TRUE;
				1107	nfa_dec(&regparse);
				1108	}
				1109	else
				1110	{
				1111	/* \r,\t,\e,\b */
				1112	startc = backslash_trans(*regparse);
				1113	}
				1114	}
				1115
				1116	/* Normal printable char */
				1117	if (startc == -1)
				1118	#ifdef FEAT_MBYTE
				1119	startc = (*mb_ptr2char)(regparse);
				1120	#else
				1121	startc = *regparse;
				1122	#endif
				1123
				1124	/* Previous char was '-', so this char is end of range. */
				1125	if (emit_range)
				1126	{
				1127	endc = startc; startc = oldstartc;
				1128	if (startc > endc)
				1129	EMSG_RET_FAIL(_(e_invrange));
				1130	#ifdef FEAT_MBYTE
				1131	if (has_mbyte && ((*mb_char2len)(startc) > 1
				1132	\|\| (*mb_char2len)(endc) > 1))
				1133	{
				1134	if (endc > startc + 256)
				1135	EMSG_RET_FAIL(_(e_invrange));
				1136	/* Emit the range. "startc" was already emitted, so
				1137	* skip it. */
				1138	for (c = startc + 1; c <= endc; c++)
				1139	{
				1140	if ((*mb_char2len)(c) > 1)
				1141	{
				1142	EMIT_MBYTE(c);
				1143	}
				1144	else
				1145	EMIT(c);
				1146	TRY_NEG();
				1147	EMIT_GLUE();
				1148	}
				1149	emit_range = FALSE;
				1150	}
				1151	else
				1152	#endif
				1153	{
				1154	#ifdef EBCDIC
				1155	int alpha_only = FALSE;
				1156
				1157	/* for alphabetical range skip the gaps
				1158	* 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
				1159	if (isalpha(startc) && isalpha(endc))
				1160	alpha_only = TRUE;
				1161	#endif
				1162	/* Emit the range. "startc" was already emitted, so
				1163	* skip it. */
				1164	for (c = startc + 1; c <= endc; c++)
				1165	#ifdef EBCDIC
				1166	if (!alpha_only \|\| isalpha(startc))
				1167	#endif
				1168	{
				1169	EMIT(c);
				1170	TRY_NEG();
				1171	EMIT_GLUE();
				1172	}
				1173	emit_range = FALSE;
				1174	}
				1175	}
				1176	else
				1177	{
				1178	/*
				1179	* This char (startc) is not part of a range. Just
				1180	* emit it.
				1181	*
				1182	* Normally, simply emit startc. But if we get char
				1183	* code=0 from a collating char, then replace it with
				1184	* 0x0a.
				1185	*
				1186	* This is needed to completely mimic the behaviour of
				1187	* the backtracking engine.
				1188	*/
				1189	if (got_coll_char == TRUE && startc == 0)
				1190	EMIT(0x0a);
				1191	else
				1192	#ifdef FEAT_MBYTE
				1193	if ((*mb_char2len)(startc) > 1)
				1194	{
				1195	EMIT_MBYTE(startc);
				1196	}
				1197	else
				1198	#endif
				1199	EMIT(startc);
				1200	TRY_NEG();
				1201	EMIT_GLUE();
				1202	}
				1203
				1204	nfa_inc(&regparse);
				1205	} /* while (p < endp) */
				1206
				1207	nfa_dec(&regparse);
				1208	if (regparse == '-') / if last, '-' is just a char */
				1209	{
				1210	EMIT('-');
				1211	TRY_NEG();
				1212	EMIT_GLUE();
				1213	}
				1214	nfa_inc(&regparse);
				1215
				1216	if (extra == ADD_NL) /* \_[] also matches \n */
				1217	{
				1218	EMIT(reg_string ? NL : NFA_NEWL);
				1219	TRY_NEG();
				1220	EMIT_GLUE();
				1221	}
				1222
				1223	/* skip the trailing ] */
				1224	regparse = endp;
				1225	nfa_inc(&regparse);
				1226	if (negated == TRUE)
				1227	{
				1228	/* Mark end of negated char range */
				1229	EMIT(NFA_END_NEG_RANGE);
				1230	EMIT(NFA_CONCAT);
				1231	}
				1232	return OK;
				1233	} /* if exists closing ] */
				1234	else if (reg_strict)
				1235	{
				1236	syntax_error = TRUE;
				1237	EMSG_RET_FAIL(_(e_missingbracket));
				1238	}
				1239
				1240	/* FALLTHROUGH */
				1241	default:
				1242	{
				1243	#ifdef FEAT_MBYTE
				1244	int plen;
				1245
				1246	nfa_do_multibyte:
				1247	/* length of current char, with composing chars,
				1248	* from pointer */
				1249	plen = (*mb_ptr2len)(old_regparse);
				1250	if (enc_utf8 && clen != plen)
				1251	{
				1252	/* A composing character is always handled as a
				1253	* separate atom, surrounded by NFA_COMPOSING and
				1254	* NFA_END_COMPOSING. Note that right now we are
				1255	* building the postfix form, not the NFA itself;
				1256	* a composing char could be: a, b, c, NFA_COMPOSING
				1257	* where 'a', 'b', 'c' are chars with codes > 256.
				1258	*/
				1259	EMIT_COMPOSING_UTF(old_regparse);
				1260	regparse = old_regparse + plen;
				1261	}
				1262	else
				1263	/* A multi-byte character is always handled as a
				1264	* separate atom, surrounded by NFA_MULTIBYTE and
				1265	* NFA_END_MULTIBYTE */
				1266	if (plen > 1)
				1267	{
				1268	EMIT_MBYTE(c);
				1269	}
				1270	else
				1271	#endif
				1272	{
				1273	c = no_Magic(c);
				1274	EMIT(c);
				1275	}
				1276	return OK;
				1277	}
				1278	}
				1279
				1280	#undef TRY_NEG
				1281	#undef EMIT_GLUE
				1282
				1283	return OK;
				1284	}
				1285
				1286	/*
				1287	* Parse something followed by possible [*+=].
				1288	*
				1289	* A piece is an atom, possibly followed by a multi, an indication of how many
				1290	* times the atom can be matched. Example: "a*" matches any sequence of "a"
				1291	* characters: "", "a", "aa", etc.
				1292	*
				1293	* piece ::= atom
				1294	* or atom multi
				1295	*/
				1296	static int
				1297	nfa_regpiece()
				1298	{
				1299	int i;
				1300	int op;
				1301	int ret;
				1302	long minval, maxval;
				1303	int greedy = TRUE; /* Braces are prefixed with '-' ? */
				1304	char_u old_regparse, new_regparse;
				1305	int c2;
				1306	int old_post_ptr, my_post_start;
				1307	int old_regnpar;
				1308	int quest;
				1309
				1310	/* Save the current position in the regexp, so that we can use it if
				1311	* <atom>{m,n} is next. */
				1312	old_regparse = regparse;
				1313	/* Save current number of open parenthesis, so we can use it if
				1314	* <atom>{m,n} is next */
				1315	old_regnpar = regnpar;
				1316	/* store current pos in the postfix form, for \{m,n} involving 0s */
				1317	my_post_start = post_ptr;
				1318
				1319	ret = nfa_regatom();
				1320	if (ret == FAIL)
				1321	return FAIL; /* cascaded error */
				1322
				1323	op = peekchr();
				1324	if (re_multi_type(op) == NOT_MULTI)
				1325	return OK;
				1326
				1327	skipchr();
				1328	switch (op)
				1329	{
				1330	case Magic('*'):
				1331	EMIT(NFA_STAR);
				1332	break;
				1333
				1334	case Magic('+'):
				1335	/*
				1336	* Trick: Normally, (a*)\+ would match the whole input "aaa". The
				1337	* first and only submatch would be "aaa". But the backtracking
				1338	* engine interprets the plus as "try matching one more time", and
				1339	* a* matches a second time at the end of the input, the empty
				1340	* string.
				1341	* The submatch will the empty string.
				1342	*
				1343	* In order to be consistent with the old engine, we disable
				1344	* NFA_PLUS, and replace <atom>+ with <atom><atom>*
				1345	*/
				1346	/* EMIT(NFA_PLUS); */
				1347	regnpar = old_regnpar;
				1348	regparse = old_regparse;
				1349	curchr = -1;
				1350	if (nfa_regatom() == FAIL)
				1351	return FAIL;
				1352	EMIT(NFA_STAR);
				1353	EMIT(NFA_CONCAT);
				1354	skipchr(); /* skip the \+ */
				1355	break;
				1356
				1357	case Magic('@'):
				1358	op = no_Magic(getchr());
				1359	switch(op)
				1360	{
				1361	case '=':
				1362	EMIT(NFA_PREV_ATOM_NO_WIDTH);
				1363	break;
				1364	case '!':
				1365	case '<':
				1366	case '>':
				1367	/* Not supported yet */
				1368	return FAIL;
				1369	default:
				1370	syntax_error = TRUE;
Bram Moolenaar	ba40447	2013-05-19 22:31:18 +0200	[diff] [blame]	1371	EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op);
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	1372	return FAIL;
				1373	}
				1374	break;
				1375
				1376	case Magic('?'):
				1377	case Magic('='):
				1378	EMIT(NFA_QUEST);
				1379	break;
				1380
				1381	case Magic('{'):
				1382	/* a{2,5} will expand to 'aaa?a?a?'
				1383	* a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
				1384	* version of '?'
				1385	* \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
				1386	* parenthesis have the same id
				1387	*/
				1388
				1389	greedy = TRUE;
				1390	c2 = peekchr();
				1391	if (c2 == '-' \|\| c2 == Magic('-'))
				1392	{
				1393	skipchr();
				1394	greedy = FALSE;
				1395	}
				1396	if (!read_limits(&minval, &maxval))
				1397	{
				1398	syntax_error = TRUE;
				1399	EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
				1400	}
				1401	/* <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
				1402	* <atom>* */
				1403	if (minval == 0 && maxval == MAX_LIMIT && greedy)
				1404	{
				1405	EMIT(NFA_STAR);
				1406	break;
				1407	}
				1408
				1409	if (maxval > NFA_BRACES_MAXLIMIT)
				1410	{
				1411	/* This would yield a huge automaton and use too much memory.
				1412	* Revert to old engine */
				1413	return FAIL;
				1414	}
				1415
				1416	/* Special case: x{0} or x{-0} */
				1417	if (maxval == 0)
				1418	{
				1419	/* Ignore result of previous call to nfa_regatom() */
				1420	post_ptr = my_post_start;
				1421	/* NFA_SKIP_CHAR has 0-length and works everywhere */
				1422	EMIT(NFA_SKIP_CHAR);
				1423	return OK;
				1424	}
				1425
				1426	/* Ignore previous call to nfa_regatom() */
				1427	post_ptr = my_post_start;
				1428	/* Save pos after the repeated atom and the \{} */
				1429	new_regparse = regparse;
				1430
				1431	new_regparse = regparse;
				1432	quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
				1433	for (i = 0; i < maxval; i++)
				1434	{
				1435	/* Goto beginning of the repeated atom */
				1436	regparse = old_regparse;
				1437	curchr = -1;
				1438	/* Restore count of parenthesis */
				1439	regnpar = old_regnpar;
				1440	old_post_ptr = post_ptr;
				1441	if (nfa_regatom() == FAIL)
				1442	return FAIL;
				1443	/* after "minval" times, atoms are optional */
				1444	if (i + 1 > minval)
				1445	EMIT(quest);
				1446	if (old_post_ptr != my_post_start)
				1447	EMIT(NFA_CONCAT);
				1448	}
				1449
				1450	/* Go to just after the repeated atom and the \{} */
				1451	regparse = new_regparse;
				1452	curchr = -1;
				1453
				1454	break;
				1455
				1456
				1457	default:
				1458	break;
				1459	} /* end switch */
				1460
				1461	if (re_multi_type(peekchr()) != NOT_MULTI)
				1462	{
				1463	/* Can't have a multi follow a multi. */
				1464	syntax_error = TRUE;
				1465	EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi !"));
				1466	}
				1467
				1468	return OK;
				1469	}
				1470
				1471	/*
				1472	* Parse one or more pieces, concatenated. It matches a match for the
				1473	* first piece, followed by a match for the second piece, etc. Example:
				1474	* "f[0-9]b", first matches "f", then a digit and then "b".
				1475	*
				1476	* concat ::= piece
				1477	* or piece piece
				1478	* or piece piece piece
				1479	* etc.
				1480	*/
				1481	static int
				1482	nfa_regconcat()
				1483	{
				1484	int cont = TRUE;
				1485	int first = TRUE;
				1486
				1487	while (cont)
				1488	{
				1489	switch (peekchr())
				1490	{
				1491	case NUL:
				1492	case Magic('\|'):
				1493	case Magic('&'):
				1494	case Magic(')'):
				1495	cont = FALSE;
				1496	break;
				1497
				1498	case Magic('Z'):
				1499	#ifdef FEAT_MBYTE
				1500	regflags \|= RF_ICOMBINE;
				1501	#endif
				1502	skipchr_keepstart();
				1503	break;
				1504	case Magic('c'):
				1505	regflags \|= RF_ICASE;
				1506	skipchr_keepstart();
				1507	break;
				1508	case Magic('C'):
				1509	regflags \|= RF_NOICASE;
				1510	skipchr_keepstart();
				1511	break;
				1512	case Magic('v'):
				1513	reg_magic = MAGIC_ALL;
				1514	skipchr_keepstart();
				1515	curchr = -1;
				1516	break;
				1517	case Magic('m'):
				1518	reg_magic = MAGIC_ON;
				1519	skipchr_keepstart();
				1520	curchr = -1;
				1521	break;
				1522	case Magic('M'):
				1523	reg_magic = MAGIC_OFF;
				1524	skipchr_keepstart();
				1525	curchr = -1;
				1526	break;
				1527	case Magic('V'):
				1528	reg_magic = MAGIC_NONE;
				1529	skipchr_keepstart();
				1530	curchr = -1;
				1531	break;
				1532
				1533	default:
				1534	if (nfa_regpiece() == FAIL)
				1535	return FAIL;
				1536	if (first == FALSE)
				1537	EMIT(NFA_CONCAT);
				1538	else
				1539	first = FALSE;
				1540	break;
				1541	}
				1542	}
				1543
				1544	return OK;
				1545	}
				1546
				1547	/*
				1548	* Parse a branch, one or more concats, separated by "\&". It matches the
				1549	* last concat, but only if all the preceding concats also match at the same
				1550	* position. Examples:
				1551	* "foobeep\&..." matches "foo" in "foobeep".
				1552	* ".Peter\&.Bob" matches in a line containing both "Peter" and "Bob"
				1553	*
				1554	* branch ::= concat
				1555	* or concat \& concat
				1556	* or concat \& concat \& concat
				1557	* etc.
				1558	*/
				1559	static int
				1560	nfa_regbranch()
				1561	{
				1562	int ch;
				1563	int *old_post_ptr;
				1564
				1565	old_post_ptr = post_ptr;
				1566
				1567	/* First branch, possibly the only one */
				1568	if (nfa_regconcat() == FAIL)
				1569	return FAIL;
				1570
				1571	ch = peekchr();
				1572	/* Try next concats */
				1573	while (ch == Magic('&'))
				1574	{
				1575	skipchr();
				1576	EMIT(NFA_NOPEN);
				1577	EMIT(NFA_PREV_ATOM_NO_WIDTH);
				1578	old_post_ptr = post_ptr;
				1579	if (nfa_regconcat() == FAIL)
				1580	return FAIL;
				1581	/* if concat is empty, skip a input char. But do emit a node */
				1582	if (old_post_ptr == post_ptr)
				1583	EMIT(NFA_SKIP_CHAR);
				1584	EMIT(NFA_CONCAT);
				1585	ch = peekchr();
				1586	}
				1587
				1588	/* Even if a branch is empty, emit one node for it */
				1589	if (old_post_ptr == post_ptr)
				1590	EMIT(NFA_SKIP_CHAR);
				1591
				1592	return OK;
				1593	}
				1594
				1595	/*
				1596	* Parse a pattern, one or more branches, separated by "\\|". It matches
				1597	* anything that matches one of the branches. Example: "foo\\|beep" matches
				1598	* "foo" and matches "beep". If more than one branch matches, the first one
				1599	* is used.
				1600	*
				1601	* pattern ::= branch
				1602	* or branch \\| branch
				1603	* or branch \\| branch \\| branch
				1604	* etc.
				1605	*/
				1606	static int
				1607	nfa_reg(paren)
				1608	int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
				1609	{
				1610	int parno = 0;
				1611
				1612	#ifdef FEAT_SYN_HL
				1613	#endif
				1614	if (paren == REG_PAREN)
				1615	{
				1616	if (regnpar >= NSUBEXP) /* Too many `(' */
				1617	{
				1618	syntax_error = TRUE;
				1619	EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
				1620	}
				1621	parno = regnpar++;
				1622	}
				1623
				1624	if (nfa_regbranch() == FAIL)
				1625	return FAIL; /* cascaded error */
				1626
				1627	while (peekchr() == Magic('\|'))
				1628	{
				1629	skipchr();
				1630	if (nfa_regbranch() == FAIL)
				1631	return FAIL; /* cascaded error */
				1632	EMIT(NFA_OR);
				1633	}
				1634
				1635	/* Check for proper termination. */
				1636	if (paren != REG_NOPAREN && getchr() != Magic(')'))
				1637	{
				1638	syntax_error = TRUE;
				1639	if (paren == REG_NPAREN)
				1640	EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
				1641	else
				1642	EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
				1643	}
				1644	else if (paren == REG_NOPAREN && peekchr() != NUL)
				1645	{
				1646	syntax_error = TRUE;
				1647	if (peekchr() == Magic(')'))
				1648	EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
				1649	else
				1650	EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
				1651	}
				1652	/*
				1653	* Here we set the flag allowing back references to this set of
				1654	* parentheses.
				1655	*/
				1656	if (paren == REG_PAREN)
				1657	{
				1658	had_endbrace[parno] = TRUE; /* have seen the close paren */
				1659	EMIT(NFA_MOPEN + parno);
				1660	}
				1661
				1662	return OK;
				1663	}
				1664
				1665	typedef struct
				1666	{
				1667	char_u *start[NSUBEXP];
				1668	char_u *end[NSUBEXP];
				1669	lpos_T startpos[NSUBEXP];
				1670	lpos_T endpos[NSUBEXP];
				1671	} regsub_T;
				1672
				1673	static int nfa_regmatch __ARGS((nfa_state_T start, regsub_T submatch, regsub_T *m));
				1674
				1675	#ifdef DEBUG
				1676	static char_u code[50];
				1677
				1678	static void
				1679	nfa_set_code(c)
				1680	int c;
				1681	{
				1682	int addnl = FALSE;
				1683
				1684	if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
				1685	{
				1686	addnl = TRUE;
				1687	c -= ADD_NL;
				1688	}
				1689
				1690	STRCPY(code, "");
				1691	switch (c)
				1692	{
				1693	case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
				1694	case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
				1695	case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
				1696	case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
				1697	case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
				1698	case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
				1699
				1700	case NFA_PREV_ATOM_NO_WIDTH:
				1701	STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
				1702	case NFA_NOPEN: STRCPY(code, "NFA_MOPEN_INVISIBLE"); break;
				1703	case NFA_NCLOSE: STRCPY(code, "NFA_MCLOSE_INVISIBLE"); break;
				1704	case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
				1705	case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
				1706
				1707	case NFA_MULTIBYTE: STRCPY(code, "NFA_MULTIBYTE"); break;
				1708	case NFA_END_MULTIBYTE: STRCPY(code, "NFA_END_MULTIBYTE"); break;
				1709
				1710	case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
				1711	case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
				1712
				1713	case NFA_MOPEN + 0:
				1714	case NFA_MOPEN + 1:
				1715	case NFA_MOPEN + 2:
				1716	case NFA_MOPEN + 3:
				1717	case NFA_MOPEN + 4:
				1718	case NFA_MOPEN + 5:
				1719	case NFA_MOPEN + 6:
				1720	case NFA_MOPEN + 7:
				1721	case NFA_MOPEN + 8:
				1722	case NFA_MOPEN + 9:
				1723	STRCPY(code, "NFA_MOPEN(x)");
				1724	code[10] = c - NFA_MOPEN + '0';
				1725	break;
				1726	case NFA_MCLOSE + 0:
				1727	case NFA_MCLOSE + 1:
				1728	case NFA_MCLOSE + 2:
				1729	case NFA_MCLOSE + 3:
				1730	case NFA_MCLOSE + 4:
				1731	case NFA_MCLOSE + 5:
				1732	case NFA_MCLOSE + 6:
				1733	case NFA_MCLOSE + 7:
				1734	case NFA_MCLOSE + 8:
				1735	case NFA_MCLOSE + 9:
				1736	STRCPY(code, "NFA_MCLOSE(x)");
				1737	code[11] = c - NFA_MCLOSE + '0';
				1738	break;
				1739	case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
				1740	case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
				1741	case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
				1742	case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
				1743	case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
				1744	case NFA_PLUS: STRCPY(code, "NFA_PLUS "); break;
				1745	case NFA_NOT: STRCPY(code, "NFA_NOT "); break;
				1746	case NFA_SKIP_CHAR: STRCPY(code, "NFA_SKIP_CHAR"); break;
				1747	case NFA_OR: STRCPY(code, "NFA_OR"); break;
				1748	case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
				1749	case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
				1750	case NFA_END_NEG_RANGE: STRCPY(code, "NFA_END_NEG_RANGE"); break;
				1751	case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
				1752	case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
				1753	case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
				1754	case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
				1755	case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
				1756	case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
				1757	case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
				1758	case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
				1759	case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
				1760	case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
				1761	case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
				1762	case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
				1763	case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
				1764	case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
				1765	case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
				1766	case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
				1767
				1768	case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
				1769	case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
				1770	case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
				1771	case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
				1772	case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
				1773	case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
				1774	case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
				1775	case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
				1776	case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
				1777	case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
				1778	case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
				1779	case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
				1780	case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
				1781	case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
				1782	case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
				1783	case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
				1784	case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
				1785	case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
				1786	case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
				1787	case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
				1788	case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
				1789	case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
				1790	case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
				1791	case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
				1792	case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
				1793	case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
				1794	case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
				1795
				1796	default:
				1797	STRCPY(code, "CHAR(x)");
				1798	code[5] = c;
				1799	}
				1800
				1801	if (addnl == TRUE)
				1802	STRCAT(code, " + NEWLINE ");
				1803
				1804	}
				1805
				1806	#ifdef ENABLE_LOG
				1807	static FILE *log_fd;
				1808
				1809	/*
				1810	* Print the postfix notation of the current regexp.
				1811	*/
				1812	static void
				1813	nfa_postfix_dump(expr, retval)
				1814	char_u *expr;
				1815	int retval;
				1816	{
				1817	int *p;
				1818	FILE *f;
				1819
				1820	f = fopen("LOG.log", "a");
				1821	if (f != NULL)
				1822	{
				1823	fprintf(f, "\n-------------------------\n");
				1824	if (retval == FAIL)
				1825	fprintf(f, ">>> NFA engine failed ... \n");
				1826	else if (retval == OK)
				1827	fprintf(f, ">>> NFA engine succeeded !\n");
				1828	fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
				1829	for (p=post_start; *p; p++)
				1830	{
				1831	nfa_set_code(*p);
				1832	fprintf(f, "%s, ", code);
				1833	}
				1834	fprintf(f, "\"\nPostfix notation (int): ");
				1835	for (p=post_start; *p; p++)
				1836	fprintf(f, "%d ", *p);
				1837	fprintf(f, "\n\n");
				1838	fclose(f);
				1839	}
				1840	}
				1841
				1842	/*
				1843	* Print the NFA starting with a root node "state".
				1844	*/
				1845	static void
				1846	nfa_print_state(debugf, state, ident)
				1847	FILE *debugf;
				1848	nfa_state_T *state;
				1849	int ident;
				1850	{
				1851	int i;
				1852
				1853	if (state == NULL)
				1854	return;
				1855
				1856	fprintf(debugf, "(%2d)", abs(state->id));
				1857	for (i = 0; i < ident; i++)
				1858	fprintf(debugf, "%c", ' ');
				1859
				1860	nfa_set_code(state->c);
				1861	fprintf(debugf, "%s %s (%d) (id=%d)\n",
				1862	state->negated ? "NOT" : "", code, state->c, abs(state->id));
				1863	if (state->id < 0)
				1864	return;
				1865
				1866	state->id = abs(state->id) * -1;
				1867	nfa_print_state(debugf, state->out, ident + 4);
				1868	nfa_print_state(debugf, state->out1, ident + 4);
				1869	}
				1870
				1871	/*
				1872	* Print the NFA state machine.
				1873	*/
				1874	static void
				1875	nfa_dump(prog)
				1876	nfa_regprog_T *prog;
				1877	{
				1878	FILE *debugf = fopen("LOG.log", "a");
				1879
				1880	if (debugf != NULL)
				1881	{
				1882	nfa_print_state(debugf, prog->start, 0);
				1883	fclose(debugf);
				1884	}
				1885	}
				1886	#endif /* ENABLE_LOG */
				1887	#endif /* DEBUG */
				1888
				1889	/*
				1890	* Parse r.e. @expr and convert it into postfix form.
				1891	* Return the postfix string on success, NULL otherwise.
				1892	*/
				1893	static int *
				1894	re2post()
				1895	{
				1896	if (nfa_reg(REG_NOPAREN) == FAIL)
				1897	return NULL;
				1898	EMIT(NFA_MOPEN);
				1899	return post_start;
				1900	}
				1901
				1902	/* NB. Some of the code below is inspired by Russ's. */
				1903
				1904	/*
				1905	* Represents an NFA state plus zero or one or two arrows exiting.
				1906	* if c == MATCH, no arrows out; matching state.
				1907	* If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
				1908	* If c < 256, labeled arrow with character c to out.
				1909	*/
				1910
				1911	static nfa_state_T state_ptr; / points to nfa_prog->state */
				1912
				1913	/*
				1914	* Allocate and initialize nfa_state_T.
				1915	*/
				1916	static nfa_state_T *
				1917	new_state(c, out, out1)
				1918	int c;
				1919	nfa_state_T *out;
				1920	nfa_state_T *out1;
				1921	{
				1922	nfa_state_T *s;
				1923
				1924	if (istate >= nstate)
				1925	return NULL;
				1926
				1927	s = &state_ptr[istate++];
				1928
				1929	s->c = c;
				1930	s->out = out;
				1931	s->out1 = out1;
				1932
				1933	s->id = istate;
				1934	s->lastlist = 0;
				1935	s->lastthread = NULL;
				1936	s->visits = 0;
				1937	s->negated = FALSE;
				1938
				1939	return s;
				1940	}
				1941
				1942	/*
				1943	* A partially built NFA without the matching state filled in.
				1944	* Frag_T.start points at the start state.
				1945	* Frag_T.out is a list of places that need to be set to the
				1946	* next state for this fragment.
				1947	*/
				1948	typedef union Ptrlist Ptrlist;
				1949	struct Frag
				1950	{
				1951	nfa_state_T *start;
				1952	Ptrlist *out;
				1953	};
				1954	typedef struct Frag Frag_T;
				1955
				1956	static Frag_T frag __ARGS((nfa_state_T start, Ptrlist out));
				1957	static Ptrlist list1 __ARGS((nfa_state_T *outp));
				1958	static void patch __ARGS((Ptrlist l, nfa_state_T s));
				1959	static Ptrlist append __ARGS((Ptrlist l1, Ptrlist *l2));
				1960	static void st_push __ARGS((Frag_T s, Frag_T *p, Frag_T stack_end));
				1961	static Frag_T st_pop __ARGS((Frag_T *p, Frag_T stack));
				1962
				1963	/*
Bram Moolenaar	053bb60	2013-05-20 13:55:21 +0200	[diff] [blame]	1964	* Initialize a Frag_T struct and return it.
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	1965	*/
				1966	static Frag_T
				1967	frag(start, out)
				1968	nfa_state_T *start;
				1969	Ptrlist *out;
				1970	{
Bram Moolenaar	053bb60	2013-05-20 13:55:21 +0200	[diff] [blame]	1971	Frag_T n;
				1972
				1973	n.start = start;
				1974	n.out = out;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	1975	return n;
				1976	}
				1977
				1978	/*
				1979	* Since the out pointers in the list are always
				1980	* uninitialized, we use the pointers themselves
				1981	* as storage for the Ptrlists.
				1982	*/
				1983	union Ptrlist
				1984	{
				1985	Ptrlist *next;
				1986	nfa_state_T *s;
				1987	};
				1988
				1989	/*
				1990	* Create singleton list containing just outp.
				1991	*/
				1992	static Ptrlist *
				1993	list1(outp)
				1994	nfa_state_T **outp;
				1995	{
				1996	Ptrlist *l;
				1997
				1998	l = (Ptrlist *)outp;
				1999	l->next = NULL;
				2000	return l;
				2001	}
				2002
				2003	/*
				2004	* Patch the list of states at out to point to start.
				2005	*/
				2006	static void
				2007	patch(l, s)
				2008	Ptrlist *l;
				2009	nfa_state_T *s;
				2010	{
				2011	Ptrlist *next;
				2012
				2013	for (; l; l = next)
				2014	{
				2015	next = l->next;
				2016	l->s = s;
				2017	}
				2018	}
				2019
				2020
				2021	/*
				2022	* Join the two lists l1 and l2, returning the combination.
				2023	*/
				2024	static Ptrlist *
				2025	append(l1, l2)
				2026	Ptrlist *l1;
				2027	Ptrlist *l2;
				2028	{
				2029	Ptrlist *oldl1;
				2030
				2031	oldl1 = l1;
				2032	while (l1->next)
				2033	l1 = l1->next;
				2034	l1->next = l2;
				2035	return oldl1;
				2036	}
				2037
				2038	/*
				2039	* Stack used for transforming postfix form into NFA.
				2040	*/
				2041	static Frag_T empty;
				2042
				2043	static void
				2044	st_error(postfix, end, p)
				2045	int *postfix;
				2046	int *end;
				2047	int *p;
				2048	{
				2049	FILE *df;
				2050	int *p2;
				2051
				2052	df = fopen("stack.err", "a");
				2053	if (df)
				2054	{
				2055	fprintf(df, "Error popping the stack!\n");
				2056	#ifdef DEBUG
				2057	fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
				2058	#endif
				2059	fprintf(df, "Postfix form is: ");
				2060	#ifdef DEBUG
				2061	for (p2 = postfix; p2 < end; p2++)
				2062	{
				2063	nfa_set_code(*p2);
				2064	fprintf(df, "%s, ", code);
				2065	}
				2066	nfa_set_code(*p);
				2067	fprintf(df, "\nCurrent position is: ");
				2068	for (p2 = postfix; p2 <= p; p2 ++)
				2069	{
				2070	nfa_set_code(*p2);
				2071	fprintf(df, "%s, ", code);
				2072	}
				2073	#else
				2074	for (p2 = postfix; p2 < end; p2++)
				2075	{
				2076	fprintf(df, "%d, ", *p2);
				2077	}
				2078	fprintf(df, "\nCurrent position is: ");
				2079	for (p2 = postfix; p2 <= p; p2 ++)
				2080	{
				2081	fprintf(df, "%d, ", *p2);
				2082	}
				2083	#endif
				2084	fprintf(df, "\n--------------------------\n");
				2085	fclose(df);
				2086	}
				2087	EMSG(_("E874: (NFA) Could not pop the stack !"));
				2088	}
				2089
				2090	/*
				2091	* Push an item onto the stack.
				2092	*/
				2093	static void
				2094	st_push(s, p, stack_end)
				2095	Frag_T s;
				2096	Frag_T **p;
				2097	Frag_T *stack_end;
				2098	{
				2099	Frag_T stackp = p;
				2100
				2101	if (stackp >= stack_end)
				2102	return;
				2103	*stackp = s;
				2104	p = p + 1;
				2105	}
				2106
				2107	/*
				2108	* Pop an item from the stack.
				2109	*/
				2110	static Frag_T
				2111	st_pop(p, stack)
				2112	Frag_T **p;
				2113	Frag_T *stack;
				2114	{
				2115	Frag_T *stackp;
				2116
				2117	p = p - 1;
				2118	stackp = *p;
				2119	if (stackp < stack)
				2120	return empty;
				2121	return **p;
				2122	}
				2123
				2124	/*
				2125	* Convert a postfix form into its equivalent NFA.
				2126	* Return the NFA start state on success, NULL otherwise.
				2127	*/
				2128	static nfa_state_T *
				2129	post2nfa(postfix, end, nfa_calc_size)
				2130	int *postfix;
				2131	int *end;
				2132	int nfa_calc_size;
				2133	{
				2134	int *p;
				2135	int mopen;
				2136	int mclose;
				2137	Frag_T *stack = NULL;
				2138	Frag_T *stackp = NULL;
				2139	Frag_T *stack_end = NULL;
				2140	Frag_T e1;
				2141	Frag_T e2;
				2142	Frag_T e;
				2143	nfa_state_T *s;
				2144	nfa_state_T *s1;
				2145	nfa_state_T *matchstate;
				2146
				2147	if (postfix == NULL)
				2148	return NULL;
				2149
Bram Moolenaar	053bb60	2013-05-20 13:55:21 +0200	[diff] [blame]	2150	#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2151	#define POP() st_pop(&stackp, stack); \
				2152	if (stackp < stack) \
				2153	{ \
				2154	st_error(postfix, end, p); \
				2155	return NULL; \
				2156	}
				2157
				2158	if (nfa_calc_size == FALSE)
				2159	{
				2160	/* Allocate space for the stack. Max states on the stack : nstate */
				2161	stack = (Frag_T ) lalloc((nstate + 1)sizeof(Frag_T), TRUE);
				2162	stackp = stack;
				2163	stack_end = stack + NFA_STACK_SIZE;
				2164	}
				2165
				2166	for (p = postfix; p < end; ++p)
				2167	{
				2168	switch (*p)
				2169	{
				2170	case NFA_CONCAT:
				2171	/* Catenation.
				2172	* Pay attention: this operator does not exist
				2173	* in the r.e. itself (it is implicit, really).
				2174	* It is added when r.e. is translated to postfix
				2175	* form in re2post().
				2176	*
				2177	* No new state added here. */
				2178	if (nfa_calc_size == TRUE)
				2179	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2180	/* nstate += 0; */
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2181	break;
				2182	}
				2183	e2 = POP();
				2184	e1 = POP();
				2185	patch(e1.out, e2.start);
				2186	PUSH(frag(e1.start, e2.out));
				2187	break;
				2188
				2189	case NFA_NOT:
				2190	/* Negation of a character */
				2191	if (nfa_calc_size == TRUE)
				2192	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2193	/* nstate += 0; */
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2194	break;
				2195	}
				2196	e1 = POP();
				2197	e1.start->negated = TRUE;
				2198	if (e1.start->c == NFA_MULTIBYTE \|\| e1.start->c == NFA_COMPOSING)
				2199	e1.start->out1->negated = TRUE;
				2200	PUSH(e1);
				2201	break;
				2202
				2203	case NFA_OR:
				2204	/* Alternation */
				2205	if (nfa_calc_size == TRUE)
				2206	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2207	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2208	break;
				2209	}
				2210	e2 = POP();
				2211	e1 = POP();
				2212	s = new_state(NFA_SPLIT, e1.start, e2.start);
				2213	if (s == NULL)
				2214	return NULL;
				2215	PUSH(frag(s, append(e1.out, e2.out)));
				2216	break;
				2217
				2218	case NFA_STAR:
				2219	/* Zero or more */
				2220	if (nfa_calc_size == TRUE)
				2221	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2222	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2223	break;
				2224	}
				2225	e = POP();
				2226	s = new_state(NFA_SPLIT, e.start, NULL);
				2227	if (s == NULL)
				2228	return NULL;
				2229	patch(e.out, s);
				2230	PUSH(frag(s, list1(&s->out1)));
				2231	break;
				2232
				2233	case NFA_QUEST:
				2234	/* one or zero atoms=> greedy match */
				2235	if (nfa_calc_size == TRUE)
				2236	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2237	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2238	break;
				2239	}
				2240	e = POP();
				2241	s = new_state(NFA_SPLIT, e.start, NULL);
				2242	if (s == NULL)
				2243	return NULL;
				2244	PUSH(frag(s, append(e.out, list1(&s->out1))));
				2245	break;
				2246
				2247	case NFA_QUEST_NONGREEDY:
				2248	/* zero or one atoms => non-greedy match */
				2249	if (nfa_calc_size == TRUE)
				2250	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2251	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2252	break;
				2253	}
				2254	e = POP();
				2255	s = new_state(NFA_SPLIT, NULL, e.start);
				2256	if (s == NULL)
				2257	return NULL;
				2258	PUSH(frag(s, append(e.out, list1(&s->out))));
				2259	break;
				2260
				2261	case NFA_PLUS:
				2262	/* One or more */
				2263	if (nfa_calc_size == TRUE)
				2264	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2265	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2266	break;
				2267	}
				2268	e = POP();
				2269	s = new_state(NFA_SPLIT, e.start, NULL);
				2270	if (s == NULL)
				2271	return NULL;
				2272	patch(e.out, s);
				2273	PUSH(frag(e.start, list1(&s->out1)));
				2274	break;
				2275
				2276	case NFA_SKIP_CHAR:
				2277	/* Symbol of 0-length, Used in a repetition
				2278	* with max/min count of 0 */
				2279	if (nfa_calc_size == TRUE)
				2280	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2281	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2282	break;
				2283	}
				2284	s = new_state(NFA_SKIP_CHAR, NULL, NULL);
				2285	if (s == NULL)
				2286	return NULL;
				2287	PUSH(frag(s, list1(&s->out)));
				2288	break;
				2289
				2290	case NFA_PREV_ATOM_NO_WIDTH:
				2291	/* The \@= operator: match the preceding atom with 0 width.
				2292	* Surrounds the preceding atom with START_INVISIBLE and
				2293	* END_INVISIBLE, similarly to MOPEN.
				2294	*/
				2295	/* TODO: Maybe this drops the speed? */
				2296	return NULL;
				2297
				2298	if (nfa_calc_size == TRUE)
				2299	{
				2300	nstate += 2;
				2301	break;
				2302	}
				2303	e = POP();
				2304	s1 = new_state(NFA_END_INVISIBLE, NULL, NULL);
				2305	if (s1 == NULL)
				2306	return NULL;
				2307	patch(e.out, s1);
				2308
				2309	s = new_state(NFA_START_INVISIBLE, e.start, s1);
				2310	if (s == NULL)
				2311	return NULL;
				2312	PUSH(frag(s, list1(&s1->out)));
				2313	break;
				2314
				2315	case NFA_MOPEN + 0: /* Submatch */
				2316	case NFA_MOPEN + 1:
				2317	case NFA_MOPEN + 2:
				2318	case NFA_MOPEN + 3:
				2319	case NFA_MOPEN + 4:
				2320	case NFA_MOPEN + 5:
				2321	case NFA_MOPEN + 6:
				2322	case NFA_MOPEN + 7:
				2323	case NFA_MOPEN + 8:
				2324	case NFA_MOPEN + 9:
				2325	case NFA_NOPEN: /* \%( "Invisible Submatch" */
				2326	case NFA_MULTIBYTE: /* mbyte char */
				2327	case NFA_COMPOSING: /* composing char */
				2328	if (nfa_calc_size == TRUE)
				2329	{
				2330	nstate += 2;
				2331	break;
				2332	}
				2333
				2334	mopen = *p;
				2335	switch (*p)
				2336	{
				2337	case NFA_NOPEN:
				2338	mclose = NFA_NCLOSE;
				2339	break;
				2340	case NFA_MULTIBYTE:
				2341	mclose = NFA_END_MULTIBYTE;
				2342	break;
				2343	case NFA_COMPOSING:
				2344	mclose = NFA_END_COMPOSING;
				2345	break;
				2346	default:
				2347	/* NFA_MOPEN(0) ... NFA_MOPEN(9) */
				2348	mclose = *p + NSUBEXP;
				2349	break;
				2350	}
				2351
				2352	/* Allow "NFA_MOPEN" as a valid postfix representation for
				2353	* the empty regexp "". In this case, the NFA will be
				2354	* NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
				2355	* empty groups of parenthesis, and empty mbyte chars */
				2356	if (stackp == stack)
				2357	{
				2358	s = new_state(mopen, NULL, NULL);
				2359	if (s == NULL)
				2360	return NULL;
				2361	s1 = new_state(mclose, NULL, NULL);
				2362	if (s1 == NULL)
				2363	return NULL;
				2364	patch(list1(&s->out), s1);
				2365	PUSH(frag(s, list1(&s1->out)));
				2366	break;
				2367	}
				2368
				2369	/* At least one node was emitted before NFA_MOPEN, so
				2370	* at least one node will be between NFA_MOPEN and NFA_MCLOSE */
				2371	e = POP();
				2372	s = new_state(mopen, e.start, NULL); /* `(' */
				2373	if (s == NULL)
				2374	return NULL;
				2375
				2376	s1 = new_state(mclose, NULL, NULL); /* `)' */
				2377	if (s1 == NULL)
				2378	return NULL;
				2379	patch(e.out, s1);
				2380
				2381	if (mopen == NFA_MULTIBYTE \|\| mopen == NFA_COMPOSING)
				2382	/* MULTIBYTE->out1 = END_MULTIBYTE
				2383	* COMPOSING->out1 = END_COMPOSING */
				2384	patch(list1(&s->out1), s1);
				2385
				2386	PUSH(frag(s, list1(&s1->out)));
				2387	break;
				2388
				2389	case NFA_ZSTART:
				2390	case NFA_ZEND:
				2391	default:
				2392	/* Operands */
				2393	if (nfa_calc_size == TRUE)
				2394	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2395	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2396	break;
				2397	}
				2398	s = new_state(*p, NULL, NULL);
				2399	if (s == NULL)
				2400	return NULL;
				2401	PUSH(frag(s, list1(&s->out)));
				2402	break;
				2403
				2404	} /* switch(p) /
				2405
				2406	} /* for(p = postfix; p; ++p) /
				2407
				2408	if (nfa_calc_size == TRUE)
				2409	{
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2410	nstate++;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2411	return NULL; /* Return value when counting size is ignored anyway */
				2412	}
				2413
				2414	e = POP();
				2415	if (stackp != stack)
				2416	EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack"));
				2417
				2418	if (istate >= nstate)
				2419	EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA "));
				2420
				2421	vim_free(stack);
				2422
				2423	matchstate = &state_ptr[istate++]; /* the match state */
				2424	matchstate->c = NFA_MATCH;
				2425	matchstate->out = matchstate->out1 = NULL;
				2426
				2427	patch(e.out, matchstate);
				2428	return e.start;
				2429
				2430	#undef POP1
				2431	#undef PUSH1
				2432	#undef POP2
				2433	#undef PUSH2
				2434	#undef POP
				2435	#undef PUSH
				2436	}
				2437
				2438	/****************************************************************
				2439	* NFA execution code.
				2440	****************************************************************/
				2441
				2442	/* thread_T contains runtime information of a NFA state */
				2443	struct thread
				2444	{
				2445	nfa_state_T *state; /* the NFA state this thread is at */
				2446	regsub_T sub; /* submatch info; addstate() copies its "m" argument here */
				2447	};
				2448
				2449	typedef struct
				2450	{
				2451	thread_T *t; /* array of threads; nfa_regmatch() allocates nstate + 1 entries */
				2452	int n; /* number of entries in "t" that are in use */
				2453	} List;
				2454
				2455	static void addstate __ARGS((List l, nfa_state_T state, regsub_T m, int off, int lid, int match));
				2456
				2457	static void
				2458	addstate(l, state, m, off, lid, match)
				2459	List l; / runtime state list */
				2460	nfa_state_T state; / state to update */
				2461	regsub_T m; / pointers to subexpressions */
				2462	int off;
				2463	int lid;
				2464	int match; / found match? */
				2465	{
				2466	regsub_T save;
				2467	int subidx = 0;
				2468
				2469	if (l == NULL \|\| state == NULL)
				2470	return;
				2471
				2472	switch (state->c)
				2473	{
				2474	case NFA_SPLIT:
				2475	case NFA_NOT:
				2476	case NFA_NOPEN:
				2477	case NFA_NCLOSE:
				2478	case NFA_MCLOSE:
				2479	case NFA_MCLOSE + 1:
				2480	case NFA_MCLOSE + 2:
				2481	case NFA_MCLOSE + 3:
				2482	case NFA_MCLOSE + 4:
				2483	case NFA_MCLOSE + 5:
				2484	case NFA_MCLOSE + 6:
				2485	case NFA_MCLOSE + 7:
				2486	case NFA_MCLOSE + 8:
				2487	case NFA_MCLOSE + 9:
				2488	/* Do not remember these nodes in list "thislist" or "nextlist" */
				2489	break;
				2490
				2491	default:
				2492	if (state->lastlist == lid)
				2493	{
				2494	if (++state->visits > 2)
				2495	return;
				2496	}
				2497	else
				2498	{
				2499	/* add the state to the list */
				2500	state->lastlist = lid;
				2501	state->lastthread = &l->t[l->n++];
				2502	state->lastthread->state = state;
				2503	state->lastthread->sub = *m;
				2504	}
				2505	}
				2506
				2507	#ifdef ENABLE_LOG
				2508	nfa_set_code(state->c);
				2509	fprintf(log_fd, "> Adding state %d to list. Character %s, code %d\n",
				2510	abs(state->id), code, state->c);
				2511	#endif
				2512	switch (state->c)
				2513	{
				2514	case NFA_MATCH:
				2515	*match = TRUE;
				2516	break;
				2517
				2518	case NFA_SPLIT:
				2519	addstate(l, state->out, m, off, lid, match);
				2520	addstate(l, state->out1, m, off, lid, match);
				2521	break;
				2522
				2523	case NFA_SKIP_CHAR:
				2524	addstate(l, state->out, m, off, lid, match);
				2525	break;
				2526
				2527	#if 0
				2528	case NFA_END_NEG_RANGE:
				2529	/* Nothing to handle here. nfa_regmatch() will take care of it */
				2530	break;
				2531
				2532	case NFA_NOT:
				2533	EMSG(_("E999: (NFA regexp internal error) Should not process NOT node !"));
				2534	#ifdef ENABLE_LOG
				2535	fprintf(f, "\n\n>>> E999: Added state NFA_NOT to a list ... Something went wrong ! Why wasn't it processed already? \n\n");
				2536	#endif
				2537	break;
				2538
				2539	case NFA_COMPOSING:
				2540	/* nfa_regmatch() will match all the bytes of this composing char. */
				2541	break;
				2542
				2543	case NFA_MULTIBYTE:
				2544	/* nfa_regmatch() will match all the bytes of this multibyte char. */
				2545	break;
				2546	#endif
				2547
				2548	case NFA_END_MULTIBYTE:
				2549	/* Successfully matched this mbyte char */
				2550	addstate(l, state->out, m, off, lid, match);
				2551	break;
				2552
				2553	case NFA_NOPEN:
				2554	case NFA_NCLOSE:
				2555	addstate(l, state->out, m, off, lid, match);
				2556	break;
				2557
				2558	/* If this state is reached, then a recursive call of nfa_regmatch()
				2559	* succeeded. the next call saves the found submatches in the
				2560	* first state after the "invisible" branch. */
				2561	#if 0
				2562	case NFA_END_INVISIBLE:
				2563	break;
				2564	#endif
				2565
				2566	case NFA_MOPEN + 0:
				2567	case NFA_MOPEN + 1:
				2568	case NFA_MOPEN + 2:
				2569	case NFA_MOPEN + 3:
				2570	case NFA_MOPEN + 4:
				2571	case NFA_MOPEN + 5:
				2572	case NFA_MOPEN + 6:
				2573	case NFA_MOPEN + 7:
				2574	case NFA_MOPEN + 8:
				2575	case NFA_MOPEN + 9:
				2576	case NFA_ZSTART:
				2577	subidx = state->c - NFA_MOPEN;
				2578	if (state->c == NFA_ZSTART)
				2579	subidx = 0;
				2580
				2581	if (REG_MULTI)
				2582	{
				2583	save.startpos[subidx] = m->startpos[subidx];
				2584	save.endpos[subidx] = m->endpos[subidx];
				2585	m->startpos[subidx].lnum = reglnum;
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2586	m->startpos[subidx].col = (colnr_T)(reginput - regline + off);
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2587	}
				2588	else
				2589	{
				2590	save.start[subidx] = m->start[subidx];
				2591	save.end[subidx] = m->end[subidx];
				2592	m->start[subidx] = reginput + off;
				2593	}
				2594
				2595	addstate(l, state->out, m, off, lid, match);
				2596
				2597	if (REG_MULTI)
				2598	{
				2599	m->startpos[subidx] = save.startpos[subidx];
				2600	m->endpos[subidx] = save.endpos[subidx];
				2601	}
				2602	else
				2603	{
				2604	m->start[subidx] = save.start[subidx];
				2605	m->end[subidx] = save.end[subidx];
				2606	}
				2607	break;
				2608
				2609	case NFA_MCLOSE + 0:
				2610	if (nfa_has_zend == TRUE)
				2611	{
				2612	addstate(l, state->out, m, off, lid, match);
				2613	break;
				2614	}
				2615	case NFA_MCLOSE + 1:
				2616	case NFA_MCLOSE + 2:
				2617	case NFA_MCLOSE + 3:
				2618	case NFA_MCLOSE + 4:
				2619	case NFA_MCLOSE + 5:
				2620	case NFA_MCLOSE + 6:
				2621	case NFA_MCLOSE + 7:
				2622	case NFA_MCLOSE + 8:
				2623	case NFA_MCLOSE + 9:
				2624	case NFA_ZEND:
				2625	subidx = state->c - NFA_MCLOSE;
				2626	if (state->c == NFA_ZEND)
				2627	subidx = 0;
				2628
				2629	if (REG_MULTI)
				2630	{
				2631	save.startpos[subidx] = m->startpos[subidx];
				2632	save.endpos[subidx] = m->endpos[subidx];
				2633	m->endpos[subidx].lnum = reglnum;
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	2634	m->endpos[subidx].col = (colnr_T)(reginput - regline + off);
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	2635	}
				2636	else
				2637	{
				2638	save.start[subidx] = m->start[subidx];
				2639	save.end[subidx] = m->end[subidx];
				2640	m->end[subidx] = reginput + off;
				2641	}
				2642
				2643	addstate(l, state->out, m, off, lid, match);
				2644
				2645	if (REG_MULTI)
				2646	{
				2647	m->startpos[subidx] = save.startpos[subidx];
				2648	m->endpos[subidx] = save.endpos[subidx];
				2649	}
				2650	else
				2651	{
				2652	m->start[subidx] = save.start[subidx];
				2653	m->end[subidx] = save.end[subidx];
				2654	}
				2655	break;
				2656	}
				2657	}
				2658
				2659	/*
				2660	* Check character class "class" against current character c.
				2661	*/
				2662	static int
				2663	check_char_class(class, c)
				2664	int class;
				2665	int c;
				2666	{
				2667	switch (class)
				2668	{
				2669	case NFA_CLASS_ALNUM:
				2670	if (isalnum(c))
				2671	return OK;
				2672	break;
				2673	case NFA_CLASS_ALPHA:
				2674	if (isalpha(c))
				2675	return OK;
				2676	break;
				2677	case NFA_CLASS_BLANK:
				2678	if (c == ' ' \|\| c == '\t')
				2679	return OK;
				2680	break;
				2681	case NFA_CLASS_CNTRL:
				2682	if (iscntrl(c))
				2683	return OK;
				2684	break;
				2685	case NFA_CLASS_DIGIT:
				2686	if (VIM_ISDIGIT(c))
				2687	return OK;
				2688	break;
				2689	case NFA_CLASS_GRAPH:
				2690	if (isgraph(c))
				2691	return OK;
				2692	break;
				2693	case NFA_CLASS_LOWER:
				2694	if (MB_ISLOWER(c))
				2695	return OK;
				2696	break;
				2697	case NFA_CLASS_PRINT:
				2698	if (vim_isprintc(c))
				2699	return OK;
				2700	break;
				2701	case NFA_CLASS_PUNCT:
				2702	if (ispunct(c))
				2703	return OK;
				2704	break;
				2705	case NFA_CLASS_SPACE:
				2706	if ((c >=9 && c <= 13) \|\| (c == ' '))
				2707	return OK;
				2708	break;
				2709	case NFA_CLASS_UPPER:
				2710	if (MB_ISUPPER(c))
				2711	return OK;
				2712	break;
				2713	case NFA_CLASS_XDIGIT:
				2714	if (vim_isxdigit(c))
				2715	return OK;
				2716	break;
				2717	case NFA_CLASS_TAB:
				2718	if (c == '\t')
				2719	return OK;
				2720	break;
				2721	case NFA_CLASS_RETURN:
				2722	if (c == '\r')
				2723	return OK;
				2724	break;
				2725	case NFA_CLASS_BACKSPACE:
				2726	if (c == '\b')
				2727	return OK;
				2728	break;
				2729	case NFA_CLASS_ESCAPE:
				2730	if (c == '\033')
				2731	return OK;
				2732	break;
				2733
				2734	default:
				2735	/* should not be here :P */
				2736	EMSG_RET_FAIL(_("E877: (NFA regexp) Invalid character class "));
				2737	}
				2738	return FAIL;
				2739	}
				2740
				2741	/*
				2742	* Set all NFA nodes' list ID equal to -1.
				2743	*/
				2744	static void
				2745	nfa_set_neg_listids(start)
				2746	nfa_state_T *start;
				2747	{
				2748	if (start == NULL)
				2749	return;
				2750	if (start->lastlist >= 0)
				2751	{
				2752	start->lastlist = -1;
				2753	nfa_set_neg_listids(start->out);
				2754	nfa_set_neg_listids(start->out1);
				2755	}
				2756	}
				2757
				2758	/*
				2759	* Set all NFA nodes' list ID equal to 0.
				2760	*/
				2761	static void
				2762	nfa_set_null_listids(start)
				2763	nfa_state_T *start;
				2764	{
				2765	if (start == NULL)
				2766	return;
				2767	if (start->lastlist == -1)
				2768	{
				2769	start->lastlist = 0;
				2770	nfa_set_null_listids(start->out);
				2771	nfa_set_null_listids(start->out1);
				2772	}
				2773	}
				2774
				2775	/*
				2776	* Save list IDs for all NFA states in "list".
				2777	*/
				2778	static void
				2779	nfa_save_listids(start, list)
				2780	nfa_state_T *start;
				2781	int *list;
				2782	{
				2783	if (start == NULL)
				2784	return;
				2785	if (start->lastlist != -1)
				2786	{
				2787	list[abs(start->id)] = start->lastlist;
				2788	start->lastlist = -1;
				2789	nfa_save_listids(start->out, list);
				2790	nfa_save_listids(start->out1, list);
				2791	}
				2792	}
				2793
				2794	/*
				2795	* Restore list IDs from "list" to all NFA states.
				2796	*/
				2797	static void
				2798	nfa_restore_listids(start, list)
				2799	nfa_state_T *start;
				2800	int *list;
				2801	{
				2802	if (start == NULL)
				2803	return;
				2804	if (start->lastlist == -1)
				2805	{
				2806	start->lastlist = list[abs(start->id)];
				2807	nfa_restore_listids(start->out, list);
				2808	nfa_restore_listids(start->out1, list);
				2809	}
				2810	}
				2811
				2812	/*
				2813	* Main matching routine.
				2814	*
				2815	* Run NFA to determine whether it matches reginput.
				2816	*
				2817	* Return TRUE if there is a match, FALSE otherwise.
				2818	* Note: Caller must ensure that: start != NULL.
				2819	*/
				2820	static int
				2821	nfa_regmatch(start, submatch, m)
				2822	nfa_state_T *start;
				2823	regsub_T *submatch;
				2824	regsub_T *m;
				2825	{
				2826	int c = -1;
				2827	int n;
				2828	int i = 0;
				2829	int result;
				2830	int size = 0;
				2831	int match = FALSE;
				2832	int flag = 0;
				2833	int old_reglnum = -1;
				2834	int reginput_updated = FALSE;
				2835	thread_T *t;
				2836	char_u *cc;
				2837	char_u *old_reginput = NULL;
				2838	char_u *old_regline = NULL;
				2839	nfa_state_T *sta;
				2840	nfa_state_T *end;
				2841	List list[3];
				2842	List *listtbl[2][2];
				2843	List *ll;
				2844	int listid = 1;
				2845	int endnode = 0;
				2846	List *thislist;
				2847	List *nextlist;
				2848	List *neglist;
				2849	int *listids = NULL;
				2850	int j = 0;
				2851	int len = 0;
				2852	#ifdef DEBUG
				2853	FILE *debug = fopen("list.log", "a");
				2854
				2855	if (debug == NULL)
				2856	{
				2857	EMSG(_("(NFA) COULD NOT OPEN list.log !"));
				2858	return FALSE;
				2859	}
				2860	#endif
				2861
				2862	/* Allocate memory for the lists of nodes */
				2863	size = (nstate + 1) * sizeof(thread_T);
				2864	list[0].t = (thread_T *)lalloc(size, TRUE);
				2865	list[1].t = (thread_T *)lalloc(size, TRUE);
				2866	list[2].t = (thread_T *)lalloc(size, TRUE);
				2867	if (list[0].t == NULL \|\| list[1].t == NULL \|\| list[2].t == NULL)
				2868	goto theend;
				2869	vim_memset(list[0].t, 0, size);
				2870	vim_memset(list[1].t, 0, size);
				2871	vim_memset(list[2].t, 0, size);
				2872
				2873	#ifdef ENABLE_LOG
				2874	log_fd = fopen(LOG_NAME, "a");
				2875	if (log_fd != NULL)
				2876	{
				2877	fprintf(log_fd, "**********************************\n");
				2878	nfa_set_code(start->c);
				2879	fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
				2880	abs(start->id), code);
				2881	fprintf(log_fd, "**********************************\n");
				2882	}
				2883	else
				2884	{
				2885	EMSG(_("Could not open temporary log file for writing, displaying on stderr ... "));
				2886	log_fd = stderr;
				2887	}
				2888	#endif
				2889
				2890	thislist = &list[0];
				2891	thislist->n = 0;
				2892	nextlist = &list[1];
				2893	nextlist->n = 0;
				2894	neglist = &list[2];
				2895	neglist->n = 0;
				2896	#ifdef ENABLE_LOG
				2897	fprintf(log_fd, "(---) STARTSTATE\n");
				2898	#endif
				2899	addstate(thislist, start, m, 0, listid, &match);
				2900
				2901	/* There are two cases when the NFA advances: 1. input char matches the
				2902	* NFA node and 2. input char does not match the NFA node, but the next
				2903	* node is NFA_NOT. The following macro calls addstate() according to
				2904	* these rules. It is used A LOT, so use the "listtbl" table for speed */
				2905	listtbl[0][0] = NULL;
				2906	listtbl[0][1] = neglist;
				2907	listtbl[1][0] = nextlist;
				2908	listtbl[1][1] = NULL;
				2909	#define ADD_POS_NEG_STATE(node) \
				2910	ll = listtbl[result ? 1 : 0][node->negated]; \
				2911	if (ll != NULL) \
				2912	addstate(ll, node->out , &t->sub, n, listid + 1, &match);
				2913
				2914
				2915	/*
				2916	* Run for each character.
				2917	*/
				2918	do {
				2919	again:
				2920	#ifdef FEAT_MBYTE
				2921	if (has_mbyte)
				2922	{
				2923	c = (*mb_ptr2char)(reginput);
				2924	n = (*mb_ptr2len)(reginput);
				2925	}
				2926	else
				2927	#endif
				2928	{
				2929	c = *reginput;
				2930	n = 1;
				2931	}
				2932	if (c == NUL)
				2933	n = 0;
				2934	cc = (char_u *)&c;
				2935
				2936	/* swap lists */
				2937	thislist = &list[flag];
				2938	nextlist = &list[flag ^= 1];
				2939	nextlist->n = 0; /* `clear' nextlist */
				2940	listtbl[1][0] = nextlist;
				2941	++listid;
				2942
				2943	#ifdef ENABLE_LOG
				2944	fprintf(log_fd, "------------------------------------------\n");
				2945	fprintf(log_fd, ">>> Reginput is \"%s\"\n", reginput);
				2946	fprintf(log_fd, ">>> Advanced one character ... Current char is %c (code %d) \n", c, (int)c);
				2947	fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
				2948	for (i = 0; i< thislist->n; i++)
				2949	fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
				2950	fprintf(log_fd, "\n");
				2951	#endif
				2952
				2953	#ifdef DEBUG
				2954	fprintf(debug, "\n-------------------\n");
				2955	#endif
				2956
				2957	/* compute nextlist */
				2958	for (i = 0; i < thislist->n \|\| neglist->n > 0; ++i)
				2959	{
				2960	if (neglist->n > 0)
				2961	{
				2962	t = &neglist->t[0];
				2963	neglist->n --;
				2964	i--;
				2965	}
				2966	else
				2967	t = &thislist->t[i];
				2968
				2969	#ifdef DEBUG
				2970	nfa_set_code(t->state->c);
				2971	fprintf(debug, "%s, ", code);
				2972	#endif
				2973	#ifdef ENABLE_LOG
				2974	nfa_set_code(t->state->c);
				2975	fprintf(log_fd, "(%d) %s, code %d ... \n", abs(t->state->id),
				2976	code, (int)t->state->c);
				2977	#endif
				2978
				2979	/*
				2980	* Handle the possible codes of the current state.
				2981	* The most important is NFA_MATCH.
				2982	*/
				2983	switch (t->state->c)
				2984	{
				2985	case NFA_MATCH:
				2986	match = TRUE;
				2987	*submatch = t->sub;
				2988	#ifdef ENABLE_LOG
				2989	for (j = 0; j < 4; j++)
				2990	if (REG_MULTI)
				2991	fprintf(log_fd, "\n *** group %d, start: c=%d, l=%d, end: c=%d, l=%d",
				2992	j,
				2993	t->sub.startpos[j].col,
				2994	(int)t->sub.startpos[j].lnum,
				2995	t->sub.endpos[j].col,
				2996	(int)t->sub.endpos[j].lnum);
				2997	else
				2998	fprintf(log_fd, "\n *** group %d, start: \"%s\", end: \"%s\"",
				2999	j,
				3000	(char *)t->sub.start[j],
				3001	(char *)t->sub.end[j]);
				3002	fprintf(log_fd, "\n");
				3003	#endif
				3004	goto nextchar; /* found the left-most longest match */
				3005
				3006	case NFA_END_INVISIBLE:
				3007	/* This is only encountered after a NFA_START_INVISIBLE node.
				3008	* They surround a zero-width group, used with "\@=" and "\&".
				3009	* If we got here, it means that the current "invisible" group
				3010	* finished successfully, so return control to the parent
				3011	* nfa_regmatch(). Submatches are stored in *m, and used in
				3012	* the parent call. */
				3013	if (start->c == NFA_MOPEN + 0)
				3014	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3015	&match);
				3016	else
				3017	{
				3018	*m = t->sub;
				3019	match = TRUE;
				3020	}
				3021	break;
				3022
				3023	case NFA_START_INVISIBLE:
				3024	/* Save global variables, and call nfa_regmatch() to check if
				3025	* the current concat matches at this position. The concat
				3026	* ends with the node NFA_END_INVISIBLE */
				3027	old_reginput = reginput;
				3028	old_regline = regline;
				3029	old_reglnum = reglnum;
				3030	if (listids == NULL)
				3031	{
				3032	listids = (int ) lalloc(sizeof(int) nstate, TRUE);
				3033	if (listids == NULL)
				3034	{
				3035	EMSG(_("E878: (NFA) Could not allocate memory for branch traversal!"));
				3036	return 0;
				3037	}
				3038	}
				3039	#ifdef ENABLE_LOG
				3040	if (log_fd != stderr)
				3041	fclose(log_fd);
				3042	log_fd = NULL;
				3043	#endif
				3044	/* Have to clear the listid field of the NFA nodes, so that
				3045	* nfa_regmatch() and addstate() can run properly after
				3046	* recursion. */
				3047	nfa_save_listids(start, listids);
				3048	nfa_set_null_listids(start);
				3049	result = nfa_regmatch(t->state->out, submatch, m);
				3050	nfa_set_neg_listids(start);
				3051	nfa_restore_listids(start, listids);
				3052
				3053	#ifdef ENABLE_LOG
				3054	log_fd = fopen(LOG_NAME, "a");
				3055	if (log_fd != NULL)
				3056	{
				3057	fprintf(log_fd, "****************************\n");
				3058	fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
				3059	fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
				3060	fprintf(log_fd, "****************************\n");
				3061	}
				3062	else
				3063	{
				3064	EMSG(_("Could not open temporary log file for writing, displaying on stderr ... "));
				3065	log_fd = stderr;
				3066	}
				3067	#endif
				3068	if (result == TRUE)
				3069	{
				3070	/* Restore position in input text */
				3071	reginput = old_reginput;
				3072	regline = old_regline;
				3073	reglnum = old_reglnum;
				3074	/* Copy submatch info from the recursive call */
				3075	if (REG_MULTI)
				3076	for (j = 1; j < NSUBEXP; j++)
				3077	{
				3078	t->sub.startpos[j] = m->startpos[j];
				3079	t->sub.endpos[j] = m->endpos[j];
				3080	}
				3081	else
				3082	for (j = 1; j < NSUBEXP; j++)
				3083	{
				3084	t->sub.start[j] = m->start[j];
				3085	t->sub.end[j] = m->end[j];
				3086	}
				3087	/* t->state->out1 is the corresponding END_INVISIBLE node */
				3088	addstate(thislist, t->state->out1->out, &t->sub, 0, listid,
				3089	&match);
				3090	}
				3091	else
				3092	{
				3093	/* continue with next input char */
				3094	reginput = old_reginput;
				3095	}
				3096	break;
				3097
				3098	case NFA_BOL:
				3099	if (reginput == regline)
				3100	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3101	&match);
				3102	break;
				3103
				3104	case NFA_EOL:
				3105	if (c == NUL)
				3106	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3107	&match);
				3108	break;
				3109
				3110	case NFA_BOW:
				3111	{
				3112	int bow = TRUE;
				3113
				3114	if (c == NUL)
				3115	bow = FALSE;
				3116	#ifdef FEAT_MBYTE
				3117	else if (has_mbyte)
				3118	{
				3119	int this_class;
				3120
				3121	/* Get class of current and previous char (if it exists). */
				3122	this_class = mb_get_class(reginput);
				3123	if (this_class <= 1)
				3124	bow = FALSE;
				3125	else if (reg_prev_class() == this_class)
				3126	bow = FALSE;
				3127	}
				3128	#endif
				3129	else if (!vim_iswordc(c)
				3130	\|\| (reginput > regline && vim_iswordc(reginput[-1])))
				3131	bow = FALSE;
				3132	if (bow)
				3133	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3134	&match);
				3135	break;
				3136	}
				3137
				3138	case NFA_EOW:
				3139	{
				3140	int eow = TRUE;
				3141
				3142	if (reginput == regline)
				3143	eow = FALSE;
				3144	#ifdef FEAT_MBYTE
				3145	else if (has_mbyte)
				3146	{
				3147	int this_class, prev_class;
				3148
				3149	/* Get class of current and previous char (if it exists). */
				3150	this_class = mb_get_class(reginput);
				3151	prev_class = reg_prev_class();
				3152	if (this_class == prev_class
				3153	\|\| prev_class == 0 \|\| prev_class == 1)
				3154	eow = FALSE;
				3155	}
				3156	#endif
				3157	else if (!vim_iswordc(reginput[-1])
				3158	\|\| (reginput[0] != NUL && vim_iswordc(c)))
				3159	eow = FALSE;
				3160	if (eow)
				3161	addstate(thislist, t->state->out, &t->sub, 0, listid,
				3162	&match);
				3163	break;
				3164	}
				3165
				3166	case NFA_MULTIBYTE:
				3167	case NFA_COMPOSING:
				3168	switch (t->state->c)
				3169	{
				3170	case NFA_MULTIBYTE: endnode = NFA_END_MULTIBYTE; break;
				3171	case NFA_COMPOSING: endnode = NFA_END_COMPOSING; break;
				3172	default: endnode = 0;
				3173	}
				3174
				3175	result = OK;
				3176	sta = t->state->out;
				3177	len = 1;
				3178	while (sta->c != endnode && len <= n)
				3179	{
				3180	if (reginput[len-1] != sta->c)
				3181	{
				3182	result = OK - 1;
				3183	break;
				3184	}
				3185	len++;
				3186	sta = sta->out;
				3187	}
				3188
				3189	/* if input char length doesn't match regexp char length */
				3190	if (len -1 < n \|\| sta->c != endnode)
				3191	result = OK - 1;
				3192	end = t->state->out1; /* NFA_END_MULTIBYTE or
				3193	NFA_END_COMPOSING */
				3194	/* If \Z was present, then ignore composing characters */
				3195	if (regflags & RF_ICOMBINE)
				3196	result = 1 ^ sta->negated;
				3197	ADD_POS_NEG_STATE(end);
				3198	break;
				3199
				3200	case NFA_NEWL:
				3201	if (!reg_line_lbr && REG_MULTI
				3202	&& c == NUL && reglnum <= reg_maxline)
				3203	{
				3204	if (reginput_updated == FALSE)
				3205	{
				3206	reg_nextline();
				3207	reginput_updated = TRUE;
				3208	}
				3209	addstate(nextlist, t->state->out, &t->sub, n, listid + 1,
				3210	&match);
				3211	}
				3212	break;
				3213
				3214	case NFA_CLASS_ALNUM:
				3215	case NFA_CLASS_ALPHA:
				3216	case NFA_CLASS_BLANK:
				3217	case NFA_CLASS_CNTRL:
				3218	case NFA_CLASS_DIGIT:
				3219	case NFA_CLASS_GRAPH:
				3220	case NFA_CLASS_LOWER:
				3221	case NFA_CLASS_PRINT:
				3222	case NFA_CLASS_PUNCT:
				3223	case NFA_CLASS_SPACE:
				3224	case NFA_CLASS_UPPER:
				3225	case NFA_CLASS_XDIGIT:
				3226	case NFA_CLASS_TAB:
				3227	case NFA_CLASS_RETURN:
				3228	case NFA_CLASS_BACKSPACE:
				3229	case NFA_CLASS_ESCAPE:
				3230	result = check_char_class(t->state->c, c);
				3231	ADD_POS_NEG_STATE(t->state);
				3232	break;
				3233
				3234	case NFA_END_NEG_RANGE:
				3235	/* This follows a series of negated nodes, like:
				3236	* CHAR(x), NFA_NOT, CHAR(y), NFA_NOT etc. */
				3237	if (c > 0)
				3238	addstate(nextlist, t->state->out, &t->sub, n, listid + 1,
				3239	&match);
				3240	break;
				3241
				3242	case NFA_ANY:
				3243	/* Any printable char, not just any char. '\0' (end of input)
				3244	* must not match */
				3245	if (c > 0)
				3246	addstate(nextlist, t->state->out, &t->sub, n, listid + 1,
				3247	&match);
				3248	break;
				3249
				3250	/*
				3251	* Character classes like \a for alpha, \d for digit etc.
				3252	*/
				3253	case NFA_IDENT: /* \i */
				3254	result = vim_isIDc(c);
				3255	ADD_POS_NEG_STATE(t->state);
				3256	break;
				3257
				3258	case NFA_SIDENT: /* \I */
				3259	result = !VIM_ISDIGIT(c) && vim_isIDc(c);
				3260	ADD_POS_NEG_STATE(t->state);
				3261	break;
				3262
				3263	case NFA_KWORD: /* \k */
				3264	result = vim_iswordp(cc);
				3265	ADD_POS_NEG_STATE(t->state);
				3266	break;
				3267
				3268	case NFA_SKWORD: /* \K */
				3269	result = !VIM_ISDIGIT(c) && vim_iswordp(cc);
				3270	ADD_POS_NEG_STATE(t->state);
				3271	break;
				3272
				3273	case NFA_FNAME: /* \f */
				3274	result = vim_isfilec(c);
				3275	ADD_POS_NEG_STATE(t->state);
				3276	break;
				3277
				3278	case NFA_SFNAME: /* \F */
				3279	result = !VIM_ISDIGIT(c) && vim_isfilec(c);
				3280	ADD_POS_NEG_STATE(t->state);
				3281	break;
				3282
				3283	case NFA_PRINT: /* \p */
				3284	result = ptr2cells(cc) == 1;
				3285	ADD_POS_NEG_STATE(t->state);
				3286	break;
				3287
				3288	case NFA_SPRINT: /* \P */
				3289	result = !VIM_ISDIGIT(c) && ptr2cells(cc) == 1;
				3290	ADD_POS_NEG_STATE(t->state);
				3291	break;
				3292
				3293	case NFA_WHITE: /* \s */
				3294	result = vim_iswhite(c);
				3295	ADD_POS_NEG_STATE(t->state);
				3296	break;
				3297
				3298	case NFA_NWHITE: /* \S */
				3299	result = c != NUL && !vim_iswhite(c);
				3300	ADD_POS_NEG_STATE(t->state);
				3301	break;
				3302
				3303	case NFA_DIGIT: /* \d */
				3304	result = ri_digit(c);
				3305	ADD_POS_NEG_STATE(t->state);
				3306	break;
				3307
				3308	case NFA_NDIGIT: /* \D */
				3309	result = c != NUL && !ri_digit(c);
				3310	ADD_POS_NEG_STATE(t->state);
				3311	break;
				3312
				3313	case NFA_HEX: /* \x */
				3314	result = ri_hex(c);
				3315	ADD_POS_NEG_STATE(t->state);
				3316	break;
				3317
				3318	case NFA_NHEX: /* \X */
				3319	result = c != NUL && !ri_hex(c);
				3320	ADD_POS_NEG_STATE(t->state);
				3321	break;
				3322
				3323	case NFA_OCTAL: /* \o */
				3324	result = ri_octal(c);
				3325	ADD_POS_NEG_STATE(t->state);
				3326	break;
				3327
				3328	case NFA_NOCTAL: /* \O */
				3329	result = c != NUL && !ri_octal(c);
				3330	ADD_POS_NEG_STATE(t->state);
				3331	break;
				3332
				3333	case NFA_WORD: /* \w */
				3334	result = ri_word(c);
				3335	ADD_POS_NEG_STATE(t->state);
				3336	break;
				3337
				3338	case NFA_NWORD: /* \W */
				3339	result = c != NUL && !ri_word(c);
				3340	ADD_POS_NEG_STATE(t->state);
				3341	break;
				3342
				3343	case NFA_HEAD: /* \h */
				3344	result = ri_head(c);
				3345	ADD_POS_NEG_STATE(t->state);
				3346	break;
				3347
				3348	case NFA_NHEAD: /* \H */
				3349	result = c != NUL && !ri_head(c);
				3350	ADD_POS_NEG_STATE(t->state);
				3351	break;
				3352
				3353	case NFA_ALPHA: /* \a */
				3354	result = ri_alpha(c);
				3355	ADD_POS_NEG_STATE(t->state);
				3356	break;
				3357
				3358	case NFA_NALPHA: /* \A */
				3359	result = c != NUL && !ri_alpha(c);
				3360	ADD_POS_NEG_STATE(t->state);
				3361	break;
				3362
				3363	case NFA_LOWER: /* \l */
				3364	result = ri_lower(c);
				3365	ADD_POS_NEG_STATE(t->state);
				3366	break;
				3367
				3368	case NFA_NLOWER: /* \L */
				3369	result = c != NUL && !ri_lower(c);
				3370	ADD_POS_NEG_STATE(t->state);
				3371	break;
				3372
				3373	case NFA_UPPER: /* \u */
				3374	result = ri_upper(c);
				3375	ADD_POS_NEG_STATE(t->state);
				3376	break;
				3377
				3378	case NFA_NUPPER: /* \U */
				3379	result = c != NUL && !ri_upper(c);
				3380	ADD_POS_NEG_STATE(t->state);
				3381	break;
				3382
				3383	default: /* regular character */
				3384	result = (no_Magic(t->state->c) == c);
				3385	if (!result)
				3386	result = ireg_ic == TRUE
				3387	&& MB_TOLOWER(t->state->c) == MB_TOLOWER(c);
				3388	ADD_POS_NEG_STATE(t->state);
				3389	break;
				3390	}
				3391
				3392	} /* for (thislist = thislist; thislist->state; thislist++) */
				3393
				3394	/* The first found match is the leftmost one, but there may be a
				3395	* longer one. Keep running the NFA, but don't start from the
				3396	* beginning. Also, do not add the start state in recursive calls of
				3397	* nfa_regmatch(), because recursive calls should only start in the
				3398	* first position. */
				3399	if (match == FALSE && start->c == NFA_MOPEN + 0)
				3400	{
				3401	#ifdef ENABLE_LOG
				3402	fprintf(log_fd, "(---) STARTSTATE\n");
				3403	#endif
				3404	addstate(nextlist, start, m, n, listid + 1, &match);
				3405	}
				3406
				3407	if (reginput_updated)
				3408	{
				3409	reginput_updated = FALSE;
				3410	goto again;
				3411	}
				3412
				3413	#ifdef ENABLE_LOG
				3414	fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
				3415	for (i = 0; i< thislist->n; i++)
				3416	fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
				3417	fprintf(log_fd, "\n");
				3418	#endif
				3419
				3420	nextchar:
				3421	reginput += n;
				3422	} while (c \|\| reginput_updated);
				3423
				3424	#ifdef ENABLE_LOG
				3425	if (log_fd != stderr)
				3426	fclose(log_fd);
				3427	log_fd = NULL;
				3428	#endif
				3429
				3430	theend:
				3431	/* Free memory */
				3432	vim_free(list[0].t);
				3433	vim_free(list[1].t);
				3434	vim_free(list[2].t);
				3435	list[0].t = list[1].t = list[2].t = NULL;
				3436	if (listids != NULL)
				3437	vim_free(listids);
				3438	#undef ADD_POS_NEG_STATE
				3439	#ifdef DEBUG
				3440	fclose(debug);
				3441	#endif
				3442
				3443	return match;
				3444	}
				3445
				3446	/*
				3447	* Try match of "prog" with at regline["col"].
				3448	* Returns 0 for failure, number of lines contained in the match otherwise.
				3449	*/
				3450	static long
				3451	nfa_regtry(start, col)
				3452	nfa_state_T *start;
				3453	colnr_T col;
				3454	{
				3455	int i;
				3456	regsub_T sub, m;
				3457	#ifdef ENABLE_LOG
				3458	FILE *f;
				3459	#endif
				3460
				3461	reginput = regline + col;
				3462	need_clear_subexpr = TRUE;
				3463
				3464	#ifdef ENABLE_LOG
				3465	f = fopen(LOG_NAME, "a");
				3466	if (f != NULL)
				3467	{
				3468	fprintf(f, "\n\n\n\n\n\n\t\t=======================================================\n");
				3469	fprintf(f, " =======================================================\n");
				3470	#ifdef DEBUG
				3471	fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
				3472	#endif
				3473	fprintf(f, "\tInput text is \"%s\" \n", reginput);
				3474	fprintf(f, " =======================================================\n\n\n\n\n\n\n");
				3475	nfa_print_state(f, start, 0);
				3476	fprintf(f, "\n\n");
				3477	fclose(f);
				3478	}
				3479	else
				3480	EMSG(_("Could not open temporary log file for writing "));
				3481	#endif
				3482
				3483	if (REG_MULTI)
				3484	{
				3485	/* Use 0xff to set lnum to -1 */
				3486	vim_memset(sub.startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
				3487	vim_memset(sub.endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
				3488	vim_memset(m.startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
				3489	vim_memset(m.endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
				3490	}
				3491	else
				3492	{
				3493	vim_memset(sub.start, 0, sizeof(char_u ) NSUBEXP);
				3494	vim_memset(sub.end, 0, sizeof(char_u ) NSUBEXP);
				3495	vim_memset(m.start, 0, sizeof(char_u ) NSUBEXP);
				3496	vim_memset(m.end, 0, sizeof(char_u ) NSUBEXP);
				3497	}
				3498
				3499	if (nfa_regmatch(start, &sub, &m) == FALSE)
				3500	return 0;
				3501
				3502	cleanup_subexpr();
				3503	if (REG_MULTI)
				3504	{
				3505	for (i = 0; i < NSUBEXP; i++)
				3506	{
				3507	reg_startpos[i] = sub.startpos[i];
				3508	reg_endpos[i] = sub.endpos[i];
				3509	}
				3510
				3511	if (reg_startpos[0].lnum < 0)
				3512	{
				3513	reg_startpos[0].lnum = 0;
				3514	reg_startpos[0].col = col;
				3515	}
				3516	if (reg_endpos[0].lnum < 0)
				3517	{
				3518	reg_endpos[0].lnum = reglnum;
				3519	reg_endpos[0].col = (int)(reginput - regline);
				3520	}
				3521	else
				3522	/* Use line number of "\ze". */
				3523	reglnum = reg_endpos[0].lnum;
				3524	}
				3525	else
				3526	{
				3527	for (i = 0; i < NSUBEXP; i++)
				3528	{
				3529	reg_startp[i] = sub.start[i];
				3530	reg_endp[i] = sub.end[i];
				3531	}
				3532
				3533	if (reg_startp[0] == NULL)
				3534	reg_startp[0] = regline + col;
				3535	if (reg_endp[0] == NULL)
				3536	reg_endp[0] = reginput;
				3537	}
				3538
				3539	return 1 + reglnum;
				3540	}
				3541
				3542	/*
				3543	* Match a regexp against a string ("line" points to the string) or multiple
				3544	* lines ("line" is NULL, use reg_getline()).
				3545	*
				3546	* Returns 0 for failure, number of lines contained in the match otherwise.
				3547	*/
				3548	static long
				3549	nfa_regexec_both(line, col)
				3550	char_u *line;
				3551	colnr_T col; /* column to start looking for match */
				3552	{
				3553	nfa_regprog_T *prog;
				3554	long retval = 0L;
				3555	int i;
				3556
				3557	if (REG_MULTI)
				3558	{
				3559	prog = (nfa_regprog_T *)reg_mmatch->regprog;
				3560	line = reg_getline((linenr_T)0); /* relative to the cursor */
				3561	reg_startpos = reg_mmatch->startpos;
				3562	reg_endpos = reg_mmatch->endpos;
				3563	}
				3564	else
				3565	{
				3566	prog = (nfa_regprog_T *)reg_match->regprog;
				3567	reg_startp = reg_match->startp;
				3568	reg_endp = reg_match->endp;
				3569	}
				3570
				3571	/* Be paranoid... */
				3572	if (prog == NULL \|\| line == NULL)
				3573	{
				3574	EMSG(_(e_null));
				3575	goto theend;
				3576	}
				3577
				3578	/* If the start column is past the maximum column: no need to try. */
				3579	if (ireg_maxcol > 0 && col >= ireg_maxcol)
				3580	goto theend;
				3581
				3582	/* If pattern contains "\c" or "\C": overrule value of ireg_ic */
				3583	if (prog->regflags & RF_ICASE)
				3584	ireg_ic = TRUE;
				3585	else if (prog->regflags & RF_NOICASE)
				3586	ireg_ic = FALSE;
				3587
				3588	#ifdef FEAT_MBYTE
				3589	/* If pattern contains "\Z" overrule value of ireg_icombine */
				3590	if (prog->regflags & RF_ICOMBINE)
				3591	ireg_icombine = TRUE;
				3592	#endif
				3593
				3594	regline = line;
				3595	reglnum = 0; /* relative to line */
				3596
				3597	nstate = prog->nstate;
				3598
				3599	for (i = 0; i < nstate; ++i)
				3600	{
				3601	prog->state[i].id = i;
				3602	prog->state[i].lastlist = 0;
				3603	prog->state[i].visits = 0;
				3604	prog->state[i].lastthread = NULL;
				3605	}
				3606
				3607	retval = nfa_regtry(prog->start, col);
				3608
				3609	theend:
				3610	return retval;
				3611	}
				3612
				3613	/*
				3614	* Compile a regular expression into internal code for the NFA matcher.
				3615	* Returns the program in allocated space. Returns NULL for an error.
				3616	*/
				3617	static regprog_T *
				3618	nfa_regcomp(expr, re_flags)
				3619	char_u *expr;
				3620	int re_flags;
				3621	{
				3622	nfa_regprog_T *prog;
Bram Moolenaar	ca12d7c	2013-05-20 21:26:33 +0200	[diff] [blame^]	3623	size_t prog_size;
Bram Moolenaar	fbc0d2e	2013-05-19 19:40:29 +0200	[diff] [blame]	3624	int *postfix;
				3625
				3626	if (expr == NULL)
				3627	return NULL;
				3628
				3629	#ifdef DEBUG
				3630	nfa_regengine.expr = expr;
				3631	#endif
				3632
				3633	init_class_tab();
				3634
				3635	if (nfa_regcomp_start(expr, re_flags) == FAIL)
				3636	return NULL;
				3637
				3638	/* Space for compiled regexp */
				3639	prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * nstate_max;
				3640	prog = (nfa_regprog_T *)lalloc(prog_size, TRUE);
				3641	if (prog == NULL)
				3642	goto fail;
				3643	vim_memset(prog, 0, prog_size);
				3644
				3645	/* Build postfix form of the regexp. Needed to build the NFA
				3646	* (and count its size) */
				3647	postfix = re2post();
				3648	if (postfix == NULL)
				3649	goto fail; /* Cascaded (syntax?) error */
				3650
				3651	/*
				3652	* In order to build the NFA, we parse the input regexp twice:
				3653	* 1. first pass to count size (so we can allocate space)
				3654	* 2. second to emit code
				3655	*/
				3656	#ifdef ENABLE_LOG
				3657	{
				3658	FILE *f = fopen(LOG_NAME, "a");
				3659
				3660	if (f != NULL)
				3661	{
				3662	fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\" ... hold on !\n", expr);
				3663	fclose(f);
				3664	}
				3665	}
				3666	#endif
				3667
				3668	/*
				3669	* PASS 1
				3670	* Count number of NFA states in "nstate". Do not build the NFA.
				3671	*/
				3672	post2nfa(postfix, post_ptr, TRUE);
				3673	state_ptr = prog->state;
				3674
				3675	/*
				3676	* PASS 2
				3677	* Build the NFA
				3678	*/
				3679	prog->start = post2nfa(postfix, post_ptr, FALSE);
				3680	if (prog->start == NULL)
				3681	goto fail;
				3682
				3683	prog->regflags = regflags;
				3684	prog->engine = &nfa_regengine;
				3685	prog->nstate = nstate;
				3686	#ifdef ENABLE_LOG
				3687	nfa_postfix_dump(expr, OK);
				3688	nfa_dump(prog);
				3689	#endif
				3690
				3691	out:
				3692	vim_free(post_start);
				3693	post_start = post_ptr = post_end = NULL;
				3694	state_ptr = NULL;
				3695	return (regprog_T *)prog;
				3696
				3697	fail:
				3698	vim_free(prog);
				3699	prog = NULL;
				3700	#ifdef ENABLE_LOG
				3701	nfa_postfix_dump(expr, FAIL);
				3702	#endif
				3703	#ifdef DEBUG
				3704	nfa_regengine.expr = NULL;
				3705	#endif
				3706	goto out;
				3707	}
				3708
				3709
				3710	/*
				3711	* Match a regexp against a string.
				3712	* "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
				3713	* Uses curbuf for line count and 'iskeyword'.
				3714	*
				3715	* Return TRUE if there is a match, FALSE if not.
				3716	*/
				3717	static int
				3718	nfa_regexec(rmp, line, col)
				3719	regmatch_T *rmp;
				3720	char_u line; / string to match against */
				3721	colnr_T col; /* column to start looking for match */
				3722	{
				3723	reg_match = rmp;
				3724	reg_mmatch = NULL;
				3725	reg_maxline = 0;
				3726	reg_line_lbr = FALSE;
				3727	reg_buf = curbuf;
				3728	reg_win = NULL;
				3729	ireg_ic = rmp->rm_ic;
				3730	#ifdef FEAT_MBYTE
				3731	ireg_icombine = FALSE;
				3732	#endif
				3733	ireg_maxcol = 0;
				3734	return (nfa_regexec_both(line, col) != 0);
				3735	}
				3736
				3737	#if defined(FEAT_MODIFY_FNAME) \|\| defined(FEAT_EVAL) \
				3738	\|\| defined(FIND_REPLACE_DIALOG) \|\| defined(PROTO)
				3739
				3740	static int nfa_regexec_nl __ARGS((regmatch_T rmp, char_u line, colnr_T col));
				3741
				3742	/*
				3743	* Like nfa_regexec(), but consider a "\n" in "line" to be a line break.
				3744	*/
				3745	static int
				3746	nfa_regexec_nl(rmp, line, col)
				3747	regmatch_T *rmp;
				3748	char_u line; / string to match against */
				3749	colnr_T col; /* column to start looking for match */
				3750	{
				3751	reg_match = rmp;
				3752	reg_mmatch = NULL;
				3753	reg_maxline = 0;
				3754	reg_line_lbr = TRUE;
				3755	reg_buf = curbuf;
				3756	reg_win = NULL;
				3757	ireg_ic = rmp->rm_ic;
				3758	#ifdef FEAT_MBYTE
				3759	ireg_icombine = FALSE;
				3760	#endif
				3761	ireg_maxcol = 0;
				3762	return (nfa_regexec_both(line, col) != 0);
				3763	}
				3764	#endif
				3765
				3766
				3767	/*
				3768	* Match a regexp against multiple lines.
				3769	* "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
				3770	* Uses curbuf for line count and 'iskeyword'.
				3771	*
				3772	* Return zero if there is no match. Return number of lines contained in the
				3773	* match otherwise.
				3774	*
				3775	* Note: the body is the same as bt_regexec() except for nfa_regexec_both()
				3776	*
				3777	* ! Also NOTE : match may actually be in another line. e.g.:
				3778	* when r.e. is \nc, cursor is at 'a' and the text buffer looks like
				3779	*
				3780	* +-------------------------+
				3781	* \|a \|
				3782	* \|b \|
				3783	* \|c \|
				3784	* \| \|
				3785	* +-------------------------+
				3786	*
				3787	* then nfa_regexec_multi() returns 3. while the original
				3788	* vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
				3789	*
				3790	* FIXME if this behavior is not compatible.
				3791	*/
				3792	static long
				3793	nfa_regexec_multi(rmp, win, buf, lnum, col, tm)
				3794	regmmatch_T *rmp;
				3795	win_T win; / window in which to search or NULL */
				3796	buf_T buf; / buffer in which to search */
				3797	linenr_T lnum; /* nr of line to start looking for match */
				3798	colnr_T col; /* column to start looking for match */
				3799	proftime_T tm UNUSED; / timeout limit or NULL */
				3800	{
				3801	long r;
				3802	buf_T *save_curbuf = curbuf;
				3803
				3804	reg_match = NULL;
				3805	reg_mmatch = rmp;
				3806	reg_buf = buf;
				3807	reg_win = win;
				3808	reg_firstlnum = lnum;
				3809	reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
				3810	reg_line_lbr = FALSE;
				3811	ireg_ic = rmp->rmm_ic;
				3812	#ifdef FEAT_MBYTE
				3813	ireg_icombine = FALSE;
				3814	#endif
				3815	ireg_maxcol = rmp->rmm_maxcol;
				3816
				3817	/* Need to switch to buffer "buf" to make vim_iswordc() work. */
				3818	curbuf = buf;
				3819	r = nfa_regexec_both(NULL, col);
				3820	curbuf = save_curbuf;
				3821
				3822	return r;
				3823	}
				3824
				3825	#ifdef DEBUG
				3826	# undef ENABLE_LOG
				3827	#endif