blob: fdd549e47553616f851042d0f0c86bac837874e1 [file] [log] [blame]
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10/*
11 * spellsuggest.c: functions for spelling suggestions
12 */
13
14#include "vim.h"
15
16#if defined(FEAT_SPELL) || defined(PROTO)
17
18/*
19 * Use this to adjust the score after finding suggestions, based on the
20 * suggested word sounding like the bad word. This is much faster than doing
21 * it for every possible suggestion.
22 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@"
23 * vs "ht") and goes down in the list.
24 * Used when 'spellsuggest' is set to "best".
25 */
26#define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4)
27
28/*
29 * Do the opposite: based on a maximum end score and a known sound score,
30 * compute the maximum word score that can be used.
31 */
32#define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3)
33
34// only used for su_badflags
35#define WF_MIXCAP 0x20 // mix of upper and lower case: macaRONI
36
37/*
38 * Information used when looking for suggestions.
39 */
40typedef struct suginfo_S
41{
42 garray_T su_ga; // suggestions, contains "suggest_T"
43 int su_maxcount; // max. number of suggestions displayed
44 int su_maxscore; // maximum score for adding to su_ga
45 int su_sfmaxscore; // idem, for when doing soundfold words
46 garray_T su_sga; // like su_ga, sound-folded scoring
47 char_u *su_badptr; // start of bad word in line
48 int su_badlen; // length of detected bad word in line
49 int su_badflags; // caps flags for bad word
50 char_u su_badword[MAXWLEN]; // bad word truncated at su_badlen
51 char_u su_fbadword[MAXWLEN]; // su_badword case-folded
52 char_u su_sal_badword[MAXWLEN]; // su_badword soundfolded
53 hashtab_T su_banned; // table with banned words
54 slang_T *su_sallang; // default language for sound folding
55} suginfo_T;
56
57// One word suggestion. Used in "si_ga".
58typedef struct suggest_S
59{
60 char_u *st_word; // suggested word, allocated string
61 int st_wordlen; // STRLEN(st_word)
62 int st_orglen; // length of replaced text
63 int st_score; // lower is better
64 int st_altscore; // used when st_score compares equal
65 int st_salscore; // st_score is for soundalike
66 int st_had_bonus; // bonus already included in score
67 slang_T *st_slang; // language used for sound folding
68} suggest_T;
69
70#define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
71
72// TRUE if a word appears in the list of banned words.
73#define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word)))
74
75// Number of suggestions kept when cleaning up. We need to keep more than
76// what is displayed, because when rescore_suggestions() is called the score
77// may change and wrong suggestions may be removed later.
78#define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20)
79
80// Threshold for sorting and cleaning up suggestions. Don't want to keep lots
81// of suggestions that are not going to be displayed.
82#define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50)
83
84// score for various changes
85#define SCORE_SPLIT 149 // split bad word
86#define SCORE_SPLIT_NO 249 // split bad word with NOSPLITSUGS
87#define SCORE_ICASE 52 // slightly different case
88#define SCORE_REGION 200 // word is for different region
89#define SCORE_RARE 180 // rare word
90#define SCORE_SWAP 75 // swap two characters
91#define SCORE_SWAP3 110 // swap two characters in three
92#define SCORE_REP 65 // REP replacement
93#define SCORE_SUBST 93 // substitute a character
94#define SCORE_SIMILAR 33 // substitute a similar character
95#define SCORE_SUBCOMP 33 // substitute a composing character
96#define SCORE_DEL 94 // delete a character
97#define SCORE_DELDUP 66 // delete a duplicated character
98#define SCORE_DELCOMP 28 // delete a composing character
99#define SCORE_INS 96 // insert a character
100#define SCORE_INSDUP 67 // insert a duplicate character
101#define SCORE_INSCOMP 30 // insert a composing character
102#define SCORE_NONWORD 103 // change non-word to word char
103
104#define SCORE_FILE 30 // suggestion from a file
105#define SCORE_MAXINIT 350 // Initial maximum score: higher == slower.
106 // 350 allows for about three changes.
107
108#define SCORE_COMMON1 30 // subtracted for words seen before
109#define SCORE_COMMON2 40 // subtracted for words often seen
110#define SCORE_COMMON3 50 // subtracted for words very often seen
111#define SCORE_THRES2 10 // word count threshold for COMMON2
112#define SCORE_THRES3 100 // word count threshold for COMMON3
113
114// When trying changed soundfold words it becomes slow when trying more than
115// two changes. With less then two changes it's slightly faster but we miss a
116// few good suggestions. In rare cases we need to try three of four changes.
117#define SCORE_SFMAX1 200 // maximum score for first try
118#define SCORE_SFMAX2 300 // maximum score for second try
119#define SCORE_SFMAX3 400 // maximum score for third try
120
121#define SCORE_BIG SCORE_INS * 3 // big difference
122#define SCORE_MAXMAX 999999 // accept any score
123#define SCORE_LIMITMAX 350 // for spell_edit_score_limit()
124
125// for spell_edit_score_limit() we need to know the minimum value of
126// SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS
127#define SCORE_EDIT_MIN SCORE_SIMILAR
128
129/*
130 * For finding suggestions: At each node in the tree these states are tried:
131 */
132typedef enum
133{
134 STATE_START = 0, // At start of node check for NUL bytes (goodword
135 // ends); if badword ends there is a match, otherwise
136 // try splitting word.
137 STATE_NOPREFIX, // try without prefix
138 STATE_SPLITUNDO, // Undo splitting.
139 STATE_ENDNUL, // Past NUL bytes at start of the node.
140 STATE_PLAIN, // Use each byte of the node.
141 STATE_DEL, // Delete a byte from the bad word.
142 STATE_INS_PREP, // Prepare for inserting bytes.
143 STATE_INS, // Insert a byte in the bad word.
144 STATE_SWAP, // Swap two bytes.
145 STATE_UNSWAP, // Undo swap two characters.
146 STATE_SWAP3, // Swap two characters over three.
147 STATE_UNSWAP3, // Undo Swap two characters over three.
148 STATE_UNROT3L, // Undo rotate three characters left
149 STATE_UNROT3R, // Undo rotate three characters right
150 STATE_REP_INI, // Prepare for using REP items.
151 STATE_REP, // Use matching REP items from the .aff file.
152 STATE_REP_UNDO, // Undo a REP item replacement.
153 STATE_FINAL // End of this node.
154} state_T;
155
156/*
157 * Struct to keep the state at each level in suggest_try_change().
158 */
159typedef struct trystate_S
160{
161 state_T ts_state; // state at this level, STATE_
162 int ts_score; // score
163 idx_T ts_arridx; // index in tree array, start of node
164 short ts_curi; // index in list of child nodes
165 char_u ts_fidx; // index in fword[], case-folded bad word
166 char_u ts_fidxtry; // ts_fidx at which bytes may be changed
167 char_u ts_twordlen; // valid length of tword[]
168 char_u ts_prefixdepth; // stack depth for end of prefix or
169 // PFD_PREFIXTREE or PFD_NOPREFIX
170 char_u ts_flags; // TSF_ flags
171 char_u ts_tcharlen; // number of bytes in tword character
172 char_u ts_tcharidx; // current byte index in tword character
173 char_u ts_isdiff; // DIFF_ values
174 char_u ts_fcharstart; // index in fword where badword char started
175 char_u ts_prewordlen; // length of word in "preword[]"
176 char_u ts_splitoff; // index in "tword" after last split
177 char_u ts_splitfidx; // "ts_fidx" at word split
178 char_u ts_complen; // nr of compound words used
179 char_u ts_compsplit; // index for "compflags" where word was spit
180 char_u ts_save_badflags; // su_badflags saved here
181 char_u ts_delidx; // index in fword for char that was deleted,
182 // valid when "ts_flags" has TSF_DIDDEL
183} trystate_T;
184
185// values for ts_isdiff
186#define DIFF_NONE 0 // no different byte (yet)
187#define DIFF_YES 1 // different byte found
188#define DIFF_INSERT 2 // inserting character
189
190// values for ts_flags
191#define TSF_PREFIXOK 1 // already checked that prefix is OK
192#define TSF_DIDSPLIT 2 // tried split at this point
193#define TSF_DIDDEL 4 // did a delete, "ts_delidx" has index
194
195// special values ts_prefixdepth
196#define PFD_NOPREFIX 0xff // not using prefixes
197#define PFD_PREFIXTREE 0xfe // walking through the prefix tree
198#define PFD_NOTSPECIAL 0xfd // highest value that's not special
199
200static void spell_find_suggest(char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive);
201#ifdef FEAT_EVAL
202static void spell_suggest_expr(suginfo_T *su, char_u *expr);
203#endif
204static void spell_suggest_file(suginfo_T *su, char_u *fname);
205static void spell_suggest_intern(suginfo_T *su, int interactive);
206static void spell_find_cleanup(suginfo_T *su);
207static void suggest_try_special(suginfo_T *su);
208static void suggest_try_change(suginfo_T *su);
209static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, int soundfold);
210static void go_deeper(trystate_T *stack, int depth, int score_add);
211static void find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword);
212static void score_comp_sal(suginfo_T *su);
213static void score_combine(suginfo_T *su);
214static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound);
215static void suggest_try_soundalike_prep(void);
216static void suggest_try_soundalike(suginfo_T *su);
217static void suggest_try_soundalike_finish(void);
218static void add_sound_suggest(suginfo_T *su, char_u *goodword, int score, langp_T *lp);
219static int soundfold_find(slang_T *slang, char_u *word);
220static int similar_chars(slang_T *slang, int c1, int c2);
221static void add_suggestion(suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf);
222static void check_suggestions(suginfo_T *su, garray_T *gap);
223static void add_banned(suginfo_T *su, char_u *word);
224static void rescore_suggestions(suginfo_T *su);
225static void rescore_one(suginfo_T *su, suggest_T *stp);
226static int cleanup_suggestions(garray_T *gap, int maxscore, int keep);
227static int soundalike_score(char_u *goodsound, char_u *badsound);
228static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword);
229static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit);
230static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit);
231
232/*
233 * Return TRUE when the sequence of flags in "compflags" plus "flag" can
234 * possibly form a valid compounded word. This also checks the COMPOUNDRULE
235 * lines if they don't contain wildcards.
236 */
237 static int
238can_be_compound(
239 trystate_T *sp,
240 slang_T *slang,
241 char_u *compflags,
242 int flag)
243{
244 // If the flag doesn't appear in sl_compstartflags or sl_compallflags
245 // then it can't possibly compound.
246 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit
247 ? slang->sl_compstartflags : slang->sl_compallflags, flag))
248 return FALSE;
249
250 // If there are no wildcards, we can check if the flags collected so far
251 // possibly can form a match with COMPOUNDRULE patterns. This only
252 // makes sense when we have two or more words.
253 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit)
254 {
255 int v;
256
257 compflags[sp->ts_complen] = flag;
258 compflags[sp->ts_complen + 1] = NUL;
259 v = match_compoundrule(slang, compflags + sp->ts_compsplit);
260 compflags[sp->ts_complen] = NUL;
261 return v;
262 }
263
264 return TRUE;
265}
266
267/*
268 * Adjust the score of common words.
269 */
270 static int
271score_wordcount_adj(
272 slang_T *slang,
273 int score,
274 char_u *word,
275 int split) // word was split, less bonus
276{
277 hashitem_T *hi;
278 wordcount_T *wc;
279 int bonus;
280 int newscore;
281
282 hi = hash_find(&slang->sl_wordcount, word);
283 if (!HASHITEM_EMPTY(hi))
284 {
285 wc = HI2WC(hi);
286 if (wc->wc_count < SCORE_THRES2)
287 bonus = SCORE_COMMON1;
288 else if (wc->wc_count < SCORE_THRES3)
289 bonus = SCORE_COMMON2;
290 else
291 bonus = SCORE_COMMON3;
292 if (split)
293 newscore = score - bonus / 2;
294 else
295 newscore = score - bonus;
296 if (newscore < 0)
297 return 0;
298 return newscore;
299 }
300 return score;
301}
302
303/*
304 * Like captype() but for a KEEPCAP word add ONECAP if the word starts with a
305 * capital. So that make_case_word() can turn WOrd into Word.
306 * Add ALLCAP for "WOrD".
307 */
308 static int
309badword_captype(char_u *word, char_u *end)
310{
311 int flags = captype(word, end);
312 int c;
313 int l, u;
314 int first;
315 char_u *p;
316
317 if (flags & WF_KEEPCAP)
318 {
319 // Count the number of UPPER and lower case letters.
320 l = u = 0;
321 first = FALSE;
322 for (p = word; p < end; MB_PTR_ADV(p))
323 {
324 c = PTR2CHAR(p);
325 if (SPELL_ISUPPER(c))
326 {
327 ++u;
328 if (p == word)
329 first = TRUE;
330 }
331 else
332 ++l;
333 }
334
335 // If there are more UPPER than lower case letters suggest an
336 // ALLCAP word. Otherwise, if the first letter is UPPER then
337 // suggest ONECAP. Exception: "ALl" most likely should be "All",
338 // require three upper case letters.
339 if (u > l && u > 2)
340 flags |= WF_ALLCAP;
341 else if (first)
342 flags |= WF_ONECAP;
343
344 if (u >= 2 && l >= 2) // maCARONI maCAroni
345 flags |= WF_MIXCAP;
346 }
347 return flags;
348}
349
350/*
351 * Opposite of offset2bytes().
352 * "pp" points to the bytes and is advanced over it.
353 * Returns the offset.
354 */
355 static int
356bytes2offset(char_u **pp)
357{
358 char_u *p = *pp;
359 int nr;
360 int c;
361
362 c = *p++;
363 if ((c & 0x80) == 0x00) // 1 byte
364 {
365 nr = c - 1;
366 }
367 else if ((c & 0xc0) == 0x80) // 2 bytes
368 {
369 nr = (c & 0x3f) - 1;
370 nr = nr * 255 + (*p++ - 1);
371 }
372 else if ((c & 0xe0) == 0xc0) // 3 bytes
373 {
374 nr = (c & 0x1f) - 1;
375 nr = nr * 255 + (*p++ - 1);
376 nr = nr * 255 + (*p++ - 1);
377 }
378 else // 4 bytes
379 {
380 nr = (c & 0x0f) - 1;
381 nr = nr * 255 + (*p++ - 1);
382 nr = nr * 255 + (*p++ - 1);
383 nr = nr * 255 + (*p++ - 1);
384 }
385
386 *pp = p;
387 return nr;
388}
389
390// values for sps_flags
391#define SPS_BEST 1
392#define SPS_FAST 2
393#define SPS_DOUBLE 4
394
395static int sps_flags = SPS_BEST; // flags from 'spellsuggest'
396static int sps_limit = 9999; // max nr of suggestions given
397
398/*
399 * Check the 'spellsuggest' option. Return FAIL if it's wrong.
400 * Sets "sps_flags" and "sps_limit".
401 */
402 int
403spell_check_sps(void)
404{
405 char_u *p;
406 char_u *s;
407 char_u buf[MAXPATHL];
408 int f;
409
410 sps_flags = 0;
411 sps_limit = 9999;
412
413 for (p = p_sps; *p != NUL; )
414 {
415 copy_option_part(&p, buf, MAXPATHL, ",");
416
417 f = 0;
418 if (VIM_ISDIGIT(*buf))
419 {
420 s = buf;
421 sps_limit = getdigits(&s);
422 if (*s != NUL && !VIM_ISDIGIT(*s))
423 f = -1;
424 }
425 else if (STRCMP(buf, "best") == 0)
426 f = SPS_BEST;
427 else if (STRCMP(buf, "fast") == 0)
428 f = SPS_FAST;
429 else if (STRCMP(buf, "double") == 0)
430 f = SPS_DOUBLE;
431 else if (STRNCMP(buf, "expr:", 5) != 0
432 && STRNCMP(buf, "file:", 5) != 0)
433 f = -1;
434
435 if (f == -1 || (sps_flags != 0 && f != 0))
436 {
437 sps_flags = SPS_BEST;
438 sps_limit = 9999;
439 return FAIL;
440 }
441 if (f != 0)
442 sps_flags = f;
443 }
444
445 if (sps_flags == 0)
446 sps_flags = SPS_BEST;
447
448 return OK;
449}
450
451/*
452 * "z=": Find badly spelled word under or after the cursor.
453 * Give suggestions for the properly spelled word.
454 * In Visual mode use the highlighted word as the bad word.
455 * When "count" is non-zero use that suggestion.
456 */
457 void
458spell_suggest(int count)
459{
460 char_u *line;
461 pos_T prev_cursor = curwin->w_cursor;
462 char_u wcopy[MAXWLEN + 2];
463 char_u *p;
464 int i;
465 int c;
466 suginfo_T sug;
467 suggest_T *stp;
468 int mouse_used;
469 int need_cap;
470 int limit;
471 int selected = count;
472 int badlen = 0;
473 int msg_scroll_save = msg_scroll;
474
475 if (no_spell_checking(curwin))
476 return;
477
478 if (VIsual_active)
479 {
480 // Use the Visually selected text as the bad word. But reject
481 // a multi-line selection.
482 if (curwin->w_cursor.lnum != VIsual.lnum)
483 {
484 vim_beep(BO_SPELL);
485 return;
486 }
487 badlen = (int)curwin->w_cursor.col - (int)VIsual.col;
488 if (badlen < 0)
489 badlen = -badlen;
490 else
491 curwin->w_cursor.col = VIsual.col;
492 ++badlen;
493 end_visual_mode();
494 }
495 // Find the start of the badly spelled word.
496 else if (spell_move_to(curwin, FORWARD, TRUE, TRUE, NULL) == 0
497 || curwin->w_cursor.col > prev_cursor.col)
498 {
499 // No bad word or it starts after the cursor: use the word under the
500 // cursor.
501 curwin->w_cursor = prev_cursor;
502 line = ml_get_curline();
503 p = line + curwin->w_cursor.col;
504 // Backup to before start of word.
505 while (p > line && spell_iswordp_nmw(p, curwin))
506 MB_PTR_BACK(line, p);
507 // Forward to start of word.
508 while (*p != NUL && !spell_iswordp_nmw(p, curwin))
509 MB_PTR_ADV(p);
510
511 if (!spell_iswordp_nmw(p, curwin)) // No word found.
512 {
513 beep_flush();
514 return;
515 }
516 curwin->w_cursor.col = (colnr_T)(p - line);
517 }
518
519 // Get the word and its length.
520
521 // Figure out if the word should be capitalised.
522 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col);
523
524 // Make a copy of current line since autocommands may free the line.
525 line = vim_strsave(ml_get_curline());
526 if (line == NULL)
527 goto skip;
528
529 // Get the list of suggestions. Limit to 'lines' - 2 or the number in
530 // 'spellsuggest', whatever is smaller.
531 if (sps_limit > (int)Rows - 2)
532 limit = (int)Rows - 2;
533 else
534 limit = sps_limit;
535 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit,
536 TRUE, need_cap, TRUE);
537
538 if (sug.su_ga.ga_len == 0)
539 msg(_("Sorry, no suggestions"));
540 else if (count > 0)
541 {
542 if (count > sug.su_ga.ga_len)
Bram Moolenaar6c52f822019-12-25 14:13:03 +0100543 smsg(_("Sorry, only %ld suggestions"), (long)sug.su_ga.ga_len);
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200544 }
545 else
546 {
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200547#ifdef FEAT_RIGHTLEFT
548 // When 'rightleft' is set the list is drawn right-left.
549 cmdmsg_rl = curwin->w_p_rl;
550 if (cmdmsg_rl)
551 msg_col = Columns - 1;
552#endif
553
554 // List the suggestions.
555 msg_start();
556 msg_row = Rows - 1; // for when 'cmdheight' > 1
557 lines_left = Rows; // avoid more prompt
558 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
559 sug.su_badlen, sug.su_badptr);
560#ifdef FEAT_RIGHTLEFT
561 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0)
562 {
563 // And now the rabbit from the high hat: Avoid showing the
564 // untranslated message rightleft.
565 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC",
566 sug.su_badlen, sug.su_badptr);
567 }
568#endif
569 msg_puts((char *)IObuff);
570 msg_clr_eos();
571 msg_putchar('\n');
572
573 msg_scroll = TRUE;
574 for (i = 0; i < sug.su_ga.ga_len; ++i)
575 {
576 stp = &SUG(sug.su_ga, i);
577
578 // The suggested word may replace only part of the bad word, add
579 // the not replaced part.
580 vim_strncpy(wcopy, stp->st_word, MAXWLEN);
581 if (sug.su_badlen > stp->st_orglen)
582 vim_strncpy(wcopy + stp->st_wordlen,
583 sug.su_badptr + stp->st_orglen,
584 sug.su_badlen - stp->st_orglen);
585 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1);
586#ifdef FEAT_RIGHTLEFT
587 if (cmdmsg_rl)
588 rl_mirror(IObuff);
589#endif
590 msg_puts((char *)IObuff);
591
592 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy);
593 msg_puts((char *)IObuff);
594
595 // The word may replace more than "su_badlen".
596 if (sug.su_badlen < stp->st_orglen)
597 {
598 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""),
599 stp->st_orglen, sug.su_badptr);
600 msg_puts((char *)IObuff);
601 }
602
603 if (p_verbose > 0)
604 {
605 // Add the score.
606 if (sps_flags & (SPS_DOUBLE | SPS_BEST))
607 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)",
608 stp->st_salscore ? "s " : "",
609 stp->st_score, stp->st_altscore);
610 else
611 vim_snprintf((char *)IObuff, IOSIZE, " (%d)",
612 stp->st_score);
613#ifdef FEAT_RIGHTLEFT
614 if (cmdmsg_rl)
615 // Mirror the numbers, but keep the leading space.
616 rl_mirror(IObuff + 1);
617#endif
618 msg_advance(30);
619 msg_puts((char *)IObuff);
620 }
621 msg_putchar('\n');
622 }
623
624#ifdef FEAT_RIGHTLEFT
625 cmdmsg_rl = FALSE;
626 msg_col = 0;
627#endif
628 // Ask for choice.
629 selected = prompt_for_number(&mouse_used);
630 if (mouse_used)
631 selected -= lines_left;
632 lines_left = Rows; // avoid more prompt
633 // don't delay for 'smd' in normal_cmd()
634 msg_scroll = msg_scroll_save;
635 }
636
637 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK)
638 {
639 // Save the from and to text for :spellrepall.
Bram Moolenaar6c52f822019-12-25 14:13:03 +0100640 VIM_CLEAR(repl_from);
641 VIM_CLEAR(repl_to);
642
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200643 stp = &SUG(sug.su_ga, selected - 1);
644 if (sug.su_badlen > stp->st_orglen)
645 {
646 // Replacing less than "su_badlen", append the remainder to
647 // repl_to.
648 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen);
649 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word,
650 sug.su_badlen - stp->st_orglen,
651 sug.su_badptr + stp->st_orglen);
652 repl_to = vim_strsave(IObuff);
653 }
654 else
655 {
656 // Replacing su_badlen or more, use the whole word.
657 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen);
658 repl_to = vim_strsave(stp->st_word);
659 }
660
661 // Replace the word.
662 p = alloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1);
663 if (p != NULL)
664 {
665 c = (int)(sug.su_badptr - line);
666 mch_memmove(p, line, c);
667 STRCPY(p + c, stp->st_word);
668 STRCAT(p, sug.su_badptr + stp->st_orglen);
669 ml_replace(curwin->w_cursor.lnum, p, FALSE);
670 curwin->w_cursor.col = c;
671
672 // For redo we use a change-word command.
673 ResetRedobuff();
674 AppendToRedobuff((char_u *)"ciw");
675 AppendToRedobuffLit(p + c,
676 stp->st_wordlen + sug.su_badlen - stp->st_orglen);
677 AppendCharToRedobuff(ESC);
678
679 // After this "p" may be invalid.
680 changed_bytes(curwin->w_cursor.lnum, c);
681 }
682 }
683 else
684 curwin->w_cursor = prev_cursor;
685
686 spell_find_cleanup(&sug);
687skip:
688 vim_free(line);
689}
690
691/*
692 * Find spell suggestions for "word". Return them in the growarray "*gap" as
693 * a list of allocated strings.
694 */
695 void
696spell_suggest_list(
697 garray_T *gap,
698 char_u *word,
699 int maxcount, // maximum nr of suggestions
700 int need_cap, // 'spellcapcheck' matched
701 int interactive)
702{
703 suginfo_T sug;
704 int i;
705 suggest_T *stp;
706 char_u *wcopy;
707
708 spell_find_suggest(word, 0, &sug, maxcount, FALSE, need_cap, interactive);
709
710 // Make room in "gap".
711 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1);
712 if (ga_grow(gap, sug.su_ga.ga_len) == OK)
713 {
714 for (i = 0; i < sug.su_ga.ga_len; ++i)
715 {
716 stp = &SUG(sug.su_ga, i);
717
718 // The suggested word may replace only part of "word", add the not
719 // replaced part.
720 wcopy = alloc(stp->st_wordlen
721 + (unsigned)STRLEN(sug.su_badptr + stp->st_orglen) + 1);
722 if (wcopy == NULL)
723 break;
724 STRCPY(wcopy, stp->st_word);
725 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen);
726 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy;
727 }
728 }
729
730 spell_find_cleanup(&sug);
731}
732
733/*
734 * Find spell suggestions for the word at the start of "badptr".
735 * Return the suggestions in "su->su_ga".
736 * The maximum number of suggestions is "maxcount".
737 * Note: does use info for the current window.
738 * This is based on the mechanisms of Aspell, but completely reimplemented.
739 */
740 static void
741spell_find_suggest(
742 char_u *badptr,
743 int badlen, // length of bad word or 0 if unknown
744 suginfo_T *su,
745 int maxcount,
746 int banbadword, // don't include badword in suggestions
747 int need_cap, // word should start with capital
748 int interactive)
749{
750 hlf_T attr = HLF_COUNT;
751 char_u buf[MAXPATHL];
752 char_u *p;
753 int do_combine = FALSE;
754 char_u *sps_copy;
755#ifdef FEAT_EVAL
756 static int expr_busy = FALSE;
757#endif
758 int c;
759 int i;
760 langp_T *lp;
761
762 // Set the info in "*su".
763 vim_memset(su, 0, sizeof(suginfo_T));
764 ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10);
765 ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10);
766 if (*badptr == NUL)
767 return;
768 hash_init(&su->su_banned);
769
770 su->su_badptr = badptr;
771 if (badlen != 0)
772 su->su_badlen = badlen;
773 else
774 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE);
775 su->su_maxcount = maxcount;
776 su->su_maxscore = SCORE_MAXINIT;
777
778 if (su->su_badlen >= MAXWLEN)
779 su->su_badlen = MAXWLEN - 1; // just in case
780 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen);
781 (void)spell_casefold(su->su_badptr, su->su_badlen,
782 su->su_fbadword, MAXWLEN);
783 // TODO: make this work if the case-folded text is longer than the original
784 // text. Currently an illegal byte causes wrong pointer computations.
785 su->su_fbadword[su->su_badlen] = NUL;
786
787 // get caps flags for bad word
788 su->su_badflags = badword_captype(su->su_badptr,
789 su->su_badptr + su->su_badlen);
790 if (need_cap)
791 su->su_badflags |= WF_ONECAP;
792
793 // Find the default language for sound folding. We simply use the first
794 // one in 'spelllang' that supports sound folding. That's good for when
795 // using multiple files for one language, it's not that bad when mixing
796 // languages (e.g., "pl,en").
797 for (i = 0; i < curbuf->b_s.b_langp.ga_len; ++i)
798 {
799 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i);
800 if (lp->lp_sallang != NULL)
801 {
802 su->su_sallang = lp->lp_sallang;
803 break;
804 }
805 }
806
807 // Soundfold the bad word with the default sound folding, so that we don't
808 // have to do this many times.
809 if (su->su_sallang != NULL)
810 spell_soundfold(su->su_sallang, su->su_fbadword, TRUE,
811 su->su_sal_badword);
812
813 // If the word is not capitalised and spell_check() doesn't consider the
814 // word to be bad then it might need to be capitalised. Add a suggestion
815 // for that.
816 c = PTR2CHAR(su->su_badptr);
817 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT)
818 {
819 make_case_word(su->su_badword, buf, WF_ONECAP);
820 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE,
821 0, TRUE, su->su_sallang, FALSE);
822 }
823
824 // Ban the bad word itself. It may appear in another region.
825 if (banbadword)
826 add_banned(su, su->su_badword);
827
828 // Make a copy of 'spellsuggest', because the expression may change it.
829 sps_copy = vim_strsave(p_sps);
830 if (sps_copy == NULL)
831 return;
832
833 // Loop over the items in 'spellsuggest'.
834 for (p = sps_copy; *p != NUL; )
835 {
836 copy_option_part(&p, buf, MAXPATHL, ",");
837
838 if (STRNCMP(buf, "expr:", 5) == 0)
839 {
840#ifdef FEAT_EVAL
841 // Evaluate an expression. Skip this when called recursively,
842 // when using spellsuggest() in the expression.
843 if (!expr_busy)
844 {
845 expr_busy = TRUE;
846 spell_suggest_expr(su, buf + 5);
847 expr_busy = FALSE;
848 }
849#endif
850 }
851 else if (STRNCMP(buf, "file:", 5) == 0)
852 // Use list of suggestions in a file.
853 spell_suggest_file(su, buf + 5);
854 else
855 {
856 // Use internal method.
857 spell_suggest_intern(su, interactive);
858 if (sps_flags & SPS_DOUBLE)
859 do_combine = TRUE;
860 }
861 }
862
863 vim_free(sps_copy);
864
865 if (do_combine)
866 // Combine the two list of suggestions. This must be done last,
867 // because sorting changes the order again.
868 score_combine(su);
869}
870
871#ifdef FEAT_EVAL
872/*
873 * Find suggestions by evaluating expression "expr".
874 */
875 static void
876spell_suggest_expr(suginfo_T *su, char_u *expr)
877{
878 list_T *list;
879 listitem_T *li;
880 int score;
881 char_u *p;
882
883 // The work is split up in a few parts to avoid having to export
884 // suginfo_T.
885 // First evaluate the expression and get the resulting list.
886 list = eval_spell_expr(su->su_badword, expr);
887 if (list != NULL)
888 {
889 // Loop over the items in the list.
890 for (li = list->lv_first; li != NULL; li = li->li_next)
891 if (li->li_tv.v_type == VAR_LIST)
892 {
893 // Get the word and the score from the items.
894 score = get_spellword(li->li_tv.vval.v_list, &p);
895 if (score >= 0 && score <= su->su_maxscore)
896 add_suggestion(su, &su->su_ga, p, su->su_badlen,
897 score, 0, TRUE, su->su_sallang, FALSE);
898 }
899 list_unref(list);
900 }
901
902 // Remove bogus suggestions, sort and truncate at "maxcount".
903 check_suggestions(su, &su->su_ga);
904 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
905}
906#endif
907
908/*
909 * Find suggestions in file "fname". Used for "file:" in 'spellsuggest'.
910 */
911 static void
912spell_suggest_file(suginfo_T *su, char_u *fname)
913{
914 FILE *fd;
915 char_u line[MAXWLEN * 2];
916 char_u *p;
917 int len;
918 char_u cword[MAXWLEN];
919
920 // Open the file.
921 fd = mch_fopen((char *)fname, "r");
922 if (fd == NULL)
923 {
924 semsg(_(e_notopen), fname);
925 return;
926 }
927
928 // Read it line by line.
929 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int)
930 {
931 line_breakcheck();
932
933 p = vim_strchr(line, '/');
934 if (p == NULL)
935 continue; // No Tab found, just skip the line.
936 *p++ = NUL;
937 if (STRICMP(su->su_badword, line) == 0)
938 {
939 // Match! Isolate the good word, until CR or NL.
940 for (len = 0; p[len] >= ' '; ++len)
941 ;
942 p[len] = NUL;
943
944 // If the suggestion doesn't have specific case duplicate the case
945 // of the bad word.
946 if (captype(p, NULL) == 0)
947 {
948 make_case_word(p, cword, su->su_badflags);
949 p = cword;
950 }
951
952 add_suggestion(su, &su->su_ga, p, su->su_badlen,
953 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE);
954 }
955 }
956
957 fclose(fd);
958
959 // Remove bogus suggestions, sort and truncate at "maxcount".
960 check_suggestions(su, &su->su_ga);
961 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
962}
963
964/*
965 * Find suggestions for the internal method indicated by "sps_flags".
966 */
967 static void
968spell_suggest_intern(suginfo_T *su, int interactive)
969{
970 // Load the .sug file(s) that are available and not done yet.
971 suggest_load_files();
972
973 // 1. Try special cases, such as repeating a word: "the the" -> "the".
974 //
975 // Set a maximum score to limit the combination of operations that is
976 // tried.
977 suggest_try_special(su);
978
979 // 2. Try inserting/deleting/swapping/changing a letter, use REP entries
980 // from the .aff file and inserting a space (split the word).
981 suggest_try_change(su);
982
983 // For the resulting top-scorers compute the sound-a-like score.
984 if (sps_flags & SPS_DOUBLE)
985 score_comp_sal(su);
986
987 // 3. Try finding sound-a-like words.
988 if ((sps_flags & SPS_FAST) == 0)
989 {
990 if (sps_flags & SPS_BEST)
991 // Adjust the word score for the suggestions found so far for how
Bram Moolenaar32aa1022019-11-02 22:54:41 +0100992 // they sound like.
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200993 rescore_suggestions(su);
994
995 // While going through the soundfold tree "su_maxscore" is the score
996 // for the soundfold word, limits the changes that are being tried,
997 // and "su_sfmaxscore" the rescored score, which is set by
998 // cleanup_suggestions().
999 // First find words with a small edit distance, because this is much
1000 // faster and often already finds the top-N suggestions. If we didn't
1001 // find many suggestions try again with a higher edit distance.
1002 // "sl_sounddone" is used to avoid doing the same word twice.
1003 suggest_try_soundalike_prep();
1004 su->su_maxscore = SCORE_SFMAX1;
1005 su->su_sfmaxscore = SCORE_MAXINIT * 3;
1006 suggest_try_soundalike(su);
1007 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
1008 {
1009 // We didn't find enough matches, try again, allowing more
1010 // changes to the soundfold word.
1011 su->su_maxscore = SCORE_SFMAX2;
1012 suggest_try_soundalike(su);
1013 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
1014 {
1015 // Still didn't find enough matches, try again, allowing even
1016 // more changes to the soundfold word.
1017 su->su_maxscore = SCORE_SFMAX3;
1018 suggest_try_soundalike(su);
1019 }
1020 }
1021 su->su_maxscore = su->su_sfmaxscore;
1022 suggest_try_soundalike_finish();
1023 }
1024
1025 // When CTRL-C was hit while searching do show the results. Only clear
1026 // got_int when using a command, not for spellsuggest().
1027 ui_breakcheck();
1028 if (interactive && got_int)
1029 {
1030 (void)vgetc();
1031 got_int = FALSE;
1032 }
1033
1034 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0)
1035 {
1036 if (sps_flags & SPS_BEST)
1037 // Adjust the word score for how it sounds like.
1038 rescore_suggestions(su);
1039
1040 // Remove bogus suggestions, sort and truncate at "maxcount".
1041 check_suggestions(su, &su->su_ga);
1042 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
1043 }
1044}
1045
1046/*
1047 * Free the info put in "*su" by spell_find_suggest().
1048 */
1049 static void
1050spell_find_cleanup(suginfo_T *su)
1051{
1052 int i;
1053
1054 // Free the suggestions.
1055 for (i = 0; i < su->su_ga.ga_len; ++i)
1056 vim_free(SUG(su->su_ga, i).st_word);
1057 ga_clear(&su->su_ga);
1058 for (i = 0; i < su->su_sga.ga_len; ++i)
1059 vim_free(SUG(su->su_sga, i).st_word);
1060 ga_clear(&su->su_sga);
1061
1062 // Free the banned words.
1063 hash_clear_all(&su->su_banned, 0);
1064}
1065
1066/*
1067 * Try finding suggestions by recognizing specific situations.
1068 */
1069 static void
1070suggest_try_special(suginfo_T *su)
1071{
1072 char_u *p;
1073 size_t len;
1074 int c;
1075 char_u word[MAXWLEN];
1076
1077 // Recognize a word that is repeated: "the the".
1078 p = skiptowhite(su->su_fbadword);
1079 len = p - su->su_fbadword;
1080 p = skipwhite(p);
1081 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0)
1082 {
1083 // Include badflags: if the badword is onecap or allcap
1084 // use that for the goodword too: "The the" -> "The".
1085 c = su->su_fbadword[len];
1086 su->su_fbadword[len] = NUL;
1087 make_case_word(su->su_fbadword, word, su->su_badflags);
1088 su->su_fbadword[len] = c;
1089
1090 // Give a soundalike score of 0, compute the score as if deleting one
1091 // character.
1092 add_suggestion(su, &su->su_ga, word, su->su_badlen,
1093 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE);
1094 }
1095}
1096
1097/*
1098 * Change the 0 to 1 to measure how much time is spent in each state.
1099 * Output is dumped in "suggestprof".
1100 */
1101#if 0
1102# define SUGGEST_PROFILE
1103proftime_T current;
1104proftime_T total;
1105proftime_T times[STATE_FINAL + 1];
1106long counts[STATE_FINAL + 1];
1107
1108 static void
1109prof_init(void)
1110{
1111 for (int i = 0; i <= STATE_FINAL; ++i)
1112 {
1113 profile_zero(&times[i]);
1114 counts[i] = 0;
1115 }
1116 profile_start(&current);
1117 profile_start(&total);
1118}
1119
1120// call before changing state
1121 static void
1122prof_store(state_T state)
1123{
1124 profile_end(&current);
1125 profile_add(&times[state], &current);
1126 ++counts[state];
1127 profile_start(&current);
1128}
1129# define PROF_STORE(state) prof_store(state);
1130
1131 static void
1132prof_report(char *name)
1133{
1134 FILE *fd = fopen("suggestprof", "a");
1135
1136 profile_end(&total);
1137 fprintf(fd, "-----------------------\n");
1138 fprintf(fd, "%s: %s\n", name, profile_msg(&total));
1139 for (int i = 0; i <= STATE_FINAL; ++i)
1140 fprintf(fd, "%d: %s (%ld)\n", i, profile_msg(&times[i]), counts[i]);
1141 fclose(fd);
1142}
1143#else
1144# define PROF_STORE(state)
1145#endif
1146
1147/*
1148 * Try finding suggestions by adding/removing/swapping letters.
1149 */
1150 static void
1151suggest_try_change(suginfo_T *su)
1152{
1153 char_u fword[MAXWLEN]; // copy of the bad word, case-folded
1154 int n;
1155 char_u *p;
1156 int lpi;
1157 langp_T *lp;
1158
1159 // We make a copy of the case-folded bad word, so that we can modify it
1160 // to find matches (esp. REP items). Append some more text, changing
1161 // chars after the bad word may help.
1162 STRCPY(fword, su->su_fbadword);
1163 n = (int)STRLEN(fword);
1164 p = su->su_badptr + su->su_badlen;
1165 (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n);
1166
1167 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
1168 {
1169 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
1170
1171 // If reloading a spell file fails it's still in the list but
1172 // everything has been cleared.
1173 if (lp->lp_slang->sl_fbyts == NULL)
1174 continue;
1175
1176 // Try it for this language. Will add possible suggestions.
1177#ifdef SUGGEST_PROFILE
1178 prof_init();
1179#endif
1180 suggest_trie_walk(su, lp, fword, FALSE);
1181#ifdef SUGGEST_PROFILE
1182 prof_report("try_change");
1183#endif
1184 }
1185}
1186
1187// Check the maximum score, if we go over it we won't try this change.
1188#define TRY_DEEPER(su, stack, depth, add) \
1189 (stack[depth].ts_score + (add) < su->su_maxscore)
1190
1191/*
1192 * Try finding suggestions by adding/removing/swapping letters.
1193 *
1194 * This uses a state machine. At each node in the tree we try various
1195 * operations. When trying if an operation works "depth" is increased and the
1196 * stack[] is used to store info. This allows combinations, thus insert one
1197 * character, replace one and delete another. The number of changes is
1198 * limited by su->su_maxscore.
1199 *
1200 * After implementing this I noticed an article by Kemal Oflazer that
1201 * describes something similar: "Error-tolerant Finite State Recognition with
1202 * Applications to Morphological Analysis and Spelling Correction" (1996).
1203 * The implementation in the article is simplified and requires a stack of
1204 * unknown depth. The implementation here only needs a stack depth equal to
1205 * the length of the word.
1206 *
1207 * This is also used for the sound-folded word, "soundfold" is TRUE then.
1208 * The mechanism is the same, but we find a match with a sound-folded word
1209 * that comes from one or more original words. Each of these words may be
1210 * added, this is done by add_sound_suggest().
1211 * Don't use:
1212 * the prefix tree or the keep-case tree
1213 * "su->su_badlen"
1214 * anything to do with upper and lower case
1215 * anything to do with word or non-word characters ("spell_iswordp()")
1216 * banned words
1217 * word flags (rare, region, compounding)
1218 * word splitting for now
1219 * "similar_chars()"
1220 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep"
1221 */
1222 static void
1223suggest_trie_walk(
1224 suginfo_T *su,
1225 langp_T *lp,
1226 char_u *fword,
1227 int soundfold)
1228{
1229 char_u tword[MAXWLEN]; // good word collected so far
1230 trystate_T stack[MAXWLEN];
1231 char_u preword[MAXWLEN * 3]; // word found with proper case;
1232 // concatenation of prefix compound
1233 // words and split word. NUL terminated
1234 // when going deeper but not when coming
1235 // back.
1236 char_u compflags[MAXWLEN]; // compound flags, one for each word
1237 trystate_T *sp;
1238 int newscore;
1239 int score;
1240 char_u *byts, *fbyts, *pbyts;
1241 idx_T *idxs, *fidxs, *pidxs;
1242 int depth;
1243 int c, c2, c3;
1244 int n = 0;
1245 int flags;
1246 garray_T *gap;
1247 idx_T arridx;
1248 int len;
1249 char_u *p;
1250 fromto_T *ftp;
1251 int fl = 0, tl;
1252 int repextra = 0; // extra bytes in fword[] from REP item
1253 slang_T *slang = lp->lp_slang;
1254 int fword_ends;
1255 int goodword_ends;
1256#ifdef DEBUG_TRIEWALK
1257 // Stores the name of the change made at each level.
1258 char_u changename[MAXWLEN][80];
1259#endif
1260 int breakcheckcount = 1000;
1261 int compound_ok;
1262
1263 // Go through the whole case-fold tree, try changes at each node.
1264 // "tword[]" contains the word collected from nodes in the tree.
1265 // "fword[]" the word we are trying to match with (initially the bad
1266 // word).
1267 depth = 0;
1268 sp = &stack[0];
1269 vim_memset(sp, 0, sizeof(trystate_T));
1270 sp->ts_curi = 1;
1271
1272 if (soundfold)
1273 {
1274 // Going through the soundfold tree.
1275 byts = fbyts = slang->sl_sbyts;
1276 idxs = fidxs = slang->sl_sidxs;
1277 pbyts = NULL;
1278 pidxs = NULL;
1279 sp->ts_prefixdepth = PFD_NOPREFIX;
1280 sp->ts_state = STATE_START;
1281 }
1282 else
1283 {
1284 // When there are postponed prefixes we need to use these first. At
1285 // the end of the prefix we continue in the case-fold tree.
1286 fbyts = slang->sl_fbyts;
1287 fidxs = slang->sl_fidxs;
1288 pbyts = slang->sl_pbyts;
1289 pidxs = slang->sl_pidxs;
1290 if (pbyts != NULL)
1291 {
1292 byts = pbyts;
1293 idxs = pidxs;
1294 sp->ts_prefixdepth = PFD_PREFIXTREE;
1295 sp->ts_state = STATE_NOPREFIX; // try without prefix first
1296 }
1297 else
1298 {
1299 byts = fbyts;
1300 idxs = fidxs;
1301 sp->ts_prefixdepth = PFD_NOPREFIX;
1302 sp->ts_state = STATE_START;
1303 }
1304 }
1305
1306 // Loop to find all suggestions. At each round we either:
1307 // - For the current state try one operation, advance "ts_curi",
1308 // increase "depth".
1309 // - When a state is done go to the next, set "ts_state".
1310 // - When all states are tried decrease "depth".
1311 while (depth >= 0 && !got_int)
1312 {
1313 sp = &stack[depth];
1314 switch (sp->ts_state)
1315 {
1316 case STATE_START:
1317 case STATE_NOPREFIX:
1318 // Start of node: Deal with NUL bytes, which means
1319 // tword[] may end here.
1320 arridx = sp->ts_arridx; // current node in the tree
1321 len = byts[arridx]; // bytes in this node
1322 arridx += sp->ts_curi; // index of current byte
1323
1324 if (sp->ts_prefixdepth == PFD_PREFIXTREE)
1325 {
1326 // Skip over the NUL bytes, we use them later.
1327 for (n = 0; n < len && byts[arridx + n] == 0; ++n)
1328 ;
1329 sp->ts_curi += n;
1330
1331 // Always past NUL bytes now.
1332 n = (int)sp->ts_state;
1333 PROF_STORE(sp->ts_state)
1334 sp->ts_state = STATE_ENDNUL;
1335 sp->ts_save_badflags = su->su_badflags;
1336
1337 // At end of a prefix or at start of prefixtree: check for
1338 // following word.
1339 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX)
1340 {
1341 // Set su->su_badflags to the caps type at this position.
1342 // Use the caps type until here for the prefix itself.
1343 if (has_mbyte)
1344 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
1345 else
1346 n = sp->ts_fidx;
1347 flags = badword_captype(su->su_badptr, su->su_badptr + n);
1348 su->su_badflags = badword_captype(su->su_badptr + n,
1349 su->su_badptr + su->su_badlen);
1350#ifdef DEBUG_TRIEWALK
1351 sprintf(changename[depth], "prefix");
1352#endif
1353 go_deeper(stack, depth, 0);
1354 ++depth;
1355 sp = &stack[depth];
1356 sp->ts_prefixdepth = depth - 1;
1357 byts = fbyts;
1358 idxs = fidxs;
1359 sp->ts_arridx = 0;
1360
1361 // Move the prefix to preword[] with the right case
1362 // and make find_keepcap_word() works.
1363 tword[sp->ts_twordlen] = NUL;
1364 make_case_word(tword + sp->ts_splitoff,
1365 preword + sp->ts_prewordlen, flags);
1366 sp->ts_prewordlen = (char_u)STRLEN(preword);
1367 sp->ts_splitoff = sp->ts_twordlen;
1368 }
1369 break;
1370 }
1371
1372 if (sp->ts_curi > len || byts[arridx] != 0)
1373 {
1374 // Past bytes in node and/or past NUL bytes.
1375 PROF_STORE(sp->ts_state)
1376 sp->ts_state = STATE_ENDNUL;
1377 sp->ts_save_badflags = su->su_badflags;
1378 break;
1379 }
1380
1381 // End of word in tree.
1382 ++sp->ts_curi; // eat one NUL byte
1383
1384 flags = (int)idxs[arridx];
1385
1386 // Skip words with the NOSUGGEST flag.
1387 if (flags & WF_NOSUGGEST)
1388 break;
1389
1390 fword_ends = (fword[sp->ts_fidx] == NUL
1391 || (soundfold
1392 ? VIM_ISWHITE(fword[sp->ts_fidx])
1393 : !spell_iswordp(fword + sp->ts_fidx, curwin)));
1394 tword[sp->ts_twordlen] = NUL;
1395
1396 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL
1397 && (sp->ts_flags & TSF_PREFIXOK) == 0)
1398 {
1399 // There was a prefix before the word. Check that the prefix
1400 // can be used with this word.
1401 // Count the length of the NULs in the prefix. If there are
1402 // none this must be the first try without a prefix.
1403 n = stack[sp->ts_prefixdepth].ts_arridx;
1404 len = pbyts[n++];
1405 for (c = 0; c < len && pbyts[n + c] == 0; ++c)
1406 ;
1407 if (c > 0)
1408 {
1409 c = valid_word_prefix(c, n, flags,
1410 tword + sp->ts_splitoff, slang, FALSE);
1411 if (c == 0)
1412 break;
1413
1414 // Use the WF_RARE flag for a rare prefix.
1415 if (c & WF_RAREPFX)
1416 flags |= WF_RARE;
1417
1418 // Tricky: when checking for both prefix and compounding
1419 // we run into the prefix flag first.
1420 // Remember that it's OK, so that we accept the prefix
1421 // when arriving at a compound flag.
1422 sp->ts_flags |= TSF_PREFIXOK;
1423 }
1424 }
1425
1426 // Check NEEDCOMPOUND: can't use word without compounding. Do try
1427 // appending another compound word below.
1428 if (sp->ts_complen == sp->ts_compsplit && fword_ends
1429 && (flags & WF_NEEDCOMP))
1430 goodword_ends = FALSE;
1431 else
1432 goodword_ends = TRUE;
1433
1434 p = NULL;
1435 compound_ok = TRUE;
1436 if (sp->ts_complen > sp->ts_compsplit)
1437 {
1438 if (slang->sl_nobreak)
1439 {
1440 // There was a word before this word. When there was no
1441 // change in this word (it was correct) add the first word
1442 // as a suggestion. If this word was corrected too, we
1443 // need to check if a correct word follows.
1444 if (sp->ts_fidx - sp->ts_splitfidx
1445 == sp->ts_twordlen - sp->ts_splitoff
1446 && STRNCMP(fword + sp->ts_splitfidx,
1447 tword + sp->ts_splitoff,
1448 sp->ts_fidx - sp->ts_splitfidx) == 0)
1449 {
1450 preword[sp->ts_prewordlen] = NUL;
1451 newscore = score_wordcount_adj(slang, sp->ts_score,
1452 preword + sp->ts_prewordlen,
1453 sp->ts_prewordlen > 0);
1454 // Add the suggestion if the score isn't too bad.
1455 if (newscore <= su->su_maxscore)
1456 add_suggestion(su, &su->su_ga, preword,
1457 sp->ts_splitfidx - repextra,
1458 newscore, 0, FALSE,
1459 lp->lp_sallang, FALSE);
1460 break;
1461 }
1462 }
1463 else
1464 {
1465 // There was a compound word before this word. If this
1466 // word does not support compounding then give up
1467 // (splitting is tried for the word without compound
1468 // flag).
1469 if (((unsigned)flags >> 24) == 0
1470 || sp->ts_twordlen - sp->ts_splitoff
1471 < slang->sl_compminlen)
1472 break;
1473 // For multi-byte chars check character length against
1474 // COMPOUNDMIN.
1475 if (has_mbyte
1476 && slang->sl_compminlen > 0
1477 && mb_charlen(tword + sp->ts_splitoff)
1478 < slang->sl_compminlen)
1479 break;
1480
1481 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
1482 compflags[sp->ts_complen + 1] = NUL;
1483 vim_strncpy(preword + sp->ts_prewordlen,
1484 tword + sp->ts_splitoff,
1485 sp->ts_twordlen - sp->ts_splitoff);
1486
1487 // Verify CHECKCOMPOUNDPATTERN rules.
1488 if (match_checkcompoundpattern(preword, sp->ts_prewordlen,
1489 &slang->sl_comppat))
1490 compound_ok = FALSE;
1491
1492 if (compound_ok)
1493 {
1494 p = preword;
1495 while (*skiptowhite(p) != NUL)
1496 p = skipwhite(skiptowhite(p));
1497 if (fword_ends && !can_compound(slang, p,
1498 compflags + sp->ts_compsplit))
1499 // Compound is not allowed. But it may still be
1500 // possible if we add another (short) word.
1501 compound_ok = FALSE;
1502 }
1503
1504 // Get pointer to last char of previous word.
1505 p = preword + sp->ts_prewordlen;
1506 MB_PTR_BACK(preword, p);
1507 }
1508 }
1509
1510 // Form the word with proper case in preword.
1511 // If there is a word from a previous split, append.
1512 // For the soundfold tree don't change the case, simply append.
1513 if (soundfold)
1514 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff);
1515 else if (flags & WF_KEEPCAP)
1516 // Must find the word in the keep-case tree.
1517 find_keepcap_word(slang, tword + sp->ts_splitoff,
1518 preword + sp->ts_prewordlen);
1519 else
1520 {
1521 // Include badflags: If the badword is onecap or allcap
1522 // use that for the goodword too. But if the badword is
1523 // allcap and it's only one char long use onecap.
1524 c = su->su_badflags;
1525 if ((c & WF_ALLCAP)
1526 && su->su_badlen == (*mb_ptr2len)(su->su_badptr))
1527 c = WF_ONECAP;
1528 c |= flags;
1529
1530 // When appending a compound word after a word character don't
1531 // use Onecap.
1532 if (p != NULL && spell_iswordp_nmw(p, curwin))
1533 c &= ~WF_ONECAP;
1534 make_case_word(tword + sp->ts_splitoff,
1535 preword + sp->ts_prewordlen, c);
1536 }
1537
1538 if (!soundfold)
1539 {
1540 // Don't use a banned word. It may appear again as a good
1541 // word, thus remember it.
1542 if (flags & WF_BANNED)
1543 {
1544 add_banned(su, preword + sp->ts_prewordlen);
1545 break;
1546 }
1547 if ((sp->ts_complen == sp->ts_compsplit
1548 && WAS_BANNED(su, preword + sp->ts_prewordlen))
1549 || WAS_BANNED(su, preword))
1550 {
1551 if (slang->sl_compprog == NULL)
1552 break;
1553 // the word so far was banned but we may try compounding
1554 goodword_ends = FALSE;
1555 }
1556 }
1557
1558 newscore = 0;
1559 if (!soundfold) // soundfold words don't have flags
1560 {
1561 if ((flags & WF_REGION)
1562 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
1563 newscore += SCORE_REGION;
1564 if (flags & WF_RARE)
1565 newscore += SCORE_RARE;
1566
1567 if (!spell_valid_case(su->su_badflags,
1568 captype(preword + sp->ts_prewordlen, NULL)))
1569 newscore += SCORE_ICASE;
1570 }
1571
1572 // TODO: how about splitting in the soundfold tree?
1573 if (fword_ends
1574 && goodword_ends
1575 && sp->ts_fidx >= sp->ts_fidxtry
1576 && compound_ok)
1577 {
1578 // The badword also ends: add suggestions.
1579#ifdef DEBUG_TRIEWALK
1580 if (soundfold && STRCMP(preword, "smwrd") == 0)
1581 {
1582 int j;
1583
1584 // print the stack of changes that brought us here
1585 smsg("------ %s -------", fword);
1586 for (j = 0; j < depth; ++j)
1587 smsg("%s", changename[j]);
1588 }
1589#endif
1590 if (soundfold)
1591 {
1592 // For soundfolded words we need to find the original
1593 // words, the edit distance and then add them.
1594 add_sound_suggest(su, preword, sp->ts_score, lp);
1595 }
1596 else if (sp->ts_fidx > 0)
1597 {
1598 // Give a penalty when changing non-word char to word
1599 // char, e.g., "thes," -> "these".
1600 p = fword + sp->ts_fidx;
1601 MB_PTR_BACK(fword, p);
1602 if (!spell_iswordp(p, curwin))
1603 {
1604 p = preword + STRLEN(preword);
1605 MB_PTR_BACK(preword, p);
1606 if (spell_iswordp(p, curwin))
1607 newscore += SCORE_NONWORD;
1608 }
1609
1610 // Give a bonus to words seen before.
1611 score = score_wordcount_adj(slang,
1612 sp->ts_score + newscore,
1613 preword + sp->ts_prewordlen,
1614 sp->ts_prewordlen > 0);
1615
1616 // Add the suggestion if the score isn't too bad.
1617 if (score <= su->su_maxscore)
1618 {
1619 add_suggestion(su, &su->su_ga, preword,
1620 sp->ts_fidx - repextra,
1621 score, 0, FALSE, lp->lp_sallang, FALSE);
1622
1623 if (su->su_badflags & WF_MIXCAP)
1624 {
1625 // We really don't know if the word should be
1626 // upper or lower case, add both.
1627 c = captype(preword, NULL);
1628 if (c == 0 || c == WF_ALLCAP)
1629 {
1630 make_case_word(tword + sp->ts_splitoff,
1631 preword + sp->ts_prewordlen,
1632 c == 0 ? WF_ALLCAP : 0);
1633
1634 add_suggestion(su, &su->su_ga, preword,
1635 sp->ts_fidx - repextra,
1636 score + SCORE_ICASE, 0, FALSE,
1637 lp->lp_sallang, FALSE);
1638 }
1639 }
1640 }
1641 }
1642 }
1643
1644 // Try word split and/or compounding.
1645 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends)
1646 // Don't split halfway a character.
1647 && (!has_mbyte || sp->ts_tcharlen == 0))
1648 {
1649 int try_compound;
1650 int try_split;
1651
1652 // If past the end of the bad word don't try a split.
1653 // Otherwise try changing the next word. E.g., find
1654 // suggestions for "the the" where the second "the" is
1655 // different. It's done like a split.
1656 // TODO: word split for soundfold words
1657 try_split = (sp->ts_fidx - repextra < su->su_badlen)
1658 && !soundfold;
1659
1660 // Get here in several situations:
1661 // 1. The word in the tree ends:
1662 // If the word allows compounding try that. Otherwise try
1663 // a split by inserting a space. For both check that a
1664 // valid words starts at fword[sp->ts_fidx].
1665 // For NOBREAK do like compounding to be able to check if
1666 // the next word is valid.
1667 // 2. The badword does end, but it was due to a change (e.g.,
1668 // a swap). No need to split, but do check that the
1669 // following word is valid.
1670 // 3. The badword and the word in the tree end. It may still
1671 // be possible to compound another (short) word.
1672 try_compound = FALSE;
1673 if (!soundfold
1674 && !slang->sl_nocompoundsugs
1675 && slang->sl_compprog != NULL
1676 && ((unsigned)flags >> 24) != 0
1677 && sp->ts_twordlen - sp->ts_splitoff
1678 >= slang->sl_compminlen
1679 && (!has_mbyte
1680 || slang->sl_compminlen == 0
1681 || mb_charlen(tword + sp->ts_splitoff)
1682 >= slang->sl_compminlen)
1683 && (slang->sl_compsylmax < MAXWLEN
1684 || sp->ts_complen + 1 - sp->ts_compsplit
1685 < slang->sl_compmax)
1686 && (can_be_compound(sp, slang,
1687 compflags, ((unsigned)flags >> 24))))
1688
1689 {
1690 try_compound = TRUE;
1691 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
1692 compflags[sp->ts_complen + 1] = NUL;
1693 }
1694
1695 // For NOBREAK we never try splitting, it won't make any word
1696 // valid.
1697 if (slang->sl_nobreak && !slang->sl_nocompoundsugs)
1698 try_compound = TRUE;
1699
1700 // If we could add a compound word, and it's also possible to
1701 // split at this point, do the split first and set
1702 // TSF_DIDSPLIT to avoid doing it again.
1703 else if (!fword_ends
1704 && try_compound
1705 && (sp->ts_flags & TSF_DIDSPLIT) == 0)
1706 {
1707 try_compound = FALSE;
1708 sp->ts_flags |= TSF_DIDSPLIT;
1709 --sp->ts_curi; // do the same NUL again
1710 compflags[sp->ts_complen] = NUL;
1711 }
1712 else
1713 sp->ts_flags &= ~TSF_DIDSPLIT;
1714
1715 if (try_split || try_compound)
1716 {
1717 if (!try_compound && (!fword_ends || !goodword_ends))
1718 {
1719 // If we're going to split need to check that the
1720 // words so far are valid for compounding. If there
1721 // is only one word it must not have the NEEDCOMPOUND
1722 // flag.
1723 if (sp->ts_complen == sp->ts_compsplit
1724 && (flags & WF_NEEDCOMP))
1725 break;
1726 p = preword;
1727 while (*skiptowhite(p) != NUL)
1728 p = skipwhite(skiptowhite(p));
1729 if (sp->ts_complen > sp->ts_compsplit
1730 && !can_compound(slang, p,
1731 compflags + sp->ts_compsplit))
1732 break;
1733
1734 if (slang->sl_nosplitsugs)
1735 newscore += SCORE_SPLIT_NO;
1736 else
1737 newscore += SCORE_SPLIT;
1738
1739 // Give a bonus to words seen before.
1740 newscore = score_wordcount_adj(slang, newscore,
1741 preword + sp->ts_prewordlen, TRUE);
1742 }
1743
1744 if (TRY_DEEPER(su, stack, depth, newscore))
1745 {
1746 go_deeper(stack, depth, newscore);
1747#ifdef DEBUG_TRIEWALK
1748 if (!try_compound && !fword_ends)
1749 sprintf(changename[depth], "%.*s-%s: split",
1750 sp->ts_twordlen, tword, fword + sp->ts_fidx);
1751 else
1752 sprintf(changename[depth], "%.*s-%s: compound",
1753 sp->ts_twordlen, tword, fword + sp->ts_fidx);
1754#endif
1755 // Save things to be restored at STATE_SPLITUNDO.
1756 sp->ts_save_badflags = su->su_badflags;
1757 PROF_STORE(sp->ts_state)
1758 sp->ts_state = STATE_SPLITUNDO;
1759
1760 ++depth;
1761 sp = &stack[depth];
1762
1763 // Append a space to preword when splitting.
1764 if (!try_compound && !fword_ends)
1765 STRCAT(preword, " ");
1766 sp->ts_prewordlen = (char_u)STRLEN(preword);
1767 sp->ts_splitoff = sp->ts_twordlen;
1768 sp->ts_splitfidx = sp->ts_fidx;
1769
1770 // If the badword has a non-word character at this
1771 // position skip it. That means replacing the
1772 // non-word character with a space. Always skip a
1773 // character when the word ends. But only when the
1774 // good word can end.
1775 if (((!try_compound && !spell_iswordp_nmw(fword
1776 + sp->ts_fidx,
1777 curwin))
1778 || fword_ends)
1779 && fword[sp->ts_fidx] != NUL
1780 && goodword_ends)
1781 {
1782 int l;
1783
Bram Moolenaar1614a142019-10-06 22:00:13 +02001784 l = mb_ptr2len(fword + sp->ts_fidx);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001785 if (fword_ends)
1786 {
1787 // Copy the skipped character to preword.
1788 mch_memmove(preword + sp->ts_prewordlen,
1789 fword + sp->ts_fidx, l);
1790 sp->ts_prewordlen += l;
1791 preword[sp->ts_prewordlen] = NUL;
1792 }
1793 else
1794 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST;
1795 sp->ts_fidx += l;
1796 }
1797
1798 // When compounding include compound flag in
1799 // compflags[] (already set above). When splitting we
1800 // may start compounding over again.
1801 if (try_compound)
1802 ++sp->ts_complen;
1803 else
1804 sp->ts_compsplit = sp->ts_complen;
1805 sp->ts_prefixdepth = PFD_NOPREFIX;
1806
1807 // set su->su_badflags to the caps type at this
1808 // position
1809 if (has_mbyte)
1810 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
1811 else
1812 n = sp->ts_fidx;
1813 su->su_badflags = badword_captype(su->su_badptr + n,
1814 su->su_badptr + su->su_badlen);
1815
1816 // Restart at top of the tree.
1817 sp->ts_arridx = 0;
1818
1819 // If there are postponed prefixes, try these too.
1820 if (pbyts != NULL)
1821 {
1822 byts = pbyts;
1823 idxs = pidxs;
1824 sp->ts_prefixdepth = PFD_PREFIXTREE;
1825 PROF_STORE(sp->ts_state)
1826 sp->ts_state = STATE_NOPREFIX;
1827 }
1828 }
1829 }
1830 }
1831 break;
1832
1833 case STATE_SPLITUNDO:
1834 // Undo the changes done for word split or compound word.
1835 su->su_badflags = sp->ts_save_badflags;
1836
1837 // Continue looking for NUL bytes.
1838 PROF_STORE(sp->ts_state)
1839 sp->ts_state = STATE_START;
1840
1841 // In case we went into the prefix tree.
1842 byts = fbyts;
1843 idxs = fidxs;
1844 break;
1845
1846 case STATE_ENDNUL:
1847 // Past the NUL bytes in the node.
1848 su->su_badflags = sp->ts_save_badflags;
1849 if (fword[sp->ts_fidx] == NUL && sp->ts_tcharlen == 0)
1850 {
1851 // The badword ends, can't use STATE_PLAIN.
1852 PROF_STORE(sp->ts_state)
1853 sp->ts_state = STATE_DEL;
1854 break;
1855 }
1856 PROF_STORE(sp->ts_state)
1857 sp->ts_state = STATE_PLAIN;
1858 // FALLTHROUGH
1859
1860 case STATE_PLAIN:
1861 // Go over all possible bytes at this node, add each to tword[]
1862 // and use child node. "ts_curi" is the index.
1863 arridx = sp->ts_arridx;
1864 if (sp->ts_curi > byts[arridx])
1865 {
1866 // Done all bytes at this node, do next state. When still at
1867 // already changed bytes skip the other tricks.
1868 PROF_STORE(sp->ts_state)
1869 if (sp->ts_fidx >= sp->ts_fidxtry)
1870 sp->ts_state = STATE_DEL;
1871 else
1872 sp->ts_state = STATE_FINAL;
1873 }
1874 else
1875 {
1876 arridx += sp->ts_curi++;
1877 c = byts[arridx];
1878
1879 // Normal byte, go one level deeper. If it's not equal to the
1880 // byte in the bad word adjust the score. But don't even try
1881 // when the byte was already changed. And don't try when we
1882 // just deleted this byte, accepting it is always cheaper than
1883 // delete + substitute.
1884 if (c == fword[sp->ts_fidx]
1885 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE))
1886 newscore = 0;
1887 else
1888 newscore = SCORE_SUBST;
1889 if ((newscore == 0
1890 || (sp->ts_fidx >= sp->ts_fidxtry
1891 && ((sp->ts_flags & TSF_DIDDEL) == 0
1892 || c != fword[sp->ts_delidx])))
1893 && TRY_DEEPER(su, stack, depth, newscore))
1894 {
1895 go_deeper(stack, depth, newscore);
1896#ifdef DEBUG_TRIEWALK
1897 if (newscore > 0)
1898 sprintf(changename[depth], "%.*s-%s: subst %c to %c",
1899 sp->ts_twordlen, tword, fword + sp->ts_fidx,
1900 fword[sp->ts_fidx], c);
1901 else
1902 sprintf(changename[depth], "%.*s-%s: accept %c",
1903 sp->ts_twordlen, tword, fword + sp->ts_fidx,
1904 fword[sp->ts_fidx]);
1905#endif
1906 ++depth;
1907 sp = &stack[depth];
1908 ++sp->ts_fidx;
1909 tword[sp->ts_twordlen++] = c;
1910 sp->ts_arridx = idxs[arridx];
1911 if (newscore == SCORE_SUBST)
1912 sp->ts_isdiff = DIFF_YES;
1913 if (has_mbyte)
1914 {
1915 // Multi-byte characters are a bit complicated to
1916 // handle: They differ when any of the bytes differ
1917 // and then their length may also differ.
1918 if (sp->ts_tcharlen == 0)
1919 {
1920 // First byte.
1921 sp->ts_tcharidx = 0;
1922 sp->ts_tcharlen = MB_BYTE2LEN(c);
1923 sp->ts_fcharstart = sp->ts_fidx - 1;
1924 sp->ts_isdiff = (newscore != 0)
1925 ? DIFF_YES : DIFF_NONE;
1926 }
1927 else if (sp->ts_isdiff == DIFF_INSERT)
1928 // When inserting trail bytes don't advance in the
1929 // bad word.
1930 --sp->ts_fidx;
1931 if (++sp->ts_tcharidx == sp->ts_tcharlen)
1932 {
1933 // Last byte of character.
1934 if (sp->ts_isdiff == DIFF_YES)
1935 {
1936 // Correct ts_fidx for the byte length of the
1937 // character (we didn't check that before).
1938 sp->ts_fidx = sp->ts_fcharstart
Bram Moolenaar1614a142019-10-06 22:00:13 +02001939 + mb_ptr2len(
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001940 fword + sp->ts_fcharstart);
1941 // For changing a composing character adjust
1942 // the score from SCORE_SUBST to
1943 // SCORE_SUBCOMP.
1944 if (enc_utf8
1945 && utf_iscomposing(
1946 utf_ptr2char(tword
1947 + sp->ts_twordlen
1948 - sp->ts_tcharlen))
1949 && utf_iscomposing(
1950 utf_ptr2char(fword
1951 + sp->ts_fcharstart)))
1952 sp->ts_score -=
1953 SCORE_SUBST - SCORE_SUBCOMP;
1954
1955 // For a similar character adjust score from
1956 // SCORE_SUBST to SCORE_SIMILAR.
1957 else if (!soundfold
1958 && slang->sl_has_map
1959 && similar_chars(slang,
1960 mb_ptr2char(tword
1961 + sp->ts_twordlen
1962 - sp->ts_tcharlen),
1963 mb_ptr2char(fword
1964 + sp->ts_fcharstart)))
1965 sp->ts_score -=
1966 SCORE_SUBST - SCORE_SIMILAR;
1967 }
1968 else if (sp->ts_isdiff == DIFF_INSERT
1969 && sp->ts_twordlen > sp->ts_tcharlen)
1970 {
1971 p = tword + sp->ts_twordlen - sp->ts_tcharlen;
1972 c = mb_ptr2char(p);
1973 if (enc_utf8 && utf_iscomposing(c))
1974 {
1975 // Inserting a composing char doesn't
1976 // count that much.
1977 sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
1978 }
1979 else
1980 {
1981 // If the previous character was the same,
1982 // thus doubling a character, give a bonus
1983 // to the score. Also for the soundfold
1984 // tree (might seem illogical but does
1985 // give better scores).
1986 MB_PTR_BACK(tword, p);
1987 if (c == mb_ptr2char(p))
1988 sp->ts_score -= SCORE_INS
1989 - SCORE_INSDUP;
1990 }
1991 }
1992
1993 // Starting a new char, reset the length.
1994 sp->ts_tcharlen = 0;
1995 }
1996 }
1997 else
1998 {
1999 // If we found a similar char adjust the score.
2000 // We do this after calling go_deeper() because
2001 // it's slow.
2002 if (newscore != 0
2003 && !soundfold
2004 && slang->sl_has_map
2005 && similar_chars(slang,
2006 c, fword[sp->ts_fidx - 1]))
2007 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
2008 }
2009 }
2010 }
2011 break;
2012
2013 case STATE_DEL:
2014 // When past the first byte of a multi-byte char don't try
2015 // delete/insert/swap a character.
2016 if (has_mbyte && sp->ts_tcharlen > 0)
2017 {
2018 PROF_STORE(sp->ts_state)
2019 sp->ts_state = STATE_FINAL;
2020 break;
2021 }
2022 // Try skipping one character in the bad word (delete it).
2023 PROF_STORE(sp->ts_state)
2024 sp->ts_state = STATE_INS_PREP;
2025 sp->ts_curi = 1;
2026 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*')
2027 // Deleting a vowel at the start of a word counts less, see
2028 // soundalike_score().
2029 newscore = 2 * SCORE_DEL / 3;
2030 else
2031 newscore = SCORE_DEL;
2032 if (fword[sp->ts_fidx] != NUL
2033 && TRY_DEEPER(su, stack, depth, newscore))
2034 {
2035 go_deeper(stack, depth, newscore);
2036#ifdef DEBUG_TRIEWALK
2037 sprintf(changename[depth], "%.*s-%s: delete %c",
2038 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2039 fword[sp->ts_fidx]);
2040#endif
2041 ++depth;
2042
2043 // Remember what character we deleted, so that we can avoid
2044 // inserting it again.
2045 stack[depth].ts_flags |= TSF_DIDDEL;
2046 stack[depth].ts_delidx = sp->ts_fidx;
2047
2048 // Advance over the character in fword[]. Give a bonus to the
2049 // score if the same character is following "nn" -> "n". It's
2050 // a bit illogical for soundfold tree but it does give better
2051 // results.
2052 if (has_mbyte)
2053 {
2054 c = mb_ptr2char(fword + sp->ts_fidx);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002055 stack[depth].ts_fidx += mb_ptr2len(fword + sp->ts_fidx);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002056 if (enc_utf8 && utf_iscomposing(c))
2057 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
2058 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx))
2059 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
2060 }
2061 else
2062 {
2063 ++stack[depth].ts_fidx;
2064 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1])
2065 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
2066 }
2067 break;
2068 }
2069 // FALLTHROUGH
2070
2071 case STATE_INS_PREP:
2072 if (sp->ts_flags & TSF_DIDDEL)
2073 {
2074 // If we just deleted a byte then inserting won't make sense,
2075 // a substitute is always cheaper.
2076 PROF_STORE(sp->ts_state)
2077 sp->ts_state = STATE_SWAP;
2078 break;
2079 }
2080
2081 // skip over NUL bytes
2082 n = sp->ts_arridx;
2083 for (;;)
2084 {
2085 if (sp->ts_curi > byts[n])
2086 {
2087 // Only NUL bytes at this node, go to next state.
2088 PROF_STORE(sp->ts_state)
2089 sp->ts_state = STATE_SWAP;
2090 break;
2091 }
2092 if (byts[n + sp->ts_curi] != NUL)
2093 {
2094 // Found a byte to insert.
2095 PROF_STORE(sp->ts_state)
2096 sp->ts_state = STATE_INS;
2097 break;
2098 }
2099 ++sp->ts_curi;
2100 }
2101 break;
2102
2103 // FALLTHROUGH
2104
2105 case STATE_INS:
2106 // Insert one byte. Repeat this for each possible byte at this
2107 // node.
2108 n = sp->ts_arridx;
2109 if (sp->ts_curi > byts[n])
2110 {
2111 // Done all bytes at this node, go to next state.
2112 PROF_STORE(sp->ts_state)
2113 sp->ts_state = STATE_SWAP;
2114 break;
2115 }
2116
2117 // Do one more byte at this node, but:
2118 // - Skip NUL bytes.
2119 // - Skip the byte if it's equal to the byte in the word,
2120 // accepting that byte is always better.
2121 n += sp->ts_curi++;
2122 c = byts[n];
2123 if (soundfold && sp->ts_twordlen == 0 && c == '*')
2124 // Inserting a vowel at the start of a word counts less,
2125 // see soundalike_score().
2126 newscore = 2 * SCORE_INS / 3;
2127 else
2128 newscore = SCORE_INS;
2129 if (c != fword[sp->ts_fidx]
2130 && TRY_DEEPER(su, stack, depth, newscore))
2131 {
2132 go_deeper(stack, depth, newscore);
2133#ifdef DEBUG_TRIEWALK
2134 sprintf(changename[depth], "%.*s-%s: insert %c",
2135 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2136 c);
2137#endif
2138 ++depth;
2139 sp = &stack[depth];
2140 tword[sp->ts_twordlen++] = c;
2141 sp->ts_arridx = idxs[n];
2142 if (has_mbyte)
2143 {
2144 fl = MB_BYTE2LEN(c);
2145 if (fl > 1)
2146 {
2147 // There are following bytes for the same character.
2148 // We must find all bytes before trying
2149 // delete/insert/swap/etc.
2150 sp->ts_tcharlen = fl;
2151 sp->ts_tcharidx = 1;
2152 sp->ts_isdiff = DIFF_INSERT;
2153 }
2154 }
2155 else
2156 fl = 1;
2157 if (fl == 1)
2158 {
2159 // If the previous character was the same, thus doubling a
2160 // character, give a bonus to the score. Also for
2161 // soundfold words (illogical but does give a better
2162 // score).
2163 if (sp->ts_twordlen >= 2
2164 && tword[sp->ts_twordlen - 2] == c)
2165 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
2166 }
2167 }
2168 break;
2169
2170 case STATE_SWAP:
2171 // Swap two bytes in the bad word: "12" -> "21".
2172 // We change "fword" here, it's changed back afterwards at
2173 // STATE_UNSWAP.
2174 p = fword + sp->ts_fidx;
2175 c = *p;
2176 if (c == NUL)
2177 {
2178 // End of word, can't swap or replace.
2179 PROF_STORE(sp->ts_state)
2180 sp->ts_state = STATE_FINAL;
2181 break;
2182 }
2183
2184 // Don't swap if the first character is not a word character.
2185 // SWAP3 etc. also don't make sense then.
2186 if (!soundfold && !spell_iswordp(p, curwin))
2187 {
2188 PROF_STORE(sp->ts_state)
2189 sp->ts_state = STATE_REP_INI;
2190 break;
2191 }
2192
2193 if (has_mbyte)
2194 {
2195 n = MB_CPTR2LEN(p);
2196 c = mb_ptr2char(p);
2197 if (p[n] == NUL)
2198 c2 = NUL;
2199 else if (!soundfold && !spell_iswordp(p + n, curwin))
2200 c2 = c; // don't swap non-word char
2201 else
2202 c2 = mb_ptr2char(p + n);
2203 }
2204 else
2205 {
2206 if (p[1] == NUL)
2207 c2 = NUL;
2208 else if (!soundfold && !spell_iswordp(p + 1, curwin))
2209 c2 = c; // don't swap non-word char
2210 else
2211 c2 = p[1];
2212 }
2213
2214 // When the second character is NUL we can't swap.
2215 if (c2 == NUL)
2216 {
2217 PROF_STORE(sp->ts_state)
2218 sp->ts_state = STATE_REP_INI;
2219 break;
2220 }
2221
2222 // When characters are identical, swap won't do anything.
2223 // Also get here if the second char is not a word character.
2224 if (c == c2)
2225 {
2226 PROF_STORE(sp->ts_state)
2227 sp->ts_state = STATE_SWAP3;
2228 break;
2229 }
2230 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP))
2231 {
2232 go_deeper(stack, depth, SCORE_SWAP);
2233#ifdef DEBUG_TRIEWALK
2234 sprintf(changename[depth], "%.*s-%s: swap %c and %c",
2235 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2236 c, c2);
2237#endif
2238 PROF_STORE(sp->ts_state)
2239 sp->ts_state = STATE_UNSWAP;
2240 ++depth;
2241 if (has_mbyte)
2242 {
2243 fl = mb_char2len(c2);
2244 mch_memmove(p, p + n, fl);
2245 mb_char2bytes(c, p + fl);
2246 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
2247 }
2248 else
2249 {
2250 p[0] = c2;
2251 p[1] = c;
2252 stack[depth].ts_fidxtry = sp->ts_fidx + 2;
2253 }
2254 }
2255 else
2256 {
2257 // If this swap doesn't work then SWAP3 won't either.
2258 PROF_STORE(sp->ts_state)
2259 sp->ts_state = STATE_REP_INI;
2260 }
2261 break;
2262
2263 case STATE_UNSWAP:
2264 // Undo the STATE_SWAP swap: "21" -> "12".
2265 p = fword + sp->ts_fidx;
2266 if (has_mbyte)
2267 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002268 n = mb_ptr2len(p);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002269 c = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002270 mch_memmove(p + mb_ptr2len(p + n), p, n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002271 mb_char2bytes(c, p);
2272 }
2273 else
2274 {
2275 c = *p;
2276 *p = p[1];
2277 p[1] = c;
2278 }
2279 // FALLTHROUGH
2280
2281 case STATE_SWAP3:
2282 // Swap two bytes, skipping one: "123" -> "321". We change
2283 // "fword" here, it's changed back afterwards at STATE_UNSWAP3.
2284 p = fword + sp->ts_fidx;
2285 if (has_mbyte)
2286 {
2287 n = MB_CPTR2LEN(p);
2288 c = mb_ptr2char(p);
2289 fl = MB_CPTR2LEN(p + n);
2290 c2 = mb_ptr2char(p + n);
2291 if (!soundfold && !spell_iswordp(p + n + fl, curwin))
2292 c3 = c; // don't swap non-word char
2293 else
2294 c3 = mb_ptr2char(p + n + fl);
2295 }
2296 else
2297 {
2298 c = *p;
2299 c2 = p[1];
2300 if (!soundfold && !spell_iswordp(p + 2, curwin))
2301 c3 = c; // don't swap non-word char
2302 else
2303 c3 = p[2];
2304 }
2305
2306 // When characters are identical: "121" then SWAP3 result is
2307 // identical, ROT3L result is same as SWAP: "211", ROT3L result is
2308 // same as SWAP on next char: "112". Thus skip all swapping.
2309 // Also skip when c3 is NUL.
2310 // Also get here when the third character is not a word character.
2311 // Second character may any char: "a.b" -> "b.a"
2312 if (c == c3 || c3 == NUL)
2313 {
2314 PROF_STORE(sp->ts_state)
2315 sp->ts_state = STATE_REP_INI;
2316 break;
2317 }
2318 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2319 {
2320 go_deeper(stack, depth, SCORE_SWAP3);
2321#ifdef DEBUG_TRIEWALK
2322 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c",
2323 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2324 c, c3);
2325#endif
2326 PROF_STORE(sp->ts_state)
2327 sp->ts_state = STATE_UNSWAP3;
2328 ++depth;
2329 if (has_mbyte)
2330 {
2331 tl = mb_char2len(c3);
2332 mch_memmove(p, p + n + fl, tl);
2333 mb_char2bytes(c2, p + tl);
2334 mb_char2bytes(c, p + fl + tl);
2335 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl;
2336 }
2337 else
2338 {
2339 p[0] = p[2];
2340 p[2] = c;
2341 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2342 }
2343 }
2344 else
2345 {
2346 PROF_STORE(sp->ts_state)
2347 sp->ts_state = STATE_REP_INI;
2348 }
2349 break;
2350
2351 case STATE_UNSWAP3:
2352 // Undo STATE_SWAP3: "321" -> "123"
2353 p = fword + sp->ts_fidx;
2354 if (has_mbyte)
2355 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002356 n = mb_ptr2len(p);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002357 c2 = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002358 fl = mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002359 c = mb_ptr2char(p + n + fl);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002360 tl = mb_ptr2len(p + n + fl);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002361 mch_memmove(p + fl + tl, p, n);
2362 mb_char2bytes(c, p);
2363 mb_char2bytes(c2, p + tl);
2364 p = p + tl;
2365 }
2366 else
2367 {
2368 c = *p;
2369 *p = p[2];
2370 p[2] = c;
2371 ++p;
2372 }
2373
2374 if (!soundfold && !spell_iswordp(p, curwin))
2375 {
2376 // Middle char is not a word char, skip the rotate. First and
2377 // third char were already checked at swap and swap3.
2378 PROF_STORE(sp->ts_state)
2379 sp->ts_state = STATE_REP_INI;
2380 break;
2381 }
2382
2383 // Rotate three characters left: "123" -> "231". We change
2384 // "fword" here, it's changed back afterwards at STATE_UNROT3L.
2385 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2386 {
2387 go_deeper(stack, depth, SCORE_SWAP3);
2388#ifdef DEBUG_TRIEWALK
2389 p = fword + sp->ts_fidx;
2390 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c",
2391 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2392 p[0], p[1], p[2]);
2393#endif
2394 PROF_STORE(sp->ts_state)
2395 sp->ts_state = STATE_UNROT3L;
2396 ++depth;
2397 p = fword + sp->ts_fidx;
2398 if (has_mbyte)
2399 {
2400 n = MB_CPTR2LEN(p);
2401 c = mb_ptr2char(p);
2402 fl = MB_CPTR2LEN(p + n);
2403 fl += MB_CPTR2LEN(p + n + fl);
2404 mch_memmove(p, p + n, fl);
2405 mb_char2bytes(c, p + fl);
2406 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
2407 }
2408 else
2409 {
2410 c = *p;
2411 *p = p[1];
2412 p[1] = p[2];
2413 p[2] = c;
2414 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2415 }
2416 }
2417 else
2418 {
2419 PROF_STORE(sp->ts_state)
2420 sp->ts_state = STATE_REP_INI;
2421 }
2422 break;
2423
2424 case STATE_UNROT3L:
2425 // Undo ROT3L: "231" -> "123"
2426 p = fword + sp->ts_fidx;
2427 if (has_mbyte)
2428 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002429 n = mb_ptr2len(p);
2430 n += mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002431 c = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002432 tl = mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002433 mch_memmove(p + tl, p, n);
2434 mb_char2bytes(c, p);
2435 }
2436 else
2437 {
2438 c = p[2];
2439 p[2] = p[1];
2440 p[1] = *p;
2441 *p = c;
2442 }
2443
2444 // Rotate three bytes right: "123" -> "312". We change "fword"
2445 // here, it's changed back afterwards at STATE_UNROT3R.
2446 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2447 {
2448 go_deeper(stack, depth, SCORE_SWAP3);
2449#ifdef DEBUG_TRIEWALK
2450 p = fword + sp->ts_fidx;
2451 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c",
2452 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2453 p[0], p[1], p[2]);
2454#endif
2455 PROF_STORE(sp->ts_state)
2456 sp->ts_state = STATE_UNROT3R;
2457 ++depth;
2458 p = fword + sp->ts_fidx;
2459 if (has_mbyte)
2460 {
2461 n = MB_CPTR2LEN(p);
2462 n += MB_CPTR2LEN(p + n);
2463 c = mb_ptr2char(p + n);
2464 tl = MB_CPTR2LEN(p + n);
2465 mch_memmove(p + tl, p, n);
2466 mb_char2bytes(c, p);
2467 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl;
2468 }
2469 else
2470 {
2471 c = p[2];
2472 p[2] = p[1];
2473 p[1] = *p;
2474 *p = c;
2475 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2476 }
2477 }
2478 else
2479 {
2480 PROF_STORE(sp->ts_state)
2481 sp->ts_state = STATE_REP_INI;
2482 }
2483 break;
2484
2485 case STATE_UNROT3R:
2486 // Undo ROT3R: "312" -> "123"
2487 p = fword + sp->ts_fidx;
2488 if (has_mbyte)
2489 {
2490 c = mb_ptr2char(p);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002491 tl = mb_ptr2len(p);
2492 n = mb_ptr2len(p + tl);
2493 n += mb_ptr2len(p + tl + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002494 mch_memmove(p, p + tl, n);
2495 mb_char2bytes(c, p + n);
2496 }
2497 else
2498 {
2499 c = *p;
2500 *p = p[1];
2501 p[1] = p[2];
2502 p[2] = c;
2503 }
2504 // FALLTHROUGH
2505
2506 case STATE_REP_INI:
2507 // Check if matching with REP items from the .aff file would work.
2508 // Quickly skip if:
2509 // - there are no REP items and we are not in the soundfold trie
2510 // - the score is going to be too high anyway
2511 // - already applied a REP item or swapped here
2512 if ((lp->lp_replang == NULL && !soundfold)
2513 || sp->ts_score + SCORE_REP >= su->su_maxscore
2514 || sp->ts_fidx < sp->ts_fidxtry)
2515 {
2516 PROF_STORE(sp->ts_state)
2517 sp->ts_state = STATE_FINAL;
2518 break;
2519 }
2520
2521 // Use the first byte to quickly find the first entry that may
2522 // match. If the index is -1 there is none.
2523 if (soundfold)
2524 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]];
2525 else
2526 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]];
2527
2528 if (sp->ts_curi < 0)
2529 {
2530 PROF_STORE(sp->ts_state)
2531 sp->ts_state = STATE_FINAL;
2532 break;
2533 }
2534
2535 PROF_STORE(sp->ts_state)
2536 sp->ts_state = STATE_REP;
2537 // FALLTHROUGH
2538
2539 case STATE_REP:
2540 // Try matching with REP items from the .aff file. For each match
2541 // replace the characters and check if the resulting word is
2542 // valid.
2543 p = fword + sp->ts_fidx;
2544
2545 if (soundfold)
2546 gap = &slang->sl_repsal;
2547 else
2548 gap = &lp->lp_replang->sl_rep;
2549 while (sp->ts_curi < gap->ga_len)
2550 {
2551 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++;
2552 if (*ftp->ft_from != *p)
2553 {
2554 // past possible matching entries
2555 sp->ts_curi = gap->ga_len;
2556 break;
2557 }
2558 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0
2559 && TRY_DEEPER(su, stack, depth, SCORE_REP))
2560 {
2561 go_deeper(stack, depth, SCORE_REP);
2562#ifdef DEBUG_TRIEWALK
2563 sprintf(changename[depth], "%.*s-%s: replace %s with %s",
2564 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2565 ftp->ft_from, ftp->ft_to);
2566#endif
2567 // Need to undo this afterwards.
2568 PROF_STORE(sp->ts_state)
2569 sp->ts_state = STATE_REP_UNDO;
2570
2571 // Change the "from" to the "to" string.
2572 ++depth;
2573 fl = (int)STRLEN(ftp->ft_from);
2574 tl = (int)STRLEN(ftp->ft_to);
2575 if (fl != tl)
2576 {
2577 STRMOVE(p + tl, p + fl);
2578 repextra += tl - fl;
2579 }
2580 mch_memmove(p, ftp->ft_to, tl);
2581 stack[depth].ts_fidxtry = sp->ts_fidx + tl;
2582 stack[depth].ts_tcharlen = 0;
2583 break;
2584 }
2585 }
2586
2587 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP)
2588 {
2589 // No (more) matches.
2590 PROF_STORE(sp->ts_state)
2591 sp->ts_state = STATE_FINAL;
2592 }
2593
2594 break;
2595
2596 case STATE_REP_UNDO:
2597 // Undo a REP replacement and continue with the next one.
2598 if (soundfold)
2599 gap = &slang->sl_repsal;
2600 else
2601 gap = &lp->lp_replang->sl_rep;
2602 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1;
2603 fl = (int)STRLEN(ftp->ft_from);
2604 tl = (int)STRLEN(ftp->ft_to);
2605 p = fword + sp->ts_fidx;
2606 if (fl != tl)
2607 {
2608 STRMOVE(p + fl, p + tl);
2609 repextra -= tl - fl;
2610 }
2611 mch_memmove(p, ftp->ft_from, fl);
2612 PROF_STORE(sp->ts_state)
2613 sp->ts_state = STATE_REP;
2614 break;
2615
2616 default:
2617 // Did all possible states at this level, go up one level.
2618 --depth;
2619
2620 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE)
2621 {
2622 // Continue in or go back to the prefix tree.
2623 byts = pbyts;
2624 idxs = pidxs;
2625 }
2626
2627 // Don't check for CTRL-C too often, it takes time.
2628 if (--breakcheckcount == 0)
2629 {
2630 ui_breakcheck();
2631 breakcheckcount = 1000;
2632 }
2633 }
2634 }
2635}
2636
2637
2638/*
2639 * Go one level deeper in the tree.
2640 */
2641 static void
2642go_deeper(trystate_T *stack, int depth, int score_add)
2643{
2644 stack[depth + 1] = stack[depth];
2645 stack[depth + 1].ts_state = STATE_START;
2646 stack[depth + 1].ts_score = stack[depth].ts_score + score_add;
2647 stack[depth + 1].ts_curi = 1; // start just after length byte
2648 stack[depth + 1].ts_flags = 0;
2649}
2650
2651/*
2652 * "fword" is a good word with case folded. Find the matching keep-case
2653 * words and put it in "kword".
2654 * Theoretically there could be several keep-case words that result in the
2655 * same case-folded word, but we only find one...
2656 */
2657 static void
2658find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword)
2659{
2660 char_u uword[MAXWLEN]; // "fword" in upper-case
2661 int depth;
2662 idx_T tryidx;
2663
2664 // The following arrays are used at each depth in the tree.
2665 idx_T arridx[MAXWLEN];
2666 int round[MAXWLEN];
2667 int fwordidx[MAXWLEN];
2668 int uwordidx[MAXWLEN];
2669 int kwordlen[MAXWLEN];
2670
2671 int flen, ulen;
2672 int l;
2673 int len;
2674 int c;
2675 idx_T lo, hi, m;
2676 char_u *p;
2677 char_u *byts = slang->sl_kbyts; // array with bytes of the words
2678 idx_T *idxs = slang->sl_kidxs; // array with indexes
2679
2680 if (byts == NULL)
2681 {
2682 // array is empty: "cannot happen"
2683 *kword = NUL;
2684 return;
2685 }
2686
2687 // Make an all-cap version of "fword".
2688 allcap_copy(fword, uword);
2689
2690 // Each character needs to be tried both case-folded and upper-case.
2691 // All this gets very complicated if we keep in mind that changing case
2692 // may change the byte length of a multi-byte character...
2693 depth = 0;
2694 arridx[0] = 0;
2695 round[0] = 0;
2696 fwordidx[0] = 0;
2697 uwordidx[0] = 0;
2698 kwordlen[0] = 0;
2699 while (depth >= 0)
2700 {
2701 if (fword[fwordidx[depth]] == NUL)
2702 {
2703 // We are at the end of "fword". If the tree allows a word to end
2704 // here we have found a match.
2705 if (byts[arridx[depth] + 1] == 0)
2706 {
2707 kword[kwordlen[depth]] = NUL;
2708 return;
2709 }
2710
2711 // kword is getting too long, continue one level up
2712 --depth;
2713 }
2714 else if (++round[depth] > 2)
2715 {
2716 // tried both fold-case and upper-case character, continue one
2717 // level up
2718 --depth;
2719 }
2720 else
2721 {
2722 // round[depth] == 1: Try using the folded-case character.
2723 // round[depth] == 2: Try using the upper-case character.
2724 if (has_mbyte)
2725 {
2726 flen = MB_CPTR2LEN(fword + fwordidx[depth]);
2727 ulen = MB_CPTR2LEN(uword + uwordidx[depth]);
2728 }
2729 else
2730 ulen = flen = 1;
2731 if (round[depth] == 1)
2732 {
2733 p = fword + fwordidx[depth];
2734 l = flen;
2735 }
2736 else
2737 {
2738 p = uword + uwordidx[depth];
2739 l = ulen;
2740 }
2741
2742 for (tryidx = arridx[depth]; l > 0; --l)
2743 {
2744 // Perform a binary search in the list of accepted bytes.
2745 len = byts[tryidx++];
2746 c = *p++;
2747 lo = tryidx;
2748 hi = tryidx + len - 1;
2749 while (lo < hi)
2750 {
2751 m = (lo + hi) / 2;
2752 if (byts[m] > c)
2753 hi = m - 1;
2754 else if (byts[m] < c)
2755 lo = m + 1;
2756 else
2757 {
2758 lo = hi = m;
2759 break;
2760 }
2761 }
2762
2763 // Stop if there is no matching byte.
2764 if (hi < lo || byts[lo] != c)
2765 break;
2766
2767 // Continue at the child (if there is one).
2768 tryidx = idxs[lo];
2769 }
2770
2771 if (l == 0)
2772 {
2773 // Found the matching char. Copy it to "kword" and go a
2774 // level deeper.
2775 if (round[depth] == 1)
2776 {
2777 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth],
2778 flen);
2779 kwordlen[depth + 1] = kwordlen[depth] + flen;
2780 }
2781 else
2782 {
2783 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth],
2784 ulen);
2785 kwordlen[depth + 1] = kwordlen[depth] + ulen;
2786 }
2787 fwordidx[depth + 1] = fwordidx[depth] + flen;
2788 uwordidx[depth + 1] = uwordidx[depth] + ulen;
2789
2790 ++depth;
2791 arridx[depth] = tryidx;
2792 round[depth] = 0;
2793 }
2794 }
2795 }
2796
2797 // Didn't find it: "cannot happen".
2798 *kword = NUL;
2799}
2800
2801/*
2802 * Compute the sound-a-like score for suggestions in su->su_ga and add them to
2803 * su->su_sga.
2804 */
2805 static void
2806score_comp_sal(suginfo_T *su)
2807{
2808 langp_T *lp;
2809 char_u badsound[MAXWLEN];
2810 int i;
2811 suggest_T *stp;
2812 suggest_T *sstp;
2813 int score;
2814 int lpi;
2815
2816 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL)
2817 return;
2818
2819 // Use the sound-folding of the first language that supports it.
2820 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
2821 {
2822 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
2823 if (lp->lp_slang->sl_sal.ga_len > 0)
2824 {
2825 // soundfold the bad word
2826 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound);
2827
2828 for (i = 0; i < su->su_ga.ga_len; ++i)
2829 {
2830 stp = &SUG(su->su_ga, i);
2831
2832 // Case-fold the suggested word, sound-fold it and compute the
2833 // sound-a-like score.
2834 score = stp_sal_score(stp, su, lp->lp_slang, badsound);
2835 if (score < SCORE_MAXMAX)
2836 {
2837 // Add the suggestion.
2838 sstp = &SUG(su->su_sga, su->su_sga.ga_len);
2839 sstp->st_word = vim_strsave(stp->st_word);
2840 if (sstp->st_word != NULL)
2841 {
2842 sstp->st_wordlen = stp->st_wordlen;
2843 sstp->st_score = score;
2844 sstp->st_altscore = 0;
2845 sstp->st_orglen = stp->st_orglen;
2846 ++su->su_sga.ga_len;
2847 }
2848 }
2849 }
2850 break;
2851 }
2852 }
2853}
2854
2855/*
2856 * Combine the list of suggestions in su->su_ga and su->su_sga.
2857 * They are entwined.
2858 */
2859 static void
2860score_combine(suginfo_T *su)
2861{
2862 int i;
2863 int j;
2864 garray_T ga;
2865 garray_T *gap;
2866 langp_T *lp;
2867 suggest_T *stp;
2868 char_u *p;
2869 char_u badsound[MAXWLEN];
2870 int round;
2871 int lpi;
2872 slang_T *slang = NULL;
2873
2874 // Add the alternate score to su_ga.
2875 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
2876 {
2877 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
2878 if (lp->lp_slang->sl_sal.ga_len > 0)
2879 {
2880 // soundfold the bad word
2881 slang = lp->lp_slang;
2882 spell_soundfold(slang, su->su_fbadword, TRUE, badsound);
2883
2884 for (i = 0; i < su->su_ga.ga_len; ++i)
2885 {
2886 stp = &SUG(su->su_ga, i);
2887 stp->st_altscore = stp_sal_score(stp, su, slang, badsound);
2888 if (stp->st_altscore == SCORE_MAXMAX)
2889 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4;
2890 else
2891 stp->st_score = (stp->st_score * 3
2892 + stp->st_altscore) / 4;
2893 stp->st_salscore = FALSE;
2894 }
2895 break;
2896 }
2897 }
2898
2899 if (slang == NULL) // Using "double" without sound folding.
2900 {
2901 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore,
2902 su->su_maxcount);
2903 return;
2904 }
2905
2906 // Add the alternate score to su_sga.
2907 for (i = 0; i < su->su_sga.ga_len; ++i)
2908 {
2909 stp = &SUG(su->su_sga, i);
2910 stp->st_altscore = spell_edit_score(slang,
2911 su->su_badword, stp->st_word);
2912 if (stp->st_score == SCORE_MAXMAX)
2913 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8;
2914 else
2915 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8;
2916 stp->st_salscore = TRUE;
2917 }
2918
2919 // Remove bad suggestions, sort the suggestions and truncate at "maxcount"
2920 // for both lists.
2921 check_suggestions(su, &su->su_ga);
2922 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
2923 check_suggestions(su, &su->su_sga);
2924 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount);
2925
2926 ga_init2(&ga, (int)sizeof(suginfo_T), 1);
2927 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL)
2928 return;
2929
2930 stp = &SUG(ga, 0);
2931 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i)
2932 {
2933 // round 1: get a suggestion from su_ga
2934 // round 2: get a suggestion from su_sga
2935 for (round = 1; round <= 2; ++round)
2936 {
2937 gap = round == 1 ? &su->su_ga : &su->su_sga;
2938 if (i < gap->ga_len)
2939 {
2940 // Don't add a word if it's already there.
2941 p = SUG(*gap, i).st_word;
2942 for (j = 0; j < ga.ga_len; ++j)
2943 if (STRCMP(stp[j].st_word, p) == 0)
2944 break;
2945 if (j == ga.ga_len)
2946 stp[ga.ga_len++] = SUG(*gap, i);
2947 else
2948 vim_free(p);
2949 }
2950 }
2951 }
2952
2953 ga_clear(&su->su_ga);
2954 ga_clear(&su->su_sga);
2955
2956 // Truncate the list to the number of suggestions that will be displayed.
2957 if (ga.ga_len > su->su_maxcount)
2958 {
2959 for (i = su->su_maxcount; i < ga.ga_len; ++i)
2960 vim_free(stp[i].st_word);
2961 ga.ga_len = su->su_maxcount;
2962 }
2963
2964 su->su_ga = ga;
2965}
2966
2967/*
2968 * For the goodword in "stp" compute the soundalike score compared to the
2969 * badword.
2970 */
2971 static int
2972stp_sal_score(
2973 suggest_T *stp,
2974 suginfo_T *su,
2975 slang_T *slang,
2976 char_u *badsound) // sound-folded badword
2977{
2978 char_u *p;
2979 char_u *pbad;
2980 char_u *pgood;
2981 char_u badsound2[MAXWLEN];
2982 char_u fword[MAXWLEN];
2983 char_u goodsound[MAXWLEN];
2984 char_u goodword[MAXWLEN];
2985 int lendiff;
2986
2987 lendiff = (int)(su->su_badlen - stp->st_orglen);
2988 if (lendiff >= 0)
2989 pbad = badsound;
2990 else
2991 {
2992 // soundfold the bad word with more characters following
2993 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN);
2994
2995 // When joining two words the sound often changes a lot. E.g., "t he"
2996 // sounds like "t h" while "the" sounds like "@". Avoid that by
2997 // removing the space. Don't do it when the good word also contains a
2998 // space.
2999 if (VIM_ISWHITE(su->su_badptr[su->su_badlen])
3000 && *skiptowhite(stp->st_word) == NUL)
3001 for (p = fword; *(p = skiptowhite(p)) != NUL; )
3002 STRMOVE(p, p + 1);
3003
3004 spell_soundfold(slang, fword, TRUE, badsound2);
3005 pbad = badsound2;
3006 }
3007
3008 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN)
3009 {
3010 // Add part of the bad word to the good word, so that we soundfold
3011 // what replaces the bad word.
3012 STRCPY(goodword, stp->st_word);
3013 vim_strncpy(goodword + stp->st_wordlen,
3014 su->su_badptr + su->su_badlen - lendiff, lendiff);
3015 pgood = goodword;
3016 }
3017 else
3018 pgood = stp->st_word;
3019
3020 // Sound-fold the word and compute the score for the difference.
3021 spell_soundfold(slang, pgood, FALSE, goodsound);
3022
3023 return soundalike_score(goodsound, pbad);
3024}
3025
3026// structure used to store soundfolded words that add_sound_suggest() has
3027// handled already.
3028typedef struct
3029{
3030 short sft_score; // lowest score used
3031 char_u sft_word[1]; // soundfolded word, actually longer
3032} sftword_T;
3033
3034static sftword_T dumsft;
3035#define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft)))
3036#define HI2SFT(hi) HIKEY2SFT((hi)->hi_key)
3037
3038/*
3039 * Prepare for calling suggest_try_soundalike().
3040 */
3041 static void
3042suggest_try_soundalike_prep(void)
3043{
3044 langp_T *lp;
3045 int lpi;
3046 slang_T *slang;
3047
3048 // Do this for all languages that support sound folding and for which a
3049 // .sug file has been loaded.
3050 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3051 {
3052 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3053 slang = lp->lp_slang;
3054 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3055 // prepare the hashtable used by add_sound_suggest()
3056 hash_init(&slang->sl_sounddone);
3057 }
3058}
3059
3060/*
3061 * Find suggestions by comparing the word in a sound-a-like form.
3062 * Note: This doesn't support postponed prefixes.
3063 */
3064 static void
3065suggest_try_soundalike(suginfo_T *su)
3066{
3067 char_u salword[MAXWLEN];
3068 langp_T *lp;
3069 int lpi;
3070 slang_T *slang;
3071
3072 // Do this for all languages that support sound folding and for which a
3073 // .sug file has been loaded.
3074 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3075 {
3076 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3077 slang = lp->lp_slang;
3078 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3079 {
3080 // soundfold the bad word
3081 spell_soundfold(slang, su->su_fbadword, TRUE, salword);
3082
3083 // try all kinds of inserts/deletes/swaps/etc.
3084 // TODO: also soundfold the next words, so that we can try joining
3085 // and splitting
3086#ifdef SUGGEST_PROFILE
3087 prof_init();
3088#endif
3089 suggest_trie_walk(su, lp, salword, TRUE);
3090#ifdef SUGGEST_PROFILE
3091 prof_report("soundalike");
3092#endif
3093 }
3094 }
3095}
3096
3097/*
3098 * Finish up after calling suggest_try_soundalike().
3099 */
3100 static void
3101suggest_try_soundalike_finish(void)
3102{
3103 langp_T *lp;
3104 int lpi;
3105 slang_T *slang;
3106 int todo;
3107 hashitem_T *hi;
3108
3109 // Do this for all languages that support sound folding and for which a
3110 // .sug file has been loaded.
3111 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3112 {
3113 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3114 slang = lp->lp_slang;
3115 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3116 {
3117 // Free the info about handled words.
3118 todo = (int)slang->sl_sounddone.ht_used;
3119 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi)
3120 if (!HASHITEM_EMPTY(hi))
3121 {
3122 vim_free(HI2SFT(hi));
3123 --todo;
3124 }
3125
3126 // Clear the hashtable, it may also be used by another region.
3127 hash_clear(&slang->sl_sounddone);
3128 hash_init(&slang->sl_sounddone);
3129 }
3130 }
3131}
3132
3133/*
3134 * A match with a soundfolded word is found. Add the good word(s) that
3135 * produce this soundfolded word.
3136 */
3137 static void
3138add_sound_suggest(
3139 suginfo_T *su,
3140 char_u *goodword,
3141 int score, // soundfold score
3142 langp_T *lp)
3143{
3144 slang_T *slang = lp->lp_slang; // language for sound folding
3145 int sfwordnr;
3146 char_u *nrline;
3147 int orgnr;
3148 char_u theword[MAXWLEN];
3149 int i;
3150 int wlen;
3151 char_u *byts;
3152 idx_T *idxs;
3153 int n;
3154 int wordcount;
3155 int wc;
3156 int goodscore;
3157 hash_T hash;
3158 hashitem_T *hi;
3159 sftword_T *sft;
3160 int bc, gc;
3161 int limit;
3162
3163 // It's very well possible that the same soundfold word is found several
3164 // times with different scores. Since the following is quite slow only do
3165 // the words that have a better score than before. Use a hashtable to
3166 // remember the words that have been done.
3167 hash = hash_hash(goodword);
3168 hi = hash_lookup(&slang->sl_sounddone, goodword, hash);
3169 if (HASHITEM_EMPTY(hi))
3170 {
3171 sft = alloc(sizeof(sftword_T) + STRLEN(goodword));
3172 if (sft != NULL)
3173 {
3174 sft->sft_score = score;
3175 STRCPY(sft->sft_word, goodword);
3176 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash);
3177 }
3178 }
3179 else
3180 {
3181 sft = HI2SFT(hi);
3182 if (score >= sft->sft_score)
3183 return;
3184 sft->sft_score = score;
3185 }
3186
3187 // Find the word nr in the soundfold tree.
3188 sfwordnr = soundfold_find(slang, goodword);
3189 if (sfwordnr < 0)
3190 {
3191 internal_error("add_sound_suggest()");
3192 return;
3193 }
3194
3195 // go over the list of good words that produce this soundfold word
3196 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE);
3197 orgnr = 0;
3198 while (*nrline != NUL)
3199 {
3200 // The wordnr was stored in a minimal nr of bytes as an offset to the
3201 // previous wordnr.
3202 orgnr += bytes2offset(&nrline);
3203
3204 byts = slang->sl_fbyts;
3205 idxs = slang->sl_fidxs;
3206
3207 // Lookup the word "orgnr" one of the two tries.
3208 n = 0;
3209 wordcount = 0;
3210 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen)
3211 {
3212 i = 1;
3213 if (wordcount == orgnr && byts[n + 1] == NUL)
3214 break; // found end of word
3215
3216 if (byts[n + 1] == NUL)
3217 ++wordcount;
3218
3219 // skip over the NUL bytes
3220 for ( ; byts[n + i] == NUL; ++i)
3221 if (i > byts[n]) // safety check
3222 {
3223 STRCPY(theword + wlen, "BAD");
3224 wlen += 3;
3225 goto badword;
3226 }
3227
3228 // One of the siblings must have the word.
3229 for ( ; i < byts[n]; ++i)
3230 {
3231 wc = idxs[idxs[n + i]]; // nr of words under this byte
3232 if (wordcount + wc > orgnr)
3233 break;
3234 wordcount += wc;
3235 }
3236
3237 theword[wlen] = byts[n + i];
3238 n = idxs[n + i];
3239 }
3240badword:
3241 theword[wlen] = NUL;
3242
3243 // Go over the possible flags and regions.
3244 for (; i <= byts[n] && byts[n + i] == NUL; ++i)
3245 {
3246 char_u cword[MAXWLEN];
3247 char_u *p;
3248 int flags = (int)idxs[n + i];
3249
3250 // Skip words with the NOSUGGEST flag
3251 if (flags & WF_NOSUGGEST)
3252 continue;
3253
3254 if (flags & WF_KEEPCAP)
3255 {
3256 // Must find the word in the keep-case tree.
3257 find_keepcap_word(slang, theword, cword);
3258 p = cword;
3259 }
3260 else
3261 {
3262 flags |= su->su_badflags;
3263 if ((flags & WF_CAPMASK) != 0)
3264 {
3265 // Need to fix case according to "flags".
3266 make_case_word(theword, cword, flags);
3267 p = cword;
3268 }
3269 else
3270 p = theword;
3271 }
3272
3273 // Add the suggestion.
3274 if (sps_flags & SPS_DOUBLE)
3275 {
3276 // Add the suggestion if the score isn't too bad.
3277 if (score <= su->su_maxscore)
3278 add_suggestion(su, &su->su_sga, p, su->su_badlen,
3279 score, 0, FALSE, slang, FALSE);
3280 }
3281 else
3282 {
3283 // Add a penalty for words in another region.
3284 if ((flags & WF_REGION)
3285 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
3286 goodscore = SCORE_REGION;
3287 else
3288 goodscore = 0;
3289
3290 // Add a small penalty for changing the first letter from
3291 // lower to upper case. Helps for "tath" -> "Kath", which is
3292 // less common than "tath" -> "path". Don't do it when the
3293 // letter is the same, that has already been counted.
3294 gc = PTR2CHAR(p);
3295 if (SPELL_ISUPPER(gc))
3296 {
3297 bc = PTR2CHAR(su->su_badword);
3298 if (!SPELL_ISUPPER(bc)
3299 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc))
3300 goodscore += SCORE_ICASE / 2;
3301 }
3302
3303 // Compute the score for the good word. This only does letter
3304 // insert/delete/swap/replace. REP items are not considered,
3305 // which may make the score a bit higher.
3306 // Use a limit for the score to make it work faster. Use
3307 // MAXSCORE(), because RESCORE() will change the score.
3308 // If the limit is very high then the iterative method is
3309 // inefficient, using an array is quicker.
3310 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score);
3311 if (limit > SCORE_LIMITMAX)
3312 goodscore += spell_edit_score(slang, su->su_badword, p);
3313 else
3314 goodscore += spell_edit_score_limit(slang, su->su_badword,
3315 p, limit);
3316
3317 // When going over the limit don't bother to do the rest.
3318 if (goodscore < SCORE_MAXMAX)
3319 {
3320 // Give a bonus to words seen before.
3321 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE);
3322
3323 // Add the suggestion if the score isn't too bad.
3324 goodscore = RESCORE(goodscore, score);
3325 if (goodscore <= su->su_sfmaxscore)
3326 add_suggestion(su, &su->su_ga, p, su->su_badlen,
3327 goodscore, score, TRUE, slang, TRUE);
3328 }
3329 }
3330 }
3331 // smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr);
3332 }
3333}
3334
3335/*
3336 * Find word "word" in fold-case tree for "slang" and return the word number.
3337 */
3338 static int
3339soundfold_find(slang_T *slang, char_u *word)
3340{
3341 idx_T arridx = 0;
3342 int len;
3343 int wlen = 0;
3344 int c;
3345 char_u *ptr = word;
3346 char_u *byts;
3347 idx_T *idxs;
3348 int wordnr = 0;
3349
3350 byts = slang->sl_sbyts;
3351 idxs = slang->sl_sidxs;
3352
3353 for (;;)
3354 {
3355 // First byte is the number of possible bytes.
3356 len = byts[arridx++];
3357
3358 // If the first possible byte is a zero the word could end here.
3359 // If the word ends we found the word. If not skip the NUL bytes.
3360 c = ptr[wlen];
3361 if (byts[arridx] == NUL)
3362 {
3363 if (c == NUL)
3364 break;
3365
3366 // Skip over the zeros, there can be several.
3367 while (len > 0 && byts[arridx] == NUL)
3368 {
3369 ++arridx;
3370 --len;
3371 }
3372 if (len == 0)
3373 return -1; // no children, word should have ended here
3374 ++wordnr;
3375 }
3376
3377 // If the word ends we didn't find it.
3378 if (c == NUL)
3379 return -1;
3380
3381 // Perform a binary search in the list of accepted bytes.
3382 if (c == TAB) // <Tab> is handled like <Space>
3383 c = ' ';
3384 while (byts[arridx] < c)
3385 {
3386 // The word count is in the first idxs[] entry of the child.
3387 wordnr += idxs[idxs[arridx]];
3388 ++arridx;
3389 if (--len == 0) // end of the bytes, didn't find it
3390 return -1;
3391 }
3392 if (byts[arridx] != c) // didn't find the byte
3393 return -1;
3394
3395 // Continue at the child (if there is one).
3396 arridx = idxs[arridx];
3397 ++wlen;
3398
3399 // One space in the good word may stand for several spaces in the
3400 // checked word.
3401 if (c == ' ')
3402 while (ptr[wlen] == ' ' || ptr[wlen] == TAB)
3403 ++wlen;
3404 }
3405
3406 return wordnr;
3407}
3408
3409/*
3410 * Return TRUE if "c1" and "c2" are similar characters according to the MAP
3411 * lines in the .aff file.
3412 */
3413 static int
3414similar_chars(slang_T *slang, int c1, int c2)
3415{
3416 int m1, m2;
3417 char_u buf[MB_MAXBYTES + 1];
3418 hashitem_T *hi;
3419
3420 if (c1 >= 256)
3421 {
3422 buf[mb_char2bytes(c1, buf)] = 0;
3423 hi = hash_find(&slang->sl_map_hash, buf);
3424 if (HASHITEM_EMPTY(hi))
3425 m1 = 0;
3426 else
3427 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
3428 }
3429 else
3430 m1 = slang->sl_map_array[c1];
3431 if (m1 == 0)
3432 return FALSE;
3433
3434
3435 if (c2 >= 256)
3436 {
3437 buf[mb_char2bytes(c2, buf)] = 0;
3438 hi = hash_find(&slang->sl_map_hash, buf);
3439 if (HASHITEM_EMPTY(hi))
3440 m2 = 0;
3441 else
3442 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
3443 }
3444 else
3445 m2 = slang->sl_map_array[c2];
3446
3447 return m1 == m2;
3448}
3449
3450/*
3451 * Add a suggestion to the list of suggestions.
3452 * For a suggestion that is already in the list the lowest score is remembered.
3453 */
3454 static void
3455add_suggestion(
3456 suginfo_T *su,
3457 garray_T *gap, // either su_ga or su_sga
3458 char_u *goodword,
3459 int badlenarg, // len of bad word replaced with "goodword"
3460 int score,
3461 int altscore,
3462 int had_bonus, // value for st_had_bonus
3463 slang_T *slang, // language for sound folding
3464 int maxsf) // su_maxscore applies to soundfold score,
3465 // su_sfmaxscore to the total score.
3466{
3467 int goodlen; // len of goodword changed
3468 int badlen; // len of bad word changed
3469 suggest_T *stp;
3470 suggest_T new_sug;
3471 int i;
3472 char_u *pgood, *pbad;
3473
3474 // Minimize "badlen" for consistency. Avoids that changing "the the" to
3475 // "thee the" is added next to changing the first "the" the "thee".
3476 pgood = goodword + STRLEN(goodword);
3477 pbad = su->su_badptr + badlenarg;
3478 for (;;)
3479 {
3480 goodlen = (int)(pgood - goodword);
3481 badlen = (int)(pbad - su->su_badptr);
3482 if (goodlen <= 0 || badlen <= 0)
3483 break;
3484 MB_PTR_BACK(goodword, pgood);
3485 MB_PTR_BACK(su->su_badptr, pbad);
3486 if (has_mbyte)
3487 {
3488 if (mb_ptr2char(pgood) != mb_ptr2char(pbad))
3489 break;
3490 }
3491 else if (*pgood != *pbad)
3492 break;
3493 }
3494
3495 if (badlen == 0 && goodlen == 0)
3496 // goodword doesn't change anything; may happen for "the the" changing
3497 // the first "the" to itself.
3498 return;
3499
3500 if (gap->ga_len == 0)
3501 i = -1;
3502 else
3503 {
3504 // Check if the word is already there. Also check the length that is
3505 // being replaced "thes," -> "these" is a different suggestion from
3506 // "thes" -> "these".
3507 stp = &SUG(*gap, 0);
3508 for (i = gap->ga_len; --i >= 0; ++stp)
3509 if (stp->st_wordlen == goodlen
3510 && stp->st_orglen == badlen
3511 && STRNCMP(stp->st_word, goodword, goodlen) == 0)
3512 {
3513 // Found it. Remember the word with the lowest score.
3514 if (stp->st_slang == NULL)
3515 stp->st_slang = slang;
3516
3517 new_sug.st_score = score;
3518 new_sug.st_altscore = altscore;
3519 new_sug.st_had_bonus = had_bonus;
3520
3521 if (stp->st_had_bonus != had_bonus)
3522 {
3523 // Only one of the two had the soundalike score computed.
3524 // Need to do that for the other one now, otherwise the
3525 // scores can't be compared. This happens because
3526 // suggest_try_change() doesn't compute the soundalike
3527 // word to keep it fast, while some special methods set
3528 // the soundalike score to zero.
3529 if (had_bonus)
3530 rescore_one(su, stp);
3531 else
3532 {
3533 new_sug.st_word = stp->st_word;
3534 new_sug.st_wordlen = stp->st_wordlen;
3535 new_sug.st_slang = stp->st_slang;
3536 new_sug.st_orglen = badlen;
3537 rescore_one(su, &new_sug);
3538 }
3539 }
3540
3541 if (stp->st_score > new_sug.st_score)
3542 {
3543 stp->st_score = new_sug.st_score;
3544 stp->st_altscore = new_sug.st_altscore;
3545 stp->st_had_bonus = new_sug.st_had_bonus;
3546 }
3547 break;
3548 }
3549 }
3550
3551 if (i < 0 && ga_grow(gap, 1) == OK)
3552 {
3553 // Add a suggestion.
3554 stp = &SUG(*gap, gap->ga_len);
3555 stp->st_word = vim_strnsave(goodword, goodlen);
3556 if (stp->st_word != NULL)
3557 {
3558 stp->st_wordlen = goodlen;
3559 stp->st_score = score;
3560 stp->st_altscore = altscore;
3561 stp->st_had_bonus = had_bonus;
3562 stp->st_orglen = badlen;
3563 stp->st_slang = slang;
3564 ++gap->ga_len;
3565
3566 // If we have too many suggestions now, sort the list and keep
3567 // the best suggestions.
3568 if (gap->ga_len > SUG_MAX_COUNT(su))
3569 {
3570 if (maxsf)
3571 su->su_sfmaxscore = cleanup_suggestions(gap,
3572 su->su_sfmaxscore, SUG_CLEAN_COUNT(su));
3573 else
3574 su->su_maxscore = cleanup_suggestions(gap,
3575 su->su_maxscore, SUG_CLEAN_COUNT(su));
3576 }
3577 }
3578 }
3579}
3580
3581/*
3582 * Suggestions may in fact be flagged as errors. Esp. for banned words and
3583 * for split words, such as "the the". Remove these from the list here.
3584 */
3585 static void
3586check_suggestions(
3587 suginfo_T *su,
3588 garray_T *gap) // either su_ga or su_sga
3589{
3590 suggest_T *stp;
3591 int i;
3592 char_u longword[MAXWLEN + 1];
3593 int len;
3594 hlf_T attr;
3595
3596 stp = &SUG(*gap, 0);
3597 for (i = gap->ga_len - 1; i >= 0; --i)
3598 {
3599 // Need to append what follows to check for "the the".
3600 vim_strncpy(longword, stp[i].st_word, MAXWLEN);
3601 len = stp[i].st_wordlen;
3602 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen,
3603 MAXWLEN - len);
3604 attr = HLF_COUNT;
3605 (void)spell_check(curwin, longword, &attr, NULL, FALSE);
3606 if (attr != HLF_COUNT)
3607 {
3608 // Remove this entry.
3609 vim_free(stp[i].st_word);
3610 --gap->ga_len;
3611 if (i < gap->ga_len)
3612 mch_memmove(stp + i, stp + i + 1,
3613 sizeof(suggest_T) * (gap->ga_len - i));
3614 }
3615 }
3616}
3617
3618
3619/*
3620 * Add a word to be banned.
3621 */
3622 static void
3623add_banned(
3624 suginfo_T *su,
3625 char_u *word)
3626{
3627 char_u *s;
3628 hash_T hash;
3629 hashitem_T *hi;
3630
3631 hash = hash_hash(word);
3632 hi = hash_lookup(&su->su_banned, word, hash);
3633 if (HASHITEM_EMPTY(hi))
3634 {
3635 s = vim_strsave(word);
3636 if (s != NULL)
3637 hash_add_item(&su->su_banned, hi, s, hash);
3638 }
3639}
3640
3641/*
3642 * Recompute the score for all suggestions if sound-folding is possible. This
3643 * is slow, thus only done for the final results.
3644 */
3645 static void
3646rescore_suggestions(suginfo_T *su)
3647{
3648 int i;
3649
3650 if (su->su_sallang != NULL)
3651 for (i = 0; i < su->su_ga.ga_len; ++i)
3652 rescore_one(su, &SUG(su->su_ga, i));
3653}
3654
3655/*
3656 * Recompute the score for one suggestion if sound-folding is possible.
3657 */
3658 static void
3659rescore_one(suginfo_T *su, suggest_T *stp)
3660{
3661 slang_T *slang = stp->st_slang;
3662 char_u sal_badword[MAXWLEN];
3663 char_u *p;
3664
3665 // Only rescore suggestions that have no sal score yet and do have a
3666 // language.
3667 if (slang != NULL && slang->sl_sal.ga_len > 0 && !stp->st_had_bonus)
3668 {
3669 if (slang == su->su_sallang)
3670 p = su->su_sal_badword;
3671 else
3672 {
3673 spell_soundfold(slang, su->su_fbadword, TRUE, sal_badword);
3674 p = sal_badword;
3675 }
3676
3677 stp->st_altscore = stp_sal_score(stp, su, slang, p);
3678 if (stp->st_altscore == SCORE_MAXMAX)
3679 stp->st_altscore = SCORE_BIG;
3680 stp->st_score = RESCORE(stp->st_score, stp->st_altscore);
3681 stp->st_had_bonus = TRUE;
3682 }
3683}
3684
3685static int sug_compare(const void *s1, const void *s2);
3686
3687/*
3688 * Function given to qsort() to sort the suggestions on st_score.
3689 * First on "st_score", then "st_altscore" then alphabetically.
3690 */
3691 static int
3692sug_compare(const void *s1, const void *s2)
3693{
3694 suggest_T *p1 = (suggest_T *)s1;
3695 suggest_T *p2 = (suggest_T *)s2;
3696 int n = p1->st_score - p2->st_score;
3697
3698 if (n == 0)
3699 {
3700 n = p1->st_altscore - p2->st_altscore;
3701 if (n == 0)
3702 n = STRICMP(p1->st_word, p2->st_word);
3703 }
3704 return n;
3705}
3706
3707/*
3708 * Cleanup the suggestions:
3709 * - Sort on score.
3710 * - Remove words that won't be displayed.
3711 * Returns the maximum score in the list or "maxscore" unmodified.
3712 */
3713 static int
3714cleanup_suggestions(
3715 garray_T *gap,
3716 int maxscore,
3717 int keep) // nr of suggestions to keep
3718{
3719 suggest_T *stp = &SUG(*gap, 0);
3720 int i;
3721
3722 // Sort the list.
3723 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare);
3724
3725 // Truncate the list to the number of suggestions that will be displayed.
3726 if (gap->ga_len > keep)
3727 {
3728 for (i = keep; i < gap->ga_len; ++i)
3729 vim_free(stp[i].st_word);
3730 gap->ga_len = keep;
Bram Moolenaar569fea22019-12-25 13:55:24 +01003731 if (keep >= 1)
3732 return stp[keep - 1].st_score;
Bram Moolenaar46a426c2019-09-27 12:41:56 +02003733 }
3734 return maxscore;
3735}
3736
3737/*
3738 * Compute a score for two sound-a-like words.
3739 * This permits up to two inserts/deletes/swaps/etc. to keep things fast.
3740 * Instead of a generic loop we write out the code. That keeps it fast by
3741 * avoiding checks that will not be possible.
3742 */
3743 static int
3744soundalike_score(
3745 char_u *goodstart, // sound-folded good word
3746 char_u *badstart) // sound-folded bad word
3747{
3748 char_u *goodsound = goodstart;
3749 char_u *badsound = badstart;
3750 int goodlen;
3751 int badlen;
3752 int n;
3753 char_u *pl, *ps;
3754 char_u *pl2, *ps2;
3755 int score = 0;
3756
3757 // Adding/inserting "*" at the start (word starts with vowel) shouldn't be
3758 // counted so much, vowels halfway the word aren't counted at all.
3759 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound)
3760 {
3761 if ((badsound[0] == NUL && goodsound[1] == NUL)
3762 || (goodsound[0] == NUL && badsound[1] == NUL))
3763 // changing word with vowel to word without a sound
3764 return SCORE_DEL;
3765 if (badsound[0] == NUL || goodsound[0] == NUL)
3766 // more than two changes
3767 return SCORE_MAXMAX;
3768
3769 if (badsound[1] == goodsound[1]
3770 || (badsound[1] != NUL
3771 && goodsound[1] != NUL
3772 && badsound[2] == goodsound[2]))
3773 {
3774 // handle like a substitute
3775 }
3776 else
3777 {
3778 score = 2 * SCORE_DEL / 3;
3779 if (*badsound == '*')
3780 ++badsound;
3781 else
3782 ++goodsound;
3783 }
3784 }
3785
3786 goodlen = (int)STRLEN(goodsound);
3787 badlen = (int)STRLEN(badsound);
3788
3789 // Return quickly if the lengths are too different to be fixed by two
3790 // changes.
3791 n = goodlen - badlen;
3792 if (n < -2 || n > 2)
3793 return SCORE_MAXMAX;
3794
3795 if (n > 0)
3796 {
3797 pl = goodsound; // goodsound is longest
3798 ps = badsound;
3799 }
3800 else
3801 {
3802 pl = badsound; // badsound is longest
3803 ps = goodsound;
3804 }
3805
3806 // Skip over the identical part.
3807 while (*pl == *ps && *pl != NUL)
3808 {
3809 ++pl;
3810 ++ps;
3811 }
3812
3813 switch (n)
3814 {
3815 case -2:
3816 case 2:
3817 // Must delete two characters from "pl".
3818 ++pl; // first delete
3819 while (*pl == *ps)
3820 {
3821 ++pl;
3822 ++ps;
3823 }
3824 // strings must be equal after second delete
3825 if (STRCMP(pl + 1, ps) == 0)
3826 return score + SCORE_DEL * 2;
3827
3828 // Failed to compare.
3829 break;
3830
3831 case -1:
3832 case 1:
3833 // Minimal one delete from "pl" required.
3834
3835 // 1: delete
3836 pl2 = pl + 1;
3837 ps2 = ps;
3838 while (*pl2 == *ps2)
3839 {
3840 if (*pl2 == NUL) // reached the end
3841 return score + SCORE_DEL;
3842 ++pl2;
3843 ++ps2;
3844 }
3845
3846 // 2: delete then swap, then rest must be equal
3847 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3848 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3849 return score + SCORE_DEL + SCORE_SWAP;
3850
3851 // 3: delete then substitute, then the rest must be equal
3852 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3853 return score + SCORE_DEL + SCORE_SUBST;
3854
3855 // 4: first swap then delete
3856 if (pl[0] == ps[1] && pl[1] == ps[0])
3857 {
3858 pl2 = pl + 2; // swap, skip two chars
3859 ps2 = ps + 2;
3860 while (*pl2 == *ps2)
3861 {
3862 ++pl2;
3863 ++ps2;
3864 }
3865 // delete a char and then strings must be equal
3866 if (STRCMP(pl2 + 1, ps2) == 0)
3867 return score + SCORE_SWAP + SCORE_DEL;
3868 }
3869
3870 // 5: first substitute then delete
3871 pl2 = pl + 1; // substitute, skip one char
3872 ps2 = ps + 1;
3873 while (*pl2 == *ps2)
3874 {
3875 ++pl2;
3876 ++ps2;
3877 }
3878 // delete a char and then strings must be equal
3879 if (STRCMP(pl2 + 1, ps2) == 0)
3880 return score + SCORE_SUBST + SCORE_DEL;
3881
3882 // Failed to compare.
3883 break;
3884
3885 case 0:
3886 // Lengths are equal, thus changes must result in same length: An
3887 // insert is only possible in combination with a delete.
3888 // 1: check if for identical strings
3889 if (*pl == NUL)
3890 return score;
3891
3892 // 2: swap
3893 if (pl[0] == ps[1] && pl[1] == ps[0])
3894 {
3895 pl2 = pl + 2; // swap, skip two chars
3896 ps2 = ps + 2;
3897 while (*pl2 == *ps2)
3898 {
3899 if (*pl2 == NUL) // reached the end
3900 return score + SCORE_SWAP;
3901 ++pl2;
3902 ++ps2;
3903 }
3904 // 3: swap and swap again
3905 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3906 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3907 return score + SCORE_SWAP + SCORE_SWAP;
3908
3909 // 4: swap and substitute
3910 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3911 return score + SCORE_SWAP + SCORE_SUBST;
3912 }
3913
3914 // 5: substitute
3915 pl2 = pl + 1;
3916 ps2 = ps + 1;
3917 while (*pl2 == *ps2)
3918 {
3919 if (*pl2 == NUL) // reached the end
3920 return score + SCORE_SUBST;
3921 ++pl2;
3922 ++ps2;
3923 }
3924
3925 // 6: substitute and swap
3926 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3927 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3928 return score + SCORE_SUBST + SCORE_SWAP;
3929
3930 // 7: substitute and substitute
3931 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3932 return score + SCORE_SUBST + SCORE_SUBST;
3933
3934 // 8: insert then delete
3935 pl2 = pl;
3936 ps2 = ps + 1;
3937 while (*pl2 == *ps2)
3938 {
3939 ++pl2;
3940 ++ps2;
3941 }
3942 if (STRCMP(pl2 + 1, ps2) == 0)
3943 return score + SCORE_INS + SCORE_DEL;
3944
3945 // 9: delete then insert
3946 pl2 = pl + 1;
3947 ps2 = ps;
3948 while (*pl2 == *ps2)
3949 {
3950 ++pl2;
3951 ++ps2;
3952 }
3953 if (STRCMP(pl2, ps2 + 1) == 0)
3954 return score + SCORE_INS + SCORE_DEL;
3955
3956 // Failed to compare.
3957 break;
3958 }
3959
3960 return SCORE_MAXMAX;
3961}
3962
3963/*
3964 * Compute the "edit distance" to turn "badword" into "goodword". The less
3965 * deletes/inserts/substitutes/swaps are required the lower the score.
3966 *
3967 * The algorithm is described by Du and Chang, 1992.
3968 * The implementation of the algorithm comes from Aspell editdist.cpp,
3969 * edit_distance(). It has been converted from C++ to C and modified to
3970 * support multi-byte characters.
3971 */
3972 static int
3973spell_edit_score(
3974 slang_T *slang,
3975 char_u *badword,
3976 char_u *goodword)
3977{
3978 int *cnt;
3979 int badlen, goodlen; // lengths including NUL
3980 int j, i;
3981 int t;
3982 int bc, gc;
3983 int pbc, pgc;
3984 char_u *p;
3985 int wbadword[MAXWLEN];
3986 int wgoodword[MAXWLEN];
3987
3988 if (has_mbyte)
3989 {
3990 // Get the characters from the multi-byte strings and put them in an
3991 // int array for easy access.
3992 for (p = badword, badlen = 0; *p != NUL; )
3993 wbadword[badlen++] = mb_cptr2char_adv(&p);
3994 wbadword[badlen++] = 0;
3995 for (p = goodword, goodlen = 0; *p != NUL; )
3996 wgoodword[goodlen++] = mb_cptr2char_adv(&p);
3997 wgoodword[goodlen++] = 0;
3998 }
3999 else
4000 {
4001 badlen = (int)STRLEN(badword) + 1;
4002 goodlen = (int)STRLEN(goodword) + 1;
4003 }
4004
4005 // We use "cnt" as an array: CNT(badword_idx, goodword_idx).
4006#define CNT(a, b) cnt[(a) + (b) * (badlen + 1)]
4007 cnt = ALLOC_MULT(int, (badlen + 1) * (goodlen + 1));
4008 if (cnt == NULL)
4009 return 0; // out of memory
4010
4011 CNT(0, 0) = 0;
4012 for (j = 1; j <= goodlen; ++j)
4013 CNT(0, j) = CNT(0, j - 1) + SCORE_INS;
4014
4015 for (i = 1; i <= badlen; ++i)
4016 {
4017 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL;
4018 for (j = 1; j <= goodlen; ++j)
4019 {
4020 if (has_mbyte)
4021 {
4022 bc = wbadword[i - 1];
4023 gc = wgoodword[j - 1];
4024 }
4025 else
4026 {
4027 bc = badword[i - 1];
4028 gc = goodword[j - 1];
4029 }
4030 if (bc == gc)
4031 CNT(i, j) = CNT(i - 1, j - 1);
4032 else
4033 {
4034 // Use a better score when there is only a case difference.
4035 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4036 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
4037 else
4038 {
4039 // For a similar character use SCORE_SIMILAR.
4040 if (slang != NULL
4041 && slang->sl_has_map
4042 && similar_chars(slang, gc, bc))
4043 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1);
4044 else
4045 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
4046 }
4047
4048 if (i > 1 && j > 1)
4049 {
4050 if (has_mbyte)
4051 {
4052 pbc = wbadword[i - 2];
4053 pgc = wgoodword[j - 2];
4054 }
4055 else
4056 {
4057 pbc = badword[i - 2];
4058 pgc = goodword[j - 2];
4059 }
4060 if (bc == pgc && pbc == gc)
4061 {
4062 t = SCORE_SWAP + CNT(i - 2, j - 2);
4063 if (t < CNT(i, j))
4064 CNT(i, j) = t;
4065 }
4066 }
4067 t = SCORE_DEL + CNT(i - 1, j);
4068 if (t < CNT(i, j))
4069 CNT(i, j) = t;
4070 t = SCORE_INS + CNT(i, j - 1);
4071 if (t < CNT(i, j))
4072 CNT(i, j) = t;
4073 }
4074 }
4075 }
4076
4077 i = CNT(badlen - 1, goodlen - 1);
4078 vim_free(cnt);
4079 return i;
4080}
4081
4082typedef struct
4083{
4084 int badi;
4085 int goodi;
4086 int score;
4087} limitscore_T;
4088
4089/*
4090 * Like spell_edit_score(), but with a limit on the score to make it faster.
4091 * May return SCORE_MAXMAX when the score is higher than "limit".
4092 *
4093 * This uses a stack for the edits still to be tried.
4094 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support
4095 * for multi-byte characters.
4096 */
4097 static int
4098spell_edit_score_limit(
4099 slang_T *slang,
4100 char_u *badword,
4101 char_u *goodword,
4102 int limit)
4103{
4104 limitscore_T stack[10]; // allow for over 3 * 2 edits
4105 int stackidx;
4106 int bi, gi;
4107 int bi2, gi2;
4108 int bc, gc;
4109 int score;
4110 int score_off;
4111 int minscore;
4112 int round;
4113
4114 // Multi-byte characters require a bit more work, use a different function
4115 // to avoid testing "has_mbyte" quite often.
4116 if (has_mbyte)
4117 return spell_edit_score_limit_w(slang, badword, goodword, limit);
4118
4119 // The idea is to go from start to end over the words. So long as
4120 // characters are equal just continue, this always gives the lowest score.
4121 // When there is a difference try several alternatives. Each alternative
4122 // increases "score" for the edit distance. Some of the alternatives are
4123 // pushed unto a stack and tried later, some are tried right away. At the
4124 // end of the word the score for one alternative is known. The lowest
4125 // possible score is stored in "minscore".
4126 stackidx = 0;
4127 bi = 0;
4128 gi = 0;
4129 score = 0;
4130 minscore = limit + 1;
4131
4132 for (;;)
4133 {
4134 // Skip over an equal part, score remains the same.
4135 for (;;)
4136 {
4137 bc = badword[bi];
4138 gc = goodword[gi];
4139 if (bc != gc) // stop at a char that's different
4140 break;
4141 if (bc == NUL) // both words end
4142 {
4143 if (score < minscore)
4144 minscore = score;
4145 goto pop; // do next alternative
4146 }
4147 ++bi;
4148 ++gi;
4149 }
4150
4151 if (gc == NUL) // goodword ends, delete badword chars
4152 {
4153 do
4154 {
4155 if ((score += SCORE_DEL) >= minscore)
4156 goto pop; // do next alternative
4157 } while (badword[++bi] != NUL);
4158 minscore = score;
4159 }
4160 else if (bc == NUL) // badword ends, insert badword chars
4161 {
4162 do
4163 {
4164 if ((score += SCORE_INS) >= minscore)
4165 goto pop; // do next alternative
4166 } while (goodword[++gi] != NUL);
4167 minscore = score;
4168 }
4169 else // both words continue
4170 {
4171 // If not close to the limit, perform a change. Only try changes
4172 // that may lead to a lower score than "minscore".
4173 // round 0: try deleting a char from badword
4174 // round 1: try inserting a char in badword
4175 for (round = 0; round <= 1; ++round)
4176 {
4177 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
4178 if (score_off < minscore)
4179 {
4180 if (score_off + SCORE_EDIT_MIN >= minscore)
4181 {
4182 // Near the limit, rest of the words must match. We
4183 // can check that right now, no need to push an item
4184 // onto the stack.
4185 bi2 = bi + 1 - round;
4186 gi2 = gi + round;
4187 while (goodword[gi2] == badword[bi2])
4188 {
4189 if (goodword[gi2] == NUL)
4190 {
4191 minscore = score_off;
4192 break;
4193 }
4194 ++bi2;
4195 ++gi2;
4196 }
4197 }
4198 else
4199 {
4200 // try deleting/inserting a character later
4201 stack[stackidx].badi = bi + 1 - round;
4202 stack[stackidx].goodi = gi + round;
4203 stack[stackidx].score = score_off;
4204 ++stackidx;
4205 }
4206 }
4207 }
4208
4209 if (score + SCORE_SWAP < minscore)
4210 {
4211 // If swapping two characters makes a match then the
4212 // substitution is more expensive, thus there is no need to
4213 // try both.
4214 if (gc == badword[bi + 1] && bc == goodword[gi + 1])
4215 {
4216 // Swap two characters, that is: skip them.
4217 gi += 2;
4218 bi += 2;
4219 score += SCORE_SWAP;
4220 continue;
4221 }
4222 }
4223
4224 // Substitute one character for another which is the same
4225 // thing as deleting a character from both goodword and badword.
4226 // Use a better score when there is only a case difference.
4227 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4228 score += SCORE_ICASE;
4229 else
4230 {
4231 // For a similar character use SCORE_SIMILAR.
4232 if (slang != NULL
4233 && slang->sl_has_map
4234 && similar_chars(slang, gc, bc))
4235 score += SCORE_SIMILAR;
4236 else
4237 score += SCORE_SUBST;
4238 }
4239
4240 if (score < minscore)
4241 {
4242 // Do the substitution.
4243 ++gi;
4244 ++bi;
4245 continue;
4246 }
4247 }
4248pop:
4249 // Get here to try the next alternative, pop it from the stack.
4250 if (stackidx == 0) // stack is empty, finished
4251 break;
4252
4253 // pop an item from the stack
4254 --stackidx;
4255 gi = stack[stackidx].goodi;
4256 bi = stack[stackidx].badi;
4257 score = stack[stackidx].score;
4258 }
4259
4260 // When the score goes over "limit" it may actually be much higher.
4261 // Return a very large number to avoid going below the limit when giving a
4262 // bonus.
4263 if (minscore > limit)
4264 return SCORE_MAXMAX;
4265 return minscore;
4266}
4267
4268/*
4269 * Multi-byte version of spell_edit_score_limit().
4270 * Keep it in sync with the above!
4271 */
4272 static int
4273spell_edit_score_limit_w(
4274 slang_T *slang,
4275 char_u *badword,
4276 char_u *goodword,
4277 int limit)
4278{
4279 limitscore_T stack[10]; // allow for over 3 * 2 edits
4280 int stackidx;
4281 int bi, gi;
4282 int bi2, gi2;
4283 int bc, gc;
4284 int score;
4285 int score_off;
4286 int minscore;
4287 int round;
4288 char_u *p;
4289 int wbadword[MAXWLEN];
4290 int wgoodword[MAXWLEN];
4291
4292 // Get the characters from the multi-byte strings and put them in an
4293 // int array for easy access.
4294 bi = 0;
4295 for (p = badword; *p != NUL; )
4296 wbadword[bi++] = mb_cptr2char_adv(&p);
4297 wbadword[bi++] = 0;
4298 gi = 0;
4299 for (p = goodword; *p != NUL; )
4300 wgoodword[gi++] = mb_cptr2char_adv(&p);
4301 wgoodword[gi++] = 0;
4302
4303 // The idea is to go from start to end over the words. So long as
4304 // characters are equal just continue, this always gives the lowest score.
4305 // When there is a difference try several alternatives. Each alternative
4306 // increases "score" for the edit distance. Some of the alternatives are
4307 // pushed unto a stack and tried later, some are tried right away. At the
4308 // end of the word the score for one alternative is known. The lowest
4309 // possible score is stored in "minscore".
4310 stackidx = 0;
4311 bi = 0;
4312 gi = 0;
4313 score = 0;
4314 minscore = limit + 1;
4315
4316 for (;;)
4317 {
4318 // Skip over an equal part, score remains the same.
4319 for (;;)
4320 {
4321 bc = wbadword[bi];
4322 gc = wgoodword[gi];
4323
4324 if (bc != gc) // stop at a char that's different
4325 break;
4326 if (bc == NUL) // both words end
4327 {
4328 if (score < minscore)
4329 minscore = score;
4330 goto pop; // do next alternative
4331 }
4332 ++bi;
4333 ++gi;
4334 }
4335
4336 if (gc == NUL) // goodword ends, delete badword chars
4337 {
4338 do
4339 {
4340 if ((score += SCORE_DEL) >= minscore)
4341 goto pop; // do next alternative
4342 } while (wbadword[++bi] != NUL);
4343 minscore = score;
4344 }
4345 else if (bc == NUL) // badword ends, insert badword chars
4346 {
4347 do
4348 {
4349 if ((score += SCORE_INS) >= minscore)
4350 goto pop; // do next alternative
4351 } while (wgoodword[++gi] != NUL);
4352 minscore = score;
4353 }
4354 else // both words continue
4355 {
4356 // If not close to the limit, perform a change. Only try changes
4357 // that may lead to a lower score than "minscore".
4358 // round 0: try deleting a char from badword
4359 // round 1: try inserting a char in badword
4360 for (round = 0; round <= 1; ++round)
4361 {
4362 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
4363 if (score_off < minscore)
4364 {
4365 if (score_off + SCORE_EDIT_MIN >= minscore)
4366 {
4367 // Near the limit, rest of the words must match. We
4368 // can check that right now, no need to push an item
4369 // onto the stack.
4370 bi2 = bi + 1 - round;
4371 gi2 = gi + round;
4372 while (wgoodword[gi2] == wbadword[bi2])
4373 {
4374 if (wgoodword[gi2] == NUL)
4375 {
4376 minscore = score_off;
4377 break;
4378 }
4379 ++bi2;
4380 ++gi2;
4381 }
4382 }
4383 else
4384 {
4385 // try deleting a character from badword later
4386 stack[stackidx].badi = bi + 1 - round;
4387 stack[stackidx].goodi = gi + round;
4388 stack[stackidx].score = score_off;
4389 ++stackidx;
4390 }
4391 }
4392 }
4393
4394 if (score + SCORE_SWAP < minscore)
4395 {
4396 // If swapping two characters makes a match then the
4397 // substitution is more expensive, thus there is no need to
4398 // try both.
4399 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1])
4400 {
4401 // Swap two characters, that is: skip them.
4402 gi += 2;
4403 bi += 2;
4404 score += SCORE_SWAP;
4405 continue;
4406 }
4407 }
4408
4409 // Substitute one character for another which is the same
4410 // thing as deleting a character from both goodword and badword.
4411 // Use a better score when there is only a case difference.
4412 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4413 score += SCORE_ICASE;
4414 else
4415 {
4416 // For a similar character use SCORE_SIMILAR.
4417 if (slang != NULL
4418 && slang->sl_has_map
4419 && similar_chars(slang, gc, bc))
4420 score += SCORE_SIMILAR;
4421 else
4422 score += SCORE_SUBST;
4423 }
4424
4425 if (score < minscore)
4426 {
4427 // Do the substitution.
4428 ++gi;
4429 ++bi;
4430 continue;
4431 }
4432 }
4433pop:
4434 // Get here to try the next alternative, pop it from the stack.
4435 if (stackidx == 0) // stack is empty, finished
4436 break;
4437
4438 // pop an item from the stack
4439 --stackidx;
4440 gi = stack[stackidx].goodi;
4441 bi = stack[stackidx].badi;
4442 score = stack[stackidx].score;
4443 }
4444
4445 // When the score goes over "limit" it may actually be much higher.
4446 // Return a very large number to avoid going below the limit when giving a
4447 // bonus.
4448 if (minscore > limit)
4449 return SCORE_MAXMAX;
4450 return minscore;
4451}
4452#endif // FEAT_SPELL