blob: b2b402919fb44f671a069e71cd096b39e613192d [file] [log] [blame]
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10/*
11 * spellsuggest.c: functions for spelling suggestions
12 */
13
14#include "vim.h"
15
16#if defined(FEAT_SPELL) || defined(PROTO)
17
18/*
19 * Use this to adjust the score after finding suggestions, based on the
20 * suggested word sounding like the bad word. This is much faster than doing
21 * it for every possible suggestion.
22 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@"
23 * vs "ht") and goes down in the list.
24 * Used when 'spellsuggest' is set to "best".
25 */
26#define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4)
27
28/*
29 * Do the opposite: based on a maximum end score and a known sound score,
30 * compute the maximum word score that can be used.
31 */
32#define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3)
33
34// only used for su_badflags
35#define WF_MIXCAP 0x20 // mix of upper and lower case: macaRONI
36
37/*
38 * Information used when looking for suggestions.
39 */
40typedef struct suginfo_S
41{
42 garray_T su_ga; // suggestions, contains "suggest_T"
43 int su_maxcount; // max. number of suggestions displayed
44 int su_maxscore; // maximum score for adding to su_ga
45 int su_sfmaxscore; // idem, for when doing soundfold words
46 garray_T su_sga; // like su_ga, sound-folded scoring
47 char_u *su_badptr; // start of bad word in line
48 int su_badlen; // length of detected bad word in line
49 int su_badflags; // caps flags for bad word
50 char_u su_badword[MAXWLEN]; // bad word truncated at su_badlen
51 char_u su_fbadword[MAXWLEN]; // su_badword case-folded
52 char_u su_sal_badword[MAXWLEN]; // su_badword soundfolded
53 hashtab_T su_banned; // table with banned words
54 slang_T *su_sallang; // default language for sound folding
55} suginfo_T;
56
57// One word suggestion. Used in "si_ga".
58typedef struct suggest_S
59{
60 char_u *st_word; // suggested word, allocated string
61 int st_wordlen; // STRLEN(st_word)
62 int st_orglen; // length of replaced text
63 int st_score; // lower is better
64 int st_altscore; // used when st_score compares equal
65 int st_salscore; // st_score is for soundalike
66 int st_had_bonus; // bonus already included in score
67 slang_T *st_slang; // language used for sound folding
68} suggest_T;
69
70#define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
71
72// TRUE if a word appears in the list of banned words.
73#define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word)))
74
75// Number of suggestions kept when cleaning up. We need to keep more than
76// what is displayed, because when rescore_suggestions() is called the score
77// may change and wrong suggestions may be removed later.
78#define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20)
79
80// Threshold for sorting and cleaning up suggestions. Don't want to keep lots
81// of suggestions that are not going to be displayed.
82#define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50)
83
84// score for various changes
85#define SCORE_SPLIT 149 // split bad word
86#define SCORE_SPLIT_NO 249 // split bad word with NOSPLITSUGS
87#define SCORE_ICASE 52 // slightly different case
88#define SCORE_REGION 200 // word is for different region
89#define SCORE_RARE 180 // rare word
90#define SCORE_SWAP 75 // swap two characters
91#define SCORE_SWAP3 110 // swap two characters in three
92#define SCORE_REP 65 // REP replacement
93#define SCORE_SUBST 93 // substitute a character
94#define SCORE_SIMILAR 33 // substitute a similar character
95#define SCORE_SUBCOMP 33 // substitute a composing character
96#define SCORE_DEL 94 // delete a character
97#define SCORE_DELDUP 66 // delete a duplicated character
98#define SCORE_DELCOMP 28 // delete a composing character
99#define SCORE_INS 96 // insert a character
100#define SCORE_INSDUP 67 // insert a duplicate character
101#define SCORE_INSCOMP 30 // insert a composing character
102#define SCORE_NONWORD 103 // change non-word to word char
103
104#define SCORE_FILE 30 // suggestion from a file
105#define SCORE_MAXINIT 350 // Initial maximum score: higher == slower.
106 // 350 allows for about three changes.
107
108#define SCORE_COMMON1 30 // subtracted for words seen before
109#define SCORE_COMMON2 40 // subtracted for words often seen
110#define SCORE_COMMON3 50 // subtracted for words very often seen
111#define SCORE_THRES2 10 // word count threshold for COMMON2
112#define SCORE_THRES3 100 // word count threshold for COMMON3
113
114// When trying changed soundfold words it becomes slow when trying more than
Dominique Pelleaf4a61a2021-12-27 17:21:41 +0000115// two changes. With less than two changes it's slightly faster but we miss a
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200116// few good suggestions. In rare cases we need to try three of four changes.
117#define SCORE_SFMAX1 200 // maximum score for first try
118#define SCORE_SFMAX2 300 // maximum score for second try
119#define SCORE_SFMAX3 400 // maximum score for third try
120
121#define SCORE_BIG SCORE_INS * 3 // big difference
122#define SCORE_MAXMAX 999999 // accept any score
123#define SCORE_LIMITMAX 350 // for spell_edit_score_limit()
124
125// for spell_edit_score_limit() we need to know the minimum value of
126// SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS
127#define SCORE_EDIT_MIN SCORE_SIMILAR
128
129/*
130 * For finding suggestions: At each node in the tree these states are tried:
131 */
132typedef enum
133{
134 STATE_START = 0, // At start of node check for NUL bytes (goodword
135 // ends); if badword ends there is a match, otherwise
136 // try splitting word.
137 STATE_NOPREFIX, // try without prefix
138 STATE_SPLITUNDO, // Undo splitting.
139 STATE_ENDNUL, // Past NUL bytes at start of the node.
140 STATE_PLAIN, // Use each byte of the node.
141 STATE_DEL, // Delete a byte from the bad word.
142 STATE_INS_PREP, // Prepare for inserting bytes.
143 STATE_INS, // Insert a byte in the bad word.
144 STATE_SWAP, // Swap two bytes.
145 STATE_UNSWAP, // Undo swap two characters.
146 STATE_SWAP3, // Swap two characters over three.
147 STATE_UNSWAP3, // Undo Swap two characters over three.
148 STATE_UNROT3L, // Undo rotate three characters left
149 STATE_UNROT3R, // Undo rotate three characters right
150 STATE_REP_INI, // Prepare for using REP items.
151 STATE_REP, // Use matching REP items from the .aff file.
152 STATE_REP_UNDO, // Undo a REP item replacement.
153 STATE_FINAL // End of this node.
154} state_T;
155
156/*
157 * Struct to keep the state at each level in suggest_try_change().
158 */
159typedef struct trystate_S
160{
161 state_T ts_state; // state at this level, STATE_
162 int ts_score; // score
163 idx_T ts_arridx; // index in tree array, start of node
164 short ts_curi; // index in list of child nodes
165 char_u ts_fidx; // index in fword[], case-folded bad word
166 char_u ts_fidxtry; // ts_fidx at which bytes may be changed
167 char_u ts_twordlen; // valid length of tword[]
168 char_u ts_prefixdepth; // stack depth for end of prefix or
169 // PFD_PREFIXTREE or PFD_NOPREFIX
170 char_u ts_flags; // TSF_ flags
171 char_u ts_tcharlen; // number of bytes in tword character
172 char_u ts_tcharidx; // current byte index in tword character
173 char_u ts_isdiff; // DIFF_ values
174 char_u ts_fcharstart; // index in fword where badword char started
175 char_u ts_prewordlen; // length of word in "preword[]"
176 char_u ts_splitoff; // index in "tword" after last split
177 char_u ts_splitfidx; // "ts_fidx" at word split
178 char_u ts_complen; // nr of compound words used
179 char_u ts_compsplit; // index for "compflags" where word was spit
180 char_u ts_save_badflags; // su_badflags saved here
181 char_u ts_delidx; // index in fword for char that was deleted,
182 // valid when "ts_flags" has TSF_DIDDEL
183} trystate_T;
184
185// values for ts_isdiff
186#define DIFF_NONE 0 // no different byte (yet)
187#define DIFF_YES 1 // different byte found
188#define DIFF_INSERT 2 // inserting character
189
190// values for ts_flags
191#define TSF_PREFIXOK 1 // already checked that prefix is OK
192#define TSF_DIDSPLIT 2 // tried split at this point
193#define TSF_DIDDEL 4 // did a delete, "ts_delidx" has index
194
195// special values ts_prefixdepth
196#define PFD_NOPREFIX 0xff // not using prefixes
197#define PFD_PREFIXTREE 0xfe // walking through the prefix tree
198#define PFD_NOTSPECIAL 0xfd // highest value that's not special
199
200static void spell_find_suggest(char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive);
201#ifdef FEAT_EVAL
202static void spell_suggest_expr(suginfo_T *su, char_u *expr);
203#endif
204static void spell_suggest_file(suginfo_T *su, char_u *fname);
205static void spell_suggest_intern(suginfo_T *su, int interactive);
206static void spell_find_cleanup(suginfo_T *su);
207static void suggest_try_special(suginfo_T *su);
208static void suggest_try_change(suginfo_T *su);
209static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, int soundfold);
210static void go_deeper(trystate_T *stack, int depth, int score_add);
211static void find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword);
212static void score_comp_sal(suginfo_T *su);
213static void score_combine(suginfo_T *su);
214static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound);
215static void suggest_try_soundalike_prep(void);
216static void suggest_try_soundalike(suginfo_T *su);
217static void suggest_try_soundalike_finish(void);
218static void add_sound_suggest(suginfo_T *su, char_u *goodword, int score, langp_T *lp);
219static int soundfold_find(slang_T *slang, char_u *word);
220static int similar_chars(slang_T *slang, int c1, int c2);
221static void add_suggestion(suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf);
222static void check_suggestions(suginfo_T *su, garray_T *gap);
223static void add_banned(suginfo_T *su, char_u *word);
224static void rescore_suggestions(suginfo_T *su);
225static void rescore_one(suginfo_T *su, suggest_T *stp);
226static int cleanup_suggestions(garray_T *gap, int maxscore, int keep);
227static int soundalike_score(char_u *goodsound, char_u *badsound);
228static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword);
229static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit);
230static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit);
231
232/*
233 * Return TRUE when the sequence of flags in "compflags" plus "flag" can
234 * possibly form a valid compounded word. This also checks the COMPOUNDRULE
235 * lines if they don't contain wildcards.
236 */
237 static int
238can_be_compound(
239 trystate_T *sp,
240 slang_T *slang,
241 char_u *compflags,
242 int flag)
243{
244 // If the flag doesn't appear in sl_compstartflags or sl_compallflags
245 // then it can't possibly compound.
246 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit
247 ? slang->sl_compstartflags : slang->sl_compallflags, flag))
248 return FALSE;
249
250 // If there are no wildcards, we can check if the flags collected so far
251 // possibly can form a match with COMPOUNDRULE patterns. This only
252 // makes sense when we have two or more words.
253 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit)
254 {
255 int v;
256
257 compflags[sp->ts_complen] = flag;
258 compflags[sp->ts_complen + 1] = NUL;
259 v = match_compoundrule(slang, compflags + sp->ts_compsplit);
260 compflags[sp->ts_complen] = NUL;
261 return v;
262 }
263
264 return TRUE;
265}
266
267/*
268 * Adjust the score of common words.
269 */
270 static int
271score_wordcount_adj(
272 slang_T *slang,
273 int score,
274 char_u *word,
275 int split) // word was split, less bonus
276{
277 hashitem_T *hi;
278 wordcount_T *wc;
279 int bonus;
280 int newscore;
281
282 hi = hash_find(&slang->sl_wordcount, word);
283 if (!HASHITEM_EMPTY(hi))
284 {
285 wc = HI2WC(hi);
286 if (wc->wc_count < SCORE_THRES2)
287 bonus = SCORE_COMMON1;
288 else if (wc->wc_count < SCORE_THRES3)
289 bonus = SCORE_COMMON2;
290 else
291 bonus = SCORE_COMMON3;
292 if (split)
293 newscore = score - bonus / 2;
294 else
295 newscore = score - bonus;
296 if (newscore < 0)
297 return 0;
298 return newscore;
299 }
300 return score;
301}
302
303/*
304 * Like captype() but for a KEEPCAP word add ONECAP if the word starts with a
305 * capital. So that make_case_word() can turn WOrd into Word.
306 * Add ALLCAP for "WOrD".
307 */
308 static int
309badword_captype(char_u *word, char_u *end)
310{
311 int flags = captype(word, end);
312 int c;
313 int l, u;
314 int first;
315 char_u *p;
316
317 if (flags & WF_KEEPCAP)
318 {
319 // Count the number of UPPER and lower case letters.
320 l = u = 0;
321 first = FALSE;
322 for (p = word; p < end; MB_PTR_ADV(p))
323 {
324 c = PTR2CHAR(p);
325 if (SPELL_ISUPPER(c))
326 {
327 ++u;
328 if (p == word)
329 first = TRUE;
330 }
331 else
332 ++l;
333 }
334
335 // If there are more UPPER than lower case letters suggest an
336 // ALLCAP word. Otherwise, if the first letter is UPPER then
337 // suggest ONECAP. Exception: "ALl" most likely should be "All",
338 // require three upper case letters.
339 if (u > l && u > 2)
340 flags |= WF_ALLCAP;
341 else if (first)
342 flags |= WF_ONECAP;
343
344 if (u >= 2 && l >= 2) // maCARONI maCAroni
345 flags |= WF_MIXCAP;
346 }
347 return flags;
348}
349
350/*
351 * Opposite of offset2bytes().
352 * "pp" points to the bytes and is advanced over it.
353 * Returns the offset.
354 */
355 static int
356bytes2offset(char_u **pp)
357{
358 char_u *p = *pp;
359 int nr;
360 int c;
361
362 c = *p++;
363 if ((c & 0x80) == 0x00) // 1 byte
364 {
365 nr = c - 1;
366 }
367 else if ((c & 0xc0) == 0x80) // 2 bytes
368 {
369 nr = (c & 0x3f) - 1;
370 nr = nr * 255 + (*p++ - 1);
371 }
372 else if ((c & 0xe0) == 0xc0) // 3 bytes
373 {
374 nr = (c & 0x1f) - 1;
375 nr = nr * 255 + (*p++ - 1);
376 nr = nr * 255 + (*p++ - 1);
377 }
378 else // 4 bytes
379 {
380 nr = (c & 0x0f) - 1;
381 nr = nr * 255 + (*p++ - 1);
382 nr = nr * 255 + (*p++ - 1);
383 nr = nr * 255 + (*p++ - 1);
384 }
385
386 *pp = p;
387 return nr;
388}
389
390// values for sps_flags
391#define SPS_BEST 1
392#define SPS_FAST 2
393#define SPS_DOUBLE 4
394
395static int sps_flags = SPS_BEST; // flags from 'spellsuggest'
396static int sps_limit = 9999; // max nr of suggestions given
397
398/*
399 * Check the 'spellsuggest' option. Return FAIL if it's wrong.
400 * Sets "sps_flags" and "sps_limit".
401 */
402 int
403spell_check_sps(void)
404{
405 char_u *p;
406 char_u *s;
407 char_u buf[MAXPATHL];
408 int f;
409
410 sps_flags = 0;
411 sps_limit = 9999;
412
413 for (p = p_sps; *p != NUL; )
414 {
415 copy_option_part(&p, buf, MAXPATHL, ",");
416
417 f = 0;
418 if (VIM_ISDIGIT(*buf))
419 {
420 s = buf;
421 sps_limit = getdigits(&s);
422 if (*s != NUL && !VIM_ISDIGIT(*s))
423 f = -1;
424 }
425 else if (STRCMP(buf, "best") == 0)
426 f = SPS_BEST;
427 else if (STRCMP(buf, "fast") == 0)
428 f = SPS_FAST;
429 else if (STRCMP(buf, "double") == 0)
430 f = SPS_DOUBLE;
431 else if (STRNCMP(buf, "expr:", 5) != 0
432 && STRNCMP(buf, "file:", 5) != 0)
433 f = -1;
434
435 if (f == -1 || (sps_flags != 0 && f != 0))
436 {
437 sps_flags = SPS_BEST;
438 sps_limit = 9999;
439 return FAIL;
440 }
441 if (f != 0)
442 sps_flags = f;
443 }
444
445 if (sps_flags == 0)
446 sps_flags = SPS_BEST;
447
448 return OK;
449}
450
451/*
452 * "z=": Find badly spelled word under or after the cursor.
453 * Give suggestions for the properly spelled word.
454 * In Visual mode use the highlighted word as the bad word.
455 * When "count" is non-zero use that suggestion.
456 */
457 void
458spell_suggest(int count)
459{
460 char_u *line;
461 pos_T prev_cursor = curwin->w_cursor;
462 char_u wcopy[MAXWLEN + 2];
463 char_u *p;
464 int i;
465 int c;
466 suginfo_T sug;
467 suggest_T *stp;
468 int mouse_used;
469 int need_cap;
470 int limit;
471 int selected = count;
472 int badlen = 0;
473 int msg_scroll_save = msg_scroll;
Bram Moolenaar152e79e2020-06-10 15:32:08 +0200474 int wo_spell_save = curwin->w_p_spell;
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200475
Bram Moolenaar152e79e2020-06-10 15:32:08 +0200476 if (!curwin->w_p_spell)
477 {
478 did_set_spelllang(curwin);
479 curwin->w_p_spell = TRUE;
480 }
481
482 if (*curwin->w_s->b_p_spl == NUL)
483 {
Bram Moolenaar460ae5d2022-01-01 14:19:49 +0000484 emsg(_(e_spell_checking_is_not_possible));
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200485 return;
Bram Moolenaar152e79e2020-06-10 15:32:08 +0200486 }
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200487
488 if (VIsual_active)
489 {
490 // Use the Visually selected text as the bad word. But reject
491 // a multi-line selection.
492 if (curwin->w_cursor.lnum != VIsual.lnum)
493 {
494 vim_beep(BO_SPELL);
495 return;
496 }
497 badlen = (int)curwin->w_cursor.col - (int)VIsual.col;
498 if (badlen < 0)
499 badlen = -badlen;
500 else
501 curwin->w_cursor.col = VIsual.col;
502 ++badlen;
503 end_visual_mode();
504 }
505 // Find the start of the badly spelled word.
506 else if (spell_move_to(curwin, FORWARD, TRUE, TRUE, NULL) == 0
507 || curwin->w_cursor.col > prev_cursor.col)
508 {
509 // No bad word or it starts after the cursor: use the word under the
510 // cursor.
511 curwin->w_cursor = prev_cursor;
512 line = ml_get_curline();
513 p = line + curwin->w_cursor.col;
514 // Backup to before start of word.
515 while (p > line && spell_iswordp_nmw(p, curwin))
516 MB_PTR_BACK(line, p);
517 // Forward to start of word.
518 while (*p != NUL && !spell_iswordp_nmw(p, curwin))
519 MB_PTR_ADV(p);
520
521 if (!spell_iswordp_nmw(p, curwin)) // No word found.
522 {
523 beep_flush();
524 return;
525 }
526 curwin->w_cursor.col = (colnr_T)(p - line);
527 }
528
529 // Get the word and its length.
530
531 // Figure out if the word should be capitalised.
532 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col);
533
534 // Make a copy of current line since autocommands may free the line.
535 line = vim_strsave(ml_get_curline());
536 if (line == NULL)
537 goto skip;
538
539 // Get the list of suggestions. Limit to 'lines' - 2 or the number in
540 // 'spellsuggest', whatever is smaller.
541 if (sps_limit > (int)Rows - 2)
542 limit = (int)Rows - 2;
543 else
544 limit = sps_limit;
545 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit,
546 TRUE, need_cap, TRUE);
547
548 if (sug.su_ga.ga_len == 0)
549 msg(_("Sorry, no suggestions"));
550 else if (count > 0)
551 {
552 if (count > sug.su_ga.ga_len)
Bram Moolenaar6c52f822019-12-25 14:13:03 +0100553 smsg(_("Sorry, only %ld suggestions"), (long)sug.su_ga.ga_len);
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200554 }
555 else
556 {
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200557#ifdef FEAT_RIGHTLEFT
558 // When 'rightleft' is set the list is drawn right-left.
559 cmdmsg_rl = curwin->w_p_rl;
560 if (cmdmsg_rl)
561 msg_col = Columns - 1;
562#endif
563
564 // List the suggestions.
565 msg_start();
566 msg_row = Rows - 1; // for when 'cmdheight' > 1
567 lines_left = Rows; // avoid more prompt
568 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
569 sug.su_badlen, sug.su_badptr);
570#ifdef FEAT_RIGHTLEFT
571 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0)
572 {
573 // And now the rabbit from the high hat: Avoid showing the
574 // untranslated message rightleft.
575 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC",
576 sug.su_badlen, sug.su_badptr);
577 }
578#endif
579 msg_puts((char *)IObuff);
580 msg_clr_eos();
581 msg_putchar('\n');
582
583 msg_scroll = TRUE;
584 for (i = 0; i < sug.su_ga.ga_len; ++i)
585 {
586 stp = &SUG(sug.su_ga, i);
587
588 // The suggested word may replace only part of the bad word, add
589 // the not replaced part.
590 vim_strncpy(wcopy, stp->st_word, MAXWLEN);
591 if (sug.su_badlen > stp->st_orglen)
592 vim_strncpy(wcopy + stp->st_wordlen,
593 sug.su_badptr + stp->st_orglen,
594 sug.su_badlen - stp->st_orglen);
595 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1);
596#ifdef FEAT_RIGHTLEFT
597 if (cmdmsg_rl)
598 rl_mirror(IObuff);
599#endif
600 msg_puts((char *)IObuff);
601
602 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy);
603 msg_puts((char *)IObuff);
604
605 // The word may replace more than "su_badlen".
606 if (sug.su_badlen < stp->st_orglen)
607 {
608 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""),
609 stp->st_orglen, sug.su_badptr);
610 msg_puts((char *)IObuff);
611 }
612
613 if (p_verbose > 0)
614 {
615 // Add the score.
616 if (sps_flags & (SPS_DOUBLE | SPS_BEST))
617 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)",
618 stp->st_salscore ? "s " : "",
619 stp->st_score, stp->st_altscore);
620 else
621 vim_snprintf((char *)IObuff, IOSIZE, " (%d)",
622 stp->st_score);
623#ifdef FEAT_RIGHTLEFT
624 if (cmdmsg_rl)
625 // Mirror the numbers, but keep the leading space.
626 rl_mirror(IObuff + 1);
627#endif
628 msg_advance(30);
629 msg_puts((char *)IObuff);
630 }
631 msg_putchar('\n');
632 }
633
634#ifdef FEAT_RIGHTLEFT
635 cmdmsg_rl = FALSE;
636 msg_col = 0;
637#endif
638 // Ask for choice.
639 selected = prompt_for_number(&mouse_used);
640 if (mouse_used)
641 selected -= lines_left;
642 lines_left = Rows; // avoid more prompt
643 // don't delay for 'smd' in normal_cmd()
644 msg_scroll = msg_scroll_save;
645 }
646
647 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK)
648 {
649 // Save the from and to text for :spellrepall.
Bram Moolenaar6c52f822019-12-25 14:13:03 +0100650 VIM_CLEAR(repl_from);
651 VIM_CLEAR(repl_to);
652
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200653 stp = &SUG(sug.su_ga, selected - 1);
654 if (sug.su_badlen > stp->st_orglen)
655 {
656 // Replacing less than "su_badlen", append the remainder to
657 // repl_to.
658 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen);
659 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word,
660 sug.su_badlen - stp->st_orglen,
661 sug.su_badptr + stp->st_orglen);
662 repl_to = vim_strsave(IObuff);
663 }
664 else
665 {
666 // Replacing su_badlen or more, use the whole word.
667 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen);
668 repl_to = vim_strsave(stp->st_word);
669 }
670
671 // Replace the word.
672 p = alloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1);
673 if (p != NULL)
674 {
675 c = (int)(sug.su_badptr - line);
676 mch_memmove(p, line, c);
677 STRCPY(p + c, stp->st_word);
678 STRCAT(p, sug.su_badptr + stp->st_orglen);
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200679
680 // For redo we use a change-word command.
681 ResetRedobuff();
682 AppendToRedobuff((char_u *)"ciw");
683 AppendToRedobuffLit(p + c,
684 stp->st_wordlen + sug.su_badlen - stp->st_orglen);
685 AppendCharToRedobuff(ESC);
686
Bram Moolenaar6b949612020-06-29 23:18:42 +0200687 // "p" may be freed here
688 ml_replace(curwin->w_cursor.lnum, p, FALSE);
689 curwin->w_cursor.col = c;
690
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200691 changed_bytes(curwin->w_cursor.lnum, c);
692 }
693 }
694 else
695 curwin->w_cursor = prev_cursor;
696
697 spell_find_cleanup(&sug);
698skip:
699 vim_free(line);
Bram Moolenaar152e79e2020-06-10 15:32:08 +0200700 curwin->w_p_spell = wo_spell_save;
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200701}
702
703/*
704 * Find spell suggestions for "word". Return them in the growarray "*gap" as
705 * a list of allocated strings.
706 */
707 void
708spell_suggest_list(
709 garray_T *gap,
710 char_u *word,
711 int maxcount, // maximum nr of suggestions
712 int need_cap, // 'spellcapcheck' matched
713 int interactive)
714{
715 suginfo_T sug;
716 int i;
717 suggest_T *stp;
718 char_u *wcopy;
719
720 spell_find_suggest(word, 0, &sug, maxcount, FALSE, need_cap, interactive);
721
722 // Make room in "gap".
723 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1);
724 if (ga_grow(gap, sug.su_ga.ga_len) == OK)
725 {
726 for (i = 0; i < sug.su_ga.ga_len; ++i)
727 {
728 stp = &SUG(sug.su_ga, i);
729
730 // The suggested word may replace only part of "word", add the not
731 // replaced part.
732 wcopy = alloc(stp->st_wordlen
733 + (unsigned)STRLEN(sug.su_badptr + stp->st_orglen) + 1);
734 if (wcopy == NULL)
735 break;
736 STRCPY(wcopy, stp->st_word);
737 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen);
738 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy;
739 }
740 }
741
742 spell_find_cleanup(&sug);
743}
744
745/*
746 * Find spell suggestions for the word at the start of "badptr".
747 * Return the suggestions in "su->su_ga".
748 * The maximum number of suggestions is "maxcount".
749 * Note: does use info for the current window.
750 * This is based on the mechanisms of Aspell, but completely reimplemented.
751 */
752 static void
753spell_find_suggest(
754 char_u *badptr,
755 int badlen, // length of bad word or 0 if unknown
756 suginfo_T *su,
757 int maxcount,
758 int banbadword, // don't include badword in suggestions
759 int need_cap, // word should start with capital
760 int interactive)
761{
762 hlf_T attr = HLF_COUNT;
763 char_u buf[MAXPATHL];
764 char_u *p;
765 int do_combine = FALSE;
766 char_u *sps_copy;
767#ifdef FEAT_EVAL
768 static int expr_busy = FALSE;
769#endif
770 int c;
771 int i;
772 langp_T *lp;
Bram Moolenaar77a849c2021-01-20 21:42:33 +0100773 int did_intern = FALSE;
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200774
775 // Set the info in "*su".
Bram Moolenaara80faa82020-04-12 19:37:17 +0200776 CLEAR_POINTER(su);
Bram Moolenaar04935fb2022-01-08 16:19:22 +0000777 ga_init2(&su->su_ga, sizeof(suggest_T), 10);
778 ga_init2(&su->su_sga, sizeof(suggest_T), 10);
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200779 if (*badptr == NUL)
780 return;
781 hash_init(&su->su_banned);
782
783 su->su_badptr = badptr;
784 if (badlen != 0)
785 su->su_badlen = badlen;
786 else
787 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE);
788 su->su_maxcount = maxcount;
789 su->su_maxscore = SCORE_MAXINIT;
790
791 if (su->su_badlen >= MAXWLEN)
792 su->su_badlen = MAXWLEN - 1; // just in case
793 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen);
Bram Moolenaar4f135272021-06-11 19:07:40 +0200794 (void)spell_casefold(curwin, su->su_badptr, su->su_badlen,
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200795 su->su_fbadword, MAXWLEN);
796 // TODO: make this work if the case-folded text is longer than the original
797 // text. Currently an illegal byte causes wrong pointer computations.
798 su->su_fbadword[su->su_badlen] = NUL;
799
800 // get caps flags for bad word
801 su->su_badflags = badword_captype(su->su_badptr,
802 su->su_badptr + su->su_badlen);
803 if (need_cap)
804 su->su_badflags |= WF_ONECAP;
805
806 // Find the default language for sound folding. We simply use the first
807 // one in 'spelllang' that supports sound folding. That's good for when
808 // using multiple files for one language, it's not that bad when mixing
809 // languages (e.g., "pl,en").
810 for (i = 0; i < curbuf->b_s.b_langp.ga_len; ++i)
811 {
812 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i);
813 if (lp->lp_sallang != NULL)
814 {
815 su->su_sallang = lp->lp_sallang;
816 break;
817 }
818 }
819
820 // Soundfold the bad word with the default sound folding, so that we don't
821 // have to do this many times.
822 if (su->su_sallang != NULL)
823 spell_soundfold(su->su_sallang, su->su_fbadword, TRUE,
824 su->su_sal_badword);
825
826 // If the word is not capitalised and spell_check() doesn't consider the
827 // word to be bad then it might need to be capitalised. Add a suggestion
828 // for that.
829 c = PTR2CHAR(su->su_badptr);
830 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT)
831 {
832 make_case_word(su->su_badword, buf, WF_ONECAP);
833 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE,
834 0, TRUE, su->su_sallang, FALSE);
835 }
836
837 // Ban the bad word itself. It may appear in another region.
838 if (banbadword)
839 add_banned(su, su->su_badword);
840
841 // Make a copy of 'spellsuggest', because the expression may change it.
842 sps_copy = vim_strsave(p_sps);
843 if (sps_copy == NULL)
844 return;
845
846 // Loop over the items in 'spellsuggest'.
847 for (p = sps_copy; *p != NUL; )
848 {
849 copy_option_part(&p, buf, MAXPATHL, ",");
850
851 if (STRNCMP(buf, "expr:", 5) == 0)
852 {
853#ifdef FEAT_EVAL
854 // Evaluate an expression. Skip this when called recursively,
855 // when using spellsuggest() in the expression.
856 if (!expr_busy)
857 {
858 expr_busy = TRUE;
859 spell_suggest_expr(su, buf + 5);
860 expr_busy = FALSE;
861 }
862#endif
863 }
864 else if (STRNCMP(buf, "file:", 5) == 0)
865 // Use list of suggestions in a file.
866 spell_suggest_file(su, buf + 5);
Bram Moolenaar77a849c2021-01-20 21:42:33 +0100867 else if (!did_intern)
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200868 {
Bram Moolenaar77a849c2021-01-20 21:42:33 +0100869 // Use internal method once.
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200870 spell_suggest_intern(su, interactive);
871 if (sps_flags & SPS_DOUBLE)
872 do_combine = TRUE;
Bram Moolenaar77a849c2021-01-20 21:42:33 +0100873 did_intern = TRUE;
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200874 }
875 }
876
877 vim_free(sps_copy);
878
879 if (do_combine)
880 // Combine the two list of suggestions. This must be done last,
881 // because sorting changes the order again.
882 score_combine(su);
883}
884
885#ifdef FEAT_EVAL
886/*
887 * Find suggestions by evaluating expression "expr".
888 */
889 static void
890spell_suggest_expr(suginfo_T *su, char_u *expr)
891{
892 list_T *list;
893 listitem_T *li;
894 int score;
895 char_u *p;
896
897 // The work is split up in a few parts to avoid having to export
898 // suginfo_T.
899 // First evaluate the expression and get the resulting list.
900 list = eval_spell_expr(su->su_badword, expr);
901 if (list != NULL)
902 {
903 // Loop over the items in the list.
Bram Moolenaaraeea7212020-04-02 18:50:46 +0200904 FOR_ALL_LIST_ITEMS(list, li)
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200905 if (li->li_tv.v_type == VAR_LIST)
906 {
907 // Get the word and the score from the items.
908 score = get_spellword(li->li_tv.vval.v_list, &p);
909 if (score >= 0 && score <= su->su_maxscore)
910 add_suggestion(su, &su->su_ga, p, su->su_badlen,
911 score, 0, TRUE, su->su_sallang, FALSE);
912 }
913 list_unref(list);
914 }
915
916 // Remove bogus suggestions, sort and truncate at "maxcount".
917 check_suggestions(su, &su->su_ga);
918 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
919}
920#endif
921
922/*
923 * Find suggestions in file "fname". Used for "file:" in 'spellsuggest'.
924 */
925 static void
926spell_suggest_file(suginfo_T *su, char_u *fname)
927{
928 FILE *fd;
929 char_u line[MAXWLEN * 2];
930 char_u *p;
931 int len;
932 char_u cword[MAXWLEN];
933
934 // Open the file.
935 fd = mch_fopen((char *)fname, "r");
936 if (fd == NULL)
937 {
Bram Moolenaar460ae5d2022-01-01 14:19:49 +0000938 semsg(_(e_cant_open_file_str), fname);
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200939 return;
940 }
941
942 // Read it line by line.
943 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int)
944 {
945 line_breakcheck();
946
947 p = vim_strchr(line, '/');
948 if (p == NULL)
949 continue; // No Tab found, just skip the line.
950 *p++ = NUL;
951 if (STRICMP(su->su_badword, line) == 0)
952 {
953 // Match! Isolate the good word, until CR or NL.
954 for (len = 0; p[len] >= ' '; ++len)
955 ;
956 p[len] = NUL;
957
958 // If the suggestion doesn't have specific case duplicate the case
959 // of the bad word.
960 if (captype(p, NULL) == 0)
961 {
962 make_case_word(p, cword, su->su_badflags);
963 p = cword;
964 }
965
966 add_suggestion(su, &su->su_ga, p, su->su_badlen,
967 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE);
968 }
969 }
970
971 fclose(fd);
972
973 // Remove bogus suggestions, sort and truncate at "maxcount".
974 check_suggestions(su, &su->su_ga);
975 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
976}
977
978/*
979 * Find suggestions for the internal method indicated by "sps_flags".
980 */
981 static void
982spell_suggest_intern(suginfo_T *su, int interactive)
983{
984 // Load the .sug file(s) that are available and not done yet.
985 suggest_load_files();
986
987 // 1. Try special cases, such as repeating a word: "the the" -> "the".
988 //
989 // Set a maximum score to limit the combination of operations that is
990 // tried.
991 suggest_try_special(su);
992
993 // 2. Try inserting/deleting/swapping/changing a letter, use REP entries
994 // from the .aff file and inserting a space (split the word).
995 suggest_try_change(su);
996
997 // For the resulting top-scorers compute the sound-a-like score.
998 if (sps_flags & SPS_DOUBLE)
999 score_comp_sal(su);
1000
1001 // 3. Try finding sound-a-like words.
1002 if ((sps_flags & SPS_FAST) == 0)
1003 {
1004 if (sps_flags & SPS_BEST)
1005 // Adjust the word score for the suggestions found so far for how
Bram Moolenaar32aa1022019-11-02 22:54:41 +01001006 // they sound like.
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001007 rescore_suggestions(su);
1008
1009 // While going through the soundfold tree "su_maxscore" is the score
1010 // for the soundfold word, limits the changes that are being tried,
1011 // and "su_sfmaxscore" the rescored score, which is set by
1012 // cleanup_suggestions().
1013 // First find words with a small edit distance, because this is much
1014 // faster and often already finds the top-N suggestions. If we didn't
1015 // find many suggestions try again with a higher edit distance.
1016 // "sl_sounddone" is used to avoid doing the same word twice.
1017 suggest_try_soundalike_prep();
1018 su->su_maxscore = SCORE_SFMAX1;
1019 su->su_sfmaxscore = SCORE_MAXINIT * 3;
1020 suggest_try_soundalike(su);
1021 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
1022 {
1023 // We didn't find enough matches, try again, allowing more
1024 // changes to the soundfold word.
1025 su->su_maxscore = SCORE_SFMAX2;
1026 suggest_try_soundalike(su);
1027 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
1028 {
1029 // Still didn't find enough matches, try again, allowing even
1030 // more changes to the soundfold word.
1031 su->su_maxscore = SCORE_SFMAX3;
1032 suggest_try_soundalike(su);
1033 }
1034 }
1035 su->su_maxscore = su->su_sfmaxscore;
1036 suggest_try_soundalike_finish();
1037 }
1038
1039 // When CTRL-C was hit while searching do show the results. Only clear
1040 // got_int when using a command, not for spellsuggest().
1041 ui_breakcheck();
1042 if (interactive && got_int)
1043 {
1044 (void)vgetc();
1045 got_int = FALSE;
1046 }
1047
1048 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0)
1049 {
1050 if (sps_flags & SPS_BEST)
1051 // Adjust the word score for how it sounds like.
1052 rescore_suggestions(su);
1053
1054 // Remove bogus suggestions, sort and truncate at "maxcount".
1055 check_suggestions(su, &su->su_ga);
1056 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
1057 }
1058}
1059
1060/*
1061 * Free the info put in "*su" by spell_find_suggest().
1062 */
1063 static void
1064spell_find_cleanup(suginfo_T *su)
1065{
1066 int i;
1067
1068 // Free the suggestions.
1069 for (i = 0; i < su->su_ga.ga_len; ++i)
1070 vim_free(SUG(su->su_ga, i).st_word);
1071 ga_clear(&su->su_ga);
1072 for (i = 0; i < su->su_sga.ga_len; ++i)
1073 vim_free(SUG(su->su_sga, i).st_word);
1074 ga_clear(&su->su_sga);
1075
1076 // Free the banned words.
1077 hash_clear_all(&su->su_banned, 0);
1078}
1079
1080/*
1081 * Try finding suggestions by recognizing specific situations.
1082 */
1083 static void
1084suggest_try_special(suginfo_T *su)
1085{
1086 char_u *p;
1087 size_t len;
1088 int c;
1089 char_u word[MAXWLEN];
1090
1091 // Recognize a word that is repeated: "the the".
1092 p = skiptowhite(su->su_fbadword);
1093 len = p - su->su_fbadword;
1094 p = skipwhite(p);
1095 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0)
1096 {
1097 // Include badflags: if the badword is onecap or allcap
1098 // use that for the goodword too: "The the" -> "The".
1099 c = su->su_fbadword[len];
1100 su->su_fbadword[len] = NUL;
1101 make_case_word(su->su_fbadword, word, su->su_badflags);
1102 su->su_fbadword[len] = c;
1103
1104 // Give a soundalike score of 0, compute the score as if deleting one
1105 // character.
1106 add_suggestion(su, &su->su_ga, word, su->su_badlen,
1107 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE);
1108 }
1109}
1110
1111/*
1112 * Change the 0 to 1 to measure how much time is spent in each state.
1113 * Output is dumped in "suggestprof".
1114 */
1115#if 0
1116# define SUGGEST_PROFILE
1117proftime_T current;
1118proftime_T total;
1119proftime_T times[STATE_FINAL + 1];
1120long counts[STATE_FINAL + 1];
1121
1122 static void
1123prof_init(void)
1124{
1125 for (int i = 0; i <= STATE_FINAL; ++i)
1126 {
1127 profile_zero(&times[i]);
1128 counts[i] = 0;
1129 }
1130 profile_start(&current);
1131 profile_start(&total);
1132}
1133
1134// call before changing state
1135 static void
1136prof_store(state_T state)
1137{
1138 profile_end(&current);
1139 profile_add(&times[state], &current);
1140 ++counts[state];
1141 profile_start(&current);
1142}
1143# define PROF_STORE(state) prof_store(state);
1144
1145 static void
1146prof_report(char *name)
1147{
1148 FILE *fd = fopen("suggestprof", "a");
1149
1150 profile_end(&total);
1151 fprintf(fd, "-----------------------\n");
1152 fprintf(fd, "%s: %s\n", name, profile_msg(&total));
1153 for (int i = 0; i <= STATE_FINAL; ++i)
1154 fprintf(fd, "%d: %s (%ld)\n", i, profile_msg(&times[i]), counts[i]);
1155 fclose(fd);
1156}
1157#else
1158# define PROF_STORE(state)
1159#endif
1160
1161/*
1162 * Try finding suggestions by adding/removing/swapping letters.
1163 */
1164 static void
1165suggest_try_change(suginfo_T *su)
1166{
1167 char_u fword[MAXWLEN]; // copy of the bad word, case-folded
1168 int n;
1169 char_u *p;
1170 int lpi;
1171 langp_T *lp;
1172
1173 // We make a copy of the case-folded bad word, so that we can modify it
1174 // to find matches (esp. REP items). Append some more text, changing
1175 // chars after the bad word may help.
1176 STRCPY(fword, su->su_fbadword);
1177 n = (int)STRLEN(fword);
1178 p = su->su_badptr + su->su_badlen;
Bram Moolenaar4f135272021-06-11 19:07:40 +02001179 (void)spell_casefold(curwin, p, (int)STRLEN(p), fword + n, MAXWLEN - n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001180
Bram Moolenaare275ba42021-10-06 13:41:07 +01001181 // Make sure the resulting text is not longer than the original text.
1182 n = (int)STRLEN(su->su_badptr);
1183 if (n < MAXWLEN)
1184 fword[n] = NUL;
1185
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001186 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
1187 {
1188 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
1189
1190 // If reloading a spell file fails it's still in the list but
1191 // everything has been cleared.
1192 if (lp->lp_slang->sl_fbyts == NULL)
1193 continue;
1194
1195 // Try it for this language. Will add possible suggestions.
1196#ifdef SUGGEST_PROFILE
1197 prof_init();
1198#endif
1199 suggest_trie_walk(su, lp, fword, FALSE);
1200#ifdef SUGGEST_PROFILE
1201 prof_report("try_change");
1202#endif
1203 }
1204}
1205
1206// Check the maximum score, if we go over it we won't try this change.
1207#define TRY_DEEPER(su, stack, depth, add) \
1208 (stack[depth].ts_score + (add) < su->su_maxscore)
1209
1210/*
1211 * Try finding suggestions by adding/removing/swapping letters.
1212 *
1213 * This uses a state machine. At each node in the tree we try various
1214 * operations. When trying if an operation works "depth" is increased and the
1215 * stack[] is used to store info. This allows combinations, thus insert one
1216 * character, replace one and delete another. The number of changes is
1217 * limited by su->su_maxscore.
1218 *
1219 * After implementing this I noticed an article by Kemal Oflazer that
1220 * describes something similar: "Error-tolerant Finite State Recognition with
1221 * Applications to Morphological Analysis and Spelling Correction" (1996).
1222 * The implementation in the article is simplified and requires a stack of
1223 * unknown depth. The implementation here only needs a stack depth equal to
1224 * the length of the word.
1225 *
1226 * This is also used for the sound-folded word, "soundfold" is TRUE then.
1227 * The mechanism is the same, but we find a match with a sound-folded word
1228 * that comes from one or more original words. Each of these words may be
1229 * added, this is done by add_sound_suggest().
1230 * Don't use:
1231 * the prefix tree or the keep-case tree
1232 * "su->su_badlen"
1233 * anything to do with upper and lower case
1234 * anything to do with word or non-word characters ("spell_iswordp()")
1235 * banned words
1236 * word flags (rare, region, compounding)
1237 * word splitting for now
1238 * "similar_chars()"
1239 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep"
1240 */
1241 static void
1242suggest_trie_walk(
1243 suginfo_T *su,
1244 langp_T *lp,
1245 char_u *fword,
1246 int soundfold)
1247{
1248 char_u tword[MAXWLEN]; // good word collected so far
1249 trystate_T stack[MAXWLEN];
1250 char_u preword[MAXWLEN * 3]; // word found with proper case;
1251 // concatenation of prefix compound
1252 // words and split word. NUL terminated
1253 // when going deeper but not when coming
1254 // back.
1255 char_u compflags[MAXWLEN]; // compound flags, one for each word
1256 trystate_T *sp;
1257 int newscore;
1258 int score;
1259 char_u *byts, *fbyts, *pbyts;
1260 idx_T *idxs, *fidxs, *pidxs;
1261 int depth;
1262 int c, c2, c3;
1263 int n = 0;
1264 int flags;
1265 garray_T *gap;
1266 idx_T arridx;
1267 int len;
1268 char_u *p;
1269 fromto_T *ftp;
1270 int fl = 0, tl;
1271 int repextra = 0; // extra bytes in fword[] from REP item
1272 slang_T *slang = lp->lp_slang;
1273 int fword_ends;
1274 int goodword_ends;
1275#ifdef DEBUG_TRIEWALK
1276 // Stores the name of the change made at each level.
1277 char_u changename[MAXWLEN][80];
1278#endif
1279 int breakcheckcount = 1000;
1280 int compound_ok;
1281
1282 // Go through the whole case-fold tree, try changes at each node.
1283 // "tword[]" contains the word collected from nodes in the tree.
1284 // "fword[]" the word we are trying to match with (initially the bad
1285 // word).
1286 depth = 0;
1287 sp = &stack[0];
Bram Moolenaara80faa82020-04-12 19:37:17 +02001288 CLEAR_POINTER(sp);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001289 sp->ts_curi = 1;
1290
1291 if (soundfold)
1292 {
1293 // Going through the soundfold tree.
1294 byts = fbyts = slang->sl_sbyts;
1295 idxs = fidxs = slang->sl_sidxs;
1296 pbyts = NULL;
1297 pidxs = NULL;
1298 sp->ts_prefixdepth = PFD_NOPREFIX;
1299 sp->ts_state = STATE_START;
1300 }
1301 else
1302 {
1303 // When there are postponed prefixes we need to use these first. At
1304 // the end of the prefix we continue in the case-fold tree.
1305 fbyts = slang->sl_fbyts;
1306 fidxs = slang->sl_fidxs;
1307 pbyts = slang->sl_pbyts;
1308 pidxs = slang->sl_pidxs;
1309 if (pbyts != NULL)
1310 {
1311 byts = pbyts;
1312 idxs = pidxs;
1313 sp->ts_prefixdepth = PFD_PREFIXTREE;
1314 sp->ts_state = STATE_NOPREFIX; // try without prefix first
1315 }
1316 else
1317 {
1318 byts = fbyts;
1319 idxs = fidxs;
1320 sp->ts_prefixdepth = PFD_NOPREFIX;
1321 sp->ts_state = STATE_START;
1322 }
1323 }
1324
1325 // Loop to find all suggestions. At each round we either:
1326 // - For the current state try one operation, advance "ts_curi",
1327 // increase "depth".
1328 // - When a state is done go to the next, set "ts_state".
1329 // - When all states are tried decrease "depth".
1330 while (depth >= 0 && !got_int)
1331 {
1332 sp = &stack[depth];
1333 switch (sp->ts_state)
1334 {
1335 case STATE_START:
1336 case STATE_NOPREFIX:
1337 // Start of node: Deal with NUL bytes, which means
1338 // tword[] may end here.
1339 arridx = sp->ts_arridx; // current node in the tree
1340 len = byts[arridx]; // bytes in this node
1341 arridx += sp->ts_curi; // index of current byte
1342
1343 if (sp->ts_prefixdepth == PFD_PREFIXTREE)
1344 {
1345 // Skip over the NUL bytes, we use them later.
1346 for (n = 0; n < len && byts[arridx + n] == 0; ++n)
1347 ;
1348 sp->ts_curi += n;
1349
1350 // Always past NUL bytes now.
1351 n = (int)sp->ts_state;
1352 PROF_STORE(sp->ts_state)
1353 sp->ts_state = STATE_ENDNUL;
1354 sp->ts_save_badflags = su->su_badflags;
1355
1356 // At end of a prefix or at start of prefixtree: check for
1357 // following word.
1358 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX)
1359 {
1360 // Set su->su_badflags to the caps type at this position.
1361 // Use the caps type until here for the prefix itself.
1362 if (has_mbyte)
1363 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
1364 else
1365 n = sp->ts_fidx;
1366 flags = badword_captype(su->su_badptr, su->su_badptr + n);
1367 su->su_badflags = badword_captype(su->su_badptr + n,
1368 su->su_badptr + su->su_badlen);
1369#ifdef DEBUG_TRIEWALK
1370 sprintf(changename[depth], "prefix");
1371#endif
1372 go_deeper(stack, depth, 0);
1373 ++depth;
1374 sp = &stack[depth];
1375 sp->ts_prefixdepth = depth - 1;
1376 byts = fbyts;
1377 idxs = fidxs;
1378 sp->ts_arridx = 0;
1379
1380 // Move the prefix to preword[] with the right case
1381 // and make find_keepcap_word() works.
1382 tword[sp->ts_twordlen] = NUL;
1383 make_case_word(tword + sp->ts_splitoff,
1384 preword + sp->ts_prewordlen, flags);
1385 sp->ts_prewordlen = (char_u)STRLEN(preword);
1386 sp->ts_splitoff = sp->ts_twordlen;
1387 }
1388 break;
1389 }
1390
1391 if (sp->ts_curi > len || byts[arridx] != 0)
1392 {
1393 // Past bytes in node and/or past NUL bytes.
1394 PROF_STORE(sp->ts_state)
1395 sp->ts_state = STATE_ENDNUL;
1396 sp->ts_save_badflags = su->su_badflags;
1397 break;
1398 }
1399
1400 // End of word in tree.
1401 ++sp->ts_curi; // eat one NUL byte
1402
1403 flags = (int)idxs[arridx];
1404
1405 // Skip words with the NOSUGGEST flag.
1406 if (flags & WF_NOSUGGEST)
1407 break;
1408
1409 fword_ends = (fword[sp->ts_fidx] == NUL
1410 || (soundfold
1411 ? VIM_ISWHITE(fword[sp->ts_fidx])
1412 : !spell_iswordp(fword + sp->ts_fidx, curwin)));
1413 tword[sp->ts_twordlen] = NUL;
1414
1415 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL
Bram Moolenaar11b66002020-07-01 13:15:24 +02001416 && (sp->ts_flags & TSF_PREFIXOK) == 0
1417 && pbyts != NULL)
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001418 {
1419 // There was a prefix before the word. Check that the prefix
1420 // can be used with this word.
1421 // Count the length of the NULs in the prefix. If there are
1422 // none this must be the first try without a prefix.
1423 n = stack[sp->ts_prefixdepth].ts_arridx;
1424 len = pbyts[n++];
1425 for (c = 0; c < len && pbyts[n + c] == 0; ++c)
1426 ;
1427 if (c > 0)
1428 {
1429 c = valid_word_prefix(c, n, flags,
1430 tword + sp->ts_splitoff, slang, FALSE);
1431 if (c == 0)
1432 break;
1433
1434 // Use the WF_RARE flag for a rare prefix.
1435 if (c & WF_RAREPFX)
1436 flags |= WF_RARE;
1437
1438 // Tricky: when checking for both prefix and compounding
1439 // we run into the prefix flag first.
1440 // Remember that it's OK, so that we accept the prefix
1441 // when arriving at a compound flag.
1442 sp->ts_flags |= TSF_PREFIXOK;
1443 }
1444 }
1445
1446 // Check NEEDCOMPOUND: can't use word without compounding. Do try
1447 // appending another compound word below.
1448 if (sp->ts_complen == sp->ts_compsplit && fword_ends
1449 && (flags & WF_NEEDCOMP))
1450 goodword_ends = FALSE;
1451 else
1452 goodword_ends = TRUE;
1453
1454 p = NULL;
1455 compound_ok = TRUE;
1456 if (sp->ts_complen > sp->ts_compsplit)
1457 {
1458 if (slang->sl_nobreak)
1459 {
1460 // There was a word before this word. When there was no
1461 // change in this word (it was correct) add the first word
1462 // as a suggestion. If this word was corrected too, we
1463 // need to check if a correct word follows.
1464 if (sp->ts_fidx - sp->ts_splitfidx
1465 == sp->ts_twordlen - sp->ts_splitoff
1466 && STRNCMP(fword + sp->ts_splitfidx,
1467 tword + sp->ts_splitoff,
1468 sp->ts_fidx - sp->ts_splitfidx) == 0)
1469 {
1470 preword[sp->ts_prewordlen] = NUL;
1471 newscore = score_wordcount_adj(slang, sp->ts_score,
1472 preword + sp->ts_prewordlen,
1473 sp->ts_prewordlen > 0);
1474 // Add the suggestion if the score isn't too bad.
1475 if (newscore <= su->su_maxscore)
1476 add_suggestion(su, &su->su_ga, preword,
1477 sp->ts_splitfidx - repextra,
1478 newscore, 0, FALSE,
1479 lp->lp_sallang, FALSE);
1480 break;
1481 }
1482 }
1483 else
1484 {
1485 // There was a compound word before this word. If this
1486 // word does not support compounding then give up
1487 // (splitting is tried for the word without compound
1488 // flag).
1489 if (((unsigned)flags >> 24) == 0
1490 || sp->ts_twordlen - sp->ts_splitoff
1491 < slang->sl_compminlen)
1492 break;
1493 // For multi-byte chars check character length against
1494 // COMPOUNDMIN.
1495 if (has_mbyte
1496 && slang->sl_compminlen > 0
1497 && mb_charlen(tword + sp->ts_splitoff)
1498 < slang->sl_compminlen)
1499 break;
1500
1501 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
1502 compflags[sp->ts_complen + 1] = NUL;
1503 vim_strncpy(preword + sp->ts_prewordlen,
1504 tword + sp->ts_splitoff,
1505 sp->ts_twordlen - sp->ts_splitoff);
1506
1507 // Verify CHECKCOMPOUNDPATTERN rules.
1508 if (match_checkcompoundpattern(preword, sp->ts_prewordlen,
1509 &slang->sl_comppat))
1510 compound_ok = FALSE;
1511
1512 if (compound_ok)
1513 {
1514 p = preword;
1515 while (*skiptowhite(p) != NUL)
1516 p = skipwhite(skiptowhite(p));
1517 if (fword_ends && !can_compound(slang, p,
1518 compflags + sp->ts_compsplit))
1519 // Compound is not allowed. But it may still be
1520 // possible if we add another (short) word.
1521 compound_ok = FALSE;
1522 }
1523
1524 // Get pointer to last char of previous word.
1525 p = preword + sp->ts_prewordlen;
1526 MB_PTR_BACK(preword, p);
1527 }
1528 }
1529
1530 // Form the word with proper case in preword.
1531 // If there is a word from a previous split, append.
1532 // For the soundfold tree don't change the case, simply append.
1533 if (soundfold)
1534 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff);
1535 else if (flags & WF_KEEPCAP)
1536 // Must find the word in the keep-case tree.
1537 find_keepcap_word(slang, tword + sp->ts_splitoff,
1538 preword + sp->ts_prewordlen);
1539 else
1540 {
1541 // Include badflags: If the badword is onecap or allcap
1542 // use that for the goodword too. But if the badword is
1543 // allcap and it's only one char long use onecap.
1544 c = su->su_badflags;
1545 if ((c & WF_ALLCAP)
1546 && su->su_badlen == (*mb_ptr2len)(su->su_badptr))
1547 c = WF_ONECAP;
1548 c |= flags;
1549
1550 // When appending a compound word after a word character don't
1551 // use Onecap.
1552 if (p != NULL && spell_iswordp_nmw(p, curwin))
1553 c &= ~WF_ONECAP;
1554 make_case_word(tword + sp->ts_splitoff,
1555 preword + sp->ts_prewordlen, c);
1556 }
1557
1558 if (!soundfold)
1559 {
1560 // Don't use a banned word. It may appear again as a good
1561 // word, thus remember it.
1562 if (flags & WF_BANNED)
1563 {
1564 add_banned(su, preword + sp->ts_prewordlen);
1565 break;
1566 }
1567 if ((sp->ts_complen == sp->ts_compsplit
1568 && WAS_BANNED(su, preword + sp->ts_prewordlen))
1569 || WAS_BANNED(su, preword))
1570 {
1571 if (slang->sl_compprog == NULL)
1572 break;
1573 // the word so far was banned but we may try compounding
1574 goodword_ends = FALSE;
1575 }
1576 }
1577
1578 newscore = 0;
1579 if (!soundfold) // soundfold words don't have flags
1580 {
1581 if ((flags & WF_REGION)
1582 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
1583 newscore += SCORE_REGION;
1584 if (flags & WF_RARE)
1585 newscore += SCORE_RARE;
1586
1587 if (!spell_valid_case(su->su_badflags,
1588 captype(preword + sp->ts_prewordlen, NULL)))
1589 newscore += SCORE_ICASE;
1590 }
1591
1592 // TODO: how about splitting in the soundfold tree?
1593 if (fword_ends
1594 && goodword_ends
1595 && sp->ts_fidx >= sp->ts_fidxtry
1596 && compound_ok)
1597 {
1598 // The badword also ends: add suggestions.
1599#ifdef DEBUG_TRIEWALK
1600 if (soundfold && STRCMP(preword, "smwrd") == 0)
1601 {
1602 int j;
1603
1604 // print the stack of changes that brought us here
1605 smsg("------ %s -------", fword);
1606 for (j = 0; j < depth; ++j)
1607 smsg("%s", changename[j]);
1608 }
1609#endif
1610 if (soundfold)
1611 {
1612 // For soundfolded words we need to find the original
1613 // words, the edit distance and then add them.
1614 add_sound_suggest(su, preword, sp->ts_score, lp);
1615 }
1616 else if (sp->ts_fidx > 0)
1617 {
1618 // Give a penalty when changing non-word char to word
1619 // char, e.g., "thes," -> "these".
1620 p = fword + sp->ts_fidx;
1621 MB_PTR_BACK(fword, p);
Bram Moolenaar15d98902021-11-04 15:46:05 +00001622 if (!spell_iswordp(p, curwin) && *preword != NUL)
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001623 {
1624 p = preword + STRLEN(preword);
1625 MB_PTR_BACK(preword, p);
1626 if (spell_iswordp(p, curwin))
1627 newscore += SCORE_NONWORD;
1628 }
1629
1630 // Give a bonus to words seen before.
1631 score = score_wordcount_adj(slang,
1632 sp->ts_score + newscore,
1633 preword + sp->ts_prewordlen,
1634 sp->ts_prewordlen > 0);
1635
1636 // Add the suggestion if the score isn't too bad.
1637 if (score <= su->su_maxscore)
1638 {
1639 add_suggestion(su, &su->su_ga, preword,
1640 sp->ts_fidx - repextra,
1641 score, 0, FALSE, lp->lp_sallang, FALSE);
1642
1643 if (su->su_badflags & WF_MIXCAP)
1644 {
1645 // We really don't know if the word should be
1646 // upper or lower case, add both.
1647 c = captype(preword, NULL);
1648 if (c == 0 || c == WF_ALLCAP)
1649 {
1650 make_case_word(tword + sp->ts_splitoff,
1651 preword + sp->ts_prewordlen,
1652 c == 0 ? WF_ALLCAP : 0);
1653
1654 add_suggestion(su, &su->su_ga, preword,
1655 sp->ts_fidx - repextra,
1656 score + SCORE_ICASE, 0, FALSE,
1657 lp->lp_sallang, FALSE);
1658 }
1659 }
1660 }
1661 }
1662 }
1663
1664 // Try word split and/or compounding.
1665 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends)
1666 // Don't split halfway a character.
1667 && (!has_mbyte || sp->ts_tcharlen == 0))
1668 {
1669 int try_compound;
1670 int try_split;
1671
1672 // If past the end of the bad word don't try a split.
1673 // Otherwise try changing the next word. E.g., find
1674 // suggestions for "the the" where the second "the" is
1675 // different. It's done like a split.
1676 // TODO: word split for soundfold words
1677 try_split = (sp->ts_fidx - repextra < su->su_badlen)
1678 && !soundfold;
1679
1680 // Get here in several situations:
1681 // 1. The word in the tree ends:
1682 // If the word allows compounding try that. Otherwise try
1683 // a split by inserting a space. For both check that a
1684 // valid words starts at fword[sp->ts_fidx].
1685 // For NOBREAK do like compounding to be able to check if
1686 // the next word is valid.
1687 // 2. The badword does end, but it was due to a change (e.g.,
1688 // a swap). No need to split, but do check that the
1689 // following word is valid.
1690 // 3. The badword and the word in the tree end. It may still
1691 // be possible to compound another (short) word.
1692 try_compound = FALSE;
1693 if (!soundfold
1694 && !slang->sl_nocompoundsugs
1695 && slang->sl_compprog != NULL
1696 && ((unsigned)flags >> 24) != 0
1697 && sp->ts_twordlen - sp->ts_splitoff
1698 >= slang->sl_compminlen
1699 && (!has_mbyte
1700 || slang->sl_compminlen == 0
1701 || mb_charlen(tword + sp->ts_splitoff)
1702 >= slang->sl_compminlen)
1703 && (slang->sl_compsylmax < MAXWLEN
1704 || sp->ts_complen + 1 - sp->ts_compsplit
1705 < slang->sl_compmax)
1706 && (can_be_compound(sp, slang,
1707 compflags, ((unsigned)flags >> 24))))
1708
1709 {
1710 try_compound = TRUE;
1711 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
1712 compflags[sp->ts_complen + 1] = NUL;
1713 }
1714
1715 // For NOBREAK we never try splitting, it won't make any word
1716 // valid.
1717 if (slang->sl_nobreak && !slang->sl_nocompoundsugs)
1718 try_compound = TRUE;
1719
1720 // If we could add a compound word, and it's also possible to
1721 // split at this point, do the split first and set
1722 // TSF_DIDSPLIT to avoid doing it again.
1723 else if (!fword_ends
1724 && try_compound
1725 && (sp->ts_flags & TSF_DIDSPLIT) == 0)
1726 {
1727 try_compound = FALSE;
1728 sp->ts_flags |= TSF_DIDSPLIT;
1729 --sp->ts_curi; // do the same NUL again
1730 compflags[sp->ts_complen] = NUL;
1731 }
1732 else
1733 sp->ts_flags &= ~TSF_DIDSPLIT;
1734
1735 if (try_split || try_compound)
1736 {
1737 if (!try_compound && (!fword_ends || !goodword_ends))
1738 {
1739 // If we're going to split need to check that the
1740 // words so far are valid for compounding. If there
1741 // is only one word it must not have the NEEDCOMPOUND
1742 // flag.
1743 if (sp->ts_complen == sp->ts_compsplit
1744 && (flags & WF_NEEDCOMP))
1745 break;
1746 p = preword;
1747 while (*skiptowhite(p) != NUL)
1748 p = skipwhite(skiptowhite(p));
1749 if (sp->ts_complen > sp->ts_compsplit
1750 && !can_compound(slang, p,
1751 compflags + sp->ts_compsplit))
1752 break;
1753
1754 if (slang->sl_nosplitsugs)
1755 newscore += SCORE_SPLIT_NO;
1756 else
1757 newscore += SCORE_SPLIT;
1758
1759 // Give a bonus to words seen before.
1760 newscore = score_wordcount_adj(slang, newscore,
1761 preword + sp->ts_prewordlen, TRUE);
1762 }
1763
1764 if (TRY_DEEPER(su, stack, depth, newscore))
1765 {
1766 go_deeper(stack, depth, newscore);
1767#ifdef DEBUG_TRIEWALK
1768 if (!try_compound && !fword_ends)
1769 sprintf(changename[depth], "%.*s-%s: split",
1770 sp->ts_twordlen, tword, fword + sp->ts_fidx);
1771 else
1772 sprintf(changename[depth], "%.*s-%s: compound",
1773 sp->ts_twordlen, tword, fword + sp->ts_fidx);
1774#endif
1775 // Save things to be restored at STATE_SPLITUNDO.
1776 sp->ts_save_badflags = su->su_badflags;
1777 PROF_STORE(sp->ts_state)
1778 sp->ts_state = STATE_SPLITUNDO;
1779
1780 ++depth;
1781 sp = &stack[depth];
1782
1783 // Append a space to preword when splitting.
1784 if (!try_compound && !fword_ends)
1785 STRCAT(preword, " ");
1786 sp->ts_prewordlen = (char_u)STRLEN(preword);
1787 sp->ts_splitoff = sp->ts_twordlen;
1788 sp->ts_splitfidx = sp->ts_fidx;
1789
1790 // If the badword has a non-word character at this
1791 // position skip it. That means replacing the
1792 // non-word character with a space. Always skip a
1793 // character when the word ends. But only when the
1794 // good word can end.
1795 if (((!try_compound && !spell_iswordp_nmw(fword
1796 + sp->ts_fidx,
1797 curwin))
1798 || fword_ends)
1799 && fword[sp->ts_fidx] != NUL
1800 && goodword_ends)
1801 {
1802 int l;
1803
Bram Moolenaar1614a142019-10-06 22:00:13 +02001804 l = mb_ptr2len(fword + sp->ts_fidx);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001805 if (fword_ends)
1806 {
1807 // Copy the skipped character to preword.
1808 mch_memmove(preword + sp->ts_prewordlen,
1809 fword + sp->ts_fidx, l);
1810 sp->ts_prewordlen += l;
1811 preword[sp->ts_prewordlen] = NUL;
1812 }
1813 else
1814 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST;
1815 sp->ts_fidx += l;
1816 }
1817
1818 // When compounding include compound flag in
1819 // compflags[] (already set above). When splitting we
1820 // may start compounding over again.
1821 if (try_compound)
1822 ++sp->ts_complen;
1823 else
1824 sp->ts_compsplit = sp->ts_complen;
1825 sp->ts_prefixdepth = PFD_NOPREFIX;
1826
1827 // set su->su_badflags to the caps type at this
1828 // position
1829 if (has_mbyte)
1830 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
1831 else
1832 n = sp->ts_fidx;
1833 su->su_badflags = badword_captype(su->su_badptr + n,
1834 su->su_badptr + su->su_badlen);
1835
1836 // Restart at top of the tree.
1837 sp->ts_arridx = 0;
1838
1839 // If there are postponed prefixes, try these too.
1840 if (pbyts != NULL)
1841 {
1842 byts = pbyts;
1843 idxs = pidxs;
1844 sp->ts_prefixdepth = PFD_PREFIXTREE;
1845 PROF_STORE(sp->ts_state)
1846 sp->ts_state = STATE_NOPREFIX;
1847 }
1848 }
1849 }
1850 }
1851 break;
1852
1853 case STATE_SPLITUNDO:
1854 // Undo the changes done for word split or compound word.
1855 su->su_badflags = sp->ts_save_badflags;
1856
1857 // Continue looking for NUL bytes.
1858 PROF_STORE(sp->ts_state)
1859 sp->ts_state = STATE_START;
1860
1861 // In case we went into the prefix tree.
1862 byts = fbyts;
1863 idxs = fidxs;
1864 break;
1865
1866 case STATE_ENDNUL:
1867 // Past the NUL bytes in the node.
1868 su->su_badflags = sp->ts_save_badflags;
1869 if (fword[sp->ts_fidx] == NUL && sp->ts_tcharlen == 0)
1870 {
1871 // The badword ends, can't use STATE_PLAIN.
1872 PROF_STORE(sp->ts_state)
1873 sp->ts_state = STATE_DEL;
1874 break;
1875 }
1876 PROF_STORE(sp->ts_state)
1877 sp->ts_state = STATE_PLAIN;
1878 // FALLTHROUGH
1879
1880 case STATE_PLAIN:
1881 // Go over all possible bytes at this node, add each to tword[]
1882 // and use child node. "ts_curi" is the index.
1883 arridx = sp->ts_arridx;
1884 if (sp->ts_curi > byts[arridx])
1885 {
1886 // Done all bytes at this node, do next state. When still at
1887 // already changed bytes skip the other tricks.
1888 PROF_STORE(sp->ts_state)
1889 if (sp->ts_fidx >= sp->ts_fidxtry)
1890 sp->ts_state = STATE_DEL;
1891 else
1892 sp->ts_state = STATE_FINAL;
1893 }
1894 else
1895 {
1896 arridx += sp->ts_curi++;
1897 c = byts[arridx];
1898
1899 // Normal byte, go one level deeper. If it's not equal to the
1900 // byte in the bad word adjust the score. But don't even try
1901 // when the byte was already changed. And don't try when we
1902 // just deleted this byte, accepting it is always cheaper than
1903 // delete + substitute.
1904 if (c == fword[sp->ts_fidx]
1905 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE))
1906 newscore = 0;
1907 else
1908 newscore = SCORE_SUBST;
1909 if ((newscore == 0
1910 || (sp->ts_fidx >= sp->ts_fidxtry
1911 && ((sp->ts_flags & TSF_DIDDEL) == 0
1912 || c != fword[sp->ts_delidx])))
1913 && TRY_DEEPER(su, stack, depth, newscore))
1914 {
1915 go_deeper(stack, depth, newscore);
1916#ifdef DEBUG_TRIEWALK
1917 if (newscore > 0)
1918 sprintf(changename[depth], "%.*s-%s: subst %c to %c",
1919 sp->ts_twordlen, tword, fword + sp->ts_fidx,
1920 fword[sp->ts_fidx], c);
1921 else
1922 sprintf(changename[depth], "%.*s-%s: accept %c",
1923 sp->ts_twordlen, tword, fword + sp->ts_fidx,
1924 fword[sp->ts_fidx]);
1925#endif
1926 ++depth;
1927 sp = &stack[depth];
1928 ++sp->ts_fidx;
1929 tword[sp->ts_twordlen++] = c;
1930 sp->ts_arridx = idxs[arridx];
1931 if (newscore == SCORE_SUBST)
1932 sp->ts_isdiff = DIFF_YES;
1933 if (has_mbyte)
1934 {
1935 // Multi-byte characters are a bit complicated to
1936 // handle: They differ when any of the bytes differ
1937 // and then their length may also differ.
1938 if (sp->ts_tcharlen == 0)
1939 {
1940 // First byte.
1941 sp->ts_tcharidx = 0;
1942 sp->ts_tcharlen = MB_BYTE2LEN(c);
1943 sp->ts_fcharstart = sp->ts_fidx - 1;
1944 sp->ts_isdiff = (newscore != 0)
1945 ? DIFF_YES : DIFF_NONE;
1946 }
1947 else if (sp->ts_isdiff == DIFF_INSERT)
1948 // When inserting trail bytes don't advance in the
1949 // bad word.
1950 --sp->ts_fidx;
1951 if (++sp->ts_tcharidx == sp->ts_tcharlen)
1952 {
1953 // Last byte of character.
1954 if (sp->ts_isdiff == DIFF_YES)
1955 {
1956 // Correct ts_fidx for the byte length of the
1957 // character (we didn't check that before).
1958 sp->ts_fidx = sp->ts_fcharstart
Bram Moolenaar1614a142019-10-06 22:00:13 +02001959 + mb_ptr2len(
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001960 fword + sp->ts_fcharstart);
1961 // For changing a composing character adjust
1962 // the score from SCORE_SUBST to
1963 // SCORE_SUBCOMP.
1964 if (enc_utf8
1965 && utf_iscomposing(
1966 utf_ptr2char(tword
1967 + sp->ts_twordlen
1968 - sp->ts_tcharlen))
1969 && utf_iscomposing(
1970 utf_ptr2char(fword
1971 + sp->ts_fcharstart)))
1972 sp->ts_score -=
1973 SCORE_SUBST - SCORE_SUBCOMP;
1974
1975 // For a similar character adjust score from
1976 // SCORE_SUBST to SCORE_SIMILAR.
1977 else if (!soundfold
1978 && slang->sl_has_map
1979 && similar_chars(slang,
1980 mb_ptr2char(tword
1981 + sp->ts_twordlen
1982 - sp->ts_tcharlen),
1983 mb_ptr2char(fword
1984 + sp->ts_fcharstart)))
1985 sp->ts_score -=
1986 SCORE_SUBST - SCORE_SIMILAR;
1987 }
1988 else if (sp->ts_isdiff == DIFF_INSERT
1989 && sp->ts_twordlen > sp->ts_tcharlen)
1990 {
1991 p = tword + sp->ts_twordlen - sp->ts_tcharlen;
1992 c = mb_ptr2char(p);
1993 if (enc_utf8 && utf_iscomposing(c))
1994 {
1995 // Inserting a composing char doesn't
1996 // count that much.
1997 sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
1998 }
1999 else
2000 {
2001 // If the previous character was the same,
2002 // thus doubling a character, give a bonus
2003 // to the score. Also for the soundfold
2004 // tree (might seem illogical but does
2005 // give better scores).
2006 MB_PTR_BACK(tword, p);
2007 if (c == mb_ptr2char(p))
2008 sp->ts_score -= SCORE_INS
2009 - SCORE_INSDUP;
2010 }
2011 }
2012
2013 // Starting a new char, reset the length.
2014 sp->ts_tcharlen = 0;
2015 }
2016 }
2017 else
2018 {
2019 // If we found a similar char adjust the score.
2020 // We do this after calling go_deeper() because
2021 // it's slow.
2022 if (newscore != 0
2023 && !soundfold
2024 && slang->sl_has_map
2025 && similar_chars(slang,
2026 c, fword[sp->ts_fidx - 1]))
2027 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
2028 }
2029 }
2030 }
2031 break;
2032
2033 case STATE_DEL:
2034 // When past the first byte of a multi-byte char don't try
2035 // delete/insert/swap a character.
2036 if (has_mbyte && sp->ts_tcharlen > 0)
2037 {
2038 PROF_STORE(sp->ts_state)
2039 sp->ts_state = STATE_FINAL;
2040 break;
2041 }
2042 // Try skipping one character in the bad word (delete it).
2043 PROF_STORE(sp->ts_state)
2044 sp->ts_state = STATE_INS_PREP;
2045 sp->ts_curi = 1;
2046 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*')
2047 // Deleting a vowel at the start of a word counts less, see
2048 // soundalike_score().
2049 newscore = 2 * SCORE_DEL / 3;
2050 else
2051 newscore = SCORE_DEL;
2052 if (fword[sp->ts_fidx] != NUL
2053 && TRY_DEEPER(su, stack, depth, newscore))
2054 {
2055 go_deeper(stack, depth, newscore);
2056#ifdef DEBUG_TRIEWALK
2057 sprintf(changename[depth], "%.*s-%s: delete %c",
2058 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2059 fword[sp->ts_fidx]);
2060#endif
2061 ++depth;
2062
2063 // Remember what character we deleted, so that we can avoid
2064 // inserting it again.
2065 stack[depth].ts_flags |= TSF_DIDDEL;
2066 stack[depth].ts_delidx = sp->ts_fidx;
2067
2068 // Advance over the character in fword[]. Give a bonus to the
2069 // score if the same character is following "nn" -> "n". It's
2070 // a bit illogical for soundfold tree but it does give better
2071 // results.
2072 if (has_mbyte)
2073 {
2074 c = mb_ptr2char(fword + sp->ts_fidx);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002075 stack[depth].ts_fidx += mb_ptr2len(fword + sp->ts_fidx);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002076 if (enc_utf8 && utf_iscomposing(c))
2077 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
2078 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx))
2079 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
2080 }
2081 else
2082 {
2083 ++stack[depth].ts_fidx;
2084 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1])
2085 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
2086 }
2087 break;
2088 }
2089 // FALLTHROUGH
2090
2091 case STATE_INS_PREP:
2092 if (sp->ts_flags & TSF_DIDDEL)
2093 {
2094 // If we just deleted a byte then inserting won't make sense,
2095 // a substitute is always cheaper.
2096 PROF_STORE(sp->ts_state)
2097 sp->ts_state = STATE_SWAP;
2098 break;
2099 }
2100
2101 // skip over NUL bytes
2102 n = sp->ts_arridx;
2103 for (;;)
2104 {
2105 if (sp->ts_curi > byts[n])
2106 {
2107 // Only NUL bytes at this node, go to next state.
2108 PROF_STORE(sp->ts_state)
2109 sp->ts_state = STATE_SWAP;
2110 break;
2111 }
2112 if (byts[n + sp->ts_curi] != NUL)
2113 {
2114 // Found a byte to insert.
2115 PROF_STORE(sp->ts_state)
2116 sp->ts_state = STATE_INS;
2117 break;
2118 }
2119 ++sp->ts_curi;
2120 }
2121 break;
2122
2123 // FALLTHROUGH
2124
2125 case STATE_INS:
2126 // Insert one byte. Repeat this for each possible byte at this
2127 // node.
2128 n = sp->ts_arridx;
2129 if (sp->ts_curi > byts[n])
2130 {
2131 // Done all bytes at this node, go to next state.
2132 PROF_STORE(sp->ts_state)
2133 sp->ts_state = STATE_SWAP;
2134 break;
2135 }
2136
2137 // Do one more byte at this node, but:
2138 // - Skip NUL bytes.
2139 // - Skip the byte if it's equal to the byte in the word,
2140 // accepting that byte is always better.
2141 n += sp->ts_curi++;
2142 c = byts[n];
2143 if (soundfold && sp->ts_twordlen == 0 && c == '*')
2144 // Inserting a vowel at the start of a word counts less,
2145 // see soundalike_score().
2146 newscore = 2 * SCORE_INS / 3;
2147 else
2148 newscore = SCORE_INS;
2149 if (c != fword[sp->ts_fidx]
2150 && TRY_DEEPER(su, stack, depth, newscore))
2151 {
2152 go_deeper(stack, depth, newscore);
2153#ifdef DEBUG_TRIEWALK
2154 sprintf(changename[depth], "%.*s-%s: insert %c",
2155 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2156 c);
2157#endif
2158 ++depth;
2159 sp = &stack[depth];
2160 tword[sp->ts_twordlen++] = c;
2161 sp->ts_arridx = idxs[n];
2162 if (has_mbyte)
2163 {
2164 fl = MB_BYTE2LEN(c);
2165 if (fl > 1)
2166 {
2167 // There are following bytes for the same character.
2168 // We must find all bytes before trying
2169 // delete/insert/swap/etc.
2170 sp->ts_tcharlen = fl;
2171 sp->ts_tcharidx = 1;
2172 sp->ts_isdiff = DIFF_INSERT;
2173 }
2174 }
2175 else
2176 fl = 1;
2177 if (fl == 1)
2178 {
2179 // If the previous character was the same, thus doubling a
2180 // character, give a bonus to the score. Also for
2181 // soundfold words (illogical but does give a better
2182 // score).
2183 if (sp->ts_twordlen >= 2
2184 && tword[sp->ts_twordlen - 2] == c)
2185 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
2186 }
2187 }
2188 break;
2189
2190 case STATE_SWAP:
2191 // Swap two bytes in the bad word: "12" -> "21".
2192 // We change "fword" here, it's changed back afterwards at
2193 // STATE_UNSWAP.
2194 p = fword + sp->ts_fidx;
2195 c = *p;
2196 if (c == NUL)
2197 {
2198 // End of word, can't swap or replace.
2199 PROF_STORE(sp->ts_state)
2200 sp->ts_state = STATE_FINAL;
2201 break;
2202 }
2203
2204 // Don't swap if the first character is not a word character.
2205 // SWAP3 etc. also don't make sense then.
2206 if (!soundfold && !spell_iswordp(p, curwin))
2207 {
2208 PROF_STORE(sp->ts_state)
2209 sp->ts_state = STATE_REP_INI;
2210 break;
2211 }
2212
2213 if (has_mbyte)
2214 {
2215 n = MB_CPTR2LEN(p);
2216 c = mb_ptr2char(p);
2217 if (p[n] == NUL)
2218 c2 = NUL;
2219 else if (!soundfold && !spell_iswordp(p + n, curwin))
2220 c2 = c; // don't swap non-word char
2221 else
2222 c2 = mb_ptr2char(p + n);
2223 }
2224 else
2225 {
2226 if (p[1] == NUL)
2227 c2 = NUL;
2228 else if (!soundfold && !spell_iswordp(p + 1, curwin))
2229 c2 = c; // don't swap non-word char
2230 else
2231 c2 = p[1];
2232 }
2233
2234 // When the second character is NUL we can't swap.
2235 if (c2 == NUL)
2236 {
2237 PROF_STORE(sp->ts_state)
2238 sp->ts_state = STATE_REP_INI;
2239 break;
2240 }
2241
2242 // When characters are identical, swap won't do anything.
2243 // Also get here if the second char is not a word character.
2244 if (c == c2)
2245 {
2246 PROF_STORE(sp->ts_state)
2247 sp->ts_state = STATE_SWAP3;
2248 break;
2249 }
2250 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP))
2251 {
2252 go_deeper(stack, depth, SCORE_SWAP);
2253#ifdef DEBUG_TRIEWALK
2254 sprintf(changename[depth], "%.*s-%s: swap %c and %c",
2255 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2256 c, c2);
2257#endif
2258 PROF_STORE(sp->ts_state)
2259 sp->ts_state = STATE_UNSWAP;
2260 ++depth;
2261 if (has_mbyte)
2262 {
2263 fl = mb_char2len(c2);
2264 mch_memmove(p, p + n, fl);
2265 mb_char2bytes(c, p + fl);
2266 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
2267 }
2268 else
2269 {
2270 p[0] = c2;
2271 p[1] = c;
2272 stack[depth].ts_fidxtry = sp->ts_fidx + 2;
2273 }
2274 }
2275 else
2276 {
2277 // If this swap doesn't work then SWAP3 won't either.
2278 PROF_STORE(sp->ts_state)
2279 sp->ts_state = STATE_REP_INI;
2280 }
2281 break;
2282
2283 case STATE_UNSWAP:
2284 // Undo the STATE_SWAP swap: "21" -> "12".
2285 p = fword + sp->ts_fidx;
2286 if (has_mbyte)
2287 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002288 n = mb_ptr2len(p);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002289 c = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002290 mch_memmove(p + mb_ptr2len(p + n), p, n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002291 mb_char2bytes(c, p);
2292 }
2293 else
2294 {
2295 c = *p;
2296 *p = p[1];
2297 p[1] = c;
2298 }
2299 // FALLTHROUGH
2300
2301 case STATE_SWAP3:
2302 // Swap two bytes, skipping one: "123" -> "321". We change
2303 // "fword" here, it's changed back afterwards at STATE_UNSWAP3.
2304 p = fword + sp->ts_fidx;
2305 if (has_mbyte)
2306 {
2307 n = MB_CPTR2LEN(p);
2308 c = mb_ptr2char(p);
2309 fl = MB_CPTR2LEN(p + n);
2310 c2 = mb_ptr2char(p + n);
2311 if (!soundfold && !spell_iswordp(p + n + fl, curwin))
2312 c3 = c; // don't swap non-word char
2313 else
2314 c3 = mb_ptr2char(p + n + fl);
2315 }
2316 else
2317 {
2318 c = *p;
2319 c2 = p[1];
2320 if (!soundfold && !spell_iswordp(p + 2, curwin))
2321 c3 = c; // don't swap non-word char
2322 else
2323 c3 = p[2];
2324 }
2325
2326 // When characters are identical: "121" then SWAP3 result is
2327 // identical, ROT3L result is same as SWAP: "211", ROT3L result is
2328 // same as SWAP on next char: "112". Thus skip all swapping.
2329 // Also skip when c3 is NUL.
2330 // Also get here when the third character is not a word character.
2331 // Second character may any char: "a.b" -> "b.a"
2332 if (c == c3 || c3 == NUL)
2333 {
2334 PROF_STORE(sp->ts_state)
2335 sp->ts_state = STATE_REP_INI;
2336 break;
2337 }
2338 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2339 {
2340 go_deeper(stack, depth, SCORE_SWAP3);
2341#ifdef DEBUG_TRIEWALK
2342 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c",
2343 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2344 c, c3);
2345#endif
2346 PROF_STORE(sp->ts_state)
2347 sp->ts_state = STATE_UNSWAP3;
2348 ++depth;
2349 if (has_mbyte)
2350 {
2351 tl = mb_char2len(c3);
2352 mch_memmove(p, p + n + fl, tl);
2353 mb_char2bytes(c2, p + tl);
2354 mb_char2bytes(c, p + fl + tl);
2355 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl;
2356 }
2357 else
2358 {
2359 p[0] = p[2];
2360 p[2] = c;
2361 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2362 }
2363 }
2364 else
2365 {
2366 PROF_STORE(sp->ts_state)
2367 sp->ts_state = STATE_REP_INI;
2368 }
2369 break;
2370
2371 case STATE_UNSWAP3:
2372 // Undo STATE_SWAP3: "321" -> "123"
2373 p = fword + sp->ts_fidx;
2374 if (has_mbyte)
2375 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002376 n = mb_ptr2len(p);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002377 c2 = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002378 fl = mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002379 c = mb_ptr2char(p + n + fl);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002380 tl = mb_ptr2len(p + n + fl);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002381 mch_memmove(p + fl + tl, p, n);
2382 mb_char2bytes(c, p);
2383 mb_char2bytes(c2, p + tl);
2384 p = p + tl;
2385 }
2386 else
2387 {
2388 c = *p;
2389 *p = p[2];
2390 p[2] = c;
2391 ++p;
2392 }
2393
2394 if (!soundfold && !spell_iswordp(p, curwin))
2395 {
2396 // Middle char is not a word char, skip the rotate. First and
2397 // third char were already checked at swap and swap3.
2398 PROF_STORE(sp->ts_state)
2399 sp->ts_state = STATE_REP_INI;
2400 break;
2401 }
2402
2403 // Rotate three characters left: "123" -> "231". We change
2404 // "fword" here, it's changed back afterwards at STATE_UNROT3L.
2405 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2406 {
2407 go_deeper(stack, depth, SCORE_SWAP3);
2408#ifdef DEBUG_TRIEWALK
2409 p = fword + sp->ts_fidx;
2410 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c",
2411 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2412 p[0], p[1], p[2]);
2413#endif
2414 PROF_STORE(sp->ts_state)
2415 sp->ts_state = STATE_UNROT3L;
2416 ++depth;
2417 p = fword + sp->ts_fidx;
2418 if (has_mbyte)
2419 {
2420 n = MB_CPTR2LEN(p);
2421 c = mb_ptr2char(p);
2422 fl = MB_CPTR2LEN(p + n);
2423 fl += MB_CPTR2LEN(p + n + fl);
2424 mch_memmove(p, p + n, fl);
2425 mb_char2bytes(c, p + fl);
2426 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
2427 }
2428 else
2429 {
2430 c = *p;
2431 *p = p[1];
2432 p[1] = p[2];
2433 p[2] = c;
2434 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2435 }
2436 }
2437 else
2438 {
2439 PROF_STORE(sp->ts_state)
2440 sp->ts_state = STATE_REP_INI;
2441 }
2442 break;
2443
2444 case STATE_UNROT3L:
2445 // Undo ROT3L: "231" -> "123"
2446 p = fword + sp->ts_fidx;
2447 if (has_mbyte)
2448 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002449 n = mb_ptr2len(p);
2450 n += mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002451 c = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002452 tl = mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002453 mch_memmove(p + tl, p, n);
2454 mb_char2bytes(c, p);
2455 }
2456 else
2457 {
2458 c = p[2];
2459 p[2] = p[1];
2460 p[1] = *p;
2461 *p = c;
2462 }
2463
2464 // Rotate three bytes right: "123" -> "312". We change "fword"
2465 // here, it's changed back afterwards at STATE_UNROT3R.
2466 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2467 {
2468 go_deeper(stack, depth, SCORE_SWAP3);
2469#ifdef DEBUG_TRIEWALK
2470 p = fword + sp->ts_fidx;
2471 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c",
2472 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2473 p[0], p[1], p[2]);
2474#endif
2475 PROF_STORE(sp->ts_state)
2476 sp->ts_state = STATE_UNROT3R;
2477 ++depth;
2478 p = fword + sp->ts_fidx;
2479 if (has_mbyte)
2480 {
2481 n = MB_CPTR2LEN(p);
2482 n += MB_CPTR2LEN(p + n);
2483 c = mb_ptr2char(p + n);
2484 tl = MB_CPTR2LEN(p + n);
2485 mch_memmove(p + tl, p, n);
2486 mb_char2bytes(c, p);
2487 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl;
2488 }
2489 else
2490 {
2491 c = p[2];
2492 p[2] = p[1];
2493 p[1] = *p;
2494 *p = c;
2495 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2496 }
2497 }
2498 else
2499 {
2500 PROF_STORE(sp->ts_state)
2501 sp->ts_state = STATE_REP_INI;
2502 }
2503 break;
2504
2505 case STATE_UNROT3R:
2506 // Undo ROT3R: "312" -> "123"
2507 p = fword + sp->ts_fidx;
2508 if (has_mbyte)
2509 {
2510 c = mb_ptr2char(p);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002511 tl = mb_ptr2len(p);
2512 n = mb_ptr2len(p + tl);
2513 n += mb_ptr2len(p + tl + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002514 mch_memmove(p, p + tl, n);
2515 mb_char2bytes(c, p + n);
2516 }
2517 else
2518 {
2519 c = *p;
2520 *p = p[1];
2521 p[1] = p[2];
2522 p[2] = c;
2523 }
2524 // FALLTHROUGH
2525
2526 case STATE_REP_INI:
2527 // Check if matching with REP items from the .aff file would work.
2528 // Quickly skip if:
2529 // - there are no REP items and we are not in the soundfold trie
2530 // - the score is going to be too high anyway
2531 // - already applied a REP item or swapped here
2532 if ((lp->lp_replang == NULL && !soundfold)
2533 || sp->ts_score + SCORE_REP >= su->su_maxscore
2534 || sp->ts_fidx < sp->ts_fidxtry)
2535 {
2536 PROF_STORE(sp->ts_state)
2537 sp->ts_state = STATE_FINAL;
2538 break;
2539 }
2540
2541 // Use the first byte to quickly find the first entry that may
2542 // match. If the index is -1 there is none.
2543 if (soundfold)
2544 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]];
2545 else
2546 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]];
2547
2548 if (sp->ts_curi < 0)
2549 {
2550 PROF_STORE(sp->ts_state)
2551 sp->ts_state = STATE_FINAL;
2552 break;
2553 }
2554
2555 PROF_STORE(sp->ts_state)
2556 sp->ts_state = STATE_REP;
2557 // FALLTHROUGH
2558
2559 case STATE_REP:
2560 // Try matching with REP items from the .aff file. For each match
2561 // replace the characters and check if the resulting word is
2562 // valid.
2563 p = fword + sp->ts_fidx;
2564
2565 if (soundfold)
2566 gap = &slang->sl_repsal;
2567 else
2568 gap = &lp->lp_replang->sl_rep;
2569 while (sp->ts_curi < gap->ga_len)
2570 {
2571 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++;
2572 if (*ftp->ft_from != *p)
2573 {
2574 // past possible matching entries
2575 sp->ts_curi = gap->ga_len;
2576 break;
2577 }
2578 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0
2579 && TRY_DEEPER(su, stack, depth, SCORE_REP))
2580 {
2581 go_deeper(stack, depth, SCORE_REP);
2582#ifdef DEBUG_TRIEWALK
2583 sprintf(changename[depth], "%.*s-%s: replace %s with %s",
2584 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2585 ftp->ft_from, ftp->ft_to);
2586#endif
2587 // Need to undo this afterwards.
2588 PROF_STORE(sp->ts_state)
2589 sp->ts_state = STATE_REP_UNDO;
2590
2591 // Change the "from" to the "to" string.
2592 ++depth;
2593 fl = (int)STRLEN(ftp->ft_from);
2594 tl = (int)STRLEN(ftp->ft_to);
2595 if (fl != tl)
2596 {
2597 STRMOVE(p + tl, p + fl);
2598 repextra += tl - fl;
2599 }
2600 mch_memmove(p, ftp->ft_to, tl);
2601 stack[depth].ts_fidxtry = sp->ts_fidx + tl;
2602 stack[depth].ts_tcharlen = 0;
2603 break;
2604 }
2605 }
2606
2607 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP)
2608 {
2609 // No (more) matches.
2610 PROF_STORE(sp->ts_state)
2611 sp->ts_state = STATE_FINAL;
2612 }
2613
2614 break;
2615
2616 case STATE_REP_UNDO:
2617 // Undo a REP replacement and continue with the next one.
2618 if (soundfold)
2619 gap = &slang->sl_repsal;
2620 else
2621 gap = &lp->lp_replang->sl_rep;
2622 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1;
2623 fl = (int)STRLEN(ftp->ft_from);
2624 tl = (int)STRLEN(ftp->ft_to);
2625 p = fword + sp->ts_fidx;
2626 if (fl != tl)
2627 {
2628 STRMOVE(p + fl, p + tl);
2629 repextra -= tl - fl;
2630 }
2631 mch_memmove(p, ftp->ft_from, fl);
2632 PROF_STORE(sp->ts_state)
2633 sp->ts_state = STATE_REP;
2634 break;
2635
2636 default:
2637 // Did all possible states at this level, go up one level.
2638 --depth;
2639
2640 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE)
2641 {
2642 // Continue in or go back to the prefix tree.
2643 byts = pbyts;
2644 idxs = pidxs;
2645 }
2646
2647 // Don't check for CTRL-C too often, it takes time.
2648 if (--breakcheckcount == 0)
2649 {
2650 ui_breakcheck();
2651 breakcheckcount = 1000;
2652 }
2653 }
2654 }
2655}
2656
2657
2658/*
2659 * Go one level deeper in the tree.
2660 */
2661 static void
2662go_deeper(trystate_T *stack, int depth, int score_add)
2663{
2664 stack[depth + 1] = stack[depth];
2665 stack[depth + 1].ts_state = STATE_START;
2666 stack[depth + 1].ts_score = stack[depth].ts_score + score_add;
2667 stack[depth + 1].ts_curi = 1; // start just after length byte
2668 stack[depth + 1].ts_flags = 0;
2669}
2670
2671/*
2672 * "fword" is a good word with case folded. Find the matching keep-case
2673 * words and put it in "kword".
2674 * Theoretically there could be several keep-case words that result in the
2675 * same case-folded word, but we only find one...
2676 */
2677 static void
2678find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword)
2679{
2680 char_u uword[MAXWLEN]; // "fword" in upper-case
2681 int depth;
2682 idx_T tryidx;
2683
2684 // The following arrays are used at each depth in the tree.
2685 idx_T arridx[MAXWLEN];
2686 int round[MAXWLEN];
2687 int fwordidx[MAXWLEN];
2688 int uwordidx[MAXWLEN];
2689 int kwordlen[MAXWLEN];
2690
2691 int flen, ulen;
2692 int l;
2693 int len;
2694 int c;
2695 idx_T lo, hi, m;
2696 char_u *p;
2697 char_u *byts = slang->sl_kbyts; // array with bytes of the words
2698 idx_T *idxs = slang->sl_kidxs; // array with indexes
2699
2700 if (byts == NULL)
2701 {
2702 // array is empty: "cannot happen"
2703 *kword = NUL;
2704 return;
2705 }
2706
2707 // Make an all-cap version of "fword".
2708 allcap_copy(fword, uword);
2709
2710 // Each character needs to be tried both case-folded and upper-case.
2711 // All this gets very complicated if we keep in mind that changing case
2712 // may change the byte length of a multi-byte character...
2713 depth = 0;
2714 arridx[0] = 0;
2715 round[0] = 0;
2716 fwordidx[0] = 0;
2717 uwordidx[0] = 0;
2718 kwordlen[0] = 0;
2719 while (depth >= 0)
2720 {
2721 if (fword[fwordidx[depth]] == NUL)
2722 {
2723 // We are at the end of "fword". If the tree allows a word to end
2724 // here we have found a match.
2725 if (byts[arridx[depth] + 1] == 0)
2726 {
2727 kword[kwordlen[depth]] = NUL;
2728 return;
2729 }
2730
2731 // kword is getting too long, continue one level up
2732 --depth;
2733 }
2734 else if (++round[depth] > 2)
2735 {
2736 // tried both fold-case and upper-case character, continue one
2737 // level up
2738 --depth;
2739 }
2740 else
2741 {
2742 // round[depth] == 1: Try using the folded-case character.
2743 // round[depth] == 2: Try using the upper-case character.
2744 if (has_mbyte)
2745 {
2746 flen = MB_CPTR2LEN(fword + fwordidx[depth]);
2747 ulen = MB_CPTR2LEN(uword + uwordidx[depth]);
2748 }
2749 else
2750 ulen = flen = 1;
2751 if (round[depth] == 1)
2752 {
2753 p = fword + fwordidx[depth];
2754 l = flen;
2755 }
2756 else
2757 {
2758 p = uword + uwordidx[depth];
2759 l = ulen;
2760 }
2761
2762 for (tryidx = arridx[depth]; l > 0; --l)
2763 {
2764 // Perform a binary search in the list of accepted bytes.
2765 len = byts[tryidx++];
2766 c = *p++;
2767 lo = tryidx;
2768 hi = tryidx + len - 1;
2769 while (lo < hi)
2770 {
2771 m = (lo + hi) / 2;
2772 if (byts[m] > c)
2773 hi = m - 1;
2774 else if (byts[m] < c)
2775 lo = m + 1;
2776 else
2777 {
2778 lo = hi = m;
2779 break;
2780 }
2781 }
2782
2783 // Stop if there is no matching byte.
2784 if (hi < lo || byts[lo] != c)
2785 break;
2786
2787 // Continue at the child (if there is one).
2788 tryidx = idxs[lo];
2789 }
2790
2791 if (l == 0)
2792 {
2793 // Found the matching char. Copy it to "kword" and go a
2794 // level deeper.
2795 if (round[depth] == 1)
2796 {
2797 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth],
2798 flen);
2799 kwordlen[depth + 1] = kwordlen[depth] + flen;
2800 }
2801 else
2802 {
2803 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth],
2804 ulen);
2805 kwordlen[depth + 1] = kwordlen[depth] + ulen;
2806 }
2807 fwordidx[depth + 1] = fwordidx[depth] + flen;
2808 uwordidx[depth + 1] = uwordidx[depth] + ulen;
2809
2810 ++depth;
2811 arridx[depth] = tryidx;
2812 round[depth] = 0;
2813 }
2814 }
2815 }
2816
2817 // Didn't find it: "cannot happen".
2818 *kword = NUL;
2819}
2820
2821/*
2822 * Compute the sound-a-like score for suggestions in su->su_ga and add them to
2823 * su->su_sga.
2824 */
2825 static void
2826score_comp_sal(suginfo_T *su)
2827{
2828 langp_T *lp;
2829 char_u badsound[MAXWLEN];
2830 int i;
2831 suggest_T *stp;
2832 suggest_T *sstp;
2833 int score;
2834 int lpi;
2835
2836 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL)
2837 return;
2838
2839 // Use the sound-folding of the first language that supports it.
2840 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
2841 {
2842 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
2843 if (lp->lp_slang->sl_sal.ga_len > 0)
2844 {
2845 // soundfold the bad word
2846 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound);
2847
2848 for (i = 0; i < su->su_ga.ga_len; ++i)
2849 {
2850 stp = &SUG(su->su_ga, i);
2851
2852 // Case-fold the suggested word, sound-fold it and compute the
2853 // sound-a-like score.
2854 score = stp_sal_score(stp, su, lp->lp_slang, badsound);
2855 if (score < SCORE_MAXMAX)
2856 {
2857 // Add the suggestion.
2858 sstp = &SUG(su->su_sga, su->su_sga.ga_len);
2859 sstp->st_word = vim_strsave(stp->st_word);
2860 if (sstp->st_word != NULL)
2861 {
2862 sstp->st_wordlen = stp->st_wordlen;
2863 sstp->st_score = score;
2864 sstp->st_altscore = 0;
2865 sstp->st_orglen = stp->st_orglen;
2866 ++su->su_sga.ga_len;
2867 }
2868 }
2869 }
2870 break;
2871 }
2872 }
2873}
2874
2875/*
2876 * Combine the list of suggestions in su->su_ga and su->su_sga.
2877 * They are entwined.
2878 */
2879 static void
2880score_combine(suginfo_T *su)
2881{
2882 int i;
2883 int j;
2884 garray_T ga;
2885 garray_T *gap;
2886 langp_T *lp;
2887 suggest_T *stp;
2888 char_u *p;
2889 char_u badsound[MAXWLEN];
2890 int round;
2891 int lpi;
2892 slang_T *slang = NULL;
2893
2894 // Add the alternate score to su_ga.
2895 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
2896 {
2897 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
2898 if (lp->lp_slang->sl_sal.ga_len > 0)
2899 {
2900 // soundfold the bad word
2901 slang = lp->lp_slang;
2902 spell_soundfold(slang, su->su_fbadword, TRUE, badsound);
2903
2904 for (i = 0; i < su->su_ga.ga_len; ++i)
2905 {
2906 stp = &SUG(su->su_ga, i);
2907 stp->st_altscore = stp_sal_score(stp, su, slang, badsound);
2908 if (stp->st_altscore == SCORE_MAXMAX)
2909 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4;
2910 else
2911 stp->st_score = (stp->st_score * 3
2912 + stp->st_altscore) / 4;
2913 stp->st_salscore = FALSE;
2914 }
2915 break;
2916 }
2917 }
2918
2919 if (slang == NULL) // Using "double" without sound folding.
2920 {
2921 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore,
2922 su->su_maxcount);
2923 return;
2924 }
2925
2926 // Add the alternate score to su_sga.
2927 for (i = 0; i < su->su_sga.ga_len; ++i)
2928 {
2929 stp = &SUG(su->su_sga, i);
2930 stp->st_altscore = spell_edit_score(slang,
2931 su->su_badword, stp->st_word);
2932 if (stp->st_score == SCORE_MAXMAX)
2933 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8;
2934 else
2935 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8;
2936 stp->st_salscore = TRUE;
2937 }
2938
2939 // Remove bad suggestions, sort the suggestions and truncate at "maxcount"
2940 // for both lists.
2941 check_suggestions(su, &su->su_ga);
2942 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
2943 check_suggestions(su, &su->su_sga);
2944 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount);
2945
Bram Moolenaar04935fb2022-01-08 16:19:22 +00002946 ga_init2(&ga, sizeof(suginfo_T), 1);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002947 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL)
2948 return;
2949
2950 stp = &SUG(ga, 0);
2951 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i)
2952 {
2953 // round 1: get a suggestion from su_ga
2954 // round 2: get a suggestion from su_sga
2955 for (round = 1; round <= 2; ++round)
2956 {
2957 gap = round == 1 ? &su->su_ga : &su->su_sga;
2958 if (i < gap->ga_len)
2959 {
2960 // Don't add a word if it's already there.
2961 p = SUG(*gap, i).st_word;
2962 for (j = 0; j < ga.ga_len; ++j)
2963 if (STRCMP(stp[j].st_word, p) == 0)
2964 break;
2965 if (j == ga.ga_len)
2966 stp[ga.ga_len++] = SUG(*gap, i);
2967 else
2968 vim_free(p);
2969 }
2970 }
2971 }
2972
2973 ga_clear(&su->su_ga);
2974 ga_clear(&su->su_sga);
2975
2976 // Truncate the list to the number of suggestions that will be displayed.
2977 if (ga.ga_len > su->su_maxcount)
2978 {
2979 for (i = su->su_maxcount; i < ga.ga_len; ++i)
2980 vim_free(stp[i].st_word);
2981 ga.ga_len = su->su_maxcount;
2982 }
2983
2984 su->su_ga = ga;
2985}
2986
2987/*
2988 * For the goodword in "stp" compute the soundalike score compared to the
2989 * badword.
2990 */
2991 static int
2992stp_sal_score(
2993 suggest_T *stp,
2994 suginfo_T *su,
2995 slang_T *slang,
2996 char_u *badsound) // sound-folded badword
2997{
2998 char_u *p;
2999 char_u *pbad;
3000 char_u *pgood;
3001 char_u badsound2[MAXWLEN];
3002 char_u fword[MAXWLEN];
3003 char_u goodsound[MAXWLEN];
3004 char_u goodword[MAXWLEN];
3005 int lendiff;
3006
3007 lendiff = (int)(su->su_badlen - stp->st_orglen);
3008 if (lendiff >= 0)
3009 pbad = badsound;
3010 else
3011 {
3012 // soundfold the bad word with more characters following
Bram Moolenaar4f135272021-06-11 19:07:40 +02003013 (void)spell_casefold(curwin,
3014 su->su_badptr, stp->st_orglen, fword, MAXWLEN);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02003015
3016 // When joining two words the sound often changes a lot. E.g., "t he"
3017 // sounds like "t h" while "the" sounds like "@". Avoid that by
3018 // removing the space. Don't do it when the good word also contains a
3019 // space.
3020 if (VIM_ISWHITE(su->su_badptr[su->su_badlen])
3021 && *skiptowhite(stp->st_word) == NUL)
3022 for (p = fword; *(p = skiptowhite(p)) != NUL; )
3023 STRMOVE(p, p + 1);
3024
3025 spell_soundfold(slang, fword, TRUE, badsound2);
3026 pbad = badsound2;
3027 }
3028
3029 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN)
3030 {
3031 // Add part of the bad word to the good word, so that we soundfold
3032 // what replaces the bad word.
3033 STRCPY(goodword, stp->st_word);
3034 vim_strncpy(goodword + stp->st_wordlen,
3035 su->su_badptr + su->su_badlen - lendiff, lendiff);
3036 pgood = goodword;
3037 }
3038 else
3039 pgood = stp->st_word;
3040
3041 // Sound-fold the word and compute the score for the difference.
3042 spell_soundfold(slang, pgood, FALSE, goodsound);
3043
3044 return soundalike_score(goodsound, pbad);
3045}
3046
3047// structure used to store soundfolded words that add_sound_suggest() has
3048// handled already.
3049typedef struct
3050{
3051 short sft_score; // lowest score used
3052 char_u sft_word[1]; // soundfolded word, actually longer
3053} sftword_T;
3054
3055static sftword_T dumsft;
3056#define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft)))
3057#define HI2SFT(hi) HIKEY2SFT((hi)->hi_key)
3058
3059/*
3060 * Prepare for calling suggest_try_soundalike().
3061 */
3062 static void
3063suggest_try_soundalike_prep(void)
3064{
3065 langp_T *lp;
3066 int lpi;
3067 slang_T *slang;
3068
3069 // Do this for all languages that support sound folding and for which a
3070 // .sug file has been loaded.
3071 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3072 {
3073 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3074 slang = lp->lp_slang;
3075 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3076 // prepare the hashtable used by add_sound_suggest()
3077 hash_init(&slang->sl_sounddone);
3078 }
3079}
3080
3081/*
3082 * Find suggestions by comparing the word in a sound-a-like form.
3083 * Note: This doesn't support postponed prefixes.
3084 */
3085 static void
3086suggest_try_soundalike(suginfo_T *su)
3087{
3088 char_u salword[MAXWLEN];
3089 langp_T *lp;
3090 int lpi;
3091 slang_T *slang;
3092
3093 // Do this for all languages that support sound folding and for which a
3094 // .sug file has been loaded.
3095 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3096 {
3097 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3098 slang = lp->lp_slang;
3099 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3100 {
3101 // soundfold the bad word
3102 spell_soundfold(slang, su->su_fbadword, TRUE, salword);
3103
3104 // try all kinds of inserts/deletes/swaps/etc.
3105 // TODO: also soundfold the next words, so that we can try joining
3106 // and splitting
3107#ifdef SUGGEST_PROFILE
3108 prof_init();
3109#endif
3110 suggest_trie_walk(su, lp, salword, TRUE);
3111#ifdef SUGGEST_PROFILE
3112 prof_report("soundalike");
3113#endif
3114 }
3115 }
3116}
3117
3118/*
3119 * Finish up after calling suggest_try_soundalike().
3120 */
3121 static void
3122suggest_try_soundalike_finish(void)
3123{
3124 langp_T *lp;
3125 int lpi;
3126 slang_T *slang;
3127 int todo;
3128 hashitem_T *hi;
3129
3130 // Do this for all languages that support sound folding and for which a
3131 // .sug file has been loaded.
3132 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3133 {
3134 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3135 slang = lp->lp_slang;
3136 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3137 {
3138 // Free the info about handled words.
3139 todo = (int)slang->sl_sounddone.ht_used;
3140 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi)
3141 if (!HASHITEM_EMPTY(hi))
3142 {
3143 vim_free(HI2SFT(hi));
3144 --todo;
3145 }
3146
3147 // Clear the hashtable, it may also be used by another region.
3148 hash_clear(&slang->sl_sounddone);
3149 hash_init(&slang->sl_sounddone);
3150 }
3151 }
3152}
3153
3154/*
3155 * A match with a soundfolded word is found. Add the good word(s) that
3156 * produce this soundfolded word.
3157 */
3158 static void
3159add_sound_suggest(
3160 suginfo_T *su,
3161 char_u *goodword,
3162 int score, // soundfold score
3163 langp_T *lp)
3164{
3165 slang_T *slang = lp->lp_slang; // language for sound folding
3166 int sfwordnr;
3167 char_u *nrline;
3168 int orgnr;
3169 char_u theword[MAXWLEN];
3170 int i;
3171 int wlen;
3172 char_u *byts;
3173 idx_T *idxs;
3174 int n;
3175 int wordcount;
3176 int wc;
3177 int goodscore;
3178 hash_T hash;
3179 hashitem_T *hi;
3180 sftword_T *sft;
3181 int bc, gc;
3182 int limit;
3183
3184 // It's very well possible that the same soundfold word is found several
3185 // times with different scores. Since the following is quite slow only do
3186 // the words that have a better score than before. Use a hashtable to
3187 // remember the words that have been done.
3188 hash = hash_hash(goodword);
3189 hi = hash_lookup(&slang->sl_sounddone, goodword, hash);
3190 if (HASHITEM_EMPTY(hi))
3191 {
3192 sft = alloc(sizeof(sftword_T) + STRLEN(goodword));
3193 if (sft != NULL)
3194 {
3195 sft->sft_score = score;
3196 STRCPY(sft->sft_word, goodword);
3197 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash);
3198 }
3199 }
3200 else
3201 {
3202 sft = HI2SFT(hi);
3203 if (score >= sft->sft_score)
3204 return;
3205 sft->sft_score = score;
3206 }
3207
3208 // Find the word nr in the soundfold tree.
3209 sfwordnr = soundfold_find(slang, goodword);
3210 if (sfwordnr < 0)
3211 {
3212 internal_error("add_sound_suggest()");
3213 return;
3214 }
3215
3216 // go over the list of good words that produce this soundfold word
3217 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE);
3218 orgnr = 0;
3219 while (*nrline != NUL)
3220 {
3221 // The wordnr was stored in a minimal nr of bytes as an offset to the
3222 // previous wordnr.
3223 orgnr += bytes2offset(&nrline);
3224
3225 byts = slang->sl_fbyts;
3226 idxs = slang->sl_fidxs;
3227
3228 // Lookup the word "orgnr" one of the two tries.
3229 n = 0;
3230 wordcount = 0;
3231 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen)
3232 {
3233 i = 1;
3234 if (wordcount == orgnr && byts[n + 1] == NUL)
3235 break; // found end of word
3236
3237 if (byts[n + 1] == NUL)
3238 ++wordcount;
3239
3240 // skip over the NUL bytes
3241 for ( ; byts[n + i] == NUL; ++i)
3242 if (i > byts[n]) // safety check
3243 {
3244 STRCPY(theword + wlen, "BAD");
3245 wlen += 3;
3246 goto badword;
3247 }
3248
3249 // One of the siblings must have the word.
3250 for ( ; i < byts[n]; ++i)
3251 {
3252 wc = idxs[idxs[n + i]]; // nr of words under this byte
3253 if (wordcount + wc > orgnr)
3254 break;
3255 wordcount += wc;
3256 }
3257
3258 theword[wlen] = byts[n + i];
3259 n = idxs[n + i];
3260 }
3261badword:
3262 theword[wlen] = NUL;
3263
3264 // Go over the possible flags and regions.
3265 for (; i <= byts[n] && byts[n + i] == NUL; ++i)
3266 {
3267 char_u cword[MAXWLEN];
3268 char_u *p;
3269 int flags = (int)idxs[n + i];
3270
3271 // Skip words with the NOSUGGEST flag
3272 if (flags & WF_NOSUGGEST)
3273 continue;
3274
3275 if (flags & WF_KEEPCAP)
3276 {
3277 // Must find the word in the keep-case tree.
3278 find_keepcap_word(slang, theword, cword);
3279 p = cword;
3280 }
3281 else
3282 {
3283 flags |= su->su_badflags;
3284 if ((flags & WF_CAPMASK) != 0)
3285 {
3286 // Need to fix case according to "flags".
3287 make_case_word(theword, cword, flags);
3288 p = cword;
3289 }
3290 else
3291 p = theword;
3292 }
3293
3294 // Add the suggestion.
3295 if (sps_flags & SPS_DOUBLE)
3296 {
3297 // Add the suggestion if the score isn't too bad.
3298 if (score <= su->su_maxscore)
3299 add_suggestion(su, &su->su_sga, p, su->su_badlen,
3300 score, 0, FALSE, slang, FALSE);
3301 }
3302 else
3303 {
3304 // Add a penalty for words in another region.
3305 if ((flags & WF_REGION)
3306 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
3307 goodscore = SCORE_REGION;
3308 else
3309 goodscore = 0;
3310
3311 // Add a small penalty for changing the first letter from
3312 // lower to upper case. Helps for "tath" -> "Kath", which is
3313 // less common than "tath" -> "path". Don't do it when the
3314 // letter is the same, that has already been counted.
3315 gc = PTR2CHAR(p);
3316 if (SPELL_ISUPPER(gc))
3317 {
3318 bc = PTR2CHAR(su->su_badword);
3319 if (!SPELL_ISUPPER(bc)
3320 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc))
3321 goodscore += SCORE_ICASE / 2;
3322 }
3323
3324 // Compute the score for the good word. This only does letter
3325 // insert/delete/swap/replace. REP items are not considered,
3326 // which may make the score a bit higher.
3327 // Use a limit for the score to make it work faster. Use
3328 // MAXSCORE(), because RESCORE() will change the score.
3329 // If the limit is very high then the iterative method is
3330 // inefficient, using an array is quicker.
3331 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score);
3332 if (limit > SCORE_LIMITMAX)
3333 goodscore += spell_edit_score(slang, su->su_badword, p);
3334 else
3335 goodscore += spell_edit_score_limit(slang, su->su_badword,
3336 p, limit);
3337
3338 // When going over the limit don't bother to do the rest.
3339 if (goodscore < SCORE_MAXMAX)
3340 {
3341 // Give a bonus to words seen before.
3342 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE);
3343
3344 // Add the suggestion if the score isn't too bad.
3345 goodscore = RESCORE(goodscore, score);
3346 if (goodscore <= su->su_sfmaxscore)
3347 add_suggestion(su, &su->su_ga, p, su->su_badlen,
3348 goodscore, score, TRUE, slang, TRUE);
3349 }
3350 }
3351 }
3352 // smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr);
3353 }
3354}
3355
3356/*
3357 * Find word "word" in fold-case tree for "slang" and return the word number.
3358 */
3359 static int
3360soundfold_find(slang_T *slang, char_u *word)
3361{
3362 idx_T arridx = 0;
3363 int len;
3364 int wlen = 0;
3365 int c;
3366 char_u *ptr = word;
3367 char_u *byts;
3368 idx_T *idxs;
3369 int wordnr = 0;
3370
3371 byts = slang->sl_sbyts;
3372 idxs = slang->sl_sidxs;
3373
3374 for (;;)
3375 {
3376 // First byte is the number of possible bytes.
3377 len = byts[arridx++];
3378
3379 // If the first possible byte is a zero the word could end here.
3380 // If the word ends we found the word. If not skip the NUL bytes.
3381 c = ptr[wlen];
3382 if (byts[arridx] == NUL)
3383 {
3384 if (c == NUL)
3385 break;
3386
3387 // Skip over the zeros, there can be several.
3388 while (len > 0 && byts[arridx] == NUL)
3389 {
3390 ++arridx;
3391 --len;
3392 }
3393 if (len == 0)
3394 return -1; // no children, word should have ended here
3395 ++wordnr;
3396 }
3397
3398 // If the word ends we didn't find it.
3399 if (c == NUL)
3400 return -1;
3401
3402 // Perform a binary search in the list of accepted bytes.
3403 if (c == TAB) // <Tab> is handled like <Space>
3404 c = ' ';
3405 while (byts[arridx] < c)
3406 {
3407 // The word count is in the first idxs[] entry of the child.
3408 wordnr += idxs[idxs[arridx]];
3409 ++arridx;
3410 if (--len == 0) // end of the bytes, didn't find it
3411 return -1;
3412 }
3413 if (byts[arridx] != c) // didn't find the byte
3414 return -1;
3415
3416 // Continue at the child (if there is one).
3417 arridx = idxs[arridx];
3418 ++wlen;
3419
3420 // One space in the good word may stand for several spaces in the
3421 // checked word.
3422 if (c == ' ')
3423 while (ptr[wlen] == ' ' || ptr[wlen] == TAB)
3424 ++wlen;
3425 }
3426
3427 return wordnr;
3428}
3429
3430/*
3431 * Return TRUE if "c1" and "c2" are similar characters according to the MAP
3432 * lines in the .aff file.
3433 */
3434 static int
3435similar_chars(slang_T *slang, int c1, int c2)
3436{
3437 int m1, m2;
3438 char_u buf[MB_MAXBYTES + 1];
3439 hashitem_T *hi;
3440
3441 if (c1 >= 256)
3442 {
3443 buf[mb_char2bytes(c1, buf)] = 0;
3444 hi = hash_find(&slang->sl_map_hash, buf);
3445 if (HASHITEM_EMPTY(hi))
3446 m1 = 0;
3447 else
3448 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
3449 }
3450 else
3451 m1 = slang->sl_map_array[c1];
3452 if (m1 == 0)
3453 return FALSE;
3454
3455
3456 if (c2 >= 256)
3457 {
3458 buf[mb_char2bytes(c2, buf)] = 0;
3459 hi = hash_find(&slang->sl_map_hash, buf);
3460 if (HASHITEM_EMPTY(hi))
3461 m2 = 0;
3462 else
3463 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
3464 }
3465 else
3466 m2 = slang->sl_map_array[c2];
3467
3468 return m1 == m2;
3469}
3470
3471/*
3472 * Add a suggestion to the list of suggestions.
3473 * For a suggestion that is already in the list the lowest score is remembered.
3474 */
3475 static void
3476add_suggestion(
3477 suginfo_T *su,
3478 garray_T *gap, // either su_ga or su_sga
3479 char_u *goodword,
3480 int badlenarg, // len of bad word replaced with "goodword"
3481 int score,
3482 int altscore,
3483 int had_bonus, // value for st_had_bonus
3484 slang_T *slang, // language for sound folding
3485 int maxsf) // su_maxscore applies to soundfold score,
3486 // su_sfmaxscore to the total score.
3487{
3488 int goodlen; // len of goodword changed
3489 int badlen; // len of bad word changed
3490 suggest_T *stp;
3491 suggest_T new_sug;
3492 int i;
3493 char_u *pgood, *pbad;
3494
3495 // Minimize "badlen" for consistency. Avoids that changing "the the" to
3496 // "thee the" is added next to changing the first "the" the "thee".
3497 pgood = goodword + STRLEN(goodword);
3498 pbad = su->su_badptr + badlenarg;
3499 for (;;)
3500 {
3501 goodlen = (int)(pgood - goodword);
3502 badlen = (int)(pbad - su->su_badptr);
3503 if (goodlen <= 0 || badlen <= 0)
3504 break;
3505 MB_PTR_BACK(goodword, pgood);
3506 MB_PTR_BACK(su->su_badptr, pbad);
3507 if (has_mbyte)
3508 {
3509 if (mb_ptr2char(pgood) != mb_ptr2char(pbad))
3510 break;
3511 }
3512 else if (*pgood != *pbad)
3513 break;
3514 }
3515
3516 if (badlen == 0 && goodlen == 0)
3517 // goodword doesn't change anything; may happen for "the the" changing
3518 // the first "the" to itself.
3519 return;
3520
3521 if (gap->ga_len == 0)
3522 i = -1;
3523 else
3524 {
3525 // Check if the word is already there. Also check the length that is
3526 // being replaced "thes," -> "these" is a different suggestion from
3527 // "thes" -> "these".
3528 stp = &SUG(*gap, 0);
3529 for (i = gap->ga_len; --i >= 0; ++stp)
3530 if (stp->st_wordlen == goodlen
3531 && stp->st_orglen == badlen
3532 && STRNCMP(stp->st_word, goodword, goodlen) == 0)
3533 {
3534 // Found it. Remember the word with the lowest score.
3535 if (stp->st_slang == NULL)
3536 stp->st_slang = slang;
3537
3538 new_sug.st_score = score;
3539 new_sug.st_altscore = altscore;
3540 new_sug.st_had_bonus = had_bonus;
3541
3542 if (stp->st_had_bonus != had_bonus)
3543 {
3544 // Only one of the two had the soundalike score computed.
3545 // Need to do that for the other one now, otherwise the
3546 // scores can't be compared. This happens because
3547 // suggest_try_change() doesn't compute the soundalike
3548 // word to keep it fast, while some special methods set
3549 // the soundalike score to zero.
3550 if (had_bonus)
3551 rescore_one(su, stp);
3552 else
3553 {
3554 new_sug.st_word = stp->st_word;
3555 new_sug.st_wordlen = stp->st_wordlen;
3556 new_sug.st_slang = stp->st_slang;
3557 new_sug.st_orglen = badlen;
3558 rescore_one(su, &new_sug);
3559 }
3560 }
3561
3562 if (stp->st_score > new_sug.st_score)
3563 {
3564 stp->st_score = new_sug.st_score;
3565 stp->st_altscore = new_sug.st_altscore;
3566 stp->st_had_bonus = new_sug.st_had_bonus;
3567 }
3568 break;
3569 }
3570 }
3571
3572 if (i < 0 && ga_grow(gap, 1) == OK)
3573 {
3574 // Add a suggestion.
3575 stp = &SUG(*gap, gap->ga_len);
3576 stp->st_word = vim_strnsave(goodword, goodlen);
3577 if (stp->st_word != NULL)
3578 {
3579 stp->st_wordlen = goodlen;
3580 stp->st_score = score;
3581 stp->st_altscore = altscore;
3582 stp->st_had_bonus = had_bonus;
3583 stp->st_orglen = badlen;
3584 stp->st_slang = slang;
3585 ++gap->ga_len;
3586
3587 // If we have too many suggestions now, sort the list and keep
3588 // the best suggestions.
3589 if (gap->ga_len > SUG_MAX_COUNT(su))
3590 {
3591 if (maxsf)
3592 su->su_sfmaxscore = cleanup_suggestions(gap,
3593 su->su_sfmaxscore, SUG_CLEAN_COUNT(su));
3594 else
3595 su->su_maxscore = cleanup_suggestions(gap,
3596 su->su_maxscore, SUG_CLEAN_COUNT(su));
3597 }
3598 }
3599 }
3600}
3601
3602/*
3603 * Suggestions may in fact be flagged as errors. Esp. for banned words and
3604 * for split words, such as "the the". Remove these from the list here.
3605 */
3606 static void
3607check_suggestions(
3608 suginfo_T *su,
3609 garray_T *gap) // either su_ga or su_sga
3610{
3611 suggest_T *stp;
3612 int i;
3613 char_u longword[MAXWLEN + 1];
3614 int len;
3615 hlf_T attr;
3616
Bram Moolenaar9c2b0662020-09-01 19:56:15 +02003617 if (gap->ga_len == 0)
3618 return;
Bram Moolenaar46a426c2019-09-27 12:41:56 +02003619 stp = &SUG(*gap, 0);
3620 for (i = gap->ga_len - 1; i >= 0; --i)
3621 {
3622 // Need to append what follows to check for "the the".
3623 vim_strncpy(longword, stp[i].st_word, MAXWLEN);
3624 len = stp[i].st_wordlen;
3625 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen,
3626 MAXWLEN - len);
3627 attr = HLF_COUNT;
3628 (void)spell_check(curwin, longword, &attr, NULL, FALSE);
3629 if (attr != HLF_COUNT)
3630 {
3631 // Remove this entry.
3632 vim_free(stp[i].st_word);
3633 --gap->ga_len;
3634 if (i < gap->ga_len)
3635 mch_memmove(stp + i, stp + i + 1,
3636 sizeof(suggest_T) * (gap->ga_len - i));
3637 }
3638 }
3639}
3640
3641
3642/*
3643 * Add a word to be banned.
3644 */
3645 static void
3646add_banned(
3647 suginfo_T *su,
3648 char_u *word)
3649{
3650 char_u *s;
3651 hash_T hash;
3652 hashitem_T *hi;
3653
3654 hash = hash_hash(word);
3655 hi = hash_lookup(&su->su_banned, word, hash);
3656 if (HASHITEM_EMPTY(hi))
3657 {
3658 s = vim_strsave(word);
3659 if (s != NULL)
3660 hash_add_item(&su->su_banned, hi, s, hash);
3661 }
3662}
3663
3664/*
3665 * Recompute the score for all suggestions if sound-folding is possible. This
3666 * is slow, thus only done for the final results.
3667 */
3668 static void
3669rescore_suggestions(suginfo_T *su)
3670{
3671 int i;
3672
3673 if (su->su_sallang != NULL)
3674 for (i = 0; i < su->su_ga.ga_len; ++i)
3675 rescore_one(su, &SUG(su->su_ga, i));
3676}
3677
3678/*
3679 * Recompute the score for one suggestion if sound-folding is possible.
3680 */
3681 static void
3682rescore_one(suginfo_T *su, suggest_T *stp)
3683{
3684 slang_T *slang = stp->st_slang;
3685 char_u sal_badword[MAXWLEN];
3686 char_u *p;
3687
3688 // Only rescore suggestions that have no sal score yet and do have a
3689 // language.
3690 if (slang != NULL && slang->sl_sal.ga_len > 0 && !stp->st_had_bonus)
3691 {
3692 if (slang == su->su_sallang)
3693 p = su->su_sal_badword;
3694 else
3695 {
3696 spell_soundfold(slang, su->su_fbadword, TRUE, sal_badword);
3697 p = sal_badword;
3698 }
3699
3700 stp->st_altscore = stp_sal_score(stp, su, slang, p);
3701 if (stp->st_altscore == SCORE_MAXMAX)
3702 stp->st_altscore = SCORE_BIG;
3703 stp->st_score = RESCORE(stp->st_score, stp->st_altscore);
3704 stp->st_had_bonus = TRUE;
3705 }
3706}
3707
3708static int sug_compare(const void *s1, const void *s2);
3709
3710/*
3711 * Function given to qsort() to sort the suggestions on st_score.
3712 * First on "st_score", then "st_altscore" then alphabetically.
3713 */
3714 static int
3715sug_compare(const void *s1, const void *s2)
3716{
3717 suggest_T *p1 = (suggest_T *)s1;
3718 suggest_T *p2 = (suggest_T *)s2;
3719 int n = p1->st_score - p2->st_score;
3720
3721 if (n == 0)
3722 {
3723 n = p1->st_altscore - p2->st_altscore;
3724 if (n == 0)
3725 n = STRICMP(p1->st_word, p2->st_word);
3726 }
3727 return n;
3728}
3729
3730/*
3731 * Cleanup the suggestions:
3732 * - Sort on score.
3733 * - Remove words that won't be displayed.
3734 * Returns the maximum score in the list or "maxscore" unmodified.
3735 */
3736 static int
3737cleanup_suggestions(
3738 garray_T *gap,
3739 int maxscore,
3740 int keep) // nr of suggestions to keep
3741{
Bram Moolenaarbb65a562020-03-15 18:15:03 +01003742 if (gap->ga_len > 0)
Bram Moolenaar46a426c2019-09-27 12:41:56 +02003743 {
Bram Moolenaarbb65a562020-03-15 18:15:03 +01003744 // Sort the list.
3745 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T),
3746 sug_compare);
3747
3748 // Truncate the list to the number of suggestions that will be
3749 // displayed.
3750 if (gap->ga_len > keep)
3751 {
Bram Moolenaar4ad739f2020-09-02 10:25:45 +02003752 int i;
3753 suggest_T *stp = &SUG(*gap, 0);
3754
Bram Moolenaarbb65a562020-03-15 18:15:03 +01003755 for (i = keep; i < gap->ga_len; ++i)
3756 vim_free(stp[i].st_word);
3757 gap->ga_len = keep;
3758 if (keep >= 1)
3759 return stp[keep - 1].st_score;
3760 }
Bram Moolenaar46a426c2019-09-27 12:41:56 +02003761 }
3762 return maxscore;
3763}
3764
3765/*
3766 * Compute a score for two sound-a-like words.
3767 * This permits up to two inserts/deletes/swaps/etc. to keep things fast.
3768 * Instead of a generic loop we write out the code. That keeps it fast by
3769 * avoiding checks that will not be possible.
3770 */
3771 static int
3772soundalike_score(
3773 char_u *goodstart, // sound-folded good word
3774 char_u *badstart) // sound-folded bad word
3775{
3776 char_u *goodsound = goodstart;
3777 char_u *badsound = badstart;
3778 int goodlen;
3779 int badlen;
3780 int n;
3781 char_u *pl, *ps;
3782 char_u *pl2, *ps2;
3783 int score = 0;
3784
3785 // Adding/inserting "*" at the start (word starts with vowel) shouldn't be
3786 // counted so much, vowels halfway the word aren't counted at all.
3787 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound)
3788 {
3789 if ((badsound[0] == NUL && goodsound[1] == NUL)
3790 || (goodsound[0] == NUL && badsound[1] == NUL))
3791 // changing word with vowel to word without a sound
3792 return SCORE_DEL;
3793 if (badsound[0] == NUL || goodsound[0] == NUL)
3794 // more than two changes
3795 return SCORE_MAXMAX;
3796
3797 if (badsound[1] == goodsound[1]
3798 || (badsound[1] != NUL
3799 && goodsound[1] != NUL
3800 && badsound[2] == goodsound[2]))
3801 {
3802 // handle like a substitute
3803 }
3804 else
3805 {
3806 score = 2 * SCORE_DEL / 3;
3807 if (*badsound == '*')
3808 ++badsound;
3809 else
3810 ++goodsound;
3811 }
3812 }
3813
3814 goodlen = (int)STRLEN(goodsound);
3815 badlen = (int)STRLEN(badsound);
3816
3817 // Return quickly if the lengths are too different to be fixed by two
3818 // changes.
3819 n = goodlen - badlen;
3820 if (n < -2 || n > 2)
3821 return SCORE_MAXMAX;
3822
3823 if (n > 0)
3824 {
3825 pl = goodsound; // goodsound is longest
3826 ps = badsound;
3827 }
3828 else
3829 {
3830 pl = badsound; // badsound is longest
3831 ps = goodsound;
3832 }
3833
3834 // Skip over the identical part.
3835 while (*pl == *ps && *pl != NUL)
3836 {
3837 ++pl;
3838 ++ps;
3839 }
3840
3841 switch (n)
3842 {
3843 case -2:
3844 case 2:
3845 // Must delete two characters from "pl".
3846 ++pl; // first delete
3847 while (*pl == *ps)
3848 {
3849 ++pl;
3850 ++ps;
3851 }
3852 // strings must be equal after second delete
3853 if (STRCMP(pl + 1, ps) == 0)
3854 return score + SCORE_DEL * 2;
3855
3856 // Failed to compare.
3857 break;
3858
3859 case -1:
3860 case 1:
3861 // Minimal one delete from "pl" required.
3862
3863 // 1: delete
3864 pl2 = pl + 1;
3865 ps2 = ps;
3866 while (*pl2 == *ps2)
3867 {
3868 if (*pl2 == NUL) // reached the end
3869 return score + SCORE_DEL;
3870 ++pl2;
3871 ++ps2;
3872 }
3873
3874 // 2: delete then swap, then rest must be equal
3875 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3876 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3877 return score + SCORE_DEL + SCORE_SWAP;
3878
3879 // 3: delete then substitute, then the rest must be equal
3880 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3881 return score + SCORE_DEL + SCORE_SUBST;
3882
3883 // 4: first swap then delete
3884 if (pl[0] == ps[1] && pl[1] == ps[0])
3885 {
3886 pl2 = pl + 2; // swap, skip two chars
3887 ps2 = ps + 2;
3888 while (*pl2 == *ps2)
3889 {
3890 ++pl2;
3891 ++ps2;
3892 }
3893 // delete a char and then strings must be equal
3894 if (STRCMP(pl2 + 1, ps2) == 0)
3895 return score + SCORE_SWAP + SCORE_DEL;
3896 }
3897
3898 // 5: first substitute then delete
3899 pl2 = pl + 1; // substitute, skip one char
3900 ps2 = ps + 1;
3901 while (*pl2 == *ps2)
3902 {
3903 ++pl2;
3904 ++ps2;
3905 }
3906 // delete a char and then strings must be equal
3907 if (STRCMP(pl2 + 1, ps2) == 0)
3908 return score + SCORE_SUBST + SCORE_DEL;
3909
3910 // Failed to compare.
3911 break;
3912
3913 case 0:
3914 // Lengths are equal, thus changes must result in same length: An
3915 // insert is only possible in combination with a delete.
3916 // 1: check if for identical strings
3917 if (*pl == NUL)
3918 return score;
3919
3920 // 2: swap
3921 if (pl[0] == ps[1] && pl[1] == ps[0])
3922 {
3923 pl2 = pl + 2; // swap, skip two chars
3924 ps2 = ps + 2;
3925 while (*pl2 == *ps2)
3926 {
3927 if (*pl2 == NUL) // reached the end
3928 return score + SCORE_SWAP;
3929 ++pl2;
3930 ++ps2;
3931 }
3932 // 3: swap and swap again
3933 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3934 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3935 return score + SCORE_SWAP + SCORE_SWAP;
3936
3937 // 4: swap and substitute
3938 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3939 return score + SCORE_SWAP + SCORE_SUBST;
3940 }
3941
3942 // 5: substitute
3943 pl2 = pl + 1;
3944 ps2 = ps + 1;
3945 while (*pl2 == *ps2)
3946 {
3947 if (*pl2 == NUL) // reached the end
3948 return score + SCORE_SUBST;
3949 ++pl2;
3950 ++ps2;
3951 }
3952
3953 // 6: substitute and swap
3954 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3955 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3956 return score + SCORE_SUBST + SCORE_SWAP;
3957
3958 // 7: substitute and substitute
3959 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3960 return score + SCORE_SUBST + SCORE_SUBST;
3961
3962 // 8: insert then delete
3963 pl2 = pl;
3964 ps2 = ps + 1;
3965 while (*pl2 == *ps2)
3966 {
3967 ++pl2;
3968 ++ps2;
3969 }
3970 if (STRCMP(pl2 + 1, ps2) == 0)
3971 return score + SCORE_INS + SCORE_DEL;
3972
3973 // 9: delete then insert
3974 pl2 = pl + 1;
3975 ps2 = ps;
3976 while (*pl2 == *ps2)
3977 {
3978 ++pl2;
3979 ++ps2;
3980 }
3981 if (STRCMP(pl2, ps2 + 1) == 0)
3982 return score + SCORE_INS + SCORE_DEL;
3983
3984 // Failed to compare.
3985 break;
3986 }
3987
3988 return SCORE_MAXMAX;
3989}
3990
3991/*
3992 * Compute the "edit distance" to turn "badword" into "goodword". The less
3993 * deletes/inserts/substitutes/swaps are required the lower the score.
3994 *
3995 * The algorithm is described by Du and Chang, 1992.
3996 * The implementation of the algorithm comes from Aspell editdist.cpp,
3997 * edit_distance(). It has been converted from C++ to C and modified to
3998 * support multi-byte characters.
3999 */
4000 static int
4001spell_edit_score(
4002 slang_T *slang,
4003 char_u *badword,
4004 char_u *goodword)
4005{
4006 int *cnt;
4007 int badlen, goodlen; // lengths including NUL
4008 int j, i;
4009 int t;
4010 int bc, gc;
4011 int pbc, pgc;
4012 char_u *p;
4013 int wbadword[MAXWLEN];
4014 int wgoodword[MAXWLEN];
4015
4016 if (has_mbyte)
4017 {
4018 // Get the characters from the multi-byte strings and put them in an
4019 // int array for easy access.
4020 for (p = badword, badlen = 0; *p != NUL; )
4021 wbadword[badlen++] = mb_cptr2char_adv(&p);
4022 wbadword[badlen++] = 0;
4023 for (p = goodword, goodlen = 0; *p != NUL; )
4024 wgoodword[goodlen++] = mb_cptr2char_adv(&p);
4025 wgoodword[goodlen++] = 0;
4026 }
4027 else
4028 {
4029 badlen = (int)STRLEN(badword) + 1;
4030 goodlen = (int)STRLEN(goodword) + 1;
4031 }
4032
4033 // We use "cnt" as an array: CNT(badword_idx, goodword_idx).
4034#define CNT(a, b) cnt[(a) + (b) * (badlen + 1)]
4035 cnt = ALLOC_MULT(int, (badlen + 1) * (goodlen + 1));
4036 if (cnt == NULL)
4037 return 0; // out of memory
4038
4039 CNT(0, 0) = 0;
4040 for (j = 1; j <= goodlen; ++j)
4041 CNT(0, j) = CNT(0, j - 1) + SCORE_INS;
4042
4043 for (i = 1; i <= badlen; ++i)
4044 {
4045 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL;
4046 for (j = 1; j <= goodlen; ++j)
4047 {
4048 if (has_mbyte)
4049 {
4050 bc = wbadword[i - 1];
4051 gc = wgoodword[j - 1];
4052 }
4053 else
4054 {
4055 bc = badword[i - 1];
4056 gc = goodword[j - 1];
4057 }
4058 if (bc == gc)
4059 CNT(i, j) = CNT(i - 1, j - 1);
4060 else
4061 {
4062 // Use a better score when there is only a case difference.
4063 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4064 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
4065 else
4066 {
4067 // For a similar character use SCORE_SIMILAR.
4068 if (slang != NULL
4069 && slang->sl_has_map
4070 && similar_chars(slang, gc, bc))
4071 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1);
4072 else
4073 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
4074 }
4075
4076 if (i > 1 && j > 1)
4077 {
4078 if (has_mbyte)
4079 {
4080 pbc = wbadword[i - 2];
4081 pgc = wgoodword[j - 2];
4082 }
4083 else
4084 {
4085 pbc = badword[i - 2];
4086 pgc = goodword[j - 2];
4087 }
4088 if (bc == pgc && pbc == gc)
4089 {
4090 t = SCORE_SWAP + CNT(i - 2, j - 2);
4091 if (t < CNT(i, j))
4092 CNT(i, j) = t;
4093 }
4094 }
4095 t = SCORE_DEL + CNT(i - 1, j);
4096 if (t < CNT(i, j))
4097 CNT(i, j) = t;
4098 t = SCORE_INS + CNT(i, j - 1);
4099 if (t < CNT(i, j))
4100 CNT(i, j) = t;
4101 }
4102 }
4103 }
4104
4105 i = CNT(badlen - 1, goodlen - 1);
4106 vim_free(cnt);
4107 return i;
4108}
4109
4110typedef struct
4111{
4112 int badi;
4113 int goodi;
4114 int score;
4115} limitscore_T;
4116
4117/*
4118 * Like spell_edit_score(), but with a limit on the score to make it faster.
4119 * May return SCORE_MAXMAX when the score is higher than "limit".
4120 *
4121 * This uses a stack for the edits still to be tried.
4122 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support
4123 * for multi-byte characters.
4124 */
4125 static int
4126spell_edit_score_limit(
4127 slang_T *slang,
4128 char_u *badword,
4129 char_u *goodword,
4130 int limit)
4131{
4132 limitscore_T stack[10]; // allow for over 3 * 2 edits
4133 int stackidx;
4134 int bi, gi;
4135 int bi2, gi2;
4136 int bc, gc;
4137 int score;
4138 int score_off;
4139 int minscore;
4140 int round;
4141
4142 // Multi-byte characters require a bit more work, use a different function
4143 // to avoid testing "has_mbyte" quite often.
4144 if (has_mbyte)
4145 return spell_edit_score_limit_w(slang, badword, goodword, limit);
4146
4147 // The idea is to go from start to end over the words. So long as
4148 // characters are equal just continue, this always gives the lowest score.
4149 // When there is a difference try several alternatives. Each alternative
4150 // increases "score" for the edit distance. Some of the alternatives are
4151 // pushed unto a stack and tried later, some are tried right away. At the
4152 // end of the word the score for one alternative is known. The lowest
4153 // possible score is stored in "minscore".
4154 stackidx = 0;
4155 bi = 0;
4156 gi = 0;
4157 score = 0;
4158 minscore = limit + 1;
4159
4160 for (;;)
4161 {
4162 // Skip over an equal part, score remains the same.
4163 for (;;)
4164 {
4165 bc = badword[bi];
4166 gc = goodword[gi];
4167 if (bc != gc) // stop at a char that's different
4168 break;
4169 if (bc == NUL) // both words end
4170 {
4171 if (score < minscore)
4172 minscore = score;
4173 goto pop; // do next alternative
4174 }
4175 ++bi;
4176 ++gi;
4177 }
4178
4179 if (gc == NUL) // goodword ends, delete badword chars
4180 {
4181 do
4182 {
4183 if ((score += SCORE_DEL) >= minscore)
4184 goto pop; // do next alternative
4185 } while (badword[++bi] != NUL);
4186 minscore = score;
4187 }
4188 else if (bc == NUL) // badword ends, insert badword chars
4189 {
4190 do
4191 {
4192 if ((score += SCORE_INS) >= minscore)
4193 goto pop; // do next alternative
4194 } while (goodword[++gi] != NUL);
4195 minscore = score;
4196 }
4197 else // both words continue
4198 {
4199 // If not close to the limit, perform a change. Only try changes
4200 // that may lead to a lower score than "minscore".
4201 // round 0: try deleting a char from badword
4202 // round 1: try inserting a char in badword
4203 for (round = 0; round <= 1; ++round)
4204 {
4205 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
4206 if (score_off < minscore)
4207 {
4208 if (score_off + SCORE_EDIT_MIN >= minscore)
4209 {
4210 // Near the limit, rest of the words must match. We
4211 // can check that right now, no need to push an item
4212 // onto the stack.
4213 bi2 = bi + 1 - round;
4214 gi2 = gi + round;
4215 while (goodword[gi2] == badword[bi2])
4216 {
4217 if (goodword[gi2] == NUL)
4218 {
4219 minscore = score_off;
4220 break;
4221 }
4222 ++bi2;
4223 ++gi2;
4224 }
4225 }
4226 else
4227 {
4228 // try deleting/inserting a character later
4229 stack[stackidx].badi = bi + 1 - round;
4230 stack[stackidx].goodi = gi + round;
4231 stack[stackidx].score = score_off;
4232 ++stackidx;
4233 }
4234 }
4235 }
4236
4237 if (score + SCORE_SWAP < minscore)
4238 {
4239 // If swapping two characters makes a match then the
4240 // substitution is more expensive, thus there is no need to
4241 // try both.
4242 if (gc == badword[bi + 1] && bc == goodword[gi + 1])
4243 {
4244 // Swap two characters, that is: skip them.
4245 gi += 2;
4246 bi += 2;
4247 score += SCORE_SWAP;
4248 continue;
4249 }
4250 }
4251
4252 // Substitute one character for another which is the same
4253 // thing as deleting a character from both goodword and badword.
4254 // Use a better score when there is only a case difference.
4255 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4256 score += SCORE_ICASE;
4257 else
4258 {
4259 // For a similar character use SCORE_SIMILAR.
4260 if (slang != NULL
4261 && slang->sl_has_map
4262 && similar_chars(slang, gc, bc))
4263 score += SCORE_SIMILAR;
4264 else
4265 score += SCORE_SUBST;
4266 }
4267
4268 if (score < minscore)
4269 {
4270 // Do the substitution.
4271 ++gi;
4272 ++bi;
4273 continue;
4274 }
4275 }
4276pop:
4277 // Get here to try the next alternative, pop it from the stack.
4278 if (stackidx == 0) // stack is empty, finished
4279 break;
4280
4281 // pop an item from the stack
4282 --stackidx;
4283 gi = stack[stackidx].goodi;
4284 bi = stack[stackidx].badi;
4285 score = stack[stackidx].score;
4286 }
4287
4288 // When the score goes over "limit" it may actually be much higher.
4289 // Return a very large number to avoid going below the limit when giving a
4290 // bonus.
4291 if (minscore > limit)
4292 return SCORE_MAXMAX;
4293 return minscore;
4294}
4295
4296/*
4297 * Multi-byte version of spell_edit_score_limit().
4298 * Keep it in sync with the above!
4299 */
4300 static int
4301spell_edit_score_limit_w(
4302 slang_T *slang,
4303 char_u *badword,
4304 char_u *goodword,
4305 int limit)
4306{
4307 limitscore_T stack[10]; // allow for over 3 * 2 edits
4308 int stackidx;
4309 int bi, gi;
4310 int bi2, gi2;
4311 int bc, gc;
4312 int score;
4313 int score_off;
4314 int minscore;
4315 int round;
4316 char_u *p;
4317 int wbadword[MAXWLEN];
4318 int wgoodword[MAXWLEN];
4319
4320 // Get the characters from the multi-byte strings and put them in an
4321 // int array for easy access.
4322 bi = 0;
4323 for (p = badword; *p != NUL; )
4324 wbadword[bi++] = mb_cptr2char_adv(&p);
4325 wbadword[bi++] = 0;
4326 gi = 0;
4327 for (p = goodword; *p != NUL; )
4328 wgoodword[gi++] = mb_cptr2char_adv(&p);
4329 wgoodword[gi++] = 0;
4330
4331 // The idea is to go from start to end over the words. So long as
4332 // characters are equal just continue, this always gives the lowest score.
4333 // When there is a difference try several alternatives. Each alternative
4334 // increases "score" for the edit distance. Some of the alternatives are
4335 // pushed unto a stack and tried later, some are tried right away. At the
4336 // end of the word the score for one alternative is known. The lowest
4337 // possible score is stored in "minscore".
4338 stackidx = 0;
4339 bi = 0;
4340 gi = 0;
4341 score = 0;
4342 minscore = limit + 1;
4343
4344 for (;;)
4345 {
4346 // Skip over an equal part, score remains the same.
4347 for (;;)
4348 {
4349 bc = wbadword[bi];
4350 gc = wgoodword[gi];
4351
4352 if (bc != gc) // stop at a char that's different
4353 break;
4354 if (bc == NUL) // both words end
4355 {
4356 if (score < minscore)
4357 minscore = score;
4358 goto pop; // do next alternative
4359 }
4360 ++bi;
4361 ++gi;
4362 }
4363
4364 if (gc == NUL) // goodword ends, delete badword chars
4365 {
4366 do
4367 {
4368 if ((score += SCORE_DEL) >= minscore)
4369 goto pop; // do next alternative
4370 } while (wbadword[++bi] != NUL);
4371 minscore = score;
4372 }
4373 else if (bc == NUL) // badword ends, insert badword chars
4374 {
4375 do
4376 {
4377 if ((score += SCORE_INS) >= minscore)
4378 goto pop; // do next alternative
4379 } while (wgoodword[++gi] != NUL);
4380 minscore = score;
4381 }
4382 else // both words continue
4383 {
4384 // If not close to the limit, perform a change. Only try changes
4385 // that may lead to a lower score than "minscore".
4386 // round 0: try deleting a char from badword
4387 // round 1: try inserting a char in badword
4388 for (round = 0; round <= 1; ++round)
4389 {
4390 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
4391 if (score_off < minscore)
4392 {
4393 if (score_off + SCORE_EDIT_MIN >= minscore)
4394 {
4395 // Near the limit, rest of the words must match. We
4396 // can check that right now, no need to push an item
4397 // onto the stack.
4398 bi2 = bi + 1 - round;
4399 gi2 = gi + round;
4400 while (wgoodword[gi2] == wbadword[bi2])
4401 {
4402 if (wgoodword[gi2] == NUL)
4403 {
4404 minscore = score_off;
4405 break;
4406 }
4407 ++bi2;
4408 ++gi2;
4409 }
4410 }
4411 else
4412 {
4413 // try deleting a character from badword later
4414 stack[stackidx].badi = bi + 1 - round;
4415 stack[stackidx].goodi = gi + round;
4416 stack[stackidx].score = score_off;
4417 ++stackidx;
4418 }
4419 }
4420 }
4421
4422 if (score + SCORE_SWAP < minscore)
4423 {
4424 // If swapping two characters makes a match then the
4425 // substitution is more expensive, thus there is no need to
4426 // try both.
4427 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1])
4428 {
4429 // Swap two characters, that is: skip them.
4430 gi += 2;
4431 bi += 2;
4432 score += SCORE_SWAP;
4433 continue;
4434 }
4435 }
4436
4437 // Substitute one character for another which is the same
4438 // thing as deleting a character from both goodword and badword.
4439 // Use a better score when there is only a case difference.
4440 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4441 score += SCORE_ICASE;
4442 else
4443 {
4444 // For a similar character use SCORE_SIMILAR.
4445 if (slang != NULL
4446 && slang->sl_has_map
4447 && similar_chars(slang, gc, bc))
4448 score += SCORE_SIMILAR;
4449 else
4450 score += SCORE_SUBST;
4451 }
4452
4453 if (score < minscore)
4454 {
4455 // Do the substitution.
4456 ++gi;
4457 ++bi;
4458 continue;
4459 }
4460 }
4461pop:
4462 // Get here to try the next alternative, pop it from the stack.
4463 if (stackidx == 0) // stack is empty, finished
4464 break;
4465
4466 // pop an item from the stack
4467 --stackidx;
4468 gi = stack[stackidx].goodi;
4469 bi = stack[stackidx].badi;
4470 score = stack[stackidx].score;
4471 }
4472
4473 // When the score goes over "limit" it may actually be much higher.
4474 // Return a very large number to avoid going below the limit when giving a
4475 // bonus.
4476 if (minscore > limit)
4477 return SCORE_MAXMAX;
4478 return minscore;
4479}
4480#endif // FEAT_SPELL