blob: adf090bbe59febdd4430ec2fe52c139a916640e7 [file] [log] [blame]
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10/*
11 * spellsuggest.c: functions for spelling suggestions
12 */
13
14#include "vim.h"
15
16#if defined(FEAT_SPELL) || defined(PROTO)
17
18/*
19 * Use this to adjust the score after finding suggestions, based on the
20 * suggested word sounding like the bad word. This is much faster than doing
21 * it for every possible suggestion.
22 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@"
23 * vs "ht") and goes down in the list.
24 * Used when 'spellsuggest' is set to "best".
25 */
26#define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4)
27
28/*
29 * Do the opposite: based on a maximum end score and a known sound score,
30 * compute the maximum word score that can be used.
31 */
32#define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3)
33
34// only used for su_badflags
35#define WF_MIXCAP 0x20 // mix of upper and lower case: macaRONI
36
37/*
38 * Information used when looking for suggestions.
39 */
40typedef struct suginfo_S
41{
42 garray_T su_ga; // suggestions, contains "suggest_T"
43 int su_maxcount; // max. number of suggestions displayed
44 int su_maxscore; // maximum score for adding to su_ga
45 int su_sfmaxscore; // idem, for when doing soundfold words
46 garray_T su_sga; // like su_ga, sound-folded scoring
47 char_u *su_badptr; // start of bad word in line
48 int su_badlen; // length of detected bad word in line
49 int su_badflags; // caps flags for bad word
50 char_u su_badword[MAXWLEN]; // bad word truncated at su_badlen
51 char_u su_fbadword[MAXWLEN]; // su_badword case-folded
52 char_u su_sal_badword[MAXWLEN]; // su_badword soundfolded
53 hashtab_T su_banned; // table with banned words
54 slang_T *su_sallang; // default language for sound folding
55} suginfo_T;
56
57// One word suggestion. Used in "si_ga".
58typedef struct suggest_S
59{
60 char_u *st_word; // suggested word, allocated string
61 int st_wordlen; // STRLEN(st_word)
62 int st_orglen; // length of replaced text
63 int st_score; // lower is better
64 int st_altscore; // used when st_score compares equal
65 int st_salscore; // st_score is for soundalike
66 int st_had_bonus; // bonus already included in score
67 slang_T *st_slang; // language used for sound folding
68} suggest_T;
69
70#define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
71
72// TRUE if a word appears in the list of banned words.
73#define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word)))
74
75// Number of suggestions kept when cleaning up. We need to keep more than
76// what is displayed, because when rescore_suggestions() is called the score
77// may change and wrong suggestions may be removed later.
78#define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20)
79
80// Threshold for sorting and cleaning up suggestions. Don't want to keep lots
81// of suggestions that are not going to be displayed.
82#define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50)
83
84// score for various changes
85#define SCORE_SPLIT 149 // split bad word
86#define SCORE_SPLIT_NO 249 // split bad word with NOSPLITSUGS
87#define SCORE_ICASE 52 // slightly different case
88#define SCORE_REGION 200 // word is for different region
89#define SCORE_RARE 180 // rare word
90#define SCORE_SWAP 75 // swap two characters
91#define SCORE_SWAP3 110 // swap two characters in three
92#define SCORE_REP 65 // REP replacement
93#define SCORE_SUBST 93 // substitute a character
94#define SCORE_SIMILAR 33 // substitute a similar character
95#define SCORE_SUBCOMP 33 // substitute a composing character
96#define SCORE_DEL 94 // delete a character
97#define SCORE_DELDUP 66 // delete a duplicated character
98#define SCORE_DELCOMP 28 // delete a composing character
99#define SCORE_INS 96 // insert a character
100#define SCORE_INSDUP 67 // insert a duplicate character
101#define SCORE_INSCOMP 30 // insert a composing character
102#define SCORE_NONWORD 103 // change non-word to word char
103
104#define SCORE_FILE 30 // suggestion from a file
105#define SCORE_MAXINIT 350 // Initial maximum score: higher == slower.
106 // 350 allows for about three changes.
107
108#define SCORE_COMMON1 30 // subtracted for words seen before
109#define SCORE_COMMON2 40 // subtracted for words often seen
110#define SCORE_COMMON3 50 // subtracted for words very often seen
111#define SCORE_THRES2 10 // word count threshold for COMMON2
112#define SCORE_THRES3 100 // word count threshold for COMMON3
113
114// When trying changed soundfold words it becomes slow when trying more than
115// two changes. With less then two changes it's slightly faster but we miss a
116// few good suggestions. In rare cases we need to try three of four changes.
117#define SCORE_SFMAX1 200 // maximum score for first try
118#define SCORE_SFMAX2 300 // maximum score for second try
119#define SCORE_SFMAX3 400 // maximum score for third try
120
121#define SCORE_BIG SCORE_INS * 3 // big difference
122#define SCORE_MAXMAX 999999 // accept any score
123#define SCORE_LIMITMAX 350 // for spell_edit_score_limit()
124
125// for spell_edit_score_limit() we need to know the minimum value of
126// SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS
127#define SCORE_EDIT_MIN SCORE_SIMILAR
128
129/*
130 * For finding suggestions: At each node in the tree these states are tried:
131 */
132typedef enum
133{
134 STATE_START = 0, // At start of node check for NUL bytes (goodword
135 // ends); if badword ends there is a match, otherwise
136 // try splitting word.
137 STATE_NOPREFIX, // try without prefix
138 STATE_SPLITUNDO, // Undo splitting.
139 STATE_ENDNUL, // Past NUL bytes at start of the node.
140 STATE_PLAIN, // Use each byte of the node.
141 STATE_DEL, // Delete a byte from the bad word.
142 STATE_INS_PREP, // Prepare for inserting bytes.
143 STATE_INS, // Insert a byte in the bad word.
144 STATE_SWAP, // Swap two bytes.
145 STATE_UNSWAP, // Undo swap two characters.
146 STATE_SWAP3, // Swap two characters over three.
147 STATE_UNSWAP3, // Undo Swap two characters over three.
148 STATE_UNROT3L, // Undo rotate three characters left
149 STATE_UNROT3R, // Undo rotate three characters right
150 STATE_REP_INI, // Prepare for using REP items.
151 STATE_REP, // Use matching REP items from the .aff file.
152 STATE_REP_UNDO, // Undo a REP item replacement.
153 STATE_FINAL // End of this node.
154} state_T;
155
156/*
157 * Struct to keep the state at each level in suggest_try_change().
158 */
159typedef struct trystate_S
160{
161 state_T ts_state; // state at this level, STATE_
162 int ts_score; // score
163 idx_T ts_arridx; // index in tree array, start of node
164 short ts_curi; // index in list of child nodes
165 char_u ts_fidx; // index in fword[], case-folded bad word
166 char_u ts_fidxtry; // ts_fidx at which bytes may be changed
167 char_u ts_twordlen; // valid length of tword[]
168 char_u ts_prefixdepth; // stack depth for end of prefix or
169 // PFD_PREFIXTREE or PFD_NOPREFIX
170 char_u ts_flags; // TSF_ flags
171 char_u ts_tcharlen; // number of bytes in tword character
172 char_u ts_tcharidx; // current byte index in tword character
173 char_u ts_isdiff; // DIFF_ values
174 char_u ts_fcharstart; // index in fword where badword char started
175 char_u ts_prewordlen; // length of word in "preword[]"
176 char_u ts_splitoff; // index in "tword" after last split
177 char_u ts_splitfidx; // "ts_fidx" at word split
178 char_u ts_complen; // nr of compound words used
179 char_u ts_compsplit; // index for "compflags" where word was spit
180 char_u ts_save_badflags; // su_badflags saved here
181 char_u ts_delidx; // index in fword for char that was deleted,
182 // valid when "ts_flags" has TSF_DIDDEL
183} trystate_T;
184
185// values for ts_isdiff
186#define DIFF_NONE 0 // no different byte (yet)
187#define DIFF_YES 1 // different byte found
188#define DIFF_INSERT 2 // inserting character
189
190// values for ts_flags
191#define TSF_PREFIXOK 1 // already checked that prefix is OK
192#define TSF_DIDSPLIT 2 // tried split at this point
193#define TSF_DIDDEL 4 // did a delete, "ts_delidx" has index
194
195// special values ts_prefixdepth
196#define PFD_NOPREFIX 0xff // not using prefixes
197#define PFD_PREFIXTREE 0xfe // walking through the prefix tree
198#define PFD_NOTSPECIAL 0xfd // highest value that's not special
199
200static void spell_find_suggest(char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive);
201#ifdef FEAT_EVAL
202static void spell_suggest_expr(suginfo_T *su, char_u *expr);
203#endif
204static void spell_suggest_file(suginfo_T *su, char_u *fname);
205static void spell_suggest_intern(suginfo_T *su, int interactive);
206static void spell_find_cleanup(suginfo_T *su);
207static void suggest_try_special(suginfo_T *su);
208static void suggest_try_change(suginfo_T *su);
209static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, int soundfold);
210static void go_deeper(trystate_T *stack, int depth, int score_add);
211static void find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword);
212static void score_comp_sal(suginfo_T *su);
213static void score_combine(suginfo_T *su);
214static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound);
215static void suggest_try_soundalike_prep(void);
216static void suggest_try_soundalike(suginfo_T *su);
217static void suggest_try_soundalike_finish(void);
218static void add_sound_suggest(suginfo_T *su, char_u *goodword, int score, langp_T *lp);
219static int soundfold_find(slang_T *slang, char_u *word);
220static int similar_chars(slang_T *slang, int c1, int c2);
221static void add_suggestion(suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf);
222static void check_suggestions(suginfo_T *su, garray_T *gap);
223static void add_banned(suginfo_T *su, char_u *word);
224static void rescore_suggestions(suginfo_T *su);
225static void rescore_one(suginfo_T *su, suggest_T *stp);
226static int cleanup_suggestions(garray_T *gap, int maxscore, int keep);
227static int soundalike_score(char_u *goodsound, char_u *badsound);
228static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword);
229static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit);
230static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit);
231
232/*
233 * Return TRUE when the sequence of flags in "compflags" plus "flag" can
234 * possibly form a valid compounded word. This also checks the COMPOUNDRULE
235 * lines if they don't contain wildcards.
236 */
237 static int
238can_be_compound(
239 trystate_T *sp,
240 slang_T *slang,
241 char_u *compflags,
242 int flag)
243{
244 // If the flag doesn't appear in sl_compstartflags or sl_compallflags
245 // then it can't possibly compound.
246 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit
247 ? slang->sl_compstartflags : slang->sl_compallflags, flag))
248 return FALSE;
249
250 // If there are no wildcards, we can check if the flags collected so far
251 // possibly can form a match with COMPOUNDRULE patterns. This only
252 // makes sense when we have two or more words.
253 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit)
254 {
255 int v;
256
257 compflags[sp->ts_complen] = flag;
258 compflags[sp->ts_complen + 1] = NUL;
259 v = match_compoundrule(slang, compflags + sp->ts_compsplit);
260 compflags[sp->ts_complen] = NUL;
261 return v;
262 }
263
264 return TRUE;
265}
266
267/*
268 * Adjust the score of common words.
269 */
270 static int
271score_wordcount_adj(
272 slang_T *slang,
273 int score,
274 char_u *word,
275 int split) // word was split, less bonus
276{
277 hashitem_T *hi;
278 wordcount_T *wc;
279 int bonus;
280 int newscore;
281
282 hi = hash_find(&slang->sl_wordcount, word);
283 if (!HASHITEM_EMPTY(hi))
284 {
285 wc = HI2WC(hi);
286 if (wc->wc_count < SCORE_THRES2)
287 bonus = SCORE_COMMON1;
288 else if (wc->wc_count < SCORE_THRES3)
289 bonus = SCORE_COMMON2;
290 else
291 bonus = SCORE_COMMON3;
292 if (split)
293 newscore = score - bonus / 2;
294 else
295 newscore = score - bonus;
296 if (newscore < 0)
297 return 0;
298 return newscore;
299 }
300 return score;
301}
302
303/*
304 * Like captype() but for a KEEPCAP word add ONECAP if the word starts with a
305 * capital. So that make_case_word() can turn WOrd into Word.
306 * Add ALLCAP for "WOrD".
307 */
308 static int
309badword_captype(char_u *word, char_u *end)
310{
311 int flags = captype(word, end);
312 int c;
313 int l, u;
314 int first;
315 char_u *p;
316
317 if (flags & WF_KEEPCAP)
318 {
319 // Count the number of UPPER and lower case letters.
320 l = u = 0;
321 first = FALSE;
322 for (p = word; p < end; MB_PTR_ADV(p))
323 {
324 c = PTR2CHAR(p);
325 if (SPELL_ISUPPER(c))
326 {
327 ++u;
328 if (p == word)
329 first = TRUE;
330 }
331 else
332 ++l;
333 }
334
335 // If there are more UPPER than lower case letters suggest an
336 // ALLCAP word. Otherwise, if the first letter is UPPER then
337 // suggest ONECAP. Exception: "ALl" most likely should be "All",
338 // require three upper case letters.
339 if (u > l && u > 2)
340 flags |= WF_ALLCAP;
341 else if (first)
342 flags |= WF_ONECAP;
343
344 if (u >= 2 && l >= 2) // maCARONI maCAroni
345 flags |= WF_MIXCAP;
346 }
347 return flags;
348}
349
350/*
351 * Opposite of offset2bytes().
352 * "pp" points to the bytes and is advanced over it.
353 * Returns the offset.
354 */
355 static int
356bytes2offset(char_u **pp)
357{
358 char_u *p = *pp;
359 int nr;
360 int c;
361
362 c = *p++;
363 if ((c & 0x80) == 0x00) // 1 byte
364 {
365 nr = c - 1;
366 }
367 else if ((c & 0xc0) == 0x80) // 2 bytes
368 {
369 nr = (c & 0x3f) - 1;
370 nr = nr * 255 + (*p++ - 1);
371 }
372 else if ((c & 0xe0) == 0xc0) // 3 bytes
373 {
374 nr = (c & 0x1f) - 1;
375 nr = nr * 255 + (*p++ - 1);
376 nr = nr * 255 + (*p++ - 1);
377 }
378 else // 4 bytes
379 {
380 nr = (c & 0x0f) - 1;
381 nr = nr * 255 + (*p++ - 1);
382 nr = nr * 255 + (*p++ - 1);
383 nr = nr * 255 + (*p++ - 1);
384 }
385
386 *pp = p;
387 return nr;
388}
389
390// values for sps_flags
391#define SPS_BEST 1
392#define SPS_FAST 2
393#define SPS_DOUBLE 4
394
395static int sps_flags = SPS_BEST; // flags from 'spellsuggest'
396static int sps_limit = 9999; // max nr of suggestions given
397
398/*
399 * Check the 'spellsuggest' option. Return FAIL if it's wrong.
400 * Sets "sps_flags" and "sps_limit".
401 */
402 int
403spell_check_sps(void)
404{
405 char_u *p;
406 char_u *s;
407 char_u buf[MAXPATHL];
408 int f;
409
410 sps_flags = 0;
411 sps_limit = 9999;
412
413 for (p = p_sps; *p != NUL; )
414 {
415 copy_option_part(&p, buf, MAXPATHL, ",");
416
417 f = 0;
418 if (VIM_ISDIGIT(*buf))
419 {
420 s = buf;
421 sps_limit = getdigits(&s);
422 if (*s != NUL && !VIM_ISDIGIT(*s))
423 f = -1;
424 }
425 else if (STRCMP(buf, "best") == 0)
426 f = SPS_BEST;
427 else if (STRCMP(buf, "fast") == 0)
428 f = SPS_FAST;
429 else if (STRCMP(buf, "double") == 0)
430 f = SPS_DOUBLE;
431 else if (STRNCMP(buf, "expr:", 5) != 0
432 && STRNCMP(buf, "file:", 5) != 0)
433 f = -1;
434
435 if (f == -1 || (sps_flags != 0 && f != 0))
436 {
437 sps_flags = SPS_BEST;
438 sps_limit = 9999;
439 return FAIL;
440 }
441 if (f != 0)
442 sps_flags = f;
443 }
444
445 if (sps_flags == 0)
446 sps_flags = SPS_BEST;
447
448 return OK;
449}
450
451/*
452 * "z=": Find badly spelled word under or after the cursor.
453 * Give suggestions for the properly spelled word.
454 * In Visual mode use the highlighted word as the bad word.
455 * When "count" is non-zero use that suggestion.
456 */
457 void
458spell_suggest(int count)
459{
460 char_u *line;
461 pos_T prev_cursor = curwin->w_cursor;
462 char_u wcopy[MAXWLEN + 2];
463 char_u *p;
464 int i;
465 int c;
466 suginfo_T sug;
467 suggest_T *stp;
468 int mouse_used;
469 int need_cap;
470 int limit;
471 int selected = count;
472 int badlen = 0;
473 int msg_scroll_save = msg_scroll;
474
475 if (no_spell_checking(curwin))
476 return;
477
478 if (VIsual_active)
479 {
480 // Use the Visually selected text as the bad word. But reject
481 // a multi-line selection.
482 if (curwin->w_cursor.lnum != VIsual.lnum)
483 {
484 vim_beep(BO_SPELL);
485 return;
486 }
487 badlen = (int)curwin->w_cursor.col - (int)VIsual.col;
488 if (badlen < 0)
489 badlen = -badlen;
490 else
491 curwin->w_cursor.col = VIsual.col;
492 ++badlen;
493 end_visual_mode();
494 }
495 // Find the start of the badly spelled word.
496 else if (spell_move_to(curwin, FORWARD, TRUE, TRUE, NULL) == 0
497 || curwin->w_cursor.col > prev_cursor.col)
498 {
499 // No bad word or it starts after the cursor: use the word under the
500 // cursor.
501 curwin->w_cursor = prev_cursor;
502 line = ml_get_curline();
503 p = line + curwin->w_cursor.col;
504 // Backup to before start of word.
505 while (p > line && spell_iswordp_nmw(p, curwin))
506 MB_PTR_BACK(line, p);
507 // Forward to start of word.
508 while (*p != NUL && !spell_iswordp_nmw(p, curwin))
509 MB_PTR_ADV(p);
510
511 if (!spell_iswordp_nmw(p, curwin)) // No word found.
512 {
513 beep_flush();
514 return;
515 }
516 curwin->w_cursor.col = (colnr_T)(p - line);
517 }
518
519 // Get the word and its length.
520
521 // Figure out if the word should be capitalised.
522 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col);
523
524 // Make a copy of current line since autocommands may free the line.
525 line = vim_strsave(ml_get_curline());
526 if (line == NULL)
527 goto skip;
528
529 // Get the list of suggestions. Limit to 'lines' - 2 or the number in
530 // 'spellsuggest', whatever is smaller.
531 if (sps_limit > (int)Rows - 2)
532 limit = (int)Rows - 2;
533 else
534 limit = sps_limit;
535 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit,
536 TRUE, need_cap, TRUE);
537
538 if (sug.su_ga.ga_len == 0)
539 msg(_("Sorry, no suggestions"));
540 else if (count > 0)
541 {
542 if (count > sug.su_ga.ga_len)
543 smsg(_("Sorry, only %ld suggestions"),
544 (long)sug.su_ga.ga_len);
545 }
546 else
547 {
548 VIM_CLEAR(repl_from);
549 VIM_CLEAR(repl_to);
550
551#ifdef FEAT_RIGHTLEFT
552 // When 'rightleft' is set the list is drawn right-left.
553 cmdmsg_rl = curwin->w_p_rl;
554 if (cmdmsg_rl)
555 msg_col = Columns - 1;
556#endif
557
558 // List the suggestions.
559 msg_start();
560 msg_row = Rows - 1; // for when 'cmdheight' > 1
561 lines_left = Rows; // avoid more prompt
562 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
563 sug.su_badlen, sug.su_badptr);
564#ifdef FEAT_RIGHTLEFT
565 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0)
566 {
567 // And now the rabbit from the high hat: Avoid showing the
568 // untranslated message rightleft.
569 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC",
570 sug.su_badlen, sug.su_badptr);
571 }
572#endif
573 msg_puts((char *)IObuff);
574 msg_clr_eos();
575 msg_putchar('\n');
576
577 msg_scroll = TRUE;
578 for (i = 0; i < sug.su_ga.ga_len; ++i)
579 {
580 stp = &SUG(sug.su_ga, i);
581
582 // The suggested word may replace only part of the bad word, add
583 // the not replaced part.
584 vim_strncpy(wcopy, stp->st_word, MAXWLEN);
585 if (sug.su_badlen > stp->st_orglen)
586 vim_strncpy(wcopy + stp->st_wordlen,
587 sug.su_badptr + stp->st_orglen,
588 sug.su_badlen - stp->st_orglen);
589 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1);
590#ifdef FEAT_RIGHTLEFT
591 if (cmdmsg_rl)
592 rl_mirror(IObuff);
593#endif
594 msg_puts((char *)IObuff);
595
596 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy);
597 msg_puts((char *)IObuff);
598
599 // The word may replace more than "su_badlen".
600 if (sug.su_badlen < stp->st_orglen)
601 {
602 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""),
603 stp->st_orglen, sug.su_badptr);
604 msg_puts((char *)IObuff);
605 }
606
607 if (p_verbose > 0)
608 {
609 // Add the score.
610 if (sps_flags & (SPS_DOUBLE | SPS_BEST))
611 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)",
612 stp->st_salscore ? "s " : "",
613 stp->st_score, stp->st_altscore);
614 else
615 vim_snprintf((char *)IObuff, IOSIZE, " (%d)",
616 stp->st_score);
617#ifdef FEAT_RIGHTLEFT
618 if (cmdmsg_rl)
619 // Mirror the numbers, but keep the leading space.
620 rl_mirror(IObuff + 1);
621#endif
622 msg_advance(30);
623 msg_puts((char *)IObuff);
624 }
625 msg_putchar('\n');
626 }
627
628#ifdef FEAT_RIGHTLEFT
629 cmdmsg_rl = FALSE;
630 msg_col = 0;
631#endif
632 // Ask for choice.
633 selected = prompt_for_number(&mouse_used);
634 if (mouse_used)
635 selected -= lines_left;
636 lines_left = Rows; // avoid more prompt
637 // don't delay for 'smd' in normal_cmd()
638 msg_scroll = msg_scroll_save;
639 }
640
641 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK)
642 {
643 // Save the from and to text for :spellrepall.
644 stp = &SUG(sug.su_ga, selected - 1);
645 if (sug.su_badlen > stp->st_orglen)
646 {
647 // Replacing less than "su_badlen", append the remainder to
648 // repl_to.
649 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen);
650 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word,
651 sug.su_badlen - stp->st_orglen,
652 sug.su_badptr + stp->st_orglen);
653 repl_to = vim_strsave(IObuff);
654 }
655 else
656 {
657 // Replacing su_badlen or more, use the whole word.
658 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen);
659 repl_to = vim_strsave(stp->st_word);
660 }
661
662 // Replace the word.
663 p = alloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1);
664 if (p != NULL)
665 {
666 c = (int)(sug.su_badptr - line);
667 mch_memmove(p, line, c);
668 STRCPY(p + c, stp->st_word);
669 STRCAT(p, sug.su_badptr + stp->st_orglen);
670 ml_replace(curwin->w_cursor.lnum, p, FALSE);
671 curwin->w_cursor.col = c;
672
673 // For redo we use a change-word command.
674 ResetRedobuff();
675 AppendToRedobuff((char_u *)"ciw");
676 AppendToRedobuffLit(p + c,
677 stp->st_wordlen + sug.su_badlen - stp->st_orglen);
678 AppendCharToRedobuff(ESC);
679
680 // After this "p" may be invalid.
681 changed_bytes(curwin->w_cursor.lnum, c);
682 }
683 }
684 else
685 curwin->w_cursor = prev_cursor;
686
687 spell_find_cleanup(&sug);
688skip:
689 vim_free(line);
690}
691
692/*
693 * Find spell suggestions for "word". Return them in the growarray "*gap" as
694 * a list of allocated strings.
695 */
696 void
697spell_suggest_list(
698 garray_T *gap,
699 char_u *word,
700 int maxcount, // maximum nr of suggestions
701 int need_cap, // 'spellcapcheck' matched
702 int interactive)
703{
704 suginfo_T sug;
705 int i;
706 suggest_T *stp;
707 char_u *wcopy;
708
709 spell_find_suggest(word, 0, &sug, maxcount, FALSE, need_cap, interactive);
710
711 // Make room in "gap".
712 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1);
713 if (ga_grow(gap, sug.su_ga.ga_len) == OK)
714 {
715 for (i = 0; i < sug.su_ga.ga_len; ++i)
716 {
717 stp = &SUG(sug.su_ga, i);
718
719 // The suggested word may replace only part of "word", add the not
720 // replaced part.
721 wcopy = alloc(stp->st_wordlen
722 + (unsigned)STRLEN(sug.su_badptr + stp->st_orglen) + 1);
723 if (wcopy == NULL)
724 break;
725 STRCPY(wcopy, stp->st_word);
726 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen);
727 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy;
728 }
729 }
730
731 spell_find_cleanup(&sug);
732}
733
734/*
735 * Find spell suggestions for the word at the start of "badptr".
736 * Return the suggestions in "su->su_ga".
737 * The maximum number of suggestions is "maxcount".
738 * Note: does use info for the current window.
739 * This is based on the mechanisms of Aspell, but completely reimplemented.
740 */
741 static void
742spell_find_suggest(
743 char_u *badptr,
744 int badlen, // length of bad word or 0 if unknown
745 suginfo_T *su,
746 int maxcount,
747 int banbadword, // don't include badword in suggestions
748 int need_cap, // word should start with capital
749 int interactive)
750{
751 hlf_T attr = HLF_COUNT;
752 char_u buf[MAXPATHL];
753 char_u *p;
754 int do_combine = FALSE;
755 char_u *sps_copy;
756#ifdef FEAT_EVAL
757 static int expr_busy = FALSE;
758#endif
759 int c;
760 int i;
761 langp_T *lp;
762
763 // Set the info in "*su".
764 vim_memset(su, 0, sizeof(suginfo_T));
765 ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10);
766 ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10);
767 if (*badptr == NUL)
768 return;
769 hash_init(&su->su_banned);
770
771 su->su_badptr = badptr;
772 if (badlen != 0)
773 su->su_badlen = badlen;
774 else
775 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE);
776 su->su_maxcount = maxcount;
777 su->su_maxscore = SCORE_MAXINIT;
778
779 if (su->su_badlen >= MAXWLEN)
780 su->su_badlen = MAXWLEN - 1; // just in case
781 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen);
782 (void)spell_casefold(su->su_badptr, su->su_badlen,
783 su->su_fbadword, MAXWLEN);
784 // TODO: make this work if the case-folded text is longer than the original
785 // text. Currently an illegal byte causes wrong pointer computations.
786 su->su_fbadword[su->su_badlen] = NUL;
787
788 // get caps flags for bad word
789 su->su_badflags = badword_captype(su->su_badptr,
790 su->su_badptr + su->su_badlen);
791 if (need_cap)
792 su->su_badflags |= WF_ONECAP;
793
794 // Find the default language for sound folding. We simply use the first
795 // one in 'spelllang' that supports sound folding. That's good for when
796 // using multiple files for one language, it's not that bad when mixing
797 // languages (e.g., "pl,en").
798 for (i = 0; i < curbuf->b_s.b_langp.ga_len; ++i)
799 {
800 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i);
801 if (lp->lp_sallang != NULL)
802 {
803 su->su_sallang = lp->lp_sallang;
804 break;
805 }
806 }
807
808 // Soundfold the bad word with the default sound folding, so that we don't
809 // have to do this many times.
810 if (su->su_sallang != NULL)
811 spell_soundfold(su->su_sallang, su->su_fbadword, TRUE,
812 su->su_sal_badword);
813
814 // If the word is not capitalised and spell_check() doesn't consider the
815 // word to be bad then it might need to be capitalised. Add a suggestion
816 // for that.
817 c = PTR2CHAR(su->su_badptr);
818 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT)
819 {
820 make_case_word(su->su_badword, buf, WF_ONECAP);
821 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE,
822 0, TRUE, su->su_sallang, FALSE);
823 }
824
825 // Ban the bad word itself. It may appear in another region.
826 if (banbadword)
827 add_banned(su, su->su_badword);
828
829 // Make a copy of 'spellsuggest', because the expression may change it.
830 sps_copy = vim_strsave(p_sps);
831 if (sps_copy == NULL)
832 return;
833
834 // Loop over the items in 'spellsuggest'.
835 for (p = sps_copy; *p != NUL; )
836 {
837 copy_option_part(&p, buf, MAXPATHL, ",");
838
839 if (STRNCMP(buf, "expr:", 5) == 0)
840 {
841#ifdef FEAT_EVAL
842 // Evaluate an expression. Skip this when called recursively,
843 // when using spellsuggest() in the expression.
844 if (!expr_busy)
845 {
846 expr_busy = TRUE;
847 spell_suggest_expr(su, buf + 5);
848 expr_busy = FALSE;
849 }
850#endif
851 }
852 else if (STRNCMP(buf, "file:", 5) == 0)
853 // Use list of suggestions in a file.
854 spell_suggest_file(su, buf + 5);
855 else
856 {
857 // Use internal method.
858 spell_suggest_intern(su, interactive);
859 if (sps_flags & SPS_DOUBLE)
860 do_combine = TRUE;
861 }
862 }
863
864 vim_free(sps_copy);
865
866 if (do_combine)
867 // Combine the two list of suggestions. This must be done last,
868 // because sorting changes the order again.
869 score_combine(su);
870}
871
872#ifdef FEAT_EVAL
873/*
874 * Find suggestions by evaluating expression "expr".
875 */
876 static void
877spell_suggest_expr(suginfo_T *su, char_u *expr)
878{
879 list_T *list;
880 listitem_T *li;
881 int score;
882 char_u *p;
883
884 // The work is split up in a few parts to avoid having to export
885 // suginfo_T.
886 // First evaluate the expression and get the resulting list.
887 list = eval_spell_expr(su->su_badword, expr);
888 if (list != NULL)
889 {
890 // Loop over the items in the list.
891 for (li = list->lv_first; li != NULL; li = li->li_next)
892 if (li->li_tv.v_type == VAR_LIST)
893 {
894 // Get the word and the score from the items.
895 score = get_spellword(li->li_tv.vval.v_list, &p);
896 if (score >= 0 && score <= su->su_maxscore)
897 add_suggestion(su, &su->su_ga, p, su->su_badlen,
898 score, 0, TRUE, su->su_sallang, FALSE);
899 }
900 list_unref(list);
901 }
902
903 // Remove bogus suggestions, sort and truncate at "maxcount".
904 check_suggestions(su, &su->su_ga);
905 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
906}
907#endif
908
909/*
910 * Find suggestions in file "fname". Used for "file:" in 'spellsuggest'.
911 */
912 static void
913spell_suggest_file(suginfo_T *su, char_u *fname)
914{
915 FILE *fd;
916 char_u line[MAXWLEN * 2];
917 char_u *p;
918 int len;
919 char_u cword[MAXWLEN];
920
921 // Open the file.
922 fd = mch_fopen((char *)fname, "r");
923 if (fd == NULL)
924 {
925 semsg(_(e_notopen), fname);
926 return;
927 }
928
929 // Read it line by line.
930 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int)
931 {
932 line_breakcheck();
933
934 p = vim_strchr(line, '/');
935 if (p == NULL)
936 continue; // No Tab found, just skip the line.
937 *p++ = NUL;
938 if (STRICMP(su->su_badword, line) == 0)
939 {
940 // Match! Isolate the good word, until CR or NL.
941 for (len = 0; p[len] >= ' '; ++len)
942 ;
943 p[len] = NUL;
944
945 // If the suggestion doesn't have specific case duplicate the case
946 // of the bad word.
947 if (captype(p, NULL) == 0)
948 {
949 make_case_word(p, cword, su->su_badflags);
950 p = cword;
951 }
952
953 add_suggestion(su, &su->su_ga, p, su->su_badlen,
954 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE);
955 }
956 }
957
958 fclose(fd);
959
960 // Remove bogus suggestions, sort and truncate at "maxcount".
961 check_suggestions(su, &su->su_ga);
962 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
963}
964
965/*
966 * Find suggestions for the internal method indicated by "sps_flags".
967 */
968 static void
969spell_suggest_intern(suginfo_T *su, int interactive)
970{
971 // Load the .sug file(s) that are available and not done yet.
972 suggest_load_files();
973
974 // 1. Try special cases, such as repeating a word: "the the" -> "the".
975 //
976 // Set a maximum score to limit the combination of operations that is
977 // tried.
978 suggest_try_special(su);
979
980 // 2. Try inserting/deleting/swapping/changing a letter, use REP entries
981 // from the .aff file and inserting a space (split the word).
982 suggest_try_change(su);
983
984 // For the resulting top-scorers compute the sound-a-like score.
985 if (sps_flags & SPS_DOUBLE)
986 score_comp_sal(su);
987
988 // 3. Try finding sound-a-like words.
989 if ((sps_flags & SPS_FAST) == 0)
990 {
991 if (sps_flags & SPS_BEST)
992 // Adjust the word score for the suggestions found so far for how
Bram Moolenaar32aa1022019-11-02 22:54:41 +0100993 // they sound like.
Bram Moolenaar46a426c2019-09-27 12:41:56 +0200994 rescore_suggestions(su);
995
996 // While going through the soundfold tree "su_maxscore" is the score
997 // for the soundfold word, limits the changes that are being tried,
998 // and "su_sfmaxscore" the rescored score, which is set by
999 // cleanup_suggestions().
1000 // First find words with a small edit distance, because this is much
1001 // faster and often already finds the top-N suggestions. If we didn't
1002 // find many suggestions try again with a higher edit distance.
1003 // "sl_sounddone" is used to avoid doing the same word twice.
1004 suggest_try_soundalike_prep();
1005 su->su_maxscore = SCORE_SFMAX1;
1006 su->su_sfmaxscore = SCORE_MAXINIT * 3;
1007 suggest_try_soundalike(su);
1008 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
1009 {
1010 // We didn't find enough matches, try again, allowing more
1011 // changes to the soundfold word.
1012 su->su_maxscore = SCORE_SFMAX2;
1013 suggest_try_soundalike(su);
1014 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
1015 {
1016 // Still didn't find enough matches, try again, allowing even
1017 // more changes to the soundfold word.
1018 su->su_maxscore = SCORE_SFMAX3;
1019 suggest_try_soundalike(su);
1020 }
1021 }
1022 su->su_maxscore = su->su_sfmaxscore;
1023 suggest_try_soundalike_finish();
1024 }
1025
1026 // When CTRL-C was hit while searching do show the results. Only clear
1027 // got_int when using a command, not for spellsuggest().
1028 ui_breakcheck();
1029 if (interactive && got_int)
1030 {
1031 (void)vgetc();
1032 got_int = FALSE;
1033 }
1034
1035 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0)
1036 {
1037 if (sps_flags & SPS_BEST)
1038 // Adjust the word score for how it sounds like.
1039 rescore_suggestions(su);
1040
1041 // Remove bogus suggestions, sort and truncate at "maxcount".
1042 check_suggestions(su, &su->su_ga);
1043 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
1044 }
1045}
1046
1047/*
1048 * Free the info put in "*su" by spell_find_suggest().
1049 */
1050 static void
1051spell_find_cleanup(suginfo_T *su)
1052{
1053 int i;
1054
1055 // Free the suggestions.
1056 for (i = 0; i < su->su_ga.ga_len; ++i)
1057 vim_free(SUG(su->su_ga, i).st_word);
1058 ga_clear(&su->su_ga);
1059 for (i = 0; i < su->su_sga.ga_len; ++i)
1060 vim_free(SUG(su->su_sga, i).st_word);
1061 ga_clear(&su->su_sga);
1062
1063 // Free the banned words.
1064 hash_clear_all(&su->su_banned, 0);
1065}
1066
1067/*
1068 * Try finding suggestions by recognizing specific situations.
1069 */
1070 static void
1071suggest_try_special(suginfo_T *su)
1072{
1073 char_u *p;
1074 size_t len;
1075 int c;
1076 char_u word[MAXWLEN];
1077
1078 // Recognize a word that is repeated: "the the".
1079 p = skiptowhite(su->su_fbadword);
1080 len = p - su->su_fbadword;
1081 p = skipwhite(p);
1082 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0)
1083 {
1084 // Include badflags: if the badword is onecap or allcap
1085 // use that for the goodword too: "The the" -> "The".
1086 c = su->su_fbadword[len];
1087 su->su_fbadword[len] = NUL;
1088 make_case_word(su->su_fbadword, word, su->su_badflags);
1089 su->su_fbadword[len] = c;
1090
1091 // Give a soundalike score of 0, compute the score as if deleting one
1092 // character.
1093 add_suggestion(su, &su->su_ga, word, su->su_badlen,
1094 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE);
1095 }
1096}
1097
1098/*
1099 * Change the 0 to 1 to measure how much time is spent in each state.
1100 * Output is dumped in "suggestprof".
1101 */
1102#if 0
1103# define SUGGEST_PROFILE
1104proftime_T current;
1105proftime_T total;
1106proftime_T times[STATE_FINAL + 1];
1107long counts[STATE_FINAL + 1];
1108
1109 static void
1110prof_init(void)
1111{
1112 for (int i = 0; i <= STATE_FINAL; ++i)
1113 {
1114 profile_zero(&times[i]);
1115 counts[i] = 0;
1116 }
1117 profile_start(&current);
1118 profile_start(&total);
1119}
1120
1121// call before changing state
1122 static void
1123prof_store(state_T state)
1124{
1125 profile_end(&current);
1126 profile_add(&times[state], &current);
1127 ++counts[state];
1128 profile_start(&current);
1129}
1130# define PROF_STORE(state) prof_store(state);
1131
1132 static void
1133prof_report(char *name)
1134{
1135 FILE *fd = fopen("suggestprof", "a");
1136
1137 profile_end(&total);
1138 fprintf(fd, "-----------------------\n");
1139 fprintf(fd, "%s: %s\n", name, profile_msg(&total));
1140 for (int i = 0; i <= STATE_FINAL; ++i)
1141 fprintf(fd, "%d: %s (%ld)\n", i, profile_msg(&times[i]), counts[i]);
1142 fclose(fd);
1143}
1144#else
1145# define PROF_STORE(state)
1146#endif
1147
1148/*
1149 * Try finding suggestions by adding/removing/swapping letters.
1150 */
1151 static void
1152suggest_try_change(suginfo_T *su)
1153{
1154 char_u fword[MAXWLEN]; // copy of the bad word, case-folded
1155 int n;
1156 char_u *p;
1157 int lpi;
1158 langp_T *lp;
1159
1160 // We make a copy of the case-folded bad word, so that we can modify it
1161 // to find matches (esp. REP items). Append some more text, changing
1162 // chars after the bad word may help.
1163 STRCPY(fword, su->su_fbadword);
1164 n = (int)STRLEN(fword);
1165 p = su->su_badptr + su->su_badlen;
1166 (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n);
1167
1168 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
1169 {
1170 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
1171
1172 // If reloading a spell file fails it's still in the list but
1173 // everything has been cleared.
1174 if (lp->lp_slang->sl_fbyts == NULL)
1175 continue;
1176
1177 // Try it for this language. Will add possible suggestions.
1178#ifdef SUGGEST_PROFILE
1179 prof_init();
1180#endif
1181 suggest_trie_walk(su, lp, fword, FALSE);
1182#ifdef SUGGEST_PROFILE
1183 prof_report("try_change");
1184#endif
1185 }
1186}
1187
1188// Check the maximum score, if we go over it we won't try this change.
1189#define TRY_DEEPER(su, stack, depth, add) \
1190 (stack[depth].ts_score + (add) < su->su_maxscore)
1191
1192/*
1193 * Try finding suggestions by adding/removing/swapping letters.
1194 *
1195 * This uses a state machine. At each node in the tree we try various
1196 * operations. When trying if an operation works "depth" is increased and the
1197 * stack[] is used to store info. This allows combinations, thus insert one
1198 * character, replace one and delete another. The number of changes is
1199 * limited by su->su_maxscore.
1200 *
1201 * After implementing this I noticed an article by Kemal Oflazer that
1202 * describes something similar: "Error-tolerant Finite State Recognition with
1203 * Applications to Morphological Analysis and Spelling Correction" (1996).
1204 * The implementation in the article is simplified and requires a stack of
1205 * unknown depth. The implementation here only needs a stack depth equal to
1206 * the length of the word.
1207 *
1208 * This is also used for the sound-folded word, "soundfold" is TRUE then.
1209 * The mechanism is the same, but we find a match with a sound-folded word
1210 * that comes from one or more original words. Each of these words may be
1211 * added, this is done by add_sound_suggest().
1212 * Don't use:
1213 * the prefix tree or the keep-case tree
1214 * "su->su_badlen"
1215 * anything to do with upper and lower case
1216 * anything to do with word or non-word characters ("spell_iswordp()")
1217 * banned words
1218 * word flags (rare, region, compounding)
1219 * word splitting for now
1220 * "similar_chars()"
1221 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep"
1222 */
1223 static void
1224suggest_trie_walk(
1225 suginfo_T *su,
1226 langp_T *lp,
1227 char_u *fword,
1228 int soundfold)
1229{
1230 char_u tword[MAXWLEN]; // good word collected so far
1231 trystate_T stack[MAXWLEN];
1232 char_u preword[MAXWLEN * 3]; // word found with proper case;
1233 // concatenation of prefix compound
1234 // words and split word. NUL terminated
1235 // when going deeper but not when coming
1236 // back.
1237 char_u compflags[MAXWLEN]; // compound flags, one for each word
1238 trystate_T *sp;
1239 int newscore;
1240 int score;
1241 char_u *byts, *fbyts, *pbyts;
1242 idx_T *idxs, *fidxs, *pidxs;
1243 int depth;
1244 int c, c2, c3;
1245 int n = 0;
1246 int flags;
1247 garray_T *gap;
1248 idx_T arridx;
1249 int len;
1250 char_u *p;
1251 fromto_T *ftp;
1252 int fl = 0, tl;
1253 int repextra = 0; // extra bytes in fword[] from REP item
1254 slang_T *slang = lp->lp_slang;
1255 int fword_ends;
1256 int goodword_ends;
1257#ifdef DEBUG_TRIEWALK
1258 // Stores the name of the change made at each level.
1259 char_u changename[MAXWLEN][80];
1260#endif
1261 int breakcheckcount = 1000;
1262 int compound_ok;
1263
1264 // Go through the whole case-fold tree, try changes at each node.
1265 // "tword[]" contains the word collected from nodes in the tree.
1266 // "fword[]" the word we are trying to match with (initially the bad
1267 // word).
1268 depth = 0;
1269 sp = &stack[0];
1270 vim_memset(sp, 0, sizeof(trystate_T));
1271 sp->ts_curi = 1;
1272
1273 if (soundfold)
1274 {
1275 // Going through the soundfold tree.
1276 byts = fbyts = slang->sl_sbyts;
1277 idxs = fidxs = slang->sl_sidxs;
1278 pbyts = NULL;
1279 pidxs = NULL;
1280 sp->ts_prefixdepth = PFD_NOPREFIX;
1281 sp->ts_state = STATE_START;
1282 }
1283 else
1284 {
1285 // When there are postponed prefixes we need to use these first. At
1286 // the end of the prefix we continue in the case-fold tree.
1287 fbyts = slang->sl_fbyts;
1288 fidxs = slang->sl_fidxs;
1289 pbyts = slang->sl_pbyts;
1290 pidxs = slang->sl_pidxs;
1291 if (pbyts != NULL)
1292 {
1293 byts = pbyts;
1294 idxs = pidxs;
1295 sp->ts_prefixdepth = PFD_PREFIXTREE;
1296 sp->ts_state = STATE_NOPREFIX; // try without prefix first
1297 }
1298 else
1299 {
1300 byts = fbyts;
1301 idxs = fidxs;
1302 sp->ts_prefixdepth = PFD_NOPREFIX;
1303 sp->ts_state = STATE_START;
1304 }
1305 }
1306
1307 // Loop to find all suggestions. At each round we either:
1308 // - For the current state try one operation, advance "ts_curi",
1309 // increase "depth".
1310 // - When a state is done go to the next, set "ts_state".
1311 // - When all states are tried decrease "depth".
1312 while (depth >= 0 && !got_int)
1313 {
1314 sp = &stack[depth];
1315 switch (sp->ts_state)
1316 {
1317 case STATE_START:
1318 case STATE_NOPREFIX:
1319 // Start of node: Deal with NUL bytes, which means
1320 // tword[] may end here.
1321 arridx = sp->ts_arridx; // current node in the tree
1322 len = byts[arridx]; // bytes in this node
1323 arridx += sp->ts_curi; // index of current byte
1324
1325 if (sp->ts_prefixdepth == PFD_PREFIXTREE)
1326 {
1327 // Skip over the NUL bytes, we use them later.
1328 for (n = 0; n < len && byts[arridx + n] == 0; ++n)
1329 ;
1330 sp->ts_curi += n;
1331
1332 // Always past NUL bytes now.
1333 n = (int)sp->ts_state;
1334 PROF_STORE(sp->ts_state)
1335 sp->ts_state = STATE_ENDNUL;
1336 sp->ts_save_badflags = su->su_badflags;
1337
1338 // At end of a prefix or at start of prefixtree: check for
1339 // following word.
1340 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX)
1341 {
1342 // Set su->su_badflags to the caps type at this position.
1343 // Use the caps type until here for the prefix itself.
1344 if (has_mbyte)
1345 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
1346 else
1347 n = sp->ts_fidx;
1348 flags = badword_captype(su->su_badptr, su->su_badptr + n);
1349 su->su_badflags = badword_captype(su->su_badptr + n,
1350 su->su_badptr + su->su_badlen);
1351#ifdef DEBUG_TRIEWALK
1352 sprintf(changename[depth], "prefix");
1353#endif
1354 go_deeper(stack, depth, 0);
1355 ++depth;
1356 sp = &stack[depth];
1357 sp->ts_prefixdepth = depth - 1;
1358 byts = fbyts;
1359 idxs = fidxs;
1360 sp->ts_arridx = 0;
1361
1362 // Move the prefix to preword[] with the right case
1363 // and make find_keepcap_word() works.
1364 tword[sp->ts_twordlen] = NUL;
1365 make_case_word(tword + sp->ts_splitoff,
1366 preword + sp->ts_prewordlen, flags);
1367 sp->ts_prewordlen = (char_u)STRLEN(preword);
1368 sp->ts_splitoff = sp->ts_twordlen;
1369 }
1370 break;
1371 }
1372
1373 if (sp->ts_curi > len || byts[arridx] != 0)
1374 {
1375 // Past bytes in node and/or past NUL bytes.
1376 PROF_STORE(sp->ts_state)
1377 sp->ts_state = STATE_ENDNUL;
1378 sp->ts_save_badflags = su->su_badflags;
1379 break;
1380 }
1381
1382 // End of word in tree.
1383 ++sp->ts_curi; // eat one NUL byte
1384
1385 flags = (int)idxs[arridx];
1386
1387 // Skip words with the NOSUGGEST flag.
1388 if (flags & WF_NOSUGGEST)
1389 break;
1390
1391 fword_ends = (fword[sp->ts_fidx] == NUL
1392 || (soundfold
1393 ? VIM_ISWHITE(fword[sp->ts_fidx])
1394 : !spell_iswordp(fword + sp->ts_fidx, curwin)));
1395 tword[sp->ts_twordlen] = NUL;
1396
1397 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL
1398 && (sp->ts_flags & TSF_PREFIXOK) == 0)
1399 {
1400 // There was a prefix before the word. Check that the prefix
1401 // can be used with this word.
1402 // Count the length of the NULs in the prefix. If there are
1403 // none this must be the first try without a prefix.
1404 n = stack[sp->ts_prefixdepth].ts_arridx;
1405 len = pbyts[n++];
1406 for (c = 0; c < len && pbyts[n + c] == 0; ++c)
1407 ;
1408 if (c > 0)
1409 {
1410 c = valid_word_prefix(c, n, flags,
1411 tword + sp->ts_splitoff, slang, FALSE);
1412 if (c == 0)
1413 break;
1414
1415 // Use the WF_RARE flag for a rare prefix.
1416 if (c & WF_RAREPFX)
1417 flags |= WF_RARE;
1418
1419 // Tricky: when checking for both prefix and compounding
1420 // we run into the prefix flag first.
1421 // Remember that it's OK, so that we accept the prefix
1422 // when arriving at a compound flag.
1423 sp->ts_flags |= TSF_PREFIXOK;
1424 }
1425 }
1426
1427 // Check NEEDCOMPOUND: can't use word without compounding. Do try
1428 // appending another compound word below.
1429 if (sp->ts_complen == sp->ts_compsplit && fword_ends
1430 && (flags & WF_NEEDCOMP))
1431 goodword_ends = FALSE;
1432 else
1433 goodword_ends = TRUE;
1434
1435 p = NULL;
1436 compound_ok = TRUE;
1437 if (sp->ts_complen > sp->ts_compsplit)
1438 {
1439 if (slang->sl_nobreak)
1440 {
1441 // There was a word before this word. When there was no
1442 // change in this word (it was correct) add the first word
1443 // as a suggestion. If this word was corrected too, we
1444 // need to check if a correct word follows.
1445 if (sp->ts_fidx - sp->ts_splitfidx
1446 == sp->ts_twordlen - sp->ts_splitoff
1447 && STRNCMP(fword + sp->ts_splitfidx,
1448 tword + sp->ts_splitoff,
1449 sp->ts_fidx - sp->ts_splitfidx) == 0)
1450 {
1451 preword[sp->ts_prewordlen] = NUL;
1452 newscore = score_wordcount_adj(slang, sp->ts_score,
1453 preword + sp->ts_prewordlen,
1454 sp->ts_prewordlen > 0);
1455 // Add the suggestion if the score isn't too bad.
1456 if (newscore <= su->su_maxscore)
1457 add_suggestion(su, &su->su_ga, preword,
1458 sp->ts_splitfidx - repextra,
1459 newscore, 0, FALSE,
1460 lp->lp_sallang, FALSE);
1461 break;
1462 }
1463 }
1464 else
1465 {
1466 // There was a compound word before this word. If this
1467 // word does not support compounding then give up
1468 // (splitting is tried for the word without compound
1469 // flag).
1470 if (((unsigned)flags >> 24) == 0
1471 || sp->ts_twordlen - sp->ts_splitoff
1472 < slang->sl_compminlen)
1473 break;
1474 // For multi-byte chars check character length against
1475 // COMPOUNDMIN.
1476 if (has_mbyte
1477 && slang->sl_compminlen > 0
1478 && mb_charlen(tword + sp->ts_splitoff)
1479 < slang->sl_compminlen)
1480 break;
1481
1482 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
1483 compflags[sp->ts_complen + 1] = NUL;
1484 vim_strncpy(preword + sp->ts_prewordlen,
1485 tword + sp->ts_splitoff,
1486 sp->ts_twordlen - sp->ts_splitoff);
1487
1488 // Verify CHECKCOMPOUNDPATTERN rules.
1489 if (match_checkcompoundpattern(preword, sp->ts_prewordlen,
1490 &slang->sl_comppat))
1491 compound_ok = FALSE;
1492
1493 if (compound_ok)
1494 {
1495 p = preword;
1496 while (*skiptowhite(p) != NUL)
1497 p = skipwhite(skiptowhite(p));
1498 if (fword_ends && !can_compound(slang, p,
1499 compflags + sp->ts_compsplit))
1500 // Compound is not allowed. But it may still be
1501 // possible if we add another (short) word.
1502 compound_ok = FALSE;
1503 }
1504
1505 // Get pointer to last char of previous word.
1506 p = preword + sp->ts_prewordlen;
1507 MB_PTR_BACK(preword, p);
1508 }
1509 }
1510
1511 // Form the word with proper case in preword.
1512 // If there is a word from a previous split, append.
1513 // For the soundfold tree don't change the case, simply append.
1514 if (soundfold)
1515 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff);
1516 else if (flags & WF_KEEPCAP)
1517 // Must find the word in the keep-case tree.
1518 find_keepcap_word(slang, tword + sp->ts_splitoff,
1519 preword + sp->ts_prewordlen);
1520 else
1521 {
1522 // Include badflags: If the badword is onecap or allcap
1523 // use that for the goodword too. But if the badword is
1524 // allcap and it's only one char long use onecap.
1525 c = su->su_badflags;
1526 if ((c & WF_ALLCAP)
1527 && su->su_badlen == (*mb_ptr2len)(su->su_badptr))
1528 c = WF_ONECAP;
1529 c |= flags;
1530
1531 // When appending a compound word after a word character don't
1532 // use Onecap.
1533 if (p != NULL && spell_iswordp_nmw(p, curwin))
1534 c &= ~WF_ONECAP;
1535 make_case_word(tword + sp->ts_splitoff,
1536 preword + sp->ts_prewordlen, c);
1537 }
1538
1539 if (!soundfold)
1540 {
1541 // Don't use a banned word. It may appear again as a good
1542 // word, thus remember it.
1543 if (flags & WF_BANNED)
1544 {
1545 add_banned(su, preword + sp->ts_prewordlen);
1546 break;
1547 }
1548 if ((sp->ts_complen == sp->ts_compsplit
1549 && WAS_BANNED(su, preword + sp->ts_prewordlen))
1550 || WAS_BANNED(su, preword))
1551 {
1552 if (slang->sl_compprog == NULL)
1553 break;
1554 // the word so far was banned but we may try compounding
1555 goodword_ends = FALSE;
1556 }
1557 }
1558
1559 newscore = 0;
1560 if (!soundfold) // soundfold words don't have flags
1561 {
1562 if ((flags & WF_REGION)
1563 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
1564 newscore += SCORE_REGION;
1565 if (flags & WF_RARE)
1566 newscore += SCORE_RARE;
1567
1568 if (!spell_valid_case(su->su_badflags,
1569 captype(preword + sp->ts_prewordlen, NULL)))
1570 newscore += SCORE_ICASE;
1571 }
1572
1573 // TODO: how about splitting in the soundfold tree?
1574 if (fword_ends
1575 && goodword_ends
1576 && sp->ts_fidx >= sp->ts_fidxtry
1577 && compound_ok)
1578 {
1579 // The badword also ends: add suggestions.
1580#ifdef DEBUG_TRIEWALK
1581 if (soundfold && STRCMP(preword, "smwrd") == 0)
1582 {
1583 int j;
1584
1585 // print the stack of changes that brought us here
1586 smsg("------ %s -------", fword);
1587 for (j = 0; j < depth; ++j)
1588 smsg("%s", changename[j]);
1589 }
1590#endif
1591 if (soundfold)
1592 {
1593 // For soundfolded words we need to find the original
1594 // words, the edit distance and then add them.
1595 add_sound_suggest(su, preword, sp->ts_score, lp);
1596 }
1597 else if (sp->ts_fidx > 0)
1598 {
1599 // Give a penalty when changing non-word char to word
1600 // char, e.g., "thes," -> "these".
1601 p = fword + sp->ts_fidx;
1602 MB_PTR_BACK(fword, p);
1603 if (!spell_iswordp(p, curwin))
1604 {
1605 p = preword + STRLEN(preword);
1606 MB_PTR_BACK(preword, p);
1607 if (spell_iswordp(p, curwin))
1608 newscore += SCORE_NONWORD;
1609 }
1610
1611 // Give a bonus to words seen before.
1612 score = score_wordcount_adj(slang,
1613 sp->ts_score + newscore,
1614 preword + sp->ts_prewordlen,
1615 sp->ts_prewordlen > 0);
1616
1617 // Add the suggestion if the score isn't too bad.
1618 if (score <= su->su_maxscore)
1619 {
1620 add_suggestion(su, &su->su_ga, preword,
1621 sp->ts_fidx - repextra,
1622 score, 0, FALSE, lp->lp_sallang, FALSE);
1623
1624 if (su->su_badflags & WF_MIXCAP)
1625 {
1626 // We really don't know if the word should be
1627 // upper or lower case, add both.
1628 c = captype(preword, NULL);
1629 if (c == 0 || c == WF_ALLCAP)
1630 {
1631 make_case_word(tword + sp->ts_splitoff,
1632 preword + sp->ts_prewordlen,
1633 c == 0 ? WF_ALLCAP : 0);
1634
1635 add_suggestion(su, &su->su_ga, preword,
1636 sp->ts_fidx - repextra,
1637 score + SCORE_ICASE, 0, FALSE,
1638 lp->lp_sallang, FALSE);
1639 }
1640 }
1641 }
1642 }
1643 }
1644
1645 // Try word split and/or compounding.
1646 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends)
1647 // Don't split halfway a character.
1648 && (!has_mbyte || sp->ts_tcharlen == 0))
1649 {
1650 int try_compound;
1651 int try_split;
1652
1653 // If past the end of the bad word don't try a split.
1654 // Otherwise try changing the next word. E.g., find
1655 // suggestions for "the the" where the second "the" is
1656 // different. It's done like a split.
1657 // TODO: word split for soundfold words
1658 try_split = (sp->ts_fidx - repextra < su->su_badlen)
1659 && !soundfold;
1660
1661 // Get here in several situations:
1662 // 1. The word in the tree ends:
1663 // If the word allows compounding try that. Otherwise try
1664 // a split by inserting a space. For both check that a
1665 // valid words starts at fword[sp->ts_fidx].
1666 // For NOBREAK do like compounding to be able to check if
1667 // the next word is valid.
1668 // 2. The badword does end, but it was due to a change (e.g.,
1669 // a swap). No need to split, but do check that the
1670 // following word is valid.
1671 // 3. The badword and the word in the tree end. It may still
1672 // be possible to compound another (short) word.
1673 try_compound = FALSE;
1674 if (!soundfold
1675 && !slang->sl_nocompoundsugs
1676 && slang->sl_compprog != NULL
1677 && ((unsigned)flags >> 24) != 0
1678 && sp->ts_twordlen - sp->ts_splitoff
1679 >= slang->sl_compminlen
1680 && (!has_mbyte
1681 || slang->sl_compminlen == 0
1682 || mb_charlen(tword + sp->ts_splitoff)
1683 >= slang->sl_compminlen)
1684 && (slang->sl_compsylmax < MAXWLEN
1685 || sp->ts_complen + 1 - sp->ts_compsplit
1686 < slang->sl_compmax)
1687 && (can_be_compound(sp, slang,
1688 compflags, ((unsigned)flags >> 24))))
1689
1690 {
1691 try_compound = TRUE;
1692 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
1693 compflags[sp->ts_complen + 1] = NUL;
1694 }
1695
1696 // For NOBREAK we never try splitting, it won't make any word
1697 // valid.
1698 if (slang->sl_nobreak && !slang->sl_nocompoundsugs)
1699 try_compound = TRUE;
1700
1701 // If we could add a compound word, and it's also possible to
1702 // split at this point, do the split first and set
1703 // TSF_DIDSPLIT to avoid doing it again.
1704 else if (!fword_ends
1705 && try_compound
1706 && (sp->ts_flags & TSF_DIDSPLIT) == 0)
1707 {
1708 try_compound = FALSE;
1709 sp->ts_flags |= TSF_DIDSPLIT;
1710 --sp->ts_curi; // do the same NUL again
1711 compflags[sp->ts_complen] = NUL;
1712 }
1713 else
1714 sp->ts_flags &= ~TSF_DIDSPLIT;
1715
1716 if (try_split || try_compound)
1717 {
1718 if (!try_compound && (!fword_ends || !goodword_ends))
1719 {
1720 // If we're going to split need to check that the
1721 // words so far are valid for compounding. If there
1722 // is only one word it must not have the NEEDCOMPOUND
1723 // flag.
1724 if (sp->ts_complen == sp->ts_compsplit
1725 && (flags & WF_NEEDCOMP))
1726 break;
1727 p = preword;
1728 while (*skiptowhite(p) != NUL)
1729 p = skipwhite(skiptowhite(p));
1730 if (sp->ts_complen > sp->ts_compsplit
1731 && !can_compound(slang, p,
1732 compflags + sp->ts_compsplit))
1733 break;
1734
1735 if (slang->sl_nosplitsugs)
1736 newscore += SCORE_SPLIT_NO;
1737 else
1738 newscore += SCORE_SPLIT;
1739
1740 // Give a bonus to words seen before.
1741 newscore = score_wordcount_adj(slang, newscore,
1742 preword + sp->ts_prewordlen, TRUE);
1743 }
1744
1745 if (TRY_DEEPER(su, stack, depth, newscore))
1746 {
1747 go_deeper(stack, depth, newscore);
1748#ifdef DEBUG_TRIEWALK
1749 if (!try_compound && !fword_ends)
1750 sprintf(changename[depth], "%.*s-%s: split",
1751 sp->ts_twordlen, tword, fword + sp->ts_fidx);
1752 else
1753 sprintf(changename[depth], "%.*s-%s: compound",
1754 sp->ts_twordlen, tword, fword + sp->ts_fidx);
1755#endif
1756 // Save things to be restored at STATE_SPLITUNDO.
1757 sp->ts_save_badflags = su->su_badflags;
1758 PROF_STORE(sp->ts_state)
1759 sp->ts_state = STATE_SPLITUNDO;
1760
1761 ++depth;
1762 sp = &stack[depth];
1763
1764 // Append a space to preword when splitting.
1765 if (!try_compound && !fword_ends)
1766 STRCAT(preword, " ");
1767 sp->ts_prewordlen = (char_u)STRLEN(preword);
1768 sp->ts_splitoff = sp->ts_twordlen;
1769 sp->ts_splitfidx = sp->ts_fidx;
1770
1771 // If the badword has a non-word character at this
1772 // position skip it. That means replacing the
1773 // non-word character with a space. Always skip a
1774 // character when the word ends. But only when the
1775 // good word can end.
1776 if (((!try_compound && !spell_iswordp_nmw(fword
1777 + sp->ts_fidx,
1778 curwin))
1779 || fword_ends)
1780 && fword[sp->ts_fidx] != NUL
1781 && goodword_ends)
1782 {
1783 int l;
1784
Bram Moolenaar1614a142019-10-06 22:00:13 +02001785 l = mb_ptr2len(fword + sp->ts_fidx);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001786 if (fword_ends)
1787 {
1788 // Copy the skipped character to preword.
1789 mch_memmove(preword + sp->ts_prewordlen,
1790 fword + sp->ts_fidx, l);
1791 sp->ts_prewordlen += l;
1792 preword[sp->ts_prewordlen] = NUL;
1793 }
1794 else
1795 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST;
1796 sp->ts_fidx += l;
1797 }
1798
1799 // When compounding include compound flag in
1800 // compflags[] (already set above). When splitting we
1801 // may start compounding over again.
1802 if (try_compound)
1803 ++sp->ts_complen;
1804 else
1805 sp->ts_compsplit = sp->ts_complen;
1806 sp->ts_prefixdepth = PFD_NOPREFIX;
1807
1808 // set su->su_badflags to the caps type at this
1809 // position
1810 if (has_mbyte)
1811 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
1812 else
1813 n = sp->ts_fidx;
1814 su->su_badflags = badword_captype(su->su_badptr + n,
1815 su->su_badptr + su->su_badlen);
1816
1817 // Restart at top of the tree.
1818 sp->ts_arridx = 0;
1819
1820 // If there are postponed prefixes, try these too.
1821 if (pbyts != NULL)
1822 {
1823 byts = pbyts;
1824 idxs = pidxs;
1825 sp->ts_prefixdepth = PFD_PREFIXTREE;
1826 PROF_STORE(sp->ts_state)
1827 sp->ts_state = STATE_NOPREFIX;
1828 }
1829 }
1830 }
1831 }
1832 break;
1833
1834 case STATE_SPLITUNDO:
1835 // Undo the changes done for word split or compound word.
1836 su->su_badflags = sp->ts_save_badflags;
1837
1838 // Continue looking for NUL bytes.
1839 PROF_STORE(sp->ts_state)
1840 sp->ts_state = STATE_START;
1841
1842 // In case we went into the prefix tree.
1843 byts = fbyts;
1844 idxs = fidxs;
1845 break;
1846
1847 case STATE_ENDNUL:
1848 // Past the NUL bytes in the node.
1849 su->su_badflags = sp->ts_save_badflags;
1850 if (fword[sp->ts_fidx] == NUL && sp->ts_tcharlen == 0)
1851 {
1852 // The badword ends, can't use STATE_PLAIN.
1853 PROF_STORE(sp->ts_state)
1854 sp->ts_state = STATE_DEL;
1855 break;
1856 }
1857 PROF_STORE(sp->ts_state)
1858 sp->ts_state = STATE_PLAIN;
1859 // FALLTHROUGH
1860
1861 case STATE_PLAIN:
1862 // Go over all possible bytes at this node, add each to tword[]
1863 // and use child node. "ts_curi" is the index.
1864 arridx = sp->ts_arridx;
1865 if (sp->ts_curi > byts[arridx])
1866 {
1867 // Done all bytes at this node, do next state. When still at
1868 // already changed bytes skip the other tricks.
1869 PROF_STORE(sp->ts_state)
1870 if (sp->ts_fidx >= sp->ts_fidxtry)
1871 sp->ts_state = STATE_DEL;
1872 else
1873 sp->ts_state = STATE_FINAL;
1874 }
1875 else
1876 {
1877 arridx += sp->ts_curi++;
1878 c = byts[arridx];
1879
1880 // Normal byte, go one level deeper. If it's not equal to the
1881 // byte in the bad word adjust the score. But don't even try
1882 // when the byte was already changed. And don't try when we
1883 // just deleted this byte, accepting it is always cheaper than
1884 // delete + substitute.
1885 if (c == fword[sp->ts_fidx]
1886 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE))
1887 newscore = 0;
1888 else
1889 newscore = SCORE_SUBST;
1890 if ((newscore == 0
1891 || (sp->ts_fidx >= sp->ts_fidxtry
1892 && ((sp->ts_flags & TSF_DIDDEL) == 0
1893 || c != fword[sp->ts_delidx])))
1894 && TRY_DEEPER(su, stack, depth, newscore))
1895 {
1896 go_deeper(stack, depth, newscore);
1897#ifdef DEBUG_TRIEWALK
1898 if (newscore > 0)
1899 sprintf(changename[depth], "%.*s-%s: subst %c to %c",
1900 sp->ts_twordlen, tword, fword + sp->ts_fidx,
1901 fword[sp->ts_fidx], c);
1902 else
1903 sprintf(changename[depth], "%.*s-%s: accept %c",
1904 sp->ts_twordlen, tword, fword + sp->ts_fidx,
1905 fword[sp->ts_fidx]);
1906#endif
1907 ++depth;
1908 sp = &stack[depth];
1909 ++sp->ts_fidx;
1910 tword[sp->ts_twordlen++] = c;
1911 sp->ts_arridx = idxs[arridx];
1912 if (newscore == SCORE_SUBST)
1913 sp->ts_isdiff = DIFF_YES;
1914 if (has_mbyte)
1915 {
1916 // Multi-byte characters are a bit complicated to
1917 // handle: They differ when any of the bytes differ
1918 // and then their length may also differ.
1919 if (sp->ts_tcharlen == 0)
1920 {
1921 // First byte.
1922 sp->ts_tcharidx = 0;
1923 sp->ts_tcharlen = MB_BYTE2LEN(c);
1924 sp->ts_fcharstart = sp->ts_fidx - 1;
1925 sp->ts_isdiff = (newscore != 0)
1926 ? DIFF_YES : DIFF_NONE;
1927 }
1928 else if (sp->ts_isdiff == DIFF_INSERT)
1929 // When inserting trail bytes don't advance in the
1930 // bad word.
1931 --sp->ts_fidx;
1932 if (++sp->ts_tcharidx == sp->ts_tcharlen)
1933 {
1934 // Last byte of character.
1935 if (sp->ts_isdiff == DIFF_YES)
1936 {
1937 // Correct ts_fidx for the byte length of the
1938 // character (we didn't check that before).
1939 sp->ts_fidx = sp->ts_fcharstart
Bram Moolenaar1614a142019-10-06 22:00:13 +02001940 + mb_ptr2len(
Bram Moolenaar46a426c2019-09-27 12:41:56 +02001941 fword + sp->ts_fcharstart);
1942 // For changing a composing character adjust
1943 // the score from SCORE_SUBST to
1944 // SCORE_SUBCOMP.
1945 if (enc_utf8
1946 && utf_iscomposing(
1947 utf_ptr2char(tword
1948 + sp->ts_twordlen
1949 - sp->ts_tcharlen))
1950 && utf_iscomposing(
1951 utf_ptr2char(fword
1952 + sp->ts_fcharstart)))
1953 sp->ts_score -=
1954 SCORE_SUBST - SCORE_SUBCOMP;
1955
1956 // For a similar character adjust score from
1957 // SCORE_SUBST to SCORE_SIMILAR.
1958 else if (!soundfold
1959 && slang->sl_has_map
1960 && similar_chars(slang,
1961 mb_ptr2char(tword
1962 + sp->ts_twordlen
1963 - sp->ts_tcharlen),
1964 mb_ptr2char(fword
1965 + sp->ts_fcharstart)))
1966 sp->ts_score -=
1967 SCORE_SUBST - SCORE_SIMILAR;
1968 }
1969 else if (sp->ts_isdiff == DIFF_INSERT
1970 && sp->ts_twordlen > sp->ts_tcharlen)
1971 {
1972 p = tword + sp->ts_twordlen - sp->ts_tcharlen;
1973 c = mb_ptr2char(p);
1974 if (enc_utf8 && utf_iscomposing(c))
1975 {
1976 // Inserting a composing char doesn't
1977 // count that much.
1978 sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
1979 }
1980 else
1981 {
1982 // If the previous character was the same,
1983 // thus doubling a character, give a bonus
1984 // to the score. Also for the soundfold
1985 // tree (might seem illogical but does
1986 // give better scores).
1987 MB_PTR_BACK(tword, p);
1988 if (c == mb_ptr2char(p))
1989 sp->ts_score -= SCORE_INS
1990 - SCORE_INSDUP;
1991 }
1992 }
1993
1994 // Starting a new char, reset the length.
1995 sp->ts_tcharlen = 0;
1996 }
1997 }
1998 else
1999 {
2000 // If we found a similar char adjust the score.
2001 // We do this after calling go_deeper() because
2002 // it's slow.
2003 if (newscore != 0
2004 && !soundfold
2005 && slang->sl_has_map
2006 && similar_chars(slang,
2007 c, fword[sp->ts_fidx - 1]))
2008 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
2009 }
2010 }
2011 }
2012 break;
2013
2014 case STATE_DEL:
2015 // When past the first byte of a multi-byte char don't try
2016 // delete/insert/swap a character.
2017 if (has_mbyte && sp->ts_tcharlen > 0)
2018 {
2019 PROF_STORE(sp->ts_state)
2020 sp->ts_state = STATE_FINAL;
2021 break;
2022 }
2023 // Try skipping one character in the bad word (delete it).
2024 PROF_STORE(sp->ts_state)
2025 sp->ts_state = STATE_INS_PREP;
2026 sp->ts_curi = 1;
2027 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*')
2028 // Deleting a vowel at the start of a word counts less, see
2029 // soundalike_score().
2030 newscore = 2 * SCORE_DEL / 3;
2031 else
2032 newscore = SCORE_DEL;
2033 if (fword[sp->ts_fidx] != NUL
2034 && TRY_DEEPER(su, stack, depth, newscore))
2035 {
2036 go_deeper(stack, depth, newscore);
2037#ifdef DEBUG_TRIEWALK
2038 sprintf(changename[depth], "%.*s-%s: delete %c",
2039 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2040 fword[sp->ts_fidx]);
2041#endif
2042 ++depth;
2043
2044 // Remember what character we deleted, so that we can avoid
2045 // inserting it again.
2046 stack[depth].ts_flags |= TSF_DIDDEL;
2047 stack[depth].ts_delidx = sp->ts_fidx;
2048
2049 // Advance over the character in fword[]. Give a bonus to the
2050 // score if the same character is following "nn" -> "n". It's
2051 // a bit illogical for soundfold tree but it does give better
2052 // results.
2053 if (has_mbyte)
2054 {
2055 c = mb_ptr2char(fword + sp->ts_fidx);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002056 stack[depth].ts_fidx += mb_ptr2len(fword + sp->ts_fidx);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002057 if (enc_utf8 && utf_iscomposing(c))
2058 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
2059 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx))
2060 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
2061 }
2062 else
2063 {
2064 ++stack[depth].ts_fidx;
2065 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1])
2066 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
2067 }
2068 break;
2069 }
2070 // FALLTHROUGH
2071
2072 case STATE_INS_PREP:
2073 if (sp->ts_flags & TSF_DIDDEL)
2074 {
2075 // If we just deleted a byte then inserting won't make sense,
2076 // a substitute is always cheaper.
2077 PROF_STORE(sp->ts_state)
2078 sp->ts_state = STATE_SWAP;
2079 break;
2080 }
2081
2082 // skip over NUL bytes
2083 n = sp->ts_arridx;
2084 for (;;)
2085 {
2086 if (sp->ts_curi > byts[n])
2087 {
2088 // Only NUL bytes at this node, go to next state.
2089 PROF_STORE(sp->ts_state)
2090 sp->ts_state = STATE_SWAP;
2091 break;
2092 }
2093 if (byts[n + sp->ts_curi] != NUL)
2094 {
2095 // Found a byte to insert.
2096 PROF_STORE(sp->ts_state)
2097 sp->ts_state = STATE_INS;
2098 break;
2099 }
2100 ++sp->ts_curi;
2101 }
2102 break;
2103
2104 // FALLTHROUGH
2105
2106 case STATE_INS:
2107 // Insert one byte. Repeat this for each possible byte at this
2108 // node.
2109 n = sp->ts_arridx;
2110 if (sp->ts_curi > byts[n])
2111 {
2112 // Done all bytes at this node, go to next state.
2113 PROF_STORE(sp->ts_state)
2114 sp->ts_state = STATE_SWAP;
2115 break;
2116 }
2117
2118 // Do one more byte at this node, but:
2119 // - Skip NUL bytes.
2120 // - Skip the byte if it's equal to the byte in the word,
2121 // accepting that byte is always better.
2122 n += sp->ts_curi++;
2123 c = byts[n];
2124 if (soundfold && sp->ts_twordlen == 0 && c == '*')
2125 // Inserting a vowel at the start of a word counts less,
2126 // see soundalike_score().
2127 newscore = 2 * SCORE_INS / 3;
2128 else
2129 newscore = SCORE_INS;
2130 if (c != fword[sp->ts_fidx]
2131 && TRY_DEEPER(su, stack, depth, newscore))
2132 {
2133 go_deeper(stack, depth, newscore);
2134#ifdef DEBUG_TRIEWALK
2135 sprintf(changename[depth], "%.*s-%s: insert %c",
2136 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2137 c);
2138#endif
2139 ++depth;
2140 sp = &stack[depth];
2141 tword[sp->ts_twordlen++] = c;
2142 sp->ts_arridx = idxs[n];
2143 if (has_mbyte)
2144 {
2145 fl = MB_BYTE2LEN(c);
2146 if (fl > 1)
2147 {
2148 // There are following bytes for the same character.
2149 // We must find all bytes before trying
2150 // delete/insert/swap/etc.
2151 sp->ts_tcharlen = fl;
2152 sp->ts_tcharidx = 1;
2153 sp->ts_isdiff = DIFF_INSERT;
2154 }
2155 }
2156 else
2157 fl = 1;
2158 if (fl == 1)
2159 {
2160 // If the previous character was the same, thus doubling a
2161 // character, give a bonus to the score. Also for
2162 // soundfold words (illogical but does give a better
2163 // score).
2164 if (sp->ts_twordlen >= 2
2165 && tword[sp->ts_twordlen - 2] == c)
2166 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
2167 }
2168 }
2169 break;
2170
2171 case STATE_SWAP:
2172 // Swap two bytes in the bad word: "12" -> "21".
2173 // We change "fword" here, it's changed back afterwards at
2174 // STATE_UNSWAP.
2175 p = fword + sp->ts_fidx;
2176 c = *p;
2177 if (c == NUL)
2178 {
2179 // End of word, can't swap or replace.
2180 PROF_STORE(sp->ts_state)
2181 sp->ts_state = STATE_FINAL;
2182 break;
2183 }
2184
2185 // Don't swap if the first character is not a word character.
2186 // SWAP3 etc. also don't make sense then.
2187 if (!soundfold && !spell_iswordp(p, curwin))
2188 {
2189 PROF_STORE(sp->ts_state)
2190 sp->ts_state = STATE_REP_INI;
2191 break;
2192 }
2193
2194 if (has_mbyte)
2195 {
2196 n = MB_CPTR2LEN(p);
2197 c = mb_ptr2char(p);
2198 if (p[n] == NUL)
2199 c2 = NUL;
2200 else if (!soundfold && !spell_iswordp(p + n, curwin))
2201 c2 = c; // don't swap non-word char
2202 else
2203 c2 = mb_ptr2char(p + n);
2204 }
2205 else
2206 {
2207 if (p[1] == NUL)
2208 c2 = NUL;
2209 else if (!soundfold && !spell_iswordp(p + 1, curwin))
2210 c2 = c; // don't swap non-word char
2211 else
2212 c2 = p[1];
2213 }
2214
2215 // When the second character is NUL we can't swap.
2216 if (c2 == NUL)
2217 {
2218 PROF_STORE(sp->ts_state)
2219 sp->ts_state = STATE_REP_INI;
2220 break;
2221 }
2222
2223 // When characters are identical, swap won't do anything.
2224 // Also get here if the second char is not a word character.
2225 if (c == c2)
2226 {
2227 PROF_STORE(sp->ts_state)
2228 sp->ts_state = STATE_SWAP3;
2229 break;
2230 }
2231 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP))
2232 {
2233 go_deeper(stack, depth, SCORE_SWAP);
2234#ifdef DEBUG_TRIEWALK
2235 sprintf(changename[depth], "%.*s-%s: swap %c and %c",
2236 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2237 c, c2);
2238#endif
2239 PROF_STORE(sp->ts_state)
2240 sp->ts_state = STATE_UNSWAP;
2241 ++depth;
2242 if (has_mbyte)
2243 {
2244 fl = mb_char2len(c2);
2245 mch_memmove(p, p + n, fl);
2246 mb_char2bytes(c, p + fl);
2247 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
2248 }
2249 else
2250 {
2251 p[0] = c2;
2252 p[1] = c;
2253 stack[depth].ts_fidxtry = sp->ts_fidx + 2;
2254 }
2255 }
2256 else
2257 {
2258 // If this swap doesn't work then SWAP3 won't either.
2259 PROF_STORE(sp->ts_state)
2260 sp->ts_state = STATE_REP_INI;
2261 }
2262 break;
2263
2264 case STATE_UNSWAP:
2265 // Undo the STATE_SWAP swap: "21" -> "12".
2266 p = fword + sp->ts_fidx;
2267 if (has_mbyte)
2268 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002269 n = mb_ptr2len(p);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002270 c = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002271 mch_memmove(p + mb_ptr2len(p + n), p, n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002272 mb_char2bytes(c, p);
2273 }
2274 else
2275 {
2276 c = *p;
2277 *p = p[1];
2278 p[1] = c;
2279 }
2280 // FALLTHROUGH
2281
2282 case STATE_SWAP3:
2283 // Swap two bytes, skipping one: "123" -> "321". We change
2284 // "fword" here, it's changed back afterwards at STATE_UNSWAP3.
2285 p = fword + sp->ts_fidx;
2286 if (has_mbyte)
2287 {
2288 n = MB_CPTR2LEN(p);
2289 c = mb_ptr2char(p);
2290 fl = MB_CPTR2LEN(p + n);
2291 c2 = mb_ptr2char(p + n);
2292 if (!soundfold && !spell_iswordp(p + n + fl, curwin))
2293 c3 = c; // don't swap non-word char
2294 else
2295 c3 = mb_ptr2char(p + n + fl);
2296 }
2297 else
2298 {
2299 c = *p;
2300 c2 = p[1];
2301 if (!soundfold && !spell_iswordp(p + 2, curwin))
2302 c3 = c; // don't swap non-word char
2303 else
2304 c3 = p[2];
2305 }
2306
2307 // When characters are identical: "121" then SWAP3 result is
2308 // identical, ROT3L result is same as SWAP: "211", ROT3L result is
2309 // same as SWAP on next char: "112". Thus skip all swapping.
2310 // Also skip when c3 is NUL.
2311 // Also get here when the third character is not a word character.
2312 // Second character may any char: "a.b" -> "b.a"
2313 if (c == c3 || c3 == NUL)
2314 {
2315 PROF_STORE(sp->ts_state)
2316 sp->ts_state = STATE_REP_INI;
2317 break;
2318 }
2319 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2320 {
2321 go_deeper(stack, depth, SCORE_SWAP3);
2322#ifdef DEBUG_TRIEWALK
2323 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c",
2324 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2325 c, c3);
2326#endif
2327 PROF_STORE(sp->ts_state)
2328 sp->ts_state = STATE_UNSWAP3;
2329 ++depth;
2330 if (has_mbyte)
2331 {
2332 tl = mb_char2len(c3);
2333 mch_memmove(p, p + n + fl, tl);
2334 mb_char2bytes(c2, p + tl);
2335 mb_char2bytes(c, p + fl + tl);
2336 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl;
2337 }
2338 else
2339 {
2340 p[0] = p[2];
2341 p[2] = c;
2342 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2343 }
2344 }
2345 else
2346 {
2347 PROF_STORE(sp->ts_state)
2348 sp->ts_state = STATE_REP_INI;
2349 }
2350 break;
2351
2352 case STATE_UNSWAP3:
2353 // Undo STATE_SWAP3: "321" -> "123"
2354 p = fword + sp->ts_fidx;
2355 if (has_mbyte)
2356 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002357 n = mb_ptr2len(p);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002358 c2 = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002359 fl = mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002360 c = mb_ptr2char(p + n + fl);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002361 tl = mb_ptr2len(p + n + fl);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002362 mch_memmove(p + fl + tl, p, n);
2363 mb_char2bytes(c, p);
2364 mb_char2bytes(c2, p + tl);
2365 p = p + tl;
2366 }
2367 else
2368 {
2369 c = *p;
2370 *p = p[2];
2371 p[2] = c;
2372 ++p;
2373 }
2374
2375 if (!soundfold && !spell_iswordp(p, curwin))
2376 {
2377 // Middle char is not a word char, skip the rotate. First and
2378 // third char were already checked at swap and swap3.
2379 PROF_STORE(sp->ts_state)
2380 sp->ts_state = STATE_REP_INI;
2381 break;
2382 }
2383
2384 // Rotate three characters left: "123" -> "231". We change
2385 // "fword" here, it's changed back afterwards at STATE_UNROT3L.
2386 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2387 {
2388 go_deeper(stack, depth, SCORE_SWAP3);
2389#ifdef DEBUG_TRIEWALK
2390 p = fword + sp->ts_fidx;
2391 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c",
2392 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2393 p[0], p[1], p[2]);
2394#endif
2395 PROF_STORE(sp->ts_state)
2396 sp->ts_state = STATE_UNROT3L;
2397 ++depth;
2398 p = fword + sp->ts_fidx;
2399 if (has_mbyte)
2400 {
2401 n = MB_CPTR2LEN(p);
2402 c = mb_ptr2char(p);
2403 fl = MB_CPTR2LEN(p + n);
2404 fl += MB_CPTR2LEN(p + n + fl);
2405 mch_memmove(p, p + n, fl);
2406 mb_char2bytes(c, p + fl);
2407 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
2408 }
2409 else
2410 {
2411 c = *p;
2412 *p = p[1];
2413 p[1] = p[2];
2414 p[2] = c;
2415 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2416 }
2417 }
2418 else
2419 {
2420 PROF_STORE(sp->ts_state)
2421 sp->ts_state = STATE_REP_INI;
2422 }
2423 break;
2424
2425 case STATE_UNROT3L:
2426 // Undo ROT3L: "231" -> "123"
2427 p = fword + sp->ts_fidx;
2428 if (has_mbyte)
2429 {
Bram Moolenaar1614a142019-10-06 22:00:13 +02002430 n = mb_ptr2len(p);
2431 n += mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002432 c = mb_ptr2char(p + n);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002433 tl = mb_ptr2len(p + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002434 mch_memmove(p + tl, p, n);
2435 mb_char2bytes(c, p);
2436 }
2437 else
2438 {
2439 c = p[2];
2440 p[2] = p[1];
2441 p[1] = *p;
2442 *p = c;
2443 }
2444
2445 // Rotate three bytes right: "123" -> "312". We change "fword"
2446 // here, it's changed back afterwards at STATE_UNROT3R.
2447 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2448 {
2449 go_deeper(stack, depth, SCORE_SWAP3);
2450#ifdef DEBUG_TRIEWALK
2451 p = fword + sp->ts_fidx;
2452 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c",
2453 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2454 p[0], p[1], p[2]);
2455#endif
2456 PROF_STORE(sp->ts_state)
2457 sp->ts_state = STATE_UNROT3R;
2458 ++depth;
2459 p = fword + sp->ts_fidx;
2460 if (has_mbyte)
2461 {
2462 n = MB_CPTR2LEN(p);
2463 n += MB_CPTR2LEN(p + n);
2464 c = mb_ptr2char(p + n);
2465 tl = MB_CPTR2LEN(p + n);
2466 mch_memmove(p + tl, p, n);
2467 mb_char2bytes(c, p);
2468 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl;
2469 }
2470 else
2471 {
2472 c = p[2];
2473 p[2] = p[1];
2474 p[1] = *p;
2475 *p = c;
2476 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2477 }
2478 }
2479 else
2480 {
2481 PROF_STORE(sp->ts_state)
2482 sp->ts_state = STATE_REP_INI;
2483 }
2484 break;
2485
2486 case STATE_UNROT3R:
2487 // Undo ROT3R: "312" -> "123"
2488 p = fword + sp->ts_fidx;
2489 if (has_mbyte)
2490 {
2491 c = mb_ptr2char(p);
Bram Moolenaar1614a142019-10-06 22:00:13 +02002492 tl = mb_ptr2len(p);
2493 n = mb_ptr2len(p + tl);
2494 n += mb_ptr2len(p + tl + n);
Bram Moolenaar46a426c2019-09-27 12:41:56 +02002495 mch_memmove(p, p + tl, n);
2496 mb_char2bytes(c, p + n);
2497 }
2498 else
2499 {
2500 c = *p;
2501 *p = p[1];
2502 p[1] = p[2];
2503 p[2] = c;
2504 }
2505 // FALLTHROUGH
2506
2507 case STATE_REP_INI:
2508 // Check if matching with REP items from the .aff file would work.
2509 // Quickly skip if:
2510 // - there are no REP items and we are not in the soundfold trie
2511 // - the score is going to be too high anyway
2512 // - already applied a REP item or swapped here
2513 if ((lp->lp_replang == NULL && !soundfold)
2514 || sp->ts_score + SCORE_REP >= su->su_maxscore
2515 || sp->ts_fidx < sp->ts_fidxtry)
2516 {
2517 PROF_STORE(sp->ts_state)
2518 sp->ts_state = STATE_FINAL;
2519 break;
2520 }
2521
2522 // Use the first byte to quickly find the first entry that may
2523 // match. If the index is -1 there is none.
2524 if (soundfold)
2525 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]];
2526 else
2527 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]];
2528
2529 if (sp->ts_curi < 0)
2530 {
2531 PROF_STORE(sp->ts_state)
2532 sp->ts_state = STATE_FINAL;
2533 break;
2534 }
2535
2536 PROF_STORE(sp->ts_state)
2537 sp->ts_state = STATE_REP;
2538 // FALLTHROUGH
2539
2540 case STATE_REP:
2541 // Try matching with REP items from the .aff file. For each match
2542 // replace the characters and check if the resulting word is
2543 // valid.
2544 p = fword + sp->ts_fidx;
2545
2546 if (soundfold)
2547 gap = &slang->sl_repsal;
2548 else
2549 gap = &lp->lp_replang->sl_rep;
2550 while (sp->ts_curi < gap->ga_len)
2551 {
2552 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++;
2553 if (*ftp->ft_from != *p)
2554 {
2555 // past possible matching entries
2556 sp->ts_curi = gap->ga_len;
2557 break;
2558 }
2559 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0
2560 && TRY_DEEPER(su, stack, depth, SCORE_REP))
2561 {
2562 go_deeper(stack, depth, SCORE_REP);
2563#ifdef DEBUG_TRIEWALK
2564 sprintf(changename[depth], "%.*s-%s: replace %s with %s",
2565 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2566 ftp->ft_from, ftp->ft_to);
2567#endif
2568 // Need to undo this afterwards.
2569 PROF_STORE(sp->ts_state)
2570 sp->ts_state = STATE_REP_UNDO;
2571
2572 // Change the "from" to the "to" string.
2573 ++depth;
2574 fl = (int)STRLEN(ftp->ft_from);
2575 tl = (int)STRLEN(ftp->ft_to);
2576 if (fl != tl)
2577 {
2578 STRMOVE(p + tl, p + fl);
2579 repextra += tl - fl;
2580 }
2581 mch_memmove(p, ftp->ft_to, tl);
2582 stack[depth].ts_fidxtry = sp->ts_fidx + tl;
2583 stack[depth].ts_tcharlen = 0;
2584 break;
2585 }
2586 }
2587
2588 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP)
2589 {
2590 // No (more) matches.
2591 PROF_STORE(sp->ts_state)
2592 sp->ts_state = STATE_FINAL;
2593 }
2594
2595 break;
2596
2597 case STATE_REP_UNDO:
2598 // Undo a REP replacement and continue with the next one.
2599 if (soundfold)
2600 gap = &slang->sl_repsal;
2601 else
2602 gap = &lp->lp_replang->sl_rep;
2603 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1;
2604 fl = (int)STRLEN(ftp->ft_from);
2605 tl = (int)STRLEN(ftp->ft_to);
2606 p = fword + sp->ts_fidx;
2607 if (fl != tl)
2608 {
2609 STRMOVE(p + fl, p + tl);
2610 repextra -= tl - fl;
2611 }
2612 mch_memmove(p, ftp->ft_from, fl);
2613 PROF_STORE(sp->ts_state)
2614 sp->ts_state = STATE_REP;
2615 break;
2616
2617 default:
2618 // Did all possible states at this level, go up one level.
2619 --depth;
2620
2621 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE)
2622 {
2623 // Continue in or go back to the prefix tree.
2624 byts = pbyts;
2625 idxs = pidxs;
2626 }
2627
2628 // Don't check for CTRL-C too often, it takes time.
2629 if (--breakcheckcount == 0)
2630 {
2631 ui_breakcheck();
2632 breakcheckcount = 1000;
2633 }
2634 }
2635 }
2636}
2637
2638
2639/*
2640 * Go one level deeper in the tree.
2641 */
2642 static void
2643go_deeper(trystate_T *stack, int depth, int score_add)
2644{
2645 stack[depth + 1] = stack[depth];
2646 stack[depth + 1].ts_state = STATE_START;
2647 stack[depth + 1].ts_score = stack[depth].ts_score + score_add;
2648 stack[depth + 1].ts_curi = 1; // start just after length byte
2649 stack[depth + 1].ts_flags = 0;
2650}
2651
2652/*
2653 * "fword" is a good word with case folded. Find the matching keep-case
2654 * words and put it in "kword".
2655 * Theoretically there could be several keep-case words that result in the
2656 * same case-folded word, but we only find one...
2657 */
2658 static void
2659find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword)
2660{
2661 char_u uword[MAXWLEN]; // "fword" in upper-case
2662 int depth;
2663 idx_T tryidx;
2664
2665 // The following arrays are used at each depth in the tree.
2666 idx_T arridx[MAXWLEN];
2667 int round[MAXWLEN];
2668 int fwordidx[MAXWLEN];
2669 int uwordidx[MAXWLEN];
2670 int kwordlen[MAXWLEN];
2671
2672 int flen, ulen;
2673 int l;
2674 int len;
2675 int c;
2676 idx_T lo, hi, m;
2677 char_u *p;
2678 char_u *byts = slang->sl_kbyts; // array with bytes of the words
2679 idx_T *idxs = slang->sl_kidxs; // array with indexes
2680
2681 if (byts == NULL)
2682 {
2683 // array is empty: "cannot happen"
2684 *kword = NUL;
2685 return;
2686 }
2687
2688 // Make an all-cap version of "fword".
2689 allcap_copy(fword, uword);
2690
2691 // Each character needs to be tried both case-folded and upper-case.
2692 // All this gets very complicated if we keep in mind that changing case
2693 // may change the byte length of a multi-byte character...
2694 depth = 0;
2695 arridx[0] = 0;
2696 round[0] = 0;
2697 fwordidx[0] = 0;
2698 uwordidx[0] = 0;
2699 kwordlen[0] = 0;
2700 while (depth >= 0)
2701 {
2702 if (fword[fwordidx[depth]] == NUL)
2703 {
2704 // We are at the end of "fword". If the tree allows a word to end
2705 // here we have found a match.
2706 if (byts[arridx[depth] + 1] == 0)
2707 {
2708 kword[kwordlen[depth]] = NUL;
2709 return;
2710 }
2711
2712 // kword is getting too long, continue one level up
2713 --depth;
2714 }
2715 else if (++round[depth] > 2)
2716 {
2717 // tried both fold-case and upper-case character, continue one
2718 // level up
2719 --depth;
2720 }
2721 else
2722 {
2723 // round[depth] == 1: Try using the folded-case character.
2724 // round[depth] == 2: Try using the upper-case character.
2725 if (has_mbyte)
2726 {
2727 flen = MB_CPTR2LEN(fword + fwordidx[depth]);
2728 ulen = MB_CPTR2LEN(uword + uwordidx[depth]);
2729 }
2730 else
2731 ulen = flen = 1;
2732 if (round[depth] == 1)
2733 {
2734 p = fword + fwordidx[depth];
2735 l = flen;
2736 }
2737 else
2738 {
2739 p = uword + uwordidx[depth];
2740 l = ulen;
2741 }
2742
2743 for (tryidx = arridx[depth]; l > 0; --l)
2744 {
2745 // Perform a binary search in the list of accepted bytes.
2746 len = byts[tryidx++];
2747 c = *p++;
2748 lo = tryidx;
2749 hi = tryidx + len - 1;
2750 while (lo < hi)
2751 {
2752 m = (lo + hi) / 2;
2753 if (byts[m] > c)
2754 hi = m - 1;
2755 else if (byts[m] < c)
2756 lo = m + 1;
2757 else
2758 {
2759 lo = hi = m;
2760 break;
2761 }
2762 }
2763
2764 // Stop if there is no matching byte.
2765 if (hi < lo || byts[lo] != c)
2766 break;
2767
2768 // Continue at the child (if there is one).
2769 tryidx = idxs[lo];
2770 }
2771
2772 if (l == 0)
2773 {
2774 // Found the matching char. Copy it to "kword" and go a
2775 // level deeper.
2776 if (round[depth] == 1)
2777 {
2778 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth],
2779 flen);
2780 kwordlen[depth + 1] = kwordlen[depth] + flen;
2781 }
2782 else
2783 {
2784 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth],
2785 ulen);
2786 kwordlen[depth + 1] = kwordlen[depth] + ulen;
2787 }
2788 fwordidx[depth + 1] = fwordidx[depth] + flen;
2789 uwordidx[depth + 1] = uwordidx[depth] + ulen;
2790
2791 ++depth;
2792 arridx[depth] = tryidx;
2793 round[depth] = 0;
2794 }
2795 }
2796 }
2797
2798 // Didn't find it: "cannot happen".
2799 *kword = NUL;
2800}
2801
2802/*
2803 * Compute the sound-a-like score for suggestions in su->su_ga and add them to
2804 * su->su_sga.
2805 */
2806 static void
2807score_comp_sal(suginfo_T *su)
2808{
2809 langp_T *lp;
2810 char_u badsound[MAXWLEN];
2811 int i;
2812 suggest_T *stp;
2813 suggest_T *sstp;
2814 int score;
2815 int lpi;
2816
2817 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL)
2818 return;
2819
2820 // Use the sound-folding of the first language that supports it.
2821 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
2822 {
2823 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
2824 if (lp->lp_slang->sl_sal.ga_len > 0)
2825 {
2826 // soundfold the bad word
2827 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound);
2828
2829 for (i = 0; i < su->su_ga.ga_len; ++i)
2830 {
2831 stp = &SUG(su->su_ga, i);
2832
2833 // Case-fold the suggested word, sound-fold it and compute the
2834 // sound-a-like score.
2835 score = stp_sal_score(stp, su, lp->lp_slang, badsound);
2836 if (score < SCORE_MAXMAX)
2837 {
2838 // Add the suggestion.
2839 sstp = &SUG(su->su_sga, su->su_sga.ga_len);
2840 sstp->st_word = vim_strsave(stp->st_word);
2841 if (sstp->st_word != NULL)
2842 {
2843 sstp->st_wordlen = stp->st_wordlen;
2844 sstp->st_score = score;
2845 sstp->st_altscore = 0;
2846 sstp->st_orglen = stp->st_orglen;
2847 ++su->su_sga.ga_len;
2848 }
2849 }
2850 }
2851 break;
2852 }
2853 }
2854}
2855
2856/*
2857 * Combine the list of suggestions in su->su_ga and su->su_sga.
2858 * They are entwined.
2859 */
2860 static void
2861score_combine(suginfo_T *su)
2862{
2863 int i;
2864 int j;
2865 garray_T ga;
2866 garray_T *gap;
2867 langp_T *lp;
2868 suggest_T *stp;
2869 char_u *p;
2870 char_u badsound[MAXWLEN];
2871 int round;
2872 int lpi;
2873 slang_T *slang = NULL;
2874
2875 // Add the alternate score to su_ga.
2876 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
2877 {
2878 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
2879 if (lp->lp_slang->sl_sal.ga_len > 0)
2880 {
2881 // soundfold the bad word
2882 slang = lp->lp_slang;
2883 spell_soundfold(slang, su->su_fbadword, TRUE, badsound);
2884
2885 for (i = 0; i < su->su_ga.ga_len; ++i)
2886 {
2887 stp = &SUG(su->su_ga, i);
2888 stp->st_altscore = stp_sal_score(stp, su, slang, badsound);
2889 if (stp->st_altscore == SCORE_MAXMAX)
2890 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4;
2891 else
2892 stp->st_score = (stp->st_score * 3
2893 + stp->st_altscore) / 4;
2894 stp->st_salscore = FALSE;
2895 }
2896 break;
2897 }
2898 }
2899
2900 if (slang == NULL) // Using "double" without sound folding.
2901 {
2902 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore,
2903 su->su_maxcount);
2904 return;
2905 }
2906
2907 // Add the alternate score to su_sga.
2908 for (i = 0; i < su->su_sga.ga_len; ++i)
2909 {
2910 stp = &SUG(su->su_sga, i);
2911 stp->st_altscore = spell_edit_score(slang,
2912 su->su_badword, stp->st_word);
2913 if (stp->st_score == SCORE_MAXMAX)
2914 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8;
2915 else
2916 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8;
2917 stp->st_salscore = TRUE;
2918 }
2919
2920 // Remove bad suggestions, sort the suggestions and truncate at "maxcount"
2921 // for both lists.
2922 check_suggestions(su, &su->su_ga);
2923 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
2924 check_suggestions(su, &su->su_sga);
2925 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount);
2926
2927 ga_init2(&ga, (int)sizeof(suginfo_T), 1);
2928 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL)
2929 return;
2930
2931 stp = &SUG(ga, 0);
2932 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i)
2933 {
2934 // round 1: get a suggestion from su_ga
2935 // round 2: get a suggestion from su_sga
2936 for (round = 1; round <= 2; ++round)
2937 {
2938 gap = round == 1 ? &su->su_ga : &su->su_sga;
2939 if (i < gap->ga_len)
2940 {
2941 // Don't add a word if it's already there.
2942 p = SUG(*gap, i).st_word;
2943 for (j = 0; j < ga.ga_len; ++j)
2944 if (STRCMP(stp[j].st_word, p) == 0)
2945 break;
2946 if (j == ga.ga_len)
2947 stp[ga.ga_len++] = SUG(*gap, i);
2948 else
2949 vim_free(p);
2950 }
2951 }
2952 }
2953
2954 ga_clear(&su->su_ga);
2955 ga_clear(&su->su_sga);
2956
2957 // Truncate the list to the number of suggestions that will be displayed.
2958 if (ga.ga_len > su->su_maxcount)
2959 {
2960 for (i = su->su_maxcount; i < ga.ga_len; ++i)
2961 vim_free(stp[i].st_word);
2962 ga.ga_len = su->su_maxcount;
2963 }
2964
2965 su->su_ga = ga;
2966}
2967
2968/*
2969 * For the goodword in "stp" compute the soundalike score compared to the
2970 * badword.
2971 */
2972 static int
2973stp_sal_score(
2974 suggest_T *stp,
2975 suginfo_T *su,
2976 slang_T *slang,
2977 char_u *badsound) // sound-folded badword
2978{
2979 char_u *p;
2980 char_u *pbad;
2981 char_u *pgood;
2982 char_u badsound2[MAXWLEN];
2983 char_u fword[MAXWLEN];
2984 char_u goodsound[MAXWLEN];
2985 char_u goodword[MAXWLEN];
2986 int lendiff;
2987
2988 lendiff = (int)(su->su_badlen - stp->st_orglen);
2989 if (lendiff >= 0)
2990 pbad = badsound;
2991 else
2992 {
2993 // soundfold the bad word with more characters following
2994 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN);
2995
2996 // When joining two words the sound often changes a lot. E.g., "t he"
2997 // sounds like "t h" while "the" sounds like "@". Avoid that by
2998 // removing the space. Don't do it when the good word also contains a
2999 // space.
3000 if (VIM_ISWHITE(su->su_badptr[su->su_badlen])
3001 && *skiptowhite(stp->st_word) == NUL)
3002 for (p = fword; *(p = skiptowhite(p)) != NUL; )
3003 STRMOVE(p, p + 1);
3004
3005 spell_soundfold(slang, fword, TRUE, badsound2);
3006 pbad = badsound2;
3007 }
3008
3009 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN)
3010 {
3011 // Add part of the bad word to the good word, so that we soundfold
3012 // what replaces the bad word.
3013 STRCPY(goodword, stp->st_word);
3014 vim_strncpy(goodword + stp->st_wordlen,
3015 su->su_badptr + su->su_badlen - lendiff, lendiff);
3016 pgood = goodword;
3017 }
3018 else
3019 pgood = stp->st_word;
3020
3021 // Sound-fold the word and compute the score for the difference.
3022 spell_soundfold(slang, pgood, FALSE, goodsound);
3023
3024 return soundalike_score(goodsound, pbad);
3025}
3026
3027// structure used to store soundfolded words that add_sound_suggest() has
3028// handled already.
3029typedef struct
3030{
3031 short sft_score; // lowest score used
3032 char_u sft_word[1]; // soundfolded word, actually longer
3033} sftword_T;
3034
3035static sftword_T dumsft;
3036#define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft)))
3037#define HI2SFT(hi) HIKEY2SFT((hi)->hi_key)
3038
3039/*
3040 * Prepare for calling suggest_try_soundalike().
3041 */
3042 static void
3043suggest_try_soundalike_prep(void)
3044{
3045 langp_T *lp;
3046 int lpi;
3047 slang_T *slang;
3048
3049 // Do this for all languages that support sound folding and for which a
3050 // .sug file has been loaded.
3051 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3052 {
3053 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3054 slang = lp->lp_slang;
3055 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3056 // prepare the hashtable used by add_sound_suggest()
3057 hash_init(&slang->sl_sounddone);
3058 }
3059}
3060
3061/*
3062 * Find suggestions by comparing the word in a sound-a-like form.
3063 * Note: This doesn't support postponed prefixes.
3064 */
3065 static void
3066suggest_try_soundalike(suginfo_T *su)
3067{
3068 char_u salword[MAXWLEN];
3069 langp_T *lp;
3070 int lpi;
3071 slang_T *slang;
3072
3073 // Do this for all languages that support sound folding and for which a
3074 // .sug file has been loaded.
3075 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3076 {
3077 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3078 slang = lp->lp_slang;
3079 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3080 {
3081 // soundfold the bad word
3082 spell_soundfold(slang, su->su_fbadword, TRUE, salword);
3083
3084 // try all kinds of inserts/deletes/swaps/etc.
3085 // TODO: also soundfold the next words, so that we can try joining
3086 // and splitting
3087#ifdef SUGGEST_PROFILE
3088 prof_init();
3089#endif
3090 suggest_trie_walk(su, lp, salword, TRUE);
3091#ifdef SUGGEST_PROFILE
3092 prof_report("soundalike");
3093#endif
3094 }
3095 }
3096}
3097
3098/*
3099 * Finish up after calling suggest_try_soundalike().
3100 */
3101 static void
3102suggest_try_soundalike_finish(void)
3103{
3104 langp_T *lp;
3105 int lpi;
3106 slang_T *slang;
3107 int todo;
3108 hashitem_T *hi;
3109
3110 // Do this for all languages that support sound folding and for which a
3111 // .sug file has been loaded.
3112 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3113 {
3114 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3115 slang = lp->lp_slang;
3116 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3117 {
3118 // Free the info about handled words.
3119 todo = (int)slang->sl_sounddone.ht_used;
3120 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi)
3121 if (!HASHITEM_EMPTY(hi))
3122 {
3123 vim_free(HI2SFT(hi));
3124 --todo;
3125 }
3126
3127 // Clear the hashtable, it may also be used by another region.
3128 hash_clear(&slang->sl_sounddone);
3129 hash_init(&slang->sl_sounddone);
3130 }
3131 }
3132}
3133
3134/*
3135 * A match with a soundfolded word is found. Add the good word(s) that
3136 * produce this soundfolded word.
3137 */
3138 static void
3139add_sound_suggest(
3140 suginfo_T *su,
3141 char_u *goodword,
3142 int score, // soundfold score
3143 langp_T *lp)
3144{
3145 slang_T *slang = lp->lp_slang; // language for sound folding
3146 int sfwordnr;
3147 char_u *nrline;
3148 int orgnr;
3149 char_u theword[MAXWLEN];
3150 int i;
3151 int wlen;
3152 char_u *byts;
3153 idx_T *idxs;
3154 int n;
3155 int wordcount;
3156 int wc;
3157 int goodscore;
3158 hash_T hash;
3159 hashitem_T *hi;
3160 sftword_T *sft;
3161 int bc, gc;
3162 int limit;
3163
3164 // It's very well possible that the same soundfold word is found several
3165 // times with different scores. Since the following is quite slow only do
3166 // the words that have a better score than before. Use a hashtable to
3167 // remember the words that have been done.
3168 hash = hash_hash(goodword);
3169 hi = hash_lookup(&slang->sl_sounddone, goodword, hash);
3170 if (HASHITEM_EMPTY(hi))
3171 {
3172 sft = alloc(sizeof(sftword_T) + STRLEN(goodword));
3173 if (sft != NULL)
3174 {
3175 sft->sft_score = score;
3176 STRCPY(sft->sft_word, goodword);
3177 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash);
3178 }
3179 }
3180 else
3181 {
3182 sft = HI2SFT(hi);
3183 if (score >= sft->sft_score)
3184 return;
3185 sft->sft_score = score;
3186 }
3187
3188 // Find the word nr in the soundfold tree.
3189 sfwordnr = soundfold_find(slang, goodword);
3190 if (sfwordnr < 0)
3191 {
3192 internal_error("add_sound_suggest()");
3193 return;
3194 }
3195
3196 // go over the list of good words that produce this soundfold word
3197 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE);
3198 orgnr = 0;
3199 while (*nrline != NUL)
3200 {
3201 // The wordnr was stored in a minimal nr of bytes as an offset to the
3202 // previous wordnr.
3203 orgnr += bytes2offset(&nrline);
3204
3205 byts = slang->sl_fbyts;
3206 idxs = slang->sl_fidxs;
3207
3208 // Lookup the word "orgnr" one of the two tries.
3209 n = 0;
3210 wordcount = 0;
3211 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen)
3212 {
3213 i = 1;
3214 if (wordcount == orgnr && byts[n + 1] == NUL)
3215 break; // found end of word
3216
3217 if (byts[n + 1] == NUL)
3218 ++wordcount;
3219
3220 // skip over the NUL bytes
3221 for ( ; byts[n + i] == NUL; ++i)
3222 if (i > byts[n]) // safety check
3223 {
3224 STRCPY(theword + wlen, "BAD");
3225 wlen += 3;
3226 goto badword;
3227 }
3228
3229 // One of the siblings must have the word.
3230 for ( ; i < byts[n]; ++i)
3231 {
3232 wc = idxs[idxs[n + i]]; // nr of words under this byte
3233 if (wordcount + wc > orgnr)
3234 break;
3235 wordcount += wc;
3236 }
3237
3238 theword[wlen] = byts[n + i];
3239 n = idxs[n + i];
3240 }
3241badword:
3242 theword[wlen] = NUL;
3243
3244 // Go over the possible flags and regions.
3245 for (; i <= byts[n] && byts[n + i] == NUL; ++i)
3246 {
3247 char_u cword[MAXWLEN];
3248 char_u *p;
3249 int flags = (int)idxs[n + i];
3250
3251 // Skip words with the NOSUGGEST flag
3252 if (flags & WF_NOSUGGEST)
3253 continue;
3254
3255 if (flags & WF_KEEPCAP)
3256 {
3257 // Must find the word in the keep-case tree.
3258 find_keepcap_word(slang, theword, cword);
3259 p = cword;
3260 }
3261 else
3262 {
3263 flags |= su->su_badflags;
3264 if ((flags & WF_CAPMASK) != 0)
3265 {
3266 // Need to fix case according to "flags".
3267 make_case_word(theword, cword, flags);
3268 p = cword;
3269 }
3270 else
3271 p = theword;
3272 }
3273
3274 // Add the suggestion.
3275 if (sps_flags & SPS_DOUBLE)
3276 {
3277 // Add the suggestion if the score isn't too bad.
3278 if (score <= su->su_maxscore)
3279 add_suggestion(su, &su->su_sga, p, su->su_badlen,
3280 score, 0, FALSE, slang, FALSE);
3281 }
3282 else
3283 {
3284 // Add a penalty for words in another region.
3285 if ((flags & WF_REGION)
3286 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
3287 goodscore = SCORE_REGION;
3288 else
3289 goodscore = 0;
3290
3291 // Add a small penalty for changing the first letter from
3292 // lower to upper case. Helps for "tath" -> "Kath", which is
3293 // less common than "tath" -> "path". Don't do it when the
3294 // letter is the same, that has already been counted.
3295 gc = PTR2CHAR(p);
3296 if (SPELL_ISUPPER(gc))
3297 {
3298 bc = PTR2CHAR(su->su_badword);
3299 if (!SPELL_ISUPPER(bc)
3300 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc))
3301 goodscore += SCORE_ICASE / 2;
3302 }
3303
3304 // Compute the score for the good word. This only does letter
3305 // insert/delete/swap/replace. REP items are not considered,
3306 // which may make the score a bit higher.
3307 // Use a limit for the score to make it work faster. Use
3308 // MAXSCORE(), because RESCORE() will change the score.
3309 // If the limit is very high then the iterative method is
3310 // inefficient, using an array is quicker.
3311 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score);
3312 if (limit > SCORE_LIMITMAX)
3313 goodscore += spell_edit_score(slang, su->su_badword, p);
3314 else
3315 goodscore += spell_edit_score_limit(slang, su->su_badword,
3316 p, limit);
3317
3318 // When going over the limit don't bother to do the rest.
3319 if (goodscore < SCORE_MAXMAX)
3320 {
3321 // Give a bonus to words seen before.
3322 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE);
3323
3324 // Add the suggestion if the score isn't too bad.
3325 goodscore = RESCORE(goodscore, score);
3326 if (goodscore <= su->su_sfmaxscore)
3327 add_suggestion(su, &su->su_ga, p, su->su_badlen,
3328 goodscore, score, TRUE, slang, TRUE);
3329 }
3330 }
3331 }
3332 // smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr);
3333 }
3334}
3335
3336/*
3337 * Find word "word" in fold-case tree for "slang" and return the word number.
3338 */
3339 static int
3340soundfold_find(slang_T *slang, char_u *word)
3341{
3342 idx_T arridx = 0;
3343 int len;
3344 int wlen = 0;
3345 int c;
3346 char_u *ptr = word;
3347 char_u *byts;
3348 idx_T *idxs;
3349 int wordnr = 0;
3350
3351 byts = slang->sl_sbyts;
3352 idxs = slang->sl_sidxs;
3353
3354 for (;;)
3355 {
3356 // First byte is the number of possible bytes.
3357 len = byts[arridx++];
3358
3359 // If the first possible byte is a zero the word could end here.
3360 // If the word ends we found the word. If not skip the NUL bytes.
3361 c = ptr[wlen];
3362 if (byts[arridx] == NUL)
3363 {
3364 if (c == NUL)
3365 break;
3366
3367 // Skip over the zeros, there can be several.
3368 while (len > 0 && byts[arridx] == NUL)
3369 {
3370 ++arridx;
3371 --len;
3372 }
3373 if (len == 0)
3374 return -1; // no children, word should have ended here
3375 ++wordnr;
3376 }
3377
3378 // If the word ends we didn't find it.
3379 if (c == NUL)
3380 return -1;
3381
3382 // Perform a binary search in the list of accepted bytes.
3383 if (c == TAB) // <Tab> is handled like <Space>
3384 c = ' ';
3385 while (byts[arridx] < c)
3386 {
3387 // The word count is in the first idxs[] entry of the child.
3388 wordnr += idxs[idxs[arridx]];
3389 ++arridx;
3390 if (--len == 0) // end of the bytes, didn't find it
3391 return -1;
3392 }
3393 if (byts[arridx] != c) // didn't find the byte
3394 return -1;
3395
3396 // Continue at the child (if there is one).
3397 arridx = idxs[arridx];
3398 ++wlen;
3399
3400 // One space in the good word may stand for several spaces in the
3401 // checked word.
3402 if (c == ' ')
3403 while (ptr[wlen] == ' ' || ptr[wlen] == TAB)
3404 ++wlen;
3405 }
3406
3407 return wordnr;
3408}
3409
3410/*
3411 * Return TRUE if "c1" and "c2" are similar characters according to the MAP
3412 * lines in the .aff file.
3413 */
3414 static int
3415similar_chars(slang_T *slang, int c1, int c2)
3416{
3417 int m1, m2;
3418 char_u buf[MB_MAXBYTES + 1];
3419 hashitem_T *hi;
3420
3421 if (c1 >= 256)
3422 {
3423 buf[mb_char2bytes(c1, buf)] = 0;
3424 hi = hash_find(&slang->sl_map_hash, buf);
3425 if (HASHITEM_EMPTY(hi))
3426 m1 = 0;
3427 else
3428 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
3429 }
3430 else
3431 m1 = slang->sl_map_array[c1];
3432 if (m1 == 0)
3433 return FALSE;
3434
3435
3436 if (c2 >= 256)
3437 {
3438 buf[mb_char2bytes(c2, buf)] = 0;
3439 hi = hash_find(&slang->sl_map_hash, buf);
3440 if (HASHITEM_EMPTY(hi))
3441 m2 = 0;
3442 else
3443 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
3444 }
3445 else
3446 m2 = slang->sl_map_array[c2];
3447
3448 return m1 == m2;
3449}
3450
3451/*
3452 * Add a suggestion to the list of suggestions.
3453 * For a suggestion that is already in the list the lowest score is remembered.
3454 */
3455 static void
3456add_suggestion(
3457 suginfo_T *su,
3458 garray_T *gap, // either su_ga or su_sga
3459 char_u *goodword,
3460 int badlenarg, // len of bad word replaced with "goodword"
3461 int score,
3462 int altscore,
3463 int had_bonus, // value for st_had_bonus
3464 slang_T *slang, // language for sound folding
3465 int maxsf) // su_maxscore applies to soundfold score,
3466 // su_sfmaxscore to the total score.
3467{
3468 int goodlen; // len of goodword changed
3469 int badlen; // len of bad word changed
3470 suggest_T *stp;
3471 suggest_T new_sug;
3472 int i;
3473 char_u *pgood, *pbad;
3474
3475 // Minimize "badlen" for consistency. Avoids that changing "the the" to
3476 // "thee the" is added next to changing the first "the" the "thee".
3477 pgood = goodword + STRLEN(goodword);
3478 pbad = su->su_badptr + badlenarg;
3479 for (;;)
3480 {
3481 goodlen = (int)(pgood - goodword);
3482 badlen = (int)(pbad - su->su_badptr);
3483 if (goodlen <= 0 || badlen <= 0)
3484 break;
3485 MB_PTR_BACK(goodword, pgood);
3486 MB_PTR_BACK(su->su_badptr, pbad);
3487 if (has_mbyte)
3488 {
3489 if (mb_ptr2char(pgood) != mb_ptr2char(pbad))
3490 break;
3491 }
3492 else if (*pgood != *pbad)
3493 break;
3494 }
3495
3496 if (badlen == 0 && goodlen == 0)
3497 // goodword doesn't change anything; may happen for "the the" changing
3498 // the first "the" to itself.
3499 return;
3500
3501 if (gap->ga_len == 0)
3502 i = -1;
3503 else
3504 {
3505 // Check if the word is already there. Also check the length that is
3506 // being replaced "thes," -> "these" is a different suggestion from
3507 // "thes" -> "these".
3508 stp = &SUG(*gap, 0);
3509 for (i = gap->ga_len; --i >= 0; ++stp)
3510 if (stp->st_wordlen == goodlen
3511 && stp->st_orglen == badlen
3512 && STRNCMP(stp->st_word, goodword, goodlen) == 0)
3513 {
3514 // Found it. Remember the word with the lowest score.
3515 if (stp->st_slang == NULL)
3516 stp->st_slang = slang;
3517
3518 new_sug.st_score = score;
3519 new_sug.st_altscore = altscore;
3520 new_sug.st_had_bonus = had_bonus;
3521
3522 if (stp->st_had_bonus != had_bonus)
3523 {
3524 // Only one of the two had the soundalike score computed.
3525 // Need to do that for the other one now, otherwise the
3526 // scores can't be compared. This happens because
3527 // suggest_try_change() doesn't compute the soundalike
3528 // word to keep it fast, while some special methods set
3529 // the soundalike score to zero.
3530 if (had_bonus)
3531 rescore_one(su, stp);
3532 else
3533 {
3534 new_sug.st_word = stp->st_word;
3535 new_sug.st_wordlen = stp->st_wordlen;
3536 new_sug.st_slang = stp->st_slang;
3537 new_sug.st_orglen = badlen;
3538 rescore_one(su, &new_sug);
3539 }
3540 }
3541
3542 if (stp->st_score > new_sug.st_score)
3543 {
3544 stp->st_score = new_sug.st_score;
3545 stp->st_altscore = new_sug.st_altscore;
3546 stp->st_had_bonus = new_sug.st_had_bonus;
3547 }
3548 break;
3549 }
3550 }
3551
3552 if (i < 0 && ga_grow(gap, 1) == OK)
3553 {
3554 // Add a suggestion.
3555 stp = &SUG(*gap, gap->ga_len);
3556 stp->st_word = vim_strnsave(goodword, goodlen);
3557 if (stp->st_word != NULL)
3558 {
3559 stp->st_wordlen = goodlen;
3560 stp->st_score = score;
3561 stp->st_altscore = altscore;
3562 stp->st_had_bonus = had_bonus;
3563 stp->st_orglen = badlen;
3564 stp->st_slang = slang;
3565 ++gap->ga_len;
3566
3567 // If we have too many suggestions now, sort the list and keep
3568 // the best suggestions.
3569 if (gap->ga_len > SUG_MAX_COUNT(su))
3570 {
3571 if (maxsf)
3572 su->su_sfmaxscore = cleanup_suggestions(gap,
3573 su->su_sfmaxscore, SUG_CLEAN_COUNT(su));
3574 else
3575 su->su_maxscore = cleanup_suggestions(gap,
3576 su->su_maxscore, SUG_CLEAN_COUNT(su));
3577 }
3578 }
3579 }
3580}
3581
3582/*
3583 * Suggestions may in fact be flagged as errors. Esp. for banned words and
3584 * for split words, such as "the the". Remove these from the list here.
3585 */
3586 static void
3587check_suggestions(
3588 suginfo_T *su,
3589 garray_T *gap) // either su_ga or su_sga
3590{
3591 suggest_T *stp;
3592 int i;
3593 char_u longword[MAXWLEN + 1];
3594 int len;
3595 hlf_T attr;
3596
3597 stp = &SUG(*gap, 0);
3598 for (i = gap->ga_len - 1; i >= 0; --i)
3599 {
3600 // Need to append what follows to check for "the the".
3601 vim_strncpy(longword, stp[i].st_word, MAXWLEN);
3602 len = stp[i].st_wordlen;
3603 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen,
3604 MAXWLEN - len);
3605 attr = HLF_COUNT;
3606 (void)spell_check(curwin, longword, &attr, NULL, FALSE);
3607 if (attr != HLF_COUNT)
3608 {
3609 // Remove this entry.
3610 vim_free(stp[i].st_word);
3611 --gap->ga_len;
3612 if (i < gap->ga_len)
3613 mch_memmove(stp + i, stp + i + 1,
3614 sizeof(suggest_T) * (gap->ga_len - i));
3615 }
3616 }
3617}
3618
3619
3620/*
3621 * Add a word to be banned.
3622 */
3623 static void
3624add_banned(
3625 suginfo_T *su,
3626 char_u *word)
3627{
3628 char_u *s;
3629 hash_T hash;
3630 hashitem_T *hi;
3631
3632 hash = hash_hash(word);
3633 hi = hash_lookup(&su->su_banned, word, hash);
3634 if (HASHITEM_EMPTY(hi))
3635 {
3636 s = vim_strsave(word);
3637 if (s != NULL)
3638 hash_add_item(&su->su_banned, hi, s, hash);
3639 }
3640}
3641
3642/*
3643 * Recompute the score for all suggestions if sound-folding is possible. This
3644 * is slow, thus only done for the final results.
3645 */
3646 static void
3647rescore_suggestions(suginfo_T *su)
3648{
3649 int i;
3650
3651 if (su->su_sallang != NULL)
3652 for (i = 0; i < su->su_ga.ga_len; ++i)
3653 rescore_one(su, &SUG(su->su_ga, i));
3654}
3655
3656/*
3657 * Recompute the score for one suggestion if sound-folding is possible.
3658 */
3659 static void
3660rescore_one(suginfo_T *su, suggest_T *stp)
3661{
3662 slang_T *slang = stp->st_slang;
3663 char_u sal_badword[MAXWLEN];
3664 char_u *p;
3665
3666 // Only rescore suggestions that have no sal score yet and do have a
3667 // language.
3668 if (slang != NULL && slang->sl_sal.ga_len > 0 && !stp->st_had_bonus)
3669 {
3670 if (slang == su->su_sallang)
3671 p = su->su_sal_badword;
3672 else
3673 {
3674 spell_soundfold(slang, su->su_fbadword, TRUE, sal_badword);
3675 p = sal_badword;
3676 }
3677
3678 stp->st_altscore = stp_sal_score(stp, su, slang, p);
3679 if (stp->st_altscore == SCORE_MAXMAX)
3680 stp->st_altscore = SCORE_BIG;
3681 stp->st_score = RESCORE(stp->st_score, stp->st_altscore);
3682 stp->st_had_bonus = TRUE;
3683 }
3684}
3685
3686static int sug_compare(const void *s1, const void *s2);
3687
3688/*
3689 * Function given to qsort() to sort the suggestions on st_score.
3690 * First on "st_score", then "st_altscore" then alphabetically.
3691 */
3692 static int
3693sug_compare(const void *s1, const void *s2)
3694{
3695 suggest_T *p1 = (suggest_T *)s1;
3696 suggest_T *p2 = (suggest_T *)s2;
3697 int n = p1->st_score - p2->st_score;
3698
3699 if (n == 0)
3700 {
3701 n = p1->st_altscore - p2->st_altscore;
3702 if (n == 0)
3703 n = STRICMP(p1->st_word, p2->st_word);
3704 }
3705 return n;
3706}
3707
3708/*
3709 * Cleanup the suggestions:
3710 * - Sort on score.
3711 * - Remove words that won't be displayed.
3712 * Returns the maximum score in the list or "maxscore" unmodified.
3713 */
3714 static int
3715cleanup_suggestions(
3716 garray_T *gap,
3717 int maxscore,
3718 int keep) // nr of suggestions to keep
3719{
3720 suggest_T *stp = &SUG(*gap, 0);
3721 int i;
3722
3723 // Sort the list.
3724 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare);
3725
3726 // Truncate the list to the number of suggestions that will be displayed.
3727 if (gap->ga_len > keep)
3728 {
3729 for (i = keep; i < gap->ga_len; ++i)
3730 vim_free(stp[i].st_word);
3731 gap->ga_len = keep;
Bram Moolenaar569fea22019-12-25 13:55:24 +01003732 if (keep >= 1)
3733 return stp[keep - 1].st_score;
Bram Moolenaar46a426c2019-09-27 12:41:56 +02003734 }
3735 return maxscore;
3736}
3737
3738/*
3739 * Compute a score for two sound-a-like words.
3740 * This permits up to two inserts/deletes/swaps/etc. to keep things fast.
3741 * Instead of a generic loop we write out the code. That keeps it fast by
3742 * avoiding checks that will not be possible.
3743 */
3744 static int
3745soundalike_score(
3746 char_u *goodstart, // sound-folded good word
3747 char_u *badstart) // sound-folded bad word
3748{
3749 char_u *goodsound = goodstart;
3750 char_u *badsound = badstart;
3751 int goodlen;
3752 int badlen;
3753 int n;
3754 char_u *pl, *ps;
3755 char_u *pl2, *ps2;
3756 int score = 0;
3757
3758 // Adding/inserting "*" at the start (word starts with vowel) shouldn't be
3759 // counted so much, vowels halfway the word aren't counted at all.
3760 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound)
3761 {
3762 if ((badsound[0] == NUL && goodsound[1] == NUL)
3763 || (goodsound[0] == NUL && badsound[1] == NUL))
3764 // changing word with vowel to word without a sound
3765 return SCORE_DEL;
3766 if (badsound[0] == NUL || goodsound[0] == NUL)
3767 // more than two changes
3768 return SCORE_MAXMAX;
3769
3770 if (badsound[1] == goodsound[1]
3771 || (badsound[1] != NUL
3772 && goodsound[1] != NUL
3773 && badsound[2] == goodsound[2]))
3774 {
3775 // handle like a substitute
3776 }
3777 else
3778 {
3779 score = 2 * SCORE_DEL / 3;
3780 if (*badsound == '*')
3781 ++badsound;
3782 else
3783 ++goodsound;
3784 }
3785 }
3786
3787 goodlen = (int)STRLEN(goodsound);
3788 badlen = (int)STRLEN(badsound);
3789
3790 // Return quickly if the lengths are too different to be fixed by two
3791 // changes.
3792 n = goodlen - badlen;
3793 if (n < -2 || n > 2)
3794 return SCORE_MAXMAX;
3795
3796 if (n > 0)
3797 {
3798 pl = goodsound; // goodsound is longest
3799 ps = badsound;
3800 }
3801 else
3802 {
3803 pl = badsound; // badsound is longest
3804 ps = goodsound;
3805 }
3806
3807 // Skip over the identical part.
3808 while (*pl == *ps && *pl != NUL)
3809 {
3810 ++pl;
3811 ++ps;
3812 }
3813
3814 switch (n)
3815 {
3816 case -2:
3817 case 2:
3818 // Must delete two characters from "pl".
3819 ++pl; // first delete
3820 while (*pl == *ps)
3821 {
3822 ++pl;
3823 ++ps;
3824 }
3825 // strings must be equal after second delete
3826 if (STRCMP(pl + 1, ps) == 0)
3827 return score + SCORE_DEL * 2;
3828
3829 // Failed to compare.
3830 break;
3831
3832 case -1:
3833 case 1:
3834 // Minimal one delete from "pl" required.
3835
3836 // 1: delete
3837 pl2 = pl + 1;
3838 ps2 = ps;
3839 while (*pl2 == *ps2)
3840 {
3841 if (*pl2 == NUL) // reached the end
3842 return score + SCORE_DEL;
3843 ++pl2;
3844 ++ps2;
3845 }
3846
3847 // 2: delete then swap, then rest must be equal
3848 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3849 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3850 return score + SCORE_DEL + SCORE_SWAP;
3851
3852 // 3: delete then substitute, then the rest must be equal
3853 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3854 return score + SCORE_DEL + SCORE_SUBST;
3855
3856 // 4: first swap then delete
3857 if (pl[0] == ps[1] && pl[1] == ps[0])
3858 {
3859 pl2 = pl + 2; // swap, skip two chars
3860 ps2 = ps + 2;
3861 while (*pl2 == *ps2)
3862 {
3863 ++pl2;
3864 ++ps2;
3865 }
3866 // delete a char and then strings must be equal
3867 if (STRCMP(pl2 + 1, ps2) == 0)
3868 return score + SCORE_SWAP + SCORE_DEL;
3869 }
3870
3871 // 5: first substitute then delete
3872 pl2 = pl + 1; // substitute, skip one char
3873 ps2 = ps + 1;
3874 while (*pl2 == *ps2)
3875 {
3876 ++pl2;
3877 ++ps2;
3878 }
3879 // delete a char and then strings must be equal
3880 if (STRCMP(pl2 + 1, ps2) == 0)
3881 return score + SCORE_SUBST + SCORE_DEL;
3882
3883 // Failed to compare.
3884 break;
3885
3886 case 0:
3887 // Lengths are equal, thus changes must result in same length: An
3888 // insert is only possible in combination with a delete.
3889 // 1: check if for identical strings
3890 if (*pl == NUL)
3891 return score;
3892
3893 // 2: swap
3894 if (pl[0] == ps[1] && pl[1] == ps[0])
3895 {
3896 pl2 = pl + 2; // swap, skip two chars
3897 ps2 = ps + 2;
3898 while (*pl2 == *ps2)
3899 {
3900 if (*pl2 == NUL) // reached the end
3901 return score + SCORE_SWAP;
3902 ++pl2;
3903 ++ps2;
3904 }
3905 // 3: swap and swap again
3906 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3907 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3908 return score + SCORE_SWAP + SCORE_SWAP;
3909
3910 // 4: swap and substitute
3911 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3912 return score + SCORE_SWAP + SCORE_SUBST;
3913 }
3914
3915 // 5: substitute
3916 pl2 = pl + 1;
3917 ps2 = ps + 1;
3918 while (*pl2 == *ps2)
3919 {
3920 if (*pl2 == NUL) // reached the end
3921 return score + SCORE_SUBST;
3922 ++pl2;
3923 ++ps2;
3924 }
3925
3926 // 6: substitute and swap
3927 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3928 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3929 return score + SCORE_SUBST + SCORE_SWAP;
3930
3931 // 7: substitute and substitute
3932 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3933 return score + SCORE_SUBST + SCORE_SUBST;
3934
3935 // 8: insert then delete
3936 pl2 = pl;
3937 ps2 = ps + 1;
3938 while (*pl2 == *ps2)
3939 {
3940 ++pl2;
3941 ++ps2;
3942 }
3943 if (STRCMP(pl2 + 1, ps2) == 0)
3944 return score + SCORE_INS + SCORE_DEL;
3945
3946 // 9: delete then insert
3947 pl2 = pl + 1;
3948 ps2 = ps;
3949 while (*pl2 == *ps2)
3950 {
3951 ++pl2;
3952 ++ps2;
3953 }
3954 if (STRCMP(pl2, ps2 + 1) == 0)
3955 return score + SCORE_INS + SCORE_DEL;
3956
3957 // Failed to compare.
3958 break;
3959 }
3960
3961 return SCORE_MAXMAX;
3962}
3963
3964/*
3965 * Compute the "edit distance" to turn "badword" into "goodword". The less
3966 * deletes/inserts/substitutes/swaps are required the lower the score.
3967 *
3968 * The algorithm is described by Du and Chang, 1992.
3969 * The implementation of the algorithm comes from Aspell editdist.cpp,
3970 * edit_distance(). It has been converted from C++ to C and modified to
3971 * support multi-byte characters.
3972 */
3973 static int
3974spell_edit_score(
3975 slang_T *slang,
3976 char_u *badword,
3977 char_u *goodword)
3978{
3979 int *cnt;
3980 int badlen, goodlen; // lengths including NUL
3981 int j, i;
3982 int t;
3983 int bc, gc;
3984 int pbc, pgc;
3985 char_u *p;
3986 int wbadword[MAXWLEN];
3987 int wgoodword[MAXWLEN];
3988
3989 if (has_mbyte)
3990 {
3991 // Get the characters from the multi-byte strings and put them in an
3992 // int array for easy access.
3993 for (p = badword, badlen = 0; *p != NUL; )
3994 wbadword[badlen++] = mb_cptr2char_adv(&p);
3995 wbadword[badlen++] = 0;
3996 for (p = goodword, goodlen = 0; *p != NUL; )
3997 wgoodword[goodlen++] = mb_cptr2char_adv(&p);
3998 wgoodword[goodlen++] = 0;
3999 }
4000 else
4001 {
4002 badlen = (int)STRLEN(badword) + 1;
4003 goodlen = (int)STRLEN(goodword) + 1;
4004 }
4005
4006 // We use "cnt" as an array: CNT(badword_idx, goodword_idx).
4007#define CNT(a, b) cnt[(a) + (b) * (badlen + 1)]
4008 cnt = ALLOC_MULT(int, (badlen + 1) * (goodlen + 1));
4009 if (cnt == NULL)
4010 return 0; // out of memory
4011
4012 CNT(0, 0) = 0;
4013 for (j = 1; j <= goodlen; ++j)
4014 CNT(0, j) = CNT(0, j - 1) + SCORE_INS;
4015
4016 for (i = 1; i <= badlen; ++i)
4017 {
4018 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL;
4019 for (j = 1; j <= goodlen; ++j)
4020 {
4021 if (has_mbyte)
4022 {
4023 bc = wbadword[i - 1];
4024 gc = wgoodword[j - 1];
4025 }
4026 else
4027 {
4028 bc = badword[i - 1];
4029 gc = goodword[j - 1];
4030 }
4031 if (bc == gc)
4032 CNT(i, j) = CNT(i - 1, j - 1);
4033 else
4034 {
4035 // Use a better score when there is only a case difference.
4036 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4037 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
4038 else
4039 {
4040 // For a similar character use SCORE_SIMILAR.
4041 if (slang != NULL
4042 && slang->sl_has_map
4043 && similar_chars(slang, gc, bc))
4044 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1);
4045 else
4046 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
4047 }
4048
4049 if (i > 1 && j > 1)
4050 {
4051 if (has_mbyte)
4052 {
4053 pbc = wbadword[i - 2];
4054 pgc = wgoodword[j - 2];
4055 }
4056 else
4057 {
4058 pbc = badword[i - 2];
4059 pgc = goodword[j - 2];
4060 }
4061 if (bc == pgc && pbc == gc)
4062 {
4063 t = SCORE_SWAP + CNT(i - 2, j - 2);
4064 if (t < CNT(i, j))
4065 CNT(i, j) = t;
4066 }
4067 }
4068 t = SCORE_DEL + CNT(i - 1, j);
4069 if (t < CNT(i, j))
4070 CNT(i, j) = t;
4071 t = SCORE_INS + CNT(i, j - 1);
4072 if (t < CNT(i, j))
4073 CNT(i, j) = t;
4074 }
4075 }
4076 }
4077
4078 i = CNT(badlen - 1, goodlen - 1);
4079 vim_free(cnt);
4080 return i;
4081}
4082
4083typedef struct
4084{
4085 int badi;
4086 int goodi;
4087 int score;
4088} limitscore_T;
4089
4090/*
4091 * Like spell_edit_score(), but with a limit on the score to make it faster.
4092 * May return SCORE_MAXMAX when the score is higher than "limit".
4093 *
4094 * This uses a stack for the edits still to be tried.
4095 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support
4096 * for multi-byte characters.
4097 */
4098 static int
4099spell_edit_score_limit(
4100 slang_T *slang,
4101 char_u *badword,
4102 char_u *goodword,
4103 int limit)
4104{
4105 limitscore_T stack[10]; // allow for over 3 * 2 edits
4106 int stackidx;
4107 int bi, gi;
4108 int bi2, gi2;
4109 int bc, gc;
4110 int score;
4111 int score_off;
4112 int minscore;
4113 int round;
4114
4115 // Multi-byte characters require a bit more work, use a different function
4116 // to avoid testing "has_mbyte" quite often.
4117 if (has_mbyte)
4118 return spell_edit_score_limit_w(slang, badword, goodword, limit);
4119
4120 // The idea is to go from start to end over the words. So long as
4121 // characters are equal just continue, this always gives the lowest score.
4122 // When there is a difference try several alternatives. Each alternative
4123 // increases "score" for the edit distance. Some of the alternatives are
4124 // pushed unto a stack and tried later, some are tried right away. At the
4125 // end of the word the score for one alternative is known. The lowest
4126 // possible score is stored in "minscore".
4127 stackidx = 0;
4128 bi = 0;
4129 gi = 0;
4130 score = 0;
4131 minscore = limit + 1;
4132
4133 for (;;)
4134 {
4135 // Skip over an equal part, score remains the same.
4136 for (;;)
4137 {
4138 bc = badword[bi];
4139 gc = goodword[gi];
4140 if (bc != gc) // stop at a char that's different
4141 break;
4142 if (bc == NUL) // both words end
4143 {
4144 if (score < minscore)
4145 minscore = score;
4146 goto pop; // do next alternative
4147 }
4148 ++bi;
4149 ++gi;
4150 }
4151
4152 if (gc == NUL) // goodword ends, delete badword chars
4153 {
4154 do
4155 {
4156 if ((score += SCORE_DEL) >= minscore)
4157 goto pop; // do next alternative
4158 } while (badword[++bi] != NUL);
4159 minscore = score;
4160 }
4161 else if (bc == NUL) // badword ends, insert badword chars
4162 {
4163 do
4164 {
4165 if ((score += SCORE_INS) >= minscore)
4166 goto pop; // do next alternative
4167 } while (goodword[++gi] != NUL);
4168 minscore = score;
4169 }
4170 else // both words continue
4171 {
4172 // If not close to the limit, perform a change. Only try changes
4173 // that may lead to a lower score than "minscore".
4174 // round 0: try deleting a char from badword
4175 // round 1: try inserting a char in badword
4176 for (round = 0; round <= 1; ++round)
4177 {
4178 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
4179 if (score_off < minscore)
4180 {
4181 if (score_off + SCORE_EDIT_MIN >= minscore)
4182 {
4183 // Near the limit, rest of the words must match. We
4184 // can check that right now, no need to push an item
4185 // onto the stack.
4186 bi2 = bi + 1 - round;
4187 gi2 = gi + round;
4188 while (goodword[gi2] == badword[bi2])
4189 {
4190 if (goodword[gi2] == NUL)
4191 {
4192 minscore = score_off;
4193 break;
4194 }
4195 ++bi2;
4196 ++gi2;
4197 }
4198 }
4199 else
4200 {
4201 // try deleting/inserting a character later
4202 stack[stackidx].badi = bi + 1 - round;
4203 stack[stackidx].goodi = gi + round;
4204 stack[stackidx].score = score_off;
4205 ++stackidx;
4206 }
4207 }
4208 }
4209
4210 if (score + SCORE_SWAP < minscore)
4211 {
4212 // If swapping two characters makes a match then the
4213 // substitution is more expensive, thus there is no need to
4214 // try both.
4215 if (gc == badword[bi + 1] && bc == goodword[gi + 1])
4216 {
4217 // Swap two characters, that is: skip them.
4218 gi += 2;
4219 bi += 2;
4220 score += SCORE_SWAP;
4221 continue;
4222 }
4223 }
4224
4225 // Substitute one character for another which is the same
4226 // thing as deleting a character from both goodword and badword.
4227 // Use a better score when there is only a case difference.
4228 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4229 score += SCORE_ICASE;
4230 else
4231 {
4232 // For a similar character use SCORE_SIMILAR.
4233 if (slang != NULL
4234 && slang->sl_has_map
4235 && similar_chars(slang, gc, bc))
4236 score += SCORE_SIMILAR;
4237 else
4238 score += SCORE_SUBST;
4239 }
4240
4241 if (score < minscore)
4242 {
4243 // Do the substitution.
4244 ++gi;
4245 ++bi;
4246 continue;
4247 }
4248 }
4249pop:
4250 // Get here to try the next alternative, pop it from the stack.
4251 if (stackidx == 0) // stack is empty, finished
4252 break;
4253
4254 // pop an item from the stack
4255 --stackidx;
4256 gi = stack[stackidx].goodi;
4257 bi = stack[stackidx].badi;
4258 score = stack[stackidx].score;
4259 }
4260
4261 // When the score goes over "limit" it may actually be much higher.
4262 // Return a very large number to avoid going below the limit when giving a
4263 // bonus.
4264 if (minscore > limit)
4265 return SCORE_MAXMAX;
4266 return minscore;
4267}
4268
4269/*
4270 * Multi-byte version of spell_edit_score_limit().
4271 * Keep it in sync with the above!
4272 */
4273 static int
4274spell_edit_score_limit_w(
4275 slang_T *slang,
4276 char_u *badword,
4277 char_u *goodword,
4278 int limit)
4279{
4280 limitscore_T stack[10]; // allow for over 3 * 2 edits
4281 int stackidx;
4282 int bi, gi;
4283 int bi2, gi2;
4284 int bc, gc;
4285 int score;
4286 int score_off;
4287 int minscore;
4288 int round;
4289 char_u *p;
4290 int wbadword[MAXWLEN];
4291 int wgoodword[MAXWLEN];
4292
4293 // Get the characters from the multi-byte strings and put them in an
4294 // int array for easy access.
4295 bi = 0;
4296 for (p = badword; *p != NUL; )
4297 wbadword[bi++] = mb_cptr2char_adv(&p);
4298 wbadword[bi++] = 0;
4299 gi = 0;
4300 for (p = goodword; *p != NUL; )
4301 wgoodword[gi++] = mb_cptr2char_adv(&p);
4302 wgoodword[gi++] = 0;
4303
4304 // The idea is to go from start to end over the words. So long as
4305 // characters are equal just continue, this always gives the lowest score.
4306 // When there is a difference try several alternatives. Each alternative
4307 // increases "score" for the edit distance. Some of the alternatives are
4308 // pushed unto a stack and tried later, some are tried right away. At the
4309 // end of the word the score for one alternative is known. The lowest
4310 // possible score is stored in "minscore".
4311 stackidx = 0;
4312 bi = 0;
4313 gi = 0;
4314 score = 0;
4315 minscore = limit + 1;
4316
4317 for (;;)
4318 {
4319 // Skip over an equal part, score remains the same.
4320 for (;;)
4321 {
4322 bc = wbadword[bi];
4323 gc = wgoodword[gi];
4324
4325 if (bc != gc) // stop at a char that's different
4326 break;
4327 if (bc == NUL) // both words end
4328 {
4329 if (score < minscore)
4330 minscore = score;
4331 goto pop; // do next alternative
4332 }
4333 ++bi;
4334 ++gi;
4335 }
4336
4337 if (gc == NUL) // goodword ends, delete badword chars
4338 {
4339 do
4340 {
4341 if ((score += SCORE_DEL) >= minscore)
4342 goto pop; // do next alternative
4343 } while (wbadword[++bi] != NUL);
4344 minscore = score;
4345 }
4346 else if (bc == NUL) // badword ends, insert badword chars
4347 {
4348 do
4349 {
4350 if ((score += SCORE_INS) >= minscore)
4351 goto pop; // do next alternative
4352 } while (wgoodword[++gi] != NUL);
4353 minscore = score;
4354 }
4355 else // both words continue
4356 {
4357 // If not close to the limit, perform a change. Only try changes
4358 // that may lead to a lower score than "minscore".
4359 // round 0: try deleting a char from badword
4360 // round 1: try inserting a char in badword
4361 for (round = 0; round <= 1; ++round)
4362 {
4363 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
4364 if (score_off < minscore)
4365 {
4366 if (score_off + SCORE_EDIT_MIN >= minscore)
4367 {
4368 // Near the limit, rest of the words must match. We
4369 // can check that right now, no need to push an item
4370 // onto the stack.
4371 bi2 = bi + 1 - round;
4372 gi2 = gi + round;
4373 while (wgoodword[gi2] == wbadword[bi2])
4374 {
4375 if (wgoodword[gi2] == NUL)
4376 {
4377 minscore = score_off;
4378 break;
4379 }
4380 ++bi2;
4381 ++gi2;
4382 }
4383 }
4384 else
4385 {
4386 // try deleting a character from badword later
4387 stack[stackidx].badi = bi + 1 - round;
4388 stack[stackidx].goodi = gi + round;
4389 stack[stackidx].score = score_off;
4390 ++stackidx;
4391 }
4392 }
4393 }
4394
4395 if (score + SCORE_SWAP < minscore)
4396 {
4397 // If swapping two characters makes a match then the
4398 // substitution is more expensive, thus there is no need to
4399 // try both.
4400 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1])
4401 {
4402 // Swap two characters, that is: skip them.
4403 gi += 2;
4404 bi += 2;
4405 score += SCORE_SWAP;
4406 continue;
4407 }
4408 }
4409
4410 // Substitute one character for another which is the same
4411 // thing as deleting a character from both goodword and badword.
4412 // Use a better score when there is only a case difference.
4413 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4414 score += SCORE_ICASE;
4415 else
4416 {
4417 // For a similar character use SCORE_SIMILAR.
4418 if (slang != NULL
4419 && slang->sl_has_map
4420 && similar_chars(slang, gc, bc))
4421 score += SCORE_SIMILAR;
4422 else
4423 score += SCORE_SUBST;
4424 }
4425
4426 if (score < minscore)
4427 {
4428 // Do the substitution.
4429 ++gi;
4430 ++bi;
4431 continue;
4432 }
4433 }
4434pop:
4435 // Get here to try the next alternative, pop it from the stack.
4436 if (stackidx == 0) // stack is empty, finished
4437 break;
4438
4439 // pop an item from the stack
4440 --stackidx;
4441 gi = stack[stackidx].goodi;
4442 bi = stack[stackidx].badi;
4443 score = stack[stackidx].score;
4444 }
4445
4446 // When the score goes over "limit" it may actually be much higher.
4447 // Return a very large number to avoid going below the limit when giving a
4448 // bonus.
4449 if (minscore > limit)
4450 return SCORE_MAXMAX;
4451 return minscore;
4452}
4453#endif // FEAT_SPELL